package weka.filters.unsupervised.attribute;

PrincipalComponents

属性:

  /**
   * The data to analyse/transform. Replaced by the filtered data as setup
   * proceeds and set to null again once setup has finished.
   */
  protected Instances m_TrainInstances;

  /** Keep a copy for the class attribute (if set). */
  protected Instances m_TrainCopy;

  /** The header for the transformed data format. */
  protected Instances m_TransformedFormat;

  /** Data has a class set. */
  protected boolean m_HasClass;

  /** Class index. */
  protected int m_ClassIndex;

  /** Number of attributes. */
  protected int m_NumAttribs;

  /** Number of instances. */
  protected int m_NumInstances;

  /**
   * Correlation matrix for the original data. Holds the covariance matrix
   * instead when the data is centered rather than standardized.
   */
  protected double[][] m_Correlation;

  /**
   * If true, center (rather than standardize) the data and
   * compute PCA from covariance (rather than correlation)
   * matrix.
   */
  private boolean m_center = false;

  /**
   * Will hold the unordered linear transformations of the (normalized)
   * original data.
   */
  protected double[][] m_Eigenvectors;

  /** Eigenvalues for the corresponding eigenvectors. */
  protected double[] m_Eigenvalues = null;

  /**
   * Index array returned by sorting the eigenvalues; its entries are used to
   * index into the eigenvalue array and the eigenvector columns.
   */
  protected int[] m_SortedEigens;

  /** Sum of the eigenvalues. */
  protected double m_SumOfEigenValues = 0.0;

  /** Filter for replacing missing values. */
  protected ReplaceMissingValues m_ReplaceMissingFilter;

  /** Filter for turning nominal values into numeric ones. */
  protected NominalToBinary m_NominalToBinaryFilter;

  /** Filter for removing class attribute, nominal attributes with 0 or 1 value. */
  protected Remove m_AttributeFilter;

  /** Filter for standardizing the data. */
  protected Standardize m_standardizeFilter;

  /** Filter for centering the data. */
  protected Center m_centerFilter;

  /** The number of attributes in the pc transformed data. */
  protected int m_OutputNumAtts = -1;

  /**
   * The amount of variance to cover in the original data when
   * retaining the best n PC's.
   */
  protected double m_CoverVariance = 0.95;

  /** Maximum number of attributes in the transformed attribute name. */
  protected int m_MaxAttrsInName = 5;

  /** Maximum number of attributes in the transformed data (-1 for all). */
  protected int m_MaxAttributes = -1;

计算协方差矩阵或相关系数矩阵

  /**
   * Fills {@code m_Correlation} with the matrix the eigen decomposition is
   * computed from. If {@code m_center} is false this simply delegates to
   * {@link #fillCorrelation()}. Otherwise the training data is passed through
   * a {@code Center} filter and the covariance matrix of the centered data is
   * stored (sum of products divided by the number of instances minus one).
   *
   * @throws Exception if the centering filter cannot be configured or
   *           applied, or if filling the correlation matrix fails
   */
  protected void fillCovariance() throws Exception {
    if (!m_center) {
      fillCorrelation();
      return;
    }

    // now center the data by subtracting the mean
    m_centerFilter = new Center();
    m_centerFilter.setInputFormat(m_TrainInstances);
    m_TrainInstances = Filter.useFilter(m_TrainInstances, m_centerFilter);

    // now compute the covariance matrix
    m_Correlation = new double[m_NumAttribs][m_NumAttribs];
    final double divisor = (double) (m_TrainInstances.numInstances() - 1);
    for (int i = 0; i < m_NumAttribs; i++) {
      // The matrix is symmetric and both halves are written below, so only
      // the pairs with j >= i have to be computed.
      for (int j = i; j < m_NumAttribs; j++) {
        double cov = 0;
        for (int k = 0; k < m_NumInstances; k++) {
          cov += (m_TrainInstances.instance(k).value(i)
            * m_TrainInstances.instance(k).value(j));
        }
        cov /= divisor;
        m_Correlation[i][j] = cov;
        m_Correlation[j][i] = cov;
      }
    }
  }

  /**
   * Fill the correlation matrix and then standardize the training data.
   *
   * @throws Exception if the standardize filter cannot be configured or applied
   */
protected void fillCorrelation() throws Exception {
  // Builds the symmetric attribute-by-attribute correlation matrix in
  // m_Correlation and afterwards replaces m_TrainInstances with its
  // standardized version.
  m_Correlation = new double[m_NumAttribs][m_NumAttribs];
  double[] att1 = new double[m_NumInstances];
  double[] att2 = new double[m_NumInstances];

  for (int i = 0; i < m_NumAttribs; i++) {
    // Column i takes part in every pair of this row: extract it only once.
    for (int k = 0; k < m_NumInstances; k++) {
      att1[k] = m_TrainInstances.instance(k).value(i);
    }

    // The diagonal is fixed at 1.0 without calling the correlation routine.
    m_Correlation[i][i] = 1.0;

    // Each unordered pair is handled exactly once. The column with the
    // larger index is passed first, which is the argument order of the value
    // that ended up stored when both (i, j) and (j, i) were evaluated.
    for (int j = 0; j < i; j++) {
      for (int k = 0; k < m_NumInstances; k++) {
        att2[k] = m_TrainInstances.instance(k).value(j);
      }
      double corr = Utils.correlation(att1, att2, m_NumInstances);
      m_Correlation[i][j] = corr;
      m_Correlation[j][i] = corr;
    }
  }

  // now standardize the input data
  m_standardizeFilter = new Standardize();
  m_standardizeFilter.setInputFormat(m_TrainInstances);
  m_TrainInstances = Filter.useFilter(m_TrainInstances, m_standardizeFilter);
}

处理数据

  /**
   * Transform an instance in original (unnormalized) format.
   *
   * The instance is copied, pushed through the same chain of filters that was
   * built during setup, and then multiplied with the eigenvectors, starting
   * with the one whose sorted position is last. Components are added until
   * the accumulated share of the eigenvalue sum reaches the variance to
   * cover, or the lower bound derived from the maximum number of attributes
   * is hit.
   *
   * @param instance an instance in the original (unnormalized) format
   * @return a transformed instance
   * @throws Exception if instance can't be transformed
   */
  protected Instance convertInstance(Instance instance) throws Exception {
    final double[] projected = new double[m_OutputNumAtts];
    Instance processed = (Instance) instance.copy();

    // replace missing values
    m_ReplaceMissingFilter.input(processed);
    m_ReplaceMissingFilter.batchFinished();
    processed = m_ReplaceMissingFilter.output();

    // nominal -> binary
    m_NominalToBinaryFilter.input(processed);
    m_NominalToBinaryFilter.batchFinished();
    processed = m_NominalToBinaryFilter.output();

    // drop the columns that were removed during setup (if any)
    if (m_AttributeFilter != null) {
      m_AttributeFilter.input(processed);
      m_AttributeFilter.batchFinished();
      processed = m_AttributeFilter.output();
    }

    // center or standardize, mirroring what was done to the training data
    if (m_center) {
      m_centerFilter.input(processed);
      m_centerFilter.batchFinished();
      processed = m_centerFilter.output();
    } else {
      m_standardizeFilter.input(processed);
      m_standardizeFilter.batchFinished();
      processed = m_standardizeFilter.output();
    }

    // the class value is taken from the untouched input and goes last
    if (m_HasClass) {
      projected[m_OutputNumAtts - 1] = instance.value(instance.classIndex());
    }

    // lowest sorted position that may still be used; never negative
    int lowestPosition = 0;
    if (m_MaxAttributes > 0) {
      lowestPosition = Math.max(0, m_NumAttribs - m_MaxAttributes);
    }

    double coveredVariance = 0;
    for (int position = m_NumAttribs - 1; position >= lowestPosition; position--) {
      final int component = m_SortedEigens[position];

      double dotProduct = 0.0;
      for (int att = 0; att < m_NumAttribs; att++) {
        dotProduct += m_Eigenvectors[att][component] * processed.value(att);
      }
      projected[m_NumAttribs - position - 1] = dotProduct;

      coveredVariance += m_Eigenvalues[component];
      if ((coveredVariance / m_SumOfEigenValues) >= m_CoverVariance) {
        break;
      }
    }

    // create instance, keeping the sparse/dense flavour of the input
    if (instance instanceof SparseInstance) {
      return new SparseInstance(instance.weight(), projected);
    }
    return new DenseInstance(instance.weight(), projected);
  }

  /**
   * Initializes the filter with the given input data.
   *
   * @param instances the data to process
   * @throws Exception in case the processing goes wrong
   * @see #batchFinished()
   */
protected void setup(Instances instances) throws Exception {
  // Overview: copy the data, run it through the preprocessing filters
  // (missing values, nominal-to-binary, removal of useless columns and of the
  // class), fill the correlation/covariance matrix, decompose it and finally
  // derive the output format. The filters are kept in fields for later use.
  m_TrainInstances = new Instances(instances);

  // make a copy of the training data so that we can get the class
  // column to append to the transformed data (if necessary)
  m_TrainCopy = new Instances(m_TrainInstances, 0);

  m_ReplaceMissingFilter = new ReplaceMissingValues();
  m_ReplaceMissingFilter.setInputFormat(m_TrainInstances);
  m_TrainInstances = Filter.useFilter(m_TrainInstances, m_ReplaceMissingFilter);

  m_NominalToBinaryFilter = new NominalToBinary();
  m_NominalToBinaryFilter.setInputFormat(m_TrainInstances);
  m_TrainInstances = Filter.useFilter(m_TrainInstances, m_NominalToBinaryFilter);

  // delete any attributes with only one distinct value or are all missing
  Vector<Integer> deleteCols = new Vector<Integer>();
  for (int i = 0; i < m_TrainInstances.numAttributes(); i++) {
    if (m_TrainInstances.numDistinctValues(i) <= 1) {
      deleteCols.addElement(i);
    }
  }

  if (m_TrainInstances.classIndex() >= 0) {
    // get rid of the class column
    m_HasClass = true;
    m_ClassIndex = m_TrainInstances.classIndex();
    // autoboxing instead of the deprecated Integer(int) constructor
    deleteCols.addElement(m_ClassIndex);
  }

  // remove columns from the data if necessary
  if (deleteCols.size() > 0) {
    m_AttributeFilter = new Remove();
    int[] todelete = new int[deleteCols.size()];
    for (int i = 0; i < todelete.length; i++) {
      todelete[i] = deleteCols.elementAt(i);
    }
    m_AttributeFilter.setAttributeIndicesArray(todelete);
    m_AttributeFilter.setInvertSelection(false);
    m_AttributeFilter.setInputFormat(m_TrainInstances);
    m_TrainInstances = Filter.useFilter(m_TrainInstances, m_AttributeFilter);
  }

  // can evaluator handle the processed data ? e.g., enough attributes?
  getCapabilities().testWithFail(m_TrainInstances);

  m_NumInstances = m_TrainInstances.numInstances();
  m_NumAttribs = m_TrainInstances.numAttributes();

  fillCovariance();

  // get eigen vectors/values
  Matrix corr = new Matrix(m_Correlation);
  EigenvalueDecomposition eig = corr.eig();
  Matrix V = eig.getV();
  double[][] v = new double[m_NumAttribs][m_NumAttribs];
  for (int i = 0; i < v.length; i++) {
    for (int j = 0; j < v[0].length; j++) {
      v[i][j] = V.get(i, j);
    }
  }
  // v was built locally just above, so it can be stored directly; a shallow
  // clone of the outer array would share the very same rows anyway
  m_Eigenvectors = v;
  m_Eigenvalues = eig.getRealEigenvalues().clone();

  // any eigenvalues less than 0 are not worth anything --- change to 0
  for (int i = 0; i < m_Eigenvalues.length; i++) {
    if (m_Eigenvalues[i] < 0) {
      m_Eigenvalues[i] = 0.0;
    }
  }
  m_SortedEigens = Utils.sort(m_Eigenvalues);
  m_SumOfEigenValues = Utils.sum(m_Eigenvalues);

  m_TransformedFormat = determineOutputFormat(m_TrainInstances);
  setOutputFormat(m_TransformedFormat);

  // the (filtered) training data is not needed any longer
  m_TrainInstances = null;
}

Weka——PrincipalComponents分析的更多相关文章

  1. Weka关联规则分析

    购物篮分析: Apriori算法: 参数设置: 1.car 如果设为真,则会挖掘类关联规则而不是全局关联规则. 2. classindex 类属性索引.如果设置为-1,最后的属性被当做类属性. 3. ...

  2. Weka算法Clusterers-DBSCAN源代码分析

    假设说世界上仅仅能存在一种基于密度的聚类算法的话.那么它必须是DBSCAN(Density-based spatial clustering of applications with noise).D ...

  3. Weka算法Clusterers-Xmeans源代码分析(一)

    上几篇博客都是分析的分类器算法(有监督学习),这次就分 ...

  4. Weka学习之关联规则分析

    步骤: (一) 选择数据源 (二)选择要分析的字段 (三)选择需要的关联规则算法 (四)点击start运行 (五) 分析结果 算法选择: Apriori算法参数含义 1.car:如果设为真,则会挖掘类 ...

  5. Weka算法Classifier-meta-AdaBoostM1源代码分析(一)

    多分类器组合算法简单的来讲经常使用的有voting,bagging和boosting,当中就效果来说Boosting略占优势,而AdaBoostM1算法又相当于Boosting算法的"经典款 ...

  6. Weka算法Classifier-tree-J48源代码分析(一个)基本数据结构和算法

    大约一年,我没有照顾的博客,再次拿起笔不知从何写上,想来想去手从最近使用Weka要正确书写. Weka为一个Java基础上的机器学习工具.上手简单,并提供图形化界面.提供如分类.聚类.频繁项挖掘等工具 ...

  7. 数据挖掘:关联规则的apriori算法在weka的源码分析

    相对于机器学习,关联规则的apriori算法更偏向于数据挖掘. 1) 测试文档中调用weka的关联规则apriori算法,如下 try { File file = new File("F:\ ...

  8. Weka中数据挖掘与机器学习系列之Exploer界面(七)

    不多说,直接上干货! Weka的Explorer(探索者)界面,是Weka的主要图形化用户界面,其全部功能都可通过菜单选择或表单填写进行访问.本博客将详细介绍Weka探索者界面的图形化用户界面.预处理 ...

  9. Weka算法算法翻译(部分)

    目录 Weka算法翻译(部分) 1. 属性选择算法(select attributes) 1.1 属性评估方法 1.2 搜索方法 2. 分类算法 2.1 贝叶斯算法 2.2 Functions 2.3 ...

随机推荐

  1. purge recyclebin之后dba_segments仍然有BIN$段

    现象: purge recyclebin之后dba_segments仍然有BIN$段. 如下,执行了purge recyclebin之后: SQL> select segment_name,SE ...

  2. php 函数合并 array_merge 与 + 的区别

    array_merge()是PHP语言中的一个函数,作用是将两个或多个数组的单元合并起来,一个数组中的值附加在前一个数组的后面.返回作为结果的数组. 如果输入的数组中有相同的字符串键名,该键的键值为最 ...

  3. maven用变量的方法统一管理jar包版本

    <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/20 ...

  4. git回退之前版本

    所有没有 commit 的本地改动,都会随着 reset --hard 丢掉,无法恢复. 如果只是想回到 pull 之前当前分支所在的commit位置,则可以.比方说你在 master 分支上,可以用 ...

  5. ELK篇---------elasticsearch集群安装配置

    说明: 本次ELK的基础配置如下: 虚拟机:vmware 11 系统:centos7.2  两台 IP:172.16.1.15/16 一.下载es wget https://download.elas ...

  6. url写法细节

  7. Saltstack生产案例之Haproxy安装

    cd /srv/salt/prod/ mkdir haproxymkdir keepalivedmkdir nginxmkdir phpmkdir memcachedmkdir pkg cd pkg ...

  8. 在python中读写matlab文件

    scipy.io提供有两个函数loadmat和savemat,用来读取和存储mat的数据文件 import scipy.io as sio 还有一些其他常用的模块 import numpy as np ...

  9. python3学习笔记(4)_function-参数

    #python学习笔记 17/07/10 # !/usr/bin/evn python3 # -*- coding:utf-8 -*- import math #函数 函数的 定义 #定义一个求绝对值 ...

  10. eclipse反编译插件jadClipse安装使用教程

    previously:最近在学习Dependency Injection(依赖注入)模式,看了 martin fowler 的 文章(原文:https://martinfowler.com/artic ...