package weka.filters.unsupervised.attribute;

PrincipalComponents

Fields:

  /** The data to analyse/transform. */
  protected Instances m_TrainInstances;

  /** Keep a copy for the class attribute (if set). */
  protected Instances m_TrainCopy;

  /** The header for the transformed data format. */
  protected Instances m_TransformedFormat;

  /** Data has a class set. */
  protected boolean m_HasClass;

  /** Class index. */
  protected int m_ClassIndex;

  /** Number of attributes. */
  protected int m_NumAttribs;

  /** Number of instances. */
  protected int m_NumInstances;

  /** Correlation matrix for the original data. */
  protected double[][] m_Correlation;

  /**
   * If true, center (rather than standardize) the data and
   * compute PCA from covariance (rather than correlation)
   * matrix.
   */
  private boolean m_center = false;

  /** Will hold the unordered linear transformations of the (normalized)
      original data. */
  protected double[][] m_Eigenvectors;

  /** Eigenvalues for the corresponding eigenvectors. */
  protected double[] m_Eigenvalues = null;

  /** Sorted eigenvalues. */
  protected int[] m_SortedEigens;

  /** Sum of the eigenvalues. */
  protected double m_SumOfEigenValues = 0.0;

  /** Filter for replacing missing values. */
  protected ReplaceMissingValues m_ReplaceMissingFilter;

  /** Filter for turning nominal values into numeric ones. */
  protected NominalToBinary m_NominalToBinaryFilter;

  /** Filter for removing the class attribute and nominal attributes with 0 or 1 value. */
  protected Remove m_AttributeFilter;

  /** Filter for standardizing the data. */
  protected Standardize m_standardizeFilter;

  /** Filter for centering the data. */
  protected Center m_centerFilter;

  /** The number of attributes in the pc transformed data. */
  protected int m_OutputNumAtts = -1;

  /** The amount of variance to cover in the original data when
      retaining the best n PC's. */
  protected double m_CoverVariance = 0.95;

  /** Maximum number of attributes in the transformed attribute name. */
  protected int m_MaxAttrsInName = 5;

  /** Maximum number of attributes in the transformed data (-1 for all). */
  protected int m_MaxAttributes = -1;

Computing the covariance or correlation matrix:

  /**
   * Fills {@code m_Correlation} with either the correlation matrix
   * (default) or, when centering is enabled, the covariance matrix of
   * the centered training data.
   *
   * @throws Exception if one of the filters cannot be set up or applied
   */
  protected void fillCovariance() throws Exception {

    // Default behaviour: standardize the data and use the correlation matrix.
    if (!m_center) {
      fillCorrelation();
      return;
    }

    // Center the data by subtracting the mean of each attribute.
    m_centerFilter = new Center();
    m_centerFilter.setInputFormat(m_TrainInstances);
    m_TrainInstances = Filter.useFilter(m_TrainInstances, m_centerFilter);

    // Compute the covariance matrix of the centered data. The matrix is
    // symmetric, so each entry is computed once for the upper triangle and
    // mirrored (the original recomputed every off-diagonal entry twice and
    // carried a dead i == j special case identical to the general one).
    m_Correlation = new double[m_NumAttribs][m_NumAttribs];
    for (int i = 0; i < m_NumAttribs; i++) {
      for (int j = i; j < m_NumAttribs; j++) {
        double cov = 0;
        for (int k = 0; k < m_NumInstances; k++) {
          cov += m_TrainInstances.instance(k).value(i)
            * m_TrainInstances.instance(k).value(j);
        }
        // Unbiased estimate: divide by n - 1.
        cov /= (double) (m_TrainInstances.numInstances() - 1);
        m_Correlation[i][j] = cov;
        m_Correlation[j][i] = cov;
      }
    }
  }

  /**
* Fill the correlation matrix.
*/
protected void fillCorrelation() throws Exception {
int i;
int j;
int k;
double[] att1;
double[] att2;
double corr; m_Correlation = new double[m_NumAttribs][m_NumAttribs];
att1 = new double [m_NumInstances];
att2 = new double [m_NumInstances]; for (i = 0; i < m_NumAttribs; i++) {
for (j = 0; j < m_NumAttribs; j++) {
for (k = 0; k < m_NumInstances; k++) {
att1[k] = m_TrainInstances.instance(k).value(i);
att2[k] = m_TrainInstances.instance(k).value(j);
}
if (i == j) {
m_Correlation[i][j] = 1.0;
}
else {
corr = Utils.correlation(att1,att2,m_NumInstances);
m_Correlation[i][j] = corr;
m_Correlation[j][i] = corr;
}
}
} // now standardize the input data
m_standardizeFilter = new Standardize();
m_standardizeFilter.setInputFormat(m_TrainInstances);
m_TrainInstances = Filter.useFilter(m_TrainInstances, m_standardizeFilter);
}

Processing the data:

  /**
   * Transform an instance in original (unormalized) format.
   *
   * @param instance an instance in the original (unormalized) format
   * @return a transformed instance
   * @throws Exception if instance can't be transformed
   */
  protected Instance convertInstance(Instance instance) throws Exception {
    Instance result;
    double[] newVals;
    Instance tempInst;
    double cumulative;
    int i;
    int j;
    double tempval;
    int numAttsLowerBound;

    newVals = new double[m_OutputNumAtts];
    // Work on a copy so the caller's instance is left untouched.
    tempInst = (Instance) instance.copy();

    // Push the instance through the same preprocessing chain that was fitted
    // in setup(): missing-value replacement first ...
    m_ReplaceMissingFilter.input(tempInst);
    m_ReplaceMissingFilter.batchFinished();
    tempInst = m_ReplaceMissingFilter.output();

    // ... then nominal-to-binary conversion ...
    m_NominalToBinaryFilter.input(tempInst);
    m_NominalToBinaryFilter.batchFinished();
    tempInst = m_NominalToBinaryFilter.output();

    // ... then attribute removal (only present if setup() deleted columns).
    if (m_AttributeFilter != null) {
      m_AttributeFilter.input(tempInst);
      m_AttributeFilter.batchFinished();
      tempInst = m_AttributeFilter.output();
    }

    // Normalize the same way the training data was normalized: standardize
    // for correlation-based PCA, center for covariance-based PCA.
    if (!m_center) {
      m_standardizeFilter.input(tempInst);
      m_standardizeFilter.batchFinished();
      tempInst = m_standardizeFilter.output();
    } else {
      m_centerFilter.input(tempInst);
      m_centerFilter.batchFinished();
      tempInst = m_centerFilter.output();
    }

    // The class value (if any) is carried over unchanged as the last output attribute.
    if (m_HasClass)
      newVals[m_OutputNumAtts - 1] = instance.value(instance.classIndex());

    // Limit the number of components if m_MaxAttributes is set.
    if (m_MaxAttributes > 0)
      numAttsLowerBound = m_NumAttribs - m_MaxAttributes;
    else
      numAttsLowerBound = 0;
    if (numAttsLowerBound < 0)
      numAttsLowerBound = 0;

    // Project onto the eigenvectors, from largest eigenvalue downwards
    // (m_SortedEigens is ascending, hence the reverse iteration), stopping
    // once the requested fraction of variance is covered.
    cumulative = 0;
    for (i = m_NumAttribs - 1; i >= numAttsLowerBound; i--) {
      tempval = 0.0;
      for (j = 0; j < m_NumAttribs; j++)
        tempval += m_Eigenvectors[j][m_SortedEigens[i]] * tempInst.value(j);

      newVals[m_NumAttribs - i - 1] = tempval;
      cumulative += m_Eigenvalues[m_SortedEigens[i]];
      if ((cumulative / m_SumOfEigenValues) >= m_CoverVariance)
        break;
    }

    // create instance, preserving sparseness and weight of the input
    if (instance instanceof SparseInstance)
      result = new SparseInstance(instance.weight(), newVals);
    else
      result = new DenseInstance(instance.weight(), newVals);

    return result;
  }

  /**
* Initializes the filter with the given input data.
   *
   * @param instances the data to process
   * @throws Exception in case the processing goes wrong
   * @see #batchFinished()
   */
  protected void setup(Instances instances) throws Exception {
    m_TrainInstances = new Instances(instances);

    // Make an (empty) copy of the training data header so that we can get
    // the class column to append to the transformed data (if necessary).
    m_TrainCopy = new Instances(m_TrainInstances, 0);

    m_ReplaceMissingFilter = new ReplaceMissingValues();
    m_ReplaceMissingFilter.setInputFormat(m_TrainInstances);
    m_TrainInstances = Filter.useFilter(m_TrainInstances, m_ReplaceMissingFilter);

    m_NominalToBinaryFilter = new NominalToBinary();
    m_NominalToBinaryFilter.setInputFormat(m_TrainInstances);
    m_TrainInstances = Filter.useFilter(m_TrainInstances, m_NominalToBinaryFilter);

    // Delete any attributes with only one distinct value or that are all missing.
    Vector<Integer> deleteCols = new Vector<Integer>();
    for (int i = 0; i < m_TrainInstances.numAttributes(); i++) {
      if (m_TrainInstances.numDistinctValues(i) <= 1)
        deleteCols.addElement(i);
    }

    if (m_TrainInstances.classIndex() >= 0) {
      // Get rid of the class column; convertInstance re-attaches it later.
      m_HasClass = true;
      m_ClassIndex = m_TrainInstances.classIndex();
      // Autoboxing replaces the deprecated new Integer(int) constructor.
      deleteCols.addElement(m_ClassIndex);
    }

    // Remove columns from the data if necessary.
    if (deleteCols.size() > 0) {
      m_AttributeFilter = new Remove();
      int[] todelete = new int[deleteCols.size()];
      for (int i = 0; i < deleteCols.size(); i++)
        todelete[i] = deleteCols.get(i); // Vector<Integer> makes the old explicit cast redundant
      m_AttributeFilter.setAttributeIndicesArray(todelete);
      m_AttributeFilter.setInvertSelection(false);
      m_AttributeFilter.setInputFormat(m_TrainInstances);
      m_TrainInstances = Filter.useFilter(m_TrainInstances, m_AttributeFilter);
    }

    // Can the evaluator handle the processed data? E.g., enough attributes?
    getCapabilities().testWithFail(m_TrainInstances);

    m_NumInstances = m_TrainInstances.numInstances();
    m_NumAttribs = m_TrainInstances.numAttributes();

    // Build the correlation (or covariance) matrix ...
    fillCovariance();

    // ... and eigen-decompose it to get the principal components.
    Matrix corr = new Matrix(m_Correlation);
    EigenvalueDecomposition eig = corr.eig();
    Matrix V = eig.getV();
    double[][] v = new double[m_NumAttribs][m_NumAttribs];
    for (int i = 0; i < v.length; i++) {
      for (int j = 0; j < v[0].length; j++)
        v[i][j] = V.get(i, j);
    }
    m_Eigenvectors = (double[][]) v.clone();
    m_Eigenvalues = (double[]) eig.getRealEigenvalues().clone();

    // Any eigenvalues less than 0 are not worth anything --- change to 0.
    for (int i = 0; i < m_Eigenvalues.length; i++) {
      if (m_Eigenvalues[i] < 0)
        m_Eigenvalues[i] = 0.0;
    }
    m_SortedEigens = Utils.sort(m_Eigenvalues);
    m_SumOfEigenValues = Utils.sum(m_Eigenvalues);

    m_TransformedFormat = determineOutputFormat(m_TrainInstances);
    setOutputFormat(m_TransformedFormat);

    // Release the training data; only the fitted filters and eigen data are kept.
    m_TrainInstances = null;
  }

Weka——PrincipalComponents分析的更多相关文章

  1. Weka关联规则分析

    购物篮分析: Apriori算法: 参数设置: 1.car 如果设为真,则会挖掘类关联规则而不是全局关联规则. 2. classindex 类属性索引.如果设置为-1,最后的属性被当做类属性. 3. ...

  2. Weka算法Clusterers-DBSCAN源代码分析

    假设说世界上仅仅能存在一种基于密度的聚类算法的话.那么它必须是DBSCAN(Density-based spatial clustering of applications with noise).D ...

  3. Weka算法Clusterers-Xmeans源代码分析(一)

    <p></p><p><span style="font-size:18px">上几篇博客都是分析的分类器算法(有监督学习),这次就分 ...

  4. Weka学习之关联规则分析

    步骤: (一) 选择数据源 (二)选择要分析的字段 (三)选择需要的关联规则算法 (四)点击start运行 (五) 分析结果 算法选择: Apriori算法参数含义 1.car:如果设为真,则会挖掘类 ...

  5. Weka算法Classifier-meta-AdaBoostM1源代码分析(一)

    多分类器组合算法简单的来讲经常使用的有voting,bagging和boosting,当中就效果来说Boosting略占优势,而AdaBoostM1算法又相当于Boosting算法的"经典款 ...

  6. Weka算法Classifier-tree-J48源代码分析(一个)基本数据结构和算法

    大约一年,我没有照顾的博客,再次拿起笔不知从何写上,想来想去手从最近使用Weka要正确书写. Weka为一个Java基础上的机器学习工具.上手简单,并提供图形化界面.提供如分类.聚类.频繁项挖掘等工具 ...

  7. 数据挖掘:关联规则的apriori算法在weka的源码分析

    相对于机器学习,关联规则的apriori算法更偏向于数据挖掘. 1) 测试文档中调用weka的关联规则apriori算法,如下 try { File file = new File("F:\ ...

  8. Weka中数据挖掘与机器学习系列之Exploer界面(七)

    不多说,直接上干货! Weka的Explorer(探索者)界面,是Weka的主要图形化用户界面,其全部功能都可通过菜单选择或表单填写进行访问.本博客将详细介绍Weka探索者界面的图形化用户界面.预处理 ...

  9. Weka算法算法翻译(部分)

    目录 Weka算法翻译(部分) 1. 属性选择算法(select attributes) 1.1 属性评估方法 1.2 搜索方法 2. 分类算法 2.1 贝叶斯算法 2.2 Functions 2.3 ...

随机推荐

  1. 解决Win7启动时出现“windows未能启动。原因可能是最近更改了硬件或软件”的问题

    昨天公司终于大发慈悲,统一更换电脑配置,终于要摆脱“手扶拖拉机”的时代了,赶上“动车时代”了.不过不想换硬盘,因为重新要安装太多东西,环境配置一大堆,所以就硬盘没有换,不过当我开机启动的时候,悲剧发生 ...

  2. Sencha Touch 扩展集合

    https://market.sencha.com/extensions http://try.sencha.com/touch/2.1.0/ http://www.mitchellsimoens.c ...

  3. MySQL知识小结

    MySQL的知识面试中还是经常被问到的,简单的使用似乎无法达到面试官的要求,很多问题会关于Mysql存储引擎,所以这里还是需要系统学习一下Mysql的一些知识,面试过程中游刃有余. MySQL体系结构 ...

  4. Cracking the Coding Interview(String and array)

    1.1实现一个算法判断一个字符串是否存在重复字符.如果不能利用另外的数据结构又该如何实现? My solution: /** *利用类似一个hash table的计数 *然后检查这个hash tabl ...

  5. Coding和Git的环境搭建

    Github太慢了.打开网页慢,下载也只有几kb. 于是找了国内的Git,据说coding不错.就申请了个. 其实csdn也有...但是没人家的专业... 1 注册coding  https://co ...

  6. springMVC 报错:Unknown return value type: java.lang.Integer

    controller层返回值类型为Integer,运行报错: Unknown return value type: java.lang.Integer 解决办法:在此方法上写上注解 @Response ...

  7. Spring Boot 利用插件构造QueryDSL语句时报错:You need to run build with JDK or have tools.jar on the classpath.If this occur....

    You need to run build with JDK or have tools.jar on the classpath.If this occures during eclipse bui ...

  8. 9.9Dajngo MTV

    2018-9-9 14:53:53 mvc框架和 Django的MTV框架 框架参考 :https://www.cnblogs.com/liwenzhou/p/8296964.html 2018-9- ...

  9. ABP之应用服务(2)

    在上一篇的笔记中,已经大致对Application层的使用作了简要的使用说明,感觉还是有些东西需要研究一下,所以承接上文,对AutoMapper这个方便的东西,稍微研究一下. 一.初识AutoMapp ...

  10. 洛谷P2216 理想的正方形

    题目描述 有一个a*b的整数组成的矩阵,现请你从中找出一个n*n的正方形区域,使得该区域所有数中的最大值和最小值的差最小. 输入输出格式 输入格式: 第一行为3个整数,分别表示a,b,n的值 第二行至 ...