Weka——PrincipalComponents分析

package weka.filters.unsupervised.attribute;

PrincipalComponents

属性：

  /**
   * The training data being analysed. setup() replaces it with the output of
   * each preprocessing filter in turn and sets it to null once it has finished.
   */

  protected Instances m_TrainInstances;

  /**
   * Copy of the training data kept for the class attribute (if set). setup()
   * creates it via new Instances(m_TrainInstances, 0).
   */

  protected Instances m_TrainCopy;

  /** The header for the transformed data format (result of determineOutputFormat in setup()). */

  protected Instances m_TransformedFormat;

  /** Whether the data has a class set; setup() sets this when classIndex() >= 0. */

  protected boolean m_HasClass;

  /**
   * Class index, read in setup() after the missing-value and
   * nominal-to-binary filters have been applied.
   */

  protected int m_ClassIndex;

  /**
   * Number of attributes left after setup() has removed the class attribute
   * and attributes with at most one distinct value.
   */

  protected int m_NumAttribs;

  /** Number of instances in the (filtered) training data. */

  protected int m_NumInstances;

  /**
   * Matrix the eigen decomposition is computed from: the correlation matrix
   * of the original data, or the covariance matrix when m_center is true
   * (see fillCovariance()).
   */

  protected double[][] m_Correlation;

  /**
   * If true, center (rather than standardize) the data and
   * compute PCA from covariance (rather than correlation)
   * matrix.
   */

  private boolean m_center = false;

  /**
   * Will hold the unordered linear transformations of the (normalized)
   * original data. Indexed as [attribute][component] in convertInstance().
   */

  protected double[][] m_Eigenvectors;

  /**
   * Eigenvalues for the corresponding eigenvectors; setup() replaces
   * negative values with 0.
   */

  protected double[] m_Eigenvalues = null;

  /**
   * Indices into m_Eigenvalues as returned by Utils.sort(m_Eigenvalues) (not
   * the eigenvalues themselves). convertInstance() walks this array from the
   * last entry backwards -- presumably ascending order, so that the largest
   * eigenvalue comes first; confirm against Utils.sort.
   */

  protected int[] m_SortedEigens;

  /** Sum of the eigenvalues (computed after negative values were set to 0). */

  protected double m_SumOfEigenValues = 0.0;

  /** Filter for replacing missing values. */

  protected ReplaceMissingValues m_ReplaceMissingFilter;

  /** Filter for turning nominal values into numeric ones. */

  protected NominalToBinary m_NominalToBinaryFilter;

  /**
   * Filter for removing the class attribute and attributes with 0 or 1
   * distinct values. Only created by setup() when there is something to
   * remove; convertInstance() skips it when null.
   */

  protected Remove m_AttributeFilter;

  /** Filter for standardizing the data (used when m_center is false). */

  protected Standardize m_standardizeFilter;

  /** Filter for centering the data (used when m_center is true). */

  protected Center m_centerFilter;

  /**
   * The number of attributes in the PC transformed data; convertInstance()
   * uses it as the length of the output value array.
   */

  protected int m_OutputNumAtts = -1;  

  /**
   * The proportion of variance in the original data to cover when
   * retaining the best n PCs. convertInstance() stops adding components
   * once the cumulative eigenvalue share reaches this value.
   */

  protected double m_CoverVariance = 0.95;

  /** Maximum number of attributes in the transformed attribute name. */

  protected int m_MaxAttrsInName = 5;

  /** Maximum number of attributes in the transformed data (-1 for all). */

  protected int m_MaxAttributes = -1;

计算协方差矩阵或相关系数矩阵

  /**
   * Fills {@code m_Correlation} with the matrix the principal components are
   * computed from.
   * <p>
   * When {@code m_center} is false this simply delegates to
   * {@link #fillCorrelation()}. Otherwise the training data is centered with a
   * {@code Center} filter (kept in {@code m_centerFilter} so that
   * {@code convertInstance} can apply the same shift later) and the sample
   * covariance matrix of the centered data is stored.
   *
   * @throws Exception if the centering filter or the delegated correlation
   *                   computation fails
   */
  protected void fillCovariance() throws Exception {
    if (!m_center) {
      fillCorrelation();
      return;
    }

    // center the data by subtracting the attribute means
    m_centerFilter = new Center();
    m_centerFilter.setInputFormat(m_TrainInstances);
    m_TrainInstances = Filter.useFilter(m_TrainInstances, m_centerFilter);

    // The data is centered, so cov(i, j) = sum_k x[k][i] * x[k][j] / (n - 1).
    // The divisor does not depend on i or j, so it is computed once.
    double divisor = (double) (m_TrainInstances.numInstances() - 1);

    m_Correlation = new double[m_NumAttribs][m_NumAttribs];
    for (int i = 0; i < m_NumAttribs; i++) {
      // the matrix is symmetric: compute the upper triangle (including the
      // diagonal) and mirror each value into the lower triangle
      for (int j = i; j < m_NumAttribs; j++) {
        double cov = 0;
        for (int k = 0; k < m_NumInstances; k++) {
          cov += (m_TrainInstances.instance(k).value(i) *
              m_TrainInstances.instance(k).value(j));
        }
        cov /= divisor;
        m_Correlation[i][j] = cov;
        m_Correlation[j][i] = cov;
      }
    }
  }

  /**
   * Fills {@code m_Correlation} with the correlation matrix of the training
   * data and then standardizes the training data.
   * <p>
   * Diagonal entries are set to 1.0; every off-diagonal entry is obtained
   * from {@code Utils.correlation} and mirrored across the diagonal. The
   * {@code Standardize} filter is kept in {@code m_standardizeFilter} so that
   * {@code convertInstance} can apply the same scaling to new instances.
   *
   * @throws Exception if the standardize filter cannot be set up or applied
   */
  protected void fillCorrelation() throws Exception {
    double[] rowAtt = new double[m_NumInstances];
    double[] colAtt = new double[m_NumInstances];

    m_Correlation = new double[m_NumAttribs][m_NumAttribs];

    for (int i = 0; i < m_NumAttribs; i++) {
      // the values of attribute i are the same for every pair in this row,
      // so extract them once instead of once per pair
      for (int k = 0; k < m_NumInstances; k++) {
        rowAtt[k] = m_TrainInstances.instance(k).value(i);
      }

      m_Correlation[i][i] = 1.0;

      // each unordered pair is visited exactly once (j < i) and the result is
      // written to both symmetric cells
      for (int j = 0; j < i; j++) {
        for (int k = 0; k < m_NumInstances; k++) {
          colAtt[k] = m_TrainInstances.instance(k).value(j);
        }
        double corr = Utils.correlation(rowAtt, colAtt, m_NumInstances);
        m_Correlation[i][j] = corr;
        m_Correlation[j][i] = corr;
      }
    }

    // now standardize the input data
    m_standardizeFilter = new Standardize();
    m_standardizeFilter.setInputFormat(m_TrainInstances);
    m_TrainInstances = Filter.useFilter(m_TrainInstances, m_standardizeFilter);
  }

处理数据

  /**
   * Projects one instance, given in the original (unnormalized) attribute
   * space, onto the retained principal components.
   * <p>
   * A copy of the instance is pushed through the preprocessing filters held
   * by this object (missing-value replacement, nominal-to-binary conversion,
   * optional attribute removal, then centering or standardizing depending on
   * {@code m_center}). Components are then emitted starting from the last
   * entry of {@code m_SortedEigens} and moving backwards, until either the
   * {@code m_MaxAttributes} limit is reached or the accumulated share of
   * {@code m_SumOfEigenValues} reaches {@code m_CoverVariance}. If the data
   * has a class, its value is copied into the last output slot.
   *
   * @param instance     an instance in the original (unnormalized) format
   * @return             the transformed instance (sparse if the input was
   *                     sparse, dense otherwise), carrying the input's weight
   * @throws Exception   if instance can't be transformed
   */
  protected Instance convertInstance(Instance instance) throws Exception {
    double[] projected = new double[m_OutputNumAtts];

    // run a copy through the preprocessing chain, one filter at a time
    Instance current = (Instance) instance.copy();

    m_ReplaceMissingFilter.input(current);
    m_ReplaceMissingFilter.batchFinished();
    current = m_ReplaceMissingFilter.output();

    m_NominalToBinaryFilter.input(current);
    m_NominalToBinaryFilter.batchFinished();
    current = m_NominalToBinaryFilter.output();

    if (m_AttributeFilter != null) {
      m_AttributeFilter.input(current);
      m_AttributeFilter.batchFinished();
      current = m_AttributeFilter.output();
    }

    if (m_center) {
      m_centerFilter.input(current);
      m_centerFilter.batchFinished();
      current = m_centerFilter.output();
    } else {
      m_standardizeFilter.input(current);
      m_standardizeFilter.batchFinished();
      current = m_standardizeFilter.output();
    }

    // the class value (taken from the untouched input) goes into the last slot
    if (m_HasClass) {
      projected[m_OutputNumAtts - 1] = instance.value(instance.classIndex());
    }

    // lowest position in m_SortedEigens that may still be emitted
    int lowestIndex = (m_MaxAttributes > 0)
        ? Math.max(0, m_NumAttribs - m_MaxAttributes)
        : 0;
    int lastRank = m_NumAttribs - 1 - lowestIndex;

    double coveredEigenSum = 0;
    for (int rank = 0; rank <= lastRank; rank++) {
      // rank 0 corresponds to the last entry of m_SortedEigens
      int component = m_SortedEigens[m_NumAttribs - 1 - rank];

      double projection = 0.0;
      for (int att = 0; att < m_NumAttribs; att++) {
        projection += m_Eigenvectors[att][component] * current.value(att);
      }
      projected[rank] = projection;

      // stop as soon as the requested share of the variance is covered
      coveredEigenSum += m_Eigenvalues[component];
      if ((coveredEigenSum / m_SumOfEigenValues) >= m_CoverVariance) {
        break;
      }
    }

    // keep the storage type (sparse/dense) and the weight of the input
    if (instance instanceof SparseInstance) {
      return new SparseInstance(instance.weight(), projected);
    }
    return new DenseInstance(instance.weight(), projected);
  }

  /**
   * Initializes the filter with the given input data.
   * <p>
   * The steps are, in order:
   * <ol>
   *   <li>copy the data and run it through the missing-value and
   *       nominal-to-binary filters;</li>
   *   <li>remove the class attribute (if any) and every attribute with at
   *       most one distinct value;</li>
   *   <li>check the remaining data against this filter's capabilities;</li>
   *   <li>build the correlation/covariance matrix via
   *       {@link #fillCovariance()};</li>
   *   <li>decompose it, store eigenvectors and eigenvalues (negative
   *       eigenvalues are set to 0), their sort order and their sum;</li>
   *   <li>determine and publish the output format.</li>
   * </ol>
   * The filters created here are kept in fields so that
   * {@code convertInstance} can apply them to individual instances. The
   * training data reference is released at the end.
   *
   * @param instances   the data to process
   * @throws Exception  in case the processing goes wrong
   * @see               #batchFinished()
   */
  protected void setup(Instances instances) throws Exception {
    m_TrainInstances = new Instances(instances);

    // make a copy of the training data so that we can get the class
    // column to append to the transformed data (if necessary)
    m_TrainCopy = new Instances(m_TrainInstances, 0);

    m_ReplaceMissingFilter = new ReplaceMissingValues();
    m_ReplaceMissingFilter.setInputFormat(m_TrainInstances);
    m_TrainInstances = Filter.useFilter(m_TrainInstances, m_ReplaceMissingFilter);

    m_NominalToBinaryFilter = new NominalToBinary();
    m_NominalToBinaryFilter.setInputFormat(m_TrainInstances);
    m_TrainInstances = Filter.useFilter(m_TrainInstances, m_NominalToBinaryFilter);

    // delete any attributes with only one distinct value or are all missing
    Vector<Integer> deleteCols = new Vector<Integer>();
    for (int i = 0; i < m_TrainInstances.numAttributes(); i++) {
      if (m_TrainInstances.numDistinctValues(i) <= 1) {
        deleteCols.addElement(i);
      }
    }

    // Derive the class flag from the current data on every call, so that a
    // value left over from an earlier initialisation cannot survive.
    m_HasClass = m_TrainInstances.classIndex() >= 0;
    if (m_HasClass) {
      // get rid of the class column
      m_ClassIndex = m_TrainInstances.classIndex();
      deleteCols.addElement(m_ClassIndex);
    }

    // remove columns from the data if necessary; a null filter tells
    // convertInstance that there is nothing to remove
    m_AttributeFilter = null;
    if (!deleteCols.isEmpty()) {
      int[] todelete = new int[deleteCols.size()];
      for (int i = 0; i < todelete.length; i++) {
        todelete[i] = deleteCols.elementAt(i);
      }
      m_AttributeFilter = new Remove();
      m_AttributeFilter.setAttributeIndicesArray(todelete);
      m_AttributeFilter.setInvertSelection(false);
      m_AttributeFilter.setInputFormat(m_TrainInstances);
      m_TrainInstances = Filter.useFilter(m_TrainInstances, m_AttributeFilter);
    }

    // can evaluator handle the processed data ? e.g., enough attributes?
    getCapabilities().testWithFail(m_TrainInstances);

    m_NumInstances = m_TrainInstances.numInstances();
    m_NumAttribs   = m_TrainInstances.numAttributes();

    fillCovariance();

    // get eigen vectors/values
    Matrix corr = new Matrix(m_Correlation);
    EigenvalueDecomposition eig = corr.eig();
    Matrix V = eig.getV();

    // copy the eigenvector matrix into a plain array; the bounds come from
    // m_NumAttribs so an empty matrix does not trigger an index error
    double[][] v = new double[m_NumAttribs][m_NumAttribs];
    for (int i = 0; i < m_NumAttribs; i++) {
      for (int j = 0; j < m_NumAttribs; j++) {
        v[i][j] = V.get(i, j);
      }
    }
    // v is a fresh local array, so it can be stored directly
    m_Eigenvectors = v;
    m_Eigenvalues  = eig.getRealEigenvalues().clone();

    // any eigenvalues less than 0 are not worth anything --- change to 0
    for (int i = 0; i < m_Eigenvalues.length; i++) {
      if (m_Eigenvalues[i] < 0) {
        m_Eigenvalues[i] = 0.0;
      }
    }

    m_SortedEigens     = Utils.sort(m_Eigenvalues);
    m_SumOfEigenValues = Utils.sum(m_Eigenvalues);

    m_TransformedFormat = determineOutputFormat(m_TrainInstances);
    setOutputFormat(m_TransformedFormat);

    // the training data is no longer needed
    m_TrainInstances = null;
  }

Weka——PrincipalComponents分析的更多相关文章

Weka关联规则分析
购物篮分析: Apriori算法: 参数设置: 1.car 如果设为真,则会挖掘类关联规则而不是全局关联规则. 2. classindex 类属性索引.如果设置为-1,最后的属性被当做类属性. 3. ...
Weka算法Clusterers-DBSCAN源代码分析
假设说世界上仅仅能存在一种基于密度的聚类算法的话.那么它必须是DBSCAN(Density-based spatial clustering of applications with noise).D ...
Weka算法Clusterers-Xmeans源代码分析（一）
上几篇博客都是分析的分类器算法(有监督学习),这次就分 ...
Weka学习之关联规则分析
步骤: (一) 选择数据源 (二)选择要分析的字段 (三)选择需要的关联规则算法 (四)点击start运行 (五) 分析结果算法选择: Apriori算法参数含义 1.car:如果设为真,则会挖掘类 ...
Weka算法Classifier-meta-AdaBoostM1源代码分析（一）
多分类器组合算法简单的来讲经常使用的有voting,bagging和boosting,当中就效果来说Boosting略占优势,而AdaBoostM1算法又相当于Boosting算法的"经典款 ...
Weka算法Classifier-tree-J48源代码分析（一个）基本数据结构和算法
大约一年,我没有照顾的博客,再次拿起笔不知从何写上,想来想去手从最近使用Weka要正确书写. Weka为一个Java基础上的机器学习工具.上手简单,并提供图形化界面.提供如分类.聚类.频繁项挖掘等工具 ...
数据挖掘：关联规则的apriori算法在weka的源码分析
相对于机器学习,关联规则的apriori算法更偏向于数据挖掘. 1) 测试文档中调用weka的关联规则apriori算法,如下 try { File file = new File("F:\ ...
Weka中数据挖掘与机器学习系列之Exploer界面（七）
不多说,直接上干货! Weka的Explorer(探索者)界面,是Weka的主要图形化用户界面,其全部功能都可通过菜单选择或表单填写进行访问.本博客将详细介绍Weka探索者界面的图形化用户界面.预处理 ...
Weka算法算法翻译（部分）
目录 Weka算法翻译(部分) 1. 属性选择算法(select attributes) 1.1 属性评估方法 1.2 搜索方法 2. 分类算法 2.1 贝叶斯算法 2.2 Functions 2.3 ...

随机推荐

ANDROID – TOOLBAR 上的 NAVIGATION DRAWER（转）
在 Material Design 釋出後,Google 也開始陸續更新了 Google app 的介面,讓大家有個範例可以看.而過去大力推動的 actionbar 自然而然也成了眾開發者觀注的部份: ...
SQL Server设置登录验证模式
我们在安装SQL Server的时候可以设置“混合验证模式”,既可以使用windows身份验证登录,也可以使用SQL Server身份验证登录. 如果我们在安装的时候并未设置“混合验证模式” ...
【Nginx系列】Nginx之location
语法: location [=|~|~*|^~] patt { } 一.分类中括号可以不写任何参数,此时称为一般匹配也可以写参数因此,大类型可以分为3种 location = patt {} [ ...
部署OpenStack问题汇总（二）--openstack dashboard 问题解决方案
在打开dashboard的时候报错: LocationParseError at /admin/ (LocationParseError(...), 'Failed to parse: Failed ...
OpenCV获取IP摄像头视频
从开源中国博客搬来,合并博客实验室做一个智能小车的小项目,期间涉及到在PC端处理小车摄像头的视频.这里先用安卓手机代替一下进行试验.大致流程就是手机摄像头获取视频,开启一个IP摄像头服务软件,在局域 ...
Linux 帐户管理
一用户相关操作 1. 添加帐户 useradd 选项用户名 -c comment 指定一段注释性描述. -d 目录指定用户主目录,如果此目录不存在,则同时使用-m选项,可以创建主目录. -g 用 ...
C# IO流的操作（一）
C# IO流的操作非常重要,我们读写文件都会使用到这个技术,这里先演示一个文件内容复制的例子,简要说明C#中的IO操作. namespace ConsoleApplication1 { class P ...
【CF815D】Karen and Cards 单调栈+扫描线
[CF815D]Karen and Cards 题意:一张卡片有三个属性a,b,c,其上限分别为A,B,C,现在有n张卡片,定义一张卡片能打败另一张卡片当且仅当它的至少两项属性要严格大于另一张的对应属 ...
iOS开发过程中使用Core Data应避免的十个错误
原文出处: informit 译文出处:cocoachina Core Data是苹果针对Mac和iOS平台开发的一个框架,主要用来储存数据.对很多开发者来说,Core Data比较容易入手,但很 ...
一个简单web系统的接口性能分析及调优过程
在测试一个简单系统接口性能压力时,压到一定数量,程序总是崩溃,查看相关机器相关数据时,CPU.内存.IO占用均不高,问题自然出现在其它地方先介绍下系统部件架构 Resin版本为:[root@local ...

Weka——PrincipalComponents分析

Weka——PrincipalComponents分析的更多相关文章

随机推荐

热门专题