ID3算法

  • 思路:分类算法的输入为训练集,输出为对数据进行分类的函数。ID3算法为分类函数生成分类树
  • 需求:对水果训练集的一个维度(是否甜)进行预测
  • 实现:决策树,熵函数,ID3,weka库 J48类

ComputeGain.java

 1 public class ComputeGain {
2 public static void main(String[] args) {
3 System.out.printf("h(11,16) = %.4f%n", h(11,16));
4 System.out.println("Gain(Size):");
5 System.out.printf("\th(3,5) = %.4f%n", h(3,5));
6 System.out.printf("\th(6,7) = %.4f%n", h(6,7));
7 System.out.printf("\th(2,4) = %.4f%n", h(2,4));
8 System.out.printf("\tg({3,6,2},{5,7,4}) = %.4f%n",
9 g(new int[]{3,6,2},new int[]{5,7,4}));
10 System.out.println("Gain(Color):");
11 System.out.printf("\th(3,4) = %.4f%n", h(3,4));
12 System.out.printf("\th(3,5) = %.4f%n", h(3,5));
13 System.out.printf("\th(2,3) = %.4f%n", h(2,3));
14 System.out.printf("\th(2,4) = %.4f%n", h(2,4));
15 System.out.printf("\tg({3,3,2,2},{4,5,3,4}) = %.4f%n",
16 g(new int[]{3,3,2,2},new int[]{4,5,3,4}));
17 System.out.println("Gain(Surface):");
18 System.out.printf("\th(4,7) = %.4f%n", h(4,7));
19 System.out.printf("\th(4,6) = %.4f%n", h(4,6));
20 System.out.printf("\th(3,3) = %.4f%n", h(3,3));
21 System.out.printf("\tg({4,4,3},{7,6,3}) = %.4f%n",
22 g(new int[]{4,4,3},new int[]{7,6,3}));
23 System.out.println("Gain(Size|SMOOTH):");
24 System.out.printf("\th(1,3) = %.4f%n", h(1,3));
25 System.out.printf("\th(3,3) = %.4f%n", h(3,3));
26 System.out.printf("\tg({1,3,0},{3,3,1}) = %.4f%n",
27 g(new int[]{1,3,0},new int[]{3,3,1}));
28 System.out.println("Gain(Color|SMOOTH):");
29 System.out.printf("\th(2,3) = %.4f%n", h(2,3));
30 System.out.printf("\tg({2,2,0},{3,2,2}) = %.4f%n",
31 g(new int[]{2,2,0},new int[]{3,2,2}));
32 System.out.println("Gain(Size|ROUGH):");
33 System.out.printf("\th(3,6) = %.4f%n", h(3,6));
34 System.out.printf("\th(1,2) = %.4f%n", h(1,2));
35 System.out.printf("\tg({2,1,1},{2,2,2}) = %.4f%n",
36 g(new int[]{2,1,1},new int[]{2,2,2}));
37 System.out.println("Gain(Color|ROUGH):");
38 System.out.printf("\th(4,6) = %.4f%n", h(4,6));
39 System.out.printf("\tg({1,1,1},{2,2,2}) = %.4f%n",
40 g(new int[]{1,0,2,1},new int[]{1,2,2,1}));
41 }
42
43 /* Gain for the splitting {A1, A2, ...}, where Ai
44 has n[i] points, m[i] of which are favorable.
45 */
46 public static double g(int[] m, int[] n) {
47 int sm = 0, sn = 0;
48 double nsh = 0.0;
49 for (int i = 0; i < m.length; i++) {
50 sm += m[i];
51 sn += n[i];
52 nsh += n[i]*h(m[i],n[i]);
53 }
54 return h(sm, sn) - nsh/sn;
55 }
56
57 /* Entropy for m favorable items out of n.
58 */
59 public static double h(int m, int n) {
60 if (m == 0 || m == n) {
61 return 0;
62 }
63 double p = (double)m/n, q = 1 - p;
64 return -p*lg(p) - q*lg(q);
65 }
66
67 /* Returns the binary logarithm of x.
68 */
69 public static double lg(double x) {
70 return Math.log(x)/Math.log(2);
71 }
72 }

h(11,16) = 0.8960
Gain(Size):
h(3,5) = 0.9710
h(6,7) = 0.5917
h(2,4) = 1.0000
g({3,6,2},{5,7,4}) = 0.0838
Gain(Color):
h(3,4) = 0.8113
h(3,5) = 0.9710
h(2,3) = 0.9183
h(2,4) = 1.0000
g({3,3,2,2},{4,5,3,4}) = 0.0260
Gain(Surface):
h(4,7) = 0.9852
h(4,6) = 0.9183
h(3,3) = 0.0000
g({4,4,3},{7,6,3}) = 0.1206
Gain(Size|SMOOTH):
h(1,3) = 0.9183
h(3,3) = 0.0000
g({1,3,0},{3,3,1}) = 0.5917
Gain(Color|SMOOTH):
h(2,3) = 0.9183
g({2,2,0},{3,2,2}) = 0.5917
Gain(Size|ROUGH):
h(3,6) = 1.0000
h(1,2) = 1.0000
g({2,1,1},{2,2,2}) = 0.2516
Gain(Color|ROUGH):
h(4,6) = 0.9183
g({1,1,1},{2,2,2}) = 0.9183

 1 import weka.classifiers.trees.J48;
2 import weka.core.Instances;
3 import weka.core.Instance;
4 import weka.core.converters.ConverterUtils.DataSource;
5
6 public class TestWekaJ48 {
7 public static void main(String[] args) throws Exception {
8 DataSource source = new DataSource("data/AnonFruit.arff");
9 Instances instances = source.getDataSet();
10 instances.setClassIndex(3); // target attribute: (Sweet)
11
12 J48 j48 = new J48(); // an extension of ID3
13 j48.setOptions(new String[]{"-U"}); // use unpruned tree
14 j48.buildClassifier(instances);
15
16 for (Instance instance : instances) {
17 double prediction = j48.classifyInstance(instance);
18 System.out.printf("%4.0f%4.0f%n",
19 instance.classValue(), prediction);
20 }
21 }
22 }

1 1
1 1
1 1
1 0
1 1
0 0
1 1
0 0
0 0
0 0
1 1
1 1
1 1
1 1
0 0
1 1

贝叶斯分类

  • 思路:基于训练集计算的比率生成的函数进行分类

Fruit.java

 1 import java.io.File;
2 import java.io.FileNotFoundException;
3 import java.util.HashSet;
4 import java.util.Scanner;
5 import java.util.Set;
6
7 public class Fruit {
8 String name, size, color, surface;
9 boolean sweet;
10
11 public Fruit(String name, String size, String color, String surface,
12 boolean sweet) {
13 this.name = name;
14 this.size = size;
15 this.color = color;
16 this.surface = surface;
17 this.sweet = sweet;
18 }
19
20 @Override
21 public String toString() {
22 return String.format("%-12s%-8s%-8s%-8s%s",
23 name, size, color, surface, (sweet? "T": "F") );
24 }
25
26 public static Set<Fruit> loadData(File file) {
27 Set<Fruit> fruits = new HashSet();
28 try {
29 Scanner input = new Scanner(file);
30 for (int i = 0; i < 7; i++) { // read past metadata
31 input.nextLine();
32 }
33 while (input.hasNextLine()) {
34 String line = input.nextLine();
35 Scanner lineScanner = new Scanner(line);
36 String name = lineScanner.next();
37 String size = lineScanner.next();
38 String color = lineScanner.next();
39 String surface = lineScanner.next();
40 boolean sweet = (lineScanner.next().equals("T"));
41 Fruit fruit = new Fruit(name, size, color, surface, sweet);
42 fruits.add(fruit);
43 }
44 } catch (FileNotFoundException e) {
45 System.err.println(e);
46 }
47 return fruits;
48 }
49
50 public static void print(Set<Fruit> fruits) {
51 int k=1;
52 for (Fruit fruit : fruits) {
53 System.out.printf("%2d. %s%n", k++, fruit);
54 }
55 }
56 }

BayesianTest.java

 1 import java.io.File;
2 import java.util.Set;
3
4 public class BayesianTest {
5 private static Set<Fruit> fruits;
6
7 public static void main(String[] args) {
8 fruits = Fruit.loadData(new File("data/Fruit.arff"));
9 Fruit fruit = new Fruit("cola", "SMALL", "RED", "SMOOTH", false);
10 double n = fruits.size(); // total number of fruits in training set
11 double sum1 = 0; // number of sweet fruits
12 for (Fruit f : fruits) {
13 sum1 += (f.sweet? 1: 0);
14 }
15 double sum2 = n - sum1; // number of sour fruits
16 double[][] p = new double[4][3];
17 for (Fruit f : fruits) {
18 if (f.sweet) {
19 p[1][1] += (f.size.equals(fruit.size)? 1: 0)/sum1;
20 p[2][1] += (f.color.equals(fruit.color)? 1: 0)/sum1;
21 p[3][1] += (f.surface.equals(fruit.surface)? 1: 0)/sum1;
22 } else {
23 p[1][2] += (f.size.equals(fruit.size)? 1: 0)/sum2;
24 p[2][2] += (f.color.equals(fruit.color)? 1: 0)/sum2;
25 p[3][2] += (f.surface.equals(fruit.surface)? 1: 0)/sum2;
26 }
27 }
28 double pc1 = p[1][1]*p[2][1]*p[3][1]*sum1/n;
29 double pc2 = p[1][2]*p[2][2]*p[3][2]*sum2/n;
30 System.out.printf("pc1 = %.4f, pc2 = %.4f%n", pc1, pc2);
31 System.out.printf("Predict %s is %s.%n",
32 fruit.name, (pc1 > pc2? "sweet": "sour"));
33 }
34 }

pc1 = 0.0186, pc2 = 0.0150
Predict cola is sweet.

TestWekaBayes.java

 1 import java.util.List;
2 import weka.classifiers.Evaluation;
3 import weka.classifiers.bayes.NaiveBayes;
4 import weka.classifiers.evaluation.Prediction;
5 import weka.core.Instance;
6 import weka.core.Instances;
7 import weka.core.converters.ConverterUtils;
8 import weka.core.converters.ConverterUtils.DataSource;
9
10 public class TestWekaBayes {
11 public static void main(String[] args) throws Exception {
12 // ConverterUtils.DataSource source = new ConverterUtils.DataSource("data/AnonFruit.arff");
13 DataSource source = new DataSource("data/AnonFruit.arff");
14 Instances train = source.getDataSet();
15 train.setClassIndex(3); // target attribute: (Sweet)
16 //build model
17 NaiveBayes model=new NaiveBayes();
18 model.buildClassifier(train);
19
20 //use
21 Instances test = train;
22 Evaluation eval = new Evaluation(test);
23 eval.evaluateModel(model,test);
24 List <Prediction> predictions = eval.predictions();
25 int k = 0;
26 for (Instance instance : test) {
27 double actual = instance.classValue();
28 double prediction = eval.evaluateModelOnce(model, instance);
29 System.out.printf("%2d.%4.0f%4.0f", ++k, actual, prediction);
30 System.out.println(prediction != actual? " *": "");
31 }
32 }
33 }

1. 1 1
2. 1 1
3. 1 1
4. 1 1
5. 1 1
6. 0 1 *
7. 1 1
8. 0 0
9. 0 0
10. 0 1 *
11. 1 1
12. 1 1
13. 1 1
14. 1 1
15. 0 0
16. 1 1

SVM算法

  • 思路:生成超平面方程,计算数据点位于哪一边

逻辑回归

  • 思路:将目标值属性为布尔值的问题转化成一个数值变量,在转化后的问题上进行线性回归
  • 需求:某政党候选人想知道选举获胜的花费
  • 实现

 1 import org.apache.commons.math3.analysis.function.*;
2 import org.apache.commons.math3.stat.regression.SimpleRegression;
3
4 public class LogisticRegression {
5 static int n = 6;
6 static double[] x = {5, 15, 25, 35, 45, 55};
7 static double[] p = {2./6,2./5, 4./8, 5./9, 3./5, 4./5};
8 static double[] y = new double[n]; // y = logit(p)
9
10 public static void main(String[] args) {
11
12 // Transform p-values into y-values:
13 Logit logit = new Logit();
14 for (int i = 0; i < n; i++) {
15 y[i] = logit.value(p[i]);
16 }
17
18 // Set up input array for linear regression:
19 double[][] data = new double[n][n];
20 for (int i = 0; i < n; i++) {
21 data[i][0] = x[i];
22 data[i][1] = y[i];
23 }
24
25 // Run linear regression of y on x:
26 SimpleRegression sr = new SimpleRegression();
27 sr.addData(data);
28
29 // Print results:
30 for (int i = 0; i < n; i++) {
31 System.out.printf("x = %2.0f, y = %7.4f%n", x[i], sr.predict(x[i]));
32 }
33 System.out.println();
34
35 // Convert y-values back to p-values:
36 Sigmoid sigmoid = new Sigmoid();
37 for (int i = 0; i < n; i++) {
38 double p = sr.predict(x[i]);
39 System.out.printf("x = %2.0f, p = %6.4f%n", x[i], sigmoid.value(p));
40 }
41 }
42 }

x = 5, y = -0.7797
x = 15, y = -0.4067
x = 25, y = -0.0338
x = 35, y = 0.3392
x = 45, y = 0.7121
x = 55, y = 1.0851

x = 5, p = 0.3144
x = 15, p = 0.3997
x = 25, p = 0.4916
x = 35, p = 0.5840
x = 45, p = 0.6709
x = 55, p = 0.7475

k临近

  • 思路:根据临近范围内的样本进行分类

 1 import weka.classifiers.lazy.IBk;  // K-Nearest Neighbors
2 import weka.core.Instances;
3 import weka.core.Instance;
4 import weka.core.converters.ConverterUtils.DataSource;
5
6 public class TestIBk {
7 public static void main(String[] args) throws Exception {
8 DataSource source = new DataSource("data/AnonFruit.arff");
9 Instances instances = source.getDataSet();
10 instances.setClassIndex(3); // target attribute: (Sweet)
11
12 IBk ibk = new IBk();
13 ibk.buildClassifier(instances);
14
15 for (Instance instance : instances) {
16 double prediction = ibk.classifyInstance(instance);
17 System.out.printf("%4.0f%4.0f%n",
18 instance.classValue(), prediction);
19 }
20 }
21 }

1 1
1 1
1 1
1 0
1 1
0 0
1 1
0 0
0 0
0 0
1 1
1 1
1 1
1 1
0 0
1 1

[Java] 数据分析--分类的更多相关文章

  1. Java异常分类 转载

    Java异常分类 http://blog.csdn.net/woshixuye/article/details/8230407     一.基本概念 看java的异常结构图 Throwable是所有异 ...

  2. Java注释分类

    Java注释分类 1.单行注释    //打印结果    System.out.println("结果是:"+result); 2.多行注释    /**     * @autho ...

  3. Java应用分类

    Java应用分类     一.应用程序.指在操作系统上直接运行的,不是浏览器,Java环境用本机的,需要在客户端安装,Java环境可以一起安装.         1.GUI图形界面应用程序       ...

  4. Java凝视分类

    Java凝视分类 1.单行凝视    //打印结果    System.out.println("结果是:"+result); 2.多行凝视    /**     * @autho ...

  5. Java集合【1】--俯瞰java集合分类

    目录 (一) java集合分类 (1) Iterable接口 1. 内部定义的方法 1.1 iterator方法 1.2 forEach方法 1.3 spliterator方法 2. Collecti ...

  6. irms模拟数据生成及数据分析 分类: H_HISTORY 2015-03-06 14:17 212人阅读 评论(0) 收藏

    一.数据准备 1.每天生成随机一个文本,每小时向文本中追加2次数据,每次10万条 随机数据生成: 2,32  * * * *  bash /mnt/jediael/irms/signalGenerat ...

  7. 各种排序算法的分析及java实现 分类: B10_计算机基础 2015-02-03 20:09 186人阅读 评论(0) 收藏

    转载自:http://www.cnblogs.com/liuling/p/2013-7-24-01.html 另可参考:http://gengning938.blog.163.com/blog/sta ...

  8. Java 引用分类:StrongReference、SoftReference、WeakReference、PhantomReference

    一,定义 在Java中,引用的定义是:如果reference类型的数据中存储的数值代表的是另一块内存的起始地址,就称这块内存代表着一个引用.后面在JDK1.2开始,引用的概念被扩充,引用被分为强引用( ...

  9. Java异常分类

    一.基本概念 看java的异常结构图 Throwable是所有异常的根,java.lang.ThrowableError是错误,java.lang.ErrorException是异常,java.lan ...

随机推荐

  1. HTML5获取地理位置定位信息

    如何使用HTML5地理位置定位功能 定位功能(Geolocation)是HTML5的新特性,因此只有在支持HTML5的现代浏览器上运行,特别是手持设备如iphone,地理定位更加精确.首先我们要检测用 ...

  2. Java利用线程工厂监控线程池

    目录 ThreadFactory 监控线程池 扩展线程池 扩展线程池示例 优化线程池大小 线程池死锁 线程池异常信息捕获 ThreadFactory 线程池中的线程从哪里来呢?就是ThreadFoct ...

  3. Fundamentals of Power Electronics 目录

    Fundamentals of Power Electronics Translated By Siwei Yang (前六章翻译自Edition 2,后面部分翻译自Edition 3) Part I ...

  4. java面试-G1垃圾收集器

    一.以前收集器的特点 年轻代和老年代是各自独立且连续的内存块 年轻代收集器使用 eden + S0 + S1 进行复制算法 老年代收集必须扫描整个老年代区域 都是以尽可能的少而快速地执行 GC 为设计 ...

  5. python基础(四):切片和索引

    Python中的序列有元组.列表和字符串,因此我们都可以通过索引和切片的方式,来获取其中的元素. 索引 Python中的索引,对于正向索引,都是从0开始的.但是对于反向索引,确实从-1开始的.如图所示 ...

  6. hbuilderX打包苹果证书的申请方法

    现在uniapp越来越火,hbuilderX和apicloud这些工具使用html+js语言就可以开发强大的app,大大降低了app开发的技术门槛. hbuilderX或apicloud在打包ios应 ...

  7. 自动化kolla-ansible部署ubuntu20.04+openstack-victoria之基础配置-04

    自动化kolla-ansible部署ubuntu20.04+openstack-victoria之基础配置-04 欢迎加QQ群:1026880196 进行交流学习 近期我发现网上有人转载或者复制原创博 ...

  8. shell脚本 4 函数与正则

    shell函数 shell中允许将一组命令集合或语句形成一段可用代码,这些代码块称为shell函数.给这段代码起个名字称为函数名,后续可以直接调用该段代码. 格式 func() {   #指定函数名 ...

  9. HUAWEI AppGallery Connect 正式发布移动端App,随时随地掌握应用动态

    华为应用市场AppGallery Connect应用一站式服务平台正式发布移动端App,帮助您随时随地查看应用信息,获取运营分析数据,接收重要消息通知,快速回复用户评论等,提升应用的运营管理效率,更便 ...

  10. WebGL之绘制三维地球

    通过Three.js也许可以很方便的展示出3D模型,但是你知道它是怎么一步一步从构建网格到贴图到最终渲染出3D模型的吗?现在我们直接使用底层的webgl加上一点点的数学知识就可以实现它. 本节实现的效 ...