[Java] 数据分析--分类

ID3算法

思路：分类算法的输入为训练集，输出为对数据进行分类的函数。ID3算法为分类函数生成分类树
需求：对水果训练集的一个维度（是否甜）进行预测
实现：决策树，熵函数，ID3，weka库 J48类

ComputeGain.java

 1 public class ComputeGain {

 2     public static void main(String[] args) {

 3         System.out.printf("h(11,16) = %.4f%n", h(11,16));

 4         System.out.println("Gain(Size):");

 5         System.out.printf("\th(3,5) = %.4f%n", h(3,5));

 6         System.out.printf("\th(6,7) = %.4f%n", h(6,7));

 7         System.out.printf("\th(2,4) = %.4f%n", h(2,4));

 8         System.out.printf("\tg({3,6,2},{5,7,4}) = %.4f%n",

 9                     g(new int[]{3,6,2},new int[]{5,7,4}));

10         System.out.println("Gain(Color):");

11         System.out.printf("\th(3,4) = %.4f%n", h(3,4));

12         System.out.printf("\th(3,5) = %.4f%n", h(3,5));

13         System.out.printf("\th(2,3) = %.4f%n", h(2,3));

14         System.out.printf("\th(2,4) = %.4f%n", h(2,4));

15         System.out.printf("\tg({3,3,2,2},{4,5,3,4}) = %.4f%n",

16                     g(new int[]{3,3,2,2},new int[]{4,5,3,4}));

17         System.out.println("Gain(Surface):");

18         System.out.printf("\th(4,7) = %.4f%n", h(4,7));

19         System.out.printf("\th(4,6) = %.4f%n", h(4,6));

20         System.out.printf("\th(3,3) = %.4f%n", h(3,3));

21         System.out.printf("\tg({4,4,3},{7,6,3}) = %.4f%n",

22                     g(new int[]{4,4,3},new int[]{7,6,3}));

23         System.out.println("Gain(Size|SMOOTH):");

24         System.out.printf("\th(1,3) = %.4f%n", h(1,3));

25         System.out.printf("\th(3,3) = %.4f%n", h(3,3));

26         System.out.printf("\tg({1,3,0},{3,3,1}) = %.4f%n",

27                     g(new int[]{1,3,0},new int[]{3,3,1}));

28         System.out.println("Gain(Color|SMOOTH):");

29         System.out.printf("\th(2,3) = %.4f%n", h(2,3));

30         System.out.printf("\tg({2,2,0},{3,2,2}) = %.4f%n",

31                     g(new int[]{2,2,0},new int[]{3,2,2}));

32         System.out.println("Gain(Size|ROUGH):");

33         System.out.printf("\th(3,6) = %.4f%n", h(3,6));

34         System.out.printf("\th(1,2) = %.4f%n", h(1,2));

35         System.out.printf("\tg({2,1,1},{2,2,2}) = %.4f%n",

36                     g(new int[]{2,1,1},new int[]{2,2,2}));

37         System.out.println("Gain(Color|ROUGH):");

38         System.out.printf("\th(4,6) = %.4f%n", h(4,6));

39         System.out.printf("\tg({1,1,1},{2,2,2}) = %.4f%n",

40                     g(new int[]{1,0,2,1},new int[]{1,2,2,1}));

41     }

42

43     /*  Gain for the splitting {A1, A2, ...}, where Ai

44         has n[i] points, m[i] of which are favorable.

45     */

46     public static double g(int[] m, int[] n) {

47         int sm = 0, sn = 0;

48         double nsh = 0.0;

49         for (int i = 0; i < m.length; i++) {

50             sm += m[i];

51             sn += n[i];

52             nsh += n[i]*h(m[i],n[i]);

53         }

54         return h(sm, sn) - nsh/sn;

55     }

56

57     /*  Entropy for m favorable items out of n.

58     */

59     public static double h(int m, int n) {

60         if (m == 0 || m == n) {

61             return 0;

62         }

63         double p = (double)m/n, q = 1 - p;

64         return -p*lg(p) - q*lg(q);

65     }

66

67     /*  Returns the binary logarithm of x.

68     */

69     public static double lg(double x) {

70         return Math.log(x)/Math.log(2);

71     }

72 }

h(11,16) = 0.8960
Gain(Size):
h(3,5) = 0.9710
h(6,7) = 0.5917
h(2,4) = 1.0000
g({3,6,2},{5,7,4}) = 0.0838
Gain(Color):
h(3,4) = 0.8113
h(3,5) = 0.9710
h(2,3) = 0.9183
h(2,4) = 1.0000
g({3,3,2,2},{4,5,3,4}) = 0.0260
Gain(Surface):
h(4,7) = 0.9852
h(4,6) = 0.9183
h(3,3) = 0.0000
g({4,4,3},{7,6,3}) = 0.1206
Gain(Size|SMOOTH):
h(1,3) = 0.9183
h(3,3) = 0.0000
g({1,3,0},{3,3,1}) = 0.5917
Gain(Color|SMOOTH):
h(2,3) = 0.9183
g({2,2,0},{3,2,2}) = 0.5917
Gain(Size|ROUGH):
h(3,6) = 1.0000
h(1,2) = 1.0000
g({2,1,1},{2,2,2}) = 0.2516
Gain(Color|ROUGH):
h(4,6) = 0.9183
g({1,1,1},{2,2,2}) = 0.9183

 1 import weka.classifiers.trees.J48;

 2 import weka.core.Instances;

 3 import weka.core.Instance;

 4 import weka.core.converters.ConverterUtils.DataSource;

 5

 6 public class TestWekaJ48 {

 7     public static void main(String[] args) throws Exception {

 8         DataSource source = new DataSource("data/AnonFruit.arff");

 9         Instances instances = source.getDataSet();

10         instances.setClassIndex(3);  // target attribute: (Sweet)

11

12         J48 j48 = new J48();  // an extension of ID3

13         j48.setOptions(new String[]{"-U"});  // use unpruned tree

14         j48.buildClassifier(instances);

15

16         for (Instance instance : instances) {

17             double prediction = j48.classifyInstance(instance);

18             System.out.printf("%4.0f%4.0f%n",

19                     instance.classValue(), prediction);

20         }

21     }

22 }

1 1
1 1
1 1
1 0
1 1
0 0
1 1
0 0
0 0
0 0
1 1
1 1
1 1
1 1
0 0
1 1

贝叶斯分类

思路：根据训练集统计出各属性取值在每个类别下的比率（条件概率），用这些比率构造分类函数，并选择得分最大的类别

Fruit.java

 1 import java.io.File;

 2 import java.io.FileNotFoundException;

 3 import java.util.HashSet;

 4 import java.util.Scanner;

 5 import java.util.Set;

 6

 7 public class Fruit {

 8     String name, size, color, surface;

 9     boolean sweet;

10

11     public Fruit(String name, String size, String color, String surface,

12             boolean sweet) {

13         this.name = name;

14         this.size = size;

15         this.color = color;

16         this.surface = surface;

17         this.sweet = sweet;

18     }

19

20     @Override

21     public String toString() {

22         return String.format("%-12s%-8s%-8s%-8s%s",

23                 name, size, color, surface, (sweet? "T": "F") );

24     }

25

26     public static Set<Fruit> loadData(File file) {

27         Set<Fruit> fruits = new HashSet();

28         try {

29             Scanner input = new Scanner(file);

30             for (int i = 0; i < 7; i++) {  // read past metadata

31                 input.nextLine();

32             }

33             while (input.hasNextLine()) {

34                 String line = input.nextLine();

35                 Scanner lineScanner = new Scanner(line);

36                 String name = lineScanner.next();

37                 String size = lineScanner.next();

38                 String color = lineScanner.next();

39                 String surface = lineScanner.next();

40                 boolean sweet = (lineScanner.next().equals("T"));

41                 Fruit fruit = new Fruit(name, size, color, surface, sweet);

42                 fruits.add(fruit);

43             }

44         } catch (FileNotFoundException e) {

45             System.err.println(e);

46         }

47         return fruits;

48     }

49

50     public static void print(Set<Fruit> fruits) {

51         int k=1;

52         for (Fruit fruit : fruits) {

53             System.out.printf("%2d. %s%n", k++, fruit);

54         }

55     }

56 }

BayesianTest.java

 1 import java.io.File;

 2 import java.util.Set;

 3

 4 public class BayesianTest {

 5     private static Set<Fruit> fruits;

 6

 7     public static void main(String[] args) {

 8         fruits = Fruit.loadData(new File("data/Fruit.arff"));

 9         Fruit fruit = new Fruit("cola", "SMALL", "RED", "SMOOTH", false);

10         double n = fruits.size();  // total number of fruits in training set

11         double sum1 = 0;           // number of sweet fruits

12         for (Fruit f : fruits) {

13             sum1 += (f.sweet? 1: 0);

14         }

15         double sum2 = n - sum1;    // number of sour fruits

16         double[][] p = new double[4][3];

17         for (Fruit f : fruits) {

18             if (f.sweet) {

19                 p[1][1] += (f.size.equals(fruit.size)? 1: 0)/sum1;

20                 p[2][1] += (f.color.equals(fruit.color)? 1: 0)/sum1;

21                 p[3][1] += (f.surface.equals(fruit.surface)? 1: 0)/sum1;

22             } else {

23                 p[1][2] += (f.size.equals(fruit.size)? 1: 0)/sum2;

24                 p[2][2] += (f.color.equals(fruit.color)? 1: 0)/sum2;

25                 p[3][2] += (f.surface.equals(fruit.surface)? 1: 0)/sum2;

26             }

27         }

28         double pc1 = p[1][1]*p[2][1]*p[3][1]*sum1/n;

29         double pc2 = p[1][2]*p[2][2]*p[3][2]*sum2/n;

30         System.out.printf("pc1 = %.4f, pc2 = %.4f%n", pc1, pc2);

31         System.out.printf("Predict %s is %s.%n",

32                 fruit.name, (pc1 > pc2? "sweet": "sour"));

33     }

34 }

pc1 = 0.0186, pc2 = 0.0150
Predict cola is sweet.

TestWekaBayes.java

 1 import java.util.List;

 2 import weka.classifiers.Evaluation;

 3 import weka.classifiers.bayes.NaiveBayes;

 4 import weka.classifiers.evaluation.Prediction;

 5 import weka.core.Instance;

 6 import weka.core.Instances;

 7 import weka.core.converters.ConverterUtils;

 8 import weka.core.converters.ConverterUtils.DataSource;

 9

10 public class TestWekaBayes {

11     public static void main(String[] args) throws Exception {

12 //        ConverterUtils.DataSource source = new ConverterUtils.DataSource("data/AnonFruit.arff");

13         DataSource source = new DataSource("data/AnonFruit.arff");

14         Instances train = source.getDataSet();

15         train.setClassIndex(3);  // target attribute: (Sweet)

16         //build model

17         NaiveBayes model=new NaiveBayes();

18         model.buildClassifier(train);

19

20         //use

21         Instances test = train;

22         Evaluation eval = new Evaluation(test);

23         eval.evaluateModel(model,test);

24         List <Prediction> predictions = eval.predictions();

25         int k = 0;

26         for (Instance instance : test) {

27             double actual = instance.classValue();

28             double prediction = eval.evaluateModelOnce(model, instance);

29             System.out.printf("%2d.%4.0f%4.0f", ++k, actual, prediction);

30             System.out.println(prediction != actual? " *": "");

31         }

32     }

33 }

1. 1 1
2. 1 1
3. 1 1
4. 1 1
5. 1 1
6. 0 1 *
7. 1 1
8. 0 0
9. 0 0
10. 0 1 *
11. 1 1
12. 1 1
13. 1 1
14. 1 1
15. 0 0
16. 1 1

SVM算法

思路：生成超平面方程，计算数据点位于哪一边

逻辑回归

思路：当目标属性为布尔值时，先把各组的成功比率 p 通过 logit 变换转化为数值变量 y = logit(p)，再在转化后的问题上进行线性回归，最后用 sigmoid 函数把预测值还原为概率
需求：某政党候选人想知道选举获胜的花费
实现

 1 import org.apache.commons.math3.analysis.function.*;

 2 import org.apache.commons.math3.stat.regression.SimpleRegression;

 3

 4 public class LogisticRegression {

 5     static int n = 6;

 6     static double[] x = {5, 15, 25, 35, 45, 55};

 7     static double[] p = {2./6,2./5, 4./8, 5./9, 3./5, 4./5};

 8     static double[] y = new double[n];    // y = logit(p)

 9

10     public static void main(String[] args) {

11

12         //  Transform p-values into y-values:

13         Logit logit = new Logit();

14         for (int i = 0; i < n; i++) {

15             y[i] = logit.value(p[i]);

16         }

17

18         //  Set up input array for linear regression:

19         double[][] data = new double[n][n];

20         for (int i = 0; i < n; i++) {

21             data[i][0] = x[i];

22             data[i][1] = y[i];

23         }

24

25         //  Run linear regression of y on x:

26         SimpleRegression sr = new SimpleRegression();

27         sr.addData(data);

28

29         //  Print results:

30         for (int i = 0; i < n; i++) {

31             System.out.printf("x = %2.0f, y = %7.4f%n", x[i], sr.predict(x[i]));

32         }

33         System.out.println();

34

35         //  Convert y-values back to p-values:

36         Sigmoid sigmoid = new Sigmoid();

37         for (int i = 0; i < n; i++) {

38             double p = sr.predict(x[i]);

39             System.out.printf("x = %2.0f, p = %6.4f%n", x[i], sigmoid.value(p));

40         }

41     }

42 }

x = 5, y = -0.7797
x = 15, y = -0.4067
x = 25, y = -0.0338
x = 35, y = 0.3392
x = 45, y = 0.7121
x = 55, y = 1.0851

x = 5, p = 0.3144
x = 15, p = 0.3997
x = 25, p = 0.4916
x = 35, p = 0.5840
x = 45, p = 0.6709
x = 55, p = 0.7475

k近邻（kNN）

思路：根据与待分类样本距离最近的 k 个训练样本的类别进行分类

 1 import weka.classifiers.lazy.IBk;  // K-Nearest Neighbors

 2 import weka.core.Instances;

 3 import weka.core.Instance;

 4 import weka.core.converters.ConverterUtils.DataSource;

 5

 6 public class TestIBk {

 7     public static void main(String[] args) throws Exception {

 8         DataSource source = new DataSource("data/AnonFruit.arff");

 9         Instances instances = source.getDataSet();

10         instances.setClassIndex(3);  // target attribute: (Sweet)

11

12         IBk ibk = new IBk();

13         ibk.buildClassifier(instances);

14

15         for (Instance instance : instances) {

16             double prediction = ibk.classifyInstance(instance);

17             System.out.printf("%4.0f%4.0f%n",

18                     instance.classValue(), prediction);

19         }

20     }

21 }

1 1
1 1
1 1
1 0
1 1
0 0
1 1
0 0
0 0
0 0
1 1
1 1
1 1
1 1
0 0
1 1

[Java] 数据分析--分类的更多相关文章

Java异常分类转载
Java异常分类 http://blog.csdn.net/woshixuye/article/details/8230407 一.基本概念看java的异常结构图 Throwable是所有异 ...
Java注释分类
Java注释分类 1.单行注释 //打印结果 System.out.println("结果是:"+result); 2.多行注释 /** * @autho ...
Java应用分类
Java应用分类一.应用程序.指在操作系统上直接运行的,不是浏览器,Java环境用本机的,需要在客户端安装,Java环境可以一起安装. 1.GUI图形界面应用程序 ...
Java凝视分类
Java凝视分类 1.单行凝视 //打印结果 System.out.println("结果是:"+result); 2.多行凝视 /** * @autho ...
Java集合【1】--俯瞰java集合分类
目录 (一) java集合分类 (1) Iterable接口 1. 内部定义的方法 1.1 iterator方法 1.2 forEach方法 1.3 spliterator方法 2. Collecti ...
irms模拟数据生成及数据分析分类： H_HISTORY 2015-03-06 14:17 212人阅读评论(0) 收藏
一.数据准备 1.每天生成随机一个文本,每小时向文本中追加2次数据,每次10万条随机数据生成: 2,32 * * * * bash /mnt/jediael/irms/signalGenerat ...
各种排序算法的分析及java实现分类： B10_计算机基础 2015-02-03 20:09 186人阅读评论(0) 收藏
转载自:http://www.cnblogs.com/liuling/p/2013-7-24-01.html 另可参考:http://gengning938.blog.163.com/blog/sta ...
Java 引用分类：StrongReference、SoftReference、WeakReference、PhantomReference
一,定义在Java中,引用的定义是:如果reference类型的数据中存储的数值代表的是另一块内存的起始地址,就称这块内存代表着一个引用.后面在JDK1.2开始,引用的概念被扩充,引用被分为强引用( ...
Java异常分类
一.基本概念看java的异常结构图 Throwable是所有异常的根,java.lang.ThrowableError是错误,java.lang.ErrorException是异常,java.lan ...

随机推荐

阳明-K8S训练营全部文档-2020年08月11日14:59:02更新
阳明-K8S训练营全部文档 Docker 基础简介安装基本操作 Dockerfile Dockerfile最佳实践 Kubernetes 基础简介安装资源清单 Pod 原理 Pod 生命周 ...
JMeter元件作用域实践指南
从一个问题说起对于以下测试脚本: 为了能调用进入房间接口,需要从考场接口获取考场token.为了调用考场接口,需要从登陆接口获取登陆token.元件说明如下: 学生登录,提取登录${token}传入 ...
Spring Boot 轻量替代框架 Solon 1.3.20 发布
Solon 是一个微型的Java开发框架.项目2018年启动,参考过大量前人作品:内核0.1m的身材,超高的跑分,以及良好的使用体验.支持:RPC.REST API.MVC.WebSocket.Soc ...
201871030115-康旭实验二软件工程个人项目—《D{0-1} KP》项目报告
项目内容课程班级博客连接课程班级这个作业要求连接作业链接我的课程学习目标 (1)详细阅读<构建之法>第1章.第2章,掌握PSP流程:(2)设计实际程序掌握动态规划算法.回溯算法 ...
通过Dapr实现一个简单的基于.net的微服务电商系统
本来想在Dpar 1.0GA时发布这篇文章,由于其他事情耽搁了放到现在.时下微服务和云原生技术如何如荼,微软也不甘示弱的和阿里一起适时推出了Dapr(https://dapr.io/),园子里关于da ...
NLP入门学习中关于分词库HanLP导入使用教程
大家好,时隔多年再次打开我的博客园写下自己的经验和学习总结,开园三年多,文章数少得可怜,一方面自己技术水平局限,另一方面是自己确实想放弃写博客.由于毕业工作的原因,经常性的加班以及仅剩下少的可怜的休息 ...
前端缓存API请求数据
1. 背景在一些项目中,有时候会出现不同模块重复请求大量相同api接口的情况,特别是在一些功能相似的后台管理页面中.以下面这几个页面为例,每次进入页面都需要请求等大量重复的下拉框数据,下拉框数据短时 ...
1036 Boys vs Girls
This time you are asked to tell the difference between the lowest grade of all the male students and ...
1089 Insert or Merge
According to Wikipedia: Insertion sort iterates, consuming one input element each repetition, and gr ...
问题：dependencyManagement和dependencies有什么区别
dependencyManagement和dependencies有什么区别一.Maven的包管理在maven中,dependencyManagement.dependencies和depende ...

[Java] 数据分析--分类

[Java] 数据分析--分类的更多相关文章

随机推荐

热门专题