OpenCV码源笔记——Decision Tree决策树

来自OpenCV2.3.1 sample/c/mushroom.cpp

1.首先读入agaricus-lepiota.data的训练样本。

样本中第一项是e或p代表有毒或无毒的标志位；其他是特征，可以把每个样本看做一个特征向量；

cvSeqPush( seq, el_ptr );读入序列seq中，每一项都存储一个样本即特征向量；

之后，把特征向量与标志位分别读入CvMat* data与CvMat* reponses中

还有一个CvMat* missing保留丢失位当前小于0位置；

2.训练样本

dtree = new CvDTree;
dtree->train( data, CV_ROW_SAMPLE, responses, 0, 0, var_type, missing,
CvDTreeParams( 8, // max depth
10, // min sample count 样本数小于10时，停止分裂
0, // regression accuracy: N/A here；回归树的限制精度
true, // compute surrogate split, as we have missing data；；为真时，计算missing data和变量的重要性
15, // max number of categories (use sub-optimal algorithm for larger numbers)类型上限以保证计算速度。树会以次优分裂（suboptimal split）的形式生长。只对2种取值以上的树有意义
10, // the number of cross-validation folds；If cv_folds > 1 then prune a tree with K-fold cross-validation where K is equal to cv_folds
true, // use 1SE rule => smaller tree；If true 修剪树. 这将使树更紧凑,更能抵抗训练数据噪声,但有点不太准确
true, // throw away the pruned tree branches
priors //错分类的代价我们判断的：有毒VS无毒错误的代价比 the array of priors, the bigger p_weight, the more attention
// to the poisonous mushrooms
// (a mushroom will be judjed to be poisonous with bigger chance)
));

double r = dtree->predict( &sample, &mask )->value;//使用predict来预测样本，结果为 CvDTreeNode结构,dtree->predict(sample,mask)->value是分类情况下的类别或回归情况下的函数估计值；

4.interactive_classification通过人工输入特征来判断。

#include "opencv2/core/core_c.h"
#include "opencv2/ml/ml.hpp"
#include <stdio.h>
void help()
{
printf("\nThis program demonstrated the use of OpenCV's decision tree function for learning and predicting data\n"
"Usage :\n"
"./mushroom <path to agaricus-lepiota.data>\n"
"\n"
"The sample demonstrates how to build a decision tree for classifying mushrooms.\n"
"It uses the sample base agaricus-lepiota.data from UCI Repository, here is the link:\n"
"\n"
"Newman, D.J. & Hettich, S. & Blake, C.L. & Merz, C.J. (1998).\n"
"UCI Repository of machine learning databases\n"
"[http://www.ics.uci.edu/~mlearn/MLRepository.html].\n"
"Irvine, CA: University of California, Department of Information and Computer Science.\n"
"\n"
"// loads the mushroom database, which is a text file, containing\n"
"// one training sample per row, all the input variables and the output variable are categorical,\n"
"// the values are encoded by characters.\n\n");
}
int mushroom_read_database( const char* filename, CvMat** data, CvMat** missing, CvMat** responses )
{
const int M = 1024;
FILE* f = fopen( filename, "rt" );
CvMemStorage* storage;
CvSeq* seq;
char buf[M+2], *ptr;
float* el_ptr;
CvSeqReader reader;
int i, j, var_count = 0;
if( !f )
return 0;
// read the first line and determine the number of variables
if( !fgets( buf, M, f ))
{
fclose(f);
return 0;
}
for( ptr = buf; *ptr != '\0'; ptr++ )
var_count += *ptr == ',';//计算每个样本的数量，每个样本一个“，”，样本数量=var_count+1;
assert( ptr - buf == (var_count+1)*2 );
// create temporary memory storage to store the whole database
//把样本存入seq中，存储空间是storage；
el_ptr = new float[var_count+1];
storage = cvCreateMemStorage();
seq = cvCreateSeq( 0, sizeof(*seq), (var_count+1)*sizeof(float), storage );//
for(;;)
{
for( i = 0; i <= var_count; i++ )
{
int c = buf[i*2];
el_ptr[i] = c == '?' ? -1.f : (float)c;
}
if( i != var_count+1 )
break;
cvSeqPush( seq, el_ptr );
if( !fgets( buf, M, f ) || !strchr( buf, ',' ) )
break;
}
fclose(f);
// allocate the output matrices and copy the base there
*data = cvCreateMat( seq->total, var_count, CV_32F );//行数：样本数量；列数：样本大小；
*missing = cvCreateMat( seq->total, var_count, CV_8U );
*responses = cvCreateMat( seq->total, 1, CV_32F );//样本标志；
cvStartReadSeq( seq, &reader );
for( i = 0; i < seq->total; i++ )
{
const float* sdata = (float*)reader.ptr + 1;
float* ddata = data[0]->data.fl + var_count*i;
float* dr = responses[0]->data.fl + i;
uchar* dm = missing[0]->data.ptr + var_count*i;
for( j = 0; j < var_count; j++ )
{
ddata[j] = sdata[j];
dm[j] = sdata[j] < 0;
}
*dr = sdata[-1];//样本的第一个位置是标志；
CV_NEXT_SEQ_ELEM( seq->elem_size, reader );
}
cvReleaseMemStorage( &storage );
delete el_ptr;
return 1;
}
CvDTree* mushroom_create_dtree( const CvMat* data, const CvMat* missing,
const CvMat* responses, float p_weight )
{
CvDTree* dtree;
CvMat* var_type;
int i, hr1 = 0, hr2 = 0, p_total = 0;
float priors[] = { 1, p_weight };
var_type = cvCreateMat( data->cols + 1, 1, CV_8U );
cvSet( var_type, cvScalarAll(CV_VAR_CATEGORICAL) ); // all the variables are categorical
dtree = new CvDTree;
dtree->train( data, CV_ROW_SAMPLE, responses, 0, 0, var_type, missing,
CvDTreeParams( 8, // max depth
10, // min sample count样本数小于10时，停止分裂
0, // regression accuracy: N/A here；回归树的限制精度
true, // compute surrogate split, as we have missing data；为真时，计算missing data和可变的重要性正确度
15, // max number of categories (use sub-optimal algorithm for larger numbers)类型上限以保证计算速度。树会以次优分裂（suboptimal split）的形式生长。只对2种取值以上的树有意义
10, // the number of cross-validation folds；If cv_folds > 1 then prune a tree with K-fold cross-validation
true, // use 1SE rule => smaller treeIf true 修剪树. 这将使树更紧凑,更能抵抗训练数据噪声,但有点不太准确
true, // throw away the pruned tree branches
priors // the array of priors, the bigger p_weight, the more attention
// to the poisonous mushrooms
// (a mushroom will be judjed to be poisonous with bigger chance)
));
// compute hit-rate on the training database, demonstrates predict usage.
for( i = 0; i < data->rows; i++ )
{
CvMat sample, mask;
cvGetRow( data, &sample, i );
cvGetRow( missing, &mask, i );
double r = dtree->predict( &sample, &mask )->value;//使用predict来预测样本，结果为 CvDTreeNode结构,dtree->predict(sample,mask)->value是分类情况下的类别或回归情况下的函数估计值；
int d = fabs(r - responses->data.fl[i]) >= FLT_EPSILON;//大于阈值FLT_EPSILON被判断为误检
if( d )
{
if( r != 'p' )
hr1++;
else
hr2++;
}
p_total += responses->data.fl[i] == 'p';
}
printf( "Results on the training database:\n"
"\tPoisonous mushrooms mis-predicted: %d (%g%%)\n"
"\tFalse-alarms: %d (%g%%)\n", hr1, (double)hr1*100/p_total,
hr2, (double)hr2*100/(data->rows - p_total) );
cvReleaseMat( &var_type );
return dtree;
}
static const char* var_desc[] =
{
"cap shape (bell=b,conical=c,convex=x,flat=f)",
"cap surface (fibrous=f,grooves=g,scaly=y,smooth=s)",
"cap color (brown=n,buff=b,cinnamon=c,gray=g,green=r,\n\tpink=p,purple=u,red=e,white=w,yellow=y)",
"bruises? (bruises=t,no=f)",
"odor (almond=a,anise=l,creosote=c,fishy=y,foul=f,\n\tmusty=m,none=n,pungent=p,spicy=s)",
"gill attachment (attached=a,descending=d,free=f,notched=n)",
"gill spacing (close=c,crowded=w,distant=d)",
"gill size (broad=b,narrow=n)",
"gill color (black=k,brown=n,buff=b,chocolate=h,gray=g,\n\tgreen=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y)",
"stalk shape (enlarging=e,tapering=t)",
"stalk root (bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r)",
"stalk surface above ring (ibrous=f,scaly=y,silky=k,smooth=s)",
"stalk surface below ring (ibrous=f,scaly=y,silky=k,smooth=s)",
"stalk color above ring (brown=n,buff=b,cinnamon=c,gray=g,orange=o,\n\tpink=p,red=e,white=w,yellow=y)",
"stalk color below ring (brown=n,buff=b,cinnamon=c,gray=g,orange=o,\n\tpink=p,red=e,white=w,yellow=y)",
"veil type (partial=p,universal=u)",
"veil color (brown=n,orange=o,white=w,yellow=y)",
"ring number (none=n,one=o,two=t)",
"ring type (cobwebby=c,evanescent=e,flaring=f,large=l,\n\tnone=n,pendant=p,sheathing=s,zone=z)",
"spore print color (black=k,brown=n,buff=b,chocolate=h,green=r,\n\torange=o,purple=u,white=w,yellow=y)",
"population (abundant=a,clustered=c,numerous=n,\n\tscattered=s,several=v,solitary=y)",
"habitat (grasses=g,leaves=l,meadows=m,paths=p\n\turban=u,waste=w,woods=d)",
0
};
void print_variable_importance( CvDTree* dtree, const char** var_desc )
{
const CvMat* var_importance = dtree->get_var_importance();
int i;
char input[1000];
if( !var_importance )
{
printf( "Error: Variable importance can not be retrieved\n" );
return;
}
printf( "Print variable importance information? (y/n) " );
scanf( "%1s", input );
if( input[0] != 'y' && input[0] != 'Y' )
return;
for( i = 0; i < var_importance->cols*var_importance->rows; i++ )
{
double val = var_importance->data.db[i];
if( var_desc )
{
char buf[100];
int len = strchr( var_desc[i], '(' ) - var_desc[i] - 1;
strncpy( buf, var_desc[i], len );
buf[len] = '\0';
printf( "%s", buf );
}
else
printf( "var #%d", i );
printf( ": %g%%\n", val*100. );
}
}
void interactive_classification( CvDTree* dtree, const char** var_desc )
{
char input[1000];
const CvDTreeNode* root;
CvDTreeTrainData* data;
if( !dtree )
return;
root = dtree->get_root();
data = dtree->get_data();
for(;;)
{
const CvDTreeNode* node;
printf( "Start/Proceed with interactive mushroom classification (y/n): " );
scanf( "%1s", input );
if( input[0] != 'y' && input[0] != 'Y' )
break;
printf( "Enter 1-letter answers, '?' for missing/unknown value...\n" );
// custom version of predict
//传统的预测方式;
node = root;
for(;;)
{
CvDTreeSplit* split = node->split;
int dir = 0;
if( !node->left || node->Tn <= dtree->get_pruned_tree_idx() || !node->split )
break;
for( ; split != 0; )
{
int vi = split->var_idx, j;
int count = data->cat_count->data.i[vi];
const int* map = data->cat_map->data.i + data->cat_ofs->data.i[vi];
printf( "%s: ", var_desc[vi] );
scanf( "%1s", input );
if( input[0] == '?' )
{
split = split->next;
continue;
}
// convert the input character to the normalized value of the variable
for( j = 0; j < count; j++ )
if( map[j] == input[0] )
break;
if( j < count )
{
dir = (split->subset[j>>5] & (1 << (j&31))) ? -1 : 1;
if( split->inversed )
dir = -dir;
break;
}
else
printf( "Error: unrecognized value\n" );
}
if( !dir )
{
printf( "Impossible to classify the sample\n");
node = 0;
break;
}
node = dir < 0 ? node->left : node->right;
}
if( node )
printf( "Prediction result: the mushroom is %s\n",
node->class_idx == 0 ? "EDIBLE" : "POISONOUS" );
printf( "\n-----------------------------\n" );
}
}
int main( int argc, char** argv )
{
CvMat *data = 0, *missing = 0, *responses = 0;
CvDTree* dtree;
const char* base_path = argc >= 2 ? argv[1] : "agaricus-lepiota.data";
help();
if( !mushroom_read_database( base_path, &data, &missing, &responses ) )
{
printf( "\nUnable to load the training database\n\n");
help();
return -1;
}
dtree = mushroom_create_dtree( data, missing, responses,
10 // poisonous mushrooms will have 10x higher weight in the decision tree
);
cvReleaseMat( &data );
cvReleaseMat( &missing );
cvReleaseMat( &responses );
print_variable_importance( dtree, var_desc );
interactive_classification( dtree, var_desc );
delete dtree;
return 0;
}
//from: http://blog.csdn.net/yangtrees/article/details/7490852

OpenCV码源笔记——Decision Tree决策树的更多相关文章

OpenCV码源笔记——RandomTrees (二)(Forest)
源码细节: ● 训练函数 bool CvRTrees::train( const CvMat* _train_data, int _tflag, cons ...
OpenCV码源笔记——RandomTrees (一)
OpenCV2.3中Random Trees(R.T.)的继承结构: API: CvRTParams 定义R.T.训练用参数,CvDTreeParams的扩展子类,但并不用到CvDTreeParams ...
Decision tree(决策树)算法初探
0. 算法概述决策树(decision tree)是一种基本的分类与回归方法.决策树模型呈树形结构(二分类思想的算法模型往往都是树形结构) 0x1:决策树模型的不同角度理解在分类问题中,表示基于特 ...
decision tree 决策树（一）
一决策树原理:分类决策树模型是一种描述对实例进行分类的树形结构.决策树由结点(node)和有向边(directed edge)组成.结点有两种类型:内部结点(internal node)和叶结点( ...
Decision tree——决策树
基本流程决策树是通过分次判断样本属性来进行划分样本类别的机器学习模型.每个树的结点选择一个最优属性来进行样本的分流,最终将样本类别划分出来. 决策树的关键就是分流时最优属性$a$的选择.使用所谓信息 ...
决策树Decision Tree 及实现
Decision Tree 及实现标签: 决策树熵信息增益分类有监督 2014-03-17 12:12 15010人阅读评论(41) 收藏举报分类: Data Mining(25) Pyt ...
Spark MLlib - Decision Tree源码分析
http://spark.apache.org/docs/latest/mllib-decision-tree.html 以决策树作为开始,因为简单,而且也比较容易用到,当前的boosting或ran ...
[ML学习笔记] 决策树与随机森林（Decision Tree&Random Forest）
[ML学习笔记] 决策树与随机森林(Decision Tree&Random Forest) 决策树决策树算法以树状结构表示数据分类的结果.每个决策点实现一个具有离散输出的测试函数,记为分支 ...
【机器学习】决策树（Decision Tree）学习笔记
[机器学习]决策树(decision tree) 学习笔记标签(空格分隔): 机器学习决策树简介决策树(decision tree)是一个树结构(可以是二叉树或非二叉树).其每个非叶节点表示一个 ...

随机推荐

【转】为什么我说 Android 很糟糕
http://zhuanlan.zhihu.com/wooyun/19879016 Android 的安全问题一直被吐槽,包括不安全的APP市场.上次的远程命令执行漏洞.还有它的权限机制,总之一团糟, ...
Thinkphp模板中使用自定义函数的方法
注意:自定义函数要放在项目应用目录/common/common.php中. 这里是关键. 模板变量的函数调用格式:{$varname|function1|function2=arg1,arg2,### ...
在eclipse中使用jax-ws构建webservices服务端和客户端
服务端: package com.yinfu.service; import javax.jws.WebService; import javax.xml.ws.Endpoint; @WebServi ...
部署图 Deployment Diagram
UML部署图描述了一个运行时的硬件结点,以及在这些结点上运行的软件组件的静态视图. 部署图显示了系统的硬件,安装在硬件上的软件,以及用于连接异构的机器之间的中间件. 下面这张图介绍了部署图的基本内容: ...
LintCode-Kth Prime Number.
Design an algorithm to find the kth number such that the only prime factors are 3, 5, and 7. The eli ...
团队作业week2-软件分析和用户需求调查
我们的团队选择评定的软件是必应词典(iphone版)和使用较多的有道词典(iphone版) 类别描述评分(Bing) 评分(有道) 功能核心功能1:词典顾名思义,作为一款词典类 ...
DB天气app冲刺二阶段第十一天（完结）
今天最后一天冲刺了,明天就不再冲刺了..已经把所有的技术的问题还有设计的问题都弄好了吧应该说至少目前来说是的.因为有的实现不了的或者需要耗费时间的已经果断舍弃了,然后需要完善的也都基本完善了. 现在 ...
linux 删除某种规则命名的文件
由于android开发需要删除以IMG_开头命名的图片文件,因此用到此命令命令格式: rm IMG_*
CSS去除Chrome浏览器的控件默认样式
html的input输入框在Chrome浏览器里是有默认样式的,当它获得焦点时,即使你没有为它设置:focus时的样式,Chrome浏览器还是会给它加上蓝色的边框,今天百度找到有个方法可以去除该默认样 ...
11个实用jQuery日历插件
1. FullCalendar FullCalendar是很出名的jQuery日历插件,它支持拖拽等功能,整合了Google Calendar,而且可以通过JSON来绑定事件,设计师可以轻松地自定义日 ...

OpenCV码源笔记——Decision Tree决策树

OpenCV码源笔记——Decision Tree决策树的更多相关文章

随机推荐

热门专题