模式识别之线性判别---Naive Bayes(朴素贝叶斯)代码实现
http://blog.csdn.net/xceman1997/article/details/7955349
http://www.cnblogs.com/yuyang-DataAnalysis/archive/2012/01/31/2333760.html
http://zhan.renren.com/dmeryuyang?gid=3602888497999161050&checked=true
http://blog.csdn.net/yanqingan/article/details/6125812
// Trains a naive Bayes model from a text sample file and dumps it to a text model file.
//
// Every line of the sample file is read as: a class id, then one or more characters
// taken from sSegmenter, then whitespace-separated feature ids. Ids are parsed with
// atoi, so only the leading integer of each token is used.
//
// Parameters:
//   sFileSample    - path of the training sample file (read in binary mode).
//   iClassNum      - number of classes; accepted class ids are [0, iClassNum).
//   iFeaTypeNum    - number of feature types; accepted feature ids are [0, iFeaTypeNum).
//   sSegmenter     - set of characters separating the class id from the feature list.
//   iFeaExtractNum - forwarded unchanged to FeaSelByChiSquare
//                    (presumably the number of features to keep -- confirm against that function).
//   sFileModel     - path of the model file to write.
//   bCompactModel  - true: write only class priors and the selected features' p(feature|class);
//                    false: additionally write counts, feature priors, selection flags and chi-square values.
//
// Returns false when a path is null, a size argument is not positive, a file cannot be
// opened, or the sample file contains no usable document; true otherwise.
bool NaiveBayes::Train (const char * sFileSample, int iClassNum, int iFeaTypeNum,
							 string & sSegmenter, int iFeaExtractNum, const char * sFileModel, bool bCompactModel)
{
	// Defensive checks: null paths would be undefined behaviour in the stream constructors.
	if (!sFileSample || !sFileModel)
		return false;
	if (iClassNum <= 0 || iFeaTypeNum <= 0 || iFeaExtractNum <= 0)
		return false;

	ifstream in (sFileSample, ios_base::binary);
	ofstream out (sFileModel);
	if (!in || !out)
	{
		cerr << "Can not open the file" << endl;
		return false;
	}

	// 1. Temporary buffers holding the model parameters and the feature-selection inputs.
	// 1.1 number of documents accepted from the training samples
	int iTotalDocNum = 0;
	// 1.2 prior probability of each class; until step 3.2 it holds the document count of the class
	double * pClassPriorProb = new double [iClassNum];
	memset (pClassPriorProb, 0, iClassNum*sizeof(double));
	// 1.3 prior probability of each feature; until step 3.3 it holds the occurrence count of the feature
	//     (only needed for feature selection, the Bayes model itself does not use it)
	double * pFeaItemPriorProb = new double [iFeaTypeNum];
	memset (pFeaItemPriorProb, 0, iFeaTypeNum*sizeof(double));
	// 1.4 chi-square value of each (class, feature) pair; until step 3.1 it holds the co-occurrence count
	double ** ppChiMatrix = new double * [iClassNum];
	for (int i=0; i<iClassNum; i++)
	{
		ppChiMatrix[i] = new double [iFeaTypeNum];
		memset (ppChiMatrix[i], 0, iFeaTypeNum*sizeof(double));
	}
	// 1.5 p(feature|class) for each (class, feature) pair
	double ** ppPProbMatrix = new double * [iClassNum];
	for (int i=0; i<iClassNum; i++)
	{
		ppPProbMatrix[i] = new double [iFeaTypeNum];
		// Zero this matrix itself (the original cleared ppChiMatrix a second time here).
		memset (ppPProbMatrix[i], 0, iFeaTypeNum*sizeof(double));
	}
	// 1.6 feature-selection flags: which features were selected
	int * pFeaSelected = new int [iFeaTypeNum];
	memset (pFeaSelected, 0, iFeaTypeNum*sizeof(int));

	// 2. Iterate over the training samples and accumulate the counts.
	string sLine;
	int iLineNum = 0;
	while (getline (in, sLine))
	{
		// progress information on screen
		if (0 == iLineNum%10000)
			cout << iLineNum << "\n";
		iLineNum++;

		// 2.1 split the sample into the class id and the feature list
		string::size_type iSeg = sLine.find_first_of (sSegmenter);
		string sTmp = sLine.substr (0, iSeg);
		if (sTmp.empty())
			continue;	// blank line or no class token: nothing to learn from
		int iClassId = atoi (sTmp.c_str());
		// Reject negative ids as well, they would index before the arrays.
		if (iClassId < 0 || iClassId >= iClassNum)
			continue;

		// 2.2 count the document only once its class is known to be valid,
		//     so that the class priors are computed over the same set of documents
		iTotalDocNum++;
		pClassPriorProb [iClassId]++;

		// 2.3 count the feature items that follow the separator
		if (string::npos == iSeg)
			continue;	// class id only, no feature list
		// Skip every separator character; adding the class token length instead
		// only worked for single-character class ids.
		string::size_type iFeaStart = sLine.find_first_not_of (sSegmenter, iSeg);
		if (string::npos == iFeaStart)
			continue;
		istringstream isLine (sLine.substr (iFeaStart));
		string sTmpItem;
		while (isLine >> sTmpItem)
		{
			int iFeaItemId = atoi (sTmpItem.c_str());
			if (iFeaItemId < 0 || iFeaItemId >= iFeaTypeNum)
				continue;
			// add the counts
			pFeaItemPriorProb [iFeaItemId]++;
			ppChiMatrix [iClassId][iFeaItemId]++;
		}
	}

	// Without any accepted document every probability below would be 0/0.
	bool bTrained = (iTotalDocNum > 0);
	if (!bTrained)
	{
		cerr << "No valid training sample in the file" << endl;
	}
	else
	{
		// 3. calculate the model parameters
		// 3.1 the chi-square value as well as p(feature|class)
		for (int i=0; i<iClassNum; i++)
		{
			double dClassDocNum = pClassPriorProb [i];	// still a raw count at this point
			for (int j=0; j<iFeaTypeNum; j++)
			{
				double dA = ppChiMatrix[i][j];				// feature j counted inside class i
				double dB = pFeaItemPriorProb[j] - dA;		// feature j counted in the other classes
				double dC = dClassDocNum - dA;				// class i total minus dA
				double dD = (double)iTotalDocNum - dA - dB - dC;

				// chi value in the reduced form (AD - BC)^2 / ((A + B) * (C + D))
				double dDiff = dA * dD - dB * dC;
				double dDenominator = (dA + dB) * (dC + dD);
				dDenominator += DBL_MIN; // for smoothing: keeps the denominator non-zero
				ppChiMatrix[i][j] = (dDiff * dDiff) / dDenominator;

				// p(feature|class); a class without documents gets 0 instead of 0/0
				ppPProbMatrix[i][j] = (dClassDocNum > 0.0) ? (dA / dClassDocNum) : 0.0;
			}
		}
		// 3.2 the prior probability of class
		for (int i=0; i<iClassNum; i++)
			pClassPriorProb [i] /= iTotalDocNum;
		// 3.3 the prior probability of feature
		for (int i=0; i<iFeaTypeNum; i++)
			pFeaItemPriorProb [i] /= iTotalDocNum;

		// 4. feature selection (implemented in FeaSelByChiSquare, outside this function)
		FeaSelByChiSquare (ppChiMatrix, ppPProbMatrix, iClassNum, 
			iFeaTypeNum, iFeaExtractNum, pFeaSelected);

		// 5. dump the model into the txt file
		if (bCompactModel)		// output the parameters only for predicting
		{
			// 5.1 the prior probability of class
			out << iClassNum << endl;
			for (int i=0; i<iClassNum; i++)
			{
				out << pClassPriorProb [i] << "\n";
			}
			// 5.2 the actual selected feature type number
			int iActualFeaNum = 0;
			for (int j=0; j<iFeaTypeNum; j++)
			{
				if (1 == pFeaSelected[j])
					iActualFeaNum ++;
			}
			out << iActualFeaNum << endl;
			// 5.3 p(feature|class) of the selected features, as featureId:prob
			for (int i=0; i<iClassNum; i++)
			{
				for (int j=0; j<iFeaTypeNum; j++)
				{
					if (1 == pFeaSelected[j])
					{
						out << j << ":" << ppPProbMatrix[i][j] << "\n";
					}
				}
			}
		}
		else					// output the full information
		{
			// 5.1 the total number of document
			out << iTotalDocNum << endl;
			// 5.2 the prior probability of class
			out << iClassNum << endl;
			for (int i=0; i<iClassNum; i++)		// classindex:priorprob
			{
				out << i << ":" << pClassPriorProb [i] << "\n";
			}
			// 5.3 the prior probability of feature type: this is NOT used in the bayes model,
			//     it is recorded for more info together with the selection flag
			out << iFeaTypeNum << "\n";
			for (int i=0; i<iFeaTypeNum; i++)	// featureId:priorprob:selected or not
			{
				// Index the flag array; streaming the bare pointer printed an address.
				out << i << ":" << pFeaItemPriorProb[i] << ":" << pFeaSelected[i] << "\n";
			}
			// 5.4 the chi-square value for class-feature pair
			for (int i=0; i<iClassNum; i++)
			{
				for (int j=0; j<iFeaTypeNum; j++)
				{
					out << ppChiMatrix[i][j] << "\n";
				}
			}
			// 5.5 p(feature|class)
			for (int i=0; i<iClassNum; i++)
			{
				for (int j=0; j<iFeaTypeNum; j++)
				{
					out << ppPProbMatrix[i][j] << "\n";
				}
			}
		}
	}

	// last, release the memory
	delete [] pClassPriorProb;
	delete [] pFeaItemPriorProb;
	for (int i=0; i<iClassNum; i++)
	{
		delete [] ppChiMatrix[i];
	}
	delete [] ppChiMatrix;
	for (int i=0; i<iClassNum; i++)
	{
		delete [] ppPProbMatrix[i];
	}
	delete [] ppPProbMatrix;
	delete [] pFeaSelected;

	return bTrained;
}
模式识别之线性判别---naive bayes朴素贝叶斯代码实现的更多相关文章
- Naive Bayes(朴素贝叶斯算法)[分类算法]
		Naïve Bayes(朴素贝叶斯)分类算法的实现 (1) 简介: (2) 算法描述: (3) <?php /* *Naive Bayes朴素贝叶斯算法(分类算法的实现) */ /* *把. ... 
- PGM:贝叶斯网表示之朴素贝叶斯模型naive Bayes
		http://blog.csdn.net/pipisorry/article/details/52469064 独立性质的利用 条件参数化和条件独立性假设被结合在一起,目的是对高维概率分布产生非常紧凑 ... 
- Python机器学习算法 — 朴素贝叶斯算法(Naive Bayes)
		朴素贝叶斯算法 -- 简介 朴素贝叶斯法是基于贝叶斯定理与特征条件独立假设的分类方法.最为广泛的两种分类模型是决策树模型(Decision Tree Model)和朴素贝叶斯模型(Naive Baye ... 
- 【机器学习实战】第4章 朴素贝叶斯(Naive Bayes)
		第4章 基于概率论的分类方法:朴素贝叶斯 朴素贝叶斯 概述 贝叶斯分类是一类分类算法的总称,这类算法均以贝叶斯定理为基础,故统称为贝叶斯分类.本章首先介绍贝叶斯分类算法的基础——贝叶斯定理.最后,我们 ... 
- 【Spark机器学习速成宝典】模型篇04朴素贝叶斯【Naive Bayes】(Python版)
		目录 朴素贝叶斯原理 朴素贝叶斯代码(Spark Python) 朴素贝叶斯原理 详见博文:http://www.cnblogs.com/itmorn/p/7905975.html 返回目录 朴素贝叶 ... 
- NLP系列(4)_朴素贝叶斯实战与进阶
		作者: 寒小阳 && 龙心尘 时间:2016年2月. 出处:http://blog.csdn.net/han_xiaoyang/article/details/50629608 htt ... 
- NLP系列(4)_朴素贝叶斯实战与进阶(转)
		http://blog.csdn.net/han_xiaoyang/article/details/50629608 作者: 寒小阳 && 龙心尘 时间:2016年2月. 出处:htt ... 
- 一步步教你轻松学朴素贝叶斯模型算法Sklearn深度篇3
		一步步教你轻松学朴素贝叶斯深度篇3(白宁超 2018年9月4日14:18:14) 导读:朴素贝叶斯模型是机器学习常用的模型算法之一,其在文本分类方面简单易行,且取得不错的分类效果.所以很受欢迎,对 ... 
- 统计学习方法——第四章朴素贝叶斯及c++实现
		1.名词解释 贝叶斯定理,自己看书,没啥说的,翻译成人话就是,条件A下的bi出现的概率等于A和bi一起出现的概率除以A出现的概率. 记忆方式就是变后验概率为先验概率,或者说,将条件与结果转换. 先验概 ... 
随机推荐
- oracle序列sequence操作汇总(命令)--待续
			1.创建sequence 2.删除sequence 3.查询有哪些sequence select * from user_objects where object_type='SEQUENCE'; 4 ... 
- NewsLetter001
			http://www.learn-english-today.com/ Project management - Iron out problems. – resolve issues. Critic ... 
- Latex中定义、定理、引理、证明 设置方法总结
			Latex中定义.定理.引理.证明 设置方法总结 在LaTex中需要有关定理.公理.命题.引理.定义等时,常用如下命令 \newtheorem{定理环境名}{标题}[主计数器名] \newtheore ... 
- CentOS7设置自定义开机启动,添加自定义系统服务
			Centos 系统服务脚本目录: /usr/lib/systemd/ 有系统(system)和用户(user)之分,如需要开机没有登陆情况下就能运行的程序,存在系统服务(system)里,即: lib ... 
- Linux安装Qt详细步骤 亲测总结
			下载 qt-everywhere-opensource-src-4.8.4.tar.gz================准备工作====================yum install kern ... 
- 【Android】11.0 第11章 活动和片段--本章示例主界面
			分类:C#.Android.VS2015: 创建日期:2016-02-21 一.简介 这一章我们学习activity和fragment,深入理解activity和fragment的生命周期是如何工作的 ... 
- 【Android】7.7 以后改为在Win10下开发了
			分类:C#.Android.VS2015: 创建日期:2016-02-12 修改日期:2016-02-13 一.鼠标点击时千万别一心二用 在Win10升级提醒不厌其烦的持续轰炸下,今天看手机时一不留神 ... 
- Git的4个阶段的撤销更改
			虽然git诞生距今已有12年之久,网上各种关于git的介绍文章数不胜数,但是依然有很多人(包括我自己在内)对于它的功能不能完全掌握.以下的介绍只是基于我个人对于git的理解,并且可能生编硬造了一些不完 ... 
- 服务器搭建2 VSFTP搭建FTP服务器
			FTP服务器是平时应用最为广泛的服务之一.VSFTP是Very Secure FTP的缩写,意指非常安全的FTP服务.VSFTP功能强大,通过结合本地系统的用户认证模块及其多功能的配置项目,可以快速有 ... 
- ny643 发短信 stl库 map函数
			发短信 时间限制:1000 ms | 内存限制:65535 KB 难度:3 描述 下图是手机常用的九键英文输入法界面,如果要输入字母'A',我们只 需要按一 ... 
