LDA实现






变分后,计算出来的似然函数,其似然值用户判断迭代的收敛程度:

| for (k = 0; k < num_topics; k++) { for (i = 0; i < NUM_INIT; i++) { d = floor(myrand() * c->num_docs); printf("initialized with document %d\n", d); doc = &(c->docs[d]); for (n = 0; n < doc->length; n++) { ss->class_word[k][doc->words[n]] += doc->counts[n]; } } for (n = 0; n < model->num_terms; n++) { ss->class_word[k][n] += 1.0; ss->class_total[k] = ss->class_total[k] + ss->class_word[k][n]; } } |
| void run_em(char* start, char* directory, corpus* corpus) { int d, n; lda_model *model = NULL; double **var_gamma, **phi; // allocate variational parameters var_gamma = malloc(sizeof(double*)*(corpus->num_docs)); for (d = 0; d < corpus->num_docs; d++) var_gamma[d] = malloc(sizeof(double) * NTOPICS); int max_length = max_corpus_length(corpus); phi = malloc(sizeof(double*)*max_length); for (n = 0; n < max_length; n++) phi[n] = malloc(sizeof(double) * NTOPICS); // initialize model char filename[100]; lda_suffstats* ss = NULL; if (strcmp(start, "seeded")==0) { model = new_lda_model(corpus->num_terms, NTOPICS); ss = new_lda_suffstats(model); corpus_initialize_ss(ss, model, corpus); lda_mle(model, ss, 0); model->alpha = INITIAL_ALPHA; } else if (strcmp(start, "random")==0) { model = new_lda_model(corpus->num_terms, NTOPICS); ss = new_lda_suffstats(model); random_initialize_ss(ss, model); lda_mle(model, ss, 0); model->alpha = INITIAL_ALPHA; } else { model = load_lda_model(start); ss = new_lda_suffstats(model); } sprintf(filename,"%s/000",directory); save_lda_model(model, filename); // run expectation maximization int i = 0; double likelihood, likelihood_old = 0, converged = 1; sprintf(filename, "%s/likelihood.dat", directory); FILE* likelihood_file = fopen(filename, "w"); while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER)) { i++; likelihood = 0; zero_initialize_ss(ss, model); // e-step //这里是核心,针对每篇文档计算相关模型参数 for (d = 0; d < corpus->num_docs; d++) { likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss); } // m-step lda_mle(model, ss, ESTIMATE_ALPHA); // check for convergence converged = (likelihood_old - likelihood) / (likelihood_old); if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2; likelihood_old = likelihood; |


,但是实际α只有一个,所以作者通过在所有topic上的分布计算出α。| double doc_e_step(document* doc, double* gamma, double** phi, lda_model* model, lda_suffstats* ss) { double likelihood; int n, k; // posterior inference likelihood = lda_inference(doc, model, gamma, phi); // update sufficient statistics double gamma_sum = 0; for (k = 0; k < model->num_topics; k++) { gamma_sum += gamma[k]; ss->alpha_suffstats += digamma(gamma[k]); } ss->alpha_suffstats -= model->num_topics * digamma(gamma_sum); for (n = 0; n < doc->length; n++) { for (k = 0; k < model->num_topics; k++) { ss->class_word[k][doc->words[n]] += doc->counts[n]*phi[n][k]; ss->class_total[k] += doc->counts[n]*phi[n][k]; } } ss->num_docs = ss->num_docs + 1; return(likelihood); } |

| double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi) { double converged = 1; double phisum = 0, likelihood = 0; double likelihood_old = 0, oldphi[model->num_topics]; int k, n, var_iter; double digamma_gam[model->num_topics]; // compute posterior dirichlet //init gama and php for (k = 0; k < model->num_topics; k++) { //初始化 γ以及φ var_gamma[k] = model->alpha + (doc->total/((double) model->num_topics)); digamma_gam[k] = digamma(var_gamma[k]); for (n = 0; n < doc->length; n++) phi[n][k] = 1.0/model->num_topics; } var_iter = 0; while ((converged > VAR_CONVERGED) && ((var_iter < VAR_MAX_ITER) || (VAR_MAX_ITER == -1))) { var_iter++; for (n = 0; n < doc->length; n++) { phisum = 0; for (k = 0; k < model->num_topics; k++) { oldphi[k] = phi[n][k]; //对于每个word,更新对应的topic,也就是公式中 phi[n][k] = digamma_gam[k] + model->log_prob_w[k][doc->words[n]]; if (k > 0) //为归一化做准备,通过log(a) +log(b)计算log(a+b) phisum = log_sum(phisum, phi[n][k]); else phisum = phi[n][k]; // note, phi is in log space } for (k = 0; k < model->num_topics; k++) { phi[n][k] = exp(phi[n][k] - phisum);//归一化 //update γ,这里面没有用到α,原始公式不同 var_gamma[k] =var_gamma[k] + doc->counts[n]*(phi[n][k] - oldphi[k]); // !!! a lot of extra digamma's here because of how we're computing it // !!! but its more automatically updated too. digamma_gam[k] = digamma(var_gamma[k]); printf("%d:%d: gmama: %f php: %f\n", n, k, var_gmama[k], php[n][k]); } } //计算似然结果,观察是否收敛,计算采用公式 likelihood = compute_likelihood(doc, model, phi, var_gamma); assert(!isnan(likelihood)); converged = (likelihood_old - likelihood) / likelihood_old; likelihood_old = likelihood; // printf("[LDA INF] %8.5f %1.3e\n", likelihood, converged); } return(likelihood); } |

| double compute_likelihood(document* doc, lda_model* model, double** phi, double* var_gamma) { double likelihood = 0, digsum = 0, var_gamma_sum = 0, dig[model->num_topics]; int k, n; for (k = 0; k < model->num_topics; k++) { dig[k] = digamma(var_gamma[k]); var_gamma_sum += var_gamma[k]; } digsum = digamma(var_gamma_sum); lgamma(α*k) - k*lgamma(alpha) likelihood = lgamma(model->alpha * model -> num_topics) - model -> num_topics * lgamma(model->alpha) - (lgamma(var_gamma_sum)); for (k = 0; k < model->num_topics; k++) { likelihood += (model->alpha - 1)*(dig[k] - digsum) + lgamma(var_gamma[k]) - (var_gamma[k] - 1)*(dig[k] - digsum); for (n = 0; n < doc->length; n++) { if (phi[n][k] > 0) { likelihood += doc->counts[n]* (phi[n][k]*((dig[k] - digsum) - log(phi[n][k]) + model->log_prob_w[k][doc->words[n]])); } } } return(likelihood); } |
void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha)
{
int k; int w;
for (k = 0; k < model->num_topics; k++)
{
for (w = 0; w < model->num_terms; w++)
{
if (ss->class_word[k][w] > 0)
{
model->log_prob_w[k][w] =
log(ss->class_word[k][w]) -
log(ss->class_total[k]);
}
else
model->log_prob_w[k][w] = -100;
}
}
if (estimate_alpha == 1)
{
model->alpha = opt_alpha(ss->alpha_suffstats,
ss->num_docs,
model->num_topics);
printf("new alpha = %5.5f\n", model->alpha);
}
}
LDA实现的更多相关文章
- 用scikit-learn进行LDA降维
在线性判别分析LDA原理总结中,我们对LDA降维的原理做了总结,这里我们就对scikit-learn中LDA的降维使用做一个总结. 1. 对scikit-learn中LDA类概述 在scikit-le ...
- 线性判别分析LDA原理总结
在主成分分析(PCA)原理总结中,我们对降维算法PCA做了总结.这里我们就对另外一种经典的降维方法线性判别分析(Linear Discriminant Analysis, 以下简称LDA)做一个总结. ...
- word2vec参数调整 及lda调参
一.word2vec调参 ./word2vec -train resultbig.txt -output vectors.bin -cbow 0 -size 200 -window 5 -neg ...
- PCA与LDA的区别与联系
由于涉及内容较多,这里转载别人的博客: http://blog.csdn.net/sunmenggmail/article/details/8071502 其实主要在于:PCA与LDA的变换矩阵不同, ...
- 计算LDA模型困惑度
http://www.52nlp.cn/lda-math-lda-%E6%96%87%E6%9C%AC%E5%BB%BA%E6%A8%A1 LDA主题模型评估方法--Perplexity http:/ ...
- LDA的Python实现源码
#-*- coding:utf-8 -*- import logging import logging.config import ConfigParser import numpy as np im ...
- LDA( Latent Dirichlet Allocation)主题模型 学习报告
1 问题描述 LDA由Blei, David M..Ng, Andrew Y..Jordan于2003年提出,是一种主题模型,它可以将文档集中每篇文档的主题以概率分布的形式给出,从而通过分析一 ...
- 关于LDA的几何表示——MATLAB实现
承接这个PCA的练习,还有一个关于LDA的几何表示. 题目如下: 代码实现LDA如下:LDA.m clear clc % 生成training sample MU1 = [6 10]'; MU2 = ...
- Gensim LDA主题模型实验
本文利用gensim进行LDA主题模型实验,第一部分是基于前文的wiki语料,第二部分是基于Sogou新闻语料. 1. 基于wiki语料的LDA实验 上一文得到了wiki纯文本已分词语料 wiki.z ...
- [综] Latent Dirichlet Allocation(LDA)主题模型算法
多项分布 http://szjc.math168.com/book/ebookdetail.aspx?cateid=1&§ionid=983 二项分布和多项分布 http:// ...
随机推荐
- Android Multimedia框架总结(二十一)MediaCodec中创建到start过程(到jni部分)
转载请把头部出处链接和尾部二维码一起转载,本文出自逆流的鱼yuiop:http://blog.csdn.net/hejjunlin/article/details/53386117 我最近正在参加CS ...
- Dynamics CRM2016 查询数据的三种方式的性能对比
之前写过一个博客,对非声明验证方式下连接组织服务的两种方式的性能进行了对比,但当时只是对比了实例化组织服务的时间,并没有对查询数据的时间进行对比,那有朋友也在我的博客中留言了反映了查询的时间问题,一直 ...
- Android必知必会-App 常用图标尺寸规范汇总
若移动端访问不佳,请使用 –> Github版 内容持续更新中,更新日期:2016-08-11 1. 程序启动图标(icon launcher) 放在mipmap-*dpi下,文件名为ic_la ...
- Servlet之Request对象
下面的方法可用在 Servlet 程序中读取 HTTP 头.这些方法通过HttpServletRequest 对象可用. 1 Cookie[] getCookies() 返回一个数组,包含客户端 ...
- Linux内核分配内存的方式
page = alloc_pages(GFP_KERNEL, get_order(1234));分配失败返回NULLGFP_KERNEL ---> 分配标志,当没有足够内存分配时,睡眠阻塞,直 ...
- MySQL 实现调用外部程序和系统命令
MySQL 实现调用外部程序和系统命令 Refer:http://www.cnblogs.com/yunsicai/p/4080864.html1) Download lib_mysqludf_sys ...
- (九十七)集成JPush实现远程通知和推送的发送
上节介绍了通过直接和APNS交互实现推送的方法,较为繁琐,最为重要的是发送推送需要特定的服务端,通过JPush,不仅可以简化客户端的接收,还可以通过控制台或者API实现通知的发送. 首先注册JPush ...
- Android初级教程启动定时器详解
本案例知识是:后台执行定时任务. Alarm机制: 一.创建LongRunningService类 package com.example.servicebestpractice; import ja ...
- Socket接收器——Acceptor
Acceptor是JIoEndpoint的内部类,主要的职责就是监听是否有客户端套接字连接并接收socket,再将socket交由任务执行者(Executor)执行.不断从系统底层读取socket,接 ...
- Hessian探究(一)Hessian入门示例
一.hessian的maven信息: [html] view plain copy print? <dependency> <groupId>com.caucho</gr ...