1。说明

本文对LDA原始论文的作者所提供的C代码中LDA的主要逻辑部分做凝视,原代码可在这里下载到:https://github.com/Blei-Lab/lda-c

这份代码实现论文《Latent Dirichlet Allocation》中介绍的LDA模型。用变分EM算法求解參数。

为了使代码在vs2013中执行。做了一些微小修改,但不影响原代码的逻辑。

vs2013project可在我的资源中下载:

http://download.csdn.net/detail/happyer88/8861773

----------------------------------------------------------------------

2。准备知识

2.1,LDA原理及推导

《Latent Dirichlet Allocation》论文

我的LDA学习笔记1-4系列

2.2,充分统计量

https://en.wikipedia.org/wiki/Sufficient_statistic

----------------------------------------------------------------------

3,代码凝视

3.1 main.c

原代码中main函数在lda-estimate.c中,创建vsproject时把它挪到了main.c中。

<span style="font-size:14px;">#include  <stdio.h>
#include <stdlib.h>
#include <io.h>
#include<time.h>
#include "cokus.h"
#include "lda-alpha.h"
#include"lda-data.h"
#include"lda-estimate.h"
#include"lda-inference.h"
#include"lda.h"
#include"utils.h" char * datasetName = "scene8"; //数据集名字,必须与目录名字同样
int expec = 1; // expec==1,expect , inf
int vocabularySize_global = 512; // 字典大小
int k = 100; //topic的数目
char* params ="../settings.txt"; //预计:预计过程须要的參数
char* params1 ="../inf-settings.txt"; //判断:判断过程须要的參数
char dataset_train[500]; //预计:预计參数的数据文件
char dataset_test[500]; //判断:判断的数据文件
char dir_trainData[500]; //预计:预计的中间数据和终于数据目录路径
char dir_testData[500]; //判断:判断的中间数据和终于数据目录路径
char model_pre[500];
void assignParameter();
int main()
{
corpus* corpus;
clock_t start,finish;
double totaltime;
long double totaltime_EMiteration;
assignParameter();
//myCreateDirectory(); start=clock();
if(expec)
{
INITIAL_ALPHA = 1; //狄利克雷分布的參数alpha
NTOPICS =k; //主题个数
read_settings(params); //读取參数。。 。 最大迭代次数。收敛条件阈值;EM的最大迭代次数、收敛条件阈值;
corpus = read_data(dataset_train); //读取数据。 。。 数据格式:(每一行)在一个文档中出现的word总数目(去掉次数=0的)index_word1:counts index_word2:counts ........... totaltime_EMiteration = run_em("seeded", dir_trainData, corpus); //求解參数。。。EM过程求解參数--输入:中间数据和终于数据存放目录、语料库 printf("inferencing test images!\n");
read_settings(params1);
corpus = read_data(dataset_test);
infer(model_pre, dir_testData, corpus);
//用完開始释放
}
else
{
read_settings(params1);
corpus = read_data(dataset_test);
infer(model_pre, dir_testData, corpus);
}
finish=clock();
totaltime=(double)(finish-start)/CLOCKS_PER_SEC; printf("nTopic = %d, nTerm = %d estimation time: \n", k, vocabularySize_global);
printf(" EM iteration takes %f seconds(this is %f miniutes)\n", totaltime_EMiteration*60, totaltime_EMiteration); printf("Running Time(--estimate trainData and inference trainData and testData--):%f\n",totaltime); printf("\ntrain--- final data are saved to: %s\n", dir_trainData);
printf("test---- final data are saved to: %s\n", dir_testData);
getch();
return(0);
} void assignParameter()
{
sprintf(dataset_train,"../train.txt");
sprintf(dataset_test,"../test.txt"); sprintf(dir_trainData, "../ResultData");
sprintf(dir_testData, "../ResultData");
sprintf(model_pre, "../ResultData/final"); }
</span>

3.2 lda.h

自己定义数据结构

#ifndef LDA_H
#define LDA_H typedef struct
{
int* words; //文档中的单词,这里存的是该单词在文档集字典中的ID
int* counts; //每一个单词文档中出现次数
int length; //文档中出现的单词个数,去重的。也就是反复出现的单词不计
int total; //文档中总单词数,不去重
} document; typedef struct
{
document* docs;
int num_terms; //文档集中出现的单词个数。去重的。也就是文档集字典大小
int num_docs; //文档集中文档个数
} corpus; typedef struct
{
double alpha; //论文中的模型參数alpha,本来应该是k维。程序中实现的是对称分布的Dirichlet,k维的值是同样的
double** log_prob_w; //论文中的模型參数beta。每一行存一个主题的词分布,维<span style="font-family: Arial, Helvetica, sans-serif;">度k*V</span>
int num_topics; //主题个数
int num_terms;
} lda_model; typedef struct
{
double** class_word;//模型參数beta的充分统计量。维度:主题个数*文档集字典大小(K*V)
double* class_total;//存主题分布z的 充分统计量,维度:主题个数K
double alpha_suffstats; //模型參数alpha的充分统计量
int num_docs;
} lda_suffstats; #endif

3.3 lda-model.c

主要是初始化lda模型(有三种方法),一种是全部值都为0,'random'是用随机数,'seeded'是随机挑选一些文档来初始化模型

还有计算模型參数alpha , beta (lda_mle)

#include "lda-model.h"

/*
* compute MLE lda model from sufficient statistics
*
*/ void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha)
{
int k; int w; for (k = 0; k < model->num_topics; k++)
{
for (w = 0; w < model->num_terms; w++)
{
if (ss->class_word[k][w] > 0)
{
//log_prob_w是模型參数beta,主题-词分布
//class_word和class_total都是充分统计量(sufficient statistic)
//所以log相减是在做归一化,beta中的值是概率。要在0-1之间
model->log_prob_w[k][w] =
log(ss->class_word[k][w]) -
log(ss->class_total[k]);
}
else
model->log_prob_w[k][w] = -100;
}
}
if (estimate_alpha == 1)
{
//用牛顿方法优化得到alpha
//注意这里alpha_suffstats的值
model->alpha = opt_alpha(ss->alpha_suffstats,
ss->num_docs,
model->num_topics); printf("new alpha = %5.5f\n", model->alpha);
}
} /*
* allocate sufficient statistics
*
*/ lda_suffstats* new_lda_suffstats(lda_model* model)
{
int num_topics = model->num_topics;
int num_terms = model->num_terms;
int i,j; lda_suffstats* ss = malloc(sizeof(lda_suffstats));
ss->class_total = malloc(sizeof(double)*num_topics);
ss->class_word = malloc(sizeof(double*)*num_topics);
for (i = 0; i < num_topics; i++)
{
ss->class_total[i] = 0;
ss->class_word[i] = malloc(sizeof(double)*num_terms);
for (j = 0; j < num_terms; j++)
{
ss->class_word[i][j] = 0;
}
}
return(ss);
}
void free_lda_ss(lda_suffstats* ss, lda_model* model)
{
int i=0;
for (i=0; i < model->num_topics; i++)
free(ss->class_word[i]);
free(ss->class_word);
free(ss->class_total);
free(ss);
} /*
* various intializations for the sufficient statistics
*
*/ void zero_initialize_ss(lda_suffstats* ss, lda_model* model)
{
int k, w;
for (k = 0; k < model->num_topics; k++)
{
ss->class_total[k] = 0;
for (w = 0; w < model->num_terms; w++)
{
ss->class_word[k][w] = 0;
}
}
ss->num_docs = 0;
ss->alpha_suffstats = 0;
} void random_initialize_ss(lda_suffstats* ss, lda_model* model)
{
int num_topics = model->num_topics;
int num_terms = model->num_terms;
int k, n;
for (k = 0; k < num_topics; k++)
{
for (n = 0; n < num_terms; n++)
{
ss->class_word[k][n] += 1.0/num_terms + myrand();
ss->class_total[k] += ss->class_word[k][n];
}
}
} void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c)
{
int num_topics = model->num_topics;
int i, k, d, n;
document* doc; for (k = 0; k < num_topics; k++)//每一个主题用一些文档的来初始化其主题-词 分布 的充分统计量
{
for (i = 0; i < NUM_INIT; i++)//在文档集中随机挑选NUM_INIT=1个文档
{
d = floor(myrand() * c->num_docs); //随机挑选
printf("initialized with document %d\n", d);
doc = &(c->docs[d]);
for (n = 0; n < doc->length; n++)
{
//将NUM_INIT个文档的词频统计,作为第k个主题的词分布的统计量
ss->class_word[k][doc->words[n]] += doc->counts[n];
}
}
for (n = 0; n < model->num_terms; n++)
{
ss->class_word[k][n] += 1.0;//由于后面要对它求log,所以值必须大于0
//是对class_word按行求和的结果,是主题k被选中的次数,也就是该主题下的词出现次数的和
ss->class_total[k] = ss->class_total[k] + ss->class_word[k][n];
}
//这样用文档的词频信息初始化,total必定不为0
//if (ss->class_total[k] == 0)
// ss->class_total[k] = 1;
}
} /*
* allocate new lda model
*
*/ lda_model* new_lda_model(int num_terms, int num_topics)
{
int i,j;
lda_model* model; model = malloc(sizeof(lda_model));
model->num_topics = num_topics;
model->num_terms = num_terms;
model->alpha = 1.0;
model->log_prob_w = malloc(sizeof(double*)*num_topics);
for (i = 0; i < num_topics; i++)
{
model->log_prob_w[i] = malloc(sizeof(double)*num_terms);
for (j = 0; j < num_terms; j++)
model->log_prob_w[i][j] = 0;
}
return(model);
} /*
* deallocate new lda model
*
*/ void free_lda_model(lda_model* model)
{
int i; for (i = 0; i < model->num_topics; i++)
{
free(model->log_prob_w[i]);
}
free(model->log_prob_w);
free(model);
} /*
* save an lda model
*
*/ void save_lda_model(lda_model* model, char* model_root)
{
char filename[100];
FILE* fileptr;
int i, j; sprintf(filename, "%s.beta", model_root);
fileptr = fopen(filename, "w");
for (i = 0; i < model->num_topics; i++)
{
for (j = 0; j < model->num_terms; j++)
{
fprintf(fileptr, " %5.10f", model->log_prob_w[i][j]);
}
fprintf(fileptr, "\n");
}
fclose(fileptr); sprintf(filename, "%s.other", model_root);
fileptr = fopen(filename, "w");
fprintf(fileptr, "num_topics %d\n", model->num_topics);
fprintf(fileptr, "num_terms %d\n", model->num_terms);
fprintf(fileptr, "alpha %5.10f\n", model->alpha);
fclose(fileptr);
} lda_model* load_lda_model(char* model_root)
{
char filename[100];
FILE* fileptr;
int i, j, num_terms, num_topics;
float x, alpha;
lda_model* model; sprintf(filename, "%s.other", model_root);
printf("loading %s\n", filename);
fileptr = fopen(filename, "r");
fscanf(fileptr, "num_topics %d\n", &num_topics);
fscanf(fileptr, "num_terms %d\n", &num_terms);
fscanf(fileptr, "alpha %f\n", &alpha);
fclose(fileptr); model = new_lda_model(num_terms, num_topics);
model->alpha = alpha; sprintf(filename, "%s.beta", model_root);
printf("loading %s\n", filename);
fileptr = fopen(filename, "r");
for (i = 0; i < num_topics; i++)
{
for (j = 0; j < num_terms; j++)
{
fscanf(fileptr, "%f", &x);
model->log_prob_w[i][j] = x;
}
}
fclose(fileptr);
return(model);
}

3.3 lda-estimate.c

当中包括和模型求解相关的函数,em算法(run_em)和e-step(doc_e_step)

#include "lda-estimate.h"

/*
* perform inference on a document and update sufficient statistics
*
*/
int LAG=5;
double doc_e_step(document* doc, double* gamma, double** phi,
lda_model* model, lda_suffstats* ss)
{
double likelihood;
int n, k;
double gamma_sum;
// posterior inference likelihood = lda_inference(doc, model, gamma, phi); // update sufficient statistics //这里更新alpha的 充分统计量
//alpha_suffstats = sum(digamma(gamma)) - K*digamma(gamm_sum)
gamma_sum = 0;
for (k = 0; k < model->num_topics; k++)
{
gamma_sum += gamma[k];
ss->alpha_suffstats += digamma(gamma[k]); //log gamma函数的一阶导数
}
ss->alpha_suffstats -= model->num_topics * digamma(gamma_sum); for (n = 0; n < doc->length; n++)
{
for (k = 0; k < model->num_topics; k++)
{
//phi[n][k]是第n个word由第k个主题生成的概率。在log space
ss->class_word[k][doc->words[n]] += doc->counts[n]*phi[n][k];
ss->class_total[k] += doc->counts[n]*phi[n][k];
}
}
//增加充分统计量的文档数
ss->num_docs = ss->num_docs + 1; return(likelihood);
} /*
* writes the word assignments line for a document to a file
*
*/ int write_word_assignment(FILE* result, FILE* f, document* doc, double** phi, lda_model* model)
{
int n;
//f中保存phi, result中保存结果:[wordID:概率最大的topicID]
fprintf(result, "%03d", doc->length);
for (n = 0; n < doc->length; n++)
{
int k;
for (k=0;k< model->num_topics;k++)
fprintf(f, "%f\t",phi[n][k]); //一行相应一个word由每个topic生成的概率
fprintf(f, "\n"); fprintf(result, " %04d:%02d",
doc->words[n], argmax(phi[n], model->num_topics));//argmax 找出phi[n]中最大的元素相应的索引位置,也就是topicID
}
fprintf(result, "\n"); //一行相应一个文档的 每个word相应的概率最大的topic
fflush(f);
fflush(result);
return 0; } /*
* saves the gamma parameters of the current dataset
*
*/ void save_gamma(char* filename, double** gamma, int num_docs, int num_topics)
{
FILE* fileptr;
int d, k;
fileptr = fopen(filename, "w"); for (d = 0; d < num_docs; d++)
{
fprintf(fileptr, "%5.10f", gamma[d][0]);
for (k = 1; k < num_topics; k++)
{
fprintf(fileptr, " %5.10f", gamma[d][k]);
}
fprintf(fileptr, "\n");
}
fclose(fileptr);
} /*
* run_em
*
*/ long double run_em(char* start, char* directory, corpus* corpus)
{
clock_t start_EM, finish_EM;
double * theta;
FILE * thetaFile; int d, n;
lda_model *model = NULL;
double **var_gamma, **phi;
FILE* likelihood_file;
int max_length;
char filename[500];
char filename1[500];
int i;
double likelihood, likelihood_old, converged;
lda_suffstats* ss;
FILE* w_asgn_file;
FILE* result; // allocate variational parameters
//为变分參数gamma分配空间,维度:文档数*主题数
var_gamma = malloc(sizeof(double*)*(corpus->num_docs));
for (d = 0; d < corpus->num_docs; d++)
var_gamma[d] = malloc(sizeof(double) * NTOPICS);
//为变分參数phi分配空间。维度:文档集中文档的最大单词数(去重) * 主题数
max_length = max_corpus_length(corpus);
phi = malloc(sizeof(double*)*max_length);
for (n = 0; n < max_length; n++)
phi[n] = malloc(sizeof(double) * NTOPICS); // initialize model
ss = NULL;
if (strcmp(start, "seeded")==0)
{
model = new_lda_model(corpus->num_terms, NTOPICS);
ss = new_lda_suffstats(model);
corpus_initialize_ss(ss, model, corpus); //初始化tw分布
lda_mle(model, ss, 0); //compute MLE lda model from sufficient statistics
model->alpha = INITIAL_ALPHA;
}
else if (strcmp(start, "random")==0)
{
model = new_lda_model(corpus->num_terms, NTOPICS);
ss = new_lda_suffstats(model);
random_initialize_ss(ss, model);
lda_mle(model, ss, 0);
model->alpha = INITIAL_ALPHA;
}
else
{
model = load_lda_model(start);
ss = new_lda_suffstats(model);
} sprintf(filename,"%s/000",directory);
save_lda_model(model, filename); // run expectation maximization i = 0;
likelihood_old = 0;
converged = 1;
sprintf(filename, "%s/likelihood.dat", directory);
likelihood_file = fopen(filename, "w"); start_EM = clock();
//em迭代继续运行条件:下面1和2同一时候满足
//1,
//converaged<0 也就是新值比旧值好
//或converaged>EM_CONVERGED 新值和旧值还不够近似
//或迭代步骤运行太少(<=2)
//2,
//当前迭代step数在规定的最大迭代步数以内
//或者没有指定最大迭代步数(-1)
while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && ((i <= EM_MAX_ITER) || (EM_MAX_ITER == -1)) )
{
i++; printf("**** em iteration %d ****\n", i);
likelihood = 0;
zero_initialize_ss(ss, model); //把统计量的值都赋为0 // e-step
//固定alpha和beta。对每一篇文档找到优化的gamma和phi,更新充分统计量,计算似然
for (d = 0; d < corpus->num_docs; d++)
{
if ((d % 10) == 0) printf("document %d in %d EM iteration\n",d, i);
likelihood += doc_e_step(&(corpus->docs[d]),
var_gamma[d],
phi,
model,
ss);
} // m-step //依据当前的充分统计量。更新模型參数alpha,beta
lda_mle(model, ss, ESTIMATE_ALPHA); // check for convergence converged = (likelihood_old - likelihood) / (likelihood_old);
if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
likelihood_old = likelihood; // output model and likelihood fprintf(likelihood_file, "%10.10f\t%5.5e\n", likelihood, converged);
fflush(likelihood_file);
if ((i % LAG) == 0)
{
sprintf(filename,"%s/%03d",directory, i);
save_lda_model(model, filename);
sprintf(filename,"%s/%03d.gamma",directory, i);
save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);
}
}
//EM迭代结束
finish_EM = clock();
printf("nTopic = %d, nTerm = %d estimation time: \n", model->num_topics, model->num_terms);
printf(" EM iteration takes %f seconds(this is %d miniutes)\n", (double)(finish_EM-start_EM)/CLOCKS_PER_SEC, (finish_EM-start_EM)/CLOCKS_PER_SEC/60); // output the final model
sprintf(filename,"%s/final",directory);
save_lda_model(model, filename); //此函数中保存了beta到final.beta文件 , 还有.other文件
sprintf(filename,"%s/final.gamma",directory);
save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics); // output theta
theta = (double*)malloc(sizeof(double)*model->num_topics);
sprintf(filename1, "%s/final.theta", directory);
thetaFile = fopen(filename1, "w");
// output the word assignments (for visualization)
sprintf(filename1, "%s/result-doc-assgn.dat", directory);
result = fopen(filename1, "w");
for (d = 0; d < corpus->num_docs; d++)
{
sprintf(filename, "%s/result_%d_phi.dat", directory,d); //调试这一部分有越界的错误,完成,filename数组空间太小。
w_asgn_file = fopen(filename, "w");
printf("final e step document %d\n",d);
likelihood += lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi);
write_word_assignment(result, w_asgn_file, &(corpus->docs[d]), phi, model);
computeTheta( thetaFile, &(corpus->docs[d]), phi, model, theta);
fclose(w_asgn_file);
}
fclose(result);
fclose(thetaFile);
fclose(likelihood_file);
//释放空间
free(theta);
for (d = 0; d < corpus->num_docs; d++)
free(var_gamma[d]);
free(var_gamma);
for (n = 0; n < max_length; n++)
free(phi[n]);
free(phi);
free_lda_ss( ss, model);
free_lda_model(model); return (long double)(finish_EM-start_EM)/CLOCKS_PER_SEC/60;
} void computeTheta( FILE* thetaFile, document* doc, double** phi, lda_model* model, double * theta)
{
int n; for (n=0; n< model->num_topics; n++)
theta[n] = 0;
for (n = 0; n<doc->length; n++)
{
int topicIndex = argmax(phi[n], model->num_topics);
theta[ topicIndex ] = theta[ topicIndex ] + doc->counts[ n ];
} for (n=0; n<model->num_topics; n++)
{
theta[n] = theta[n]/doc->total;
fprintf(thetaFile, "%f\t", theta[n]);
}
fprintf(thetaFile, "\n");
fflush(thetaFile); } /*
* read settings.
*
*/ void read_settings(char* filename)
{
FILE* fileptr;
char alpha_action[100];
fileptr = fopen(filename, "r");
fscanf(fileptr, "var max iter %d\n", &VAR_MAX_ITER);
fscanf(fileptr, "var convergence %f\n", &VAR_CONVERGED);
fscanf(fileptr, "em max iter %d\n", &EM_MAX_ITER);
fscanf(fileptr, "em convergence %f\n", &EM_CONVERGED);
fscanf(fileptr, "alpha %s", alpha_action);
if (strcmp(alpha_action, "fixed")==0)
{
ESTIMATE_ALPHA = 0;
}
else
{
ESTIMATE_ALPHA = 1;
}
fclose(fileptr);
} /*
* inference only
*
*/ void infer(char* model_root, char* save, corpus* corpus)
{
FILE* fileptr;
FILE* result;
FILE* w_asgn_file;
char filename[100];
char filename1[200];
int i, d, n;
lda_model *model;
double **var_gamma, likelihood, **phi;
document* doc; /*double ***corpusPhi;
corpusPhi = (double***)malloc(sizeof(double**)*(corpus->num_docs));
for (i=0;i<corpus.num_docs;j++)
{
corpusPhi[i] = (double**)malloc(sizeof(double*)*)
}*/
sprintf(filename1, "%s/result-doc-assgn.dat", save);
result = fopen(filename1, "w"); model = load_lda_model(model_root);
var_gamma = (double**)malloc(sizeof(double*)*(corpus->num_docs));
for (i = 0; i < corpus->num_docs; i++)
var_gamma[i] = (double*)malloc(sizeof(double)*model->num_topics); //int max_length = max_corpus_length(corpus); sprintf(filename, "%s-lda-lhood.dat", save);
fileptr = fopen(filename, "w"); for (d = 0; d < corpus->num_docs; d++)
{
if (((d % 100) == 0) && (d>0)) printf("document %d\n",d); doc = &(corpus->docs[d]);
phi = (double**) malloc(sizeof(double*) * doc->length);
//phi = (double**) malloc(sizeof(double*) * max_length);
for (n = 0; n < doc->length; n++)
//for (n = 0; n < max_length; n++)
phi[n] = (double*) malloc(sizeof(double) * model->num_topics);
likelihood = lda_inference(doc, model, var_gamma[d], phi); fprintf(fileptr, "%5.5f\n", likelihood); //输出每个文档的phi到文件result_%d_phi.dat中 另外每个word相应的概率最大的topic保存在文件result-doc-assgn.dat中 一行相应一个文档
sprintf(filename, "%s/result_%d_phi.dat", save,d);
w_asgn_file = fopen(filename, "w");
printf("final e step document %d\n",d);
write_word_assignment(result,w_asgn_file, &(corpus->docs[d]), phi, model);
fclose(w_asgn_file);
for (n = 0; n < doc->length; n++)
free(phi[n]);
free(phi);
}
fclose(fileptr);
sprintf(filename, "%s-gamma.dat", save);
save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics); fclose(result);
for (d = 0; d < corpus->num_docs; d++)
free(var_gamma[d]);
free(var_gamma); free_lda_model(model);
}

3.4 lda-inference.c

当中包括变分參数求解相关的函数

#include "lda-inference.h"

/*
* variational inference
*
*/
int lisnan(double x) {
return x != x;
}
double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi)
{
double converged = 1;
double phisum = 0, likelihood = 0;
double likelihood_old = 0, *oldphi=(double *)malloc(sizeof(double)*(model->num_topics)); int k, n, var_iter;
double *digamma_gam=(double *)malloc(sizeof(double)*(model->num_topics)); // compute posterior dirichlet for (k = 0; k < model->num_topics; k++)
{
//初始化变分參数gamma=alpha + 当前文档中单词个数(不去重) N / 主题个数 k
var_gamma[k] = model->alpha + (doc->total/((double) model->num_topics));
//log gamma函数的一阶导数
digamma_gam[k] = digamma(var_gamma[k]);
//初始化变分參数phi=1/k
for (n = 0; n < doc->length; n++)
phi[n][k] = 1.0/model->num_topics;
}
var_iter = 0;
//開始迭代
while ((converged > VAR_CONVERGED) &&
((var_iter < VAR_MAX_ITER) || (VAR_MAX_ITER == -1)))
{
var_iter++;
for (n = 0; n < doc->length; n++)
{
phisum = 0;
for (k = 0; k < model->num_topics; k++)
{
oldphi[k] = phi[n][k];
//更新变分參数 phi
//就是论文中变分判断算法的式子 phi(n,i) = b(i,wn) * exp(digamma(gamma(i)))
//这里由于有exp所以在log空间计算,算得的phi也是log space的
phi[n][k] =
digamma_gam[k] +
model->log_prob_w[k][doc->words[n]]; if (k > 0)
phisum = log_sum(phisum, phi[n][k]);//在log space对phi求和
else
phisum = phi[n][k]; // note, phi is in log space
} for (k = 0; k < model->num_topics; k++)
{
//归一化,使phi(n)和为1
phi[n][k] = exp(phi[n][k] - phisum);
//更新变分參数 gamma
var_gamma[k] =
var_gamma[k] + doc->counts[n]*(phi[n][k] - oldphi[k]);
// !!! a lot of extra digamma's here because of how we're computing it
// !!! but its more automatically updated too.
digamma_gam[k] = digamma(var_gamma[k]);
}
} likelihood = compute_likelihood(doc, model, phi, var_gamma);
assert(!isnan(likelihood));
converged = (likelihood_old - likelihood) / likelihood_old;
likelihood_old = likelihood; // printf("[LDA INF] %8.5f %1.3e\n", likelihood, converged);
}//迭代结束
return(likelihood);
} /*
* compute likelihood bound
*
*/
//依照论文附录(15)式计算L(gamma,phi;alpha,beta)
double
compute_likelihood(document* doc, lda_model* model, double** phi, double* var_gamma)
{
double likelihood = 0, digsum = 0, var_gamma_sum = 0, *dig=(double *)malloc(sizeof(double)*(model->num_topics));
int k, n; for (k = 0; k < model->num_topics; k++)
{
dig[k] = digamma(var_gamma[k]);
var_gamma_sum += var_gamma[k];
}
digsum = digamma(var_gamma_sum);
//论文(14)式中的Eq,第1个和第4个是合在一起再拆分算的,第2,3,5个是合在一起算的
//Eq[logp(theta|alpha)]中的前两个部分 和 Eq[logq(theta)]中第一部分
likelihood =
log_gamma(model->alpha * model -> num_topics)
- model -> num_topics * log_gamma(model->alpha)
- (log_gamma(var_gamma_sum)); for (k = 0; k < model->num_topics; k++)
{
//Eq[logp(theta|alpha)]中的第三个部分 和 Eq[logq(theta)]中剩余的
likelihood +=
(model->alpha - 1)*(dig[k] - digsum) + log_gamma(var_gamma[k])
- (var_gamma[k] - 1)*(dig[k] - digsum);
//Eq[logp(z|theta)] + Eq[logp(w|z,beta)] - Eq[logq(z)]
for (n = 0; n < doc->length; n++)
{
if (phi[n][k] > 0)
{
likelihood += doc->counts[n]*
(phi[n][k]*((dig[k] - digsum) - log(phi[n][k])
+ model->log_prob_w[k][doc->words[n]]));
}
}
}
return(likelihood);
}

3.5 lda-data.c

数据集读入

#include "lda-data.h"

corpus* read_data(char* data_filename)
{
FILE *fileptr;
int length, count, word, n, nd, nw;
corpus* c; printf("reading data from %s\n", data_filename);
c = malloc(sizeof(corpus));
c->docs = 0;
c->num_terms = 0;
c->num_docs = 0;
fileptr = fopen(data_filename, "r");
nd = 0; nw = 0;
while ((fscanf_s(fileptr, "%10d", &length) != EOF)) //读入每行数据的第一个数字,是文档的字典大小(文档中单词去重的个数)
{
//对于第nd个文档
c->docs = (document*) realloc(c->docs, sizeof(document)*(nd+1)); //(数据类型*)realloc(要改变内存大小的指针名,新的大小) 新的大小一定要大于原来的大小。不然的话会导致数据丢失! c->docs[nd].length = length; //文档中出现过的单词的个数。也就是文档字典大小,是去重的
c->docs[nd].total = 0; //文档中总单词个数。不去重,是对counts的求和。
c->docs[nd].words = malloc(sizeof(int)*length); //文档中的word在文档集字典中的ID
c->docs[nd].counts = malloc(sizeof(int)*length); //文档中word出现次数
for (n = 0; n < length; n++)//读入每行数据剩下的数据。词频统计
{
fscanf_s(fileptr, "%10d:%10d", &word, &count); //读入每一个 [wordID:word出现次数]
word = word - OFFSET;
c->docs[nd].words[n] = word;
c->docs[nd].counts[n] = count;
c->docs[nd].total += count;
if (word >= nw) { nw = word + 1; } //nw记录文档集最大的那个word ID。也就是文档集字典中的单词个数
//if (word >= nw) { nw = word; } }
nd++;
}
fclose(fileptr);
c->num_docs = nd;
c->num_terms = nw;
printf("number of docs : %d\n", nd);
printf("number of terms : %d\n", nw);
return(c);
} int max_corpus_length(corpus* c)//输出数据集中单词数(去重后)最多的文档的单词数。这个length是去重后的长度
{
int n, max = 0;
for (n = 0; n < c->num_docs; n++)
if (c->docs[n].length > max) max = c->docs[n].length;
return(max);
}

3.6 lda-alpha.c

牛顿法计算模型參数alpha

#include "lda-alpha.h"
#include "lda-inference.h"
/*
* objective function and its derivatives
*
*/ double alhood(double a, double ss, int D, int K)
{ return(D * (log_gamma(K * a) - K * log_gamma(a)) + (a - 1) * ss); } double d_alhood(double a, double ss, int D, int K)
{ return(D * (K * digamma(K * a) - K * digamma(a)) + ss); } double d2_alhood(double a, int D, int K)
{ return(D * (K * K * trigamma(K * a) - K * trigamma(a))); } /*
* newtons method
*
*/ double opt_alpha(double ss, int D, int K)
{
double a, log_a, init_a = 100;
double f, df, d2f;
int iter = 0; log_a = log(init_a);
do
{
iter++;
a = exp(log_a);
if (isnan(a))
{
init_a = init_a * 10;
printf("warning : alpha is nan; new init = %5.5f\n", init_a);
a = init_a;
log_a = log(a);
}
f = alhood(a, ss, D, K); //附录A4.2中的L(a)
df = d_alhood(a, ss, D, K); //L对a的一阶偏导
d2f = d2_alhood(a, D, K); //二阶偏导
log_a = log_a - df/(d2f * a + df);//迭代公式
printf("alpha maximization : %5.5f %5.5f\n", f, df);
}
while ((fabs(df) > NEWTON_THRESH) && (iter < MAX_ALPHA_ITER));
return(exp(log_a));
}

还有cokus.c 和 utils.c 中是一些数学计算的函数。

LDA主题模型学习笔记5:C源代码理解的更多相关文章

  1. LDA主题模型学习笔记3.5:变分參数推导

    如今来推导一下得到变分參数更新式的过程.这一部分是在论文的附录中,为避免陷入过多细节而影响总体理解.能够在刚開始学习LDA的时候先不关注求解细节.首先要把L写成关于γ,ϕ\gamma,\phi函数.依 ...

  2. NLP传统基础(2)---LDA主题模型---学习文档主题的概率分布(文本分类/聚类)

    一.简介 https://cloud.tencent.com/developer/article/1058777 1.LDA是一种主题模型 作用:可以将每篇文档的主题以概率分布的形式给出[给定一篇文档 ...

  3. 用scikit-learn学习LDA主题模型

    在LDA模型原理篇我们总结了LDA主题模型的原理,这里我们就从应用的角度来使用scikit-learn来学习LDA主题模型.除了scikit-learn,  还有spark MLlib和gensim库 ...

  4. 机器学习-LDA主题模型笔记

    LDA常见的应用方向: 信息提取和搜索(语义分析):文档分类/聚类.文章摘要.社区挖掘:基于内容的图像聚类.目标识别(以及其他计算机视觉应用):生物信息数据的应用; 对于朴素贝叶斯模型来说,可以胜任许 ...

  5. 概率图模型学习笔记:HMM、MEMM、CRF

    作者:Scofield链接:https://www.zhihu.com/question/35866596/answer/236886066来源:知乎著作权归作者所有.商业转载请联系作者获得授权,非商 ...

  6. 理解 LDA 主题模型

    前言 gamma函数 0 整体把握LDA 1 gamma函数 beta分布 1 beta分布 2 Beta-Binomial 共轭 3 共轭先验分布 4 从beta分布推广到Dirichlet 分布 ...

  7. 通俗理解LDA主题模型

    通俗理解LDA主题模型 0 前言 印象中,最開始听说"LDA"这个名词,是缘于rickjin在2013年3月写的一个LDA科普系列,叫LDA数学八卦,我当时一直想看来着,记得还打印 ...

  8. R语言︱LDA主题模型——最优主题数选取(topicmodels)+LDAvis可视化(lda+LDAvis)

    每每以为攀得众山小,可.每每又切实来到起点,大牛们,缓缓脚步来俺笔记葩分享一下吧,please~ --------------------------- 笔者寄语:在自己学LDA主题模型时候,发现该模 ...

  9. Spark:聚类算法之LDA主题模型算法

    http://blog.csdn.net/pipisorry/article/details/52912179 Spark上实现LDA原理 LDA主题模型算法 [主题模型TopicModel:隐含狄利 ...

随机推荐

  1. PHP面向对象摘要

    一.面向对象的三种特性,分别是封装性,继承性和多态性. 1.封装性:封装是面向对象的核心思想,将对象的属性和行为封装起来,不需要让外界知道具体的实现细节,这就是封装思想. 2.继承性:继承性主要是描述 ...

  2. Oracle-3 - :超级适合初学者的入门级笔记--用户权限,set运算符,高级子查询

    上一篇的内容在这里第二篇内容, 用户权限:创建用户,创建角色,使用grant  和 revoke 语句赋予和回收权限,创建数据库联接 创建用户:create user xxx identified b ...

  3. 【Tesseract】Tesseract API在VS 2013中的配置以及调用

    想要在VS中使用Tesseract库,必须使用经过相对应的VS版本编译过的dll以及lib.比如在VS 2013中,就必须使用在VS 2013中编译过的Tesseract库. 这里我给出经过VS 20 ...

  4. Python学习--语句

    一.print 和 import print 打印多个表达式,用逗号隔开 >>> print 'ab','cd' ab cd import import somemodule fro ...

  5. java获取当前上一周、上一月、上一年的时间

    SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); Calendar c = Calend ...

  6. Get started with Google Analytics

    What is Google Analytics Google Analytics is a Google official analytics tool that is primarily used ...

  7. 从java1到java9每个版本都有什么新特性?

    每次出新版本,大家大概都会这么问,"Java X会有什么特性呢?" .在下面的内容里,我总结了至今为止的Java主要发行版中各自引入的新特性,这样做的目的是为了突出各个新特性是在哪 ...

  8. [转载] Thrift-client与spring集成

    转载自http://shift-alt-ctrl.iteye.com/blog/1990030?utm_source=tuicool&utm_medium=referral Thrift-cl ...

  9. tmux frequently asked questions

    tmux frequently asked questions How is tmux different from GNU screen?     tmux and GNU screen have ...

  10. 【转】Entity Framework 5.0系列之自动生成Code First代码

    在前面的文章中我们提到Entity Framework的“Code First”模式也同样可以基于现有数据库进行开发.今天就让我们一起看一下使用Entity Framework Power Tools ...