doc2vec使用笔记

#!/usr/bin/env python

# coding:utf-8

#import依赖包

# import sys

# reload(sys)

# sys.setdefaultencoding('utf-8')

import chardet

from gensim import utils

from gensim.models.doc2vec import LabeledSentence

from gensim.models import Doc2Vec

import numpy

from random import shuffle

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix

import sklearn.metrics as metrics

# Doc2vec需要以LabeledSentence对象作为输入，所以需要构建一个类（LabeledLineSentence）将文本转化为LabeledSentence对象

class LabeledLineSentence(object):
    """Turn line-oriented corpus files into gensim ``LabeledSentence`` objects.

    ``sources`` maps a file path to a tag prefix.  Every line of a file is
    treated as one document, split on whitespace; line ``n`` (0-based) of a
    file is tagged ``<prefix>_<n>``.
    """

    def __init__(self, sources):
        """Remember ``sources`` and reject duplicated prefixes.

        Args:
            sources: dict mapping corpus file path -> tag prefix.

        Raises:
            Exception: if two files share the same prefix, because their
                document tags would collide.
        """
        self.sources = sources
        # None means to_array() has not materialised the corpus yet.
        self.sentences = None

        # make sure that prefixes are unique
        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise Exception('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def _labeled_lines(self, encoding, errors):
        """Yield one ``LabeledSentence`` per line of every source file.

        Lines are decoded with ``encoding``/``errors`` before being split on
        whitespace.
        """
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    text = utils.to_unicode(line, encoding=encoding, errors=errors)
                    yield LabeledSentence(text.split(), [prefix + '_%s' % item_no])

    def __iter__(self):
        """Stream the corpus lazily, decoding each line as strict UTF-8.

        NOTE(review): to_array() decodes the same files as GB2312 instead;
        confirm which encoding the corpus really uses before iterating.
        """
        return self._labeled_lines(encoding='utf8', errors='strict')

    def to_array(self):
        """Read the whole corpus into ``self.sentences`` and return that list.

        Lines are decoded as GB2312; bytes that cannot be decoded are dropped.
        """
        self.sentences = list(self._labeled_lines(encoding='GB2312', errors='ignore'))
        return self.sentences

    def sentences_perm(self):
        """Shuffle ``self.sentences`` in place and return it.

        The corpus is loaded first if to_array() has not been called yet.
        """
        if self.sentences is None:
            self.to_array()
        shuffle(self.sentences)
        return self.sentences

# Register the corpus files with Doc2vec: each path maps to the tag prefix
# given to the documents (lines) read from that file.

# sources = {u'/Volumes/Macintosh HD/Users/RayChou/Downloads/情感分析训练语料/neg_train.txt':'TRAIN_NEG',
# u'/Volumes/Macintosh HD/Users/RayChou/Downloads/情感分析训练语料/pos_train.txt':'TRAIN_POS'
# ,u'/Volumes/Macintosh HD/Users/RayChou/Downloads/情感分析训练语料/uns_train.txt':'TRAIN_UNS',
# u'/Volumes/Macintosh HD/Users/RayChou/Downloads/情感分析训练语料/uns_test.txt':'TEST_UNS'}

sources = {
    './yuliao/fYuliao0.txt': 'TRAIN_0',
    './yuliao/fYuliao1.txt': 'TRAIN_1',
    './yuliao/fYuliao2.txt': 'TRAIN_2',
    './yuliao/fYuliao3.txt': 'TRAIN_3',
    './yuliao/fYuliao4.txt': 'TRAIN_4',
    './yuliao/fYuliao5.txt': 'TRAIN_5',
}

sentences = LabeledLineSentence(sources)

# Build the Doc2vec model and its vocabulary from the fully loaded corpus.
model = Doc2Vec(
    min_count=1,
    window=15,
    size=100,
    sample=1e-4,
    negative=5,
    workers=8,
)
model.build_vocab(sentences.to_array())

# Train the model, reshuffling the documents before every pass.  This example
# runs 2 passes; use more if time allows.
# NOTE(review): newer gensim releases may require extra arguments to train()
# (e.g. total_examples/epochs) -- confirm against the installed version.
for epoch in range(2):
    shuffled_sentences = sentences.sentences_perm()
    model.train(shuffled_sentences)

model.save("model.txt")

# model=Doc2Vec.load("model.txt")

# Copy the trained document vectors into arrays that later feed the classifier.
train_arrays = numpy.zeros((5000, 100))
train_labels = numpy.zeros(5000)

test_arrays = []
true_labels = []

train_data = []
train_lb = []

# (tag prefix, class label, first row, last row) for each training file.
# Rows are inclusive; the document index inside a file is the row's offset
# from that file's first row.
_train_row_ranges = (
    ('TRAIN_0_', 0, 0, 645),
    ('TRAIN_1_', 1, 646, 4249),
    ('TRAIN_2_', 2, 4250, 4800),
    ('TRAIN_3_', 3, 4801, 4965),
    ('TRAIN_4_', 4, 4966, 4994),
    ('TRAIN_5_', 5, 4995, 4999),
)

for tag_prefix, label, first_row, last_row in _train_row_ranges:
    for row in range(first_row, last_row + 1):
        doc_tag = tag_prefix + str(row - first_row)
        train_arrays[row] = model.docvecs[doc_tag]
        train_labels[row] = label

# Load the test-set data: one held-out file per class; the position in this
# list is the class label.
test_files = [
    "./yuliao/fYuliao0_test.txt",
    "./yuliao/fYuliao1_test.txt",
    "./yuliao/fYuliao2_test.txt",
    "./yuliao/fYuliao3_test.txt",
    "./yuliao/fYuliao4_test.txt",
    "./yuliao/fYuliao5_test.txt",
]


def _read_test_docs(path):
    """Return one whitespace-split token list per line of ``path``.

    The file is closed as soon as it has been read.  Lines are decoded the
    same way LabeledLineSentence.to_array() decodes the training corpus
    (GB2312, undecodable bytes dropped), so that inference sees the same kind
    of tokens the model was trained on.
    Assumes the test files share the training files' encoding -- TODO confirm.
    """
    with utils.smart_open(path) as fin:
        return [
            utils.to_unicode(line, encoding='GB2312', errors='ignore').split()
            for line in fin
        ]


test_docs_by_label = [_read_test_docs(path) for path in test_files]

# Sanity check: infer a vector for the first class-0 test document and print
# the three most similar training documents.
if test_docs_by_label[0]:
    inferred_docvec = model.infer_vector(test_docs_by_label[0][0])
    print(model.docvecs.most_similar([inferred_docvec], topn=3))

# infer_vector() takes a list of tokens, so each document is passed as its
# token list rather than as the raw line string.
for label, docs in enumerate(test_docs_by_label):
    for doc_words in docs:
        test_arrays.append(model.infer_vector(doc_words))
        true_labels.append(label)

# Build and fit the logistic-regression classifier.
# NOTE(review): class_weight only names classes 0 and 1 although the labels
# run from 0 to 5 -- presumably a leftover from a two-class setup; confirm the
# intended weights.
classifier = LogisticRegression(class_weight={0:0.38,1:0.62})
classifier.fit(train_arrays, train_labels)

# Alternative: random-forest classifier (disabled).
# from sklearn.ensemble import RandomForestClassifier
# RF = RandomForestClassifier(n_estimators=1200,max_depth=14,class_weight={0:0.3,1:0.7})
# RF.fit(train_arrays, train_labels)

# Alternative: GBDT classifier (disabled).
# from sklearn.ensemble import GradientBoostingClassifier
# GBDT = GradientBoostingClassifier(n_estimators=1000,max_depth=14)
# GBDT.fit(train_arrays, train_labels)

# Predict the test data.  predict() works on a 2-D (n_samples, n_features)
# input, so the whole test set is classified in a single call instead of one
# 1-D vector at a time.
test_labels_LR = list(classifier.predict(test_arrays)) if test_arrays else []
# test_labels_RF = list(RF.predict(test_arrays))
# test_labels_GBDT = list(GBDT.predict(test_arrays))

# Report the number of correctly classified test documents.
print("LR:")
count = sum(
    1
    for predicted, expected in zip(test_labels_LR, true_labels)
    if predicted == expected
)
print(count)

# Reports for the disabled alternative classifiers.
# print("RF:")
# print(metrics.accuracy_score(test_labels_RF,true_labels))
# print(confusion_matrix(test_labels_RF,true_labels))
# print("GBDT:")
# print(metrics.accuracy_score(test_labels_GBDT,true_labels))
# print(confusion_matrix(test_labels_GBDT,true_labels))

doc2vec使用笔记的更多相关文章

Use of Deep Learning in Modern Recommendation System: A Summary of Recent Works（笔记）
注意:论文中,很多的地方出现baseline,可以理解为参照物的意思,但是在论文中,我们还是直接将它称之为基线,也就是对照物,参照物. 这篇论文中,作者没有去做实际的实验,但是却做了一件很有意义的事 ...
人工智能头条（公开课笔记）+AI科技大本营——一拨微信公众号文章
不错的 Tutorial: 从零到一学习计算机视觉:朋友圈爆款背后的计算机视觉技术与应用 | 公开课笔记分享人 | 叶聪(腾讯云 AI 和大数据中心高级研发工程师) 整理 | Leo 出 ...
git-简单流程（学习笔记）
这是阅读廖雪峰的官方网站的笔记,用于自己以后回看 1.进入项目文件夹初始化一个Git仓库,使用git init命令. 添加文件到Git仓库,分两步: 第一步,使用命令git add <file ...
js学习笔记：webpack基础入门（一）
之前听说过webpack,今天想正式的接触一下,先跟着webpack的官方用户指南走: 在这里有: 如何安装webpack 如何使用webpack 如何使用loader 如何使用webpack的开发者 ...
SQL Server技术内幕笔记合集
SQL Server技术内幕笔记合集发这一篇文章主要是方便大家找到我的笔记入口,方便大家o(∩_∩)o Microsoft SQL Server 6.5 技术内幕笔记http://www.cnbl ...
PHP-自定义模板-学习笔记
1. 开始这几天,看了李炎恢老师的<PHP第二季度视频>中的“章节7:创建TPL自定义模板”,做一个学习笔记,通过绘制架构图.UML类图和思维导图,来对加深理解. 2. 整体架构图 ...
PHP-会员登录与注册例子解析-学习笔记
1.开始最近开始学习李炎恢老师的<PHP第二季度视频>中的“章节5:使用OOP注册会员”,做一个学习笔记,通过绘制基本页面流程和UML类图,来对加深理解. 2.基本页面流程 3.通过UM ...
NET Core-学习笔记（三）
这里将要和大家分享的是学习总结第三篇:首先感慨一下这周跟随netcore官网学习是遇到的一些问题: a.官网的英文版教程使用的部分nuget包和我当时安装的最新包版本不一致,所以没法按照教材上给出的列 ...
springMVC学习笔记--知识点总结1
以下是学习springmvc框架时的笔记整理: 结果跳转方式 1.设置ModelAndView,根据view的名称,和视图渲染器跳转到指定的页面. 比如jsp的视图渲染器是如下配置的: <!-- ...

随机推荐

由于使用JDBC ResultSet的滚动功能而导致的内存溢出
前天一去公司,老大说,服务器全挂了! 最后排查了半天,结论是内存溢出! 在WAS的DUMP日志中,看得我头晕眼花,终于找到了罪魁祸首,原来是有同事写代码的时候使用了可滚动的结果集导致内存溢出. 什么是 ...
magento2 重置后台密码
项目根目录:运行如下命令 bin/magento admin:user:create --admin-user="admin" --admin-password="123 ...
git五分钟教程
使用Git前,需要先建立一个仓库(repository).您可以使用一个已经存在的目录作为Git仓库或创建一个空目录. 使用您当前目录作为Git仓库,我们只需使它初始化. git init 使用我们指 ...
css实现梯形
使用伪元素before和after分别在矩形元素前后加三角形或者直接设置border 使用3d旋转矩形,使之看起来像矩形 <html> <head> <meta char ...
统计过程控制与评价 Cpk、SPC、PPM
Cpk(Process capability index)--工序能力指数 SPC(Statistical Process Control)--工艺过程统计受控状态分析 PPM(Parts Per Mi ...
【C#】#102 发送邮件
项目需求:定时的发送邮件,于是学习了如何发送邮件下面有一个简单的例子.能够实现简单的发送邮件,加上附件可以添加一个属性[Attachment],然后配置上附件的路径 Demo下载代码总共只有一下这 ...
JQuery $.axaj的基本格式
总是忘了,保存以备后用. $.ajax({ url: '', //请求的url地址 dataType: "json", //返回的格式为json async: true, //请求 ...
SOJ 4583 动态规划之分组背包
Description Sidney想去Gandtom家玩.但Sidney家和Gandtom家之间是高低不平.坑坑洼洼的土路.所以他需要用他的背包装几袋稀的泥,在路上铺平一些干的土,使路变成平整的泥土 ...
python第十五课——全局变量and局部变量
全局变量&局部变量: 全局变量的特点: 1).直接定义在.py文件中(函数外)的变量(全局位置) 2).作用域比较大,可以被此文件中的任何函数所使用局部变量的特点:1).定义在函数内部(函数 ...
Windows7下配置JMeter安装环境
JMeter配置安装 1.安装JDK环境下载地址:http://www.Oracle.com/technetwork/Java/javase/downloads/jdk8-downloads-21 ...

doc2vec使用笔记

doc2vec使用笔记的更多相关文章

随机推荐

热门专题