python文本挖掘模版

import xlrd

import jieba

import sys

import importlib

import os         #python内置的包，用于进行文件目录操作，我们将会用到os.listdir函数

import pickle    #导入cPickle包并且取一个别名pickle #持久化类

import random

import numpy as np

import matplotlib.pyplot as plt

from mpl_toolkits.mplot3d import Axes3D

from pylab import mpl

from sklearn.naive_bayes import MultinomialNB # 导入多项式贝叶斯算法包

from sklearn import svm

from sklearn import metrics

from sklearn.datasets.base import Bunch

from sklearn.feature_extraction.text import TfidfVectorizer

importlib.reload(sys)

# Module-level buffers: the raw sheet rows and the segmented tokens are kept
# here so content and labels can later be turned into vector form.

trainContentdatasave=[] # every token kept from the training texts (appended by buildtrainbunch)

testContentdatasave=[] # every token kept from the test texts (appended by buildtestbunch)

trainContentdata = [] # training content rows, filled by importTrainContentdata

testContentdata = [] # test content rows, filled by importTestContentdata

trainlabeldata = [] # training label rows, filled by importTrainlabeldata

testlabeldata = [] # test label rows, filled by importTestlabeldata

#导入文本描述的训练和测试数据

def importTrainContentdata():
    """Load the training texts from ``20180716_train.xls`` into ``trainContentdata``.

    Reads sheet ``Sheet1`` and, for every row, appends a one-element list
    holding the value of column 0 to the module-level ``trainContentdata``
    list.  Nothing is returned.
    """
    sheet = xlrd.open_workbook('20180716_train.xls').sheet_by_name("Sheet1")
    # Rows are kept as single-item lists ([cell_value]).
    trainContentdata.extend(
        [sheet.cell(row_idx, 0).value] for row_idx in range(sheet.nrows)
    )

def importTestContentdata():
    """Load the test texts from ``20180716_test.xls`` into ``testContentdata``.

    Reads sheet ``Sheet1`` and, for every row, appends a one-element list
    holding the value of column 0 to the module-level ``testContentdata``
    list.  Nothing is returned.
    """
    sheet = xlrd.open_workbook('20180716_test.xls').sheet_by_name("Sheet1")
    # Rows are kept as single-item lists ([cell_value]).
    testContentdata.extend(
        [sheet.cell(row_idx, 0).value] for row_idx in range(sheet.nrows)
    )

#导入类别的训练和测试数据

def importTrainlabeldata():
    """Load the training labels from ``20180716_train_label.xls``.

    Reads sheet ``Sheet1`` and, for every row, appends a one-element list
    holding the value of column 0 to the module-level ``trainlabeldata``
    list.  Nothing is returned.
    """
    sheet = xlrd.open_workbook('20180716_train_label.xls').sheet_by_name("Sheet1")
    # Rows are kept as single-item lists ([cell_value]).
    trainlabeldata.extend(
        [sheet.cell(row_idx, 0).value] for row_idx in range(sheet.nrows)
    )

def importTestlabeldata():
    """Load the test labels from ``20180716_test_label.xls``.

    Reads sheet ``Sheet1`` and, for every row, appends a one-element list
    holding the value of column 0 to the module-level ``testlabeldata``
    list.  Nothing is returned.
    """
    sheet = xlrd.open_workbook('20180716_test_label.xls').sheet_by_name("Sheet1")
    # Rows are kept as single-item lists ([cell_value]).
    testlabeldata.extend(
        [sheet.cell(row_idx, 0).value] for row_idx in range(sheet.nrows)
    )

"""

def importClassSet():

    file = 'ClassSet.xls'

    wb = xlrd.open_workbook(file)

    ws = wb.sheet_by_name("Sheet1")

    for r in range(ws.nrows):

        col = []

        for c in range(ws.ncols):

            col.append(ws.cell(r, c).value)

        ClassSet.append(col)

"""

def buildtrainbunch(bunch_path):
    """Segment the training texts with jieba and pickle them as a Bunch.

    Reads the module-level ``trainlabeldata`` and ``trainContentdata`` lists.
    Each text is stripped of CRLF pairs and spaces, cut with ``jieba.cut``,
    and only tokens longer than one character are kept.  The kept tokens are
    also appended to the module-level ``trainContentdatasave`` list.

    The resulting Bunch has two fields:
      * ``label``    - the rows of ``trainlabeldata``, unchanged;
      * ``contents`` - one string per text in which every kept token is
        preceded by a comma (e.g. ``",word1,word2"``).

    Args:
        bunch_path: File path the pickled Bunch is written to.
    """
    bunch = Bunch(label=[], contents=[])
    bunch.label.extend(trainlabeldata)

    for row in trainContentdata:
        # Rows are one-element lists ([cell_value]).  Segment the cell itself:
        # str() of the whole list yields its repr, in which real line breaks
        # appear as the literal characters backslash-r-backslash-n, so the
        # CRLF strip below never matched and stray letters were glued onto
        # neighbouring words.
        cell = row[0] if isinstance(row, (list, tuple)) and row else row
        text = str(cell).replace("\r\n", "").replace(" ", "")

        kept = [tok for tok in jieba.cut(text) if len(tok) > 1 and tok != '\r\n']
        trainContentdatasave.extend(kept)
        # Stored format is unchanged: each kept token is prefixed by a comma.
        bunch.contents.append("".join("," + tok for tok in kept))

    with open(bunch_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)

    print("构建训练数据文本对象结束！！！")

def buildtestbunch(bunch_path):
    """Segment the test texts with jieba and pickle them as a Bunch.

    Reads the module-level ``testlabeldata`` and ``testContentdata`` lists.
    Each text is stripped of CRLF pairs and spaces, cut with ``jieba.cut``,
    and only tokens longer than one character are kept.  The kept tokens are
    also appended to the module-level ``testContentdatasave`` list.

    The resulting Bunch has two fields:
      * ``label``    - the rows of ``testlabeldata``, unchanged;
      * ``contents`` - one string per text in which every kept token is
        preceded by a comma (e.g. ``",word1,word2"``).

    Args:
        bunch_path: File path the pickled Bunch is written to.
    """
    bunch = Bunch(label=[], contents=[])
    bunch.label.extend(testlabeldata)

    for row in testContentdata:
        # Rows are one-element lists ([cell_value]).  Segment the cell itself:
        # str() of the whole list yields its repr, in which real line breaks
        # appear as the literal characters backslash-r-backslash-n, so the
        # CRLF strip below never matched and stray letters were glued onto
        # neighbouring words.
        cell = row[0] if isinstance(row, (list, tuple)) and row else row
        text = str(cell).replace("\r\n", "").replace(" ", "")

        kept = [tok for tok in jieba.cut(text) if len(tok) > 1 and tok != '\r\n']
        testContentdatasave.extend(kept)
        # Stored format is unchanged: each kept token is prefixed by a comma.
        bunch.contents.append("".join("," + tok for tok in kept))

    with open(bunch_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)

    print("构建测试数据文本对象结束！！！")

#读取停用词

def _readfile(path):
    """Return the complete content of the file at ``path`` as raw bytes."""
    with open(path, mode="rb") as stream:
        return stream.read()

# 读取bunch对象

def _readbunchobj(path):
    """Unpickle and return the object stored in the file at ``path``.

    NOTE: ``pickle.load`` can execute arbitrary code; only point this at
    files this pipeline wrote itself.
    """
    with open(path, mode="rb") as stream:
        return pickle.load(stream)

# 写入bunch对象

def _writebunchobj(path, bunchobj):
    """Pickle ``bunchobj`` into the file at ``path``, overwriting any existing file."""
    with open(path, mode="wb") as sink:
        pickle.dump(bunchobj, sink)

def vector_space(stopword_path,bunch_path,space_path):
    """Vectorise the segmented training texts and pickle the result.

    Loads the Bunch written by the segmentation step, fits a
    ``TfidfVectorizer`` on ``bunch.contents`` and stores a new Bunch with:
      * ``label``      - copied from the input Bunch;
      * ``tdm``        - the document-term weight matrix: ``tdm[i, j]`` is the
        weight of vocabulary term ``j`` in document ``i``.  Because
        ``use_idf=False`` and ``sublinear_tf=True`` the weights are sublinear
        term frequencies (1 + log(tf)), not full TF-IDF values;
      * ``vocabulary`` - the fitted term -> column-index mapping.

    Args:
        stopword_path: Text file with one stop word per line.
        bunch_path: Pickled Bunch holding ``label`` and ``contents``.
        space_path: Output path for the pickled vector-space Bunch.
    """
    # The stop-word file is read as bytes; decode it so the entries are str
    # like the tokens the vectorizer produces.  A list of bytes never matches
    # any token, which silently disables stop-word filtering.
    raw_stopwords = _readfile(stopword_path)
    if isinstance(raw_stopwords, bytes):
        try:
            raw_stopwords = raw_stopwords.decode("utf-8-sig")
        except UnicodeDecodeError:
            # NOTE(review): assumes a non-UTF-8 list is GBK/GB18030 - confirm
            # against the actual stop-word file.
            raw_stopwords = raw_stopwords.decode("gb18030", errors="ignore")
    stpwrdlst = [word.strip() for word in raw_stopwords.splitlines() if word.strip()]

    # Segmented documents produced by the bunch-building step.
    bunch = _readbunchobj(bunch_path)

    tfidfspace = Bunch(label=bunch.label, tdm=[], vocabulary={})

    vectorizer = TfidfVectorizer(
        stop_words=stpwrdlst,
        sublinear_tf=True,
        max_df=0.5,
        min_df=0.0001,
        use_idf=False,
        max_features=10000,
    )

    # Fit on the training documents and keep the learned vocabulary so the
    # test set can be projected onto the same columns.
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    tfidfspace.vocabulary = vectorizer.vocabulary_

    # Persist the vector space.
    _writebunchobj(space_path, tfidfspace)

    print("if-idf词向量空间实例创建成功！！！")

def testvector_space(stopword_path,bunch_path,space_path,train_tfidf_path):
    """Project the segmented test texts onto the training vocabulary.

    Loads the test Bunch and the pickled training vector space, vectorises
    ``bunch.contents`` with the training vocabulary and stores a new Bunch:
      * ``label``      - copied from the test Bunch;
      * ``tdm``        - the document-term weight matrix of the test texts;
      * ``vocabulary`` - the training term -> column-index mapping, i.e. the
        axes of the shared vector space (e.g. ``{"a": 0, "b": 1}`` means term
        "a" is column 0 and term "b" is column 1).

    Args:
        stopword_path: Kept for interface compatibility; not read.  With a
            fixed vocabulary and unigram features, tokens outside the
            vocabulary are ignored anyway, so stop-word handling is fully
            decided by how the training vocabulary was built.
        bunch_path: Pickled test Bunch holding ``label`` and ``contents``.
        space_path: Output path for the pickled test vector-space Bunch.
        train_tfidf_path: Pickled training vector space providing
            ``vocabulary``.
    """
    bunch = _readbunchobj(bunch_path)

    # Training vector space: its vocabulary defines the feature columns.
    trainbunch = _readbunchobj(train_tfidf_path)

    tfidfspace = Bunch(label=bunch.label, tdm=[], vocabulary={})
    tfidfspace.vocabulary = trainbunch.vocabulary

    # The weighting must mirror the training vectorizer (sublinear tf, no
    # idf).  Leaving use_idf at its default would learn idf weights from the
    # test set and scale the test columns differently from the training
    # columns.  max_df / min_df are omitted because they have no effect once
    # a vocabulary is supplied.
    # NOTE(review): if the training vectorizer is ever switched to
    # use_idf=True, its fitted idf must be persisted and reused here.
    vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        use_idf=False,
        vocabulary=trainbunch.vocabulary,
    )

    # With a fixed vocabulary and use_idf=False nothing is learned from the
    # test data; fit_transform only counts and normalises.
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)

    _writebunchobj(space_path, tfidfspace)

    print("if-idf词向量空间实例创建成功！！！")

def metrics_result(actual, predict):
    """Print weighted precision, recall and F1 for a set of predictions.

    Each score is computed with ``average='weighted'`` and restricted to the
    labels that occur in ``predict`` (``labels=np.unique(predict)``).

    Args:
        actual: True labels.
        predict: Predicted labels.
    """
    # Precision and recall pull against each other: ideally both are high,
    # but in practice raising one usually lowers the other.
    report = (
        ('精度:{0:.3f}', metrics.precision_score),
        ('召回:{0:0.3f}', metrics.recall_score),
        ('f1-score:{0:.3f}', metrics.f1_score),
    )
    for template, scorer in report:
        predicted_labels = np.unique(predict)
        score = scorer(actual, predict, average='weighted', labels=predicted_labels)
        print(template.format(score))

if __name__=="__main__":
    # End-to-end run: load the sheets, segment, vectorise, train, predict,
    # write the predictions to disk and print the metrics.

    importTrainContentdata()
    importTestContentdata()
    importTrainlabeldata()
    importTestlabeldata()

    # Locations of the pickled intermediate objects and the stop-word list.
    train_bunch_path ="F:/goverment/ArticleMining/trainbunch.bat"
    test_bunch_path ="F:/goverment/ArticleMining/testbunch.bat"
    stopword_path ="F:/goverment/ArticleMining/hlt_stop_words.txt"
    train_space_path = "F:/goverment/ArticleMining/traintfdifspace.dat"
    test_space_path = "F:/goverment/ArticleMining/testtfdifspace.dat"

    # Segment both data sets, then build the two vector spaces.
    buildtrainbunch(train_bunch_path)
    buildtestbunch(test_bunch_path)
    vector_space(stopword_path,train_bunch_path,train_space_path)
    testvector_space(stopword_path,test_bunch_path,test_space_path,train_space_path)

    # Reload the vectorised training and test sets.
    train_set=_readbunchobj(train_space_path)
    test_set=_readbunchobj(test_space_path)

    print(train_set.tdm)

    # Author's tuning notes from earlier experiments (all currently disabled):
    #   - a brute-force scan for the largest entry of train_set.tdm;
    #   - MultinomialNB(alpha=0.052)  (note: "alpha 0.001; the smaller alpha,
    #     the more iterations and the higher the precision");
    #   - svm.SVC, low recall / F1: C=0.75 rbf -> 0.59, C=0.8 rbf -> 0.578;
    #   - svm.SVC C=0.75 poly gamma=10: precision 0.665, recall 0.330,
    #     f1-score 0.416;
    #   - svm.SVC(C=0.7, kernel='poly', gamma=10, decision_function_shape='ovr'):
    #     recall 0.331, f1-score 0.417;
    #   - LogisticRegression(C=3);
    #   - KNeighborsClassifier(n_neighbors=9).

    # Labels are stored as one-element rows ([value]); flatten them so the
    # classifier receives a 1-D target instead of a column of lists.
    train_labels = [row[0] for row in train_set.label]
    test_labels = [row[0] for row in test_set.label]

    # Only the svm module is imported at the top of the file, so the
    # classifier has to be referenced as svm.SVC (a bare SVC is undefined).
    clf = svm.SVC(C=1500)
    clf.fit(train_set.tdm, train_labels)
    predicted=clf.predict(test_set.tdm)
    print(clf.score(test_set.tdm, test_labels))

    # Integer views of the true (a) and predicted (b) labels for reporting.
    a = [int(label) for label in test_labels]
    b = [int(float(value)) for value in predicted]

    # One prediction per line, followed by the completion marker; the
    # context manager closes the file even if a write fails.
    with open('F:/goverment/ArticleMining/predict.txt', 'w') as f:
        f.writelines(str(value) + '\n' for value in b)
        f.write("写好了")

    metrics_result(a, b)

python文本挖掘模版的更多相关文章

设置PyCharm中的Python代码模版
在MacOs运行的PyCharm中,执行python文件,如果不指定python文件字符编码会报错: SyntaxError: Non-ASCII character , but no encodin ...
Python设计模式——模版方法模式
1.模版方法模式做题的列子: 需求:有两个学生,要回答问题,写出自己的答案 #encoding=utf-8 __author__ = 'kevinlu1010@qq.com' class Stude ...
python接口测试模版
"""Test case implementation""" import sys import functools import diff ...
转：Python 文本挖掘：使用gensim进行文本相似度计算
Python使用gensim进行文本相似度计算转于:http://rzcoding.blog.163.com/blog/static/2222810172013101895642665/ 在文本处理 ...
Python学习---模版/包的概念
1.1. 模块/包的概念在Python中,一个.py文件就称之为一个模块(Module) 模块一共三种: python标准库第三方模块应用程序自定义模块模块的使用:模块是用来组织函数的解释器 ...
大佬整理出来的干货：LDA模型实现—Python文本挖掘
前言本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理. PS:如有需要Python学习资料的小伙伴可以加点击下方链接自行获取htt ...
python文本挖掘输出权重，词频等信息，画出3d权重图
# -*- coding: utf-8 -*- from pandas import read_csv import numpy as np from sklearn.datasets.base im ...
Python 文本挖掘：使用情感词典进行情感分析（算法及程序设计）
出处:http://www.ithao123.cn/content-242299.html 情感分析就是分析一句话说得是很主观还是客观描述,分析这句话表达的是积极的情绪还是消极的情绪. 原理比如 ...
python tkinter模版
import tkinter import time import threading from tkinter import ttk event = threading.Event() once=0 ...

随机推荐

转载-lvs官方文档-LVS集群中的IP负载均衡技术
章文嵩(wensong@linux-vs.org) 2002 年 4 月本文在分析服务器集群实现虚拟网络服务的相关技术上,详细描述了LVS集群中实现的三种IP负载均衡技术(VS/NAT.VS/TUN ...
linux系统挂载NTFS移动硬盘
有时候做大数据量迁移时,为了快速迁移大数据,有可能在Linux服务器上临时挂载NTFS格式的移动硬盘, 一般情况下,Linux是识别不了NTFS格式移动硬盘的(需要重编译Linux核心才能,加挂NTF ...
跟着小程学微服务-Mock自动化系统的原理及实现
一.前言在之前的文章 http://blog.csdn.net/u013970991/article/details/54862772 中已经介绍了"自动化Mock系统0.9版本" ...
Node 抓取非utf-8编码页面
代码示例 Nodejs抓取非utf8字符编码的页面 -- Ruby's Louvre var http = require('http'); var iconv = require('iconv-li ...
[Python] RuntimeError: Invalid DISPLAY variable
1．问题:在本地用matplotlib绘图可以,但是在ssh远程绘图的时候会报错 RuntimeError: Invalid DISPLAY variable 2．原因:matplotlib的默认ba ...
HDU 1374
http://acm.hdu.edu.cn/showproblem.php?pid=1374 已知三点坐标,求三点确定的圆的周长 #include <iostream> #include ...
Objective C － 2 - 随机数，可变字符串，字符串，SubString
int main(int argc, const char * argv[]) { @autoreleasepool { NSString *outputString = @"1234567 ...
关于python机器学习常用算法的例子
Home Installation Documentation Examples Previous An introduction ... This documentation is for ...
C：源文件编译过程
可以大致概括为3个阶段: 源文件 → 汇编代码(文本) 汇编代码 → 机器语言(二进制) 各个目标文件的处理详细过程: 预编译处理 Pre-processing(*.c/ *.cpp → *.i) ...
LOJ2360. 「NOIP2016」换教室【概率DP】【Floyed】【傻逼题】
LINK 思路先floyed出两点最短路然后就可以直接\(dp_{i,j,0/1}\)表示前i节课选择换j节,换不换当前这一节的最小贡献直接可以枚举上一次决策的状态计算概率进行统计就可以了我变 ...

python文本挖掘模版

python文本挖掘模版的更多相关文章

随机推荐

热门专题