NLTK的探索

import nltk

import random

from nltk.corpus import movie_reviews

# Pair each review's token list with the category it is filed under.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle so the train/test slices taken later mix the categories.
random.shuffle(documents)

# Case-insensitive frequency distribution over every token in the corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# Use the 3000 most frequent words as features.  FreqDist.keys() is not
# ordered by frequency, so slicing it would select an arbitrary 3000 words;
# most_common() yields (word, count) pairs sorted by descending count.
word_features = [word for word, _count in all_words.most_common(3000)]

def find_features(document):
    """Build a word-presence feature dict for one document.

    Returns a dict that maps every word in the module-level
    ``word_features`` to True when the word occurs in ``document`` and to
    False otherwise.
    """
    present = set(document)  # set membership keeps each lookup cheap
    return {feature: feature in present for feature in word_features}

# Sanity check: print the feature dict computed for one negative review.
print(find_features(movie_reviews.words('neg/cv000_29416.txt')))

# One (feature dict, category) pair per shuffled document.
featuresets = [(find_features(tokens), label) for tokens, label in documents]

# The first 1900 feature sets train the classifier; the remainder test it.
training_set, testing_set = featuresets[:1900], featuresets[1900:]

classifier = nltk.NaiveBayesClassifier.train(training_set)

accuracy_percent = nltk.classify.accuracy(classifier, testing_set) * 100
print("Classifier accuracy percent:", accuracy_percent)

classifier.show_most_informative_features(15)

######################

Most Informative Features

insulting = True neg : pos = 10.6 : 1.0

ludicrous = True neg : pos = 10.1 : 1.0

winslet = True pos : neg = 9.0 : 1.0

detract = True pos : neg = 8.4 : 1.0

breathtaking = True pos : neg = 8.1 : 1.0

silverstone = True neg : pos = 7.6 : 1.0

excruciatingly = True neg : pos = 7.6 : 1.0

warns = True pos : neg = 7.0 : 1.0

tracy = True pos : neg = 7.0 : 1.0

insipid = True neg : pos = 7.0 : 1.0

freddie = True neg : pos = 7.0 : 1.0

damon = True pos : neg = 5.9 : 1.0

debate = True pos : neg = 5.9 : 1.0

ordered = True pos : neg = 5.8 : 1.0

lang = True pos : neg = 5.7 : 1.0

#############################
##保存和恢复模型

import pickle  # imported here because nothing above this snippet imports it

# Persist the trained classifier.  The with-block closes the file even if
# pickling fails part-way through.
with open("naivebayes.pickle", "wb") as save_classifier:
    pickle.dump(classifier, save_classifier)

# Restore the classifier from disk.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files
# that this script wrote itself.
with open("naivebayes.pickle", "rb") as classifier_f:
    classifier = pickle.load(classifier_f)

自定义一个继承自nltk的ClassifierI的投票分类器（VoteClassifier），用多个模型进行集体投票分类评估；参与投票的模型包括nltk的朴素贝叶斯分类器和sklearn的一些分类模型

读取文本并统计出出现频率最高的前3000个词作为特征词，然后对每篇评论标记这3000个词是否在该评论中出现（出现为True，否则为False），以此作为分类器的输入特征

import nltk

import random

from nltk.corpus import movie_reviews

from nltk.classify.scikitlearn import SklearnClassifier

import pickle

from sklearn.naive_bayes import MultinomialNB, BernoulliNB

from sklearn.linear_model import LogisticRegression, SGDClassifier

from sklearn.svm import SVC, LinearSVC, NuSVC

from nltk.classify import ClassifierI

from statistics import mode

##定义VoteClassifier继承于ClassifierI

class VoteClassifier(ClassifierI):
    """Ensemble classifier that labels by the most common vote of its members.

    Every wrapped classifier only needs a ``classify(features)`` method.

    NOTE(review): with an even number of classifiers a tie is possible; the
    outcome is then whatever ``statistics.mode`` does on the running Python
    version -- confirm this is acceptable.
    """

    def __init__(self, *classifiers):
        """Store the classifiers that take part in every vote."""
        self._classifiers = classifiers

    def _collect_votes(self, features):
        """Return the label predicted by each wrapped classifier, in order."""
        return [member.classify(features) for member in self._classifiers]

    def classify(self, features):
        """Return the mode of the labels predicted by the wrapped classifiers."""
        return mode(self._collect_votes(features))

    def confidence(self, features):
        """Return the fraction of classifiers that voted for the winning label."""
        ballots = self._collect_votes(features)
        winner_count = ballots.count(mode(ballots))
        return winner_count / len(ballots)

# Build one (token list, category) pair per review in the corpus.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Randomise the order so the train/test slices below mix the categories.
random.shuffle(documents)

# Count every corpus token, lower-cased so counts are case-insensitive.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# Keep the 3000 most frequent words as features.  Slicing
# list(all_words.keys()) would not do this, because FreqDist keys are not
# ordered by frequency; most_common() returns (word, count) pairs sorted
# from most to least frequent.
word_features = [word for word, _count in all_words.most_common(3000)]

# Flag, for each feature word, whether it appears in the given document.
def find_features(document):
    """Return a {feature word: bool} presence map for ``document``.

    The keys are the words of the module-level ``word_features``; a value is
    True when that word occurs in ``document`` and False otherwise.
    """
    seen = set(document)  # deduplicate once so membership tests are cheap
    return {candidate: candidate in seen for candidate in word_features}

# Debug aid kept from the original script (disabled):
# print(find_features(movie_reviews.words('neg/cv000_29416.txt')))

# One (feature dict, category) pair per document.
featuresets = [(find_features(rev), category) for (rev, category) in documents]

# First 1900 examples for training, the rest for testing.
training_set = featuresets[:1900]
testing_set = featuresets[1900:]

# The Naive Bayes model is loaded from disk instead of being retrained:
# classifier = nltk.NaiveBayesClassifier.train(training_set)
# NOTE(review): the pickled model was presumably fitted on a different random
# shuffle, so its accuracy on this testing_set may be optimistic -- confirm.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# The with-block closes the file even if unpickling raises.
with open("naivebayes.pickle", "rb") as classifier_f:
    classifier = pickle.load(classifier_f)

print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)

classifier.show_most_informative_features(15)

def _train_and_report(label, estimator):
    """Wrap a scikit-learn estimator for NLTK, train it and print its accuracy.

    Trains on the module-level ``training_set``, scores against
    ``testing_set`` and returns the trained SklearnClassifier wrapper.
    """
    wrapped = SklearnClassifier(estimator)
    wrapped.train(training_set)
    print(label + " accuracy percent:", (nltk.classify.accuracy(wrapped, testing_set))*100)
    return wrapped


MNB_classifier = _train_and_report("MNB_classifier", MultinomialNB())

BernoulliNB_classifier = _train_and_report("BernoulliNB_classifier", BernoulliNB())

LogisticRegression_classifier = _train_and_report("LogisticRegression_classifier", LogisticRegression())

SGDClassifier_classifier = _train_and_report("SGDClassifier_classifier", SGDClassifier())

# Left disabled, as in the original script:
# SVC_classifier = _train_and_report("SVC_classifier", SVC())

LinearSVC_classifier = _train_and_report("LinearSVC_classifier", LinearSVC())

NuSVC_classifier = _train_and_report("NuSVC_classifier", NuSVC())

# Combine the seven trained classifiers into one majority-vote ensemble.
voted_classifier = VoteClassifier(
    classifier,
    NuSVC_classifier,
    LinearSVC_classifier,
    SGDClassifier_classifier,
    MNB_classifier,
    BernoulliNB_classifier,
    LogisticRegression_classifier,
)

print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

# Show the ensemble's label and its vote agreement for the first six test
# examples.  A loop replaces six copy-pasted print statements.
for sample_features, _actual_label in testing_set[:6]:
    print("Classification:", voted_classifier.classify(sample_features),
          "Confidence %:", voted_classifier.confidence(sample_features)*100)

####################################
out：

Original Naive Bayes Algo accuracy percent: 66.0

Most Informative Features

                thematic = True              pos : neg    =      9.1 : 1.0

                secondly = True              pos : neg    =      8.5 : 1.0

                narrates = True              pos : neg    =      7.8 : 1.0

                 layered = True              pos : neg    =      7.1 : 1.0

                 rounded = True              pos : neg    =      7.1 : 1.0

                 supreme = True              pos : neg    =      7.1 : 1.0

                  crappy = True              neg : pos    =      6.9 : 1.0

               uplifting = True              pos : neg    =      6.2 : 1.0

                     ugh = True              neg : pos    =      5.3 : 1.0

                 gaining = True              pos : neg    =      5.1 : 1.0

                   mamet = True              pos : neg    =      5.1 : 1.0

                   wanda = True              neg : pos    =      4.9 : 1.0

                   onset = True              neg : pos    =      4.9 : 1.0

               fantastic = True              pos : neg    =      4.5 : 1.0

                   milos = True              pos : neg    =      4.4 : 1.0

MNB_classifier accuracy percent: 67.0

BernoulliNB_classifier accuracy percent: 67.0

LogisticRegression_classifier accuracy percent: 68.0

SGDClassifier_classifier accuracy percent: 57.99999999999999

LinearSVC_classifier accuracy percent: 67.0

NuSVC_classifier accuracy percent: 65.0

voted_classifier accuracy percent: 65.0

Classification: neg Confidence %: 100.0

Classification: pos Confidence %: 57.14285714285714

Classification: neg Confidence %: 57.14285714285714

Classification: neg Confidence %: 57.14285714285714

Classification: pos Confidence %: 57.14285714285714

Classification: pos Confidence %: 85.71428571428571

#########################################

NLTK的探索的更多相关文章

探索 Python、机器学习和 NLTK 库开发一个应用程序，使用 Python、NLTK 和机器学习对 RSS 提要进行分类
挑战:使用机器学习对 RSS 提要进行分类最近,我接到一项任务,要求为客户创建一个 RSS 提要分类子系统.目标是读取几十个甚至几百个 RSS 提要,将它们的许多文章自动分类到几十个预定义的主题领域 ...
NLTK在自然语言处理
nltk-data.zip 本文主要是总结最近学习的论文.书籍相关知识,主要是Natural Language Processing(自然语言处理,简称NLP)和Python挖掘维基百科Infobox ...
NLTK学习笔记(五):分类和标注词汇
目录词性标注器标注语料库表示已经标注的标识符:nltk.tag.str2tuple('word/类型') 读取已经标注的语料库名词.动词.形容词等尝试找出每个名词类型中最频繁的名词探索已经 ...
使用Python中的NLTK和spaCy删除停用词与文本标准化
概述了解如何在Python中删除停用词与文本标准化,这些是自然语言处理的基本技术探索不同的方法来删除停用词,以及讨论文本标准化技术,如词干化(stemming)和词形还原(lemmatizatio ...
【探索】机器指令翻译成 JavaScript
前言前些时候研究脚本混淆时,打算先学一些「程序流程」相关的概念.为了不因太枯燥而放弃,决定想一个有趣的案例,可以边探索边学. 于是想了一个话题:尝试将机器指令 1:1 翻译成 JavaScript ...
【探索】利用 canvas 实现数据压缩
前言 HTTP 支持 GZip 压缩,可节省不少传输资源.但遗憾的是,只有下载才有,上传并不支持.如果上传也能压缩,那就完美了.特别适合大量文本提交的场合,比如博客园,就是很好的例子. 虽然标准不支持 ...
探索C#之6.0语法糖剖析
阅读目录: 自动属性默认初始化自动只读属性默认初始化表达式为主体的函数表达式为主体的属性(赋值) 静态类导入 Null条件运算符字符串格式化索引初始化异常过滤器when catch和fin ...
Mysql事务探索及其在Django中的实践（二）
继上一篇<Mysql事务探索及其在Django中的实践(一)>交代完问题的背景和Mysql事务基础后,这一篇主要想介绍一下事务在Django中的使用以及实际应用给我们带来的效率提升. 首先 ...
Linux学习之探索文件系统
Linux,一起学习进步- ls With it, we can see directory contents and determine a variety of important file ...

随机推荐

python用reduce和map把字符串转为数字的方法
python用reduce和map把字符串转为数字的方法最近在复习高阶函数的时候,有一道题想了半天解不出来.于是上午搜索资料,看了下别人的解法,发现学习编程,思维真的很重要.下面这篇文章就来给大家介 ...
JDK源码阅读--Object
在java.lang包下 Object类:是所有类的基类(父类) public final native Class<?> getClass(); 返回这个Object所代表的的运行时类 ...
Elasticsearch template学习
Elasticsearch template Elasticsearch存在一个关键问题就是索引的设置及字段的属性指定,最常见的问题就是,某个字段我们并不希望ES对其进行分词,但如果使用自动模板创建索 ...
GitHub：如何构建一个股票市场知识图谱？（附代码&链接）
来源:专知本文约 600007 董事⻓/董事高燕女 60 600007 执⾏董事刘永政男 50 600008 董事⻓/董事 ··· ··· ··· ··· ··· 注:建议表头最好用相应的英 ...
Activiti流程定义语言
1.流程(process) bpmn文件一个流程的根元素.一个流程就代表一个工作流. 2.顺序流(sequenceFlow) 顺序流是连接两个流程节点的连线,代表一个节点的出口.流程执行完一个节点后, ...
网络爬虫技术Jsoup
Jsoup介绍:Jsoup 是一个 Java 的开源HTML解析器,可直接解析某个URL地址.HTML文本内容 Jsoup主要有以下功能: 1. 从一个URL,文件或字符串中解析HTML 2. 使用D ...
mysql-connector-java-8.0.12使用时报错
配置url加 &useSSL=false&serverTimezone=UTC 就可以了
廖雪峰Java10加密与安全-5签名算法-2DSA签名算法
DSA DSA:Digital Signature Algorithm,使用ElGamal数字签名算法,和RSA数字签名相比,DSA更快. DSA只能配合SHA使用: SHA1withDSA SHA2 ...
jdk不同版本的垃圾收集器
架构hive2mysql流程
1.分析参数 args = new String[5]; args[0]="d:/3-20.sql"; args[1]="-date"; args[2]=&qu ...

NLTK的探索

NLTK的探索的更多相关文章

随机推荐

热门专题