import nltk
import random
from nltk.corpus import movie_reviews documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)] random.shuffle(documents) all_words = [] for w in movie_reviews.words():
all_words.append(w.lower()) all_words = nltk.FreqDist(all_words) word_features = list(all_words.keys())[:3000] def find_features(document):
words = set(document)
features = {}
for w in word_features:
features[w] = (w in words) return features print((find_features(movie_reviews.words('neg/cv000_29416.txt')))) featuresets = [(find_features(rev), category) for (rev, category) in documents] # set that we'll train our classifier with
training_set = featuresets[:1900] # set that we'll test against.
testing_set = featuresets[1900:] classifier = nltk.NaiveBayesClassifier.train(training_set) print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100) classifier.show_most_informative_features(15) ######################
Most Informative Features
insulting = True neg : pos = 10.6 : 1.0
ludicrous = True neg : pos = 10.1 : 1.0
winslet = True pos : neg = 9.0 : 1.0
detract = True pos : neg = 8.4 : 1.0
breathtaking = True pos : neg = 8.1 : 1.0
silverstone = True neg : pos = 7.6 : 1.0
excruciatingly = True neg : pos = 7.6 : 1.0
warns = True pos : neg = 7.0 : 1.0
tracy = True pos : neg = 7.0 : 1.0
insipid = True neg : pos = 7.0 : 1.0
freddie = True neg : pos = 7.0 : 1.0
damon = True pos : neg = 5.9 : 1.0
debate = True pos : neg = 5.9 : 1.0
ordered = True pos : neg = 5.8 : 1.0
lang = True pos : neg = 5.7 : 1.0 #############################
##保存和恢复模型
save_classifier = open("naivebayes.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close() classifier_f = open("naivebayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

使用nltk自带的继承于ClassifierI的投票器进行集体分类评估,模型包括nltk的classifier和sklearn的一些分类模型

读取文本并统计出前3000的频繁词汇,然后标记这3000个词的好坏,具体判断标准看这3000词是否是事先有好坏标记的词袋里的词

import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC from nltk.classify import ClassifierI
from statistics import mode ##定义VoteClassifier继承于ClassifierI
class VoteClassifier(ClassifierI):
def __init__(self, *classifiers):
self._classifiers = classifiers

##返回众数,即投票最多的项
def classify(self, features):
votes = []
for c in self._classifiers:
v = c.classify(features)
votes.append(v)
return mode(votes)

##定义置信区间
def confidence(self, features):
votes = []
for c in self._classifiers:
v = c.classify(features)
votes.append(v) choice_votes = votes.count(mode(votes))
conf = choice_votes / len(votes)
return conf documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)] random.shuffle(documents) all_words = [] for w in movie_reviews.words():
all_words.append(w.lower()) all_words = nltk.FreqDist(all_words)
##取出现最多的前3000个词
word_features = list(all_words.keys())[:3000]
##标记词的好坏
def find_features(document):
words = set(document)
features = {}
for w in word_features:
features[w] = (w in words) return features #print((find_features(movie_reviews.words('neg/cv000_29416.txt')))) featuresets = [(find_features(rev), category) for (rev, category) in documents] training_set = featuresets[:1900]
testing_set = featuresets[1900:] #classifier = nltk.NaiveBayesClassifier.train(training_set) classifier_f = open("naivebayes.pickle","rb")
classifier = pickle.load(classifier_f)
classifier_f.close() print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15) MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100) BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100) LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100) SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100) ##SVC_classifier = SklearnClassifier(SVC())
##SVC_classifier.train(training_set)
##print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100) LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100) NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100) voted_classifier = VoteClassifier(classifier,
NuSVC_classifier,
LinearSVC_classifier,
SGDClassifier_classifier,
MNB_classifier,
BernoulliNB_classifier,
LogisticRegression_classifier) print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100) print("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:",voted_classifier.confidence(testing_set[0][0])*100)
print("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:",voted_classifier.confidence(testing_set[1][0])*100)
print("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:",voted_classifier.confidence(testing_set[2][0])*100)
print("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:",voted_classifier.confidence(testing_set[3][0])*100)
print("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:",voted_classifier.confidence(testing_set[4][0])*100)
print("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:",voted_classifier.confidence(testing_set[5][0])*100) ####################################
out: Original Naive Bayes Algo accuracy percent: 66.0
Most Informative Features
thematic = True pos : neg = 9.1 : 1.0
secondly = True pos : neg = 8.5 : 1.0
narrates = True pos : neg = 7.8 : 1.0
layered = True pos : neg = 7.1 : 1.0
rounded = True pos : neg = 7.1 : 1.0
supreme = True pos : neg = 7.1 : 1.0
crappy = True neg : pos = 6.9 : 1.0
uplifting = True pos : neg = 6.2 : 1.0
ugh = True neg : pos = 5.3 : 1.0
gaining = True pos : neg = 5.1 : 1.0
mamet = True pos : neg = 5.1 : 1.0
wanda = True neg : pos = 4.9 : 1.0
onset = True neg : pos = 4.9 : 1.0
fantastic = True pos : neg = 4.5 : 1.0
milos = True pos : neg = 4.4 : 1.0
MNB_classifier accuracy percent: 67.0
BernoulliNB_classifier accuracy percent: 67.0
LogisticRegression_classifier accuracy percent: 68.0
SGDClassifier_classifier accuracy percent: 57.99999999999999
LinearSVC_classifier accuracy percent: 67.0
NuSVC_classifier accuracy percent: 65.0
voted_classifier accuracy percent: 65.0
Classification: neg Confidence %: 100.0
Classification: pos Confidence %: 57.14285714285714
Classification: neg Confidence %: 57.14285714285714
Classification: neg Confidence %: 57.14285714285714
Classification: pos Confidence %: 57.14285714285714
Classification: pos Confidence %: 85.71428571428571 #########################################

NLTK的探索的更多相关文章

  1. 探索 Python、机器学习和 NLTK 库 开发一个应用程序,使用 Python、NLTK 和机器学习对 RSS 提要进行分类

    挑战:使用机器学习对 RSS 提要进行分类 最近,我接到一项任务,要求为客户创建一个 RSS 提要分类子系统.目标是读取几十个甚至几百个 RSS 提要,将它们的许多文章自动分类到几十个预定义的主题领域 ...

  2. NLTK在自然语言处理

    nltk-data.zip 本文主要是总结最近学习的论文.书籍相关知识,主要是Natural Language Pracessing(自然语言处理,简称NLP)和Python挖掘维基百科Infobox ...

  3. NLTK学习笔记(五):分类和标注词汇

    目录 词性标注器 标注语料库 表示已经标注的标识符:nltk.tag.str2tuple('word/类型') 读取已经标注的语料库 名词.动词.形容词等 尝试找出每个名词类型中最频繁的名词 探索已经 ...

  4. 使用Python中的NLTK和spaCy删除停用词与文本标准化

    概述 了解如何在Python中删除停用词与文本标准化,这些是自然语言处理的基本技术 探索不同的方法来删除停用词,以及讨论文本标准化技术,如词干化(stemming)和词形还原(lemmatizatio ...

  5. 【探索】机器指令翻译成 JavaScript

    前言 前些时候研究脚本混淆时,打算先学一些「程序流程」相关的概念.为了不因太枯燥而放弃,决定想一个有趣的案例,可以边探索边学. 于是想了一个话题:尝试将机器指令 1:1 翻译 成 JavaScript ...

  6. 【探索】利用 canvas 实现数据压缩

    前言 HTTP 支持 GZip 压缩,可节省不少传输资源.但遗憾的是,只有下载才有,上传并不支持.如果上传也能压缩,那就完美了.特别适合大量文本提交的场合,比如博客园,就是很好的例子. 虽然标准不支持 ...

  7. 探索C#之6.0语法糖剖析

    阅读目录: 自动属性默认初始化 自动只读属性默认初始化 表达式为主体的函数 表达式为主体的属性(赋值) 静态类导入 Null条件运算符 字符串格式化 索引初始化 异常过滤器when catch和fin ...

  8. Mysql事务探索及其在Django中的实践(二)

    继上一篇<Mysql事务探索及其在Django中的实践(一)>交代完问题的背景和Mysql事务基础后,这一篇主要想介绍一下事务在Django中的使用以及实际应用给我们带来的效率提升. 首先 ...

  9. Linux学习之探索文件系统

    Linux,一起学习进步-    ls With it, we can see directory contents and determine a variety of important file ...

随机推荐

  1. python用reduce和map把字符串转为数字的方法

    python用reduce和map把字符串转为数字的方法 最近在复习高阶函数的时候,有一道题想了半天解不出来.于是上午搜索资料,看了下别人的解法,发现学习编程,思维真的很重要.下面这篇文章就来给大家介 ...

  2. JDK源码阅读--Object

    在java.lang包下 Object类:是所有类的基类(父类) public final native Class<?> getClass(); 返回这个Object所代表的的运行时类 ...

  3. Elasticsearch template学习

    Elasticsearch template Elasticsearch存在一个关键问题就是索引的设置及字段的属性指定,最常见的问题就是,某个字段我们并不希望ES对其进行分词,但如果使用自动模板创建索 ...

  4. GitHub:如何构建一个股票市场知识图谱?(附代码&链接)

    来源:专知 本文约 600007 董事⻓/董事 高燕 女 60 600007 执⾏董事 刘永政 男 50 600008 董事⻓/董事 ··· ··· ··· ··· ··· 注:建议表头最好用相应的英 ...

  5. Activiti流程定义语言

    1.流程(process) bpmn文件一个流程的根元素.一个流程就代表一个工作流. 2.顺序流(sequenceFlow) 顺序流是连接两个流程节点的连线,代表一个节点的出口.流程执行完一个节点后, ...

  6. 网络爬虫技术Jsoup

    Jsoup介绍:Jsoup 是一个 Java 的开源HTML解析器,可直接解析某个URL地址.HTML文本内容 Jsoup主要有以下功能: 1. 从一个URL,文件或字符串中解析HTML 2. 使用D ...

  7. mysql-connector-java-8.0.12使用时报错

    配置url加 &useSSL=false&serverTimezone=UTC 就可以了

  8. 廖雪峰Java10加密与安全-5签名算法-2DSA签名算法

    DSA DSA:Digital Signature Algorithm,使用EIGamal数字签名算法,和RSA数字签名相比,DSA更快. DSA只能配合SHA使用: SHA1withDSA SHA2 ...

  9. jdk不同版本的垃圾收集器

  10. 架构hive2mysql流程

    1.分析参数 args = new String[5]; args[0]="d:/3-20.sql"; args[1]="-date"; args[2]=&qu ...