NLP之电影评分数据的情感分析

1、基于词袋模型的逻辑回归情感分类

# coding: utf-8

import re

import numpy as np

import pandas as pd

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

import itertools

###########################词袋模型特征############################################

#重组为新的句子

_STOPWORDS_PATH = '../stopwords/stopwords_english.txt'
_stopwords_cache = {}


def _load_stopwords(path=_STOPWORDS_PATH):
    """Return the stop words listed in *path* (one word per line) as a frozenset.

    The file is read only the first time a given path is requested; later
    calls reuse the cached set, so cleaning many reviews does not reopen it.
    """
    cached = _stopwords_cache.get(path)
    if cached is None:
        # `with` guarantees the handle is closed even if reading fails.
        with open(path, encoding='utf-8') as handle:
            cached = frozenset(line.rstrip() for line in handle)
        _stopwords_cache[path] = cached
    return cached


def clean_text(text):
    """Normalise one raw review into a space-separated string of tokens.

    Steps: strip HTML markup, replace every character that is not an ASCII
    letter with a space, lower-case, split on whitespace, drop stop words,
    and join the remaining tokens back into a single string.

    :param text: raw review text, possibly containing HTML tags
    :return: the cleaned review as one string of space-separated words
    """
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    eng_stopwords = _load_stopwords()
    return ' '.join(w for w in words if w not in eng_stopwords)

#混淆矩阵

def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw confusion matrix ``cm`` as an annotated heat map on the current figure.

    :param cm: 2-D array of counts, indexed as ``cm[row, col]``
    :param classes: labels used for the tick marks on both axes
    :param title: title placed above the plot
    :param cmap: matplotlib colour map used for the cells
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=0)
    plt.yticks(positions, classes)

    # Cells darker than half of the maximum get white text so the number
    # stays readable; lighter cells get black text.
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            if cm[row, col] > half_max:
                text_color = "white"
            else:
                text_color = "black"
            plt.text(col, row, cm[row, col],
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

if __name__=='__main__':
    # Load the labelled reviews (tab-separated, backslash-escaped).
    df = pd.read_csv('../data/labeledTrainData.tsv', sep='\t', escapechar='\\')
    print(df.head(5))

    # Clean every entry of the `review` column.
    df['clean_review'] = df.review.apply(clean_text)
    print(df['clean_review'])

    # Split the cleaned text BEFORE extracting features so the vocabulary is
    # learned from the training reviews only; fitting the vectorizer on all
    # rows would let the held-out reviews influence which words are kept.
    text_train, text_test, y_train, y_test = train_test_split(
        df.clean_review, df.sentiment, test_size=0.2, random_state=0)

    # Bag-of-words features. The matrices are kept sparse: a dense
    # (n_reviews x 5000) integer array is mostly zeros and needs far more
    # memory, while LogisticRegression accepts sparse input directly.
    vectorizer = CountVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(text_train)
    X_test = vectorizer.transform(text_test)
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

    # Train the classifier and predict on the held-out split.
    LR_model = LogisticRegression()
    LR_model = LR_model.fit(X_train, y_train)
    y_pred = LR_model.predict(X_test)

    # Rows of the confusion matrix are true labels, columns are predictions.
    cnf_matrix = confusion_matrix(y_test, y_pred)
    print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))
    print("accuracy metric in the testing dataset: ", (cnf_matrix[1, 1] + cnf_matrix[0, 0]) / (
                cnf_matrix[0, 0] + cnf_matrix[1, 1] + cnf_matrix[1, 0] + cnf_matrix[0, 1]))

    # Plot non-normalized confusion matrix
    class_names = [0, 1]
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
    plt.show()

2、基于word2vec词向量模型的逻辑回归情感分类

import re

import numpy as np

import pandas as pd

from bs4 import BeautifulSoup

from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

import nltk

import warnings

from gensim.models.word2vec import Word2Vec

from nltk.corpus import stopwords

import matplotlib.pyplot as plt

import itertools

warnings.filterwarnings("ignore")

_ENG_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    global _ENG_STOPWORDS
    if _ENG_STOPWORDS is None:
        _ENG_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENG_STOPWORDS


def clean_text(text, remove_stopwords=False):
    """Turn raw review text into a list of lower-case word tokens.

    HTML markup is stripped, every character that is not an ASCII letter is
    replaced by a space, and the result is lower-cased and split on
    whitespace.

    :param text: raw text, possibly containing HTML tags
    :param remove_stopwords: when true, drop NLTK English stop words
    :return: list of tokens (may be empty)
    """
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    if remove_stopwords:
        # The stop-word corpus is only touched when it is actually needed,
        # and only read from disk on the first such call.
        eng_stopwords = _english_stopwords()
        words = [w for w in words if w not in eng_stopwords]
    return words

def split_sentences(review):
    """Split *review* into sentences and clean each one into a token list.

    Relies on the module-level ``tokenizer`` (assigned in the ``__main__``
    section) for sentence splitting and on ``clean_text`` for tokenising.

    :param review: the review; it is converted with ``str()`` before splitting
    :return: list of token lists, one per non-empty sentence
    """
    stripped = str(review).strip()
    cleaned = []
    for raw_sentence in tokenizer.tokenize(stripped):
        if raw_sentence:
            cleaned.append(clean_text(raw_sentence))
    return cleaned

def to_review_vector(review):
    """Return the mean word2vec embedding of *review* as a ``pandas.Series``.

    The review is cleaned with stop words removed; every remaining word that
    the module-level ``model`` knows contributes its vector, and the sum is
    divided by the number of contributing words so that the feature does not
    grow with review length. A review with no known words yields all zeros.

    :param review: raw review text
    :return: ``pandas.Series`` with one value per embedding dimension
    """
    # A trained Word2Vec keeps its vectors under `.wv`; if `model` is already
    # a bare vector store, use it as is.
    vectors = getattr(model, 'wv', model)
    words = clean_text(review, remove_stopwords=True)

    total = np.zeros(vectors.vector_size)
    matched = 0
    for word in words:
        if word in vectors:
            total += vectors[word]
            matched += 1

    if matched:
        total /= matched
    return pd.Series(total)

def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Render confusion matrix ``cm`` as a colour-mapped grid with cell counts.

    :param cm: 2-D array of counts, indexed as ``cm[row, col]``
    :param classes: labels shown on the x and y tick marks
    :param title: plot title
    :param cmap: matplotlib colour map for the grid
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=0)
    plt.yticks(tick_positions, classes)

    # Text colour flips to white on cells above half the largest count so
    # the annotation remains legible on dark backgrounds.
    cutoff = cm.max() / 2.
    row_indices = range(cm.shape[0])
    col_indices = range(cm.shape[1])
    for r, c in itertools.product(row_indices, col_indices):
        label_color = "black"
        if cm[r, c] > cutoff:
            label_color = "white"
        plt.text(c, r, cm[r, c],
                 horizontalalignment="center",
                 color=label_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

if __name__ == '__main__':
    # Load the labelled reviews (tab-separated, backslash-escaped).
    df = pd.read_csv('../data/labeledTrainData.tsv', sep='\t', escapechar='\\')

    # Punkt sentence splitter; split_sentences() reads it through the
    # module-level name `tokenizer`, so it must be assigned before use.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # Build the word2vec training corpus from the RAW reviews: sentence
    # boundaries need the original punctuation, and split_sentences()
    # already returns each sentence as a list of clean tokens, which is the
    # input format Word2Vec expects. Sentences that clean down to nothing
    # are skipped. `extend` keeps this linear in the number of sentences.
    sentences_list = []
    for review_sentences in df.review.apply(split_sentences):
        sentences_list.extend(tokens for tokens in review_sentences if tokens)

    # word2vec hyper-parameters
    num_features = 300  # Word vector dimensionality
    min_word_count = 40  # Minimum word count
    num_workers = 4  # Number of threads to run in parallel
    context = 10  # Context window size

    # NOTE(review): `size=` and `init_sims` are the gensim 3.x spellings;
    # confirm the installed gensim version before upgrading.
    model = Word2Vec(sentences_list, workers=num_workers, size=num_features, min_count=min_word_count, window=context)
    model.init_sims(replace=True)
    model.save('word2vec.models')

    # One embedding-based feature row per review.
    train_data_features = df.review.apply(to_review_vector)

    X_train, X_test, y_train, y_test = train_test_split(train_data_features, df.sentiment, test_size=0.2, random_state=0)

    LR_model = LogisticRegression()
    LR_model = LR_model.fit(X_train, y_train)
    y_pred = LR_model.predict(X_test)

    # Rows of the confusion matrix are true labels, columns are predictions.
    cnf_matrix = confusion_matrix(y_test, y_pred)
    print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))
    print("accuracy metric in the testing dataset: ", (cnf_matrix[1, 1] + cnf_matrix[0, 0]) / (
                cnf_matrix[0, 0] + cnf_matrix[1, 1] + cnf_matrix[1, 0] + cnf_matrix[0, 1]))

    # Plot non-normalized confusion matrix
    class_names = [0, 1]
    plt.figure()
    plot_confusion_matrix(cnf_matrix , classes=class_names, title='Confusion matrix')
    plt.show()

NLP之电影评分数据的情感分析的更多相关文章

NLP之中文自然语言处理工具库：SnowNLP(情感分析/分词/自动摘要)
一安装与介绍 1.1 概述 SnowNLP是一个python写的类库,可以方便的处理中文文本内容,是受到了TextBlob的启发而写的,由于现在大部分的自然语言处理库基本都是针对英文的,于是写了一个 ...
情感分析的现代方法（包含word2vec Doc2Vec）
英文原文地址:https://districtdatalabs.silvrback.com/modern-methods-for-sentiment-analysis 转载文章地址:http://da ...
使用Spark MLlib进行情感分析
使用Spark MLlib进行情感分析使用Spark MLlib进行情感分析一.实验说明在当今这个互联网时代,人们对于各种事情的舆论观点都散布在各种社交网络平台或新闻提要 ...
Stanford NLP学习笔记：7. 情感分析（Sentiment）
1. 什么是情感分析(别名:观点提取,主题分析,情感挖掘...) 应用: 1)正面VS负面的影评(影片分类问题) 2)产品/品牌评价: Google产品搜索 3)twitter情感预测股票市场行情/消 ...
【项目实战】Kaggle电影评论情感分析
前言这几天持续摆烂了几天,原因是我自己对于Kaggle电影评论情感分析的这个赛题敲出来的代码无论如何没办法运行,其中数据变换的维度我无法把握好,所以总是在函数中传错数据.今天痛定思痛,重新写了一遍代 ...
Java豆瓣电影爬虫——使用Word2Vec分析电影短评数据
在上篇实现了电影详情和短评数据的抓取.到目前为止,已经抓了2000多部电影电视以及20000多的短评数据. 数据本身没有规律和价值,需要通过分析提炼成知识才有意义.抱着试试玩的想法,准备做一个有关情感 ...
NLP入门（十）使用LSTM进行文本情感分析
情感分析简介文本情感分析(Sentiment Analysis)是自然语言处理(NLP)方法中常见的应用,也是一个有趣的基本任务,尤其是以提炼文本情绪内容为目的的分类.它是对带有情感色彩的主观性 ...
浅谈NLP 文本分类/情感分析任务中的文本预处理工作
目录浅谈NLP 文本分类/情感分析任务中的文本预处理工作前言 NLP相关的文本预处理浅谈NLP 文本分类/情感分析任务中的文本预处理工作前言之所以心血来潮想写这篇博客,是因为最近在关注N ...
爬虫再探实战（五）———爬取APP数据——超级课程表【四】——情感分析
仔细看的话,会发现之前的词频分析并没有什么卵用...文本分析真正的大哥是NLP,不过,这个坑太大,小白不大敢跳...不过还是忍不住在坑边上往下瞅瞅2333. 言归正传,今天刚了解到boson公司有py ...

随机推荐

php连接docker运行的mysql，显示(HY000/2002): Connection refused的解决办法
php要连接docker中运行的mysql是不能用localhost, 127.0.0.1来连接的,因为每个docker运行容器的localhost 127.0.0.1都是自己容器本身,不是mysql ...
Delphi 执行线程对象
一周死磕fastreport ----ASP.NET （三）
做了一周,然而说着很快首先拖一个WebReport 点击design report 设置模板引入dll using引用设置好就打印就可以了未来几天, 然后都在设置样式 ....如何就一周过去 ...
Fiddler抓包HTTPS捕捉旧版App
“现在可以公开的情报:简易操作Fiddler抓包可能” 任何App的更新都限于苹果开发者规定,有时为了上架不得已放弃一些真正实用的功能,比如视频音频的直接下载,脚本的直接导入,手机上IPA的直接安装等 ...
socket 多线程安全、粘包问题
脚本如下: # -*- coding:utf-8 -*- ''' @Author: Stefan @File: server_listener.py @Date: 2016-11-09 If you ...
Windows环境下使用uiautomatorviewer进行元素定位
一.摘要元素定位本篇主要介绍如何使用uiautomatorviewer,通过定位到页面上的元素,然后进行相应的点击等操作,uiautomatorviewer 是 android-sdk 自带的一个元 ...
SQL 归纳
查询父节点的所有子节点: SELECT * FROM menu m START WITH m.ID_ = '402882836068695f0160688eebf70006' CONNECT BY m ...
CSP-S2019 退役记/赛后总结
真就退役了呗. 作为一名非常失败的OIer,开了一个非常失败的blog,一直想在赛后写点什么,做点什么,总结些什么.自csp结束以来,徘徊了半个月,今夜里终于还是起笔了. 因为从来没写过这种玩意,不妨 ...
[MySQL优化] -- 如何查找SQL效率低下的原因
[MySQL优化] -- 如何查找SQL效率低下的原因来源: ChinaUnix博客日期: 2009.07.20 16:12 (共有条评论) 我要评论查询到效率低的 SQL 语句 ...
e.target.value和this的区别
1.e.target.value获取的就是你选择接受事件的元素输入的或者选择的值. 参数e接收事件对象. 而事件对象也有很多属性和方法,其中target属性是获取触发事件对象的目标,也就是绑定事件的元 ...

NLP之电影评分数据的情感分析

NLP之电影评分数据的情感分析的更多相关文章

随机推荐

热门专题