kaggle实战之 bag of words meets bags of popcorn

由于编辑器总是崩溃，我只能直接把代码贴上了。

import numpy

#first step

import pandas as pd

import numpy as np

# Read data from files.
# The three calls below load the competition files.  pd.read_csv() accepts
# many more parameters than are used here; see the pandas documentation.
# header=0 takes the column names from the first row, delimiter="\t" parses
# tab-separated values and quoting=3 (csv.QUOTE_NONE) leaves quote characters
# inside the reviews untouched.

# Hand-labeled training data
train = pd.read_csv("data/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

# Test set
test = pd.read_csv("data/testData.tsv", header=0, delimiter="\t", quoting=3)

# Unlabeled training data.  It is used later only as extra corpus text when
# training word2vec.
unlabeled_train = pd.read_csv("data/unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

# Verify the number of reviews that were read (100,000 in total).
# A single parenthesized argument keeps this valid on both Python 2 and 3.
print("Read %d labeled train reviews, %d labeled test reviews, and %d unlabeled reviews\n"
      % (train["review"].size, test["review"].size, unlabeled_train["review"].size))

# second step

# Import various modules for string cleaning

from bs4 import BeautifulSoup

import re

from nltk.corpus import stopwords

#数据预处理，主要是网页标签，去数字和去停用词

def review_to_wordlist( review, remove_stopwords=False ):
    """Convert one raw review into a list of lowercase words.

    Args:
        review: Raw review text, possibly containing HTML markup.
        remove_stopwords: When true, NLTK's English stop words are dropped
            from the result.  Off by default.

    Returns:
        A list of lowercase word strings in their original order.
    """
    # 1. Remove HTML.  The reviews still contain tags such as <br /><br />;
    #    BeautifulSoup's get_text() returns the text without the markup.
    plain_text = BeautifulSoup(review, "html.parser").get_text()

    # 2. Replace every character that is not an ASCII letter (digits,
    #    punctuation, ...) with a single space.
    letters_only = re.sub("[^a-zA-Z]", " ", plain_text)

    # 3. Lowercase everything and split on whitespace.
    tokens = letters_only.lower().split()

    # 4. Stop-word removal is optional.
    if not remove_stopwords:
        return tokens

    stop_set = set(stopwords.words("english"))
    return [token for token in tokens if token not in stop_set]

# Download the punkt tokenizer for sentence splitting.
# NLTK is a widely used Python toolkit for natural language processing, but
# this step can fail: according to the author the nltk_data download location
# appears to have changed, so nltk_data was fetched by hand and placed in the
# directory where NLTK looks for it.

import nltk.data

# Load the punkt tokenizer

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences

def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    """Split a review into sentences, each returned as a list of words.

    Args:
        review: Raw review text.
        tokenizer: Object whose ``tokenize(text)`` method returns the
            sentences found in ``text`` (the script passes NLTK's punkt
            tokenizer).
        remove_stopwords: Forwarded to ``review_to_wordlist``.

    Returns:
        A list of sentences, where every sentence is itself a list of words
        (a list of lists).
    """
    # 1. Trim surrounding whitespace, then let the tokenizer cut the review
    #    into raw sentence strings.
    pieces = tokenizer.tokenize(review.strip())

    # 2. Clean every non-empty sentence with review_to_wordlist(); empty
    #    sentences are skipped.
    return [
        review_to_wordlist(piece, remove_stopwords)
        for piece in pieces
        if len(piece) > 0
    ]

sentences = []  # Initialize an empty list of sentences

# Fill `sentences` with every sentence of the labeled and unlabeled reviews.
# Each element is one sentence of a review, already cleaned and split into
# words.
#
# review_to_sentences() is called with its default remove_stopwords=False.
# The author's reasoning: word2vec works best on the complete corpus, so the
# stop words are kept here.
#
# Author's background notes: the traditional text representation is the
# bag-of-words (BOW) model, which has two drawbacks: (1) it cannot express
# relations between words ("basketball" and "football" are obviously closer
# to each other than to "drumstick", but BOW does not show that); (2) its
# dimensionality is very high, so mutual information, chi-square tests and
# similar techniques are usually needed to reduce it.  word2vec also maps
# words to vectors, but (1) words with similar meaning end up close together
# (vector distances can be computed) and (2) the dimensionality is low and
# chosen by the user.
#
# The unlabeled reviews can be used as well because word2vec is unsupervised:
# they only serve as training corpus, and unlabeled text is much easier to
# obtain than labeled text.
for message, reviews in (
    ("Parsing sentences from training set", train["review"]),
    ("Parsing sentences from unlabeled set", unlabeled_train["review"]),
):
    print(message)
    for review in reviews:
        # Python 2 reads the reviews as byte strings that must be decoded;
        # on Python 3 they are already text and have no .decode().
        text = review.decode("utf8") if isinstance(review, bytes) else review
        sentences += review_to_sentences(text, tokenizer)

# Import the built-in logging module and configure it so that Word2Vec
# creates nice output messages.  The level used here is logging.INFO.
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Set values for various parameters.  What each one means is described in the
# gensim Word2Vec API documentation.
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words (passed as `sample`)

# Initialize and train the model (this will take some time)
from gensim.models import word2vec

print("Training model...")
model = word2vec.Word2Vec(sentences, workers=num_workers,
                          size=num_features, min_count=min_word_count,
                          window=context, sample=downsampling)

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and
# save the model for later use. You can load it later using Word2Vec.load().
# Training takes a while, so saving lets later runs reuse the result.
model_name = "300features_40minwords_10context"
model.save(model_name)

#"man woman child kitchen"四个单词里面哪个和其他三个差距最大

# print model.doesnt_match("man woman child kitchen".split())

# print model.doesnt_match("france england germany berlin".split())

# print model.doesnt_match("paris berlin london austria".split())

#和"man"最像的单词

# print model.most_similar("man")

# print model.most_similar("queen")

# print model.most_similar("awful")

# ****************************************************************

# Calculate average feature vectors for training and testing sets,

# using the functions we defined above. Notice that we now use stop word

# removal.

def makeFeatureVec(words, model, num_features):
    """Average the word vectors of all in-vocabulary words of one review.

    Args:
        words: Iterable of word strings (one tokenized review).
        model: Trained word2vec model.  Its ``wv.index2word`` vocabulary list
            and its ``wv[word]`` vector lookup are used.
        num_features: Length of the vector that is returned.
            NOTE(review): presumably this must equal the model's vector
            dimensionality -- confirm against the training parameters.

    Returns:
        A 1-D numpy array of length ``num_features`` holding the mean of the
        vectors of every word found in the vocabulary.  When none of the
        words is in the vocabulary, the all-zero vector is returned instead
        of dividing by zero (which would yield NaN values).
    """
    # Running sum of the word vectors.
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0.

    # index2word is a list of the words in the model's vocabulary; convert it
    # to a set so that the membership test below is fast.
    index2word_set = set(model.wv.index2word)

    # Add up the vectors of all words that the model knows.
    for word in words:
        if word in index2word_set:
            nwords += 1.
            # Look the vector up through model.wv, the same attribute used
            # for the vocabulary above.
            featureVec = np.add(featureVec, model.wv[word])

    # Nothing to average: keep the zero vector rather than producing NaN.
    if not nwords:
        return featureVec

    # Divide the sum by the number of words to get the average.
    return np.divide(featureVec, nwords)

#利用word2vec建模的关键就是如何给一个表示一个样本，在这个问题里面也就是如何表示一条评论？

#BOW词袋模型由于其高维度，可以轻松表示，而且还是稀疏的

#我们知道，经过word2vec，每个单词可以用长度为300的向量表示，假设某一条评论有100个单词，也就是100个向量

#我们的处理是将100个向量加起来再除以100，结果是一个300维的向量，也就是每条评论用300维向量表示

#看起来这种方法不是很靠谱，比较简单粗暴，我说说自己的两点理解：1.BOW词袋模型表示一个句子其实也是用的这个方法

#2.这样最起码保证了每个评论可以用相同维度的数据来表示

def getAvgFeatureVecs(reviews, model, num_features):
    """Build the matrix of average feature vectors for a list of reviews.

    Args:
        reviews: Sequence of reviews, each one a list of words.  It must
            support ``len()``.
        model: Trained word2vec model, forwarded to ``makeFeatureVec``.
        num_features: Number of columns of the result, forwarded to
            ``makeFeatureVec``.

    Returns:
        A 2-D float32 numpy array of shape ``(len(reviews), num_features)``
        whose i-th row is the average feature vector of the i-th review.
    """
    total = len(reviews)

    # Preallocate the 2-D result array, for speed.
    reviewFeatureVecs = np.zeros((total, num_features), dtype="float32")

    for counter, review in enumerate(reviews):
        # Print a status message every 1000th review.  A single
        # parenthesized argument prints identically on Python 2 and 3.
        if counter % 1000 == 0:
            print("Review %d of %d" % (counter, total))

        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)

    return reviewFeatureVecs

# Why remove stop words this time?  Author's reasoning: earlier the goal was
# to train word2vec, where a more complete corpus is better.  Here the word
# vectors are used to represent whole reviews, and stop words contribute
# almost nothing to that representation, so they are dropped.
clean_train_reviews = [
    review_to_wordlist(review, remove_stopwords=True)
    for review in train["review"]
]
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

print("Creating average feature vecs for test reviews")
clean_test_reviews = [
    review_to_wordlist(review, remove_stopwords=True)
    for review in test["review"]
]
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

# Quick look at the resulting test matrix: its type, number of rows, first
# row and row length.
print(type(testDataVecs))
print(len(testDataVecs))
print(testDataVecs[0])
print(len(testDataVecs[0]))

# Fit a random forest to the training data, using 100 trees
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

forest = RandomForestClassifier(n_estimators=100)

print("Fitting a random forest to labeled training data...")
forest = forest.fit(trainDataVecs, train["sentiment"])

# Test & extract results
result = forest.predict(testDataVecs)

# Write the test results
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv("Word2Vec_AverageVectors.csv", index=False, quoting=3)

# Second model: a support vector classifier.  fit() must be called on an
# SVC instance; calling it on the class itself raises a TypeError.
model_svc = SVC().fit(trainDataVecs, train["sentiment"])
result = model_svc.predict(testDataVecs)

# Write the SVC predictions to their own file
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv("okcing.csv", index=False, quoting=3)

# Author's observation: SVC does somewhat better than the random forest.

kaggle实战之 bag of words meet bag of poopcorn的更多相关文章

Kaggle实战之一回归问题
0. 前言 1.任务描述 2.数据概览 3. 数据准备 4. 模型训练 5. kaggle实战 0. 前言 "尽管新技术新算法层出不穷,但是掌握好基础算法就能解决手头 90% 的机器学习问题 ...
Kaggle实战之二分类问题
0. 前言 1. MNIST 数据集 2. 二分类器 3. 效果评测 4. 多分类器与误差分析 5. Kaggle 实战 0. 前言 "尽管新技术新算法层出不穷,但是掌握好基础算法就能解决手 ...
机器学习(一)：记一次k一近邻算法的学习与Kaggle实战
本篇博客是基于以Kaggle中手写数字识别实战为目标,以KNN算法学习为驱动导向来进行讲解. 写这篇博客的原因什么是KNN kaggle实战优缺点及其优化方法总结参考文献写这篇博客的原因写 ...
Kaggle实战分类问题2
Kaggle实战之二分类问题 0. 前言 1. MNIST 数据集 2. 二分类器 3. 效果评测 4. 多分类器与误差分析 5. Kaggle 实战 0. 前言 “尽管新技术新算法层出不穷,但是掌握 ...
Python机器学习实践与Kaggle实战（转）
https://mlnote.wordpress.com/2015/12/16/python%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E5%AE%9E%E8%B7%B5 ...
Kaggle实战——点击率预估
https://blog.csdn.net/chengcheng1394/article/details/78940565 原创文章,转载请注明出处: http://blog.csdn.net/che ...
kaggle实战记录 =>Digit Recognizer
date:2016-09-13 今天开始注册了kaggle,从digit recognizer开始学习, 由于是第一个案例对于整个流程目前我还不够了解,首先了解大神是怎么运行怎么构思,然后模仿.这样的 ...
kaggle 实战（1）: PCA + KNN 手写数字识别
文章目录加载package read data PCA 降维探索选择50维度, 拆分数据为训练集,测试机 KNN PCA降维和K值筛选分析k & 维度 vs 精度预测生成提交文件本 ...
基于Colab Pro & Google Drive的Kaggle实战
原文:https://hippocampus-garden.com/kaggle_colab/ 原文标题:How to Kaggle with Colab Pro & Google Drive ...

随机推荐

初识 Kafka Producer 生产者
目录 1.KafkaProducer 概述 2.KafkaProducer 类图 3.KafkaProducer 简单示例温馨提示:整个 Kafka Client 专栏基于 kafka-2.3.0 ...
《带你装B，带你飞》pytest修炼之路1- 简介和环境准备
1. pytest简介 pytest是python的一种单元测试框架,与python自带的unittest测试框架类似,但是比unittest框架使用起来更简洁,效率更高.根据pytest的官方网站介 ...
$loj\ 6045$ [雅礼集训 $2017\ Day8$] 价网络流
正解:网络流解题报告: 传送门$QwQ$ 这题还,挺有趣的我$jio$得. 考虑依然先是照着最小割的模子建图呗,然后从意义上来分析,割一条边就相当于不吃一种减肥药/买一种药材.由已知得,买的药材数量 ...
如何根据HttpServletRequets获取用户真实IP地址
最近的一个项目的某个功能获取用户的ip地址,添加用户的系统使用记录. 我发现当我直接使用getRemoteAddr()方法从HttpServletRequet中获取用户的ip时,获取到的是服务器的ip ...
sql函数实用——字符函数（sqlserver与mysql对比）
1.获取长度 sqlserver写法:关键字:len() 获取参数的字符数量 select Len('aksjdhh') 输出结果 7 select len('张无忌ooo') 输出 ...
Java之Object类用法总结
Object类概述: 1.Object类是所有Java类的根父类. 2.如果在类的声明中未使用extends关键字指明其父类, 则默认父类为java.lang.Object类. Object类主要结构 ...
js如何下载后台传过来的base64文件
一.<a>标签的作用相信大部分人都知道<a>链接再简单不过了,跳转嘛,跳转到另外一个页面,这谁不知道. 当然这这是一部分, <a> 标签定义超链接,用于从一个页面 ...
【一头扎进Spring】 01 | 从 HelloWorld 开始看Spring
Spring 是一个开源框架. Spring 为简化企业级应用开发而生. 使用 Spring 可以使简单的 JavaBean 实现以前只有 EJB 才能实现的功能. Spring 是一个 IOC(DI ...
七彩线段 - 装压dp (牛客网)
题目描述听说彩虹有七种颜色?一维坐标轴上n条线段,每条线段左端点l,右端点r,颜色为c,从中选m种颜色的互不接触的线段,每种颜色可选多条,所选线段的总长度最长为多少?输入描述: 第一行2个整数 n, ...
Unable to open debugger port (127.0.0.1:57046): java.net.SocketException "so
原因分析: 出现这个报错的原因是因为端口被占用导致的. 解决方法: 解决方法主要两种:修改端口配置(推荐).关闭占用端口的进程(不推荐). 方式一:修改端口配置(推荐) 被占用的端口可能是本地端口,也 ...

kaggle实战之 bag of words meet bag of poopcorn

kaggle实战之 bag of words meet bag of poopcorn的更多相关文章

随机推荐

热门专题