TensorFlow练习2: 对评论进行分类
本帖是前一贴的补充:
- 使用大数据,了解怎么处理数据不能一次全部加载到内存的情况。如果你内存充足,当我没说
- 训练好的模型的保存和使用
- 使用的模型没变,还是简单的feedforward神经网络(update:添加CNN模型)
- 如果你要运行本帖代码,推荐使用GPU版本或强大的VPS,我使用小笔记本差点等吐血
- 后续有关于中文的练习《TensorFlow练习13: 制作一个简单的聊天机器人》《TensorFlow练习7: 基于RNN生成古诗词》《TensorFlow练习18: 根据姓名判断性别》
在正文开始之前,我画了一个机器学习模型的基本开发流程图:

使用的数据集
使用的数据集:http://help.sentiment140.com/for-students/ (情绪分析)
数据集包含160万条推特,分为消极、中性和积极三类tweet。不知道有没有现成的微博数据集。
数据格式:移除表情符号的CSV文件,字段如下:
- 0 – the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
- 1 – the id of the tweet (2087)
- 2 – the date of the tweet (Sat May 16 23:58:44 UTC 2009)
- 3 – the query (lyx). If there is no query, then this value is NO_QUERY.
- 4 – the user that tweeted (robotickilldozr)
- 5 – the text of the tweet (Lyx is cool)
training.1600000.processed.noemoticon.csv(238M)
testdata.manual.2009.06.14.csv(74K)
数据预处理
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
|
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pickle
import numpy as np
import pandas as pd
from collections import OrderedDict
# Raw Sentiment140 CSV files: the 1.6M-tweet training set and the small
# manually labelled test set.
org_train_file = 'training.1600000.processed.noemoticon.csv'
org_test_file = 'testdata.manual.2009.06.14.csv'
# 提取文件中有用的字段
def usefull_filed(org_file, output_file):
    """Extract the label and the tweet text from a raw Sentiment140 CSV file.

    An input row looks like
    ``"4","2193601966","Tue Jun 16 08:40:49 PDT 2009","NO_QUERY","user","text"``.
    For every row with a known polarity one line is written to *output_file*
    in the form ``[1, 0, 0]:%:%:%:text``.

    Args:
        org_file: Path of the raw CSV file (read as latin-1).
        output_file: Path of the file to create or overwrite (written as
            latin-1, the encoding the later steps use to read it back).
    """
    import csv  # local import: csv is not part of this script's import header

    # Polarity column -> one-hot label laid out as [positive, neutral, negative].
    one_hot = {
        '0': [0, 0, 1],  # negative
        '2': [0, 1, 0],  # neutral
        '4': [1, 0, 0],  # positive
    }
    with open(org_file, encoding='latin-1', newline='') as src:
        with open(output_file, 'w', encoding='latin-1') as output:
            for row in csv.reader(src):
                if len(row) < 2 or row[0] not in one_hot:
                    # Blank/malformed row or unknown polarity: writing it
                    # would produce a label the readers cannot parse.
                    continue
                # csv keeps commas that are inside the quoted tweet, so the
                # full text survives (splitting on ',' kept only the tail).
                # Collapse embedded line breaks to keep one record per line.
                tweet = ' '.join(row[-1].splitlines())
                output.write(str(one_hot[row[0]]) + ':%:%:%:' + tweet + '\n')
# Convert both raw files into the "label:%:%:%:tweet" format.
# NOTE: 'tesing.csv' (sic) is the exact name the later scripts open.
usefull_filed(org_train_file, 'training.csv')
usefull_filed(org_test_file, 'tesing.csv')
# 创建词汇表
def create_lexicon(train_file):
    """Build the vocabulary from the preprocessed training file.

    Each line is expected to look like ``label:%:%:%:tweet``. The tweet is
    lower-cased, tokenized and lemmatized, and lemma occurrences are counted.

    Args:
        train_file: Path of the preprocessed training file (read as latin-1).

    Returns:
        List of lemmas seen more than 100 and fewer than 100000 times,
        ordered by ascending count.

    Errors while reading or tokenizing propagate to the caller rather than
    being printed and turned into an empty lexicon.
    """
    from collections import Counter  # local import: only OrderedDict is imported at file level

    lemmatizer = WordNetLemmatizer()
    count_word = Counter()  # lemma -> number of occurrences
    with open(train_file, buffering=10000, encoding='latin-1') as f:
        for line in f:
            parts = line.split(':%:%:%:', 1)
            if len(parts) != 2:
                continue  # blank or malformed line
            # Tokenize the tweet only; tokenizing the whole line would also
            # count the tokens of the label prefix.
            for word in word_tokenize(parts[1].lower()):
                count_word[lemmatizer.lemmatize(word)] += 1
    # Stable sort: ascending count, ties stay in first-seen order.
    by_count = sorted(count_word.items(), key=lambda t: t[1])
    # Drop very frequent and very rare lemmas.
    return [word for word, count in by_count if 100 < count < 100000]
# Build the vocabulary once and cache it on disk.
# NOTE: 'lexcion.pickle' (sic) is the exact name the later scripts load.
lex = create_lexicon('training.csv')
with open('lexcion.pickle', 'wb') as lexicon_file:
    pickle.dump(lex, lexicon_file)
"""
# 把字符串转为向量
def string_to_vector(input_file, output_file, lex):
output_f = open(output_file, 'w')
lemmatizer = WordNetLemmatizer()
with open(input_file, buffering=10000, encoding='latin-1') as f:
for line in f:
label = line.split(':%:%:%:')[0]
tweet = line.split(':%:%:%:')[1]
words = word_tokenize(tweet.lower())
words = [lemmatizer.lemmatize(word) for word in words]
features = np.zeros(len(lex))
for word in words:
if word in lex:
features[lex.index(word)] = 1 # 一个句子中某个词可能出现两次,可以用+=1,其实区别不大
features = list(features)
output_f.write(str(label) + ":" + str(features) + '\n')
output_f.close()
f = open('lexcion.pickle', 'rb')
lex = pickle.load(f)
f.close()
# lexcion词汇表大小112k,training.vec大约112k*1600000 170G 太大,只能边转边训练了
# string_to_vector('training.csv', 'training.vec', lex)
# string_to_vector('tesing.csv', 'tesing.vec', lex)
"""
|
上面代码把原始数据转为training.csv和tesing.csv,里面只包含label和tweet。lexcion.pickle文件保存了词汇表。
如果数据文件太大,不能一次加载到内存,可以把数据导入数据库
Dask可处理大csv文件
开始漫长的训练
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
|
import os
import random
import tensorflow as tf
import pickle
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Load the vocabulary written by the preprocessing script. The context
# manager closes the file even if unpickling fails.
# NOTE: unpickling can execute arbitrary code; only load a file you created.
with open('lexcion.pickle', 'rb') as f:
    lex = pickle.load(f)
def get_random_line(file, point):
    """Return the first complete line that starts after offset *point*.

    Args:
        file: An open, seekable text file object.
        point: Offset to seek to before reading.

    Returns:
        The next full line, including its line terminator. If *point* falls
        inside or after the last line, reading wraps around and the first
        line of the file is returned. An empty string is returned only for
        an empty file.
    """
    file.seek(point)
    file.readline()  # discard the (usually partial) line that point landed in
    line = file.readline()
    if not line:
        # Reached end of file: wrap around so callers never receive '' from a
        # non-empty file, and so the first line can be selected at all.
        file.seek(0)
        line = file.readline()
    return line
# 从文件中随机选择n条记录
def get_n_random_line(file_name, n=150):
    """Pick *n* lines from *file_name* at random offsets (with replacement).

    Args:
        file_name: Path of the text file to sample from (read as latin-1).
        n: Number of lines to return.

    Returns:
        List of *n* strings as returned by ``get_random_line``.
    """
    total_bytes = os.stat(file_name).st_size
    # The context manager closes the file even if a read fails.
    # NOTE(review): seeking a text file to an arbitrary byte offset relies on
    # latin-1 being a one-byte-per-character encoding - confirm before
    # changing the encoding.
    with open(file_name, encoding='latin-1') as file:
        return [
            get_random_line(file, random.randint(0, total_bytes))
            for _ in range(n)
        ]
def get_test_dataset(test_file):
    """Load a preprocessed file as bag-of-words vectors and one-hot labels.

    Each line is expected to look like ``[0, 0, 1]:%:%:%:tweet text``.
    Lines without the separator are skipped.

    Args:
        test_file: Path of the preprocessed file (read as latin-1).

    Returns:
        Tuple ``(test_x, test_y)``: ``test_x`` is a list of 0/1 feature
        lists of length ``len(lex)``; ``test_y`` is the list of parsed labels.
    """
    import ast  # local import: ast is not part of this script's import header

    lemmatizer = WordNetLemmatizer()
    # word -> column, built once. setdefault keeps the first position of a
    # word, which is what lex.index(word) returned.
    word_index = {}
    for position, word in enumerate(lex):
        word_index.setdefault(word, position)

    test_x = []
    test_y = []
    with open(test_file, encoding='latin-1') as f:
        for line in f:
            label, separator, tweet = line.partition(':%:%:%:')
            if not separator:
                continue  # blank or malformed line
            features = np.zeros(len(lex))
            for word in word_tokenize(tweet.lower()):
                column = word_index.get(lemmatizer.lemmatize(word))
                if column is not None:
                    features[column] = 1
            test_x.append(list(features))
            # literal_eval only parses Python literals, so a tampered file
            # cannot execute code the way eval() would.
            test_y.append(ast.literal_eval(label))
    return test_x, test_y
# Vectorise the whole test set up front; the training loop reuses it for
# every accuracy check.
test_x, test_y = get_test_dataset('tesing.csv')
#######################################################################
n_input_layer = len(lex) # input layer: one unit per vocabulary word
n_layer_1 = 2000 # first hidden layer
n_layer_2 = 2000 # second hidden layer (simply a layer between input and output)
n_output_layer = 3 # output layer: one unit per class
def neural_network(data):
    """Build the feed-forward graph: two ReLU hidden layers, linear output.

    Args:
        data: Input tensor; it is matrix-multiplied with a weight matrix of
            shape [n_input_layer, n_layer_1].

    Returns:
        The output-layer tensor (w.x + b of the last layer, no softmax).
    """
    # Variables are created in the original order (w1, b1, w2, b2, w_out,
    # b_out); their auto-generated names follow creation order.
    hidden_1_w = tf.Variable(tf.random_normal([n_input_layer, n_layer_1]))
    hidden_1_b = tf.Variable(tf.random_normal([n_layer_1]))
    hidden_2_w = tf.Variable(tf.random_normal([n_layer_1, n_layer_2]))
    hidden_2_b = tf.Variable(tf.random_normal([n_layer_2]))
    out_w = tf.Variable(tf.random_normal([n_layer_2, n_output_layer]))
    out_b = tf.Variable(tf.random_normal([n_output_layer]))

    # Each hidden layer computes relu(w.x + b).
    hidden_1 = tf.nn.relu(tf.add(tf.matmul(data, hidden_1_w), hidden_1_b))
    hidden_2 = tf.nn.relu(tf.add(tf.matmul(hidden_1, hidden_2_w), hidden_2_b))
    return tf.add(tf.matmul(hidden_2, out_w), out_b)
# Placeholders fed with a batch of feature vectors (X) and the matching
# one-hot labels (Y).
X = tf.placeholder('float')
Y = tf.placeholder('float')
# Number of random training lines drawn per step.
batch_size = 90
def train_neural_network(X, Y):
    """Train the feed-forward classifier on random mini-batches, forever.

    Every step draws ``batch_size`` random lines from training.csv, turns
    them into bag-of-words vectors and runs one Adam update. About every 100
    steps the accuracy on the in-memory test set is computed; whenever it
    beats the best value so far the session is saved to model.ckpt.
    The loop has no exit condition; stop the process manually.

    Args:
        X: Placeholder fed with the feature vectors.
        Y: Placeholder fed with the one-hot labels.
    """
    import ast  # local import: ast is not part of this script's import header

    predict = neural_network(X)
    # Keyword arguments: old releases took (logits, labels) positionally,
    # TensorFlow >= 1.0 rejects positional use of this function.
    cost_func = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=predict, labels=Y))
    optimizer = tf.train.AdamOptimizer().minimize(cost_func)
    # Build the accuracy ops once; creating them inside the loop added new
    # nodes to the graph at every evaluation.
    correct = tf.equal(tf.argmax(predict, 1), tf.argmax(Y, 1))
    accuracy_op = tf.reduce_mean(tf.cast(correct, 'float'))

    # word -> column, built once (first position wins, like lex.index).
    word_index = {}
    for position, word in enumerate(lex):
        word_index.setdefault(word, position)

    with tf.Session() as session:
        session.run(tf.initialize_all_variables())
        lemmatizer = WordNetLemmatizer()
        saver = tf.train.Saver()
        i = 0
        pre_accuracy = 0
        while True:  # train until interrupted
            batch_x = []
            batch_y = []
            # To resume from a checkpoint: if model.ckpt already exists,
            # call saver.restore(session, 'model.ckpt') before training.
            try:
                lines = get_n_random_line('training.csv', batch_size)
                for line in lines:
                    label, separator, tweet = line.partition(':%:%:%:')
                    if not separator:
                        # Skip an empty/malformed line rather than losing
                        # the whole batch to an IndexError.
                        continue
                    features = np.zeros(len(lex))
                    for word in word_tokenize(tweet.lower()):
                        column = word_index.get(lemmatizer.lemmatize(word))
                        if column is not None:
                            # Presence flag; a count (+= 1) would make
                            # little difference.
                            features[column] = 1
                    batch_x.append(list(features))
                    # literal_eval parses the label without executing code.
                    batch_y.append(ast.literal_eval(label))
                if batch_x:
                    session.run([optimizer, cost_func],
                                feed_dict={X: batch_x, Y: batch_y})
            except Exception as e:
                # Best effort: report the failed batch and keep training.
                print(e)
            # Accuracy check
            if i > 100:
                accuracy = session.run(accuracy_op,
                                       feed_dict={X: test_x, Y: test_y})
                if accuracy > pre_accuracy:  # keep only the best model
                    print('准确率: ', accuracy)
                    pre_accuracy = accuracy
                    saver.save(session, 'model.ckpt')
                i = 0
            i += 1
# Start training; the loop inside has no exit condition.
train_neural_network(X,Y)
|
上面程序占用内存600M,峰值1G。
运行:

训练模型保存为model.ckpt。
使用训练好的模型
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
|
import tensorflow as tf
import pickle
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
# Load the vocabulary written by the preprocessing script. The context
# manager closes the file even if unpickling fails.
# NOTE: unpickling can execute arbitrary code; only load a file you created.
with open('lexcion.pickle', 'rb') as f:
    lex = pickle.load(f)
# Layer sizes; these have to match the network that produced model.ckpt.
n_input_layer = len(lex) # input layer: one unit per vocabulary word
n_layer_1 = 2000 # first hidden layer
n_layer_2 = 2000 # second hidden layer (simply a layer between input and output)
n_output_layer = 3 # output layer: one unit per class
def neural_network(data):
    """Build the feed-forward graph: two ReLU hidden layers, linear output.

    Args:
        data: Input tensor; it is matrix-multiplied with a weight matrix of
            shape [n_input_layer, n_layer_1].

    Returns:
        The output-layer tensor (w.x + b of the last layer, no softmax).
    """
    # Variables are created in the original order (w1, b1, w2, b2, w_out,
    # b_out); their auto-generated names follow creation order, and the
    # checkpoint is restored by variable name.
    hidden_1_w = tf.Variable(tf.random_normal([n_input_layer, n_layer_1]))
    hidden_1_b = tf.Variable(tf.random_normal([n_layer_1]))
    hidden_2_w = tf.Variable(tf.random_normal([n_layer_1, n_layer_2]))
    hidden_2_b = tf.Variable(tf.random_normal([n_layer_2]))
    out_w = tf.Variable(tf.random_normal([n_layer_2, n_output_layer]))
    out_b = tf.Variable(tf.random_normal([n_output_layer]))

    # Each hidden layer computes relu(w.x + b).
    hidden_1 = tf.nn.relu(tf.add(tf.matmul(data, hidden_1_w), hidden_1_b))
    hidden_2 = tf.nn.relu(tf.add(tf.matmul(hidden_1, hidden_2_w), hidden_2_b))
    return tf.add(tf.matmul(hidden_2, out_w), out_b)
# Placeholder fed with the feature vector(s) to classify.
X = tf.placeholder('float')
def prediction(tweet_text):
    """Classify one tweet with the model stored in model.ckpt.

    Args:
        tweet_text: Raw tweet text.

    Returns:
        One-element integer array holding the index of the highest-scoring
        output unit. With the label layout written by the preprocessing
        script this means 0 = positive, 1 = neutral, 2 = negative.
    """
    # NOTE(review): every call builds a fresh set of variables; a second call
    # in the same process presumably gets different auto-generated variable
    # names and would fail to restore - confirm before calling repeatedly.
    predict = neural_network(X)

    # word -> column, built once (first position wins, like lex.index).
    word_index = {}
    for position, word in enumerate(lex):
        word_index.setdefault(word, position)

    lemmatizer = WordNetLemmatizer()
    features = np.zeros(len(lex))
    for word in word_tokenize(tweet_text.lower()):
        column = word_index.get(lemmatizer.lemmatize(word))
        if column is not None:
            features[column] = 1

    with tf.Session() as session:
        session.run(tf.initialize_all_variables())
        saver = tf.train.Saver()
        saver.restore(session, 'model.ckpt')
        # Fetched value has the form [[val1, val2, val3]].
        logits = session.run(predict, feed_dict={X: [features]})
    # Take the argmax on the fetched array; wrapping it in tf.argmax added a
    # new op to the graph on every call.
    return np.argmax(logits, 1)
# Example call; the returned class index is not printed or stored.
prediction("I am very happe")
|
上面使用的是简单的feedforward模型,下面使用CNN模型
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
|
# https://github.com/Lab41/sunny-side-up
import os
import random
import tensorflow as tf
import pickle
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Load the vocabulary written by the preprocessing script. The context
# manager closes the file even if unpickling fails.
# NOTE: unpickling can execute arbitrary code; only load a file you created.
with open('lexcion.pickle', 'rb') as f:
    lex = pickle.load(f)
def get_random_line(file, point):
    """Return the first complete line that starts after offset *point*.

    Args:
        file: An open, seekable text file object.
        point: Offset to seek to before reading.

    Returns:
        The next full line, including its line terminator. If *point* falls
        inside or after the last line, reading wraps around and the first
        line of the file is returned. An empty string is returned only for
        an empty file.
    """
    file.seek(point)
    file.readline()  # discard the (usually partial) line that point landed in
    line = file.readline()
    if not line:
        # Reached end of file: wrap around so callers never receive '' from a
        # non-empty file, and so the first line can be selected at all.
        file.seek(0)
        line = file.readline()
    return line
# 从文件中随机选择n条记录
def get_n_random_line(file_name, n=150):
    """Pick *n* lines from *file_name* at random offsets (with replacement).

    Args:
        file_name: Path of the text file to sample from (read as latin-1).
        n: Number of lines to return.

    Returns:
        List of *n* strings as returned by ``get_random_line``.
    """
    total_bytes = os.stat(file_name).st_size
    # The context manager closes the file even if a read fails.
    # NOTE(review): seeking a text file to an arbitrary byte offset relies on
    # latin-1 being a one-byte-per-character encoding - confirm before
    # changing the encoding.
    with open(file_name, encoding='latin-1') as file:
        return [
            get_random_line(file, random.randint(0, total_bytes))
            for _ in range(n)
        ]
def get_test_dataset(test_file):
    """Load a preprocessed file as bag-of-words vectors and one-hot labels.

    Each line is expected to look like ``[0, 0, 1]:%:%:%:tweet text``.
    Lines without the separator are skipped.

    Args:
        test_file: Path of the preprocessed file (read as latin-1).

    Returns:
        Tuple ``(test_x, test_y)``: ``test_x`` is a list of 0/1 feature
        lists of length ``len(lex)``; ``test_y`` is the list of parsed labels.
    """
    import ast  # local import: ast is not part of this script's import header

    lemmatizer = WordNetLemmatizer()
    # word -> column, built once. setdefault keeps the first position of a
    # word, which is what lex.index(word) returned.
    word_index = {}
    for position, word in enumerate(lex):
        word_index.setdefault(word, position)

    test_x = []
    test_y = []
    with open(test_file, encoding='latin-1') as f:
        for line in f:
            label, separator, tweet = line.partition(':%:%:%:')
            if not separator:
                continue  # blank or malformed line
            features = np.zeros(len(lex))
            for word in word_tokenize(tweet.lower()):
                column = word_index.get(lemmatizer.lemmatize(word))
                if column is not None:
                    features[column] = 1
            test_x.append(list(features))
            # literal_eval only parses Python literals, so a tampered file
            # cannot execute code the way eval() would.
            test_y.append(ast.literal_eval(label))
    return test_x, test_y
# Vectorise the whole test set up front; the training loop evaluates on its
# first 50 entries.
test_x, test_y = get_test_dataset('tesing.csv')
##############################################################################
# One input position per vocabulary word, three output classes.
input_size = len(lex)
num_classes = 3
# X is fed with the 0/1 feature vectors and used as embedding indices;
# Y is fed with the one-hot labels.
X = tf.placeholder(tf.int32, [None, input_size])
Y = tf.placeholder(tf.float32, [None, num_classes])
# Fed with 0.5 for training steps and 1.0 for evaluation.
dropout_keep_prob = tf.placeholder(tf.float32)
# Number of random training lines drawn per step.
batch_size = 90
def neural_network():
    """Assemble the CNN graph: embedding, conv + max-pool branches, dropout, output.

    Reads the module-level ``X``, ``dropout_keep_prob``, ``input_size`` and
    ``num_classes``.

    Returns:
        Tensor of unnormalised class scores produced by the output layer.
    """
    embedding_size = 128
    num_filters = 128
    filter_sizes = [3, 4, 5]

    # Embedding layer, pinned to the CPU.
    with tf.device('/cpu:0'), tf.name_scope("embedding"):
        embedding_table = tf.Variable(
            tf.random_uniform([input_size, embedding_size], -1.0, 1.0))
        embedded = tf.nn.embedding_lookup(embedding_table, X)
        # conv2d needs a trailing channel axis.
        embedded_expanded = tf.expand_dims(embedded, -1)

    # One convolution + max-pool branch per filter size.
    pooled_outputs = []
    for filter_size in filter_sizes:
        with tf.name_scope("conv-maxpool-%s" % filter_size):
            kernel = tf.Variable(tf.truncated_normal(
                [filter_size, embedding_size, 1, num_filters], stddev=0.1))
            bias = tf.Variable(tf.constant(0.1, shape=[num_filters]))
            conv = tf.nn.conv2d(embedded_expanded, kernel,
                                strides=[1, 1, 1, 1], padding="VALID")
            activated = tf.nn.relu(tf.nn.bias_add(conv, bias))
            # Pool over every position the filter produced.
            pool_height = input_size - filter_size + 1
            pooled_outputs.append(tf.nn.max_pool(
                activated, ksize=[1, pool_height, 1, 1],
                strides=[1, 1, 1, 1], padding='VALID'))

    num_filters_total = num_filters * len(filter_sizes)
    # NOTE(review): (axis, values) is the pre-1.0 argument order of
    # tf.concat - confirm the TensorFlow version before changing it.
    h_pool = tf.concat(3, pooled_outputs)
    h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

    # Dropout
    with tf.name_scope("dropout"):
        dropped = tf.nn.dropout(h_pool_flat, dropout_keep_prob)

    # Output layer
    with tf.name_scope("output"):
        out_w = tf.get_variable(
            "W", shape=[num_filters_total, num_classes],
            initializer=tf.contrib.layers.xavier_initializer())
        out_b = tf.Variable(tf.constant(0.1, shape=[num_classes]))
        return tf.nn.xw_plus_b(dropped, out_w, out_b)
def train_neural_network():
    """Train the CNN classifier on random mini-batches, forever.

    Every step draws ``batch_size`` random lines from training.csv, turns
    them into 0/1 feature vectors, runs one Adam update with dropout 0.5 and
    prints the loss. Every 10th step the accuracy on the first 50 test
    samples is printed. The loop has no exit condition and nothing is saved;
    stop the process manually.
    """
    import ast  # local import: ast is not part of this script's import header

    output = neural_network()
    optimizer = tf.train.AdamOptimizer(1e-3)
    # Keyword arguments: old releases took (logits, labels) positionally,
    # TensorFlow >= 1.0 rejects positional use of this function.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=output, labels=Y))
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars)
    # Build the accuracy ops once; creating them inside the loop added new
    # nodes to the graph at every evaluation.
    predictions = tf.argmax(output, 1)
    correct_predictions = tf.equal(predictions, tf.argmax(Y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"))
    # Only needed for the checkpoint restore sketched in the loop below;
    # this function never calls saver.save.
    saver = tf.train.Saver(tf.global_variables())

    # word -> column, built once (first position wins, like lex.index).
    word_index = {}
    for position, word in enumerate(lex):
        word_index.setdefault(word, position)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        lemmatizer = WordNetLemmatizer()
        i = 0
        while True:  # train until interrupted
            batch_x = []
            batch_y = []
            # To resume from a checkpoint: if model.ckpt already exists,
            # call saver.restore(sess, 'model.ckpt') before training.
            try:
                lines = get_n_random_line('training.csv', batch_size)
                for line in lines:
                    label, separator, tweet = line.partition(':%:%:%:')
                    if not separator:
                        # Skip an empty/malformed line rather than losing
                        # the whole batch to an IndexError.
                        continue
                    features = np.zeros(len(lex))
                    for word in word_tokenize(tweet.lower()):
                        column = word_index.get(lemmatizer.lemmatize(word))
                        if column is not None:
                            # Presence flag; a count (+= 1) would make
                            # little difference.
                            features[column] = 1
                    batch_x.append(list(features))
                    # literal_eval parses the label without executing code.
                    batch_y.append(ast.literal_eval(label))
                if batch_x:
                    _, loss_ = sess.run(
                        [train_op, loss],
                        feed_dict={X: batch_x, Y: batch_y,
                                   dropout_keep_prob: 0.5})
                    print(loss_)
            except Exception as e:
                # Best effort: report the failed batch and keep training.
                print(e)
            if i % 10 == 0:
                # Evaluate without dropout on a small slice of the test set.
                accur = sess.run(
                    accuracy,
                    feed_dict={X: test_x[0:50], Y: test_y[0:50],
                               dropout_keep_prob: 1.0})
                print('准确率:', accur)
            i += 1
# Start training; the loop inside has no exit condition.
train_neural_network()
|
使用了CNN模型之后,准确率有了显著提升。
http://blog.topspeedsnail.com/archives/10420
TensorFlow练习2: 对评论进行分类的更多相关文章
- ML.NET 示例:深度学习之集成TensorFlow
写在前面 准备近期将微软的machinelearning-samples翻译成中文,水平有限,如有错漏,请大家多多指正. 如果有朋友对此感兴趣,可以加入我:https://github.com/fei ...
- 深度学习原理与框架-Tensorflow卷积神经网络-cifar10图片分类(代码) 1.tf.nn.lrn(局部响应归一化操作) 2.random.sample(在列表中随机选值) 3.tf.one_hot(对标签进行one_hot编码)
1.tf.nn.lrn(pool_h1, 4, bias=1.0, alpha=0.001/9.0, beta=0.75) # 局部响应归一化,使用相同位置的前后的filter进行响应归一化操作 参数 ...
- kaggle赛题Digit Recognizer:利用TensorFlow搭建神经网络(附上K邻近算法模型预测)
一.前言 kaggle上有传统的手写数字识别mnist的赛题,通过分类算法,将图片数据进行识别.mnist数据集里面,包含了42000张手写数字0到9的图片,每张图片为28*28=784的像素,所以整 ...
- 一文学会最常见的10种NLP处理技术
一文学会最常见的10种NLP处理技术(附资源&代码) 技术小能手 2017-11-21 11:08:29 浏览2562 评论0 算法 HTTPS 序列 自然语言处理 神经网络 摘要: 自然 ...
- 【读书笔记与思考】《python数据分析与挖掘实战》-张良均
[读书笔记与思考]<python数据分析与挖掘实战>-张良均 最近看一些机器学习相关书籍,主要是为了拓宽视野.在阅读这本书前最吸引我的地方是实战篇,我通读全书后给我印象最深的还是实战篇.基 ...
- Deep-Learning-with-Python] 文本序列中的深度学习
https://blog.csdn.net/LSG_Down/article/details/81327072 将文本数据处理成有用的数据表示 循环神经网络 使用1D卷积处理序列数据 深度学习模型可以 ...
- tensorflow 教程 文本分类 IMDB电影评论
昨天配置了tensorflow的gpu版本,今天开始简单的使用一下 主要是看了一下tensorflow的tutorial 里面的 IMDB 电影评论二分类这个教程 教程里面主要包括了一下几个内容:下载 ...
- 使用RNN对文本进行分类实践电影评论
本教程在IMDB大型影评数据集 上训练一个循环神经网络进行情感分类. from __future__ import absolute_import, division, print_function, ...
- tensorflow对鸢尾花进行分类——人工智能入门篇
tensorflow之对鸢尾花进行分类 任务目标 对鸢尾花数据集分析 建立鸢尾花的模型 利用模型预测鸢尾花的类别 环境搭建 pycharm编辑器搭建python3.* 第三方库 tensorflow1 ...
随机推荐
- Android性能优化典例(二)
1.使用 Maven 依赖方案代替使用导入jar包方案 如果项目中需要用到第三方jar包,常用的做法是去网上下载后然后放入libs文件夹,再添加到项目依赖,不过,在Android Studio已经不推 ...
- ReentrantReadWriteLock读写锁的使用1
本文可作为传智播客<张孝祥-Java多线程与并发库高级应用>的学习笔记. 一个简单的例子 两个线程,一个不断打印a,一个不断打印b public class LockTest { publ ...
- 在 Vim 中设置 Tab 为4个空格
缩进用 tab 制表符还是空格,这不是个问题,就像 python 用四个空格来缩进一样,这是要看个人喜好的.在 Vim 中可以很方便的根据不同的文件类型来设置使用 tab 制表符或者空格,还可以设置长 ...
- Aandroid 图片加载库Glide 实战(一),初始,加载进阶到实践
原文: http://blog.csdn.net/sk719887916/article/details/39989293 skay 初识Glide 为何使用 Glide? 有经验的 Android ...
- Oracle创建视图view权限不足问题剖析
问题: 使用USER1等其他用户登录Oracle以后,创建视图,提示"权限不够",怎么解决? 这是因为USER1这个帐户目前没有创建视图的权限. 解决方法为: 首先使用system ...
- Spring Boot缓存应用实践
缓存是最直接有效提升系统性能的手段之一.个人认为用好用对缓存是优秀程序员的必备基本素质. 本文结合实际开发经验,从简单概念原理和代码入手,一步一步搭建一个简单的二级缓存系统. 一.通用缓存接口 1.缓 ...
- Spring Kafka和Spring Boot整合实现消息发送与消费简单案例
本文主要分享下Spring Boot和Spring Kafka如何配置整合,实现发送和接收来自Spring Kafka的消息. 先前我已经分享了Kafka的基本介绍与集群环境搭建方法.关于Kafka的 ...
- MQ队列管理器搭建(二)
MQ级联方式使用场景 使用场景: 如上图所示,Application1与Application2要进行通信或者消息互换,使用MQ中间件作为中介.上图中,Application1与Applica ...
- 解析Json字符串的三种方法
在很多时候,我们的需要将类似 json 格式的字符串数据转为json, 下面将介绍日常中使用的三种解析json字符串的方法 1.首先,我们先看一下什么是 json 格式字符串数据,很简单,就是 jso ...
- 修改访问的后缀contant
设置Struts 2处理的请求后缀及Action调用 1.在struts2中默认处理的请求后缀为action,我们可以修改struts.xml 和struts.properties来修改默认的配置,在 ...