text clf rnn

#!/usr/bin/env python

# coding=utf-8

import numpy as np

import pandas as pd

import re

from bs4 import BeautifulSoup

import os

from keras.preprocessing.text import Tokenizer

from keras.preprocessing.sequence import pad_sequences

from keras.utils.np_utils import to_categorical

from keras.models import Model

from keras.layers import Dense, Input, Layer

from keras.layers import Convolution1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional

from keras import backend as K

from sklearn.model_selection import train_test_split

# Hyper-parameters shared by the preprocessing and model-building code.
#
# NOTE(review): the numeric literals on these three lines were missing from
# the source. EMBEDDING_DIM is pinned by the 100-dimensional GloVe file the
# script loads (glove.6B.100d.txt); the other two values are conventional
# choices for this kind of model and are assumptions -- confirm before use.

# Every tokenised review is padded/truncated to this many word ids.
MAX_SEQUENCE_LENGTH = 1000

# Vocabulary cap handed to the Keras Tokenizer.
MAX_NB_WORDS = 20000

# Width of one word vector; must match the pretrained embedding file.
EMBEDDING_DIM = 100

def load_data(path='~/workspace/data/imdb/labeledTrainData.tsv', max_rows=None):
    """Load the labelled IMDB review training data.

    Reads a tab-separated file that has a ``review`` and a ``sentiment``
    column. Each review is passed through BeautifulSoup to extract its plain
    text, stripped of non-ASCII characters, cleaned of backslashes and quote
    characters, trimmed and lower-cased.

    Args:
        path: Location of the TSV file.
        max_rows: Optional cap on the number of rows to load; ``None`` loads
            the whole file.
            NOTE(review): the original loop stopped early with
            ``if idx > <N>: break`` but the literal ``<N>`` was missing from
            the source. Pass ``N + 2`` here to reproduce that behaviour once
            the intended value is known.

    Returns:
        A ``(data, label)`` tuple: ``data`` is a list of cleaned review
        strings and ``label`` is the list of matching integer sentiments.
    """

    def clean_str(sent):
        # Drop backslashes and both kinds of quote characters.
        sent = re.sub(r"\\|\'|\"", '', sent)
        return sent.strip().lower()

    df = pd.read_csv(path, sep='\t')
    print(df.shape)

    if max_rows is not None:
        df = df.head(max_rows)

    data, label = [], []
    for review, sentiment in zip(df['review'], df['sentiment']):
        text = BeautifulSoup(review, 'lxml').get_text()
        # Drop non-ASCII characters, then decode back to text so the str
        # regex in clean_str also works on Python 3 (encode() returns bytes).
        text = text.encode('ascii', 'ignore').decode('ascii')
        data.append(clean_str(text))
        label.append(int(sentiment))

    return data, label

def load_weights(fname, word_index, embedding_dim=None):
    """Build an embedding matrix from pretrained GloVe word vectors.

    Each line of ``fname`` is expected to hold a token followed by its vector
    components, separated by whitespace. Row ``i`` of the returned matrix is
    the pretrained vector of the word whose index is ``i`` in ``word_index``.
    Rows for words missing from the file (and row 0, which no word maps to
    when indices start at 1) keep their random initial values.

    Args:
        fname: Path of the GloVe text file.
        word_index: Mapping from word to integer row index.
        embedding_dim: Number of components per vector. Defaults to the
            module-level ``EMBEDDING_DIM`` when omitted.

    Returns:
        A NumPy array of shape ``(len(word_index) + 1, embedding_dim)``.
    """
    import io  # io.open accepts ``encoding`` on both Python 2 and Python 3

    if embedding_dim is None:
        embedding_dim = EMBEDDING_DIM

    emb_weights = {}
    with io.open(fname, encoding='utf-8') as fr:
        for line in fr:
            values = line.strip().split()
            # Skip blank or malformed lines instead of failing later with an
            # IndexError or a shape mismatch during row assignment.
            if len(values) != embedding_dim + 1:
                continue
            word = values[0]
            # Only keep vectors that will actually be used; the full GloVe
            # vocabulary is far larger than a typical word_index.
            if word in word_index:
                emb_weights[word] = np.asarray(values[1:], dtype='float32')

    # One extra row because the row indices are used directly and index 0 is
    # reserved for padding by the Keras tokenizer.
    emb_matrix = np.random.random((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        vector = emb_weights.get(word)
        if vector is not None:
            emb_matrix[i] = vector

    return emb_matrix

# ---------------------------------------------------------------------------
# Training script: tokenise the reviews, build a bidirectional LSTM classifier
# on top of pretrained GloVe embeddings, train it and save it to disk.
# ---------------------------------------------------------------------------

# NOTE(review): the literals for the three settings below were missing from
# the source; these values are assumptions -- confirm before relying on them.
LSTM_UNITS = 100
NB_EPOCH = 10
BATCH_SIZE = 50

texts, label = load_data()

# Turn each review into a fixed-length sequence of word ids.
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

label_onehot = to_categorical(label)

x_train, x_test, y_train, y_test = train_test_split(data, label_onehot, test_size=0.1)

# Column sums of the one-hot labels give the number of reviews per class.
# The original messages never applied their format string and described the
# test split as "negative reviews".
print('Training set reviews per class: %s' % y_train.sum(axis=0))
print('Test set reviews per class: %s' % y_test.sum(axis=0))

emb_matrix = load_weights(fname='/home/jkmiao/workspace/data/glove/glove.6B.100d.txt', word_index=word_index)

# len(word_index) + 1 rows, matching the matrix built by load_weights.
embedding_layer = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[emb_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=True)

# Named sequence_input rather than ``input`` to avoid shadowing the builtin.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
bi_lstm = Bidirectional(LSTM(LSTM_UNITS))(embedded_sequences)

# One softmax unit per class; the width is taken from the one-hot labels.
output = Dense(label_onehot.shape[1], activation='softmax')(bi_lstm)

model = Model(input=sequence_input, output=output)
# ``metrics`` is documented as a list of metric names.
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

print('model fitting - Bidirectional LSTM')
model.summary()

model.fit(x_train, y_train, validation_data=(x_test, y_test), nb_epoch=NB_EPOCH, batch_size=BATCH_SIZE)

# model.save does not create missing directories.
if not os.path.isdir('model'):
    os.makedirs('model')
model.save('model/text_clf_lstm.h5')

text clf rnn的更多相关文章

论文阅读（Weilin Huang——【ECCV2016】Detecting Text in Natural Image with Connectionist Text Proposal Network）
Weilin Huang——[ECCV2016]Detecting Text in Natural Image with Connectionist Text Proposal Network 目录 ...
[Tensorflow] RNN - 04. Work with CNN for Text Classification
Ref: Combining CNN and RNN for spoken language identification Ref: Convolutional Methods for Text [1 ...
文本分类：Keras+RNN vs传统机器学习
摘要:本文通过Keras实现了一个RNN文本分类学习的案例,并详细介绍了循环神经网络原理知识及与机器学习对比. 本文分享自华为云社区<基于Keras+RNN的文本分类vs基于传统机器学习的文本分 ...
论文阅读（Weilin Huang——【AAAI2016】Reading Scene Text in Deep Convolutional Sequences）
Weilin Huang--[AAAI2016]Reading Scene Text in Deep Convolutional Sequences 目录作者和相关链接方法概括创新点和贡献方法 ...
论文阅读（Xiang Bai——【PAMI2017】An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition）
白翔的CRNN论文阅读 1. 论文题目 Xiang Bai--[PAMI2017]An End-to-End Trainable Neural Network for Image-based Seq ...
RNN 入门教程 Part 4 – 实现 RNN-LSTM 和 GRU 模型
转载 - Recurrent Neural Network Tutorial, Part 4 – Implementing a GRU/LSTM RNN with Python and Theano ...
RNN 入门教程 Part 2 – 使用 numpy 和 theano 分别实现RNN模型
转载 - Recurrent Neural Networks Tutorial, Part 2 – Implementing a RNN with Python, Numpy and Theano 本 ...
RNN 入门教程 Part 1 – RNN 简介
转载 - Recurrent Neural Networks Tutorial, Part 1 – Introduction to RNNs Recurrent Neural Networks (RN ...
循环神经网络(RNN, Recurrent Neural Networks)介绍（转载）
循环神经网络(RNN, Recurrent Neural Networks)介绍这篇文章很多内容是参考:http://www.wildml.com/2015/09/recurrent-neur ...

随机推荐

input的焦点事件
<body> <h3>表单中文本框的focus和blur事件</h3> <input id="txtest" type="tex ...
【java规则引擎】《Drools7.0.0.Final规则引擎教程》第4章 4.5RHS语法
转载至:https://blog.csdn.net/wo541075754/article/details/76651073 RHS语法使用说明 RHS是满足LHS条件之后进行后续处理部分的统称,该 ...
zabbix监控第一台主机系统
注意zabbix客户端和zabbix服务端版本要一致,否则很容易出问题实验环境,在第一台centos7(ip为192.168.245.128,以下简称主机1)上安装zabbix服务器端,在第二台ce ...
Linux系统运维故障排查
一.思路 1.处理问题要求 2.一般思路二.具体问题 1.网络问题 (1)网络不通 (2)网络很慢 2.硬件问题 3.操作系统问题 (1)系统无法正常启动 (2)系统运行慢或死机 4.服务或程序问题 ...
SecureCRT突然卡死的问题
SecureCRT作为著名的SSHclient,经经常使用于登陆远程server. 在上面编辑文本,特别是用vi打开两个文本,而且须要切换时.非常easy出现卡死的现象,不能接受不论什么的键盘输入. ...
Tower Defense Toolkit 学习
代码太多,就不贴了.用到的基本已注释. 游戏中的数据存放在Resources/Database中.游戏运行时,通过Resources.Load加载 UI构成对象池 using UnityEngi ...
网络-console
console接口h3c er8300cisco asaQuidway S5700-28C-SI Routing Switchtopsec <H3C>? reboot Reboot dev ...
MySQL事务描述
并发事务处理引起的数据问题更新丢失(Lost Update):当两个或多个事务选择同一行,然后基于最初选定的值更新该行时,由于每个事务都不知道其他事务的存在,就会发生丢失更新问题--最后的更新覆盖了 ...
js对象的key类型
http://javascript.ruanyifeng.com/grammar/object.html#toc2 对象的所有键名都是字符串(ES6 又引入了 Symbol 值也可以作为键名),所以加 ...
基于Elasticsearch的智能客服机器人
本次分享主要会介绍一下ES是如何帮我们完成NLP的任务的.在做NLP相关任务的时候,ES的相似度算法并不足以支撑用户的搜索,需要使用一些与语义相关的方法进行改进.但是ES的很多特性对我们优化搜索体验是 ...

text clf rnn

text clf rnn的更多相关文章

随机推荐

热门专题