text clf rnn
#!/usr/bin/env python
# coding=utf-8
"""Train a bidirectional-LSTM sentiment classifier on IMDB reviews.

Pipeline: load the labelled reviews, tokenize and pad them, initialise an
embedding layer from pre-trained GloVe vectors, train a Bidirectional LSTM
with a 2-way softmax head, and save the model to ``model/text_clf_lstm.h5``.

The script targets the Keras 1.x API (``nb_words``, ``nb_epoch``,
``Model(input=..., output=...)``).
"""
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
import os

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.models import Model
from keras.layers import Dense, Input, Layer
from keras.layers import Convolution1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional
from keras import backend as K
from sklearn.model_selection import train_test_split

# NOTE(review): the numeric literals of the hyper-parameters below were lost
# from the source text. EMBEDDING_DIM is fixed by the GloVe file that is
# loaded (glove.6B.100d.txt); the other values are assumptions -- confirm.
MAX_SEQUENCE_LENGTH = 1000  # assumed: reviews are padded/truncated to this many tokens
MAX_NB_WORDS = 20000        # assumed: vocabulary size kept by the Tokenizer
EMBEDDING_DIM = 100         # must match the dimension of the GloVe vectors
LSTM_UNITS = 100            # assumed
NB_EPOCH = 10               # assumed
BATCH_SIZE = 50             # assumed


def load_data(path='~/workspace/data/imdb/labeledTrainData.tsv', max_index=None):
    """Load the IMDB review training data.

    Args:
        path: tab-separated file with a ``review`` and a ``sentiment`` column.
        max_index: optional debugging cap; loading stops after the first row
            whose position is greater than ``max_index``. ``None`` loads
            every row. NOTE(review): the original hard-coded cap value was
            lost, so no cap is applied by default -- confirm.

    Returns:
        ``(data, label)``: a list of cleaned, lower-cased review texts and a
        parallel list of integer sentiment labels.
    """
    def clean_str(sent):
        # Drop backslashes and both kinds of quote characters.
        sent = re.sub(r"\\|\'|\"", '', sent)
        return sent.strip().lower()

    data, label = [], []
    df = pd.read_csv(path, sep='\t')
    print(df.shape)
    for idx, (review, sentiment) in enumerate(zip(df['review'], df['sentiment'])):
        # Strip the HTML markup, then discard non-ASCII characters.
        text = BeautifulSoup(review, 'lxml')
        data.append(clean_str(text.get_text().encode('ascii', 'ignore')))
        label.append(int(sentiment))
        if max_index is not None and idx > max_index:
            break
    return data, label


def load_weights(fname, word_index):
    """Load pre-trained GloVe word vectors into an embedding matrix.

    Args:
        fname: GloVe text file, one ``word v1 v2 ... vN`` entry per line.
        word_index: mapping of word -> row index, as produced by the
            Tokenizer.

    Returns:
        Array of shape ``(len(word_index) + 1, EMBEDDING_DIM)``. Rows of
        words found in ``fname`` hold their GloVe vector; all other rows
        keep their random initialisation.
    """
    emb_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    with open(fname) as fr:
        for line in fr:
            values = line.strip().split()
            # Skip blank or malformed lines instead of failing on assignment.
            if len(values) != EMBEDDING_DIM + 1:
                continue
            # Only words of our vocabulary are needed, so the full GloVe
            # table is never held in memory.
            row = word_index.get(values[0])
            if row is not None:
                emb_matrix[row] = np.asarray(values[1:], dtype='float32')
    return emb_matrix


texts, label = load_data()

tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
# Force two columns so the shape always matches the 2-unit softmax output.
label_onehot = to_categorical(label, 2)

x_train, x_test, y_train, y_test = train_test_split(data, label_onehot, test_size=0.1)
# Column k of the one-hot labels counts the reviews with sentiment == k
# (presumably 0 = negative, 1 = positive -- verify against the dataset).
train_counts = y_train.sum(axis=0)
test_counts = y_test.sum(axis=0)
print('Training set: %d negative, %d positive reviews' % (train_counts[0], train_counts[1]))
print('Test set: %d negative, %d positive reviews' % (test_counts[0], test_counts[1]))

emb_matrix = load_weights(fname='/home/jkmiao/workspace/data/glove/glove.6B.100d.txt', word_index=word_index)
embedding_layer = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[emb_matrix],
                            input_length=MAX_SEQUENCE_LENGTH, trainable=True)

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
bi_lstm = Bidirectional(LSTM(LSTM_UNITS))(embedded_sequences)
output = Dense(2, activation='softmax')(bi_lstm)

model = Model(input=sequence_input, output=output)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

print('model fitting - Bidirectional LSTM')
model.summary()
model.fit(x_train, y_train, validation_data=(x_test, y_test), nb_epoch=NB_EPOCH, batch_size=BATCH_SIZE)
# Create the output directory first so training is not wasted on a failed save.
if not os.path.isdir('model'):
    os.makedirs('model')
model.save('model/text_clf_lstm.h5')
text clf rnn的更多相关文章
- 论文阅读(Weilin Huang——【ECCV2016】Detecting Text in Natural Image with Connectionist Text Proposal Network)
Weilin Huang——[ECCV2016]Detecting Text in Natural Image with Connectionist Text Proposal Network 目录 ...
- [Tensorflow] RNN - 04. Work with CNN for Text Classification
Ref: Combining CNN and RNN for spoken language identification Ref: Convolutional Methods for Text [1 ...
- 文本分类:Keras+RNN vs传统机器学习
摘要:本文通过Keras实现了一个RNN文本分类学习的案例,并详细介绍了循环神经网络原理知识及与机器学习对比。本文分享自华为云社区《基于Keras+RNN的文本分类vs基于传统机器学习的文本分 ...
- 论文阅读(Weilin Huang——【AAAI2016】Reading Scene Text in Deep Convolutional Sequences)
Weilin Huang--[AAAI2016]Reading Scene Text in Deep Convolutional Sequences 目录 作者和相关链接 方法概括 创新点和贡献 方法 ...
- 论文阅读(Xiang Bai——【PAMI2017】An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition)
白翔的CRNN论文阅读 1. 论文题目 Xiang Bai--[PAMI2017]An End-to-End Trainable Neural Network for Image-based Seq ...
- RNN 入门教程 Part 4 – 实现 RNN-LSTM 和 GRU 模型
转载 - Recurrent Neural Network Tutorial, Part 4 – Implementing a GRU/LSTM RNN with Python and Theano ...
- RNN 入门教程 Part 2 – 使用 numpy 和 theano 分别实现RNN模型
转载 - Recurrent Neural Networks Tutorial, Part 2 – Implementing a RNN with Python, Numpy and Theano 本 ...
- RNN 入门教程 Part 1 – RNN 简介
转载 - Recurrent Neural Networks Tutorial, Part 1 – Introduction to RNNs Recurrent Neural Networks (RN ...
- 循环神经网络(RNN, Recurrent Neural Networks)介绍(转载)
循环神经网络(RNN, Recurrent Neural Networks)介绍 这篇文章很多内容是参考:http://www.wildml.com/2015/09/recurrent-neur ...
随机推荐
- 【数据库】jdbc详解
try { if(resultSet!=null){ resultSet.close(); } }catch (SQLException e){ e.printStackTrace(); }final ...
- -save和-save-dev 区别(转载)
本文原文地址:https://www.limitcode.com/detail/59a15b1a69e95702e0780249.html 回顾 npm install 命令 最近在写Node程序的时 ...
- linux之数据备份
第一种方法:tar备份 [root@bogon ~]# cat bp/linux.txt no centos [root@bogon ~]# tar cvf bp.tar bp //打包bp目录 bp ...
- lerna import && add 使用&&常见问题解决
使用lerna 的import 我们可以方便的将一个普通的npm 包倒入到lerna 管理的monorepo 中 环境准备 lerna init 注意必须是一个git 项目,同时需要commit ,不 ...
- css动画和js动画的差异
代码复杂度,js 动画代码相对复杂一些 动画运行时,对动画的控制程度上,js 能够让动画,暂停,取消,终止,css动画不能添加事件 动画性能看,js 动画多了一个js 解析的过程,性能不如 css 动 ...
- 关注 硬件 发展, 转载一篇介绍 VHDL 的文章
<VHDL学习笔记> https://www.eefocus.com/hrbeulvcaho/blog/12-11/289109_978e2.html VHDL 和 “可编程逻辑阵列” ...
- 适配器模式adapter
1. 主要优点 无论是对象适配器模式还是类适配器模式都具有如下优点: (1) 将目标类和适配者类解耦,通过引入一个适配器类来重用现有的适配者类,无须修改原有结构.(适配者得结构 (2) 增加了类的透明 ...
- DevOps需要的工具
DevOps需要的工具: 代码管理(SCM):GitHub.GitLab.BitBucket.SubVersion 构建工具:Ant.Gradle.maven 自动部署:Capistrano.Code ...
- 搭建Cordova + Ionic + WebStorm环境开发Web App应用
1. 下载并且安装Node.js(https://nodejs.org/en/) 2. 打开终端,安装cordova (如果安装失败或者卡住不动则重新安装) sudo npm install - ...
- java集合之HashMap源码解析
Map是java中的一种数据结构,围绕着Map接口,有一系列的实现类如Hashtable.HashMap.LinkedHashMap和TreeMap.而其中HashMap和Hashtable我们平常使 ...