fasttext:

'''This example demonstrates the use of fasttext for text classification
Based on Joulin et al's paper:
Bags of Tricks for Efficient Text Classification
https://arxiv.org/abs/1607.01759
Results on IMDB datasets with uni and bi-gram embeddings:
Uni-gram: 0.8813 test accuracy after 5 epochs. 8s/epoch on i7 cpu.
Bi-gram : 0.9056 test accuracy after 5 epochs. 2s/epoch on GTx 980M gpu.
''' from __future__ import print_function
import numpy as np from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras.datasets import imdb import numpy as np
import json
import warnings def load_data(path='imdb.npz', num_words=None, skip_top=0,
maxlen=None, seed=113,
start_char=1, oov_char=2, index_from=3, **kwargs):
"""Loads the IMDB dataset.
# Arguments
path: where to cache the data (relative to `~/.keras/dataset`).
num_words: max number of words to include. Words are ranked
by how often they occur (in the training set) and only
the most frequent words are kept
skip_top: skip the top N most frequently occurring words
(which may not be informative).
maxlen: sequences longer than this will be filtered out.
seed: random seed for sample shuffling.
start_char: The start of a sequence will be marked with this character.
Set to 1 because 0 is usually the padding character.
oov_char: words that were cut out because of the `num_words`
or `skip_top` limit will be replaced with this character.
index_from: index actual words with this index and higher.
# Returns
Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
# Raises
ValueError: in case `maxlen` is so low
that no input sequence could be kept.
Note that the 'out of vocabulary' character is only used for
words that were present in the training set but are not included
because they're not making the `num_words` cut here.
Words that were not seen in the training set but are in the test set
have simply been skipped.
"""
# Legacy support
if 'nb_words' in kwargs:
warnings.warn('The `nb_words` argument in `load_data` '
'has been renamed `num_words`.')
num_words = kwargs.pop('nb_words')
if kwargs:
raise TypeError('Unrecognized keyword arguments: ' + str(kwargs)) #path = get_file(path,
# origin='https://s3.amazonaws.com/text-datasets/imdb.npz',
# file_hash='599dadb1135973df5b59232a0e9a887c')
with np.load(path) as f:
x_train, labels_train = f['x_train'], f['y_train']
x_test, labels_test = f['x_test'], f['y_test'] np.random.seed(seed)
indices = np.arange(len(x_train))
np.random.shuffle(indices)
x_train = x_train[indices]
labels_train = labels_train[indices] indices = np.arange(len(x_test))
np.random.shuffle(indices)
x_test = x_test[indices]
labels_test = labels_test[indices] xs = np.concatenate([x_train, x_test])
labels = np.concatenate([labels_train, labels_test]) if start_char is not None:
xs = [[start_char] + [w + index_from for w in x] for x in xs]
elif index_from:
xs = [[w + index_from for w in x] for x in xs] if maxlen:
xs, labels = _remove_long_seq(maxlen, xs, labels)
if not xs:
raise ValueError('After filtering for sequences shorter than maxlen=' +
str(maxlen) + ', no sequence was kept. '
'Increase maxlen.')
if not num_words:
num_words = max([max(x) for x in xs]) # by convention, use 2 as OOV word
# reserve 'index_from' (=3 by default) characters:
# 0 (padding), 1 (start), 2 (OOV)
if oov_char is not None:
xs = [[w if (skip_top <= w < num_words) else oov_char for w in x] for x in xs]
else:
xs = [[w for w in x if skip_top <= w < num_words] for x in xs] idx = len(x_train)
x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])
return (x_train, y_train), (x_test, y_test) def create_ngram_set(input_list, ngram_value=2):
"""
Extract a set of n-grams from a list of integers.
>>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
{(4, 9), (4, 1), (1, 4), (9, 4)}
>>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
[(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)]
"""
return set(zip(*[input_list[i:] for i in range(ngram_value)])) def add_ngram(sequences, token_indice, ngram_range=2):
"""
Augment the input list of list (sequences) by appending n-grams values.
Example: adding bi-gram
>>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
>>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
>>> add_ngram(sequences, token_indice, ngram_range=2)
[[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
Example: adding tri-gram
>>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
>>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
>>> add_ngram(sequences, token_indice, ngram_range=3)
[[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
"""
new_sequences = []
for input_list in sequences:
new_list = input_list[:]
for ngram_value in range(2, ngram_range + 1):
for i in range(len(new_list) - ngram_value + 1):
ngram = tuple(new_list[i:i + ngram_value])
if ngram in token_indice:
new_list.append(token_indice[ngram])
new_sequences.append(new_list) return new_sequences # Set parameters:
# ngram_range = 2 will add bi-grams features
ngram_range = 1
max_features = 20000
maxlen = 400
batch_size = 32
embedding_dims = 50
epochs = 5 print('Loading data...')
# the data, split between train and test sets
#(x_train, y_train), (x_test, y_test) = load_data()
(x_train, y_train), (x_test, y_test) = load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int))) if ngram_range > 1:
print('Adding {}-gram features'.format(ngram_range))
# Create set of unique n-gram from the training set.
ngram_set = set()
for input_list in x_train:
for i in range(2, ngram_range + 1):
set_of_ngram = create_ngram_set(input_list, ngram_value=i)
ngram_set.update(set_of_ngram) # Dictionary mapping n-gram token to a unique integer.
# Integer values are greater than max_features in order
# to avoid collision with existing features.
start_index = max_features + 1
token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
indice_token = {token_indice[k]: k for k in token_indice} # max_features is the highest integer that could be found in the dataset.
max_features = np.max(list(indice_token.keys())) + 1 # Augmenting x_train and x_test with n-grams features
x_train = add_ngram(x_train, token_indice, ngram_range)
x_test = add_ngram(x_test, token_indice, ngram_range)
print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int))) print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape) print('Build model...')
model = Sequential() # we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
embedding_dims,
input_length=maxlen)) # we add a GlobalAveragePooling1D, which will average the embeddings
# of all words in the document
model.add(GlobalAveragePooling1D()) # We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1, activation='sigmoid')) model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy']) model.fit(x_train, y_train,
batch_size=batch_size,
epochs=epochs, validation_data=(x_test, y_test))

效果:

Train on 25000 samples, validate on 25000 samples
Epoch 1/50
2018-06-06 15:50:28.133461: I tensorflow/core/platform/cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
25000/25000 [==============================] - 9s 379us/step - loss: 0.6125 - acc: 0.7431 - val_loss: 0.5050 - val_acc: 0.8227
Epoch 2/50
25000/25000 [==============================] - 10s 402us/step - loss: 0.4059 - acc: 0.8633 - val_loss: 0.3738 - val_acc: 0.8646
Epoch 3/50
25000/25000 [==============================] - 11s 441us/step - loss: 0.3061 - acc: 0.8934 - val_loss: 0.3219 - val_acc: 0.8783
Epoch 4/50
25000/25000 [==============================] - 9s 375us/step - loss: 0.2550 - acc: 0.9110 - val_loss: 0.2970 - val_acc: 0.8853
Epoch 5/50

可以看到一个epoch只需要10来秒,还是很快的!但是我训练到50个epoch后发现acc 100%,但是验证集上数据acc 86%,看来是过拟合了。

再看看传统cnn:

'''This example demonstrates the use of Convolution1D for text classification.
Gets to 0.89 test accuracy after 2 epochs.
90s/epoch on Intel i5 2.4Ghz CPU.
10s/epoch on Tesla K40 GPU.
'''
from __future__ import print_function from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb # set parameters:
max_features = 5000
import numpy as np
import json
import warnings def load_data(path='imdb.npz', num_words=None, skip_top=0,
maxlen=None, seed=113,
start_char=1, oov_char=2, index_from=3, **kwargs):
"""Loads the IMDB dataset.
# Arguments
path: where to cache the data (relative to `~/.keras/dataset`).
num_words: max number of words to include. Words are ranked
by how often they occur (in the training set) and only
the most frequent words are kept
skip_top: skip the top N most frequently occurring words
(which may not be informative).
maxlen: sequences longer than this will be filtered out.
seed: random seed for sample shuffling.
start_char: The start of a sequence will be marked with this character.
Set to 1 because 0 is usually the padding character.
oov_char: words that were cut out because of the `num_words`
or `skip_top` limit will be replaced with this character.
index_from: index actual words with this index and higher.
# Returns
Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
# Raises
ValueError: in case `maxlen` is so low
that no input sequence could be kept.
Note that the 'out of vocabulary' character is only used for
words that were present in the training set but are not included
because they're not making the `num_words` cut here.
Words that were not seen in the training set but are in the test set
have simply been skipped.
"""
# Legacy support
if 'nb_words' in kwargs:
warnings.warn('The `nb_words` argument in `load_data` '
'has been renamed `num_words`.')
num_words = kwargs.pop('nb_words')
if kwargs:
raise TypeError('Unrecognized keyword arguments: ' + str(kwargs)) #path = get_file(path,
# origin='https://s3.amazonaws.com/text-datasets/imdb.npz',
# file_hash='599dadb1135973df5b59232a0e9a887c')
with np.load(path) as f:
x_train, labels_train = f['x_train'], f['y_train']
x_test, labels_test = f['x_test'], f['y_test'] np.random.seed(seed)
indices = np.arange(len(x_train))
np.random.shuffle(indices)
x_train = x_train[indices]
labels_train = labels_train[indices] indices = np.arange(len(x_test))
np.random.shuffle(indices)
x_test = x_test[indices]
labels_test = labels_test[indices] xs = np.concatenate([x_train, x_test])
labels = np.concatenate([labels_train, labels_test]) if start_char is not None:
xs = [[start_char] + [w + index_from for w in x] for x in xs]
elif index_from:
xs = [[w + index_from for w in x] for x in xs] if maxlen:
xs, labels = _remove_long_seq(maxlen, xs, labels)
if not xs:
raise ValueError('After filtering for sequences shorter than maxlen=' +
str(maxlen) + ', no sequence was kept. '
'Increase maxlen.')
if not num_words:
num_words = max([max(x) for x in xs]) # by convention, use 2 as OOV word
# reserve 'index_from' (=3 by default) characters:
# 0 (padding), 1 (start), 2 (OOV)
if oov_char is not None:
xs = [[w if (skip_top <= w < num_words) else oov_char for w in x] for x in xs]
else:
xs = [[w for w in x if skip_top <= w < num_words] for x in xs] idx = len(x_train)
x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])
return (x_train, y_train), (x_test, y_test) def create_ngram_set(input_list, ngram_value=2):
"""
Extract a set of n-grams from a list of integers.
>>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
{(4, 9), (4, 1), (1, 4), (9, 4)}
>>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
[(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)]
"""
return set(zip(*[input_list[i:] for i in range(ngram_value)])) def add_ngram(sequences, token_indice, ngram_range=2):
"""
Augment the input list of list (sequences) by appending n-grams values.
Example: adding bi-gram
>>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
>>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
>>> add_ngram(sequences, token_indice, ngram_range=2)
[[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
Example: adding tri-gram
>>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
>>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
>>> add_ngram(sequences, token_indice, ngram_range=3)
[[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
"""
new_sequences = []
for input_list in sequences:
new_list = input_list[:]
for ngram_value in range(2, ngram_range + 1):
for i in range(len(new_list) - ngram_value + 1):
ngram = tuple(new_list[i:i + ngram_value])
if ngram in token_indice:
new_list.append(token_indice[ngram])
new_sequences.append(new_list) return new_sequences
maxlen = 400
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 5 print('Loading data...')
(x_train, y_train), (x_test, y_test) = load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences') print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape) print('Build model...')
model = Sequential() # we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
embedding_dims,
input_length=maxlen))
model.add(Dropout(0.2)) # we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(filters,
kernel_size,
padding='valid',
activation='relu',
strides=1))
# we use max pooling:
model.add(GlobalMaxPooling1D()) # We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu')) # We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid')) model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy'])
model.fit(x_train, y_train,
batch_size=batch_size,
epochs=epochs,
validation_data=(x_test, y_test))

效果:

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
2018-06-06 16:10:34.733973: I tensorflow/core/platform/cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
25000/25000 [==============================] - 117s 5ms/step - loss: 0.4044 - acc: 0.8007 - val_loss: 0.3212 - val_acc: 0.8600
Epoch 2/5
25000/25000 [==============================] - 121s 5ms/step - loss: 0.2323 - acc: 0.9057 - val_loss: 0.2903 - val_acc: 0.8801
Epoch 3/5
25000/25000 [==============================] - 124s 5ms/step - loss: 0.1640 - acc: 0.9377 - val_loss: 0.2720 - val_acc: 0.8900
Epoch 4/5
25000/25000 [==============================] - 116s 5ms/step - loss: 0.1136 - acc: 0.9579 - val_loss: 0.3353 - val_acc: 0.8811
Epoch 5/5
25000/25000 [==============================] - 114s 5ms/step - loss: 0.0764 - acc: 0.9726 - val_loss: 0.3958 - val_acc: 0.8793

可以看出cnn的确要慢10倍。

fasttext和cnn的比较,使用keras imdb看效果——cnn要慢10倍。的更多相关文章

  1. Deep Learning模型之:CNN卷积神经网络(一)深度解析CNN

    http://m.blog.csdn.net/blog/wu010555688/24487301 本文整理了网上几位大牛的博客,详细地讲解了CNN的基础结构与核心思想,欢迎交流. [1]Deep le ...

  2. Keras(四)CNN 卷积神经网络 RNN 循环神经网络 原理及实例

    CNN 卷积神经网络 卷积 池化 https://www.cnblogs.com/peng8098/p/nlp_16.html 中有介绍 以数据集MNIST构建一个卷积神经网路 from keras. ...

  3. 【Keras案例学习】 CNN做手写字符分类(mnist_cnn )

    from __future__ import print_function import numpy as np np.random.seed(1337) from keras.datasets im ...

  4. Keras框架下使用CNN进行CIFAR-10的识别测试

    有手册,然后代码不知道看一下:https://keras-cn.readthedocs.io/en/latest/ 首先是下载数据集,下载太慢了就从网盘上下载: 链接:https://pan.baid ...

  5. [C1W1] Neural Networks and Deep Learning - Introduction to Deep Learning

    第一周:深度学习引言(Introduction to Deep Learning) 欢迎(Welcome) 深度学习改变了传统互联网业务,例如如网络搜索和广告.但是深度学习同时也使得许多新产品和企业以 ...

  6. keras入门(三)搭建CNN模型破解网站验证码

    项目介绍   在文章CNN大战验证码中,我们利用TensorFlow搭建了简单的CNN模型来破解某个网站的验证码.验证码如下: 在本文中,我们将会用Keras来搭建一个稍微复杂的CNN模型来破解以上的 ...

  7. 使用Keras进行深度学习:(二)CNN讲解及实践

    欢迎大家关注我们的网站和系列教程:http://www.tensorflownews.com/,学习更多的机器学习.深度学习的知识! 现今最主流的处理图像数据的技术当属深度神经网络了,尤其是卷积神经网 ...

  8. 深度学习:Keras入门(二)之卷积神经网络(CNN)

    说明:这篇文章需要有一些相关的基础知识,否则看起来可能比较吃力. 1.卷积与神经元 1.1 什么是卷积? 简单来说,卷积(或内积)就是一种先把对应位置相乘然后再把结果相加的运算.(具体含义或者数学公式 ...

  9. 深度学习:Keras入门(二)之卷积神经网络(CNN)【转】

    本文转载自:https://www.cnblogs.com/lc1217/p/7324935.html 说明:这篇文章需要有一些相关的基础知识,否则看起来可能比较吃力. 1.卷积与神经元 1.1 什么 ...

随机推荐

  1. (4.9)SQL Server如何校验备份文件

    译 SQL Server如何校验备份文件 转自:https://blog.csdn.net/tjvictor/article/details/5261666 RESTORE VERIFYONLY与 c ...

  2. 安卓3d引擎

    很 多初学Android游戏开发 href="http://edu.gamfe.com/gamedev.html">游戏开发的朋友,往往会显得有些无所适从.他们经常不知道该从 ...

  3. C# 函数4

    //数据库     public class GF_DA     {         /// <summary>         /// 执行SQL语句 sConnStr 连接字符串,sq ...

  4. 标准c内存函数的使用方法

    标准c内存函数 calloc 语法:     #include <stdlib.h>   void *calloc( size_t num, size_t size ); 功能: 函数返回 ...

  5. oracle扩容

    动态添加表空间: alter tablespace cbs_cos add datafile '/dba/oradata/ORADEVdatafile/cbs_cos02.dbf' size 100m ...

  6. Jquery.ScrollLoading图片延迟加载技术

    关于分屏加载图片,像天猫.京东等电商图片较多页面很长,就采用了延迟加载技术. 目前很流行的做法就是滚动动态加载,显示屏幕之外的图片默认是不加载的, 随着页面的滚动,显示区域图片才被动态加载. 原理其实 ...

  7. Vuex mapGetters,mapActions

    一.基本用法 1. 初始化并创建一个项目 ? 1 2 3 vue init webpack-simple vuex-demo cd vuex-demo npm install 2. 安装 vuex ? ...

  8. CSS伪元素实现的3D按钮

    在线演示 本地下载

  9. php数组函数-array_intersect()

    array_intersect()函数返回两个或多个数组的交集数组 结果数组包含了所有在被比较数组中,也同时出现在所有其他参数数组中 的值,键名保留不变 array_intersect(array1, ...

  10. 网络安全-跨站脚本攻击XSS(Cross-Site Scripting)

    一.XSS攻击简介 作为一种HTML注入攻击,XSS攻击的核心思想就是在HTML页面中注入恶意代码,而XSS采用的注入方式是非常巧妙的. 在XSS攻击中,一般有三个角色参与:攻击者.目标服务器.受害者 ...