fastText:

'''This example demonstrates the use of fastText for text classification.
Based on Joulin et al.'s paper:
Bags of Tricks for Efficient Text Classification
https://arxiv.org/abs/1607.01759
Results on the IMDB dataset with uni- and bi-gram embeddings:
    Uni-gram: 0.8813 test accuracy after 5 epochs. 8s/epoch on i7 CPU.
    Bi-gram : 0.9056 test accuracy after 5 epochs. 2s/epoch on GTx 980M GPU.
'''
from __future__ import print_function

import json
import warnings

import numpy as np
from keras.preprocessing import sequence
from keras.preprocessing.sequence import _remove_long_seq  # used by load_data() when a maxlen filter is given
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras.datasets import imdb


def load_data(path='imdb.npz', num_words=None, skip_top=0,
              maxlen=None, seed=113,
              start_char=1, oov_char=2, index_from=3, **kwargs):
    """Loads the IMDB dataset.

    # Arguments
        path: where to cache the data (relative to `~/.keras/dataset`).
        num_words: max number of words to include. Words are ranked
            by how often they occur (in the training set) and only
            the most frequent words are kept.
        skip_top: skip the top N most frequently occurring words
            (which may not be informative).
        maxlen: sequences longer than this will be filtered out.
        seed: random seed for sample shuffling.
        start_char: The start of a sequence will be marked with this character.
            Set to 1 because 0 is usually the padding character.
        oov_char: words that were cut out because of the `num_words`
            or `skip_top` limit will be replaced with this character.
        index_from: index actual words with this index and higher.

    # Returns
        Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.

    # Raises
        ValueError: in case `maxlen` is so low
            that no input sequence could be kept.

    Note that the 'out of vocabulary' character is only used for
    words that were present in the training set but are not included
    because they're not making the `num_words` cut here.
    Words that were not seen in the training set but are in the test set
    have simply been skipped.
    """
    # Legacy support
    if 'nb_words' in kwargs:
        warnings.warn('The `nb_words` argument in `load_data` '
                      'has been renamed `num_words`.')
        num_words = kwargs.pop('nb_words')
    if kwargs:
        raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))

    # path = get_file(path,
    #                 origin='https://s3.amazonaws.com/text-datasets/imdb.npz',
    #                 file_hash='599dadb1135973df5b59232a0e9a887c')
    with np.load(path) as f:
        x_train, labels_train = f['x_train'], f['y_train']
        x_test, labels_test = f['x_test'], f['y_test']

    np.random.seed(seed)
    indices = np.arange(len(x_train))
    np.random.shuffle(indices)
    x_train = x_train[indices]
    labels_train = labels_train[indices]

    indices = np.arange(len(x_test))
    np.random.shuffle(indices)
    x_test = x_test[indices]
    labels_test = labels_test[indices]

    xs = np.concatenate([x_train, x_test])
    labels = np.concatenate([labels_train, labels_test])

    if start_char is not None:
        xs = [[start_char] + [w + index_from for w in x] for x in xs]
    elif index_from:
        xs = [[w + index_from for w in x] for x in xs]

    if maxlen:
        xs, labels = _remove_long_seq(maxlen, xs, labels)
        if not xs:
            raise ValueError('After filtering for sequences shorter than maxlen=' +
                             str(maxlen) + ', no sequence was kept. '
                             'Increase maxlen.')
    if not num_words:
        num_words = max([max(x) for x in xs])

    # by convention, use 2 as OOV word
    # reserve 'index_from' (=3 by default) characters:
    # 0 (padding), 1 (start), 2 (OOV)
    if oov_char is not None:
        xs = [[w if (skip_top <= w < num_words) else oov_char for w in x] for x in xs]
    else:
        xs = [[w for w in x if skip_top <= w < num_words] for x in xs]

    idx = len(x_train)
    x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
    x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])
    return (x_train, y_train), (x_test, y_test)


def create_ngram_set(input_list, ngram_value=2):
    """
    Extract a set of n-grams from a list of integers.

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
    {(4, 9), (4, 1), (1, 4), (9, 4)}

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
    [(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)]
    """
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))


def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment the input list of lists (sequences) by appending n-gram values.

    Example: adding bi-grams
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]

    Example: adding tri-grams
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
    """
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(new_list) - ngram_value + 1):
                ngram = tuple(new_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences


# Set parameters:
# ngram_range = 2 will add bi-gram features
ngram_range = 1
max_features = 20000
maxlen = 400
batch_size = 32
embedding_dims = 50
epochs = 5

print('Loading data...')
# the data, split between train and test sets
# (x_train, y_train), (x_test, y_test) = load_data()
(x_train, y_train), (x_test, y_test) = load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

if ngram_range > 1:
    print('Adding {}-gram features'.format(ngram_range))
    # Create a set of unique n-grams from the training set.
    ngram_set = set()
    for input_list in x_train:
        for i in range(2, ngram_range + 1):
            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
            ngram_set.update(set_of_ngram)

    # Dictionary mapping each n-gram token to a unique integer.
    # Integer values are greater than max_features in order
    # to avoid collision with existing features.
    start_index = max_features + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}

    # max_features is the highest integer that could be found in the dataset.
    max_features = np.max(list(indice_token.keys())) + 1

    # Augmenting x_train and x_test with n-gram features
    x_train = add_ngram(x_train, token_indice, ngram_range)
    x_test = add_ngram(x_test, token_indice, ngram_range)
    print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
    print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))

# we add a GlobalAveragePooling1D, which will average the embeddings
# of all words in the document
model.add(GlobalAveragePooling1D())

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

Results:

Train on 25000 samples, validate on 25000 samples
Epoch 1/50
2018-06-06 15:50:28.133461: I tensorflow/core/platform/cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
25000/25000 [==============================] - 9s 379us/step - loss: 0.6125 - acc: 0.7431 - val_loss: 0.5050 - val_acc: 0.8227
Epoch 2/50
25000/25000 [==============================] - 10s 402us/step - loss: 0.4059 - acc: 0.8633 - val_loss: 0.3738 - val_acc: 0.8646
Epoch 3/50
25000/25000 [==============================] - 11s 441us/step - loss: 0.3061 - acc: 0.8934 - val_loss: 0.3219 - val_acc: 0.8783
Epoch 4/50
25000/25000 [==============================] - 9s 375us/step - loss: 0.2550 - acc: 0.9110 - val_loss: 0.2970 - val_acc: 0.8853
Epoch 5/50

As you can see, one epoch takes only ten seconds or so — quite fast! However, after letting it run for 50 epochs I found that training accuracy reached 100% while validation accuracy stayed around 86%, so the model is clearly overfitting.
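One common way to curb that overfitting without hand-picking the epoch count is Keras's EarlyStopping callback, which stops training once the monitored validation metric stops improving. Below is a minimal sketch reusing `model`, `x_train`, `batch_size`, etc. from the script above; the `patience` value is my own choice, not something from the original run:

from keras.callbacks import EarlyStopping

# Stop as soon as validation loss has not improved for 2 consecutive epochs,
# instead of always running the full 50 epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=2)

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=50,
          validation_data=(x_test, y_test),
          callbacks=[early_stop])

Adding Dropout after the embedding, as the CNN script below does, is another option worth trying.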

Now compare with a traditional CNN:

'''This example demonstrates the use of Convolution1D for text classification.
Gets to 0.89 test accuracy after 2 epochs.
90s/epoch on Intel i5 2.4GHz CPU.
10s/epoch on Tesla K40 GPU.
'''
from __future__ import print_function

import json
import warnings

import numpy as np
from keras.preprocessing import sequence
from keras.preprocessing.sequence import _remove_long_seq  # used by load_data() when a maxlen filter is given
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb


def load_data(path='imdb.npz', num_words=None, skip_top=0,
              maxlen=None, seed=113,
              start_char=1, oov_char=2, index_from=3, **kwargs):
    """Loads the IMDB dataset.

    # Arguments
        path: where to cache the data (relative to `~/.keras/dataset`).
        num_words: max number of words to include. Words are ranked
            by how often they occur (in the training set) and only
            the most frequent words are kept.
        skip_top: skip the top N most frequently occurring words
            (which may not be informative).
        maxlen: sequences longer than this will be filtered out.
        seed: random seed for sample shuffling.
        start_char: The start of a sequence will be marked with this character.
            Set to 1 because 0 is usually the padding character.
        oov_char: words that were cut out because of the `num_words`
            or `skip_top` limit will be replaced with this character.
        index_from: index actual words with this index and higher.

    # Returns
        Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.

    # Raises
        ValueError: in case `maxlen` is so low
            that no input sequence could be kept.

    Note that the 'out of vocabulary' character is only used for
    words that were present in the training set but are not included
    because they're not making the `num_words` cut here.
    Words that were not seen in the training set but are in the test set
    have simply been skipped.
    """
    # Legacy support
    if 'nb_words' in kwargs:
        warnings.warn('The `nb_words` argument in `load_data` '
                      'has been renamed `num_words`.')
        num_words = kwargs.pop('nb_words')
    if kwargs:
        raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))

    # path = get_file(path,
    #                 origin='https://s3.amazonaws.com/text-datasets/imdb.npz',
    #                 file_hash='599dadb1135973df5b59232a0e9a887c')
    with np.load(path) as f:
        x_train, labels_train = f['x_train'], f['y_train']
        x_test, labels_test = f['x_test'], f['y_test']

    np.random.seed(seed)
    indices = np.arange(len(x_train))
    np.random.shuffle(indices)
    x_train = x_train[indices]
    labels_train = labels_train[indices]

    indices = np.arange(len(x_test))
    np.random.shuffle(indices)
    x_test = x_test[indices]
    labels_test = labels_test[indices]

    xs = np.concatenate([x_train, x_test])
    labels = np.concatenate([labels_train, labels_test])

    if start_char is not None:
        xs = [[start_char] + [w + index_from for w in x] for x in xs]
    elif index_from:
        xs = [[w + index_from for w in x] for x in xs]

    if maxlen:
        xs, labels = _remove_long_seq(maxlen, xs, labels)
        if not xs:
            raise ValueError('After filtering for sequences shorter than maxlen=' +
                             str(maxlen) + ', no sequence was kept. '
                             'Increase maxlen.')
    if not num_words:
        num_words = max([max(x) for x in xs])

    # by convention, use 2 as OOV word
    # reserve 'index_from' (=3 by default) characters:
    # 0 (padding), 1 (start), 2 (OOV)
    if oov_char is not None:
        xs = [[w if (skip_top <= w < num_words) else oov_char for w in x] for x in xs]
    else:
        xs = [[w for w in x if skip_top <= w < num_words] for x in xs]

    idx = len(x_train)
    x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
    x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])
    return (x_train, y_train), (x_test, y_test)


def create_ngram_set(input_list, ngram_value=2):
    """
    Extract a set of n-grams from a list of integers.

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
    {(4, 9), (4, 1), (1, 4), (9, 4)}

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
    [(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)]
    """
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))


def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment the input list of lists (sequences) by appending n-gram values.

    Example: adding bi-grams
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]

    Example: adding tri-grams
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
    """
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(new_list) - ngram_value + 1):
                ngram = tuple(new_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences


# set parameters:
max_features = 5000
maxlen = 400
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 5

print('Loading data...')
(x_train, y_train), (x_test, y_test) = load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model.add(Dropout(0.2))

# we add a Convolution1D, which will learn word group filters
# of size kernel_size:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

Results:

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
2018-06-06 16:10:34.733973: I tensorflow/core/platform/cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
25000/25000 [==============================] - 117s 5ms/step - loss: 0.4044 - acc: 0.8007 - val_loss: 0.3212 - val_acc: 0.8600
Epoch 2/5
25000/25000 [==============================] - 121s 5ms/step - loss: 0.2323 - acc: 0.9057 - val_loss: 0.2903 - val_acc: 0.8801
Epoch 3/5
25000/25000 [==============================] - 124s 5ms/step - loss: 0.1640 - acc: 0.9377 - val_loss: 0.2720 - val_acc: 0.8900
Epoch 4/5
25000/25000 [==============================] - 116s 5ms/step - loss: 0.1136 - acc: 0.9579 - val_loss: 0.3353 - val_acc: 0.8811
Epoch 5/5
25000/25000 [==============================] - 114s 5ms/step - loss: 0.0764 - acc: 0.9726 - val_loss: 0.3958 - val_acc: 0.8793

As you can see, the CNN really is about 10x slower per epoch here (roughly 120s versus 10s), while the best validation accuracies are comparable (0.8900 for the CNN vs. 0.8853 for fastText in these runs).
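The per-epoch times quoted above come from the Keras progress bar. If you want to collect them programmatically (for example, to average over several runs), a small custom callback does the job. This is my own sketch, not part of either script; it reuses `model`, `x_train`, `batch_size`, and `epochs` from above:

import time

from keras.callbacks import Callback


class EpochTimer(Callback):
    """Record wall-clock seconds per epoch (validation time included)."""

    def on_train_begin(self, logs=None):
        self.times = []

    def on_epoch_begin(self, epoch, logs=None):
        self._start = time.time()

    def on_epoch_end(self, epoch, logs=None):
        self.times.append(time.time() - self._start)


timer = EpochTimer()
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test),
          callbacks=[timer])
print('Seconds per epoch:', [round(t, 1) for t in timer.times])

Running the same callback with both models gives a like-for-like comparison on your own hardware.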
