fasttext和cnn的比较，使用keras imdb看效果——cnn要慢10倍。

fasttext:

'''This example demonstrates the use of fasttext for text classification

Based on Joulin et al.'s paper:

Bag of Tricks for Efficient Text Classification

https://arxiv.org/abs/1607.01759

Results on IMDB datasets with uni and bi-gram embeddings:

    Uni-gram: 0.8813 test accuracy after 5 epochs. 8s/epoch on i7 cpu.

    Bi-gram : 0.9056 test accuracy after 5 epochs. 2s/epoch on GTX 980M gpu.

'''

from __future__ import print_function

import numpy as np

from keras.preprocessing import sequence

from keras.models import Sequential

from keras.layers import Dense

from keras.layers import Embedding

from keras.layers import GlobalAveragePooling1D

from keras.datasets import imdb

import numpy as np

import json

import warnings

def load_data(path='imdb.npz', num_words=None, skip_top=0,
              maxlen=None, seed=113,
              start_char=1, oov_char=2, index_from=3, **kwargs):
    """Load the IMDB dataset from a local `.npz` archive.

    This is a stand-alone variant of `keras.datasets.imdb.load_data`:
    nothing is downloaded, so `path` must point to an existing copy of
    `imdb.npz` (published at
    https://s3.amazonaws.com/text-datasets/imdb.npz).

    # Arguments
        path: path of the local `.npz` file containing the arrays
            `x_train`, `y_train`, `x_test` and `y_test`.
        num_words: max number of words to include. Words are ranked
            by how often they occur (in the training set) and only
            the most frequent words are kept. When falsy, the largest
            index found in the data is used.
        skip_top: skip the top N most frequently occurring words
            (which may not be informative).
        maxlen: only sequences strictly shorter than this (counted after
            the start character has been prepended) are kept.
        seed: random seed for sample shuffling. Note that this reseeds
            NumPy's global random state.
        start_char: The start of a sequence will be marked with this
            character. Set to 1 because 0 is usually the padding character.
        oov_char: words that were cut out because of the `num_words`
            or `skip_top` limit will be replaced with this character.
            When `None`, such words are dropped instead.
        index_from: index actual words with this index and higher.
        **kwargs: only the legacy alias `nb_words` is accepted.

    # Returns
        Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
        `x_train` / `x_test` are 1-D object arrays of Python lists when the
        sequences have differing lengths.

    # Raises
        TypeError: if an unknown keyword argument is passed.
        ValueError: in case `maxlen` is so low
            that no input sequence could be kept.

    Note that the 'out of vocabulary' character is only used for
    words that were present in the training set but are not included
    because they're not making the `num_words` cut here.
    Words that were not seen in the training set but are in the test set
    have simply been skipped.
    """
    # Legacy support
    if 'nb_words' in kwargs:
        warnings.warn('The `nb_words` argument in `load_data` '
                      'has been renamed `num_words`.')
        num_words = kwargs.pop('nb_words')
    if kwargs:
        raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))

    # The archive stores variable-length reviews as object arrays, which
    # NumPy >= 1.16.3 refuses to unpickle unless allow_pickle=True.
    # NOTE: unpickling executes arbitrary code - only load trusted files.
    with np.load(path, allow_pickle=True) as f:
        x_train, labels_train = f['x_train'], f['y_train']
        x_test, labels_test = f['x_test'], f['y_test']

    # Shuffle train and test independently but reproducibly.
    np.random.seed(seed)
    indices = np.arange(len(x_train))
    np.random.shuffle(indices)
    x_train = x_train[indices]
    labels_train = labels_train[indices]

    indices = np.arange(len(x_test))
    np.random.shuffle(indices)
    x_test = x_test[indices]
    labels_test = labels_test[indices]

    def _shift(seq):
        # Offset word indices to make room for the reserved ids and
        # optionally prepend the start-of-sequence marker.
        if start_char is not None:
            return [start_char] + [w + index_from for w in seq]
        if index_from:
            return [w + index_from for w in seq]
        return list(seq)

    def _pack(seqs):
        # np.array() on ragged nested lists raises on NumPy >= 1.24, so
        # ragged input is packed into an explicit 1-D object array.
        if len({len(s) for s in seqs}) <= 1:
            return np.array(seqs)
        packed = np.empty(len(seqs), dtype=object)
        for pos, s in enumerate(seqs):
            packed[pos] = s
        return packed

    train_seqs = [_shift(x) for x in x_train]
    test_seqs = [_shift(x) for x in x_test]

    if maxlen:
        # Filter each split on its own so the train/test boundary stays
        # correct even when training sequences are removed.
        train_mask = np.array([len(x) < maxlen for x in train_seqs],
                              dtype=bool)
        test_mask = np.array([len(x) < maxlen for x in test_seqs],
                             dtype=bool)
        train_seqs = [x for x, keep in zip(train_seqs, train_mask) if keep]
        test_seqs = [x for x, keep in zip(test_seqs, test_mask) if keep]
        labels_train = labels_train[train_mask]
        labels_test = labels_test[test_mask]
        if not train_seqs and not test_seqs:
            raise ValueError('After filtering for sequences shorter than maxlen=' +
                             str(maxlen) + ', no sequence was kept. '
                             'Increase maxlen.')

    if not num_words:
        num_words = max([max(x) for x in train_seqs + test_seqs])

    # by convention, use 2 as OOV word
    # reserve 'index_from' (=3 by default) characters:
    # 0 (padding), 1 (start), 2 (OOV)
    if oov_char is not None:
        def _limit(seq):
            return [w if (skip_top <= w < num_words) else oov_char
                    for w in seq]
    else:
        def _limit(seq):
            return [w for w in seq if skip_top <= w < num_words]

    train_seqs = [_limit(x) for x in train_seqs]
    test_seqs = [_limit(x) for x in test_seqs]

    x_train, y_train = _pack(train_seqs), np.array(labels_train)
    x_test, y_test = _pack(test_seqs), np.array(labels_test)
    return (x_train, y_train), (x_test, y_test)

def create_ngram_set(input_list, ngram_value=2):
    """
    Return the set of distinct n-grams found in a sequence of integers.

    Each n-gram is a tuple of `ngram_value` consecutive items of
    `input_list`; duplicates collapse because the result is a set.

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2) == {(4, 9), (4, 1), (1, 4), (9, 4)}
    True

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3) == {(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)}
    True
    """
    # One shifted view of the sequence per position inside the n-gram;
    # zipping them in lockstep yields every window of consecutive items.
    shifted_views = [input_list[offset:] for offset in range(ngram_value)]
    windows = zip(*shifted_views)
    return set(windows)

def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment the input list of list (sequences) by appending n-grams values.

    For every sequence, each n-gram (2 <= n <= `ngram_range`) of the
    ORIGINAL tokens that is a key of `token_indice` has its mapped id
    appended to a copy of the sequence. The input sequences are not
    modified, and ids appended by a lower-order pass are never themselves
    combined into higher-order n-grams.

    # Arguments
        sequences: iterable of integer sequences (lists, tuples or arrays).
        token_indice: dict mapping an n-gram tuple to its integer id.
        ngram_range: largest n-gram order to add; values below 2 add nothing.

    # Returns
        A new list of lists, one augmented list per input sequence.

    Example: adding bi-gram

    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]

    Example: adding tri-gram

    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
    """
    new_sequences = []
    for input_list in sequences:
        # list() rather than [:] so tuples / NumPy arrays are accepted too.
        tokens = list(input_list)
        new_list = list(tokens)
        for ngram_value in range(2, ngram_range + 1):
            # Slide over the original tokens only: scanning new_list would
            # also cover the n-gram ids appended by earlier passes.
            for i in range(len(tokens) - ngram_value + 1):
                ngram = tuple(tokens[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences

# Hyper-parameters of the fastText experiment.
# Raise ngram_range to 2 to add bi-gram features (3 for tri-grams, ...).
ngram_range = 1
max_features = 20000
maxlen = 400
batch_size = 32
embedding_dims = 50
epochs = 5

print('Loading data...')
# Train/test split of IMDB, restricted to the max_features most frequent words.
(x_train, y_train), (x_test, y_test) = load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Average train sequence length: {}'.format(
    np.mean([len(seq) for seq in x_train], dtype=int)))
print('Average test sequence length: {}'.format(
    np.mean([len(seq) for seq in x_test], dtype=int)))

if ngram_range > 1:
    print('Adding {}-gram features'.format(ngram_range))

    # Collect every distinct n-gram (2 <= n <= ngram_range) seen in training.
    ngram_set = set()
    for input_list in x_train:
        for i in range(2, ngram_range + 1):
            ngram_set.update(create_ngram_set(input_list, ngram_value=i))

    # Give each n-gram its own integer id, numbered from max_features + 1
    # upwards so that it cannot collide with a word index.
    start_index = max_features + 1
    token_indice = {ngram: position + start_index
                    for position, ngram in enumerate(ngram_set)}
    indice_token = {index: ngram for ngram, index in token_indice.items()}

    # The vocabulary size is now one more than the largest id handed out.
    max_features = np.max(list(indice_token)) + 1

    # Append the n-gram ids to both splits.
    x_train = add_ngram(x_train, token_indice, ngram_range)
    x_test = add_ngram(x_test, token_indice, ngram_range)
    print('Average train sequence length: {}'.format(
        np.mean([len(seq) for seq in x_train], dtype=int)))
    print('Average test sequence length: {}'.format(
        np.mean([len(seq) for seq in x_test], dtype=int)))

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = Sequential([
    # Map every vocabulary index to an embedding_dims-dimensional vector.
    Embedding(max_features,
              embedding_dims,
              input_length=maxlen),
    # Average the embeddings of all tokens in the document.
    GlobalAveragePooling1D(),
    # Project onto a single output unit squashed by a sigmoid.
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

效果：

Train on 25000 samples, validate on 25000 samples

Epoch 1/50

2018-06-06 15:50:28.133461: I tensorflow/core/platform/cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA

25000/25000 [==============================] - 9s 379us/step - loss: 0.6125 - acc: 0.7431 - val_loss: 0.5050 - val_acc: 0.8227

Epoch 2/50

25000/25000 [==============================] - 10s 402us/step - loss: 0.4059 - acc: 0.8633 - val_loss: 0.3738 - val_acc: 0.8646

Epoch 3/50

25000/25000 [==============================] - 11s 441us/step - loss: 0.3061 - acc: 0.8934 - val_loss: 0.3219 - val_acc: 0.8783

Epoch 4/50

25000/25000 [==============================] - 9s 375us/step - loss: 0.2550 - acc: 0.9110 - val_loss: 0.2970 - val_acc: 0.8853

Epoch 5/50

可以看到一个epoch只需要10秒左右，还是很快的！不过训练到50个epoch后，训练集acc达到100%，而验证集acc只有86%，看来是过拟合了。

再看看传统cnn：

'''This example demonstrates the use of Convolution1D for text classification.

Gets to 0.89 test accuracy after 2 epochs.

90s/epoch on Intel i5 2.4Ghz CPU.

10s/epoch on Tesla K40 GPU.

'''

from __future__ import print_function

from keras.preprocessing import sequence

from keras.models import Sequential

from keras.layers import Dense, Dropout, Activation

from keras.layers import Embedding

from keras.layers import Conv1D, GlobalMaxPooling1D

from keras.datasets import imdb

# set parameters:

max_features = 5000

import numpy as np

import json

import warnings

def load_data(path='imdb.npz', num_words=None, skip_top=0,
              maxlen=None, seed=113,
              start_char=1, oov_char=2, index_from=3, **kwargs):
    """Load the IMDB dataset from a local `.npz` archive.

    This is a stand-alone variant of `keras.datasets.imdb.load_data`:
    nothing is downloaded, so `path` must point to an existing copy of
    `imdb.npz` (published at
    https://s3.amazonaws.com/text-datasets/imdb.npz).

    # Arguments
        path: path of the local `.npz` file containing the arrays
            `x_train`, `y_train`, `x_test` and `y_test`.
        num_words: max number of words to include. Words are ranked
            by how often they occur (in the training set) and only
            the most frequent words are kept. When falsy, the largest
            index found in the data is used.
        skip_top: skip the top N most frequently occurring words
            (which may not be informative).
        maxlen: only sequences strictly shorter than this (counted after
            the start character has been prepended) are kept.
        seed: random seed for sample shuffling. Note that this reseeds
            NumPy's global random state.
        start_char: The start of a sequence will be marked with this
            character. Set to 1 because 0 is usually the padding character.
        oov_char: words that were cut out because of the `num_words`
            or `skip_top` limit will be replaced with this character.
            When `None`, such words are dropped instead.
        index_from: index actual words with this index and higher.
        **kwargs: only the legacy alias `nb_words` is accepted.

    # Returns
        Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
        `x_train` / `x_test` are 1-D object arrays of Python lists when the
        sequences have differing lengths.

    # Raises
        TypeError: if an unknown keyword argument is passed.
        ValueError: in case `maxlen` is so low
            that no input sequence could be kept.

    Note that the 'out of vocabulary' character is only used for
    words that were present in the training set but are not included
    because they're not making the `num_words` cut here.
    Words that were not seen in the training set but are in the test set
    have simply been skipped.
    """
    # Legacy support
    if 'nb_words' in kwargs:
        warnings.warn('The `nb_words` argument in `load_data` '
                      'has been renamed `num_words`.')
        num_words = kwargs.pop('nb_words')
    if kwargs:
        raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))

    # The archive stores variable-length reviews as object arrays, which
    # NumPy >= 1.16.3 refuses to unpickle unless allow_pickle=True.
    # NOTE: unpickling executes arbitrary code - only load trusted files.
    with np.load(path, allow_pickle=True) as f:
        x_train, labels_train = f['x_train'], f['y_train']
        x_test, labels_test = f['x_test'], f['y_test']

    # Shuffle train and test independently but reproducibly.
    np.random.seed(seed)
    indices = np.arange(len(x_train))
    np.random.shuffle(indices)
    x_train = x_train[indices]
    labels_train = labels_train[indices]

    indices = np.arange(len(x_test))
    np.random.shuffle(indices)
    x_test = x_test[indices]
    labels_test = labels_test[indices]

    def _shift(seq):
        # Offset word indices to make room for the reserved ids and
        # optionally prepend the start-of-sequence marker.
        if start_char is not None:
            return [start_char] + [w + index_from for w in seq]
        if index_from:
            return [w + index_from for w in seq]
        return list(seq)

    def _pack(seqs):
        # np.array() on ragged nested lists raises on NumPy >= 1.24, so
        # ragged input is packed into an explicit 1-D object array.
        if len({len(s) for s in seqs}) <= 1:
            return np.array(seqs)
        packed = np.empty(len(seqs), dtype=object)
        for pos, s in enumerate(seqs):
            packed[pos] = s
        return packed

    train_seqs = [_shift(x) for x in x_train]
    test_seqs = [_shift(x) for x in x_test]

    if maxlen:
        # Filter each split on its own so the train/test boundary stays
        # correct even when training sequences are removed.
        train_mask = np.array([len(x) < maxlen for x in train_seqs],
                              dtype=bool)
        test_mask = np.array([len(x) < maxlen for x in test_seqs],
                             dtype=bool)
        train_seqs = [x for x, keep in zip(train_seqs, train_mask) if keep]
        test_seqs = [x for x, keep in zip(test_seqs, test_mask) if keep]
        labels_train = labels_train[train_mask]
        labels_test = labels_test[test_mask]
        if not train_seqs and not test_seqs:
            raise ValueError('After filtering for sequences shorter than maxlen=' +
                             str(maxlen) + ', no sequence was kept. '
                             'Increase maxlen.')

    if not num_words:
        num_words = max([max(x) for x in train_seqs + test_seqs])

    # by convention, use 2 as OOV word
    # reserve 'index_from' (=3 by default) characters:
    # 0 (padding), 1 (start), 2 (OOV)
    if oov_char is not None:
        def _limit(seq):
            return [w if (skip_top <= w < num_words) else oov_char
                    for w in seq]
    else:
        def _limit(seq):
            return [w for w in seq if skip_top <= w < num_words]

    train_seqs = [_limit(x) for x in train_seqs]
    test_seqs = [_limit(x) for x in test_seqs]

    x_train, y_train = _pack(train_seqs), np.array(labels_train)
    x_test, y_test = _pack(test_seqs), np.array(labels_test)
    return (x_train, y_train), (x_test, y_test)

def create_ngram_set(input_list, ngram_value=2):
    """
    Return the set of distinct n-grams found in a sequence of integers.

    Each n-gram is a tuple of `ngram_value` consecutive items of
    `input_list`; duplicates collapse because the result is a set.

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2) == {(4, 9), (4, 1), (1, 4), (9, 4)}
    True

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3) == {(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)}
    True
    """
    # One shifted view of the sequence per position inside the n-gram;
    # zipping them in lockstep yields every window of consecutive items.
    shifted_views = [input_list[offset:] for offset in range(ngram_value)]
    windows = zip(*shifted_views)
    return set(windows)

def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment the input list of list (sequences) by appending n-grams values.

    For every sequence, each n-gram (2 <= n <= `ngram_range`) of the
    ORIGINAL tokens that is a key of `token_indice` has its mapped id
    appended to a copy of the sequence. The input sequences are not
    modified, and ids appended by a lower-order pass are never themselves
    combined into higher-order n-grams.

    # Arguments
        sequences: iterable of integer sequences (lists, tuples or arrays).
        token_indice: dict mapping an n-gram tuple to its integer id.
        ngram_range: largest n-gram order to add; values below 2 add nothing.

    # Returns
        A new list of lists, one augmented list per input sequence.

    Example: adding bi-gram

    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]

    Example: adding tri-gram

    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
    """
    new_sequences = []
    for input_list in sequences:
        # list() rather than [:] so tuples / NumPy arrays are accepted too.
        tokens = list(input_list)
        new_list = list(tokens)
        for ngram_value in range(2, ngram_range + 1):
            # Slide over the original tokens only: scanning new_list would
            # also cover the n-gram ids appended by earlier passes.
            for i in range(len(tokens) - ngram_value + 1):
                ngram = tuple(tokens[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences

# Remaining hyper-parameters of the CNN experiment
# (max_features is set further up in the file).
maxlen = 400
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 5

print('Loading data...')
(x_train, y_train), (x_test, y_test) = load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = Sequential([
    # Map every vocabulary index to an embedding_dims-dimensional vector.
    Embedding(max_features,
              embedding_dims,
              input_length=maxlen),
    Dropout(0.2),
    # 1-D convolution learning `filters` word-group filters, each spanning
    # kernel_size consecutive tokens.
    Conv1D(filters,
           kernel_size,
           padding='valid',
           activation='relu',
           strides=1),
    # Keep the strongest response of every filter over the sequence.
    GlobalMaxPooling1D(),
    # Vanilla hidden layer.
    Dense(hidden_dims),
    Dropout(0.2),
    Activation('relu'),
    # Project onto a single output unit squashed by a sigmoid.
    Dense(1),
    Activation('sigmoid'),
])

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

效果：

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
2018-06-06 16:10:34.733973: I tensorflow/core/platform/cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
25000/25000 [==============================] - 117s 5ms/step - loss: 0.4044 - acc: 0.8007 - val_loss: 0.3212 - val_acc: 0.8600
Epoch 2/5
25000/25000 [==============================] - 121s 5ms/step - loss: 0.2323 - acc: 0.9057 - val_loss: 0.2903 - val_acc: 0.8801
Epoch 3/5
25000/25000 [==============================] - 124s 5ms/step - loss: 0.1640 - acc: 0.9377 - val_loss: 0.2720 - val_acc: 0.8900
Epoch 4/5
25000/25000 [==============================] - 116s 5ms/step - loss: 0.1136 - acc: 0.9579 - val_loss: 0.3353 - val_acc: 0.8811
Epoch 5/5
25000/25000 [==============================] - 114s 5ms/step - loss: 0.0764 - acc: 0.9726 - val_loss: 0.3958 - val_acc: 0.8793

可以看出cnn的确要慢10倍：cnn每个epoch大约需要115～125秒，而fasttext每个epoch只需要10秒左右。

fasttext和cnn的比较，使用keras imdb看效果——cnn要慢10倍。的更多相关文章

Deep Learning模型之：CNN卷积神经网络（一）深度解析CNN
http://m.blog.csdn.net/blog/wu010555688/24487301 本文整理了网上几位大牛的博客,详细地讲解了CNN的基础结构与核心思想,欢迎交流. [1]Deep le ...
Keras（四）CNN 卷积神经网络 RNN 循环神经网络原理及实例
CNN 卷积神经网络卷积池化 https://www.cnblogs.com/peng8098/p/nlp_16.html 中有介绍以数据集MNIST构建一个卷积神经网路 from keras. ...
【Keras案例学习】 CNN做手写字符分类（mnist_cnn ）
from __future__ import print_function import numpy as np np.random.seed(1337) from keras.datasets im ...
Keras框架下使用CNN进行CIFAR-10的识别测试
有手册,然后代码不知道看一下:https://keras-cn.readthedocs.io/en/latest/ 首先是下载数据集,下载太慢了就从网盘上下载: 链接:https://pan.baid ...
[C1W1] Neural Networks and Deep Learning - Introduction to Deep Learning
第一周:深度学习引言(Introduction to Deep Learning) 欢迎(Welcome) 深度学习改变了传统互联网业务,例如如网络搜索和广告.但是深度学习同时也使得许多新产品和企业以 ...
keras入门（三）搭建CNN模型破解网站验证码
项目介绍在文章CNN大战验证码中,我们利用TensorFlow搭建了简单的CNN模型来破解某个网站的验证码.验证码如下: 在本文中,我们将会用Keras来搭建一个稍微复杂的CNN模型来破解以上的 ...
使用Keras进行深度学习：（二）CNN讲解及实践
欢迎大家关注我们的网站和系列教程:http://www.tensorflownews.com/,学习更多的机器学习.深度学习的知识! 现今最主流的处理图像数据的技术当属深度神经网络了,尤其是卷积神经网 ...
深度学习：Keras入门(二)之卷积神经网络(CNN)
说明:这篇文章需要有一些相关的基础知识,否则看起来可能比较吃力. 1.卷积与神经元 1.1 什么是卷积? 简单来说,卷积(或内积)就是一种先把对应位置相乘然后再把结果相加的运算.(具体含义或者数学公式 ...
深度学习：Keras入门(二)之卷积神经网络(CNN)【转】
本文转载自:https://www.cnblogs.com/lc1217/p/7324935.html 说明:这篇文章需要有一些相关的基础知识,否则看起来可能比较吃力. 1.卷积与神经元 1.1 什么 ...

随机推荐

人工智能-基于百度baidu-ai和图灵机器人实现学说话机器人
本文引用了2个js文件,这里提供下CDN资源,! <script type="application/javascript" src="https://cdn.bo ...
我的Android进阶之旅------>Android中如何高效率的进行简繁体转换
因为APP要做国际化适配,所以就需要顾及到香港和台湾都是使用繁体字,怎样快速便捷高效的把简体字转换成繁体字呢? 说实话我之前用的方法比较呆板,把每个需要转换的字符串进行在线翻译.今天突然发现word或 ...
nodejs get请求
const http = require('http'); http.get('http://192.168.1.6:8080/getDemo?msg=12', (res) => { const ...
make menuconfig 时出现 mixed implicit and normal rules: deprecated syntax
這是 make 的版本問題!不清楚為何要這樣限制? 將此行 config %config: scripts_basic outputmakefile FORCE改成 ...
20170520 BADI增强学习
一.要求:Tcode:FF_5 导入数据运行时,产生财务凭证之前修改某些字段值.Exmp:FEBRE-VWEZWBKPF-XBLNRFEBEP-CHECTBSEG-ZUONR there is a b ...
4.4 使用STM32控制MC20进行GPS帧数据解析
需要准备的硬件 MC20开发板 1个 https://item.taobao.com/item.htm?id=562661881042 GSM/GPRS天线 1根 https://item.taoba ...
Loadrunder常见问题汇总（持续更新）
1.LR 脚本为空的解决方法: 1)如果安装了IE以外的浏览器,并且IE不是默认浏览器,则无法生成录制脚本 2)如果录制脚本时IE不能打开,则需要将浏览器的IE工具高级选项中,将“启用第三方浏览器扩展 ...
PHP验证是否为图片格式文件
/** * 判断是否为图片格式(jpg/jpeg/gif/png)文件 * * @param string $filePath * @return bool|string */ function is ...
百度竞价推广URL通配符使用说明
{keywordid} 被替换为触发该创意的关键词ID(全局唯一ID,不是字面ID),当没有对应的keywordid时,替换为0. {creative} 被替换为所点击的创意ID(全局唯一ID). 2 ...
探究操作系统【TLCL】
ls – List directory contents file – Determine file type less – View file contents ls常用选项 ls -a 全部输出 ...

fasttext和cnn的比较，使用keras imdb看效果——cnn要慢10倍。

fasttext和cnn的比较，使用keras imdb看效果——cnn要慢10倍。的更多相关文章

随机推荐

热门专题