使用 NLTK 对文本进行清洗，索引工具

# Characters that survive cleaning; the trailing space is deliberately included.
EN_WHITELIST = '0123456789abcdefghijklmnopqrstuvwxyz '
# Punctuation set; not referenced by any function visible in this file.
EN_BLACKLIST = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\''

# Input corpus read by process_data().
FILENAME = 'data/chat.txt'

# Inclusive word-count bounds for questions (q) and answers (a).
limit = {'maxq': 20, 'minq': 0, 'maxa': 20, 'mina': 3}

# Token substituted for out-of-vocabulary words.
UNK = 'unk'
# Number of most frequent words kept in the vocabulary.
VOCAB_SIZE = 6000

import random

import sys

import nltk

import itertools

from collections import defaultdict

import numpy as np

import pickle

def ddefault():
    """Return the constant 1 (usable as a zero-argument default factory)."""
    default_value = 1
    return default_value

def read_lines(filename):
    """Read a text file and return its lines without newline characters.

    Args:
        filename: path of the text file to read.

    Returns:
        list of str, one entry per line. The newline that terminates the
        file does not produce an extra empty entry, and a final line that
        lacks a terminating newline is kept rather than dropped.
    """
    # The context manager guarantees the handle is closed, even on error.
    with open(filename) as handle:
        lines = handle.read().split('\n')
    # split() leaves a trailing '' when the text ends with '\n' (or is empty);
    # remove only that artefact instead of blindly discarding the last item.
    if lines and lines[-1] == '':
        lines.pop()
    return lines

def split_line(line):
    """Break one line into its sentences, using '.' as the separator.

    Returns:
        list of str: the pieces between the periods (may contain empty strings).
    """
    sentences = line.split('.')
    return sentences

def filter_line(line, whitelist):
    """Strip every character of `line` that does not occur in `whitelist`.

    Returns:
        str: the surviving characters, in their original order.
    """
    kept_chars = filter(lambda ch: ch in whitelist, line)
    return ''.join(kept_chars)

def index_(tokenized_sentences, vocab_size):
    """Build the vocabulary lookup tables from tokenized sentences.

    Args:
        tokenized_sentences: iterable of token lists.
        vocab_size: number of most frequent tokens to keep.

    Returns:
        tuple (index2word, word2index, freq_dist):
            index2word - list starting with '_' and UNK, followed by the
                         `vocab_size` most common tokens;
            word2index - dict mapping each entry of index2word to its position;
            freq_dist  - the nltk.FreqDist computed over all tokens.
    """
    # Flatten all sentences into one token stream and count frequencies.
    all_tokens = itertools.chain.from_iterable(tokenized_sentences)
    freq_dist = nltk.FreqDist(all_tokens)

    # Positions 0 and 1 are reserved for '_' and the unknown-word token.
    index2word = ['_', UNK]
    index2word.extend(word for word, _count in freq_dist.most_common(vocab_size))

    word2index = {word: position for position, word in enumerate(index2word)}
    return index2word, word2index, freq_dist

def filter_data(sequences, limits=None):
    """Keep only the question/answer pairs whose word counts are within bounds.

    `sequences` is read as alternating lines: question, answer, question,
    answer, ... Word counts are obtained by splitting on a single space.

    Args:
        sequences: list of str with questions at even positions and their
            answers at the following odd positions. A trailing line without
            a partner is ignored.
        limits: optional dict with the inclusive bounds 'minq', 'maxq',
            'mina' and 'maxa'. Defaults to the module-level `limit`.

    Returns:
        tuple (filtered_q, filtered_a): two lists of equal length holding
        the accepted questions and their answers.

    Side effects:
        Prints the percentage of pairs that were discarded.
    """
    if limits is None:
        limits = limit

    # Pair each even-indexed line with its successor; zip() stops at the
    # shorter slice, so an unpaired final line cannot cause an IndexError.
    pairs = list(zip(sequences[0::2], sequences[1::2]))

    filtered_q, filtered_a = [], []
    for question, answer in pairs:
        qlen = len(question.split(' '))
        alen = len(answer.split(' '))
        if (limits['minq'] <= qlen <= limits['maxq']
                and limits['mina'] <= alen <= limits['maxa']):
            filtered_q.append(question)
            filtered_a.append(answer)

    # Report the fraction of the original data that was filtered out;
    # guard against division by zero when there are no pairs at all.
    raw_data_len = len(pairs)
    if raw_data_len:
        filtered = int((raw_data_len - len(filtered_q)) * 100 / raw_data_len)
    else:
        filtered = 0
    print(str(filtered) + '% filtered from original data')

    return filtered_q, filtered_a

def zero_pad(qtokenized, atokenized, w2idx):
    """Turn tokenized questions and answers into zero-padded index matrices.

    Args:
        qtokenized: list of token lists (questions).
        atokenized: list of token lists (answers), parallel to `qtokenized`.
        w2idx: dict mapping a word to its integer index.

    Returns:
        tuple (idx_q, idx_a): int32 numpy arrays of shape
        (len(qtokenized), limit['maxq']) and (len(qtokenized), limit['maxa']).
    """
    n_rows = len(qtokenized)
    max_q, max_a = limit['maxq'], limit['maxa']

    # Pre-allocate the matrices; unused cells stay at 0.
    idx_q = np.zeros([n_rows, max_q], dtype=np.int32)
    idx_a = np.zeros([n_rows, max_a], dtype=np.int32)

    for row in range(n_rows):
        idx_q[row] = np.array(pad_seq(qtokenized[row], w2idx, max_q))
        idx_a[row] = np.array(pad_seq(atokenized[row], w2idx, max_a))

    return idx_q, idx_a

def pad_seq(seq, lookup, maxlen):
    """Map the words of `seq` to indices and right-pad the result with zeros.

    Words missing from `lookup` are replaced by the index of the UNK token.

    Args:
        seq: sequence of words.
        lookup: dict mapping a word to its integer index.
        maxlen: target length; sequences already at least this long receive
            no padding and are returned at their full length.

    Returns:
        list of int indices.
    """
    indices = [lookup[word] if word in lookup else lookup[UNK] for word in seq]
    padding = [0] * (maxlen - len(seq))
    return indices + padding

def process_data():
    """Run the full preprocessing pipeline and write its results to disk.

    Steps: read FILENAME, lower-case and whitelist-filter every line, keep
    the question/answer pairs within the length limits, split them into
    words, build the vocabulary and convert the pairs to zero-padded index
    arrays.

    Side effects:
        Prints progress and sample output, and writes 'idx_q.npy',
        'idx_a.npy' and 'metadata.pkl' to the current working directory.
    """
    # Positions of the pairs shown as samples; skipped when the filtered
    # data set is too small to contain them (avoids an IndexError).
    sample_positions = (60, 61)

    print('\n>> Read lines from file')
    lines = read_lines(filename=FILENAME)

    # change to lower case (just for en)
    lines = [line.lower() for line in lines]

    print('\n:: Sample from read(p) lines')
    print(lines[121:125])

    # filter out unnecessary characters
    print('\n>> Filter lines')
    lines = [filter_line(line, EN_WHITELIST) for line in lines]
    print(lines[121:125])

    # filter out too long or too short sequences
    print('\n>> 2nd layer of filtering')
    qlines, alines = filter_data(lines)
    for pos in sample_positions:
        if pos < len(qlines):
            print('\nq : {0} ; a : {1}'.format(qlines[pos], alines[pos]))

    # convert list of [lines of text] into list of [list of words]
    print('\n>> Segment lines into words')
    qtokenized = [wordlist.split(' ') for wordlist in qlines]
    atokenized = [wordlist.split(' ') for wordlist in alines]

    print('\n:: Sample from segmented list of words')
    for pos in sample_positions:
        if pos < len(qtokenized):
            print('\nq : {0} ; a : {1}'.format(qtokenized[pos], atokenized[pos]))

    # build the vocabulary over questions and answers together
    print('\n >> Index words')
    idx2w, w2idx, freq_dist = index_(qtokenized + atokenized, vocab_size=VOCAB_SIZE)

    print('\n >> Zero Padding')
    idx_q, idx_a = zero_pad(qtokenized, atokenized, w2idx)

    print('\n >> Save numpy arrays to disk')
    np.save('idx_q.npy', idx_q)
    np.save('idx_a.npy', idx_a)

    # data control dictionaries needed to interpret the arrays later
    metadata = {
        'w2idx': w2idx,
        'idx2w': idx2w,
        'limit': limit,
        'freq_dist': freq_dist,
    }

    with open('metadata.pkl', 'wb') as f:
        pickle.dump(metadata, f)

def load_data(PATH=''):
    """Load the artefacts written by the preprocessing step.

    Args:
        PATH: string prepended verbatim to each file name, so a directory
            must include its trailing separator (e.g. 'data/').

    Returns:
        tuple (metadata, idx_q, idx_a): the unpickled metadata object and
        the two numpy arrays read from 'idx_q.npy' and 'idx_a.npy'.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # metadata files that come from a trusted source.
    with open(PATH + 'metadata.pkl', 'rb') as f:
        metadata = pickle.load(f)

    # Bind the arrays to the same names that are returned below; the
    # previous mismatch made every call fail with a NameError.
    idx_q = np.load(PATH + 'idx_q.npy')
    idx_a = np.load(PATH + 'idx_a.npy')

    return metadata, idx_q, idx_a

# Run the preprocessing pipeline only when executed as a script, not on import.
if __name__ == '__main__':

    process_data()

使用 NLTK 对文本进行清洗，索引工具的更多相关文章

【NLP】Python NLTK获取文本语料和词汇资源
Python NLTK 获取文本语料和词汇资源作者:白宁超 2016年11月7日13:15:24 摘要:NLTK是由宾夕法尼亚大学计算机和信息科学使用python语言实现的一种自然语言工具包,其收集 ...
bash文本查看及处理工具
文本查看及处理工具: wc [OPTION] FILE... -c: 字节数 -l:行数 -w: 单词数 who | w ...
js实现去文本换行符小工具
js实现去文本换行符小工具一.总结一句话总结: 1.vertical属性使用的时候注意看清定义,也注意父元素的基准线问题.vertical-align:top; 2.获取textareaEleme ...
基于COCA词频表的文本词汇分布测试工具v0.1
美国语言协会对美国人日常使用的英语单词做了一份详细的统计,按照日常使用的频率做成了一张表,称为COCA词频表.排名越低的单词使用频率越高,该表可以用来统计词汇量. 如果你的词汇量约为6000,那么这张 ...
MySQL检查重复索引工具-pt-duplicate-key-checker
在MySQL中是允许在同一个列上创建多个索引的,示例如下: mysql --socket=/tmp/mysql5173.sock -uroot -p mysql> SELECT VERSION( ...
Linux Shell处理文本最常用的工具大盘点
导读本文将介绍Linux下使用Shell处理文本时最常用的工具:find.grep.xargs.sort.uniq.tr.cut.paste.wc.sed.awk:提供的例子和参数都是最常用和最为实 ...
NLTK和Stanford NLP两个工具的安装配置
这里安装的是两个自然语言处理工具,NLTK和Stanford NLP. 声明:笔者操作系统是Windows10,理论上Windows都可以: 版本号:NLTK 3.2 Stanford NLP 3.6 ...
谈谈开发文本转URL小工具的思路
URL提供了一种定位互联网上任意资源的手段,由于采用HTTP协议的URL能在互联网上自由传播和使用,所以能大行其道.在软件开发.测试甚至部署的环节,URL几乎可以说无处不再,其中用来定位文本的URL数 ...
nltk处理文本
nltk(Natural Language Toolkit)是处理文本的利器. 安装 pip install nltk 进入python命令行,键入nltk.download()可以下载nltk需要的 ...

随机推荐

自定义 ---UICollectionViewLayout-正N边形居中布局
1. 自定义UICollectionLayout ---- 正三角形居中布局支持多个图形的自动布局 2. 自定义UICollectionLayout ---- 正方形居中布局滚动展示的区域 3. ...
关于图数据库HugeGraph的百万，千万，亿量级测试
1.Hugegraph测试硬件 1.1.本机硬件本机测试hugeGraph版本:0.10.4 后置存储数据库:rocksdb,1TB的普通硬盘 1.2.测试服务器硬件测试服务器hugegraph版 ...
Slog64_项目上线之ArthurSlog个人网站上线3
ArthurSlog SLog-64 Year·1 Guangzhou·China September 9th 2018 ArthurSlog Page GitHub NPM Package Page ...
JZOJ 1154. 【GDOI2003】购物
1154. [GDOI2003]购物 (Standard IO) Time Limits: 1000 ms Memory Limits: 65536 KB Description GDOI商场推出优惠 ...
Pycharm IDE安装及注册激活笔记（1）
一.Windows 下的安装及激活. 1.首先去Pycharm官网,或者直接输入网址:http://www.jetbrains.com/pycharm/download/#section=window ...
关于在layui中的table checkbox 默认选中设置
一.layui版本 layui-v2.4.5 二.设置table的checkbox默认选中总共有两种方法: 方法1:在返回的json中设置LAY_CHECKED为true,页面上的checkbox就 ...
SQL数据库中的增删改查总结1
一.增:有2种方法 1.使用insert插入单行数据: 语法:insert [into]<表名> [列名] values <列值> 例:insert into Strdents ...
Sublime text 3 运行python3
要在Sublime text3编译器中成功运行 python3,需要在编译器设置中将python3添加至编译器中新建编译系统编辑弹出的文件,添加如下内容: { "cmd":[& ...
scrapy 在爬取过程中抓取下载图片
先说前提,我不推荐在scrapy爬取过程中使用scrapy自带的 ImagesPipeline 进行下载,实在是太耗时间了最好是保存,再使用其他方法下载我这个是在 https://blog.csd ...

使用 NLTK 对文本进行清洗，索引工具

使用 NLTK 对文本进行清洗，索引工具的更多相关文章

随机推荐

热门专题