I. English Data Cleaning

Cleaning English text involves expanding contractions, removing non-letter symbols, normalizing abbreviations of proper nouns, stemming, and lemmatization.

1. The basic cleaning approach

Remove non-letter symbols and expand common contractions.

# coding=utf-8
import re

class rule:
    # Strip everything that is not a letter or an apostrophe, replacing it
    # with a space (drops digits, double quotes, periods, commas, ...).
    pat_letter = re.compile(r"[^a-zA-Z ']+")
    # Patterns for expanding common contractions.
    pat_is = re.compile(r"(it|he|she|that|this|there|here)('s)", re.I)
    pat_s = re.compile(r"([a-zA-Z])('s)")      # possessive, e.g. today's
    pat_not = re.compile(r"([a-zA-Z])(n't)")   # contraction of not
    pat_would = re.compile(r"([a-zA-Z])('d)")  # contraction of would
    pat_will = re.compile(r"([a-zA-Z])('ll)")  # contraction of will
    pat_am = re.compile(r"([Ii])('m)")         # contraction of am
    pat_are = re.compile(r"([a-zA-Z])('re)")   # contraction of are
    pat_ve = re.compile(r"([a-zA-Z])('ve)")    # contraction of have

def replace_abbreviations(text):
    new_text = text
    new_text = rule.pat_letter.sub(' ', new_text).strip().lower()
    new_text = rule.pat_is.sub(r"\1 is", new_text)  # \1 is the first captured group
    new_text = rule.pat_s.sub(r"\1 ", new_text)
    new_text = rule.pat_not.sub(r"\1 not", new_text)
    new_text = rule.pat_would.sub(r"\1 would", new_text)
    new_text = rule.pat_will.sub(r"\1 will", new_text)
    new_text = rule.pat_am.sub(r"\1 am", new_text)
    new_text = rule.pat_are.sub(r"\1 are", new_text)
    new_text = rule.pat_ve.sub(r"\1 have", new_text)
    new_text = new_text.replace("'", ' ')
    return new_text

if __name__ == '__main__':
    text = "there're many recen't 't extensions of this basic idea to include attention. 120,yes's it's"
    text = replace_abbreviations(text)
    print(text)  # there are many rece not t extensions of this basic idea to include attention yes it is

2. A more detailed approach

Besides expanding common contractions, this version also normalizes some proper nouns and handles punctuation.

import re

def clean_text(text):
    """
    Clean text.
    :param text: the input string
    :return: the cleaned string
    """
    # Expand contractions.
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"cannot", "can not ", text)
    text = re.sub(r"what\'s", "what is", text)
    text = re.sub(r"What\'s", "what is", text)
    text = re.sub(r"\'ve ", " have ", text)
    text = re.sub(r"n\'t", " not ", text)
    text = re.sub(r"i\'m", "i am ", text)
    text = re.sub(r"I\'m", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    # Normalize spelling variants and proper nouns.
    text = re.sub(r" e mail ", " email ", text)
    text = re.sub(r" e \- mail ", " email ", text)
    text = re.sub(r" e\-mail ", " email ", text)
    text = re.sub(r"ph\.d", "phd", text)
    text = re.sub(r"PhD", "phd", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" fb ", " facebook ", text)
    text = re.sub(r"facebooks", " facebook ", text)
    text = re.sub(r"facebooking", " facebook ", text)
    text = re.sub(r" usa ", " america ", text)
    text = re.sub(r" us ", " america ", text)
    text = re.sub(r" u s ", " america ", text)
    text = re.sub(r" U\.S\. ", " america ", text)
    text = re.sub(r" US ", " america ", text)
    text = re.sub(r" American ", " america ", text)
    text = re.sub(r" America ", " america ", text)
    text = re.sub(r" mbp ", " macbook-pro ", text)
    text = re.sub(r" mac ", " macbook ", text)
    text = re.sub(r"macbook pro", "macbook-pro", text)
    text = re.sub(r"macbook-pros", "macbook-pro", text)
    # Spell out standalone single digits.
    text = re.sub(r" 1 ", " one ", text)
    text = re.sub(r" 2 ", " two ", text)
    text = re.sub(r" 3 ", " three ", text)
    text = re.sub(r" 4 ", " four ", text)
    text = re.sub(r" 5 ", " five ", text)
    text = re.sub(r" 6 ", " six ", text)
    text = re.sub(r" 7 ", " seven ", text)
    text = re.sub(r" 8 ", " eight ", text)
    text = re.sub(r" 9 ", " nine ", text)
    text = re.sub(r"googling", " google ", text)
    text = re.sub(r"googled", " google ", text)
    text = re.sub(r"googleable", " google ", text)
    text = re.sub(r"googles", " google ", text)
    text = re.sub(r"dollars", " dollar ", text)
    # Pad punctuation with spaces so it tokenizes separately.
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"-", " - ", text)
    text = re.sub(r"/", " / ", text)
    text = re.sub(r"\\", " \\ ", text)
    text = re.sub(r"=", " = ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r"\.", " . ", text)
    text = re.sub(r",", " , ", text)
    text = re.sub(r"\?", " ? ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\"", " \" ", text)
    text = re.sub(r"&", " & ", text)
    text = re.sub(r"\|", " | ", text)
    text = re.sub(r";", " ; ", text)
    text = re.sub(r"\(", " ( ", text)
    text = re.sub(r"\)", " ) ", text)
    # Replace symbols with words.
    text = re.sub(r"&", " and ", text)
    text = re.sub(r"\|", " or ", text)
    text = re.sub(r"=", " equal ", text)
    text = re.sub(r"\+", " plus ", text)
    text = re.sub(r"\$", " dollar ", text)
    # Collapse repeated whitespace.
    text = ' '.join(text.split())
    return text

if __name__ == '__main__':
    text = "there're many recen't 't extensions of this basic idea to include attention. 120,yes's it's"
    text = clean_text(text)
    print(text)  # there are many rece not t extensions of this basic idea to include attention . 120 , yes s it s
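
One ordering detail matters here: the contraction rules run before the punctuation rules. If apostrophes were padded or stripped first, patterns such as n't would no longer match anything. A minimal sketch of the failure mode (the two helper functions below are illustrative, not part of the pipeline above):

import re

def abbrev_then_punct(text):
    # Correct order: expand "n't" first, then strip the remaining apostrophes.
    text = re.sub(r"n\'t", " not ", text)
    text = re.sub(r"'", " ", text)
    return ' '.join(text.split())

def punct_then_abbrev(text):
    # Reversed order: the apostrophe is removed first,
    # so the "n't" rule no longer finds anything to expand.
    text = re.sub(r"'", " ", text)
    text = re.sub(r"n\'t", " not ", text)
    return ' '.join(text.split())

print(abbrev_then_punct("it isn't fair"))  # it is not fair
print(punct_then_abbrev("it isn't fair"))  # it isn t fair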

3. A version that also reduces words to their base forms

Remove symbols, expand contractions, and reduce words to their lemmas.

# coding=utf-8
import re
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

class rule:
    # Strip everything that is not a letter or an apostrophe, replacing it
    # with a space (drops digits, double quotes, periods, commas, ...).
    pat_letter = re.compile(r"[^a-zA-Z ']+")
    # Patterns for expanding common contractions.
    pat_is = re.compile(r"(it|he|she|that|this|there|here)('s)", re.I)
    pat_s = re.compile(r"([a-zA-Z])('s)")      # possessive, e.g. today's
    pat_not = re.compile(r"([a-zA-Z])(n't)")   # contraction of not
    pat_would = re.compile(r"([a-zA-Z])('d)")  # contraction of would
    pat_will = re.compile(r"([a-zA-Z])('ll)")  # contraction of will
    pat_am = re.compile(r"([Ii])('m)")         # contraction of am
    pat_are = re.compile(r"([a-zA-Z])('re)")   # contraction of are
    pat_ve = re.compile(r"([a-zA-Z])('ve)")    # contraction of have

def replace_abbreviations(text):
    new_text = text
    new_text = rule.pat_letter.sub(' ', new_text).strip().lower()
    new_text = rule.pat_is.sub(r"\1 is", new_text)  # \1 is the first captured group
    new_text = rule.pat_s.sub(r"\1 ", new_text)
    new_text = rule.pat_not.sub(r"\1 not", new_text)
    new_text = rule.pat_would.sub(r"\1 would", new_text)
    new_text = rule.pat_will.sub(r"\1 will", new_text)
    new_text = rule.pat_am.sub(r"\1 am", new_text)
    new_text = rule.pat_are.sub(r"\1 are", new_text)
    new_text = rule.pat_ve.sub(r"\1 have", new_text)
    new_text = new_text.replace("'", ' ')
    return new_text

# Map a Penn Treebank tag to the corresponding WordNet POS constant.
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return nltk.corpus.wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return nltk.corpus.wordnet.VERB
    elif treebank_tag.startswith('N'):
        return nltk.corpus.wordnet.NOUN
    elif treebank_tag.startswith('R'):  # adverb
        return nltk.corpus.wordnet.ADV
    else:
        return ''

def merge(words):
    lmtzr = WordNetLemmatizer()
    new_words = ''
    words = nltk.pos_tag(word_tokenize(words))  # tags look like [('bigger', 'JJR')]
    for word in words:
        pos = get_wordnet_pos(word[1])
        if pos:
            # lemmatize() reduces the word to its base form for the given POS.
            word = lmtzr.lemmatize(word[0], pos)
            new_words += ' ' + word
        else:
            new_words += ' ' + word[0]
    return new_words

def clear_data(text):
    text = replace_abbreviations(text)
    text = merge(text)
    text = text.strip()
    return text

if __name__ == '__main__':
    text = "there're many recen't 't extensions of this basic had idea to include attention. 120,had"
    text = clear_data(text)
    print(text)  # there be many rece not t extension of this basic have idea to include attention have
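
The merge() step above performs lemmatization: a dictionary-based reduction to a base form ('had' → 'have', 'extensions' → 'extension'). NLTK also offers rule-based stemming, which simply chops suffixes and may produce non-words. A minimal sketch contrasting the two (it assumes the WordNet data for NLTK has been downloaded):

# coding=utf-8
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# One-time setup if the WordNet data is missing:
# import nltk; nltk.download('wordnet')

stemmer = PorterStemmer()
lmtzr = WordNetLemmatizer()

# Stemming strips suffixes by rule; the output need not be a real word.
print([stemmer.stem(w) for w in ['extensions', 'studies', 'had', 'better']])
# ['extens', 'studi', 'had', 'better']

# Lemmatization looks words up in WordNet, guided by a POS tag.
print([lmtzr.lemmatize(w, pos) for w, pos in
       [('extensions', 'n'), ('studies', 'n'), ('had', 'v'), ('better', 'a')]])
# ['extension', 'study', 'have', 'good']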

II. Chinese Data Cleaning

The main step is removing stop words. Stop words are high-frequency pronouns, conjunctions, prepositions, and similar words that carry no signal for text classification. The usual practice is to maintain a stop-word list and delete any word on that list during feature extraction; in essence, this is a form of feature selection. HanLP's stop-word list is a good reference: https://github.com/hankcs/HanLP
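
A minimal filtering sketch with jieba (the stopwords.txt path is a placeholder; any one-word-per-line list, such as HanLP's, works):

# coding=utf-8
import jieba

# Load a one-word-per-line stop-word list.
# 'stopwords.txt' is a placeholder path; substitute your own list.
with open('stopwords.txt', encoding='utf-8') as f:
    stopwords = set(line.strip() for line in f)

def remove_stopwords(text):
    # Segment with jieba, then drop stop words and empty tokens.
    return [w for w in jieba.cut(text) if w.strip() and w not in stopwords]

if __name__ == '__main__':
    print(remove_stopwords('我们在这里讨论的是文本分类中的数据清洗'))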
