一、英文数据清洗

英文数据清洗包括:还原缩写、去除非字母符号、处理专有名词的缩写,以及提取词干和词根。

1.常规的清洗方式

去除非字母符号,并还原常用缩写。

#coding=utf-8
import jieba
import unicodedata
import sys,re,collections,nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
class rule:
    """Compiled regular expressions used by ``replace_abbreviations``."""

    # Every run of characters that is not an ASCII letter, a space or an
    # apostrophe is replaced by one space.  The apostrophe is kept so that
    # the contraction patterns below can still see it.
    pat_letter = re.compile(r"[^a-zA-Z ']+")
    # Irregular negatives.  The generic "n't" rule below consumes the "n",
    # which would turn "can't" into "ca not" and "won't" into "wo not".
    pat_cant = re.compile(r"\bcan't", re.I)
    pat_wont = re.compile(r"\bwon't", re.I)
    pat_shant = re.compile(r"\bshan't", re.I)
    # Expand common contractions.
    # No word boundary here: the pattern also fires on words that merely end
    # in one of the alternatives (e.g. "where's" -> "where is").
    pat_is = re.compile(r"(it|he|she|that|this|there|here)('s)", re.I)
    pat_s = re.compile(r"([a-zA-Z])('s)")  # remaining 's, e.g. "today's"
    pat_not = re.compile(r"([a-zA-Z])(n't)")  # "not"
    pat_would = re.compile(r"([a-zA-Z])('d)")  # "would"
    pat_will = re.compile(r"([a-zA-Z])('ll)")  # "will"
    pat_am = re.compile(r"([Ii])('m)")  # "am"
    pat_are = re.compile(r"([a-zA-Z])('re)")  # "are"
    pat_ve = re.compile(r"([a-zA-Z])('ve)")  # "have"


def replace_abbreviations(text):
    """Lower-case ``text``, drop non-letters and expand English contractions.

    :param text: the raw English string
    :return: the cleaned string; words are separated by single spaces and
        there is no leading or trailing whitespace
    """
    new_text = rule.pat_letter.sub(' ', text).strip().lower()
    # Irregular negatives must run before the generic "n't" rule.
    new_text = rule.pat_cant.sub("can not", new_text)
    new_text = rule.pat_wont.sub("will not", new_text)
    new_text = rule.pat_shant.sub("shall not", new_text)
    # \1 is the first captured group (the text before the apostrophe).
    new_text = rule.pat_is.sub(r"\1 is", new_text)
    # Any 's not expanded above is simply dropped.
    new_text = rule.pat_s.sub(r"\1 ", new_text)
    new_text = rule.pat_not.sub(r"\1 not", new_text)
    new_text = rule.pat_would.sub(r"\1 would", new_text)
    new_text = rule.pat_will.sub(r"\1 will", new_text)
    new_text = rule.pat_am.sub(r"\1 am", new_text)
    new_text = rule.pat_are.sub(r"\1 are", new_text)
    new_text = rule.pat_ve.sub(r"\1 have", new_text)
    # Apostrophes that matched no contraction become spaces.
    new_text = new_text.replace("'", ' ')
    # The substitutions above leave runs of spaces behind; collapse them.
    return ' '.join(new_text.split())


if __name__ == '__main__':
    text = 'there\'re many recen\'t \'t extensions of this basic idea to include attention. 120,yes\'s it\'s'
    text = replace_abbreviations(text)
    print(text)  # there are many rece not t extensions of this basic idea to include attention yes it is

2.详细的处理方式

除了还原普通的缩写之外,还引入了对一些专有名词和标点符号的处理。

import re
# Ordered (pattern, replacement) rules applied by ``clean_text``.  The order
# matters: specific contractions come before the generic "n't" rule, and
# punctuation is padded with spaces before symbols are turned into words.
_CLEAN_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # acronym
        (r"can't", "can not"),
        (r"cannot", "can not "),
        # Must precede the generic "n't" rule, which would give "wo not".
        (r"[Ww]on't", "will not"),
        (r"what's", "what is"),
        (r"What's", "what is"),
        (r"'ve ", " have "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"I'm", "i am "),
        (r"'re", " are "),
        (r"'d", " would "),
        (r"'ll", " will "),
        (r" e mail ", " email "),
        (r" e \- mail ", " email "),
        (r" e\-mail ", " email "),
        # spelling correction
        (r"ph\.d", "phd"),
        (r"PhD", "phd"),
        (r" e g ", " eg "),
        (r" fb ", " facebook "),
        (r"facebooks", " facebook "),
        (r"facebooking", " facebook "),
        (r" usa ", " america "),
        (r" us ", " america "),
        (r" u s ", " america "),
        (r" U\.S\. ", " america "),
        (r" US ", " america "),
        (r" American ", " america "),
        (r" America ", " america "),
        (r" mbp ", " macbook-pro "),
        (r" mac ", " macbook "),
        (r"macbook pro", "macbook-pro"),
        (r"macbook-pros", "macbook-pro"),
        (r" 1 ", " one "),
        (r" 2 ", " two "),
        (r" 3 ", " three "),
        (r" 4 ", " four "),
        (r" 5 ", " five "),
        (r" 6 ", " six "),
        (r" 7 ", " seven "),
        (r" 8 ", " eight "),
        (r" 9 ", " nine "),
        (r"googling", " google "),
        (r"googled", " google "),
        (r"googleable", " google "),
        (r"googles", " google "),
        (r"dollars", " dollar "),
        # punctuation: surround each mark with spaces
        (r"\+", " + "),
        (r"'", " "),
        (r"-", " - "),
        (r"/", " / "),
        # In a replacement template "\\" stands for one literal backslash.
        (r"\\", r" \\ "),
        (r"=", " = "),
        (r"\^", " ^ "),
        (r":", " : "),
        (r"\.", " . "),
        (r",", " , "),
        (r"\?", " ? "),
        (r"!", " ! "),
        (r"\"", " \" "),
        (r"&", " & "),
        (r"\|", " | "),
        (r";", " ; "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        # symbol replacement
        (r"&", " and "),
        (r"\|", " or "),
        (r"=", " equal "),
        (r"\+", " plus "),
        (r"\$", " dollar "),
    )
)


def clean_text(text):
    """
    Clean text

    Expands contractions, normalises a few proper nouns and spellings,
    pads punctuation with spaces, spells out some symbols and finally
    collapses whitespace.  Letter case is not changed.

    :param text: the string of text
    :return: text string after cleaning
    """
    for pattern, replacement in _CLEAN_RULES:
        text = pattern.sub(replacement, text)
    # remove extra space
    return ' '.join(text.split())


if __name__ == '__main__':
    text = 'there\'re many recen\'t \'t extensions of this basic idea to include attention. 120,yes\'s it\'s'
    text = clean_text(text)
    print(text)  # there are many rece not t extensions of this basic idea to include attention . 120 , yes s it s

3.包括有处理词根词缀的处理方式

去除符号、还原缩写、获取词根。

#coding=utf-8
import jieba
import unicodedata
import sys,re,collections,nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
class rule:
    """Compiled regular expressions used by ``replace_abbreviations``."""

    # Every run of characters that is not an ASCII letter, a space or an
    # apostrophe is replaced by one space.  The apostrophe is kept so that
    # the contraction patterns below can still see it.
    pat_letter = re.compile(r"[^a-zA-Z ']+")
    # Irregular negatives.  The generic "n't" rule below consumes the "n",
    # which would turn "can't" into "ca not" and "won't" into "wo not".
    pat_cant = re.compile(r"\bcan't", re.I)
    pat_wont = re.compile(r"\bwon't", re.I)
    pat_shant = re.compile(r"\bshan't", re.I)
    # Expand common contractions.
    # No word boundary here: the pattern also fires on words that merely end
    # in one of the alternatives (e.g. "where's" -> "where is").
    pat_is = re.compile(r"(it|he|she|that|this|there|here)('s)", re.I)
    pat_s = re.compile(r"([a-zA-Z])('s)")  # remaining 's, e.g. "today's"
    pat_not = re.compile(r"([a-zA-Z])(n't)")  # "not"
    pat_would = re.compile(r"([a-zA-Z])('d)")  # "would"
    pat_will = re.compile(r"([a-zA-Z])('ll)")  # "will"
    pat_am = re.compile(r"([Ii])('m)")  # "am"
    pat_are = re.compile(r"([a-zA-Z])('re)")  # "are"
    pat_ve = re.compile(r"([a-zA-Z])('ve)")  # "have"


def replace_abbreviations(text):
    """Lower-case ``text``, drop non-letters and expand English contractions.

    :param text: the raw English string
    :return: the cleaned string; words are separated by single spaces and
        there is no leading or trailing whitespace
    """
    new_text = rule.pat_letter.sub(' ', text).strip().lower()
    # Irregular negatives must run before the generic "n't" rule.
    new_text = rule.pat_cant.sub("can not", new_text)
    new_text = rule.pat_wont.sub("will not", new_text)
    new_text = rule.pat_shant.sub("shall not", new_text)
    # \1 is the first captured group (the text before the apostrophe).
    new_text = rule.pat_is.sub(r"\1 is", new_text)
    # Any 's not expanded above is simply dropped.
    new_text = rule.pat_s.sub(r"\1 ", new_text)
    new_text = rule.pat_not.sub(r"\1 not", new_text)
    new_text = rule.pat_would.sub(r"\1 would", new_text)
    new_text = rule.pat_will.sub(r"\1 will", new_text)
    new_text = rule.pat_am.sub(r"\1 am", new_text)
    new_text = rule.pat_are.sub(r"\1 are", new_text)
    new_text = rule.pat_ve.sub(r"\1 have", new_text)
    # Apostrophes that matched no contraction become spaces.
    new_text = new_text.replace("'", ' ')
    # The substitutions above leave runs of spaces behind; collapse them.
    return ' '.join(new_text.split())


# A Treebank tag and a WordNet POS are closely related; the POS is derived from the tag.
def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to the matching WordNet POS constant.

    :param treebank_tag: tag string such as ``'JJR'`` or ``'VBD'``
    :return: one of ``nltk.corpus.wordnet``'s ``ADJ``, ``VERB``, ``NOUN`` or
        ``ADV``, chosen by the tag's first letter; ``''`` for any other tag
    """
    if treebank_tag.startswith('J'):
        return nltk.corpus.wordnet.ADJ
    if treebank_tag.startswith('V'):
        return nltk.corpus.wordnet.VERB
    if treebank_tag.startswith('N'):
        return nltk.corpus.wordnet.NOUN
    if treebank_tag.startswith('R'):  # adverbs
        return nltk.corpus.wordnet.ADV
    return ''


def merge(words):
    """Lemmatize every token of the string ``words``.

    :param words: a string of text; it is tokenized with ``word_tokenize``
    :return: the tokens joined into one string, each preceded by a single
        space (so a non-empty result starts with a space)
    """
    lemmatizer = WordNetLemmatizer()
    # pos_tag yields (token, tag) pairs, e.g. [('bigger', 'JJR')]
    tagged = nltk.pos_tag(word_tokenize(words))
    pieces = []
    for token, tag in tagged:
        pos = get_wordnet_pos(tag)
        # Tokens whose tag has no WordNet POS are passed through unchanged;
        # the others are reduced to their lemma for that POS.
        lemma = lemmatizer.lemmatize(token, pos) if pos else token
        pieces.append(' ' + lemma)
    return ''.join(pieces)


def clear_data(text):
    """Expand contractions in ``text``, lemmatize it and trim whitespace.

    :param text: the raw English string
    :return: the cleaned, lemmatized string
    """
    text = replace_abbreviations(text)
    text = merge(text)
    return text.strip()
if __name__ == '__main__':
    # Demo: run the full pipeline on a sample sentence.
    text = 'there\'re many recen\'t \'t extensions of this basic had idea to include attention. 120,had'
    text = clear_data(text)
    print(text)  # there be many rece not t extension of this basic have idea to include attention have

二、中文数据清洗

去除一些停用词。停用词是文本中高频出现的代词、连词、介词等对文本分类无意义的词。通常会维护一个停用词表,在特征提取过程中删除停用词表中出现的词,这本质上属于特征选择的一部分。具体可参考 HanLP 的停用词表:https://github.com/hankcs/HanLP

nlp英文的数据清洗代码的更多相关文章

  1. JavaScript验证字符串只能包含数字或者英文字符的代码实例

    验证字符串只能包含数字或者英文字符的代码实例:本章节分享一段代码实例,它实现了验证字符串内容是否只包含英文字符或者数字.代码实例如下: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ...

  2. [NLP] The Annotated Transformer 代码修正

    1. RuntimeError: "exp" not implemented for 'torch.LongTensor' class PositionalEncoding(nn. ...

  3. NLP整体流程的代码

    import nltk import numpy as np import re from nltk.corpus import stopwords # 1 分词1 text = "Sent ...

  4. 8个数据清洗Python代码,复制可用,最长11行 | 资源

    最近,大数据工程师Kin Lim Lee在Medium上发表了一篇文章,介绍了8个用于数据清洗的Python代码. 数据清洗,是进行数据分析和使用数据训练模型的必经之路,也是最耗费数据科学家/程序员精 ...

  5. git入门(4)团队中git保管代码常用操作

    在团队中协作代码时候,一定要熟练使用以下git命令,不至于把代码库弄乱, PS:一定要提交自己代码(git push)时候,先进行更新本地代码库(git pull),不然提交异常 git常用命令 1· ...

  6. 快看Sample代码,速学Swift语言(1)-语法速览

    Swift是苹果推出的一个比较新的语言,它除了借鉴语言如C#.Java等内容外,好像还采用了很多JavaScript脚本里面的一些脚本语法,用起来感觉非常棒,作为一个使用C#多年的技术控,对这种比较超 ...

  7. IDEA 代码规范插件

    前言 在工作过程中,每个人的代码习惯都不同,在一起工作做同一个项目,如果按照自己的习惯来,有可能造成代码维护困难,开发进度缓慢等. 代码规范的重要性 谷歌发布的代码规范中指出,80% 的缺失是由 20 ...

  8. ph 提交代码的步骤;

    ph 提交代码的步骤: git status 查看状态: ls -ah 查看文件: git stash list 查看本地缓存的文件: git branch 查看本地的分支: git checkout ...

  9. cucumber java从入门到精通(2)用代码定义步骤

    cucumber java从入门到精通(2)用代码定义步骤 上一节里我们定义了feature文件,feature文件就是自然语言描述的用例文件,它有一定的章法,具体的潜规则是: 使用Feature关键 ...

随机推荐

  1. 在ASP.NET Core中使用托管启动(hosting startup)程序集,实现批量注册service

    在启动ASPNET Core时可以从外部程序集向应用添加增强功能.例如,外部库可以用托管启动( hosting startup) 实现为应用程序提供附加配置(Configuration)或服务(ser ...

  2. java之单例设计模式

    什么是设计模式? 设计模式是在大量的实践中总结和理论化之后优选的代码结构.编程风格.以及解决问题的思考方式.设计模式就像是经典的棋谱,不同的棋局,我们用不同的棋谱,免去我们自己再思考和探索. 所谓单例 ...

  3. Jmeter介绍以及脚本制作与调试

    目录 Jmeter介绍 Jmeter安装 Jmeter主要测试组件 Jmeter元件作用域与执行顺序 Jmeter运行原理 Jmeter脚本制作 Jmeter脚本调试 Jmeter介绍 Jmeter ...

  4. Orleans[NET Core 3.1] 学习笔记(四)( 1 )创建项目

    ClassRoom ClassRoom是一个练手demo,目的是为了能熟悉掌握Orleans的基本知识和使用方法,我会尽量在这个项目中加入更多的知识点,一边学一边练避免我看完文档就忘掉 创建项目 依旧 ...

  5. 【hibernate】自定义转换器

    [hibernate]自定义转换器 转载:https://www.cnblogs.com/yangchongxing/p/10398255.html 1.转换基本属性 package cn.ycx.s ...

  6. python学习-list

    # 数据类型之 列表 有顺序.# 关键字:list# 语法 :[] 数据之间用,隔开.列表当中的数据,可以是任意类型.数值是可以重复的.a = []b = ["魔王", " ...

  7. Redis有哪几种数据类型

    Redis支持五种数据类型:string(字符串),hash(哈希),list(列表),set(集合)及zset(sorted set:有序集合). String(字符串) string 是 redi ...

  8. java之线程(线程的创建方式、java中的Thread类、线程的同步、线程的生命周期、线程之间的通信)

    CPU:10核 主频100MHz 1核  主频    3GHz 那么哪一个CPU比较好呢? CPU核不是越多越好吗?并不一定.主频用于衡量GPU处理速度的快慢,举个例子10头牛运送货物快还是1架飞机运 ...

  9. Internet History,Technology,and Security - Technology: Internets and Packets (Week5)

    Week5 Technology: Internets and Packets Welcome to Week 5! This week, we’ll be covering internets an ...

  10. CAD绘图效率低?教你4个CAD绘图技巧,绘图效率提升十倍

    CAD绘图一直是一个谜一样的存在,说它简单吧,很多人都无法完全精通,说它难吧,很多人也都自学成才了. 如何学好CAD绘图是个难题,但是老话说的好,只要思想不滑坡,办法总比困难多,掌握以下这些CAD绘图 ...