根据职位名,自动生成jd
代码本身就是最好的解释,不赘述。
文本聚类输出: cluster.py
#!/usr/bin/env python
# coding=utf-8 import jieba,re
from gensim import corpora,models
from sklearn.cluster import KMeans
import sys
reload(sys)
sys.setdefaultencoding('utf-8') class MyCorpus(object):
def __init__(self,fname):
self.fname = fname def __iter__(self):
for line in open(self.fname):
yield jieba.cut(line,cut_all=False) class MyCluster(object): def __init__(self):
self.CLEAN = re.compile(ur"[^\u4e00-\u9f5aA-Za-z0-9]")
self.dictionary = {}
self.corpus = [] def gen_dataset(self,documents):
self.gen_corpus(documents)
res = [self.doc2vec(doc) for doc in documents]
return res def gen_corpus(self,documents):
texts = [ list(jieba.cut(doc)) for doc in documents ]
self.dictionary = corpora.Dictionary(texts)
self.corpus = [self.dictionary.doc2bow(text) for text in texts]
self.tfidf = models.TfidfModel(self.corpus) def doc2vec(self,doc):
vec = self.dictionary.doc2bow(jieba.cut(doc))
vec = self.tfidf[vec]
wordlist = [.0] * len(self.dictionary)
for w in vec:
wordlist[w[0]] = w[1]
return wordlist def kcluster(self,texts,k=3):
from random import shuffle
data = self.gen_dataset(texts)
data = [ map(lambda x:round(x,5),line) for line in data ]
km = KMeans(n_clusters=k,init='k-means++',max_iter=200,n_init=1,verbose=True)
km.fit(data)
labels = km.labels_
flag = [0]*len(labels)
randomtext = zip(labels,texts)
shuffle(randomtext)
res = []
for d in randomtext:
if flag[d[0]]==0:
res.append(d[1])
flag[d[0]] = 1 return res if __name__ == "__main__":
texts = [ line for line in open('data/python.db') ]
test = MyCluster()
res = test.kcluster(texts,k=4) print '\n'.join(res)
自动生成主文件: auto_gen_jd.py
#!/usr/bin/env python
# coding=utf-8 import sys,os
import simplejson as json
import codecs
# from snownlp import SnowNLP
from simhash import Simhash
# from bosonnlp import BosonNLP
from cluster import MyCluster
from jd_parser import JdParser
import re
reload(sys)
sys.setdefaultencoding('utf-8') class AutoGenJD(object):
''' 自动生成JD,输入一个职位名 和句子数,输出一份岗位描述和要求 ''' def __init__(self):
self.CLEAR_NUM = re.compile(u"^\d+[\.、::]|^[\(\(]\d+[\)\)\.]?|\d\s*[\))】]")
self.CLEAR_COLO = re.compile(u"^[。\.)(【】]\S+|[\.;:;。]$")
self.jd_database = json.load(codecs.open('data/lagou_jd_clean.json'))
# self.jobname = [ jobname[:-3] for jobname in os.listdir("data") if jobname.endswith(".db") ]
self.jobname = self.jd_database.keys()
# self.bosonnlp = BosonNLP('UYTG1Csb.3652.5pZ2otkIncEn')
self.jdparser = JdParser()
self.km = MyCluster() def load_json_data(self,fname="../preprocess/data/mini_jd.json",arg1=None,arg2=None):
for line in codecs.open(fname):
try:
data = json.loads(line)
except Exception,e:
print e
continue
if data.get(arg1,False) != False and data[arg1].has_key("job_title") and data[arg1].has_key("job_description"):
if len(data[arg1]["job_title"])<2 or len(data[arg1]["job_title"])>16:
continue
else:
fw = codecs.open('./data/'+data[arg1][arg2]+".txt",'w','utf-8')
fw.write(data[arg1]["job_description"].strip()+"\n\n")
print "writing...",data[arg1][arg2] # 去除 序列号等清洗数据
def clean_jd(self,fname="./data/java.txt"):
clean_sents = set()
with codecs.open(fname+".txt",'r','utf-8') as fr:
for line in fr:
line = self.CLEAR_NUM.sub("",line.strip())
line = self.CLEAR_COLO.sub("",line.strip())
if len(line)>2:
clean_sents.add(line.strip())
with codecs.open(fname[:-3]+"db",'w','utf-8') as fw:
for line in clean_sents:
fw.write(line+'\n')
return clean_sents def is_most_english(self,line):
en_word = [ uchar for uchar in line if (uchar>=u'\u0041' and uchar<=u'\u005a') or (uchar>=u'\u0061' and uchar<=u'\u007a') ]
return float(len(en_word)*1.0/len(line))>0.7 def clean_jd2(self,jdstr):
"""
清洗数据,去除句子前后的标点符合,序号等杂乱数据
"""
res = set()
for line in jdstr.split("\n"):
line = line.strip()
if len(line)<12:
print "line",line
if re.search(u"[;\.;。]\d+|\d?[,,、::\.]$|^\d\s{0,1}[\u4e00-\u9f5e]",line) or len(line)<8 or len(line)>32:continue
if self.is_most_english(line):continue
line = self.CLEAR_NUM.sub("",line)
line = self.CLEAR_COLO.sub("",line)
res.add(line)
return res # 获取和用户输入相似度最近的职位名
def get_closet_job(self,jobname="java"):
dis = [ (other,Simhash(jobname).distance(Simhash(other))) for other in self.jobname ]
sorteddis = sorted(dis,key = lambda x:x[1])
for k,v in sorteddis[:5]:
print k,v
return sorteddis[0][0] # 规范化jd句子数目
def norm_jd_num(self,num):
if num<1:
num=1
elif num>20:
num = 20
return num # 根据职位名和句子数,获得jd
def get_jd_with_snownlp(self,jobname="java",num=5):
jobname = self.get_closet_job(jobname)
# with open("./data/"+jobname+".db") as fr:
# s = SnowNLP(fr.read())
# return s.summary(num)
jdstr = self.clean_jd2(self.jd_database[jobname])
s = SnowNLP(jdstr)
return s.summary(num) def get_jd_with_bosonnlp(self,jobname="java",num=5): res = set()
jobname = self.get_closet_job(jobname)
jdstr = self.clean_jd2(self.jd_database[jobname])[:80]
all_cluster = self.bosonnlp.cluster(jdstr)
sort_all_cluster = sorted(all_cluster,key = lambda x:x['num'],reverse=True)
for idx,cluster in enumerate(sort_all_cluster):
print idx+1,cluster['_id']
res.add(jdstr[cluster['_id']])
return res def _get_sent_score(self,line):
"""
句子得分,最后结果排序使用,分值越小,排序越靠前
"""
s = len(line)+100
if re.search(u"男|女|男女不限|性别|岁",line):
s -= 60
if re.search(u"学历|专业|\d+[kK元]",line):
s -= 40
if re.search(u"经验",line):
s -= 20
return s def get_jd_with_kmeans(self,jobname='python',num=6):
"""
使用kmeans 进行聚类,相同一类只出现一句
"""
jobname = self.get_closet_job(jobname)
jdstr = self.clean_jd2(self.jd_database[jobname])
print "jdstr",len(jdstr)
print self.jd_database[jobname] if len(jdstr)<int(num):
num = len(jdstr)
res = self.km.kcluster(jdstr,k=int(num))
return sorted(res,cmp=lambda x,y:self._get_sent_score(x)-self._get_sent_score(y)) def jd_parser(self,jdstr):
result = self.jdparser.parser(jdstr)
return result if __name__ == "__main__": test = AutoGenJD()
jobname = sys.argv[1]
jdnum = int(sys.argv[2])
print "job name:",jobname
print "demand:"
demand = test.get_jd_with_kmeans(jobname,jdnum)
for i,jdstr in enumerate(demand):
print "%d. %s" %(i+1,jdstr)
根据职位名,自动生成jd的更多相关文章
- Java代码自动生成,生成前端vue+后端controller、service、dao代码,根据表名自动生成增删改查功能
本项目地址:https://github.com/OceanBBBBbb/ocean-code-generator 项目简介 ocean-code-generator采用(适用): ,并使用m ...
- 转载:C#保存文件时重名自动生成新文件的方法
/// <summary> /// Generates a new path for duplicate filenames. /// </summary> /// <p ...
- c# datagridview禁止自动生成额外列
在某些时候,处于重用pojo的考虑,我们希望在不同的datagridview之间进行复用,这就涉及到pojo中的字段会比有些datagridview所需要的字段多,默认情况下,.net对于pojo中的 ...
- oracle数据库高级应用之《自动生成指定表的insert,update,delete语句》
/* * 多条记录连接成一条 * tableName 表名 * type 类型:可以是insert/update/select之一 */ create or replace function my_c ...
- 懒人小工具:自动生成Model,Insert,Select,Delete以及导出Excel的方法
在开发的过程中,我们为了节约时间,往往会将大量重复机械的代码封装,考虑代码的复用性,这样我们可以节约很多时间来做别的事情.最近跳槽到一节webform开发的公司,主要是开发自己公司用的ERP.开始因为 ...
- 懒人小工具1:winform自动生成Model,Insert,Select,Delete以及导出Excel的方法
懒人小工具2:T4自动生成Model,Insert,Select,Delete以及导出Excel的方法 github地址:https://github.com/Jimmey-Jiang/J ...
- PowerDesigner中表名过长,自动生成的主键名截取的问题
在PowerDesinger中,若表名过长,自动生成的主键名会被自动截取. 解决如下:DataBase/Edit Current DBMS/Scripts/Objects/PKey/ConstName ...
- Linq to Sql自动生成实体类重名情况的处理
使用Linq to sql自动生成实体类时,如果要生成多个库的实体类,往往会遇到类名重名的情况,也就是表名重名,这样编译会不通过,这种情况下要在自动生成的实体类文件中(.designer.cs后缀)将 ...
- eclipse自动生成变量名声明(按方法返回值为本地变量赋值)
eclipse自动生成变量名声明(按方法返回值为本地变量赋值) ctrl+2+L 这个快捷键可自动补全代码,极大提升编码效率! 注:ctrl和2同时按完以后释放,再快速按L.不能同时按! 比如写这句代 ...
随机推荐
- C语言调试的几种方法
linux系统下,在不gdb调试的情况下,我们如何解决程序崩溃问题呢?首先想到的就是添加log日志信息,其次还有以下几种方法可以帮助我们分析存在的问题: (一)add2line 程序崩溃时会打出一些崩 ...
- Using CSV-Format Log Output
Including csvlog in the log_destination list provides a convenient way to import log files into a da ...
- org.springframework.orm.hibernate3.support.OpenSessionInViewFilter作用
在Spring与Hibernate集成时在web.xml要加入这样的过滤器: <filter> <filter-name>openSessionInView</filte ...
- unity, Additive Animation注意事项
以下摘自官方文档:(http://docs.unity3d.com/Manual/AnimationScripting.html) Additive animations allow you to o ...
- WCF Client is Open Source
WCF Client is Open Source Wednesday, May 20, 2015 Announcement New Project WCF We’re excited to anno ...
- 【转】.NET多种WebKit内核/Blink内核浏览器初步测评报告
第1篇:.NET多种WebKit内核/Blink内核浏览器初步测评报告 本文转自“吾乐吧软件站”,原文链接:http://www.wuleba.com/?p=23590 报告研究时间:2013-10- ...
- event.srcElement兼容处理
在IE下,event对象有srcElement属性,但是没有target属性;Firefox下,event对象有target属性,但是没有srcElement属性。解决方法:使用obj(obj = ...
- SVN server环境搭建
VisualSVN server搭建: 1. 下载VisualSVN server for Windows安装包:https://www.visualsvn.com/ 2. 安装过程,基本就是下一步一 ...
- Java OCR 图像智能字符识别技术,可识别中文
http://www.open-open.com/lib/view/open1363156299203.html
- mysql.server 文件修改起动多实例
如果你用mysql.server来启动的话,就要注意一下点,要修改一下里面的内容,修改如下:大约218行左右 查看复制打印? then print_defaults="$bindir/mys ...