# version 1.0
def connect_mysql(sql, oper_type="select", data_l=None):
conn = pymysql.connect(host='localhost', user="root", password="",
database="work", port=3306)
cur = conn.cursor()
if oper_type == "insert":
cur.executemany(sql, data_l)
conn.commit()
else:
cur.execute(sql)
result = cur.fetchall()
# print(type(result), "result")
conn.close()
return result def process_jobs(field_name):
sql = "select j." + field_name + " FROM personal_jobs j"
column_name = connect_mysql(sql, oper_type="select")
row_total = (len(column_name))
row_category = set(column_name) # init category dict
category_dict = {}
for k in row_category:
category_dict[k] = 0 # calculate amount
cal_nmu = 0
for k in row_category:
for r in column_name:
if r == k:
cal_nmu += 1
category_dict[k] = cal_nmu
cal_nmu = 0
print(type(category_dict.items()), category_dict.items())
print(row_total, len(category_dict.items()))
return row_total, category_dict process_jobs("job_salary")
version 1.1
def count_times(all_list):
ls = []
item_list = list(set(all_list))
for m in item_list:
c = all_list.count(m)
ls.append([m, c])
return sorted(ls) def process_salary(field_name):
# sql = "select " + field_name + " from work.personal_jobs where job_exp = '1-3年';"
sql = "select " + field_name + " from work.personal_jobs where job_exp = '1年以内' or job_exp = '经验不限';"
original_sal = connect_mysql(sql)
# sort salary order
row_category = list(set(original_sal))
general_min, general_avg, general_max = [], [], []
# cal_num = 0
for sal in row_category:
# calculate category amount
# for cat in column_name:
# if cat == sal:
# cal_num += 1
# process salary
if field_name == "job_salary":
sal_tmp = str(sal).strip("('").strip("K',)").split("K-")
general_min.append(int(sal_tmp[0]))
general_max.append(int(sal_tmp[1])) # process experience
if field_name == "job_exp":
print(original_sal) # initial again
# cal_num = 0 # calculate min sal
min_sal = count_times(general_min)
for m1 in min_sal:
min_s = str(m1[0]) + "K"
m1[0] = min_s # calculate max sal
max_sal = count_times(general_max)
for m2 in max_sal:
min_s = str(m2[0]) + "K"
m2[0] = min_s # calculate avg sal
avg_sal = count_times(original_sal)
print("original: ", avg_sal)
for a1 in avg_sal:
sal_tmp_1 = str(a1[0]).strip("('").strip("K',)").split("K-")
a1[0] = (int(sal_tmp_1[0]) + int(sal_tmp_1[1])) / 2.0
avg_sal = sorted(avg_sal) for a2 in avg_sal:
a2[0] = str(a2[0]) + "K"
# debug
print(len(min_sal), min_sal)
print(len(avg_sal), avg_sal)
print(len(max_sal), max_sal)
return min_sal, avg_sal, max_sal # process_salary("job_salary")
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
from scipy.misc import imread def process_reqirement(field_name):
sql = "select " + field_name + " from work.personal_jobs where job_exp = '1年以内' or job_exp = '经验不限';"
original_req = connect_mysql(sql)
userdict = ["C", "C#", "C++", "Go", "Linux", "MongoDB", "Mysql", "PostgreSQL", "Ajax", "Bootstrap", "CSS", "Django", "Docker", "Flask", "Git", "http", "tcp", "Java", "JavaScript", "Jquery", "Oracle", "Python", "Redis", "Ruby", "Scrapy", "shell", "Tornado", "Web", "Zabbix", "RESTful", "云计算", "分布式", "前端", "后端", "大数据", "高并发", "数据分析", "数据挖掘", "机器学习", "爬虫", "算法", "自动化", "运维", "集群"] jieba.load_userdict(userdict)
# print(type(original_req), str(original_req))
text0 = Counter(jieba.cut(str(original_req)))
text1 = " ".join(jieba.cut(str(original_req)))
[item for item in sorted(text0.values())]
# print(text0.keys(), text0.values())
# print(type(text0), text0) # # create word cloud
# wordcloud = WordCloud(font_path=r"D:\wwj\work\script\web\personal\database_operation\MSYH.TTC",
# background_color="white", mask=imread("china.jpg")).generate(text1)
# plt.imshow(wordcloud)
# plt.axis("off")
# plt.show() # find requirement item what we really need
req_list = []
# print(len(text0.keys()), text0)
for k, v in text0.items():
for kk, vv in text0.items():
if str(k).lower() == str(kk).lower():
# print(k, v)
req_list.append([k, (v + vv)])
# print(k, v)
break
print(len(req_list), req_list) for t in userdict:
for k, v in text0.items():
if t.lower() == str(k).lower():
req_list.append([t, v])
break
# print(req_list)
return req_list
process_reqirement("job_requirement")
def user_defined(file_name):
user_list = []
with open(file_name, "r", encoding="utf8") as f:
for i in f:
user_list.append(i.strip())
return user_list def process_company(field_name):
sql = "select " + field_name + " from work.personal_jobs"
company = [list(i) for i in connect_mysql(sql)]
user_list = user_defined("t.txt")
user_list = ['C','C#','C++','Go','Linux','MongoDB','Mysql','PostgreSQL','Ajax','Bootstrap','CSS','Django','Docker','Flask','Git','http','tcp','Java','JavaScript','Jquery','Oracle','Python','Redis','Ruby','Scrapy','shell','Tornado','Web','RESTful','云计算','分布式','前端','后端','大数据','高并发','数据分析','数据挖掘','机器学习','爬虫','算法','自动化','测试','运维','集群']
jieba.load_userdict(user_list)
me_list = ['python', 'django', 'linux', '运维', '自动化', '爬虫', '数据分析', 'shell', 'mysql', 'oracle']
req_list, suit_list = [], []
for req in company:
req_dict = Counter(jieba.cut(req[1]))
req_list.append([req[0], [k for k in req_dict.keys() if k in user_list]])
for r in req_list:
if len(r[1]) > 0:
# print(r[1])
own = [item for item in me_list if item in r[1]]
if len(own) > 0:
suit_list.append([r[0], int(len(own) * 100/len(r[1]))])
return sorted(suit_list, key=lambda x: x[1])
# print(sorted(suit_list, key=lambda x: x[1]))
process_company("company_name, job_requirement")

process data的更多相关文章

  1. 1.3 Quick Start中 Step 8: Use Kafka Streams to process data官网剖析(博主推荐)

    不多说,直接上干货! 一切来源于官网 http://kafka.apache.org/documentation/ Step 8: Use Kafka Streams to process data ...

  2. [CDH] Process data: integrate Spark with Spring Boot

    c 一.Spark 统计计算 简单统计后写入Redis. /** * 订单统计和乘车人数统计 */ object OrderStreamingProcessor { def main(args: Ar ...

  3. Flink应用案例:How Trackunit leverages Flink to process real-time data from industrial IoT devices

    January 22, 2019Use Cases, Apache Flink Lasse Nedergaard     Recently there has been significant dis ...

  4. [AJAX系列]$.post(url,[data],[fn],[type])

    概述: 通过远程HTTP POST请求载入信息 参数: url:发送请求地址 data:待发送Key/value值 callback:发送成功时回调函数 type:返回内容格式  xml  html ...

  5. Data Science at the Command Line学习笔记(二)

    1.vagrant建立简单httpserver方法: 1)映射端口 修改Vagrantfile, 末尾添加本地端口和虚机端口的映射关系, 然后执行vagrant reload. Vagrant::Co ...

  6. [Chapter 3 Process]Practice 3.3 Discuss three major complications that concurrent processing adds to an operating system.

    3.3  Original version of Apple's mobile iOS operating system provied no means of concurrent processi ...

  7. Learn know more about big data

    As we all know,we are in a big data age now."Every sword has two slides",as a ITer,we shou ...

  8. Monitoring and Tuning the Linux Networking Stack: Receiving Data

    http://blog.packagecloud.io/eng/2016/06/22/monitoring-tuning-linux-networking-stack-receiving-data/ ...

  9. Big Data Analytics for Security(Big Data Analytics for Security Intelligence)

    http://www.infoq.com/articles/bigdata-analytics-for-security This article first appeared in the IEEE ...

随机推荐

  1. curl测试dns解析时间及tcp连接时间

    1.用Linux下的curl命令测量网络请求(分号是分隔符,可以是其他符号): curl -o /dev/null -s -w %{time_connect}:%{time_starttransfer ...

  2. ASP.NET Core MVC 2.x 全面教程_ASP.NET Core MVC 17. 基于Claim和Policy的授权 上

    首先补一下昨天没有讲的东西 只有管理员才能访问UserController RoleController都加上这个角色 Cliam 不是管理员角色的用户访问 cliam是name个Value值的键值对 ...

  3. Cardboard对像的公共方法与属性

    一.  public Pose3D EyePose(Eye eye)/// The transformation from head to eye. 获取眼睛在头部坐标系中的局部transform: ...

  4. mongodb 安装问题

    重新安装一台机器时出现头疼的问题,老是说什么 dbpath 不存在 结果最后发现是没有写 mongodb.log 这个log文件名 1. 创建 datas 文件夹  e:\mongodb\datas ...

  5. bzoj 2780: [Spoj]8093 Sevenk Love Oimaster【广义SAM】

    AC自动机比较简单,把询问串做成AC自动机然后模板串边跑变更新即可 SAM是把模板串做成广义SAM,然后每个节点存有几个模板串经过,具体方法是每次更新暴力向上跳直到有时间戳我不会证为什么时间复杂度是对 ...

  6. Codeforces732E Sockets

    首先检测有木有和Computer匹配的Socket,如果有则将其匹配. 然后将所有没有匹配的Socket连上Adapter,再去检测有木有Computer与Socket匹配. 重复这个操作31次,所有 ...

  7. codeforces 149D Coloring Brackets (区间DP + dfs)

    题目链接: codeforces 149D Coloring Brackets 题目描述: 给一个合法的括号串,然后问这串括号有多少种涂色方案,当然啦!涂色是有限制的. 1,每个括号只有三种选择:涂红 ...

  8. Educational Codeforces Round 46 (Rated for Div. 2) B. Light It Up

    Bryce1010模板 http://codeforces.com/problemset/problem/1000/B 思路:先用两个数组sumon[]和sumoff[]将亮着的灯和灭的灯累计一下. ...

  9. Almost Acyclic Graph Codeforces - 915D

    以前做过的题都不会了.... 此题做法:优化的暴力 有一个显然的暴力:枚举每一条边试着删掉 注意到题目要求使得图无环,那么找出图上任意一个环,都应当要在其某一处断开(当然没有环是YES) 因此找出图中 ...

  10. Invitation Cards POJ 1511 SPFA || dij + heap

    http://poj.org/problem?id=1511 求解从1去其他顶点的最短距离之和. 加上其他顶点到1的最短距离之和. 边是单向的. 第一种很容易,直接一个最短路, 然后第二个,需要把边反 ...