from openpyxl import Workbook
import xlrd
import time
import Levenshtein as Le

# Only rows whose city appears in this list are processed.
target_city_list = ['北京市', '上海市', '深圳市', '广州市']
source_name = 'JMTool任务_csv_py_wholeCSV-加百度170826165729'
# '|-|' separates the candidates inside the BDpoi_list cell;
# '|--|' separates a candidate's name from its address.
BDpoi_list_tag, BDpoi_list_tagb = '|-|', '|--|'
FEXCEL = '%s%s' % (source_name, '.xlsx')
# Weights of the name-only ratio and of the [name, address] sequence ratio
# in the combined score.
weight_ratio, weight_seqratio = 0.7, 0.3


def _iter_groups(res_dic):
    """Yield every per-name row list in city -> district -> name order."""
    for districts in res_dic.values():
        for names in districts.values():
            for rows in names.values():
                yield rows


def _save_workbook(header_str, rows, name_prefix, name_suffix):
    """Write a header line plus ``rows`` to a new .xlsx file.

    The file is named ``<name_prefix><name_suffix><yymmddHHMMSS>.xlsx``.
    Returns the file name without the ``.xlsx`` extension.
    """
    wb = Workbook()
    worksheet = wb.active
    worksheet.append(header_str.replace(' ', '').split(','))
    for row in rows:
        worksheet.append(row)
    # The timestamp is taken after the sheet is filled, right before saving.
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    file_name = '%s%s%s' % (name_prefix, name_suffix, localtime_)
    wb.save('%s%s' % (file_name, '.xlsx'))
    return file_name


def main_():
    """Score each source row against its candidate POIs and export the result.

    Reads the first sheet of ``FEXCEL`` (9 columns per row), keeps rows whose
    city is in ``target_city_list`` and, for every candidate listed in the
    ``BDpoi_list`` cell, computes
    ``weight_ratio * Le.ratio(name, candidate_name)
    + weight_seqratio * Le.seqratio([name, address], [candidate_name, candidate_addr])``.

    Two workbooks are written: one with every scored row, and one with only
    the last row recorded per (city, district, name), which is the
    highest-scoring candidate of the most recent source row with that name.
    """
    # NOTE(review): xlrd >= 2.0 only reads .xls files; opening this .xlsx
    # needs xlrd < 2.0 -- confirm the installed version.
    data = xlrd.open_workbook(FEXCEL)
    table = data.sheets()[0]
    res_dic = {}
    for i in range(table.nrows):
        (dbid, area_code, ref_area_type_code, city, district, address,
         city_street, name_, BDpoi_list) = table.row_values(i)
        if dbid == 'dbid':  # header line
            continue
        if city not in target_city_list:
            continue
        rows = (res_dic.setdefault(city, {})
                .setdefault(district, {})
                .setdefault(name_, []))
        base = (dbid, area_code, ref_area_type_code, city, district, address,
                city_street, name_, BDpoi_list)
        if BDpoi_list.find(BDpoi_list_tag) == -1:
            # No candidate list in this cell: keep the row with blank match columns.
            rows.append(base + ('', '', ''))
            continue

        addr_ = '%s%s%s%s' % (city, district, address, city_street)
        chk_name_list = [name_, addr_]
        scored = []
        for candidate in BDpoi_list.split(BDpoi_list_tag):
            if not candidate:
                continue
            parts = candidate.split(BDpoi_list_tagb)
            BD_name, BD_addr = parts[0], ''
            if len(parts) == 2:
                # Strip city/district from the candidate address so both
                # sides are compared on the same footing.
                BD_addr = parts[1].replace(city, '').replace(district, '')
            ratio_res = Le.ratio(name_, BD_name)
            seqratio_res = Le.seqratio(chk_name_list, [BD_name, BD_addr])
            ratio_seqratio_res = (weight_ratio * ratio_res
                                  + weight_seqratio * seqratio_res)
            scored.append((ratio_seqratio_res,
                           base + (BD_name, BD_addr, ratio_seqratio_res,
                                   ratio_res, seqratio_res)))
        # Ascending, stable sort: the best candidate ends up last and ties
        # keep their input order.
        scored.sort(key=lambda pair: pair[0])
        rows.extend(row for _, row in scored)

    groups = list(_iter_groups(res_dic))

    file_title_str = ' dbid, area_code, ref_area_type_code, city, district, address, city_street, name_, BDpoi_list, BD_name, BD_addr, ratio_seqratio_res, ratio_res, seqratio_res'
    file_name = _save_workbook(
        file_title_str,
        (row for rows in groups for row in rows),
        source_name, '-Levenshtein')

    file_title_str = ' dbid, area_code, ref_area_type_code, city, district, address, city_street, name_,BDpoi_list,max_BD_name, max_BD_addr, max_ratio_seqratio_res, ratio_res, seqratio_res'
    # A group can be empty when the cell held only separators; skip it
    # instead of failing on rows[-1].
    # NOTE(review): the second file name is built on top of the first file's
    # name (timestamp included), as in the original -- confirm this is intended.
    _save_workbook(
        file_title_str,
        (rows[-1] for rows in groups if rows),
        file_name, '-Levenshtein-ordered')


main_()

  

from openpyxl import Workbook
import xlrd
import time
import Levenshtein as Le

# Only rows whose city appears in this list are processed.
target_city_list = ['深圳市']
# '|-|' separates the candidates inside the BDpoi_list cell;
# '|--|' separates a candidate's name from its address.
BDpoi_list_tag, BDpoi_list_tagb = '|-|', '|--|'
source_name = 'JMTool任务_csv_py_wholeCSV_住宅小区-加百度170826152533'
FEXCEL = '%s%s' % (source_name, '.xlsx')
# Weights of the name-only ratio and of the [name, address] sequence ratio
# in the combined score.
weight_ratio, weight_seqratio = 0.7, 0.3


def _iter_groups(res_dic):
    """Yield every per-name row list in city -> district -> name order."""
    for districts in res_dic.values():
        for names in districts.values():
            for rows in names.values():
                yield rows


def _save_workbook(header_str, rows, name_prefix, name_suffix):
    """Write a header line plus ``rows`` to a new .xlsx file.

    The file is named ``<name_prefix><name_suffix><yymmddHHMMSS>.xlsx``.
    Returns the file name without the ``.xlsx`` extension.
    """
    wb = Workbook()
    worksheet = wb.active
    worksheet.append(header_str.replace(' ', '').split(','))
    for row in rows:
        worksheet.append(row)
    # The timestamp is taken after the sheet is filled, right before saving.
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    file_name = '%s%s%s' % (name_prefix, name_suffix, localtime_)
    wb.save('%s%s' % (file_name, '.xlsx'))
    return file_name


def main_():
    """Score each source row against its candidate POIs and export the result.

    Reads the first sheet of ``FEXCEL`` (10 columns per row, including
    ``name_reduction``), keeps rows whose city is in ``target_city_list``
    and, for every candidate listed in the ``BDpoi_list`` cell, computes
    ``weight_ratio * Le.ratio(name_reduction, candidate_name)
    + weight_seqratio * Le.seqratio([name_reduction, address], [candidate_name, candidate_addr])``.

    Rows are grouped by the full ``name_``; matching uses ``name_reduction``.
    Two workbooks are written: one with every scored row, and one with only
    the last row recorded per (city, district, name), which is the
    highest-scoring candidate of the most recent source row with that name.
    """
    # NOTE(review): xlrd >= 2.0 only reads .xls files; opening this .xlsx
    # needs xlrd < 2.0 -- confirm the installed version.
    data = xlrd.open_workbook(FEXCEL)
    table = data.sheets()[0]
    res_dic = {}
    for i in range(table.nrows):
        (dbid, area_code, ref_area_type_code, city, district, address,
         city_street, name_, name_reduction, BDpoi_list) = table.row_values(i)
        if dbid == 'dbid':  # header line
            continue
        if city not in target_city_list:
            continue
        rows = (res_dic.setdefault(city, {})
                .setdefault(district, {})
                .setdefault(name_, []))
        base = (dbid, area_code, ref_area_type_code, city, district, address,
                city_street, name_, name_reduction, BDpoi_list)
        if BDpoi_list.find(BDpoi_list_tag) == -1:
            # No candidate list in this cell: keep the row with blank match columns.
            rows.append(base + ('', '', ''))
            continue

        addr_ = '%s%s%s%s' % (city, district, address, city_street)
        chk_name_list = [name_reduction, addr_]
        scored = []
        for candidate in BDpoi_list.split(BDpoi_list_tag):
            if not candidate:
                continue
            parts = candidate.split(BDpoi_list_tagb)
            BD_name, BD_addr = parts[0], ''
            if len(parts) == 2:
                # Strip city/district from the candidate address so both
                # sides are compared on the same footing.
                BD_addr = parts[1].replace(city, '').replace(district, '')
            ratio_res = Le.ratio(name_reduction, BD_name)
            seqratio_res = Le.seqratio(chk_name_list, [BD_name, BD_addr])
            ratio_seqratio_res = (weight_ratio * ratio_res
                                  + weight_seqratio * seqratio_res)
            scored.append((ratio_seqratio_res,
                           base + (BD_name, BD_addr, ratio_seqratio_res,
                                   ratio_res, seqratio_res)))
        # Ascending, stable sort: the best candidate ends up last and ties
        # keep their input order.
        scored.sort(key=lambda pair: pair[0])
        rows.extend(row for _, row in scored)

    groups = list(_iter_groups(res_dic))

    file_title_str = 'dbid, area_code, ref_area_type_code, city, district, address, city_street, name_, name_reduction, BDpoi_list, BD_name, BD_addr, ratio_seqratio_res, ratio_res, seqratio_res'
    file_name = _save_workbook(
        file_title_str,
        (row for rows in groups for row in rows),
        source_name, '-Levenshtein')

    file_title_str = 'dbid, area_code, ref_area_type_code, city, district, address, city_street, name_, name_reduction,BDpoi_list,max_BD_name, max_BD_addr, max_ratio_seqratio_res, ratio_res, seqratio_res'
    # A group can be empty when the cell held only separators; skip it
    # instead of failing on rows[-1].
    # NOTE(review): the second file name is built on top of the first file's
    # name (timestamp included), as in the original -- confirm this is intended.
    _save_workbook(
        file_title_str,
        (rows[-1] for rows in groups if rows),
        file_name, '-Levenshtein-ordered')


main_()

  

算法调参 weight_ratio, weight_seqratio的更多相关文章

  1. annoy ANN算法 调参

    search_k search_k越大,越准确,但是要在时间和准确率之间取个trade off. During the query it will inspect up to search_k node ...

  2. CatBoost算法和调参

    欢迎关注博主主页,学习python视频资源 sklearn实战-乳腺癌细胞数据挖掘(博主亲自录制视频) https://study.163.com/course/introduction.htm?co ...

  3. k-近邻算法采用for循环调参方法

    //2019.08.02下午#机器学习算法中的超参数与模型参数1.超参数:是指机器学习算法运行之前需要指定的参数,是指对于不同机器学习算法属性的决定参数.通常来说,人们所说的调参就是指调节超参数.2. ...

  4. 调参、最优化、ml算法(未完成)

    最优化方法 调参方法 ml算法 梯度下降gd grid search lr 梯度上升 随机梯度下降 pca 随机梯度下降sgd  贝叶斯调参 lda 牛顿算法   knn 拟牛顿算法   kmeans ...

  5. scikit-learn随机森林调参小结

    在Bagging与随机森林算法原理小结中,我们对随机森林(Random Forest, 以下简称RF)的原理做了总结.本文就从实践的角度对RF做一个总结.重点讲述scikit-learn中RF的调参注 ...

  6. scikit-learn 梯度提升树(GBDT)调参小结

    在梯度提升树(GBDT)原理小结中,我们对GBDT的原理做了总结,本文我们就从scikit-learn里GBDT的类库使用方法作一个总结,主要会关注调参中的一些要点. 1. scikit-learn ...

  7. word2vec参数调整 及lda调参

     一.word2vec调参   ./word2vec -train resultbig.txt -output vectors.bin -cbow 0 -size 200 -window 5 -neg ...

  8. 漫谈PID——实现与调参

    闲话: 作为一个控制专业的学生,说起PID,真是让我又爱又恨.甚至有时候会觉得我可能这辈子都学不会pid了,但是经过一段时间的反复琢磨,pid也不是很复杂.所以在看懂pid的基础上,写下这篇文章,方便 ...

  9. hyperopt自动调参

    hyperopt自动调参 在传统机器学习和深度学习领域经常需要调参,调参有些是通过对数据和算法的理解进行的,这当然是上上策,但还有相当一部分属于"黑盒" hyperopt可以帮 ...

随机推荐

  1. Java源码阅读HashMap

    1类签名与注释 public class HashMap<K,V> extends AbstractMap<K,V> implements Map<K,V>, Cl ...

  2. 【Hadoop】Hadoop DataNode节点超时时间设置

    hadoop datanode节点超时时间设置 datanode进程死亡或者网络故障造成datanode无法与namenode通信,namenode不会立即把该节点判定为死亡,要经过一段时间,这段时间 ...

  3. Hadoop之Flume详解

    1.日志采集框架Flume 1.1 Flume介绍 Flume是一个分布式.可靠.和高可用的海量日志采集.聚合和传输的系统. Flume可以采集文件,socket数据包等各种形式源数据,又可以将采集到 ...

  4. 2017.9.15 mybatis批量插入后实现主键回填

    参考来自:mybatis mysql 批量insert 返回主键 注意:必须要在mybatis3.3.1及其以上才能实现. 1.service List branchEntryList = (Arra ...

  5. Hello,Android

    项目介绍 由于要参加某信息安全比赛,选择了安卓apk的行为分析与评估的课题,所以首先需要了解安卓程序是如何编写和执行的.我们的第一个任务就是写出一个多人通信的app. 我本人之前没有任何安卓和ja ...

  6. A read-only user or a user in a read-only database is not permitted to disable

    A read-only user or a user in a read-only database is not permitted to disable 出现如题的问题通常是由于db.lck的所属 ...

  7. mysql ubuntu 开启3306端口,设置远程访问

    远程登陆数据库的时候出现了下面出错信息 :ERROR 2003 ( HY000 ) : Can 't connect to MySQL server on ' xxx.xxx.xxx.xxx ',经过 ...

  8. PS如何为图片添加四面投影

    如图所示,像四周的投影 很像Areo效果的Windows7. 用这样的图片做成PNG透明的效果非常好. 我们不妨仔细研究上图的两个角,发现其实只是简单的投影效果而已. 简单的使用投影效果即可.注意混合 ...

  9. selenium从入门到应用 - 1,环境准备(Java+TestNG+Maven+Selenium)

    本系列所有代码 https://github.com/zhangting85/simpleWebtest 本文将介绍一个Java+TestNG+Maven+Selenium的web自动化测试脚本环境的 ...

  10. 如何创建JAR文件?如何运行.jar形式的Java程序?

    一.如何创建JAR文件? .jar是用来压缩档案或者解压档案的文件格式,其特点是具有无损压缩的功能.想知道如何创建这种程序?请访问 http://www.cnblogs.com/yjmyzz/p/ex ...