import xlrd
import time
import sys
import os
import requests
import sqlite3
import threading

# Directory holding this script; the SQLite DB and data folders are addressed
# relative to it further down in the file.
curPath = os.path.abspath(os.path.dirname(__file__))
# Parent of that directory, appended to the import search path.
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)

# Upper bound on `today_used` for a key to still be selectable.
MAX_USED_TIMES = 1900
overrun_str = '天配额超限,限制访问'
# Sentinel returned instead of a key when no selectable key is left.
DB_KEY_EXHAUST = 'DB_KEY_EXHAUST'
# Wall-clock "HHMMSS" value at which the usage counters are reset.
next_day_tag = '000003'

db = 'py_bdspider_status.db'
db = '%s\\%s' % (curPath, db)


def db_chk_one_exist(key):
    """Tell whether *key* already has a row in ``baidu_map_key_used``.

    Args:
        key: API key string to look up.

    Returns:
        1 if a row with that key exists, otherwise 0.
    """
    conn = sqlite3.connect(db)
    try:
        # Bound parameter instead of string interpolation: the original put
        # the value inside double quotes, which SQLite may resolve as a
        # column name, and it broke on keys containing a quote character.
        row = conn.execute(
            'SELECT key FROM baidu_map_key_used WHERE key=?', (key,)
        ).fetchone()
    finally:
        # The original wrote `conn.close` without parentheses, so the
        # connection was never actually closed.
        conn.close()
    return 0 if row is None else 1


# def db_init_key_table():
# conn = sqlite3.connect(db)
# c = conn.cursor()
# k_file = '%s\\%s' % (curPath, 'bdmap_key.txt')
# with open(k_file, 'r', encoding='utf-8') as pf:
# for i in pf:
# if len(i) < 4:
# continue
# author, key = i.replace(' ', '').replace('\n', '').replace('\t', '').split(';')
# r = db_chk_one_exist(key)
# if r == 0:
# localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
# sql = 'INSERT INTO baidu_map_key_used (author,key,update_time,today_used) VALUES ("%s","%s","%s",%s) ' % (
# author, key, localtime_, 0)
# c.execute(sql)
# conn.commit()
# conn.close()
# pf.close()
#
#
# db_init_key_table()


def db_recovery_bdkeynum():
    """Reset every key's ``today_used`` counter at the daily reset second.

    Does nothing unless the current local time, formatted as ``HHMMSS``,
    equals ``next_day_tag``.

    NOTE(review): the comparison is an exact one-second match, so the reset
    only happens if a call lands in that very second — confirm this is the
    intended trigger.
    """
    if time.strftime("%H%M%S", time.localtime()) != next_day_tag:
        return
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    conn = sqlite3.connect(db)
    try:
        # The original interpolated the timestamp digits unquoted, i.e. as a
        # numeric literal; binding an int keeps the stored value the same.
        conn.execute(
            'UPDATE baidu_map_key_used SET today_used = 0 ,update_time=?',
            (int(localtime_),),
        )
        conn.commit()
    finally:
        # Release the connection even when the UPDATE fails.
        conn.close()
def db_get_one_effective():
    """Pick the least-used API key that is still selectable.

    Calls ``db_recovery_bdkeynum()`` first, then selects the row with the
    smallest ``today_used`` among those with ``today_used <= MAX_USED_TIMES``.

    Returns:
        The key string, or ``DB_KEY_EXHAUST`` when no row qualifies.
    """
    db_recovery_bdkeynum()
    conn = sqlite3.connect(db)
    try:
        row = conn.execute(
            'SELECT key FROM baidu_map_key_used WHERE today_used<=? ORDER BY today_used ASC',
            (MAX_USED_TIMES,),
        ).fetchone()
    finally:
        # Release the connection even when the query fails.
        conn.close()
    return DB_KEY_EXHAUST if row is None else row[0]


def db_update_one_today_used(key):
    """Increment ``today_used`` for *key* and stamp ``update_time``.

    Args:
        key: API key whose usage counter is incremented by one.
    """
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    conn = sqlite3.connect(db)
    try:
        # Bound parameters replace the string-built statement. The timestamp
        # is bound as an int because the original interpolated it unquoted
        # (a numeric literal); the key was double-quoted, which SQLite may
        # resolve as a column name.
        conn.execute(
            'UPDATE baidu_map_key_used SET today_used = today_used+1 ,update_time=? WHERE key=?',
            (int(localtime_), key),
        )
        conn.commit()
    finally:
        conn.close()


dir_, dir_exception, requested_file_list = 'baidu_map_uid', 'baidu_map_uid_exception', []
requested_file_dir_str, requested_file_dir_exception_str = '%s\\%s\\' % (curPath, dir_), '%s\\%s\\' % (
    curPath, dir_exception)
requested_file_dir = os.listdir(requested_file_dir_str)

# Substrings whose presence marks a response as an error rather than data.
ex_l = ['Proxy Error', 'APP IP校验失败', 'APP不存在,AK有误请检查再重试', 'The requested URL could not be retrieved',
        'Address already in use', '天配额超限,限制访问', 'Parameter Invalid']


def rm_invalid_file(file_postfix='.txt'):
    """Delete saved response files whose text contains an error marker.

    Reads every file in the ``dir_`` folder beside this script whose
    extension equals *file_postfix* and removes it when any entry of
    ``ex_l`` occurs in its content.

    Args:
        file_postfix: Extension (with the leading dot) of files to inspect.
    """
    filepath = '%s\\%s\\' % (curPath, dir_)
    for fname in os.listdir(filepath):
        fdir_o = '%s%s' % (filepath, fname)
        if os.path.splitext(fdir_o)[1] != file_postfix:
            continue
        with open(fdir_o, 'r', encoding='utf-8') as ft:
            jstr = ft.read()
        # Stop at the first matching marker: the original kept looping after
        # deleting the file, so a second match called os.stat() on a path
        # that no longer existed and raised.
        ex = next((marker for marker in ex_l if marker in jstr), None)
        if ex is None:
            continue
        try:
            statinfo = os.stat(fdir_o)
            strftime_st_ctime = time.strftime("%y%m%d%H%M%S", time.localtime(statinfo.st_ctime))
            os.remove(fdir_o)
            print('remove', ex, strftime_st_ctime, fdir_o)
        except OSError:
            # The file could not be inspected or removed; report and move on.
            print('multiprocess--multithreading--', fdir_o)


rm_invalid_file()


def gen_requested_file_list(file_postfix='.html'):
    """Collect the request names that already have a saved response file.

    Each entry name in the ``dir_`` folder is cut at the first ``'&'`` and
    then at the first *file_postfix*; the remaining prefix is appended to the
    module-level ``requested_file_list`` unless it is already present.

    Args:
        file_postfix: Extension used to cut the request name out of a file
            name.
    """
    filepath = '%s\\%s' % (curPath, dir_)
    # Set mirror of the list: O(1) duplicate checks while preserving the
    # list itself (other code tests membership on it).
    known = set(requested_file_list)
    for allDir in os.listdir(filepath):
        # Work on the bare file name. The original split the full path on
        # `dir_`, which picks the wrong piece whenever `curPath` itself
        # contains that folder name.
        requested_file = allDir.split('&')[0].split(file_postfix)[0]
        if requested_file not in known:
            known.add(requested_file)
            requested_file_list.append(requested_file)


file_postfix_l = ['.txt']
for i in file_postfix_l:
    gen_requested_file_list(i)


def gen_file_data(fname_source, file_type='.xlsx'):
    """Read the first worksheet of a workbook stored beside this script.

    Args:
        fname_source: Workbook file name without its extension.
        file_type: Extension appended to *fname_source*.

    Returns:
        A list with one entry per sheet row; each entry is a list holding
        ``str(cell.value)`` for every cell of that row.
    """
    excel_ = '%s\\%s%s' % (curPath, fname_source, file_type)
    book = xlrd.open_workbook(excel_, on_demand=True)
    sheet = book.sheet_by_index(0)
    data = []
    for row_idx in range(sheet.nrows):
        data.append([str(cell.value) for cell in sheet.row(row_idx)])
    book.release_resources()
    del book
    return data


request_dic = {}
target_type_list = []
target_type_except_list = []

fname_source = '【SOURCE】采集员新增任务133598条-楼宇归集-互异百度uid数51700'
data_selfadd = gen_file_data(fname_source)


def replace_illeagl_tag(str_):
    """Return *str_* with every space, newline and tab character removed."""
    return str_.translate({ord(ch): None for ch in (' ', '\n', '\t')})


# Sample building names (each marked "ok" by the original author):
#   碧海富通城三期(3栋) ok
#   =碧海富通城-三期(3栋) ok
# Characters removed from a building name before it is used as a query.
replace_to_empty_l = [' ', '|', '\t', '\n', '/', '?', '?', '·', '.']


def gen_bd_query_origin_name(name_):
    """Strip the characters in ``replace_to_empty_l`` from a building name.

    Args:
        name_: Raw building name.

    Returns:
        The name with all listed characters removed.
    """
    for i in replace_to_empty_l:
        name_ = name_.replace(i, '')
    # NOTE(review): as written, the two bracket replacements map a character
    # to itself and '?' has already been removed by the loop above, so this
    # chain changes nothing. Presumably the originals were full-width
    # characters lost in an encoding conversion — confirm before editing.
    return name_.replace('(', '(').replace(')', ')').replace('?', '').replace('?', '')


# Group the not-yet-downloaded names as request_dic[city][district] -> [names].
# A set copy of the downloaded names gives O(1) lookups per row instead of a
# full scan of `requested_file_list`.
# Filters the original author left disabled: keep only city '深圳市'; skip
# rows whose uid is non-empty.
_requested_names = set(requested_file_list)
for l in data_selfadd:
    dbid, area_code, type_, city, district, uid, name_, address, street, request_name, submit_time = l
    request_name = gen_bd_query_origin_name(name_)
    request_name_chk = '%s%s%s' % (city, district, request_name)
    if request_name_chk in _requested_names:
        continue
    # setdefault replaces the original's "assign {} then immediately
    # overwrite with []" sequence.
    district_names = request_dic.setdefault(city, {}).setdefault(district, [])
    if request_name not in district_names:
        district_names.append(request_name)
del data_selfadd

fname_source = '【TEAM】41876条JMTool官方数据百度POIuid_添加率0.9388_170830171339'
data_jmtool = gen_file_data(fname_source)

# Add the not-yet-downloaded names of the second workbook to
# request_dic[city][district] -> [names].
# A set copy of the downloaded names gives O(1) lookups per row instead of a
# full scan of `requested_file_list`.
# Filters the original author left disabled: skip rows whose uid is
# non-empty; keep only city '深圳市'.
_requested_names = set(requested_file_list)
for l in data_jmtool:
    (dbid, area_code, name_, request_name, type_, city, district, addr, street,
     bd_status, bd_message, bd_res_str, city_bd, district_bd, business_bd,
     cityid_bd, name_bd, uid, lat_bd, lng_bd, compute_res, name_ratio_res,
     combine_ratio_res, uid_href) = l
    request_name = gen_bd_query_origin_name(name_)
    request_name_chk = '%s%s%s' % (city, district, request_name)
    if request_name_chk in _requested_names:
        continue
    # setdefault replaces the original's "assign {} then immediately
    # overwrite with []" sequence.
    district_names = request_dic.setdefault(city, {}).setdefault(district, [])
    if request_name not in district_names:
        district_names.append(request_name)
del data_jmtool

write_res_file_dir = '%s\\%s\\' % (curPath, dir_)


def write_res_file(input_, str_, dir_=write_res_file_dir, file_postfix='.txt'):
    """Save a response body to ``<dir_><input_><file_postfix>``.

    Nothing is written when *str_* contains any marker from ``ex_l``; the
    match is printed instead.

    Args:
        input_: File name stem.
        str_: Text to store.
        dir_: Target directory string, expected to end with a separator
            because it is concatenated directly with *input_*.
        file_postfix: Extension appended to the file name.
    """
    for ex in ex_l:
        if ex in str_:
            # The original declared `global ak, url_`, but nothing in this
            # file assigns those names at module level, so this branch raised
            # NameError. Look them up defensively (None when absent).
            module_vars = globals()
            print('EXCEPTION-', ex, 'AK-', module_vars.get('ak'), 'URL-', module_vars.get('url_'))
            return
    fname = '%s%s%s' % (dir_, input_, file_postfix)
    # The with-block closes the file; the extra ft.close() was redundant.
    with open(fname, 'w', encoding='utf-8') as ft:
        ft.write(str_)
    print('ok', threading.get_ident(), input_)


class MyThread(threading.Thread):
    """Thread that runs ``func(args)``, passing *args* as ONE argument."""

    def __init__(self, func, args, name):
        """Store the callable, its single argument and the thread name."""
        threading.Thread.__init__(self)
        self.name, self.func, self.args = name, func, args

    def run(self):
        """Invoke the stored callable with the stored argument."""
        self.func(self.args)


# Example request (API key redacted):
# http://api.map.baidu.com/place/v2/suggestion?query=瀛嘉天下&region=重庆市&city_limit=true&output=json&ak=<AK>
base_url = 'http://api.map.baidu.com/place/v2/suggestion?query=R-QUERY&region=R-CITY&city_limit=true&output=json&ak=R-AK'

# Seconds to wait for the API before giving up; without a timeout a stalled
# connection would block its worker thread (and the final join) forever.
_REQUEST_TIMEOUT = 30


def _build_request(request_name, city, ak):
    """Split ``base_url`` into its endpoint and a filled-in parameter dict."""
    endpoint, _, template = base_url.partition('?')
    substitutions = {'R-QUERY': request_name, 'R-CITY': city, 'R-AK': ak}
    params = {}
    for pair in template.split('&'):
        field, _, value = pair.partition('=')
        params[field] = substitutions.get(value, value)
    return endpoint, params


def fun_(city):
    """Download and save the suggestion response for each name of *city*.

    Walks ``request_dic[city]``, skips names that already have a saved file,
    takes an API key from the database for every request and stores the
    response through ``write_res_file``. Stops as soon as no key is left.

    Args:
        city: A key of ``request_dic``.
    """
    already_saved = set(requested_file_list)
    for district, names in request_dic[city].items():
        for request_name in names:
            request_name_chk = '%s%s%s' % (city, district, request_name)
            if request_name_chk in already_saved:
                continue
            ak = db_get_one_effective()
            if ak == DB_KEY_EXHAUST:
                print(DB_KEY_EXHAUST)
                # The original `break` only left the inner loop, so every
                # remaining district queried the database again.
                return
            # Passing values as params lets requests percent-encode them; the
            # original string replacement sent names containing '&' or '#'
            # as a corrupted query string.
            endpoint, params = _build_request(request_name, city, ak)
            try:
                response = requests.get(endpoint, params=params, timeout=_REQUEST_TIMEOUT)
                bd_res_json_str = response.text
                db_update_one_today_used(ak)
                # Report error responses here, where the key and URL are
                # known: write_res_file reads `ak`/`url_` as module globals,
                # which this function never sets.
                marker = next((ex for ex in ex_l if ex in bd_res_json_str), None)
                if marker is not None:
                    print('EXCEPTION-', marker, 'AK-', ak, 'URL-', response.url)
                    continue
                write_res_file(request_name_chk, bd_res_json_str)
            except Exception as exc:
                # Worker boundary: record the failure and keep going with the
                # next name instead of killing the thread.
                bd_res_json_str = '请求百度-异常'
                write_res_file(request_name_chk, bd_res_json_str, requested_file_dir_exception_str)
                print(request_name_chk, bd_res_json_str, repr(exc))


# Optional CLI arguments: 1-based first and last city index to process.
try:
    start_loop, stop_loop = int(sys.argv[1]), int(sys.argv[2])
except (IndexError, ValueError):
    start_loop, stop_loop = -1, 200


def main():
    """Run one worker thread per selected city and wait for all of them.

    Cities are taken in sorted order and numbered from 1; only those whose
    number lies within ``[start_loop, stop_loop]`` get a thread.
    """
    threads_list = []
    for nloop, city in enumerate(sorted(request_dic), start=1):
        if nloop < start_loop or nloop > stop_loop:
            continue
        threads_list.append(MyThread(fun_, city, fun_.__name__))
    for t in threads_list:
        # The original assigned `t.setDaemon = False`, which replaced the
        # method instead of setting the flag.
        t.daemon = False
        t.start()
    for t in threads_list:
        t.join()


if __name__ == '__main__':
    main()

  

import time
import sys
import os

# Directory holding this script; the data folders are addressed relative to it.
curPath = os.path.abspath(os.path.dirname(__file__))
# Parent of that directory, appended to the import search path.
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)

# Folder names for saved responses and for failed requests.
dir_ = 'baidu_map_uid'
dir_exception = 'baidu_map_uid_exception'
requested_file_list = []

# Both strings end with a separator so a file name can be appended directly.
requested_file_dir_str = '%s\\%s\\' % (curPath, dir_)
requested_file_dir_exception_str = '%s\\%s\\' % (curPath, dir_exception)
requested_file_dir = os.listdir(requested_file_dir_str)

# Substrings whose presence marks a response as an error rather than data.
ex_l = ['Proxy Error', 'APP IP校验失败', 'APP不存在,AK有误请检查再重试', 'The requested URL could not be retrieved',
        'Address already in use', '天配额超限,限制访问', 'Parameter Invalid']


def rm_invalid_file(file_postfix='.txt'):
    """Delete saved response files whose text contains an error marker.

    Reads every file in the ``dir_`` folder beside this script whose
    extension equals *file_postfix* and removes it when any entry of
    ``ex_l`` occurs in its content.

    Args:
        file_postfix: Extension (with the leading dot) of files to inspect.
    """
    filepath = '%s\\%s\\' % (curPath, dir_)
    for fname in os.listdir(filepath):
        fdir_o = '%s%s' % (filepath, fname)
        if os.path.splitext(fdir_o)[1] != file_postfix:
            continue
        with open(fdir_o, 'r', encoding='utf-8') as ft:
            jstr = ft.read()
        # Stop at the first matching marker: the original kept looping after
        # deleting the file, so a second match ran os.stat()/os.remove() on a
        # path that no longer existed and raised FileNotFoundError.
        ex = next((marker for marker in ex_l if marker in jstr), None)
        if ex is None:
            continue
        try:
            statinfo = os.stat(fdir_o)
            strftime_st_ctime = time.strftime("%y%m%d%H%M%S", time.localtime(statinfo.st_ctime))
            os.remove(fdir_o)
            print('remove', ex, strftime_st_ctime, fdir_o)
        except OSError:
            # Same report as the guarded variant of this function elsewhere
            # in the file: note the file and continue with the next one.
            print('multiprocess--multithreading--', fdir_o)


rm_invalid_file()

  

import xlrd
import time
import sys
import os
import requests
import sqlite3
import threading

# Directory holding this script; the SQLite DB and data folders are addressed
# relative to it further down in the file.
curPath = os.path.abspath(os.path.dirname(__file__))
# Parent of that directory, appended to the import search path.
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)

# Upper bound on `today_used` for a key to still be selectable.
MAX_USED_TIMES = 1900
overrun_str = '天配额超限,限制访问'
# Sentinel returned instead of a key when no selectable key is left.
DB_KEY_EXHAUST = 'DB_KEY_EXHAUST'
# Wall-clock "HHMMSS" value at which the usage counters are reset.
next_day_tag = '000003'

db = 'py_bdspider_status.db'
db = '%s\\%s' % (curPath, db)


def db_chk_one_exist(key):
    """Tell whether *key* already has a row in ``baidu_map_key_used``.

    Args:
        key: API key string to look up.

    Returns:
        1 if a row with that key exists, otherwise 0.
    """
    conn = sqlite3.connect(db)
    try:
        # Bound parameter instead of string interpolation: the original put
        # the value inside double quotes, which SQLite may resolve as a
        # column name, and it broke on keys containing a quote character.
        row = conn.execute(
            'SELECT key FROM baidu_map_key_used WHERE key=?', (key,)
        ).fetchone()
    finally:
        # The original wrote `conn.close` without parentheses, so the
        # connection was never actually closed.
        conn.close()
    return 0 if row is None else 1


# def db_init_key_table():
# conn = sqlite3.connect(db)
# c = conn.cursor()
# k_file = '%s\\%s' % (curPath, 'bdmap_key.txt')
# with open(k_file, 'r', encoding='utf-8') as pf:
# for i in pf:
# if len(i) < 4:
# continue
# author, key = i.replace(' ', '').replace('\n', '').replace('\t', '').split(';')
# r = db_chk_one_exist(key)
# if r == 0:
# localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
# sql = 'INSERT INTO baidu_map_key_used (author,key,update_time,today_used) VALUES ("%s","%s","%s",%s) ' % (
# author, key, localtime_, 0)
# c.execute(sql)
# conn.commit()
# conn.close()
# pf.close()
#
#
# db_init_key_table()


def db_recovery_bdkeynum():
    """Reset every key's ``today_used`` counter at the daily reset second.

    Does nothing unless the current local time, formatted as ``HHMMSS``,
    equals ``next_day_tag``.

    NOTE(review): the comparison is an exact one-second match, so the reset
    only happens if a call lands in that very second — confirm this is the
    intended trigger.
    """
    if time.strftime("%H%M%S", time.localtime()) != next_day_tag:
        return
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    conn = sqlite3.connect(db)
    try:
        # The original interpolated the timestamp digits unquoted, i.e. as a
        # numeric literal; binding an int keeps the stored value the same.
        conn.execute(
            'UPDATE baidu_map_key_used SET today_used = 0 ,update_time=?',
            (int(localtime_),),
        )
        conn.commit()
    finally:
        # Release the connection even when the UPDATE fails.
        conn.close()
def db_get_one_effective():
    """Pick the least-used API key that is still selectable.

    Calls ``db_recovery_bdkeynum()`` first, then selects the row with the
    smallest ``today_used`` among those with ``today_used <= MAX_USED_TIMES``.

    Returns:
        The key string, or ``DB_KEY_EXHAUST`` when no row qualifies.
    """
    db_recovery_bdkeynum()
    conn = sqlite3.connect(db)
    try:
        row = conn.execute(
            'SELECT key FROM baidu_map_key_used WHERE today_used<=? ORDER BY today_used ASC',
            (MAX_USED_TIMES,),
        ).fetchone()
    finally:
        # Release the connection even when the query fails.
        conn.close()
    return DB_KEY_EXHAUST if row is None else row[0]


def db_update_one_today_used(key):
    """Increment ``today_used`` for *key* and stamp ``update_time``.

    Args:
        key: API key whose usage counter is incremented by one.
    """
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    conn = sqlite3.connect(db)
    try:
        # Bound parameters replace the string-built statement. The timestamp
        # is bound as an int because the original interpolated it unquoted
        # (a numeric literal); the key was double-quoted, which SQLite may
        # resolve as a column name.
        conn.execute(
            'UPDATE baidu_map_key_used SET today_used = today_used+1 ,update_time=? WHERE key=?',
            (int(localtime_), key),
        )
        conn.commit()
    finally:
        conn.close()


dir_, dir_exception, requested_file_list = 'baidu_map_uid', 'baidu_map_uid_exception', []
# Both strings end with a separator so a file name can be appended directly.
requested_file_dir_str = '%s\\%s\\' % (curPath, dir_)
requested_file_dir_exception_str = '%s\\%s\\' % (curPath, dir_exception)
requested_file_dir = os.listdir(requested_file_dir_str)

# Substrings whose presence marks a response as an error rather than data.
ex_l = [
    'Proxy Error',
    'APP IP校验失败',
    'APP不存在,AK有误请检查再重试',
    'The requested URL could not be retrieved',
    'Address already in use',
    '天配额超限,限制访问',
    'Parameter Invalid',
]
def rm_invalid_file(file_postfix='.txt'):
    """Delete saved response files whose text contains an error marker.

    Reads every file in the ``dir_`` folder beside this script whose
    extension equals *file_postfix* and removes it when any entry of
    ``ex_l`` occurs in its content.

    Args:
        file_postfix: Extension (with the leading dot) of files to inspect.
    """
    filepath = '%s\\%s\\' % (curPath, dir_)
    for fname in os.listdir(filepath):
        fdir_o = '%s%s' % (filepath, fname)
        if os.path.splitext(fdir_o)[1] != file_postfix:
            continue
        with open(fdir_o, 'r', encoding='utf-8') as ft:
            jstr = ft.read()
        # Stop at the first matching marker: the original kept looping after
        # deleting the file, so a second match ran os.stat()/os.remove() on a
        # path that no longer existed and raised FileNotFoundError.
        ex = next((marker for marker in ex_l if marker in jstr), None)
        if ex is None:
            continue
        try:
            statinfo = os.stat(fdir_o)
            strftime_st_ctime = time.strftime("%y%m%d%H%M%S", time.localtime(statinfo.st_ctime))
            os.remove(fdir_o)
            print('remove', ex, strftime_st_ctime, fdir_o)
        except OSError:
            # Same report as the guarded variant of this function elsewhere
            # in the file: note the file and continue with the next one.
            print('multiprocess--multithreading--', fdir_o)


rm_invalid_file()


def gen_requested_file_list(file_postfix='.html'):
    """Collect the request names that already have a saved response file.

    Each entry name in the ``dir_`` folder is cut at the first ``'&'`` and
    then at the first *file_postfix*; the remaining prefix is appended to the
    module-level ``requested_file_list`` unless it is already present.

    Args:
        file_postfix: Extension used to cut the request name out of a file
            name.
    """
    filepath = '%s\\%s' % (curPath, dir_)
    # Set mirror of the list: O(1) duplicate checks while preserving the
    # list itself (other code tests membership on it).
    known = set(requested_file_list)
    for allDir in os.listdir(filepath):
        # Work on the bare file name. The original split the full path on
        # `dir_`, which picks the wrong piece whenever `curPath` itself
        # contains that folder name.
        requested_file = allDir.split('&')[0].split(file_postfix)[0]
        if requested_file not in known:
            known.add(requested_file)
            requested_file_list.append(requested_file)


file_postfix_l = ['.txt']
for i in file_postfix_l:
    gen_requested_file_list(i)


def gen_file_data(fname_source, file_type='.xlsx'):
    """Read the first worksheet of a workbook stored beside this script.

    Args:
        fname_source: Workbook file name without its extension.
        file_type: Extension appended to *fname_source*.

    Returns:
        A list with one entry per sheet row; each entry is a list holding
        ``str(cell.value)`` for every cell of that row.
    """
    excel_ = '%s\\%s%s' % (curPath, fname_source, file_type)
    book = xlrd.open_workbook(excel_, on_demand=True)
    sheet = book.sheet_by_index(0)
    data = []
    for row_idx in range(sheet.nrows):
        data.append([str(cell.value) for cell in sheet.row(row_idx)])
    book.release_resources()
    del book
    return data


request_dic = {}
target_type_list = []
target_type_except_list = []

fname_source = '【SOURCE】采集员新增任务133598条-楼宇归集-互异百度uid数51700'
data_selfadd = gen_file_data(fname_source)


def replace_illeagl_tag(str_):
    """Return *str_* with every space, newline and tab character removed."""
    return str_.translate({ord(ch): None for ch in (' ', '\n', '\t')})


# Sample building names (each marked "ok" by the original author):
#   碧海富通城三期(3栋) ok
#   =碧海富通城-三期(3栋) ok
# Characters removed from a building name before it is used as a query.
replace_to_empty_l = [' ', '|', '\t', '\n', '/', '?', '?', '·', '.']


def gen_bd_query_origin_name(name_):
    """Strip the characters in ``replace_to_empty_l`` from a building name.

    Args:
        name_: Raw building name.

    Returns:
        The name with all listed characters removed.
    """
    for i in replace_to_empty_l:
        name_ = name_.replace(i, '')
    # NOTE(review): as written, the two bracket replacements map a character
    # to itself and '?' has already been removed by the loop above, so this
    # chain changes nothing. Presumably the originals were full-width
    # characters lost in an encoding conversion — confirm before editing.
    return name_.replace('(', '(').replace(')', ')').replace('?', '').replace('?', '')


# Group the not-yet-downloaded names as request_dic[city][district] -> [names].
# A set copy of the downloaded names gives O(1) lookups per row instead of a
# full scan of `requested_file_list`.
# Filters the original author left disabled: keep only city '深圳市'; skip
# rows whose uid is non-empty.
_requested_names = set(requested_file_list)
for l in data_selfadd:
    dbid, area_code, type_, city, district, uid, name_, address, street, request_name, submit_time = l
    request_name = gen_bd_query_origin_name(name_)
    request_name_chk = '%s%s%s' % (city, district, request_name)
    if request_name_chk in _requested_names:
        continue
    # setdefault replaces the original's "assign {} then immediately
    # overwrite with []" sequence.
    district_names = request_dic.setdefault(city, {}).setdefault(district, [])
    if request_name not in district_names:
        district_names.append(request_name)
del data_selfadd

fname_source = '【TEAM】41876条JMTool官方数据百度POIuid_添加率0.9388_170830171339'
data_jmtool = gen_file_data(fname_source)

# Add the not-yet-downloaded names of the second workbook to
# request_dic[city][district] -> [names].
# A set copy of the downloaded names gives O(1) lookups per row instead of a
# full scan of `requested_file_list`.
# Filters the original author left disabled: skip rows whose uid is
# non-empty; keep only city '深圳市'.
_requested_names = set(requested_file_list)
for l in data_jmtool:
    (dbid, area_code, name_, request_name, type_, city, district, addr, street,
     bd_status, bd_message, bd_res_str, city_bd, district_bd, business_bd,
     cityid_bd, name_bd, uid, lat_bd, lng_bd, compute_res, name_ratio_res,
     combine_ratio_res, uid_href) = l
    request_name = gen_bd_query_origin_name(name_)
    request_name_chk = '%s%s%s' % (city, district, request_name)
    if request_name_chk in _requested_names:
        continue
    # setdefault replaces the original's "assign {} then immediately
    # overwrite with []" sequence.
    district_names = request_dic.setdefault(city, {}).setdefault(district, [])
    if request_name not in district_names:
        district_names.append(request_name)
del data_jmtool

write_res_file_dir = '%s\\%s\\' % (curPath, dir_)


def write_res_file(input_, str_, dir_=write_res_file_dir, file_postfix='.txt'):
    """Save a response body to ``<dir_><input_><file_postfix>``.

    Nothing is written when *str_* contains any marker from ``ex_l``; the
    match is printed instead.

    Args:
        input_: File name stem.
        str_: Text to store.
        dir_: Target directory string, expected to end with a separator
            because it is concatenated directly with *input_*.
        file_postfix: Extension appended to the file name.
    """
    for ex in ex_l:
        if ex in str_:
            # The original declared `global ak, url_`, but nothing in this
            # file assigns those names at module level, so this branch raised
            # NameError. Look them up defensively (None when absent).
            module_vars = globals()
            print('EXCEPTION-', ex, 'AK-', module_vars.get('ak'), 'URL-', module_vars.get('url_'))
            return
    fname = '%s%s%s' % (dir_, input_, file_postfix)
    # The with-block closes the file; the extra ft.close() was redundant.
    with open(fname, 'w', encoding='utf-8') as ft:
        ft.write(str_)
    print('ok', threading.get_ident(), input_)


class MyThread(threading.Thread):
    """Thread that runs ``func(args)``, passing *args* as ONE argument."""

    def __init__(self, func, args, name):
        """Store the callable, its single argument and the thread name."""
        threading.Thread.__init__(self)
        self.name, self.func, self.args = name, func, args

    def run(self):
        """Invoke the stored callable with the stored argument."""
        self.func(self.args)


# Example request (API key redacted):
# http://api.map.baidu.com/place/v2/suggestion?query=瀛嘉天下&region=重庆市&city_limit=true&output=json&ak=<AK>
base_url = 'http://api.map.baidu.com/place/v2/suggestion?query=R-QUERY&region=R-CITY&city_limit=true&output=json&ak=R-AK'

# Seconds to wait for the API before giving up; without a timeout a stalled
# connection would block its worker thread (and the final join) forever.
_REQUEST_TIMEOUT = 30


def _build_request(request_name, city, ak):
    """Split ``base_url`` into its endpoint and a filled-in parameter dict."""
    endpoint, _, template = base_url.partition('?')
    substitutions = {'R-QUERY': request_name, 'R-CITY': city, 'R-AK': ak}
    params = {}
    for pair in template.split('&'):
        field, _, value = pair.partition('=')
        params[field] = substitutions.get(value, value)
    return endpoint, params


def fun_(city):
    """Download and save the suggestion response for each name of *city*.

    Walks ``request_dic[city]``, skips names that already have a saved file,
    takes an API key from the database for every request and stores the
    response through ``write_res_file``. Stops as soon as no key is left.

    Args:
        city: A key of ``request_dic``.
    """
    already_saved = set(requested_file_list)
    for district, names in request_dic[city].items():
        for request_name in names:
            request_name_chk = '%s%s%s' % (city, district, request_name)
            if request_name_chk in already_saved:
                continue
            ak = db_get_one_effective()
            if ak == DB_KEY_EXHAUST:
                print(DB_KEY_EXHAUST)
                # The original `break` only left the inner loop, so every
                # remaining district queried the database again.
                return
            # Passing values as params lets requests percent-encode them; the
            # original string replacement sent names containing '&' or '#'
            # as a corrupted query string.
            endpoint, params = _build_request(request_name, city, ak)
            try:
                response = requests.get(endpoint, params=params, timeout=_REQUEST_TIMEOUT)
                bd_res_json_str = response.text
                db_update_one_today_used(ak)
                # Report error responses here, where the key and URL are
                # known: write_res_file reads `ak`/`url_` as module globals,
                # which this function never sets.
                marker = next((ex for ex in ex_l if ex in bd_res_json_str), None)
                if marker is not None:
                    print('EXCEPTION-', marker, 'AK-', ak, 'URL-', response.url)
                    continue
                write_res_file(request_name_chk, bd_res_json_str)
            except Exception as exc:
                # Worker boundary: record the failure and keep going with the
                # next name instead of killing the thread.
                bd_res_json_str = '请求百度-异常'
                write_res_file(request_name_chk, bd_res_json_str, requested_file_dir_exception_str)
                print(request_name_chk, bd_res_json_str, repr(exc))


# Optional CLI arguments: 1-based first and last city index to process.
try:
    start_loop, stop_loop = int(sys.argv[1]), int(sys.argv[2])
except (IndexError, ValueError):
    start_loop, stop_loop = -1, 200


def main():
    """Run one worker thread per selected city and wait for all of them.

    Cities are taken in sorted order and numbered from 1; only those whose
    number lies within ``[start_loop, stop_loop]`` get a thread.
    """
    threads_list = []
    for nloop, city in enumerate(sorted(request_dic), start=1):
        if nloop < start_loop or nloop > stop_loop:
            continue
        threads_list.append(MyThread(fun_, city, fun_.__name__))
    for t in threads_list:
        # The original assigned `t.setDaemon = False`, which replaced the
        # method instead of setting the flag.
        t.daemon = False
        t.start()
    for t in threads_list:
        t.join()


if __name__ == '__main__':
    main()

  

rm_invalid_file的更多相关文章

随机推荐

  1. 【Hadoop】如何形象描述大数据生态?

    作者:千岁大王链接:https://www.zhihu.com/question/27974418/answer/39845635来源:知乎著作权归作者所有,转载请联系作者获得授权. Google内部 ...

  2. win8.1使用WP8SDK出现Windows Phone Emulator无法启动的问题解决方案

    近期在win8.1专业版系统的vs2012上装了wp8SDK 体验一把wp开发的快感 安装sdk过程一切顺利 打完代码之后运行调试 问题来了: 提示如下错误 遂百度之 主要的方法就是两步 1.检查机器 ...

  3. 转:MVVM的基本入门简介

    https://mp.weixin.qq.com/s?__biz=MzA3MjA4NjE3NQ==&mid=404502568&idx=1&sn=fe512f9820b99d3 ...

  4. 往MySQL数据库datetime类型字段中插入数据库的当前时间

    代码: StringBuilder sb = new StringBuilder(); sb.append(" insert into uosdetailfile ("); sb. ...

  5. 系统封装 如何打造原生WINPE

    1 安装微软的AIK(Windows Automated Installation Kit,Windows自动安装工具包),AIK简体中文版下载地址: http://download.microsof ...

  6. Java 循环结构 - for, while 及 do...while

    Java 循环结构 - for, while 及 do...while 顺序结构的程序语句只能被执行一次.如果您想要同样的操作执行多次,,就需要使用循环结构. Java中有三种主要的循环结构: whi ...

  7. ping百度不通的解决方案

    1.ping  www.baidu.com unknow baidu.com 第一步,确定是否能ping通网关 第二步,确定是否能直接ping通外网 如ping 8.8.8.8 第三步,如果上面两个都 ...

  8. IIS5.1、IIS6.0、IIS7.5中安装配置MVC 3

    本文主要介绍在IIS5.1.IIS6.0.IIS7.5中安装配置MVC 3的具体办法! 正文: IIS5.1 1. 安装Microsoft .net FrameWork 4.0安装包; 2. 安装AS ...

  9. ASP.NET CORE RAZOR :个性化显示

    https://docs.microsoft.com/zh-cn/aspnet/core/tutorials/razor-pages/da1 我们的电影应用有个不错的开始,但是展示效果还不够理想. 我 ...

  10. 2.JAVA编程思想——一切都是对象

    一切都是对象 欢迎转载.转载请标明出处:http://blog.csdn.net/notbaron/article/details/51040221 虽然以C++为基础,但 Java 是一种更纯粹的面 ...