def gen_file_data(fodir, fname, sheet_index=0, ):
if fname.find('.xlsx') > -1:
fname_open = '%s\\%s' % (fodir, fname)
book = xlrd.open_workbook(fname_open, on_demand=True)
sheet = book.sheet_by_index(sheet_index)
data = [[str(c.value) for c in sheet.row(i)] for i in range(sheet.nrows)]
book.release_resources()
del book
elif fname.find('.csv') > -1:
data = []
fname_open = '%s\\%s' % (fodir, fname)
with open(fname_open, 'r', encoding='utf-8') as csvfile:
spamreader = csv.reader(csvfile, delimiter=',')
for row in spamreader:
data.append(row)
csvfile.close()
return data

  

import xlrd
import time
import sys
import os
import requests
import sqlite3
import threading
import math
import csv curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath) MAX_USED_TIMES, overrun_str, DB_KEY_EXHAUST, next_day_tag = 1900, '天配额超限,限制访问', 'DB_KEY_EXHAUST', '000003' db = 'py_bdspider_status.db'
db = '%s\\%s' % (curPath, db) def db_chk_one_exist(key):
conn = sqlite3.connect(db)
c = conn.cursor()
sql = 'SELECT key FROM baidu_map_key_used WHERE key="%s"' % (key)
r = 0
res = c.execute(sql).fetchone()
if res is not None:
r = 1
conn.close
return r # def db_init_key_table():
# conn = sqlite3.connect(db)
# c = conn.cursor()
# k_file = '%s\\%s' % (curPath, 'bdmap_key.txt')
# with open(k_file, 'r', encoding='utf-8') as pf:
# for i in pf:
# if len(i) < 4:
# continue
# author, key = i.replace(' ', '').replace('\n', '').replace('\t', '').split(';')
# r = db_chk_one_exist(key)
# if r == 0:
# localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
# sql = 'INSERT INTO baidu_map_key_used (author,key,update_time,today_used) VALUES ("%s","%s","%s",%s) ' % (
# author, key, localtime_, 0)
# c.execute(sql)
# conn.commit()
# conn.close()
# pf.close()
#
#
# db_init_key_table() def db_recovery_bdkeynum():
if time.strftime("%H%M%S", time.localtime()) == next_day_tag:
conn = sqlite3.connect(db)
c = conn.cursor()
localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
sql = 'UPDATE baidu_map_key_used SET today_used = 0 ,update_time=%s ' % (localtime_)
c.execute(sql)
conn.commit()
conn.close()
return def db_get_one_effective():
db_recovery_bdkeynum()
conn = sqlite3.connect(db)
c = conn.cursor()
sql = 'SELECT key FROM baidu_map_key_used WHERE today_used<=%s ORDER BY today_used ASC' % (MAX_USED_TIMES)
res, r = c.execute(sql).fetchone(), ''
if res is None:
r = DB_KEY_EXHAUST
else:
r = res[0]
conn.close()
return r def db_update_one_today_used(key):
conn = sqlite3.connect(db)
c = conn.cursor()
localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
sql = 'UPDATE baidu_map_key_used SET today_used = today_used+1 ,update_time=%s WHERE key="%s" ' % (
localtime_, key)
c.execute(sql)
conn.commit()
conn.close() dir_, dir_exception, requested_file_list = 'baidu_map_uid', 'baidu_map_uid_exception', []
requested_file_dir_str, requested_file_dir_exception_str = '%s\\%s\\' % (curPath, dir_), '%s\\%s\\' % (
curPath, dir_exception)
requested_file_dir = os.listdir(requested_file_dir_str) def gen_requested_file_list(file_postfix='.html'):
filepath = '%s\\%s' % (curPath, dir_)
pathDir = os.listdir(filepath)
for allDir in pathDir:
child = os.path.join('%s%s' % (filepath, allDir))
requested_file = child.split(dir_)[1].split('&')[0].split(file_postfix)[0]
if requested_file not in requested_file_list:
requested_file_list.append(requested_file) def gen_file_data(fodir, fname, sheet_index=0, ):
if fname.find('.xlsx') > -1:
fname_open = '%s\\%s' % (fodir, fname)
book = xlrd.open_workbook(fname_open, on_demand=True)
sheet = book.sheet_by_index(sheet_index)
data = [[str(c.value) for c in sheet.row(i)] for i in range(sheet.nrows)]
book.release_resources()
del book
elif fname.find('.csv') > -1:
data = []
fname_open = '%s\\%s' % (fodir, fname)
with open(fname_open, 'r', encoding='utf-8') as csvfile:
spamreader = csv.reader(csvfile, delimiter=',')
for row in spamreader:
data.append(row)
csvfile.close()
return data # 3 9
request_dic, target_type_list, target_type_except_list = {}, ['北京市', '上海市', '广州市'], ['火车站', '高铁站', '汽车站', '飞机场', '小学',
'幼儿园', '中学',
'综合医院', '商场']
# ['4s店','餐饮','家电','酒店','咖啡馆','售楼处','专科医院']
# ['住宅小区','写字楼'] # file_postfix_l = ['.html', '.txt']
# for i in file_postfix_l:
# gen_requested_file_list(i) fname_source = 'jfinder_public_jmtool_old_data.csv'
data_file = gen_file_data(curPath, fname_source) def replace_illeagl_tag(str_):
l = [' ', '\n', '\t']
for i in l:
str_ = str_.replace(i, '')
return str_ # 碧海富通城三期(3栋) ok
# =碧海富通城-三期(3栋) ok
replace_to_empty_l = [' ', '|', '\t', '\n', '/', '?', '?', '·', '.'] def gen_bd_query_origin_name(name_):
for i in replace_to_empty_l:
name_ = name_.replace(i, '')
return name_.replace('(', '(').replace(')', ')').replace('?', '').replace('?', '') for l in data_file:
# db_from, db_id, db_area_code, db_name, db_type_, db_city, db_district, db_address, db_street, db_uid, db_submit_time = l
# db_from, id, area_code, name, type_, city, district, address, street, uid, submit_time = l
dbid, area_code, uid, name_, type_en, city, district, address, street, db_from, submit_time, type_ = l if db_from == 'db_from':
continue
request_name = gen_bd_query_origin_name(name_)
input_ = '%s%s%s' % (city, district, request_name)
if input_ in requested_file_list:
print('requested', input_)
continue
if city not in request_dic:
request_dic[city] = {}
if district not in request_dic[city]:
request_dic[city][district] = {}
request_dic[city][district]['request_name_list'] = []
request_dic[city][district]['request_uid_list'] = []
request_dic[city][district]['file_row_list'] = []
if request_name not in request_dic[city][district]['request_name_list']:
request_dic[city][district]['request_name_list'].append(request_name)
uid = uid.replace(' ', '')
if len(uid) > 0 and uid not in request_dic[city][district]['request_uid_list']:
request_dic[city][district]['request_uid_list'].append(uid)
request_dic[city][district]['file_row_list'].append(l)
del data_file base_url = 'http://api.map.baidu.com/place/v2/suggestion?query=R-QUERY&region=R-CITY&city_limit=true&output=json&ak=R-AK'
ex_l = ['Proxy Error', 'APP IP校验失败', 'APP不存在,AK有误请检查再重试', 'The requested URL could not be retrieved',
'Address already in use', '天配额超限,限制访问', 'Parameter Invalid'] write_res_file_dir = '%s\\%s\\' % (curPath, dir_) def write_res_file(str_, input_, ak, dir_=write_res_file_dir, file_postfix='.txt'):
for ex in ex_l:
if str_.find(ex) > -1:
print('EXCEPTION-', ex, 'AK-', ak, 'STR-', str_) return
fname = '%s%s%s' % (dir_, input_, file_postfix)
with open(fname, 'w', encoding='utf-8') as ft:
ft.write(str_)
ft.close()
print('ok', threading.get_ident(), input_) class MyThread(threading.Thread):
def __init__(self, func, args, name):
threading.Thread.__init__(self)
self.name, self.func, self.args = name, func, args def run(self):
self.func(self.args) def fun_(city):
for district in request_dic[city]:
for request_name in request_dic[city][district]['request_name_list']:
ak = db_get_one_effective()
if ak == DB_KEY_EXHAUST:
print(DB_KEY_EXHAUST)
break
else:
url_ = base_url.replace('R-QUERY', request_name).replace('R-CITY', city).replace('R-AK', ak)
print(url_)
input_ = '%s%s%s' % (city, district, request_name) bd_res_json_str = requests.get(url_).text
db_update_one_today_used(ak)
write_res_file(bd_res_json_str, input_, ak) # try:
# # gen_requested_file_list()
# # gen_requested_file_list('.txt')
# # if input_ in requested_file_list:
# # continue
# bd_res_json_str = requests.get(url_).text
# db_update_one_today_used(ak)
# write_res_file(bd_res_json_str, input_)
# except Exception:
# bd_res_json_str = '请求百度-异常'
# write_res_file(bd_res_json_str, input_, requested_file_dir_exception_str)
# print(bd_res_json_str, input_) try:
start_loop, stop_loop = int(sys.argv[1]), int(sys.argv[2])
except Exception:
start_loop, stop_loop = -1, 200 def main():
threads_list, nloop = [], 0
request_dic_city_l = sorted(request_dic, reverse=False)
for city in request_dic_city_l:
nloop += 1
if nloop < start_loop or nloop > stop_loop:
continue
thread_instance = MyThread(fun_, (city), fun_.__name__)
threads_list.append(thread_instance)
for t in threads_list:
t.setDaemon = False
t.start()
for t in threads_list:
t.join() if __name__ == '__main__':
main()

  

csv .xlsx的更多相关文章

  1. 读取 csv , xlsx 表格并添加总分列

    import pandas as pd import numpy as np data = pd.read_excel('学生成绩表.csv',columns = ['学号','姓名','高数','英 ...

  2. Linux-各种姿势(less\vi等)打开各种类型的文件(txt/csv/xlsx等)出现不能打开(全乱码、部分乱码、二进制文件等)的问题

    (一)linux各种中文乱码解决办法整理 远程登录服务器用vim在终端下编辑查看文件经常会遇见各种中文乱码问题. 做如下设置可基本解决vim中文乱码问题,首先查看系统对中文的支持locale -a | ...

  3. SAS学习笔记15 SAS导入数据(import txt csv xlsx spss)

  4. C#:将.csv格式文件转换成.xlsx格式文件

    using System; using System.Collections.Generic; using System.ComponentModel; using System.Data; usin ...

  5. Data_r_and_w(csv,json,xlsx)

    import osimport sysimport argparsetry:    import cStringIO as StringIOexcept:    import StringIOimpo ...

  6. 使用 WeihanLi.Npoi 操作 CSV

    使用 WeihanLi.Npoi 操作 CSV Intro 最近发现 csv 文件在很多情况下都在使用,而且经过大致了解,csv 格式简单,相比 excel 文件要小很多,读取也很是方便,而且也很通用 ...

  7. SSIS 中将csv 文件批量导出到excel 文件,并设置excel 文件中某些列的data column format 为Text

    csv 文件是文本文件类型,但是打开csv 文件后(默认使用本地已经安装的excel 来打开excel 文件),默认显示出来的是general 类型(column data format)的数据, 这 ...

  8. PHP 导出 Excel 兼容 CSV XlS格式

    class ExcelRead { /** * 获取Excel文件内容 * @param $file * @return mixed * @throws PHPExcel_Reader_Excepti ...

  9. Coursera-Getting and Cleaning Data-week4-R语言中的正则表达式以及文本处理

    博客总目录:http://www.cnblogs.com/weibaar/p/4507801.html Thursday, January 29, 2015 补上第四周笔记,以及本次课程总结. 第四周 ...

随机推荐

  1. Centos 通过yum的方式升级内核

    在安装某些软件时,可能对我们的系统内核版本有要求. 比如在安装docker要满足一定的条件,对于centos系统,要求必须是64位,并且内核版本是3.10以上. 如果你的centos操作系统内核低于3 ...

  2. [脚本编程] 过云盾、D盾各种盾shell

    作者: dean <?php //过云盾.D盾各种盾shell $id = $_GET['id']; //debug echo $catid = isset($_GET['catid'])?ba ...

  3. Nginx:subrequest的使用方式

    参考资料<深入理解Nginx> subrequest是由HTTP框架提供的一种分解复杂请求的设计模式. 它可以把原始请求分解为许多子请求,使得诸多请求协同完成一个用户请求,并且每个请求只关 ...

  4. UE4射击小游戏原型

    尝试使用了下blueprint,不知道是bug还是不熟悉,blueprint有些地方运行的跟逻辑不太一样.不管ue4目前,快速做原型倒是蛮方便的.就等着官方发更多教程讲述关于新的matinee,Nav ...

  5. EasyUI 鼠标经过 显示气泡一例

    $(function(){ $('#contacts').tooltip({ position: 'bottom', content: '<c:forEach items="${rec ...

  6. 微信小程序登录JAVA后台

    代码地址如下:http://www.demodashi.com/demo/12736.html 登录流程时序登录流程时序 具体的登录说明查看 小程序官方API 项目的结构图: springboot项目 ...

  7. PJISP 修改 消息头Fromto字段

    项目需求,需要修改sip信令消息头中Fromto字段,完成此功能需要修改sip库(PJSIP)源码,具体如下: PJSIP 消息头 Formto  字段默认的格式是sip:平台@平台IP地址,例如si ...

  8. Ubuntu下安装配置JDK,Tomcat,MySql

    jdk安装配置 下载jdk-6u45-linux-x64.bin 切换到root用户su root 切换目录,新建文件夹,复制文件cd /usr      mkdir javacd javacp 路径 ...

  9. 使用Gitolite搭建Gitserver

    Gitolite是一款Perl语言开发的Git服务管理工具.通过公钥对用户进行认证.并可以通过配置文件对些操作进行基于分支和路径的精细控制. Gitolite採用的是SSH协议而且使用SSH公钥认证. ...

  10. ES 31 - 从0开始搭建Elasticsearch生产集群

    目录 1 配置环境 1.1 服务器IP映射 1.2 配置各节点的ssh免密通信 1.3 安装JDK并配置环境变量 2 部署单节点服务 3 部署集群服务 4 启动集群中的所有节点 4.2 启动各个节点中 ...