def gen_file_data(fodir, fname, sheet_index=0, ):
if fname.find('.xlsx') > -1:
fname_open = '%s\\%s' % (fodir, fname)
book = xlrd.open_workbook(fname_open, on_demand=True)
sheet = book.sheet_by_index(sheet_index)
data = [[str(c.value) for c in sheet.row(i)] for i in range(sheet.nrows)]
book.release_resources()
del book
elif fname.find('.csv') > -1:
data = []
fname_open = '%s\\%s' % (fodir, fname)
with open(fname_open, 'r', encoding='utf-8') as csvfile:
spamreader = csv.reader(csvfile, delimiter=',')
for row in spamreader:
data.append(row)
csvfile.close()
return data

  

import xlrd
import time
import sys
import os
import requests
import sqlite3
import threading
import math
import csv curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath) MAX_USED_TIMES, overrun_str, DB_KEY_EXHAUST, next_day_tag = 1900, '天配额超限,限制访问', 'DB_KEY_EXHAUST', '000003' db = 'py_bdspider_status.db'
db = '%s\\%s' % (curPath, db) def db_chk_one_exist(key):
conn = sqlite3.connect(db)
c = conn.cursor()
sql = 'SELECT key FROM baidu_map_key_used WHERE key="%s"' % (key)
r = 0
res = c.execute(sql).fetchone()
if res is not None:
r = 1
conn.close
return r # def db_init_key_table():
# conn = sqlite3.connect(db)
# c = conn.cursor()
# k_file = '%s\\%s' % (curPath, 'bdmap_key.txt')
# with open(k_file, 'r', encoding='utf-8') as pf:
# for i in pf:
# if len(i) < 4:
# continue
# author, key = i.replace(' ', '').replace('\n', '').replace('\t', '').split(';')
# r = db_chk_one_exist(key)
# if r == 0:
# localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
# sql = 'INSERT INTO baidu_map_key_used (author,key,update_time,today_used) VALUES ("%s","%s","%s",%s) ' % (
# author, key, localtime_, 0)
# c.execute(sql)
# conn.commit()
# conn.close()
# pf.close()
#
#
# db_init_key_table() def db_recovery_bdkeynum():
if time.strftime("%H%M%S", time.localtime()) == next_day_tag:
conn = sqlite3.connect(db)
c = conn.cursor()
localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
sql = 'UPDATE baidu_map_key_used SET today_used = 0 ,update_time=%s ' % (localtime_)
c.execute(sql)
conn.commit()
conn.close()
return def db_get_one_effective():
db_recovery_bdkeynum()
conn = sqlite3.connect(db)
c = conn.cursor()
sql = 'SELECT key FROM baidu_map_key_used WHERE today_used<=%s ORDER BY today_used ASC' % (MAX_USED_TIMES)
res, r = c.execute(sql).fetchone(), ''
if res is None:
r = DB_KEY_EXHAUST
else:
r = res[0]
conn.close()
return r def db_update_one_today_used(key):
conn = sqlite3.connect(db)
c = conn.cursor()
localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
sql = 'UPDATE baidu_map_key_used SET today_used = today_used+1 ,update_time=%s WHERE key="%s" ' % (
localtime_, key)
c.execute(sql)
conn.commit()
conn.close() dir_, dir_exception, requested_file_list = 'baidu_map_uid', 'baidu_map_uid_exception', []
requested_file_dir_str, requested_file_dir_exception_str = '%s\\%s\\' % (curPath, dir_), '%s\\%s\\' % (
curPath, dir_exception)
requested_file_dir = os.listdir(requested_file_dir_str) def gen_requested_file_list(file_postfix='.html'):
filepath = '%s\\%s' % (curPath, dir_)
pathDir = os.listdir(filepath)
for allDir in pathDir:
child = os.path.join('%s%s' % (filepath, allDir))
requested_file = child.split(dir_)[1].split('&')[0].split(file_postfix)[0]
if requested_file not in requested_file_list:
requested_file_list.append(requested_file) def gen_file_data(fodir, fname, sheet_index=0, ):
if fname.find('.xlsx') > -1:
fname_open = '%s\\%s' % (fodir, fname)
book = xlrd.open_workbook(fname_open, on_demand=True)
sheet = book.sheet_by_index(sheet_index)
data = [[str(c.value) for c in sheet.row(i)] for i in range(sheet.nrows)]
book.release_resources()
del book
elif fname.find('.csv') > -1:
data = []
fname_open = '%s\\%s' % (fodir, fname)
with open(fname_open, 'r', encoding='utf-8') as csvfile:
spamreader = csv.reader(csvfile, delimiter=',')
for row in spamreader:
data.append(row)
csvfile.close()
return data # 3 9
request_dic, target_type_list, target_type_except_list = {}, ['北京市', '上海市', '广州市'], ['火车站', '高铁站', '汽车站', '飞机场', '小学',
'幼儿园', '中学',
'综合医院', '商场']
# ['4s店','餐饮','家电','酒店','咖啡馆','售楼处','专科医院']
# ['住宅小区','写字楼'] # file_postfix_l = ['.html', '.txt']
# for i in file_postfix_l:
# gen_requested_file_list(i) fname_source = 'jfinder_public_jmtool_old_data.csv'
data_file = gen_file_data(curPath, fname_source) def replace_illeagl_tag(str_):
l = [' ', '\n', '\t']
for i in l:
str_ = str_.replace(i, '')
return str_ # 碧海富通城三期(3栋) ok
# =碧海富通城-三期(3栋) ok
replace_to_empty_l = [' ', '|', '\t', '\n', '/', '?', '?', '·', '.'] def gen_bd_query_origin_name(name_):
for i in replace_to_empty_l:
name_ = name_.replace(i, '')
return name_.replace('(', '(').replace(')', ')').replace('?', '').replace('?', '') for l in data_file:
# db_from, db_id, db_area_code, db_name, db_type_, db_city, db_district, db_address, db_street, db_uid, db_submit_time = l
# db_from, id, area_code, name, type_, city, district, address, street, uid, submit_time = l
dbid, area_code, uid, name_, type_en, city, district, address, street, db_from, submit_time, type_ = l if db_from == 'db_from':
continue
request_name = gen_bd_query_origin_name(name_)
input_ = '%s%s%s' % (city, district, request_name)
if input_ in requested_file_list:
print('requested', input_)
continue
if city not in request_dic:
request_dic[city] = {}
if district not in request_dic[city]:
request_dic[city][district] = {}
request_dic[city][district]['request_name_list'] = []
request_dic[city][district]['request_uid_list'] = []
request_dic[city][district]['file_row_list'] = []
if request_name not in request_dic[city][district]['request_name_list']:
request_dic[city][district]['request_name_list'].append(request_name)
uid = uid.replace(' ', '')
if len(uid) > 0 and uid not in request_dic[city][district]['request_uid_list']:
request_dic[city][district]['request_uid_list'].append(uid)
request_dic[city][district]['file_row_list'].append(l)
del data_file base_url = 'http://api.map.baidu.com/place/v2/suggestion?query=R-QUERY&region=R-CITY&city_limit=true&output=json&ak=R-AK'
ex_l = ['Proxy Error', 'APP IP校验失败', 'APP不存在,AK有误请检查再重试', 'The requested URL could not be retrieved',
'Address already in use', '天配额超限,限制访问', 'Parameter Invalid'] write_res_file_dir = '%s\\%s\\' % (curPath, dir_) def write_res_file(str_, input_, ak, dir_=write_res_file_dir, file_postfix='.txt'):
for ex in ex_l:
if str_.find(ex) > -1:
print('EXCEPTION-', ex, 'AK-', ak, 'STR-', str_) return
fname = '%s%s%s' % (dir_, input_, file_postfix)
with open(fname, 'w', encoding='utf-8') as ft:
ft.write(str_)
ft.close()
print('ok', threading.get_ident(), input_) class MyThread(threading.Thread):
def __init__(self, func, args, name):
threading.Thread.__init__(self)
self.name, self.func, self.args = name, func, args def run(self):
self.func(self.args) def fun_(city):
for district in request_dic[city]:
for request_name in request_dic[city][district]['request_name_list']:
ak = db_get_one_effective()
if ak == DB_KEY_EXHAUST:
print(DB_KEY_EXHAUST)
break
else:
url_ = base_url.replace('R-QUERY', request_name).replace('R-CITY', city).replace('R-AK', ak)
print(url_)
input_ = '%s%s%s' % (city, district, request_name) bd_res_json_str = requests.get(url_).text
db_update_one_today_used(ak)
write_res_file(bd_res_json_str, input_, ak) # try:
# # gen_requested_file_list()
# # gen_requested_file_list('.txt')
# # if input_ in requested_file_list:
# # continue
# bd_res_json_str = requests.get(url_).text
# db_update_one_today_used(ak)
# write_res_file(bd_res_json_str, input_)
# except Exception:
# bd_res_json_str = '请求百度-异常'
# write_res_file(bd_res_json_str, input_, requested_file_dir_exception_str)
# print(bd_res_json_str, input_) try:
start_loop, stop_loop = int(sys.argv[1]), int(sys.argv[2])
except Exception:
start_loop, stop_loop = -1, 200 def main():
threads_list, nloop = [], 0
request_dic_city_l = sorted(request_dic, reverse=False)
for city in request_dic_city_l:
nloop += 1
if nloop < start_loop or nloop > stop_loop:
continue
thread_instance = MyThread(fun_, (city), fun_.__name__)
threads_list.append(thread_instance)
for t in threads_list:
t.setDaemon = False
t.start()
for t in threads_list:
t.join() if __name__ == '__main__':
main()

  

csv .xlsx的更多相关文章

  1. 读取 csv , xlsx 表格并添加总分列

    import pandas as pd import numpy as np data = pd.read_excel('学生成绩表.csv',columns = ['学号','姓名','高数','英 ...

  2. Linux-各种姿势(less\vi等)打开各种类型的文件(txt/csv/xlsx等)出现不能打开(全乱码、部分乱码、二进制文件等)的问题

    (一)linux各种中文乱码解决办法整理 远程登录服务器用vim在终端下编辑查看文件经常会遇见各种中文乱码问题. 做如下设置可基本解决vim中文乱码问题,首先查看系统对中文的支持locale -a | ...

  3. SAS学习笔记15 SAS导入数据(import txt csv xlsx spss)

  4. C#:将.csv格式文件转换成.xlsx格式文件

    using System; using System.Collections.Generic; using System.ComponentModel; using System.Data; usin ...

  5. Data_r_and_w(csv,json,xlsx)

    import osimport sysimport argparsetry:    import cStringIO as StringIOexcept:    import StringIOimpo ...

  6. 使用 WeihanLi.Npoi 操作 CSV

    使用 WeihanLi.Npoi 操作 CSV Intro 最近发现 csv 文件在很多情况下都在使用,而且经过大致了解,csv 格式简单,相比 excel 文件要小很多,读取也很是方便,而且也很通用 ...

  7. SSIS 中将csv 文件批量导出到excel 文件,并设置excel 文件中某些列的data column format 为Text

    csv 文件是文本文件类型,但是打开csv 文件后(默认使用本地已经安装的excel 来打开excel 文件),默认显示出来的是general 类型(column data format)的数据, 这 ...

  8. PHP 导出 Excel 兼容 CSV XlS格式

    class ExcelRead { /** * 获取Excel文件内容 * @param $file * @return mixed * @throws PHPExcel_Reader_Excepti ...

  9. Coursera-Getting and Cleaning Data-week4-R语言中的正则表达式以及文本处理

    博客总目录:http://www.cnblogs.com/weibaar/p/4507801.html Thursday, January 29, 2015 补上第四周笔记,以及本次课程总结. 第四周 ...

随机推荐

  1. java gc log

    java full gc 经常带来延迟, 导致性能问题 如下命令使java虚拟机记录gc的log到文件, 帮助分析定位问题. java -Xloggc:./a.log -jar a.jar    // ...

  2. ubuntu 查看系统服务的列表

    service --status-all

  3. Android之TextView的Span样式源代码剖析

    Android中的TextView是个显示文字的的UI类.在现实中的需求中,文字有各式各样的样式,TextView本身没有属性去设置实现.我们能够通过Android提供的 SpannableStrin ...

  4. http header 具体解释

    HTTP(HyperTextTransferProtocol)即超文本传输协议,眼下网页传输的的通用协议. HTTP协议採用了请求/响应模型,浏览器或其它client发出请求,server给与响应. ...

  5. Python学习笔记(一)类和继承的使用

    一年前就打算学Python了,折腾来折腾去也一直没有用熟练,主要是类那一块不熟,昨天用Python写了几个网络编程的示例,感觉一下子迈进了很多.这几天把学习Python的笔记整理一下,内容尽量简洁. ...

  6. PLSQL Developer设置技巧

    1.右键菜单 在PL/SQL Developer(下面简称PLD)中的每一个文本编辑窗口,如SQL Window,Command Window和Porgram Window,右键点击某个对象名称,会弹 ...

  7. hdu 2871 Memory Control(线段树)

    题目链接:hdu 2871 Memory Control 题目大意:模拟一个内存分配机制. Reset:重置,释放全部空间 New x:申请内存为x的空间,输出左地址 Free x:释放地址x所在的内 ...

  8. oracle经常使用函数(1)

    1.返回与指定的字符相应的十进制数 select ascii('A') A,ascii('z') a,ascii('12') 一打,ascii(' ') kg from dual; 2.返回与指定十进 ...

  9. visual studio 2010 LNK1123解决方式

    ------------------------------------------------------------Lysen----------------------------------- ...

  10. 为php添加pcntl扩展,多线程

    前言: pcntl 介绍 pcntl扩展可以支持 PHP 的多线程操作.(非Unix类系统不支持此模块) phpize 介绍 phpize 可以用来给 PHP 动态的添加扩展.比如编译 PHP 时忘记 ...