import sys
import os

# Absolute directory that contains this script; the rest of the file builds
# its data-file and driver paths from it.
curPath = os.path.abspath(os.path.dirname(__file__))
# Parent of the script directory, appended to the import search path.
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.firefox.options import Options
import time
from time import sleep
import math
import random
import sys
import threading
import urllib.parse
import xlrd
import sys
import os
import sqlite3

# Maximum age, in seconds, of a browser session before chk_time() closes it.
MAX_TIME = 600


def py_stop_update_db():
    """Hook called when a browser session is shut down by chk_time().

    The status-database update is disabled (kept below as commented-out
    code), so this currently does nothing and returns None.
    """
    # NOTE(review): if this is re-enabled, pass pyname as a bound parameter
    # (conn.execute(sql, (pyname,))) instead of building the SQL by string
    # formatting.
    # db = 'py_bdspider_status.db'
    # db = '%s\\%s' % (curPath, db)
    # conn = sqlite3.connect(db)
    # pyname = os.path.basename(__file__).split('.py')[0]
    # sql_ = '%s%s%s' % ('UPDATE pystatus_table SET pystatus =2 WHERE pyname="', pyname, '"')
    # print(sql_)
    # conn.execute(sql_)
    # conn.commit()
    # conn.close()
    return


def chk_time(browser, start_time):
    """Close the browser if more than MAX_TIME seconds have elapsed.

    Args:
        browser: Object providing ``delete_all_cookies()`` and ``quit()``.
        start_time: ``time.time()`` value to measure the elapsed time from.

    Returns:
        None. When the limit is exceeded, ``py_stop_update_db()`` is called,
        the cookies are cleared and the browser is quit; the caller is not
        told whether that happened.
    """
    if time.time() - start_time > MAX_TIME:
        py_stop_update_db()
        browser.delete_all_cookies()
        browser.quit()
    return


# Name of the directory (next to this script) that holds the saved pages.
dir_html = 'baidu_map_html_firstpage_pc_not_shop'
# Directory holding the pages saved by earlier runs.
filepath = os.path.join(curPath, dir_html)
pathDir = os.listdir(filepath)
# Search inputs that already have a saved page: the file name up to the first
# '&', without its '.html' suffix. The key is taken from the file name itself
# rather than by re-splitting a concatenated path on the directory name, which
# broke whenever the script's own path contained that name.
requested_file_list = [
    saved_name.split('&')[0].split('.html')[0] for saved_name in pathDir
]

# Delimiters after which the remainder of a name is dropped.
# NOTE(review): '(' is listed twice; one entry was possibly meant to be a
# full-width parenthesis. Confirm against the source data.
tag_jmtool_list = ['(', '(', '-']


def extract_name(name_):
    """Return ``name_`` truncated at the first delimiter in tag_jmtool_list.

    Each delimiter is applied in turn to the already-truncated result, so the
    returned string contains none of them. A name that starts with a
    delimiter yields an empty string.
    """
    for delimiter in tag_jmtool_list:
        name_ = name_.split(delimiter)[0]
    return name_


# City names; filled in by the file-reading code that follows.
pcity_list = []
# Text file next to this script from which the city names are read.
pcity_file = os.path.join(curPath, '省会城市.txt')
with open(pcity_file, 'r', encoding='utf-8') as pf:
    # Every third line (3rd, 6th, 9th, ...) is taken as a city name: spaces
    # and the newline are removed and the suffix '市' is appended.
    for line_no, line in enumerate(pf, start=1):
        if line_no % 3 == 0:
            pcity_list.append(line.replace(' ', '').replace('\n', '') + '市')
pcity_sorted_list = sorted(pcity_list)

# Place types to keep when reading the spreadsheet.
target_type_list = ['住宅小区', '写字楼']
# Place type to crawl in this run. Values used previously: ['住宅小区'],
# ['专科医院']. Only the final assignment ever took effect, so the dead
# intermediate assignments have been removed.
target_type_list = ['商场']
# Count of inputs skipped because a saved page already exists for them.
requested_type_counter = 0
# Figures noted by the author (presumably record counts per place type,
# TODO confirm): 商场 4705, 酒店 24915, 专科医院 2513, 商圈 334.
# Nested index filled from the spreadsheet: city -> district -> type -> name.
target_dic = {}
# Cities to crawl in this run. Values used previously:
# ['北京市', '上海市', '深圳市', '广州市'] and subsets of it, ['杭州市'],
# pcity_sorted_list[7:14], and the whole of pcity_sorted_list.
target_city_list = pcity_sorted_list[21:28]
# Spreadsheet (without extension) that lists the places to search for.
file_name = 'JMTool任务_csv_py_wholeCSV'
FEXCEL = os.path.join(curPath, file_name + '.xlsx')
data = xlrd.open_workbook(FEXCEL)
table = data.sheets()[0]
nrows, ncols = table.nrows, table.ncols
res_dic, counter_ = {}, 0
for i in range(nrows):
    row = table.row_values(i)
    # Each row must have exactly these ten columns.
    (dbid, area_code, ref_area_type_code, city, district, address,
     city_street, name_, emp_, emp_1) = row
    if city not in target_city_list:
        continue
    type_ = ref_area_type_code
    if type_ not in target_type_list:
        continue
    name_ = name_.replace('?', '')
    name_reduction = extract_name(name_)
    # Fall back to the full name when the shortened one is under 3 characters.
    if len(name_reduction) < 3:
        name_reduction = name_
    # Create missing levels on demand. The original tested name_reduction
    # against the district-level dict, so the per-name entry was re-created
    # (and its lists emptied) on every row; setdefault on the type-level dict
    # keeps the accumulated lists.
    entry = (
        target_dic.setdefault(city, {})
        .setdefault(district, {})
        .setdefault(type_, {})
        .setdefault(name_reduction,
                    {'name_reduction_list': [], 'history_list': []})
    )
    entry['name_reduction_list'].append(name_)
    entry['history_list'].append(row)

# Output directory for saved pages, with a trailing path separator.
write_res_html_dir = os.path.join(curPath, dir_html, '')


def write_res_html(browser, dir_=write_res_html_dir):
    """Save the browser's current page as ``<dir_><query>.html``.

    The query is the part of the URL-decoded current URL after ``&wd=`` and
    before ``/?``. The saved file starts with an HTML comment holding the raw
    current URL, followed by the page source.

    Args:
        browser: Object providing ``current_url`` and ``page_source``.
        dir_: Directory prefix (including the trailing separator) to write to.

    Returns:
        None. If the URL has no ``&wd=`` part, a diagnostic is printed and
        nothing is written.
    """
    close_alert(browser)
    current_url_ = urllib.parse.unquote(browser.current_url)
    try:
        input_ = current_url_.split('&wd=')[1].split('/?')[0]
    except IndexError:
        print('Exception-', __file__, sys._getframe().f_lineno, current_url_)
        return
    url_comment = '%s%s%s' % ('<!--', browser.current_url, '-->')
    page_source = '%s%s' % (url_comment, browser.page_source)
    file_name = '%s%s%s' % (dir_, input_, '.html')
    # The original evaluated `fo.closed` (an attribute) instead of calling
    # close(), leaving the handle open; the context manager closes it.
    with open(file_name, 'w', encoding='utf-8') as fo:
        fo.write(page_source)
    print(os.path.basename(__file__), 'OK-writed-', sys._getframe().f_lineno, '')


def gen_random_letter():
    """Return one random lowercase ASCII letter ('a'-'z')."""
    return chr(random.randint(97, 122))


def gen_random_num():
    """Return a random integer between 0 and 10, both inclusive.

    NOTE(review): randint includes the upper bound, so 10 (two characters
    when formatted) is possible; confirm whether a single digit was intended.
    """
    return random.randint(0, 10)


def gen_sougo_pid():
    """Return a random id built from 16 generated parts.

    Parts 1, 3, 4 and 15 (1-based) are random letters; the others are the
    decimal text of gen_random_num().
    """
    letter_positions = (1, 3, 4, 15)
    return ''.join(
        str(gen_random_letter() if pos in letter_positions else gen_random_num())
        for pos in range(1, 17)
    )


def close_alert(browser, attitude='accept'):
    """No-op placeholder; both arguments are ignored and None is returned."""
    return
# executable_path_str = '%s\\%s' % (curPath, 'geckodriver.exe')
# browser = webdriver.Firefox(executable_path=executable_path_str)


def mobile_mobile_pages_html(browser, input_):
    """Open the Baidu Maps search for ``input_`` and save the resulting page.

    Args:
        browser: WebDriver-like object used to load the page.
        input_: Search text appended to the search URL.
    """
    start_time = time.time()
    # NOTE(review): start_time was taken on the previous line, so this check
    # cannot trigger here; confirm where the timeout was meant to be measured.
    chk_time(browser, start_time)
    sleep(3)
    url_ = '%s%s' % ('http://map.baidu.com/?s=s%26wd%3D', input_)
    browser.get(url_)
    write_res_html(browser)


class MyThread(threading.Thread):
    """Thread that calls ``func(args)`` with ``args`` as one positional value."""

    def __init__(self, func, args, name):
        """Store the callable, its single argument and the thread name."""
        threading.Thread.__init__(self)
        self.name, self.func, self.args = name, func, args

    def run(self):
        """Invoke the stored callable with the stored argument."""
        self.func(self.args)


def thread_city(city):
    """Fetch and save the search page for every place indexed under ``city``.

    The search text is city + district + place name. Inputs already listed in
    requested_file_list are only counted (requested_type_counter is increased
    and printed); for the rest a fresh Firefox session is started.

    Args:
        city: Key of target_dic whose places should be processed.
    """
    global requested_type_counter
    for district, types_ in target_dic[city].items():
        for names_ in types_.values():
            for entry in names_.values():
                for name_ in entry['name_reduction_list']:
                    input_ = '%s%s%s' % (city, district, name_)
                    if input_ in requested_file_list:
                        requested_type_counter += 1
                        print('requested_type_counter=', requested_type_counter, input_)
                        continue
                    # executable_path_str = '%s\\%s' % (curPath, 'chromedriver.exe')
                    # browser = webdriver.Chrome(executable_path=executable_path_str)
                    executable_path_str = os.path.join(curPath, 'geckodriver.exe')
                    # NOTE(review): newer Selenium releases no longer accept
                    # executable_path; confirm the installed version.
                    browser = webdriver.Firefox(executable_path=executable_path_str)
                    try:
                        mobile_mobile_pages_html(browser, input_)
                    finally:
                        # The original never closed the per-input browser,
                        # leaving one Firefox process behind for every page.
                        browser.quit()


# Worker threads, one per city; filled in and started below.
threads_list = []
# One worker per city found in the spreadsheet.
for city in target_dic:
    thread_instance = MyThread(thread_city, city, thread_city.__name__)
    threads_list.append(thread_instance)
for t in threads_list:
    # The original assigned to `t.setDaemon`, which replaced the method and
    # set nothing; set the flag itself so the workers are non-daemon.
    t.daemon = False
    t.start()
# Wait for every worker to finish before the script exits.
for t in threads_list:
    t.join()
# No module-level browser exists to clean up here; each worker creates its own.

  

Is this its limit?的更多相关文章

  1. mysql limit分页查询优化写法

    在mysql中进行分页查询时,一般会使用limit查询,而且通常查询中都会使用order by排序.但是在表数据量比较大的时候,例如查询语句片段limit 10000, 20,数据库会读取10020条 ...

  2. SQL中TOP,LIMIT,ROWNUM的用法

    SQL SERVER/MS Access的Select Top的用法: Select TOP number|percent table_columname FROM tablename MySQL/O ...

  3. mysql 中的LIMIT用法

    select * from table_name LIMIT 起始偏移量,数量 (1)起始偏移量为0:代表没有偏移,即从第1行开始. (2)数量为-1:代表是无穷,即偏移量之后所有的行. (3)LIM ...

  4. laravel 框架使用总结 limit

    后台开发就是数据的各种处理很多时候需要做到分页,但是在laravel中使用limit做分页的时候会出现问题,偏移量和每页的条数放进去好像不好使了 下面推荐给大家一种在laravel框架中非常好用的写法 ...

  5. [软件推荐]快速文件复制工具(Limit Copy) V4.0 绿色版

    快速文件复制工具(Limit Copy)绿色版是一款智能变频超快复制绿色软件. 快速文件复制工具(Limit Copy)功能比较完善,除了文件复制还可以智能变频,直接把要复制的文件拖入窗口即可,无需手 ...

  6. java.lang.OutOfMemoryError:GC overhead limit exceeded填坑心得

    我遇到这样的问题,本地部署时抛出异常java.lang.OutOfMemoryError:GC overhead limit exceeded导致服务起不来,查看日志发现加载了太多资源到内存,本地的性 ...

  7. TNS-12540: TNS:internal limit restriction exceeded

    应用程序以及客户端工具(Toad、PL/SQL Developer等)出现突然连接不上数据库服务器的情况,监听日志listener.log里面出现了TNS-12518与TNS-12540错误,如下所示 ...

  8. -bash: ulimit: pipe size: cannot modify limit: Invalid argument

    从root账号切换到oracle账号时,出现了"-bash: ulimit: pipe size: cannot modify limit: Invalid argument"提示 ...

  9. [MySQL性能优化系列]LIMIT语句优化

    1. 背景 假设有如下SQL语句: SELECT * FROM table1 LIMIT offset, rows 这是一条典型的LIMIT语句,常见的使用场景是,某些查询返回的内容特别多,而客户端处 ...

  10. migration integer limit option

    https://gist.github.com/stream7/1069589 :limit Numeric Type Column Size Max value 1 tinyint 1 byte 1 ...

随机推荐

  1. 【Hive】Hive 安装&使用基础

    2 安装 2.1 参考 2.1.1 下载 2.1.1.1 https://mirrors.tuna.tsinghua.edu.cn/apache/hive/stable-2/ 2.1.2 安装指导 2 ...

  2. j2ee、mvn、eclipse、Tomcat等中文乱码问题解决方法

    一.更改jdk默认编码为UTF-8,保证启动的JVM不会出现中文乱码问题 1.在编译的时候,如果我们没有用 -encoding 参数指定我们的JAVA源程序的编码格式,则javac.exe首先获得我们 ...

  3. Git的微操作

    合并分支代码,简单操作: 1.切换到master主干代码 2.到git repositories 视图,点击需要合并的分支,例如v1.1.9 点击merge 进行合并 3.然后push to Upst ...

  4. vscode - 安装离线插件

    打开网站(示例): https://marketplace.visualstudio.com/items?itemName=oderwat.indent-rainbow 下载扩展 vscode 安装离 ...

  5. Java BaseDao

    BaseDao类: package dao; import java.sql.*; public class BaseDao { private static final String driver ...

  6. jetty学习小结

    1.什么是jetty? 开源HTTP服务器和Servlet引擎,是web应用的容器,同tomcat类似.由于其轻量灵活的特性,很多知名产品也应用了它,如maven.eclipse.hadoop.spa ...

  7. 如何为Apache JMeter开发插件(二)—第一个JMeter插件

    文章内容转载于:http://lib.csdn.net/article/softwaretest/25700,并且加上个人一些截图 本篇将开启为JMeter开发插件之旅,我们选择以Function(函 ...

  8. my_interface

    import flask,osserver=flask.Flask(__name__) #当前这个python文件,当做一个服务 @server.route('/error',methods=['ge ...

  9. 网页抓取工具Teleport Ultra简介及如何使用

    Teleport Ultra是一款专业的离线浏览器,能够快速.准确地从网络抓取数据并保存到本地,实现离线浏览的目的.它可以从Internet的任何地方抓回你想要的任何文件,它可以在你指定的时间自动登录 ...

  10. The Application does not have a valid signature

    真机运行程序,报错(The application does not have a valid signature,如图 环境:Xcode7.3,使用cocoapods管理第三方库 如果确认证书没有问 ...