Spider Page

from selenium import webdriver
import time
import random
from bs4 import BeautifulSoup
import pymysql

h, pt, u, p, db = 'localhost', 3306, 'root', 'root', 'test'


def mysql_fetch(sql, res_type='tuple'):
    global h, pt, u, p, db
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return ()
    if res_type == 'dic':
        cursor = conn.cursor(pymysql.cursors.DictCursor)
    else:
        cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    rows = cursor.fetchall()
    cursor.close()
    conn.close()
    return rows


def mysql_write(sql):
    global h, pt, u, p, db
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return 1
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    cursor.close()
    conn.close()
    return 0


browser = webdriver.Chrome()
f_url_l = ['https://www.baidu.com/', 'https://www.so.com/']
f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
browser.get(f_url_l_a)
time.sleep(random.randint(1, 2))
url = 'https://so.gushiwen.org/shiwenv_5a36a4613434.aspx'
js = 'window.location.href="{}";'.format(url)
browser.execute_script(js)
# img_url = 'https://img.gushiwen.org/authorImg/daishulun.jpg'
myhtml = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
with open(myhtml, 'w', encoding='utf-8') as fw:
    fw.write(browser.page_source)
sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
    bs = BeautifulSoup(myhtml_o, 'html.parser')
url_l = [i.attrs['href'] for i in bs.find_all('a')]
res_l = []
sql_l = []
for i in url_l:
    if 'gushiwen.org' in i and 'javascript' not in i and 'http' in i and i[0:4] == 'http':
        # sql_chk = 'SELECT * FROM parent_url WHERE page_url="{}" AND children_url="{}"'.format(url, i)
        # r = mysql_fetch(sql_chk)
        # if len(r) > 0:
        #     continue
        if i not in res_l:
            if i == url:
                continue
            res_l.append(i)
            s = '("{}","{}","{}")'.format(browser.title, url, i)
            sql_l.append(s)
if len(sql_l) > 0:
    sql = '{}{}'.format(sql, ','.join(sql_l))
    print(sql)
    mysql_write(sql)

sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT min_id FROM ( SELECT MIN(id) AS min_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )'
print(sql_del)
mysql_write(sql_del)

while True:
    sql_ori = 'SELECT page_url,children_url FROM parent_url WHERE if_spider=0 ORDER BY id DESC '
    res = mysql_fetch(sql_ori, 'dic')
    for d in res:
        page_url, children_url = d['page_url'], d['children_url']
        url = children_url
        js = 'window.location.href="{}";'.format(url)
        browser.execute_script(js)
        time.sleep(1)
        browser.refresh()
        myhtml = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
        with open(myhtml, 'w', encoding='utf-8') as fw:
            fw.write(browser.page_source)
        sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
        with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
            bs = BeautifulSoup(myhtml_o, 'html.parser')
        try:
            url_l = [i.attrs['href'] for i in bs.find_all('a')]
        except Exception as e:
            print(e)
            continue
        res_l = []
        sql_l = []
        for i in url_l:
            # /user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
            if 'gushiwen.org' in i and 'javascript' not in i and 'http' in i and i[0:4] == 'http':
                # sql_chk = 'SELECT * FROM parent_url WHERE page_url="{}" AND children_url="{}"'.format(url, i)
                # print(sql_chk)
                # r = mysql_fetch(sql_chk)
                # print(r)
                # if len(r) > 0:
                #     continue
                if i not in res_l:
                    if i == url:
                        continue
                    res_l.append(i)
                    s = '("{}","{}","{}")'.format(browser.title, url, i)
                    # sql_break = '{}{}'.format(sql, s)
                    # print(sql_break)
                    # mysql_write(sql_break)
                    # print(s)
                    sql_l.append(s)
        if len(sql_l) > 0:
            sql = '{}{}'.format(sql, ','.join(sql_l))
            print(sql)
            mysql_write(sql)
        sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT min_id FROM ( SELECT MIN(id) AS min_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )'
        print(sql_del)
        mysql_write(sql_del)
        sql_udp = 'UPDATE parent_url SET if_spider=1 WHERE page_url="{}" AND children_url="{}"'.format(page_url, url)
        mysql_write(sql_udp)
        print(sql_udp)
        time.sleep(3)

dd = 0

CREATE TABLE `parent_url` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `page_title` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `page_url` text,
  `children_url` text,
  `if_spider` tinyint(4) DEFAULT '0',
  PRIMARY KEY (`id`)
) ENGINE=MyISAM AUTO_INCREMENT=5328 DEFAULT CHARSET=latin1;

Write first, delete duplicates afterwards: this avoids a lookup before every single insert, which wastes time.
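
A minimal sketch of that idea, using the parent_url table and MySQL settings from this post: insert every (page_url, children_url) pair with no existence check, then remove duplicates in one statement afterwards. The derived table (AS tab) is required because MySQL refuses a DELETE whose subquery selects from the target table directly.

import pymysql

# Dedupe sketch: keep the earliest row per (page_url, children_url) pair
# and delete newer duplicates, like the DELETE used in the later versions below.
DEDUPE_SQL = (
    "DELETE FROM parent_url WHERE id IN ("
    " SELECT max_id FROM ("
    "  SELECT MAX(id) AS max_id, COUNT(1) AS c"
    "  FROM parent_url GROUP BY CONCAT(page_url,children_url)"
    " ) AS tab WHERE c>1 )"
)

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       passwd='root', db='test', charset='utf8')
with conn.cursor() as cur:
    cur.execute(DEDUPE_SQL)
conn.commit()
conn.close()

One pass removes only one duplicate per group, so the scripts below simply re-run the DELETE on every loop iteration.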

Get all URLs of a website.

Fix logic errors.

Support multiple processes, i.e. running several copies of the script at once.
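
The versions below support that without any row locking: each running copy starts its pass at a random offset into the pending rows (the jump_c / jump_s counters), so two copies rarely grab the same URL at the same moment, and if they do both insert the same pair it is cleaned up by the duplicate DELETE anyway. A tiny illustration of the offset trick; the helper name is mine, not from the script:

import random

def random_start(n_pending, tail=10):
    # Skip a random number of rows but always leave at least `tail`
    # reachable, so short queues still get processed.
    return random.randint(0, max(0, n_pending - tail))

rows = list(range(50))              # stand-in for the if_spider=0 result set
start = random_start(len(rows))
for idx, row in enumerate(rows, start=1):
    if idx < start:
        continue                    # another instance will likely cover these
    print('processing row', row)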

from selenium import webdriver
import time
import random
from bs4 import BeautifulSoup
import pymysql

h, pt, u, p, db = 'localhost', 3306, 'root', 'root', 'test'


def mysql_fetch(sql, res_type='tuple'):
    global h, pt, u, p, db
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return ()
    if res_type == 'dic':
        cursor = conn.cursor(pymysql.cursors.DictCursor)
    else:
        cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    rows = cursor.fetchall()
    cursor.close()
    conn.close()
    return rows


def mysql_write(sql):
    global h, pt, u, p, db
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return 1
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    cursor.close()
    conn.close()
    return 0


browser = webdriver.Chrome()
f_url_l = ['https://www.baidu.com/', 'https://www.so.com/']
f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
browser.get(f_url_l_a)
time.sleep(random.randint(1, 2))
url = 'https://so.gushiwen.org/shiwenv_5a36a4613434.aspx'
js = 'window.location.href="{}";'.format(url)
browser.execute_script(js)
# img_url = 'https://img.gushiwen.org/authorImg/daishulun.jpg'
myhtml = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
with open(myhtml, 'w', encoding='utf-8') as fw:
    fw.write(browser.page_source)
sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
    bs = BeautifulSoup(myhtml_o, 'html.parser')
url_l = [i.attrs['href'] for i in bs.find_all('a')]
res_l = []
sql_l = []
for i in url_l:
    if 'gushiwen.org' in i and 'javascript' not in i and 'http' in i and i[0:4] == 'http':
        # sql_chk = 'SELECT * FROM parent_url WHERE page_url="{}" AND children_url="{}"'.format(url, i)
        # r = mysql_fetch(sql_chk)
        # if len(r) > 0:
        #     continue
        if i not in res_l:
            if i == url:
                continue
            res_l.append(i)
            s = '("{}","{}","{}")'.format(browser.title, url, i)
            sql_l.append(s)
if len(sql_l) > 0:
    sql = '{}{}'.format(sql, ','.join(sql_l))
    print(sql)
    mysql_write(sql)

sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )'
print(sql_del)
mysql_write(sql_del)

import random

# https://www.gushiwen.org/FileNotFound.htm?aspxerrorpath=/user/findpwd.aspx
# https://so.gushiwen.org/user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
# https://so.gushiwen.org/app/
url_kw_filter_l = ['FileNotFound', 'findpwd', '/app/']

while True:
    sql_filter = "DELETE FROM parent_url WHERE id IN ( SELECT id FROM ( SELECT id FROM parent_url WHERE INSTR(page_title,'密码')>0 OR INSTR(UPPER(page_url),'PWD')>0 OR INSTR(UPPER(children_url),'PWD')>0) AS t);"
    mysql_write(sql_filter)
    sql_ori = 'SELECT page_url,children_url FROM parent_url WHERE if_spider=0'
    res = mysql_fetch(sql_ori, 'dic')
    jump_c, jump_s = 0, random.randint(0, max(0, len(res) - 10))
    for d in res:
        jump_c += 1
        if jump_c < jump_s:
            continue
        page_url, children_url = d['page_url'], d['children_url']
        url = children_url
        continue_ = False
        for fl in url_kw_filter_l:
            if fl in url:
                continue_ = True
                break
        if continue_:
            continue
        js = 'window.location.href="{}";'.format(url)
        browser.execute_script(js)
        time.sleep(1)
        browser.refresh()
        myhtml = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
        with open(myhtml, 'w', encoding='utf-8') as fw:
            fw.write(browser.page_source)
        sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
        with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
            bs = BeautifulSoup(myhtml_o, 'html.parser')
        try:
            url_l = [i.attrs['href'] for i in bs.find_all('a')]
        except Exception as e:
            print(e)
            continue
        res_l = []
        sql_l = []
        for i in url_l:
            # /user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
            if 'gushiwen.org' in i and 'javascript' not in i and 'http' in i and i[0:4] == 'http':
                # sql_chk = 'SELECT * FROM parent_url WHERE page_url="{}" AND children_url="{}"'.format(url, i)
                # print(sql_chk)
                # r = mysql_fetch(sql_chk)
                # print(r)
                # if len(r) > 0:
                #     continue
                if i not in res_l:
                    if i == url:
                        continue
                    res_l.append(i)
                    s = '("{}","{}","{}")'.format(browser.title, url, i)
                    # sql_break = '{}{}'.format(sql, s)
                    # print(sql_break)
                    # mysql_write(sql_break)
                    # print(s)
                    sql_l.append(s)
        if len(sql_l) > 0:
            sql = '{}{}'.format(sql, ','.join(sql_l))
            print(sql)
            mysql_write(sql)
        sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )'
        print(sql_del)
        mysql_write(sql_del)
        sql_udp = 'UPDATE parent_url SET if_spider=1 WHERE page_url="{}" AND children_url="{}"'.format(page_url, url)
        mysql_write(sql_udp)
        print(sql_udp)
        time.sleep(3)

dd = 0

  

Modularizing each functional point of the code.
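
A rough illustration of what such a module could look like; the function name and signature are my own, not taken from the script. The href extraction and filtering that every version repeats could live in one function shared by the seed crawl and the main loop:

from bs4 import BeautifulSoup


def extract_links(html_path, current_url, keyword_filters=()):
    """Collect in-site links from a saved HTML snapshot.

    Keeps only absolute gushiwen.org links; drops javascript: pseudo-links,
    the page itself, anything containing a filtered keyword, and duplicates.
    """
    with open(html_path, 'r', encoding='utf-8') as fh:
        bs = BeautifulSoup(fh, 'html.parser')
    links = []
    for a in bs.find_all('a'):
        href = a.attrs.get('href', '')
        if 'gushiwen.org' not in href or not href.startswith('http'):
            continue
        if 'javascript' in href or href == current_url:
            continue
        if any(k.upper() in href.upper() for k in keyword_filters):
            continue
        if href not in links:
            links.append(href)
    return links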

from selenium import webdriver
import time
import random
from bs4 import BeautifulSoup
import pymysql

h, pt, u, p, db = 'localhost', 3306, 'root', 'root', 'test'


def mysql_fetch(sql, res_type='tuple'):
    global h, pt, u, p, db
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return ()
    if res_type == 'dic':
        cursor = conn.cursor(pymysql.cursors.DictCursor)
    else:
        cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    rows = cursor.fetchall()
    cursor.close()
    conn.close()
    return rows


def mysql_write(sql):
    global h, pt, u, p, db
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return 1
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    cursor.close()
    conn.close()
    return 0


browser = webdriver.Chrome()
f_url_l = ['https://www.baidu.com/', 'https://www.so.com/']
f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
browser.get(f_url_l_a)
time.sleep(random.randint(1, 2))
url = 'https://so.gushiwen.org/shiwenv_5a36a4613434.aspx'
js = 'window.location.href="{}";'.format(url)
browser.execute_script(js)
# img_url = 'https://img.gushiwen.org/authorImg/daishulun.jpg'
myhtml = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
with open(myhtml, 'w', encoding='utf-8') as fw:
    fw.write(browser.page_source)
sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
    bs = BeautifulSoup(myhtml_o, 'html.parser')
url_l = [i.attrs['href'] for i in bs.find_all('a')]
res_l = []
sql_l = []
for i in url_l:
    if 'gushiwen.org' in i and 'javascript' not in i and 'http' in i and i[0:4] == 'http':
        # sql_chk = 'SELECT * FROM parent_url WHERE page_url="{}" AND children_url="{}"'.format(url, i)
        # r = mysql_fetch(sql_chk)
        # if len(r) > 0:
        #     continue
        if i not in res_l:
            if i == url:
                continue
            res_l.append(i)
            s = '("{}","{}","{}")'.format(browser.title, url, i)
            sql_l.append(s)
if len(sql_l) > 0:
    sql = '{}{}'.format(sql, ','.join(sql_l))
    print(sql)
    mysql_write(sql)

sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )'
print(sql_del)
mysql_write(sql_del)

import random

# https://www.gushiwen.org/FileNotFound.htm?aspxerrorpath=/user/findpwd.aspx
# https://so.gushiwen.org/user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
# https://so.gushiwen.org/app/
# https://so.gushiwen.org/jiucuo.aspx?u=
url_kw_filter_l = ['FileNotFound', 'findpwd', '/app/', '/jiucuo.aspx']
sql_filter = "DELETE FROM parent_url WHERE id IN ( SELECT id FROM ( SELECT id FROM parent_url WHERE INSTR(page_title,'密码')>0 OR INSTR(UPPER(page_url),'PWD')>0 OR INSTR(UPPER(children_url),'PWD')>0) AS t);"
sql_filter = "DELETE FROM parent_url WHERE id IN ( SELECT id FROM ( SELECT id FROM parent_url WHERE MYWHERE ) AS t);"
print(sql_filter)
sql_s_l = []
for i in url_kw_filter_l:
    ii = i.upper()
    s = " INSTR(UPPER(page_url),'{}')>0 OR INSTR(UPPER(children_url),'{}')>0 ".format(ii, ii)
    sql_s_l.append(s)
sql_filter = sql_filter.replace('MYWHERE', ' OR '.join(sql_s_l))

while True:
    mysql_write(sql_filter)
    sql_ori = 'SELECT page_url,children_url FROM parent_url WHERE if_spider=0'
    res = mysql_fetch(sql_ori, 'dic')
    jump_c, jump_s = 0, random.randint(0, max(0, len(res) - 10))
    for d in res:
        jump_c += 1
        if jump_c < jump_s:
            continue
        page_url, children_url = d['page_url'], d['children_url']
        url = children_url
        js = 'window.location.href="{}";'.format(url)
        browser.execute_script(js)
        time.sleep(1)
        browser.refresh()
        myhtml = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
        with open(myhtml, 'w', encoding='utf-8') as fw:
            fw.write(browser.page_source)
        sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
        with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
            bs = BeautifulSoup(myhtml_o, 'html.parser')
        try:
            url_l = [i.attrs['href'] for i in bs.find_all('a')]
        except Exception as e:
            print(e)
            continue
        res_l = []
        sql_l = []
        for i in url_l:
            # /user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
            if 'gushiwen.org' in i and 'javascript' not in i and 'http' in i and i[0:4] == 'http':
                # sql_chk = 'SELECT * FROM parent_url WHERE page_url="{}" AND children_url="{}"'.format(url, i)
                # print(sql_chk)
                # r = mysql_fetch(sql_chk)
                # print(r)
                # if len(r) > 0:
                #     continue
                if i not in res_l:
                    if i == url:
                        continue
                    res_l.append(i)
                    s = '("{}","{}","{}")'.format(browser.title, url, i)
                    # sql_break = '{}{}'.format(sql, s)
                    # print(sql_break)
                    # mysql_write(sql_break)
                    # print(s)
                    sql_l.append(s)
        if len(sql_l) > 0:
            sql = '{}{}'.format(sql, ','.join(sql_l))
            print(sql)
            mysql_write(sql)
        sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )'
        print(sql_del)
        mysql_write(sql_del)
        sql_udp = 'UPDATE parent_url SET if_spider=1 WHERE page_url="{}" AND children_url="{}"'.format(page_url, url)
        mysql_write(sql_udp)
        print(sql_udp)
        time.sleep(3)

dd = 0

  

from selenium import webdriver
import time
import random
from bs4 import BeautifulSoup
import pymysql

h, pt, u, p, db = 'localhost', 3306, 'root', 'root', 'test'


def mysql_fetch(sql, res_type='tuple'):
    global h, pt, u, p, db
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return ()
    if res_type == 'dic':
        cursor = conn.cursor(pymysql.cursors.DictCursor)
    else:
        cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    rows = cursor.fetchall()
    cursor.close()
    conn.close()
    return rows


def mysql_write(sql):
    global h, pt, u, p, db
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return 1
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    cursor.close()
    conn.close()
    return 0


browser = webdriver.Chrome()
f_url_l = ['https://www.baidu.com/', 'https://www.so.com/']
f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
browser.get(f_url_l_a)
time.sleep(random.randint(1, 2))
url = 'https://so.gushiwen.org/shiwenv_5a36a4613434.aspx'
js = 'window.location.href="{}";'.format(url)
browser.execute_script(js)
# img_url = 'https://img.gushiwen.org/authorImg/daishulun.jpg'
myhtml = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
with open(myhtml, 'w', encoding='utf-8') as fw:
    fw.write(browser.page_source)
sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
    bs = BeautifulSoup(myhtml_o, 'html.parser')
url_l = [i.attrs['href'] for i in bs.find_all('a')]
res_l = []
sql_l = []
for i in url_l:
    if 'gushiwen.org' in i and 'javascript' not in i and 'http' in i and i[0:4] == 'http':
        # sql_chk = 'SELECT * FROM parent_url WHERE page_url="{}" AND children_url="{}"'.format(url, i)
        # r = mysql_fetch(sql_chk)
        # if len(r) > 0:
        #     continue
        if i not in res_l:
            if i == url:
                continue
            res_l.append(i)
            s = '("{}","{}","{}")'.format(browser.title, url, i)
            sql_l.append(s)
if len(sql_l) > 0:
    sql = '{}{}'.format(sql, ','.join(sql_l))
    print(sql)
    mysql_write(sql)

sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )'
print(sql_del)
mysql_write(sql_del)

import random

# https://www.gushiwen.org/FileNotFound.htm?aspxerrorpath=/user/findpwd.aspx
# https://so.gushiwen.org/user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
# https://so.gushiwen.org/app/
# https://so.gushiwen.org/jiucuo.aspx?u=
url_kw_filter_l = ['FileNotFound', 'findpwd', '/app/', '/jiucuo.aspx']
sql_filter = "DELETE FROM parent_url WHERE id IN ( SELECT id FROM ( SELECT id FROM parent_url WHERE INSTR(page_title,'密码')>0 OR INSTR(UPPER(page_url),'PWD')>0 OR INSTR(UPPER(children_url),'PWD')>0) AS t);"
sql_filter = "DELETE FROM parent_url WHERE id IN ( SELECT id FROM ( SELECT id FROM parent_url WHERE MYWHERE ) AS t);"
sql_s_l = []
for i in url_kw_filter_l:
    ii = i.upper()
    s = " INSTR(UPPER(page_url),'{}')>0 OR INSTR(UPPER(children_url),'{}')>0 ".format(ii, ii)
    sql_s_l.append(s)
sql_filter = sql_filter.replace('MYWHERE', ' OR '.join(sql_s_l))

while True:
    mysql_write(sql_filter)
    print(sql_filter)
    sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )'
    print(sql_del)
    mysql_write(sql_del)
    sql_pass = 'SELECT DISTINCT(page_url) FROM parent_url'
    url_pass = ['"{}"'.format(i[0]) for i in mysql_fetch(sql_pass, res_type='tuple')]
    # optimistic code
    sql_ori = 'SELECT page_url,children_url FROM parent_url WHERE if_spider=0 AND children_url NOT IN ({})'.format(','.join(url_pass))
    res = mysql_fetch(sql_ori, 'dic')
    jump_c, jump_s = 0, random.randint(0, max(0, len(res) - 10))
    for d in res:
        jump_c += 1
        if jump_c < jump_s:
            continue
        page_url, children_url = d['page_url'], d['children_url']
        url = children_url
        js = 'window.location.href="{}";'.format(url)
        browser.execute_script(js)
        # time.sleep(1)
        # browser.refresh()
        try:
            for isc in range(1):
                time.sleep(1)
                js = 'window.scrollTo(0,document.body.scrollHeight)'
                browser.execute_script(js)
        except Exception as e:
            print('window.scrollTo-->', e)
        myhtml = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
        with open(myhtml, 'w', encoding='utf-8') as fw:
            fw.write(browser.page_source)
        sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
        with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
            bs = BeautifulSoup(myhtml_o, 'html.parser')
        try:
            url_l = [i.attrs['href'] for i in bs.find_all('a')]
        except Exception as e:
            print(e)
            continue
        res_l = []
        sql_l = []
        for i in url_l:
            # /user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
            if 'gushiwen.org' in i and 'javascript' not in i and 'http' in i and i[0:4] == 'http':
                # sql_chk = 'SELECT * FROM parent_url WHERE page_url="{}" AND children_url="{}"'.format(url, i)
                # print(sql_chk)
                # r = mysql_fetch(sql_chk)
                # print(r)
                # if len(r) > 0:
                #     continue
                if i not in res_l:
                    if i == url:
                        continue
                    continue_ = False
                    for fi in url_kw_filter_l:
                        ii = fi.upper()
                        if ii in i.upper():
                            continue_ = True
                            break
                    if continue_:
                        continue
                    res_l.append(i)
                    s = '("{}","{}","{}")'.format(browser.title, url, i)
                    # sql_break = '{}{}'.format(sql, s)
                    # print(sql_break)
                    # mysql_write(sql_break)
                    # print(s)
                    sql_l.append(s)
        if len(sql_l) > 0:
            sql = '{}{}'.format(sql, ','.join(sql_l))
            print(sql)
            mysql_write(sql)
        sql_udp = 'UPDATE parent_url SET if_spider=1 WHERE page_url="{}" AND children_url="{}"'.format(page_url, url)
        mysql_write(sql_udp)
        print(sql_udp)

dd = 0

  

from selenium import webdriver
import time
import random
from bs4 import BeautifulSoup
import pymysql

h, pt, u, p, db = 'localhost', 3306, 'root', 'root', 'test'


def mysql_fetch(sql, res_type='tuple'):
    global h, pt, u, p, db
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return ()
    if res_type == 'dic':
        cursor = conn.cursor(pymysql.cursors.DictCursor)
    else:
        cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    rows = cursor.fetchall()
    cursor.close()
    conn.close()
    return rows


def mysql_write(sql):
    global h, pt, u, p, db
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return 1
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    cursor.close()
    conn.close()
    return 0


browser = webdriver.Chrome()
f_url_l = ['https://www.baidu.com/', 'https://www.so.com/']
f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
browser.get(f_url_l_a)
time.sleep(random.randint(1, 2))
url = 'https://so.gushiwen.org/shiwenv_5a36a4613434.aspx'
js = 'window.location.href="{}";'.format(url)
browser.execute_script(js)
# img_url = 'https://img.gushiwen.org/authorImg/daishulun.jpg'
myhtml = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
with open(myhtml, 'w', encoding='utf-8') as fw:
    fw.write(browser.page_source)
sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
    bs = BeautifulSoup(myhtml_o, 'html.parser')
url_l = [i.attrs['href'] for i in bs.find_all('a')]
res_l = []
sql_l = []
for i in url_l:
    break
    if 'gushiwen.org' in i and 'javascript' not in i and 'http' in i and i[0:4] == 'http':
        # sql_chk = 'SELECT * FROM parent_url WHERE page_url="{}" AND children_url="{}"'.format(url, i)
        # r = mysql_fetch(sql_chk)
        # if len(r) > 0:
        #     continue
        if i not in res_l:
            if i == url:
                continue
            res_l.append(i)
            s = '("{}","{}","{}")'.format(browser.title, url, i)
            sql_l.append(s)
if len(sql_l) > 0:
    sql = '{}{}'.format(sql, ','.join(sql_l))
    print(sql)
    mysql_write(sql)

sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )'
print(sql_del)
mysql_write(sql_del)

import random

# https://www.gushiwen.org/FileNotFound.htm?aspxerrorpath=/user/findpwd.aspx
# https://so.gushiwen.org/user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
# https://so.gushiwen.org/app/
# https://so.gushiwen.org/jiucuo.aspx?u=
url_kw_filter_l = ['FileNotFound', 'findpwd', '/app/', '/jiucuo.aspx']
sql_filter = "DELETE FROM parent_url WHERE id IN ( SELECT id FROM ( SELECT id FROM parent_url WHERE INSTR(page_title,'密码')>0 OR INSTR(UPPER(page_url),'PWD')>0 OR INSTR(UPPER(children_url),'PWD')>0) AS t);"
sql_filter = "DELETE FROM parent_url WHERE id IN ( SELECT id FROM ( SELECT id FROM parent_url WHERE MYWHERE ) AS t);"
sql_s_l = []
for i in url_kw_filter_l:
    ii = i.upper()
    s = " INSTR(UPPER(page_url),'{}')>0 OR INSTR(UPPER(children_url),'{}')>0 ".format(ii, ii)
    sql_s_l.append(s)
sql_filter = sql_filter.replace('MYWHERE', ' OR '.join(sql_s_l))

while True:
    mysql_write(sql_filter)
    print(sql_filter)
    sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )'
    print(sql_del)
    mysql_write(sql_del)
    sql_pass = 'SELECT DISTINCT(page_url) FROM parent_url'
    url_pass = ['"{}"'.format(i[0]) for i in mysql_fetch(sql_pass, res_type='tuple')]
    # optimistic code
    sql_ori = 'SELECT page_url,children_url FROM parent_url WHERE if_spider=0 AND children_url NOT IN ({})'.format(','.join(url_pass))
    res = mysql_fetch(sql_ori, 'dic')
    jump_c, jump_s = 0, random.randint(0, max(0, len(res) - 10))
    for d in res:
        jump_c += 1
        if jump_c < jump_s:
            continue
        page_url, children_url = d['page_url'], d['children_url']
        url = children_url
        js = 'window.location.href="{}";'.format(url)
        browser.execute_script(js)
        # time.sleep(1)
        # browser.refresh()
        try:
            for isc in range(1):
                time.sleep(1)
                js = 'window.scrollTo(0,document.body.scrollHeight)'
                browser.execute_script(js)
        except Exception as e:
            print('window.scrollTo-->', e)
        myhtml = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
        with open(myhtml, 'w', encoding='utf-8') as fw:
            fw.write(browser.page_source)
        sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
        with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
            bs = BeautifulSoup(myhtml_o, 'html.parser')
        try:
            url_l = [i.attrs['href'] for i in bs.find_all('a')]
        except Exception as e:
            print(e)
            continue
        res_l = []
        sql_l = []
        for i in url_l:
            # /user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
            if 'gushiwen.org' in i and 'javascript' not in i and 'http' in i and i[0:4] == 'http':
                # sql_chk = 'SELECT * FROM parent_url WHERE page_url="{}" AND children_url="{}"'.format(url, i)
                # print(sql_chk)
                # r = mysql_fetch(sql_chk)
                # print(r)
                # if len(r) > 0:
                #     continue
                if i not in res_l:
                    if i == url:
                        continue
                    continue_ = False
                    for fi in url_kw_filter_l:
                        ii = fi.upper()
                        if ii in i.upper():
                            continue_ = True
                            break
                    if continue_:
                        continue
                    res_l.append(i)
                    s = '("{}","{}","{}")'.format(browser.title, url, i)
                    # sql_break = '{}{}'.format(sql, s)
                    # print(sql_break)
                    # mysql_write(sql_break)
                    # print(s)
                    sql_l.append(s)
        if len(sql_l) > 0:
            sql = '{}{}'.format(sql, ','.join(sql_l))
            print(sql)
            mysql_write(sql)
        sql_udp = 'UPDATE parent_url SET if_spider=1 WHERE page_url="{}" AND children_url="{}"'.format(page_url, url)
        mysql_write(sql_udp)
        print(sql_udp)

dd = 0

  

Ordered (parent, child) URL pairs.
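
Every row of parent_url is one such ordered pair, so the table is effectively the edge list of the crawled link graph. A short sketch of reading it back into an adjacency list, using the same connection settings as the scripts above:

import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       passwd='root', db='test', charset='utf8')
with conn.cursor() as cur:
    cur.execute('SELECT page_url, children_url FROM parent_url')
    edges = cur.fetchall()
conn.close()

# adjacency list: parent page -> list of child URLs
graph = {}
for parent, child in edges:
    graph.setdefault(parent, []).append(child)
print(len(graph), 'distinct parent pages,', len(edges), 'edges')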
