本源码仅供测试,发包有风险,优化还是踏实的好!本代码是本人自己学习python练手作品!
  附上代码:

# -*- coding: utf-8 -*-
import os
import random
import socket
import time
import traceback
import urllib.request

import pymysql
import requests
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
#import win32api #pip install pypiwin32
 
#from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
#DesiredCapabilities.INTERNETEXPLORER['ignoreProtectedModeSettings'] = True
 
 
 
#rasdial 宽带连接 19ab68----643534
def connect():
    """Dial the connection described by the module-level ``g_adsl_account``.

    Runs ``rasdial <name> <username> <password>`` through ``os.system`` and
    then sleeps for five seconds before returning.
    """
    account = g_adsl_account
    dial_args = (account['name'], account['username'], account['password'])
    os.system("rasdial %s %s %s" % dial_args)
    # Fixed pause after issuing the dial command.
    time.sleep(5)
 
 
#"rasdial 断开宽带连接 /disconnect"
def disconnect():
    """Hang up the connection named in the module-level ``g_adsl_account``.

    Runs ``rasdial <name> /disconnect`` through ``os.system`` and then sleeps
    for five seconds before returning.
    """
    entry_name = g_adsl_account['name']
    os.system("rasdial %s /disconnect" % entry_name)
    # Fixed pause after issuing the hang-up command.
    time.sleep(5)
     
#获取ip地址
def get_ip():
    """Fetch the current public IP and its location text from ip.chinaz.com.

    Returns:
        A two-item list ``[ip, address]`` sliced out of the response body.

    The slices use fixed offsets around the first occurrences of ``ip`` and
    ``address``; this presumably expects a body shaped like
    ``{ip:'1.2.3.4',address:'...'}`` -- TODO confirm against the live
    service.  No validation is done: if either marker is missing,
    ``str.find`` returns -1 and the returned strings are meaningless.

    Network errors (including the timeout) propagate to the caller.
    """
    # The context manager closes the response even when read()/decode()
    # raises; the timeout keeps a dead link from blocking the caller forever.
    with urllib.request.urlopen("http://ip.chinaz.com/getip.aspx", timeout=15) as fp:
        # read() returns bytes in Python 3, so decode before searching.
        mystr = fp.read().decode("utf8")

    ip_pos = mystr.find("ip")
    addr_pos = mystr.find("address")
    ip = mystr[ip_pos + 4:addr_pos - 2]
    address = mystr[addr_pos + 9:-2]
    return [ip, address]
 
#将ip地址插入数据库
def insert_db(ipdate):
    """Insert one run record into table ``ip_log`` of database ``zongzong``.

    Args:
        ipdate: Sequence of seven values in column order
            ``ip, address, keyword, url, error, page, rank``.  The current
            local time is added as the eighth value (``created_at``).  The
            sequence itself is not modified.

    Any exception raised by pymysql propagates to the caller; the cursor and
    connection are closed first.
    """
    # Work on a copy so the caller's list is not silently extended.
    row = list(ipdate)
    # Explicit %H:%M:%S instead of the locale-dependent %X, so the timestamp
    # always has a form MySQL accepts.
    row.append(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))

    conn = pymysql.connect(host='localhost', user='root', passwd='', port=3306, charset='utf8')
    try:
        cur = conn.cursor()
        try:
            cur.execute("USE zongzong")
            # `rank` is a reserved word from MySQL 8.0 on, so it is quoted;
            # backticks are accepted by older servers as well.
            cur.execute(
                "INSERT INTO ip_log(ip,address,keyword,url,error,page,`rank`,created_at) "
                "VALUES(%s, %s, %s, %s, %s, %s, %s, %s)",
                row,
            )
        finally:
            cur.close()
        conn.commit()
    finally:
        # Release the connection even when a statement above failed.
        conn.close()
 
 
#获取搜索出来的url
def get_search_url(driver):
    """Collect the landing URLs behind the result links on the current page.

    Every ``<a class="c-showurl">`` inside ``div#content_left`` is resolved
    with an HTTP HEAD request, and the ``Location`` response header is taken
    as the real URL.  A link is dropped when that URL contains any entry of
    the module-level ``out_urls``.

    Args:
        driver: Selenium WebDriver currently showing a results page.

    Returns:
        ``[real_url, click_link]`` -- two parallel lists holding the resolved
        URLs and the anchor elements they came from.

    Errors from ``requests`` (including the timeout) propagate to the caller.
    """
    real_url = []
    click_link = []
    content = driver.find_element_by_css_selector("div[id=\"content_left\"]")
    for link in content.find_elements_by_tag_name("a"):
        if link.get_attribute('class') != "c-showurl":
            continue
        url = link.get_attribute('href')
        if not url:
            # Nothing to resolve for an anchor without an href.
            continue

        # requests.head() does not follow redirects by default, so the
        # redirect target stays readable in the Location header.
        location = requests.head(url, timeout=10).headers.get('location')
        if location is None:
            # Not a redirect.  Indexing the header used to raise KeyError
            # here and abort the whole page; skip just this link instead.
            continue
        if any(out_url in location for out_url in out_urls):
            continue

        real_url.append(location)
        # Keep the anchor element so the caller can click it later.
        click_link.append(link)

    return [real_url, click_link]
     
     
#function:解析加密url,剔除竞争对手的url
# def get_real_url(urls):
    # real_url = []
    # for url in urls:
        # header = requests.head(url).headers
        # is_append = True
        # for out_url in out_urls:
            # if out_url in header['location']:
                # is_append = False
                # break
             
        # if is_append == True:
            # real_url.append(header['location'])
    # return real_url
 
#function 目标地址是否在某个list中
def get_urlIndex(tagurl,urls):
    """Return the position of the first entry of ``urls`` containing ``tagurl``.

    The match is a substring test (``tagurl in url``).  Returns -1 when no
    entry matches.
    """
    matches = (pos for pos, candidate in enumerate(urls) if tagurl in candidate)
    return next(matches, -1)
 
     
#点击百度搜索内容下面的下一页
def click_nextBtn(driver):
    """Click the first "下一页>" link inside ``div#page`` and return the driver.

    The text of each anchor examined is printed.  If no anchor carries that
    text, nothing is clicked.

    Args:
        driver: Selenium WebDriver currently showing a results page.

    Returns:
        The same ``driver`` object that was passed in.
    """
    pager = driver.find_element_by_css_selector("div[id=\"page\"]")
    for item in pager.find_elements_by_tag_name("a"):
        label = item.text
        print(label)
        if label == "下一页>":
            item.click()
            # Stop here: the click navigates away, so the remaining anchors
            # belong to the old page and reading them can raise a
            # stale-element error.
            break

    return driver
 
             
 
 
#随机点击
def click_search_url(driver,items):
    """Click selected result links and close every window they open.

    Walks the ``<a class="c-showurl">`` anchors inside ``div#content_left``
    in order, numbering them from 0.  For each number contained in ``items``
    the anchor is clicked and the function sleeps 5-10 seconds; then every
    window other than the starting one is switched to, closed, and focus is
    returned to the starting window.

    Args:
        driver: Selenium WebDriver currently showing a results page.
        items: Container of 0-based positions to click.
    """
    result_area = driver.find_element_by_css_selector("div[id=\"content_left\"]")
    anchors = result_area.find_elements_by_tag_name("a")
    # Remember the results window so focus can be returned to it.
    home_window = driver.current_window_handle

    position = -1
    for anchor in anchors:
        if anchor.get_attribute('class') != "c-showurl":
            continue
        position += 1
        if position not in items:
            continue

        print("随机点击item:",position)
        print(anchor.get_attribute('href'),anchor.text)
        anchor.click()
        # Stay on the opened page for a random 5-10 seconds.
        time.sleep(random.randint(5,10))

        # Close whatever windows exist besides the results window.
        for window in driver.window_handles:
            if window == home_window:
                continue
            print("切换到当前窗口")
            driver.switch_to_window(window)
            print("title:",driver.title)
            driver.close()
            print("切换到原来的窗口")
            driver.switch_to_window(home_window)
            print("title:",driver.title)
        print("本次随机点击完毕!")
 
             
#获取随机点击的搜索结果序号
def get_random_index(index,len):
    """Choose which result positions to click at random.

    Args:
        index: 0-based position of the target on the page, or -1 when the
            target is not on the page.
        len: Number of clickable results on the page.  The name shadows the
            builtin but is kept so existing callers are unaffected.

    Returns:
        A list of zero, one or two 0-based positions:

        * ``index >= 8``: one position from 0-4 and one from 5-8;
        * ``4 <= index < 8``: one from 0-3 and one from 3-``index``;
        * ``0 <= index < 4``: ``[index]``;
        * ``index == -1``: one position from 0..len-1 when ``len <= 5``,
          otherwise one from 5..len-1;
        * anything else (including an empty page): ``[]``.
    """
    count = len  # the builtin len() is shadowed by the parameter in here

    if index >= 8:
        return [random.randint(0,4), random.randint(5,8)]
    if index >= 4:
        return [random.randint(0,3), random.randint(3,index)]
    if index >= 0:
        return [index]
    if index == -1 and count > 0:
        # randint() includes its upper bound, so the last valid position is
        # count - 1.  The old bounds (5 and count) could name a position past
        # the end of the list.
        if count <= 5:
            return [random.randint(0, count - 1)]
        return [random.randint(5, count - 1)]
    # Empty page or unexpected index: nothing to click.  This path used to
    # raise UnboundLocalError for index < -1.
    return []
 
 
 
def getUA():
    """Return one User-Agent string picked at random from a fixed pool.

    Entries that are commented out are not part of the pool.
    """
    candidates = (
        # 360 browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
        # Chrome
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",
        #"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        # Firefox
        #"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:36.0) Gecko/20100101 Firefox/36.0",
        # IE 11
        #"Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        # IE 8
        #"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; 4399Box.1357; 4399Box.1253; 4399Box.1357)",
        # 2345 Explorer
        #"Chrome/39.0.2171.99 Safari/537.36 2345Explorer/6.5.0.11018",
        # Sogou
        #"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0",
        # Opera
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
    )
    return random.choice(candidates)
 
#屏幕浏览器窗口大小
def getWindowSize():
    """Return a random ``[width, height]`` list from three preset sizes."""
    presets = (
        [1920,1080],
        [1600,900],
        [1280,720],
    )
    return random.choice(presets)
     
     
#屏幕分辨率设置
def setDisplay():
    """Switch the display to a resolution picked at random from a fixed list.

    Reads a display-settings structure with ``win32api.EnumDisplaySettings``,
    overwrites its width, height and colour depth (32 bits per pixel), and
    applies it with ``win32api.ChangeDisplaySettings``.

    Requires the ``win32api`` module (the file's own note: ``pip install
    pypiwin32``).
    """
    # The file-level `import win32api` is commented out, so without this
    # local import the function fails with NameError as soon as it is called.
    import win32api

    display_size = [
        [1920,1080],
        [1680,1050],
        [1600,900],
        [1440,900],
        [1400,1050]
    ]
    width, height = random.choice(display_size)

    dm = win32api.EnumDisplaySettings(None, 0)
    dm.PelsWidth = width
    dm.PelsHeight = height
    dm.BitsPerPel = 32
    dm.DisplayFixedOutput = 0
    win32api.ChangeDisplaySettings(dm, 0)
 
     
#拨号 19ab68----643534       
# Dial-up entry used by connect() / disconnect().
# NOTE(review): the credentials are hard-coded in the source; consider
# loading them from the environment or a config file that is not shared.
g_adsl_account = dict(
    name="宽带连接",
    username="19ab68",
    password="643534",
)
 
 
#屏蔽点击的地址(竞争对手)
# Substrings checked against each resolved result URL in get_search_url();
# a result whose URL contains any of them is dropped.
out_urls = [
    'zhimo.yuanzhumuban.cc',
    'bbs.yuanzhumuban.cc',
    'http://money.163.com/15/0416/11/ANANRECC00253B0H.html',
]
 
 
##内页词
# One entry per run of the main loop: [target URL fragment, search keyword].
# Any extra items after the first two are used by the main loop as "wrong"
# keywords to search first.  Commented-out entries are disabled.
targetURL = [
    ['http://www.hkuws.com','注册离岸公司'],
    ['zs.efu.com.cn/mornfeeit/','梦菲雪'],
    ['zs.efu.com.cn/chengshijiaren/','城市佳人'],
    ['www.kidsnet.cn/exposition','童装展会'],
    #['top.kidsnet.cn/','童装加盟排行榜'],
    #['www.nynet.com.cn/','内衣网'],
    #['www.nzw.cn/','女装网'],
    ['zs.efu.com.cn/ks/','卡索'],
    ['zs.efu.com.cn/distin-kidny/','迪斯廷凯'],
    # The keyword used to read '代{过}{滤}理': the '{过}{滤}' markers look like
    # a forum word-filter artefact inserted into '代理', not part of the
    # intended search term.
    ['zs.efu.com.cn/fuzhuang/luyidigao/','路易迪高童装代理'],
    ['brand.efu.com.cn/brandshow-1221090.html','凯帝龙驰'],
    ['zs.efu.com.cn/rabbitjero/','兔子杰罗'],
    ['zs.efu.com.cn/wmprince/','西瓜王子'],
    ['zs.efu.com.cn/betu','百图'],
    ['zs.efu.com.cn/pepco/','小猪班纳'],

    #['http://news.ifeng.com/a/20160518/48795120_0.shtml','华夏信财'],
    ['http://weibo.com/huaxiafinance','华夏信财'],
    ['http://p2p.hexun.com/2016-04-26/183531215.html','华夏信财'],
    #['http://news.xinhuanet.com/fortune/2016-04/26/c_128932834.htm','华夏信财'],
    ['http://www.xcf.cn/gdyw/201605/t20160526_772682.htm','华夏信财'],
    ['http://www.huaxiaoxia.com/','华夏信财'],
    #['https://lc.huaxiafinance.com/','华夏信财'],

    ['so.tedu.cn','网络营销培训机构'],
    ['www.cosatto.net.cn','个性安全座椅'],
    ['www.kaihuata.com/','开化旅游'],
    #['www.kaihuata.com/','开化'],
]
 
 
# Main loop: one browser session per entry of targetURL.
for targetInfo in targetURL:
    # Read the pair up front so the failure record written by the except
    # branch always names the entry being processed.  These used to be
    # assigned midway through the try body, so an early failure logged the
    # previous entry's values (or hit NameError on the first entry).
    target = targetInfo[0]
    keyword = targetInfo[1]
    # Tracks the live browser so the error path can shut it down.
    driver = None
    try:
        # Redial to change the connection.
        disconnect()
        connect()

        # Keep redialling until DNS resolution works again.
        while True:
            try:
                socket.gethostbyname("baidu.com")
                break
            except OSError:
                disconnect()
                connect()
        # Change the screen resolution (disabled).
        #setDisplay()

        # Start PhantomJS with a random user agent and image loading off.
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        user_agent = getUA()
        print(user_agent)
        dcap["phantomjs.page.settings.userAgent"] = user_agent
        #dcap["phantomjs.page.settings.resourceTimeout"] = (15000)
        dcap["phantomjs.page.settings.loadImages"] = False
        driver = webdriver.PhantomJS(desired_capabilities=dcap,service_args=['--load-images=no'])

        driver.implicitly_wait(30)

        # Clear cookies.
        driver.delete_all_cookies()

        # Open Baidu.
        driver.get("http://www.baidu.com/")

        # Set a random browser window size.
        window_size = getWindowSize()
        driver.set_window_size(window_size[0], window_size[1])

        print('打开百度成功',driver.title)
        print(">>>>>>>>>>>>>>>点击的关键词:",keyword,"--->目标地址:",target,">>>>>>>>>>>>>>>>>>>>")

        if len(targetInfo)>2:
            # Search one of the extra ("wrong") keywords first.
            error_keyword = targetInfo[random.randint(2,len(targetInfo)-1)]
            print("点击错误关键词:",error_keyword)
            driver.find_element_by_id("kw").send_keys(error_keyword)
            time.sleep(2)
            driver.find_element_by_id("su").click()
            time.sleep(5)
            driver.find_element_by_id("kw").clear()
            time.sleep(2)
            print("错误关键词点击完毕")

        driver.find_element_by_id("kw").send_keys(keyword)

        # Click the search button.
        print("...开始点击搜索按钮..")
        driver.find_element_by_id("su").click()
        print("...点击完毕..")
        time.sleep(2)

        # urls_res[0]: landing URLs, urls_res[1]: matching anchor elements.
        urls_res = get_search_url(driver)
        real_urls = urls_res[0]

        print("搜索出来的可点击着陆页个数:",len(real_urls))
        print(real_urls)
        index = get_urlIndex(target,real_urls)
        print("目标index:",index)

        page = 1
        while index == -1 and page <= 4:
            if page == 1:
                # Click some other results on the first page before moving on.
                items = get_random_index(index,len(real_urls))
                print(items)
                click_search_url(driver,items)

            # Go to the next results page and look again.
            driver = click_nextBtn(driver)
            time.sleep(3)
            urls_res = get_search_url(driver)
            real_urls = urls_res[0]
            print(real_urls)
            index = get_urlIndex(target,real_urls)

            page = page+1

        if index > 4 and page == 1:
            # Target is on the first page: click one or two other results.
            # (This local was called `int`, shadowing the builtin.)
            click_mode = random.randint(1,2)
            if click_mode == 2:
                items = get_random_index(index,len(real_urls))
            else:
                items = [1]
            print(items)
            click_search_url(driver,items)

        # Give up only when the target was really not found.  The old test
        # `page >= 5` also discarded a target found on the fifth page, which
        # the loop above does fetch and search.
        if index == -1:
            print("没有找到目标地址,放弃搜索...")
            print("关闭浏览器")
            driver.quit()
            driver = None

            time.sleep(5)
            data = get_ip()
            data.append(keyword)
            data.append(target)
            data.append("no_find")
            data.append(-1)
            data.append(-1)
            insert_db(data)
            continue

        print("目标在page",page,"当前排名:",index,real_urls[index])
        print("反问最后的目标页...")
        urls_res[1][index].click()
        time.sleep(5)

        nowhandle = driver.current_window_handle
        allhandles = driver.window_handles
        # Visit every other window, wait, close it and return to the results.
        for handle in allhandles:
            if handle != nowhandle:
                print("切换到当前窗口")
                driver.switch_to_window(handle)
                stime = random.randint(15,25)
                print("目标页title:",driver.title,"停留-->",stime)
                time.sleep(stime)
                # Close the window that was just visited.
                driver.close()

                # Return to the results window.
                print("切换到原来的窗口")
                driver.switch_to_window(nowhandle)
                print("title:",driver.title)

        # Print the cookies, clear them, and print again.
        print("打印cookie")
        cookie= driver.get_cookies()
        print(cookie)
        print("清除cookie")
        driver.delete_all_cookies()
        print("打印cookie:")
        cookie= driver.get_cookies()
        print(cookie)

        # Close the browser.
        print("关闭浏览器")
        time.sleep(5)
        driver.quit()
        driver = None

        # Record the run in the database.
        data = get_ip()
        data.append(keyword)
        data.append(target)
        data.append("success")
        data.append(page)
        data.append(index)
        insert_db(data)

    except Exception:
        # A bare `except:` used to swallow everything silently, including
        # Ctrl-C.  Report the error and keep going with the next entry.
        traceback.print_exc()

        # Shut the browser down so failed runs do not leave PhantomJS
        # processes behind.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                traceback.print_exc()

        # Best-effort failure record; a logging problem must not stop the
        # remaining entries.
        try:
            data = get_ip()
            data.append(keyword)
            data.append(target)
            data.append("faild")
            data.append(-1)
            data.append(-1)
            insert_db(data)
        except Exception:
            traceback.print_exc()
    

  

百度快排发包python核心源码的更多相关文章

  1. Rank & Sort Loss for Object Detection and Instance Segmentation 论文解读(含核心源码详解)

    第一印象 Rank & Sort Loss for Object Detection and Instance Segmentation 这篇文章算是我读的 detection 文章里面比较难 ...

  2. Android版数据结构与算法(五):LinkedHashMap核心源码彻底分析

    版权声明:本文出自汪磊的博客,未经作者允许禁止转载. 上一篇基于哈希表实现HashMap核心源码彻底分析 分析了HashMap的源码,主要分析了扩容机制,如果感兴趣的可以去看看,扩容机制那几行最难懂的 ...

  3. Java内存管理-掌握类加载器的核心源码和设计模式(六)

    勿在流沙筑高台,出来混迟早要还的. 做一个积极的人 编码.改bug.提升自己 我有一个乐园,面向编程,春暖花开! 上一篇文章介绍了类加载器分类以及类加载器的双亲委派模型,让我们能够从整体上对类加载器有 ...

  4. 并发编程之 SynchronousQueue 核心源码分析

    前言 SynchronousQueue 是一个普通用户不怎么常用的队列,通常在创建无界线程池(Executors.newCachedThreadPool())的时候使用,也就是那个非常危险的线程池 ^ ...

  5. iOS 开源库系列 Aspects核心源码分析---面向切面编程之疯狂的 Aspects

    Aspects的源码学习,我学到的有以下几点 Objective-C Runtime 理解OC的消息分发机制 KVO中的指针交换技术 Block 在内存中的数据结构 const 的修饰区别 block ...

  6. 快排的python实现

    快排的python实现 #python 2.7 def quick_sort(L): if len(L) <= 1: return L else: return quick_sort([lt f ...

  7. Backbone事件机制核心源码(仅包含Events、Model模块)

    一.应用场景 为了改善酷版139邮箱的代码结构,引入backbone的事件机制,按照MVC的分层思想搭建酷版云邮局的代码框架.力求在保持酷版轻量级的基础上提高代码的可维护性.   二.遗留问题 1.b ...

  8. 6 手写Java LinkedHashMap 核心源码

    概述 LinkedHashMap是Java中常用的数据结构之一,安卓中的LruCache缓存,底层使用的就是LinkedHashMap,LRU(Least Recently Used)算法,即最近最少 ...

  9. 3 手写Java HashMap核心源码

    手写Java HashMap核心源码 上一章手写LinkedList核心源码,本章我们来手写Java HashMap的核心源码. 我们来先了解一下HashMap的原理.HashMap 字面意思 has ...

随机推荐

  1. [LeetCode] 505. The Maze II 迷宫之二

    There is a ball in a maze with empty spaces and walls. The ball can go through empty spaces by rolli ...

  2. [LeetCode] 438. Find All Anagrams in a String 找出字符串中所有的变位词

    Given a string s and a non-empty string p, find all the start indices of p's anagrams in s. Strings ...

  3. 第02组 Alpha冲刺(4/6)

    队名:無駄無駄 组长博客 作业博客 组员情况 张越洋 过去两天完成了哪些任务 摸鱼 提交记录(全组共用) 接下来的计划 沟通前后端成员,监督.提醒他们尽快完成各自的进度 学习如何评估代码质量 准备Al ...

  4. ORACLE--10G安装问题( error while loading shared libraries)

    01,问题描述 问题一: WARNING: directory '/u01/app/oracle/product/10.2.0' is not owned by root WARNING: direc ...

  5. markdown格式接口文档模板

    源文件 https://files.cnblogs.com/files/bincoding/%E6%8E%A5%E5%8F%A3%E6%96%87%E6%A1%A3.zip 目录 测试接口 查询指定项 ...

  6. (四)golang--注意事项

    以.go为结尾: 执行入口是main()函数: 区分大小写: Go方法由一条条语句构成,每个语句后不需要加分号: Go编译器是一行行进行编译的,一行不能写多条语句: go语言定义的变量或者import ...

  7. PHP接口并发测试的方法

    PHP接口并发测试的方法 <pre> header('Content-type:text/html; Charset=utf-8'); $uri = "输入你的url" ...

  8. go-gin-api 路由中间件 - 签名验证(七)

    概览 首先同步下项目概况: 上篇文章分享了,路由中间件 - Jaeger 链路追踪(实战篇),文章反响真是出乎意料, 「Go中国」 公众号也转发了,有很多朋友加我好友交流,直呼我大神,其实我哪是什么大 ...

  9. Prometheus Grafana可视化展示Linux资源使用率

    Prometheus Grafana可视化展示Linux资源使用率  Grafana官方仪表盘下载:https://grafana.com/dashboards 数据源推荐:https://grafan ...

  10. Filebeat与Logstash配置SSL加密通信

    为了保证应用日志数据的传输安全,我们可以使用SSL相互身份验证来保护Filebeat和Logstash之间的连接. 这可以确保Filebeat仅将加密数据发送到受信任的Logstash服务器,并确保L ...