百度快排发包python核心源码

本源码仅供测试，发包有风险，优化还是踏实的好！本代码是本人自己学习python练手作品！
附上代码：

# -*- coding: utf-8 -*-from selenium import webdriver

import time

import requests

import random

import os

from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

import traceback

import urllib.request

import pymysql

import socket

#import win32api #pip install pypiwin32

 

#from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

#DesiredCapabilities.INTERNETEXPLORER['ignoreProtectedModeSettings'] = True

 

 

 

#rasdial 宽带连接 19ab68----643534

def connect():

    cmd_str = "rasdial %s %s %s" % (g_adsl_account['name'], g_adsl_account['username'], g_adsl_account['password'])

    os.system(cmd_str)

    time.sleep(5)

 

 

#"rasdial 断开宽带连接 /disconnect"

def disconnect():

    cmd_str = "rasdial %s /disconnect" % g_adsl_account['name']

    os.system(cmd_str)

    time.sleep(5)

     

#获取ip地址

def get_ip():

    #return ['ip','address']

    fp = urllib.request.urlopen("http://ip.chinaz.com/getip.aspx")

    mybytes = fp.read()

    # note that Python3 does not read the html code as string

    # but as html code bytearray, convert to string with

    mystr = mybytes.decode("utf8")

    fp.close()

    ip = mystr.find("ip")

    add = mystr.find("address")

    ip = mystr[ip+4:add-2]

    address = mystr[add+9:-2]

    return [ip,address]

 

#将ip地址插入数据库

def insert_db(ipdate):

    #try:

        #获取一个数据库连接，注意如果是UTF-8类型的，需要制定数据库

        conn=pymysql.connect(host='localhost',user='root',passwd='',port=3306,charset='utf8')

        cur=conn.cursor()                              #获取一个游标对象

        #cur.execute("CREATE DATABASE zongzong")          #执行对应的SQL语句

        #exit()

        cur.execute("USE zongzong")

        #exit()

        #cur.execute("CREATE TABLE `ip_log` (`id` int(11) NOT NULL AUTO_INCREMENT,`ip` varchar(32) DEFAULT NULL,`address` varchar(64) DEFAULT NULL,`keyword` varchar(64) DEFAULT '',`url` varchar(256) DEFAULT '',`error` varchar(64) DEFAULT '',`created_at` timestamp NULL DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP,PRIMARY KEY (`id`)) ENGINE=InnoDB AUTO_INCREMENT=21 DEFAULT CHARSET=utf8;")

         

        #插入数据

        ISOTIMEFORMAT='%Y-%m-%d %X'

        ipdate.append( time.strftime( ISOTIMEFORMAT, time.localtime() ))

        cur.execute("INSERT INTO ip_log(ip,address,keyword,url,error,page,rank,created_at) VALUES(%s, %s, %s, %s, %s, %s, %s, %s)",ipdate)

         

        #cur.execute("SELECT * FROM ip_log")

        #data=cur.fetchall()

        #print(data)

             

        cur.close()#关闭游标

        conn.commit()#向数据库中提交任何未解决的事务，对不支持事务的数据库不进行任何操作

        conn.close()#关闭到数据库的连接，释放数据库资源

    #except:

    #   print("发生异常")  

 

 

#获取搜素出来的url

def get_search_url(driver):

    urls = []

    real = []

    real_url = []

    click_link = []

    content = driver.find_element_by_css_selector("div[id=\"content_left\"]")

    links = content.find_elements_by_tag_name("a")

    for link in links:

        if link.get_attribute('class') == "c-showurl":

            real.append(link.text)

            url = link.get_attribute('href')

            urls.append(url)

             

            #解密url

            header = requests.head(url).headers

            is_append = True

            for out_url in out_urls:

                if out_url in header['location']:

                    is_append = False

                    break

                     

            if is_append == True:

                real_url.append(header['location'])

                #a标签对象

                click_link.append(link)

                     

    #print(real)

    #print(urls)

    #return urls

    return [real_url,click_link]

     

     

#function：解析加密url，剔除竞争对手的url

# def get_real_url(urls):

    # real_url = []

    # for url in urls:

        # header = requests.head(url).headers

        # is_append = True

        # for out_url in out_urls:

            # if out_url in header['location']:

                # is_append = False

                # break

             

        # if is_append == True:

            # real_url.append(header['location'])

    # return real_url

 

#function 目标地址是否在某个list中

def get_urlIndex(tagurl,urls):

    i = 0

    has = -1

    for url in urls:

        if tagurl in url:

            has = True

            return i

        i = i+1

    return has

 

     

#点击百度搜索内容下面的下一页

def click_nextBtn(driver):

    div = driver.find_element_by_css_selector("div[id=\"page\"]")

    a = div.find_elements_by_tag_name("a")

    for item in a:

        print(item.text)

        if item.text == "下一页>":

            item.click()

     

    return driver

 

             

 

 

#随机点击

def click_search_url(driver,items):

    urls = []

    real = []

    content = driver.find_element_by_css_selector("div[id=\"content_left\"]")

    links = content.find_elements_by_tag_name("a")

    i=0

    '''获取当前窗口'''

    nowhandle = driver.current_window_handle

    #allhandles=driver.window_handles

    #for handle in allhandles:

    #   print('....当前窗口....',handle.title)

    #exit()

     

    for link in links:

        if link.get_attribute('class') == "c-showurl":

            if i in items:

                print("随机点击item:",i)

                print(link.get_attribute('href'),link.text)

                #exit()

                link.click()

                #停留在点击页面

                time.sleep(random.randint(5,10))

             

                '''获取所有窗口'''

                allhandles=driver.window_handles

                #for handle in allhandles:

                #   print('....当前窗口....',handle.title)

                #exit()

                 

                '''循环判断窗口是否为当前窗口'''

                for handle in allhandles:

                    if handle != nowhandle:

                        print("切换到当前窗口")

                        driver.switch_to_window(handle)

                        print("title:",driver.title)

                        '''关闭当前窗口'''

                        driver.close()

                        '''回到原先的窗口'''

                        print("切换到原来的窗口")

                        driver.switch_to_window(nowhandle)

                        print("title:",driver.title)

                print("本次随机点击完毕！")

                         

            i=i+1

 

             

#获取随机点击的搜索页random.randint(0

def get_random_index(index,len):

    if index >= 8:

        random_index = [

            random.randint(0,4),random.randint(5,8)

        ]

    elif index>=4:

        random_index = [

            random.randint(0,3),random.randint(3,index)

        ]

    elif index>=0:

        random_index = [

            index

        ]

    elif index == -1:

        if len <=5:

            random_index = [

                random.randint(0,5)

            ]

        else:

            random_index = [

                #random.randint(0,4),random.randint(5,len)

                random.randint(5,len)

            ]

    return random_index

 

 

 

def getUA():

    uaList = [

        #360

        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",

        #chrome

        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",

        #"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",

        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",

         

        #firefox

        #"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",

        "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:36.0) Gecko/20100101 Firefox/36.0",

         

        #ie11

        #"Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",

        #ie8

        #"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; 4399Box.1357; 4399Box.1253; 4399Box.1357)",

         

        #2345王牌

        #"Chrome/39.0.2171.99 Safari/537.36 2345Explorer/6.5.0.11018",

         

        #搜狗

        #"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0",

        #opera

        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60"

         

    ]

    headers = random.choice(uaList)

    return headers

 

#屏幕浏览器窗口大小

def getWindowSize():

    wind_size = [

        [1920,1080],

        [1600,900],

        [1280,720]

    ]

    headers = random.choice(wind_size)

    return headers

     

     

#屏幕分辨率设置

def setDisplay():

    display_size = [

        [1920,1080],

        [1680,1050],

        [1600,900],

        [1440,900],

        [1400,1050]

    ]

    d_size = random.choice(display_size)

     

    dm = win32api.EnumDisplaySettings(None, 0)

    dm.PelsWidth = d_size[0]

    dm.PelsHeight = d_size[1]

    dm.BitsPerPel = 32

    dm.DisplayFixedOutput = 0

    win32api.ChangeDisplaySettings(dm, 0)

 

     

#拨号 19ab68----643534       

g_adsl_account = {

    "name":"宽带连接",

    "username":"19ab68",

    "password":"643534"

}

 

 

#屏蔽点击的地址（竞争对手）

out_urls = [

    'zhimo.yuanzhumuban.cc',

    'bbs.yuanzhumuban.cc',

        'http://money.163.com/15/0416/11/ANANRECC00253B0H.html'

]

 

 

##内页词

targetURL = [

     

        ['http://www.hkuws.com','注册离岸公司'],

    ['zs.efu.com.cn/mornfeeit/','梦菲雪'],

    ['zs.efu.com.cn/chengshijiaren/','城市佳人'],

    ['www.kidsnet.cn/exposition','童装展会'],

    #['top.kidsnet.cn/','童装加盟排行榜'],

    #['www.nynet.com.cn/','内衣网'],

    #['www.nzw.cn/','女装网'],

    ['zs.efu.com.cn/ks/','卡索'],

    ['zs.efu.com.cn/distin-kidny/','迪斯廷凯'],

    ['zs.efu.com.cn/fuzhuang/luyidigao/','路易迪高童装代{过}{滤}理'],

    ['brand.efu.com.cn/brandshow-1221090.html','凯帝龙驰'],

    ['zs.efu.com.cn/rabbitjero/','兔子杰罗'],

    ['zs.efu.com.cn/wmprince/','西瓜王子'],

        ['zs.efu.com.cn/betu','百图'],

        ['zs.efu.com.cn/pepco/','小猪班纳'],

 

 

    #['http://news.ifeng.com/a/20160518/48795120_0.shtml','华夏信财'],

    ['http://weibo.com/huaxiafinance','华夏信财'],

    ['http://p2p.hexun.com/2016-04-26/183531215.html','华夏信财'],

    #['http://news.xinhuanet.com/fortune/2016-04/26/c_128932834.htm','华夏信财'],

    ['http://www.xcf.cn/gdyw/201605/t20160526_772682.htm','华夏信财'],

    ['http://www.huaxiaoxia.com/','华夏信财'],

        #['https://lc.huaxiafinance.com/','华夏信财'],

 

 

 

        ['so.tedu.cn','网络营销培训机构'],

        ['www.cosatto.net.cn','个性安全座椅'],

        ['www.kaihuata.com/','开化旅游'],

        #['www.kaihuata.com/','开化'],

 

]

 

 

for targetInfo in targetURL:

    try:

        #更换ip

        disconnect()

        connect()

         

        while(1):

                    try:

                        socket.gethostbyname("baidu.com")

                        break;

                    except:

                        disconnect()

                        connect()

        #更换分辨率

        #setDisplay()

         

         

        #启动浏览器

        #driver = webdriver.Ie()

        #driver = webdriver.Chrome()

        #driver = webdriver.Firefox()

         

        #设置PhantomJS的user_agent

        dcap = dict(DesiredCapabilities.PHANTOMJS)

        user_agent = getUA()

        print(user_agent)

        dcap["phantomjs.page.settings.userAgent"] = (

                user_agent

        )

        #dcap["phantomjs.page.settings.resourceTimeout"] = (15000)

        dcap["phantomjs.page.settings.loadImages"] = (False)

        driver = webdriver.PhantomJS(desired_capabilities=dcap,service_args=['--load-images=no'])

         

         

        # UA = getUA()

        # print(UA)

        # webdriver.DesiredCapabilities.PHANTOMJS['phantomjs.page.customHeaders.User-Agent'] = UA

        # driver = webdriver.PhantomJS()

         

        driver.implicitly_wait(30)

         

        #清cookie

        driver.delete_all_cookies()

 

        #driver.maximize_window() # 浏览器全屏显示

 

        #打开百度

        driver.get("http://www.baidu.com/")

        #driver.get("http://mch.weiba01.com/2.php")

         

        #设置浏览器窗口大小

        window_size = getWindowSize()

        driver.set_window_size(window_size[0], window_size[1])

                 

                 

        #搜索某个关键词

        print('打开百度成功',driver.title)

        target = targetInfo[0]

        keyword = targetInfo[1]

        if len(targetInfo)>2:

                error_keyword = targetInfo[random.randint(2,len(targetInfo)-1)]      

        print(">>>>>>>>>>>>>>>点击的关键词：",keyword,"--->目标地址：",target,">>>>>>>>>>>>>>>>>>>>")

         

         

        if len(targetInfo)>2:

            #模拟错误关键词

            print("点击错误关键词：",error_keyword);

            driver.find_element_by_id("kw").send_keys(error_keyword)

            time.sleep(2)

            driver.find_element_by_id("su").click()

            time.sleep(5)

            driver.find_element_by_id("kw").clear()

            time.sleep(2)

            print("错误关键词点击完毕")

             

        driver.find_element_by_id("kw").send_keys(keyword)

        #time.sleep(2)

 

        #点击搜索按钮

        print("...开始点击搜索按钮..")

        driver.find_element_by_id("su").click()

        #exit()

        print("...点击完毕..")

        time.sleep(2)

 

         

        #获取搜索结果页 0:着陆页  1：对应的链接对象

        urls_res = get_search_url(driver)

        real_urls = urls_res[0]

        #get_search_url(driver)[1][2].click()

         

         

        #real_urls = get_real_url(urls)

        print("搜索出来的可点击着陆页个数：",len(real_urls))

        print(real_urls)

        index = get_urlIndex(target,real_urls)

        print("目标index:",index)

 

        page = 1

        while index == -1 and page <= 4:

            if page == 1:

                #点击前面的几个着陆页,模拟用户真实行为

                items = get_random_index(index,len(real_urls))

                #items = [4]

                print(items)

                click_search_url(driver,items)

             

            #下一页

            driver = click_nextBtn(driver)

            time.sleep(3)

            urls_res = get_search_url(driver)

            real_urls = urls_res[0]

            #real_urls = get_real_url(urls)

            print(real_urls)

            index = get_urlIndex(target,real_urls)

             

            page = page+1

 

             

             

        if index > 4 and page == 1:

            #第一页，随机点击两个或一个

            int = random.randint(1,2)

            if int == 2:

                items = get_random_index(index,len(real_urls))

            else:

                items = [1]

            print(items)

            click_search_url(driver,items)

             

        if page >=5:

            print("没有找到目标地址，放弃搜索...")

            print("关闭浏览器")

            driver.quit()

             

            time.sleep(5)

            data = get_ip()

            data.append(keyword)

            data.append(target)

            data.append("no_find")

            data.append(-1)

            data.append(-1)

            insert_db(data)

            continue

         

        print("目标在page",page,"当前排名：",index,real_urls[index])

        print("反问最后的目标页...")

        #driver.get(real_urls[index])

        urls_res[1][index].click()

        time.sleep(5)

         

        nowhandle = driver.current_window_handle

        allhandles = driver.window_handles

        #目标页和搜索栏目页切换下

        for handle in allhandles:

            if handle != nowhandle:

                print("切换到当前窗口")

                driver.switch_to_window(handle)

                stime = random.randint(15,25)

                #stime = 5;

                print("目标页title:",driver.title,"停留-->",stime)

                time.sleep(stime)

                '''关闭当前窗口'''

                driver.close()

                 

                '''回到原先的窗口'''

                print("切换到原来的窗口")

                driver.switch_to_window(nowhandle)

                print("title:",driver.title)

         

         

        #time.sleep(random.randint(40,60))

        #time.sleep(5)

 

        #清除所有cookie

        print("打印cookie")

        cookie= driver.get_cookies()

        print(cookie)

        print("清除cookie")

        driver.delete_all_cookies()

        print("打印cookie:")

        cookie= driver.get_cookies()

        print(cookie)

 

        #关闭浏览器

        print("关闭浏览器")

        time.sleep(5)

        #driver.close()

        driver.quit()

        #time.sleep(5)

         

        #数据库记录运行信息

        data = get_ip()

        data.append(keyword)

        data.append(target)

        data.append("success")

        data.append(page)

        data.append(index)

        insert_db(data)

     

    except:

        data = get_ip()

        data.append(keyword)

        data.append(target)

        data.append("faild")

        data.append(-1)

        data.append(-1)

        insert_db(data)

百度快排发包python核心源码的更多相关文章

Rank & Sort Loss for Object Detection and Instance Segmentation 论文解读（含核心源码详解）
第一印象 Rank & Sort Loss for Object Detection and Instance Segmentation 这篇文章算是我读的 detection 文章里面比较难 ...
Android版数据结构与算法(五):LinkedHashMap核心源码彻底分析
版权声明:本文出自汪磊的博客,未经作者允许禁止转载. 上一篇基于哈希表实现HashMap核心源码彻底分析分析了HashMap的源码,主要分析了扩容机制,如果感兴趣的可以去看看,扩容机制那几行最难懂的 ...
Java内存管理-掌握类加载器的核心源码和设计模式（六）
勿在流沙筑高台,出来混迟早要还的. 做一个积极的人编码.改bug.提升自己我有一个乐园,面向编程,春暖花开! 上一篇文章介绍了类加载器分类以及类加载器的双亲委派模型,让我们能够从整体上对类加载器有 ...
并发编程之 SynchronousQueue 核心源码分析
前言 SynchronousQueue 是一个普通用户不怎么常用的队列,通常在创建无界线程池(Executors.newCachedThreadPool())的时候使用,也就是那个非常危险的线程池 ^ ...
iOS 开源库系列 Aspects核心源码分析---面向切面编程之疯狂的 Aspects
Aspects的源码学习,我学到的有几下几点 Objective-C Runtime 理解OC的消息分发机制 KVO中的指针交换技术 Block 在内存中的数据结构 const 的修饰区别 block ...
快排的python实现
快排的python实现 #python 2.7 def quick_sort(L): if len(L) <= 1: return L else: return quick_sort([lt f ...
Backbone事件机制核心源码（仅包含Events、Model模块）
一.应用场景为了改善酷版139邮箱的代码结构,引入backbone的事件机制,按照MVC的分层思想搭建酷版云邮局的代码框架.力求在保持酷版轻量级的基础上提高代码的可维护性. 二.遗留问题 1.b ...
6 手写Java LinkedHashMap 核心源码
概述 LinkedHashMap是Java中常用的数据结构之一,安卓中的LruCache缓存,底层使用的就是LinkedHashMap,LRU(Least Recently Used)算法,即最近最少 ...
3 手写Java HashMap核心源码
手写Java HashMap核心源码上一章手写LinkedList核心源码,本章我们来手写Java HashMap的核心源码. 我们来先了解一下HashMap的原理.HashMap 字面意思 has ...

随机推荐

git diff比较版本差异(生成补丁)
1.git diff [<options>] <commit> <commit> options 使用--name-only(git diff HEAD cd504 ...
关于交叉编译Nodejs的坑
前言交叉编译Nodejs到其他平台上的时候,遇到了2个坑,网上极少有人提及,花了整个晚上才解决,在此记录下. 我的编译目标环境为: 龙芯3A 编译脚本 cd 代码目录 export PREFIX=/ ...
PHP获取cookie、Token、模拟登录、抓取数据、解析生成json
本文介绍使用PHP获取cookie,获取Token.以及模拟登录.然后抓取数据.最后解析生成json的的过程. 0. 设置Cookie路径 set_time_limit(0); //使用的cookie ...
spring boot开启gzip
Web服务使用Spring Boot2X且运行在Tomcat或者Jetty中,支持gzip压缩可以修改配置文件 application.properties server.compression.e ...
js 实现 promise
本文仅用于个人技术记录,如误导他人,概不负责. 本文有参考其他文章,不过地址忘了~~~. ======================================================= ...
CF1200D 【White Lines】
退役快一年了之后又打了场紧张刺激的$CF$(斜眼笑) 然后发现$D$题和题解里的大众做法不太一样 (思路清奇) 题意不再赘述,我们可以看到这个题~~好做~~在只有一次擦除机会,尝试以此为突破口解决问题 ...
golang --os系统包
环境变量 Environ 获取所有环境变量, 返回变量列表 func Environ() []string package main import ( "fmt" "os ...
LinQ in 写法
有时候会碰到一个需求,又一堆id 1,2,3,4,循环遍历id再去数据库查不仅效率低,而且代码难看,可以用in来实现: string[] codes = item.ExamPaperCode.Spli ...
asp.net MVC中使用EasyUI Treegrid 树形网格
引入CSS和JS <link href="~/Content/plugins/jquery-easyui-1.7.0/themes/default/easyui.css" r ...
android版本对应表
API Level 最初Android版本 Linux内核版本首次发布日期后续Android版本 28 9 Unknown 2018-07-02(Beta 3) - 27 8.1 4.10 201 ...

百度快排发包python核心源码

百度快排发包python核心源码的更多相关文章

随机推荐

热门专题