selenium

基本操作

from selenium import webdriver

from time import sleep

# Start a Chrome browser driven by the local chromedriver binary.
bro = webdriver.Chrome(executable_path=r'C:\pycahrm文件\chromedriver.exe')
url = 'https://www.jd.com/'
try:
    # Have the browser issue the request for the page.
    bro.get(url)

    # Locate the search box by its element id and type the query into it.
    search_input = bro.find_element_by_id('key')
    search_input.send_keys('macPro')

    # Locate the search button by XPath and click it.
    btn = bro.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')
    btn.click()
    sleep(2)  # fixed pause; presumably gives the results page time to load

    # Execute JavaScript in the page: scroll to the bottom of the document.
    jsCode = 'window.scrollTo(0,document.body.scrollHeight)'
    bro.execute_script(jsCode)
    sleep(3)
finally:
    # Always shut the browser down, even when one of the steps above raised,
    # so no orphaned browser/driver process is left behind.
    bro.quit()

selenium

- 概念：基于浏览器自动化的一个模块。

- 环境的安装：

    - pip install selenium

- selenium和爬虫之间的关联：

    - 模拟登录

    - 便捷的捕获到动态加载的数据（重点）

        获取的页面源码数据：page_source

        - 特点：可见即可得

        - 缺点：效率低

- selenium的具体使用

    - 准备浏览器的驱动程序：http://chromedriver.storage.googleapis.com/index.html

- 动作链：ActionChains，一系列的行为动作

    - 使用流程：

        - 实例化一个动作链对象，需要将指定的浏览器和动作链对象进行绑定

        - 执行相关的连续的动作

        - perform()立即执行动作链制定好的动作

滑动操作

方式一:

from selenium import webdriver

from selenium.webdriver.common.keys import Keys

from selenium.webdriver import ActionChains #动作链

import time

# Drag-and-drop demo, approach 1: let ActionChains perform the whole drag.
bro = webdriver.Chrome(executable_path=r'C:\pycahrm文件\chromedriver.exe')
try:
    bro.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    bro.implicitly_wait(3)

    # Switch into the 'iframeResult' frame first; per the original author's
    # note, the elements below cannot be found without switching.
    bro.switch_to.frame('iframeResult')

    begin_tag = bro.find_element_by_id('draggable')  # element to drag (start)
    end_tag = bro.find_element_by_id('droppable')    # element to drop onto (end)

    actions = ActionChains(bro)                # action chain bound to this browser
    actions.drag_and_drop(begin_tag, end_tag)  # queue the drag; nothing runs yet
    actions.perform()                          # execute the queued actions

    time.sleep(2)
finally:
    # Close the browser even if a lookup or action above failed.
    bro.quit()

方式二:

from selenium import webdriver

from selenium.webdriver.common.keys import Keys

from selenium.webdriver import ActionChains

import time

# Drag-and-drop demo, approach 2: press, move in small steps, then release.
bro = webdriver.Chrome(executable_path=r'C:\pycahrm文件\chromedriver.exe')
try:
    bro.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    bro.implicitly_wait(3)

    # The demo elements are looked up after switching into this frame.
    bro.switch_to.frame('iframeResult')

    begin_tag = bro.find_element_by_id('draggable')  # element to drag (start)
    end_tag = bro.find_element_by_id('droppable')    # element to drop onto (end)

    # Press and hold the mouse button on the draggable element.
    ActionChains(bro).click_and_hold(begin_tag).perform()

    # Horizontal distance between the two elements' x coordinates.
    distance = end_tag.location['x'] - begin_tag.location['x']

    # Move right in steps of at most 50 pixels until the distance is covered.
    # The last step is clamped so the pointer never overshoots the target.
    moved = 0
    while moved < distance:
        step = min(50, distance - moved)
        ActionChains(bro).move_by_offset(xoffset=step, yoffset=0).perform()
        moved += step

    # Dragging finished: release the mouse button.
    ActionChains(bro).release().perform()

    time.sleep(2)
finally:
    # Close the browser even if a lookup or action above failed.
    bro.quit()

基于selenium模拟登陆12306

import requests

from hashlib import md5

class Chaojiying_Client(object):
    """Small HTTP client for the Chaojiying captcha-recognition service.

    Every request carries the account name, the MD5 hex digest of the
    password and the software id, plus a fixed set of HTTP headers.
    """

    def __init__(self, username, password, soft_id, timeout=60):
        """Store the credentials and prepare the shared request data.

        username: account name, sent as the 'user' form field
        password: plain-text password; only its MD5 hex digest is stored
        soft_id:  software id, sent as the 'softid' form field
        timeout:  seconds passed to requests as the request timeout, so a
                  stalled connection cannot block the caller forever
        """
        self.username = username
        # Hash the UTF-8 bytes of the password; the digest is sent as 'pass2'.
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        # Form fields included in every request.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        # HTTP headers sent with every request.
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload a captcha image and return the decoded JSON response.

        im:       image bytes
        codetype: captcha type code, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        # The image is uploaded as a multipart file field named 'userfile'.
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post(
            'http://upload.chaojiying.net/Upload/Processing.php',
            data=params,
            files=files,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly recognised captcha and return the JSON response.

        im_id: picture id of the captcha being reported
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post(
            'http://upload.chaojiying.net/Upload/ReportError.php',
            data=params,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()

from selenium import webdriver

from selenium.webdriver import ActionChains

from time import sleep

from PIL import Image #安装PIL或者是Pillow

from CJY import Chaojiying_Client

# Helper that wraps captcha recognition in a single call.
def transformCode(imgPath, imgType):
    """Send the image at imgPath for recognition and return its 'pic_str'.

    imgPath: path of the captcha image file to upload
    imgType: captcha type code, passed through to Chaojiying_Client.PostPic

    Returns the value stored under the 'pic_str' key of the service response.
    """
    chaojiying = Chaojiying_Client('username', 'password', '902590')
    # Read the image inside a context manager so the file handle is closed
    # promptly instead of being left to the garbage collector.
    with open(imgPath, 'rb') as img_file:
        im = img_file.read()
    return chaojiying.PostPic(im, imgType)['pic_str']

# Log in to 12306 with Selenium, solving the click-captcha via transformCode.
bro = webdriver.Chrome(executable_path=r'C:\pycahrm文件\chromedriver.exe')
try:
    bro.get('https://kyfw.12306.cn/otn/login/init')
    sleep(2)

    # Save a screenshot of the current browser page.
    bro.save_screenshot('./main.png')

    # Crop the captcha region out of the screenshot.
    # First capture where the captcha <img> sits on the page.
    img_tag = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
    location = img_tag.location  # used below as the top-left corner of the crop box
    size = img_tag.size          # width and height of the element

    # Crop rectangle as (left, upper, right, lower).
    # NOTE(review): assumes screenshot pixels map 1:1 to page coordinates;
    # display scaling may break this - confirm on the target machine.
    rangle = (
        int(location['x']),
        int(location['y']),
        int(location['x'] + size['width']),
        int(location['y'] + size['height']),
    )

    # Cut the captcha out of the screenshot and save it; the context manager
    # closes the screenshot file afterwards.
    with Image.open('./main.png') as screenshot:
        frame = screenshot.crop(rangle)
        frame.save('code.png')

    # Ask the recognition service for the points to click.
    result = transformCode('./code.png', 9004)
    print(result)  # expected form: x1,y1|x2,y2|x3,y3

    # Convert "x1,y1|x2,y2|x3,y3" into [[x1, y1], [x2, y2], [x3, y3]].
    # str.split('|') yields a one-element list when there is no '|', so a
    # single point and several points go through the same code path.
    all_list = []
    for pair in result.split('|'):
        coords = pair.split(',')
        all_list.append([int(coords[0]), int(coords[1])])

    # Click every returned point, offset from the captcha image element.
    # NOTE(review): the offset origin of move_to_element_with_offset differs
    # between Selenium major versions (top-left corner vs. element center) -
    # confirm against the installed version.
    for x, y in all_list:
        ActionChains(bro).move_to_element_with_offset(img_tag, x, y).click().perform()
        sleep(1)

    # Fill in the credentials and submit the login form.
    bro.find_element_by_id('username').send_keys('xxxxxx')
    sleep(1)
    bro.find_element_by_id('password').send_keys('xxxx')
    sleep(1)
    bro.find_element_by_id('loginSub').click()
    sleep(10)

    print(bro.page_source)
finally:
    # Always close the browser, even if recognition or a lookup failed.
    bro.quit()

selenium规避风险

规避检测

from selenium import webdriver

from selenium.webdriver import ChromeOptions

# Per the original author's note: when a crawler program issues the request,
# evaluating window.navigator.webdriver in the page returns true, whereas on a
# normal visit the same script returns undefined.
# NOTE(review): the 'excludeSwitches' option below is presumably what hides
# that flag - confirm against the Chrome/chromedriver version in use.
url = 'https://www.taobao.com/'

# Chrome options with the 'enable-automation' switch excluded.
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])

# Start the browser with those options and open the page.
bro = webdriver.Chrome(options=option, executable_path='./chromedriver.exe')
bro.get(url)

无头浏览器

#无头浏览器

from selenium import webdriver

from selenium.webdriver.chrome.options import Options

from time import sleep

# Configure Chrome to run headless: no visible browser window is shown.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# Pass the options through the `options` keyword, matching the detection-
# evasion snippet in this file (`chrome_options=` is the deprecated spelling).
bro = webdriver.Chrome(executable_path='./chromedriver.exe', options=chrome_options)
try:
    url = 'https://www.taobao.com/'
    bro.get(url)
    sleep(2)

    # Save a screenshot of the page and print its source.
    bro.save_screenshot('123.png')
    print(bro.page_source)
finally:
    # A headless browser has no window to close by hand, so quit explicitly
    # to avoid leaving a hidden browser process running.
    bro.quit()

python爬虫--selenium模块.上来自己动!的更多相关文章

Python爬虫——selenium模块
selenium模块介绍 selenium最初是一个测试工具,而爬虫中使用它主要是为了解决requests无法直接执行JavaScript代码的问题 selenium本质是通过驱动浏览器,完全模拟浏览 ...
[Python爬虫] Selenium实现自动登录163邮箱和Locating Elements介绍
前三篇文章介绍了安装过程和通过Selenium实现访问Firefox浏览器并自动搜索"Eastmount"关键字及截图的功能.而这篇文章主要简单介绍如何实现自动登录163邮箱,同时 ...
[Python爬虫] Selenium+Phantomjs动态获取CSDN下载资源信息和评论
前面几篇文章介绍了Selenium.PhantomJS的基础知识及安装过程,这篇文章是一篇应用.通过Selenium调用Phantomjs获取CSDN下载资源的信息,最重要的是动态获取资源的评论,它是 ...
[Python爬虫] Selenium获取百度百科旅游景点的InfoBox消息盒
前面我讲述过如何通过BeautifulSoup获取维基百科的消息盒,同样可以通过Spider获取网站内容,最近学习了Selenium+Phantomjs后,准备利用它们获取百度百科的旅游景点消息盒(I ...
[Python爬虫] Selenium爬取新浪微博客户端用户信息、热点话题及评论 (上)
转载自:http://blog.csdn.net/eastmount/article/details/51231852 一. 文章介绍源码下载地址:http://download.csdn.net/ ...
python爬虫---selenium库的用法
python爬虫---selenium库的用法 selenium是一个自动化测试工具,支持Firefox,Chrome等众多浏览器在爬虫中的应用主要是用来解决JS渲染的问题. 1.使用前需要安装这个 ...
[python爬虫] Selenium常见元素定位方法和操作的学习介绍
这篇文章主要Selenium+Python自动测试或爬虫中的常见定位方法.鼠标操作.键盘操作介绍,希望该篇基础性文章对你有所帮助,如果有错误或不足之处,请海涵~同时CSDN总是屏蔽这篇文章,再加上最近 ...
Python学习--Selenium模块
1. Python学习--Selenium模块介绍(1) 2.Python学习--Selenium模块学习(2) 其他: 1. Python学习--打码平台
Python学习--Selenium模块学习(2)
Selenium的基本操作获取浏览器驱动寻找方式 1. 通过手动指定浏览器驱动路径2. 通过 `$PATH`环境变量找寻浏览器驱动可参考Python学习--Selenium模块简单介绍(1) 控制 ...

随机推荐

python：timeit模块
(鱼c)timeit模块详解——准确测量小段代码的执行时间 http://bbs.fishc.com/forum.php?mod=viewthread&tid=55593&extra= ...
【algo&ds】8.最小生成树
1.最小生成树介绍什么是最小生成树? 最小生成树(Minimum spanning tree,MST)是在一个给定的无向图G(V,E)中求一棵树T,使得这棵树拥有图G中的所有顶点,且所有边都是来自图 ...
"PSP助手”微信小程序宣传视频链接及内容介绍
此作业的要求参见[https://edu.cnblogs.com/campus/nenu/2019fall/homework/8677] 队名:扛把子组组长:迟俊文组员:刘信鹏韩昊宋晓丽梁梦 ...
Java描述设计模式(24)：备忘录模式
本文源码:GitHub·点这里 || GitEE·点这里一.生活场景 1.场景描述常见的视频播放软件都具备这样一个功能:假设在播放视频西游记,如果这时候切换播放视频红楼梦,当再次切回播放西游记时, ...
Fortran文件读写--查找内容
program ex implicit none character(len=) A(),B(),C() !A异常.B已开挖.C需标记 integer i,j,N1,N2,count !N1是10号文 ...
各种优化方法总结比较(sgd/momentum/Nesterov/adagrad/adadelta)
前言这里讨论的优化问题指的是,给定目标函数f(x),我们需要找到一组参数x,使得f(x)的值最小. 本文以下内容假设读者已经了解机器学习基本知识,和梯度下降的原理. Batch gradient d ...
《手把手教你》系列练习篇之4-python+ selenium自动化测试（详细教程）
1. 简介今天我们继续前边的练习,学习和练习一下:如何使用webdriver方法获取当前测试页面的URL.如何获取当前页面的title. 2. webdriver方法获取当前测试页面的URL 本小 ...
SpringBoot第一次案例
一.Spring Boot 入门 1.Spring Boot 简介简化Spring应用开发的一个框架: 整个Spring技术栈的一个大整合: J2EE开发的一站式解决方案: 2.微服务 2014,m ...
6. 彤哥说netty系列之Java NIO核心组件之Buffer
--日拱一卒,不期而至! 你好,我是彤哥,本篇是netty系列的第六篇. 简介上一章我们一起学习了Java NIO的核心组件Channel,它可以看作是实体与实体之间的连接,而且需要与Buffer交 ...
如何使用JavaScript直接上传并预览粘贴板的图片？
(题图:梵高-橄榄树) 提出需求因为工作原因,现在有一个需求就是需要用户使用QQ或者微信复制一张截图后,在div中直接粘贴这张图片,而不是采用上传的方式.类似我们在使用QQ微信时直接粘贴截图的操作, ...

python爬虫--selenium模块.上来自己动!

selenium

基本操作

滑动操作

方式一:

方式二:

基于selenium模拟登陆12306

selenium规避风险

无头浏览器

python爬虫--selenium模块.上来自己动!的更多相关文章

随机推荐

热门专题