python_day05

今日内容

'''

post请求登录github

Request URL:

    https://github.com/session

Request

    Method: POST

#Referer表示上一个请求的页面

Referer:

    https://github.com/login

User-Agent:

    Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36

请求体：

    只有POST请求才有

    commit: Sign in

    utf8: ✓

    authenticity_token: COh/MDoDDUVauDtPbZ2A6pjf4pEA4pV8jwRO8PjHPzbXiLJiwtCXRQ7Ik3kXWxJOOF+i5/1r9twxUqaUnXe5TA==

    login: HS1

    password: ***********

    webauthn-support: unsupported

'''

'''

# 1.访问login页面获取token信息

Request URL:

    https://github.com/login

Request Method:

    GET

#服务端告诉客户端需要设置的Cookies

响应头：

    Set-Cookie

请求头：

    Cookie

    User-Agent

'''

# Log in to GitHub with plain HTTP requests, in two steps:
#   1. GET  /login   -> collect the initial cookies and the hidden CSRF token
#   2. POST /session -> submit the login form together with those cookies
import re

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}

# Step 1: fetch the login page.
url1 = 'https://github.com/login'
response = requests.get(url1, headers=headers)

# Convert the cookies returned by the login page into a plain dict so they
# can be sent back with the POST below.
login_cookies = response.cookies.get_dict()

# The login form carries a hidden ``authenticity_token`` input; extract its value.
token = re.findall('<input type="hidden" name="authenticity_token" value="(.*?)" />', response.text, re.S)

# Fail with a clear message instead of an IndexError on ``token[0]`` when the
# page markup does not match the pattern (e.g. the page layout changed).
if not token:
    raise RuntimeError('authenticity_token not found in the GitHub login page')

# Step 2: POST the form to the session endpoint.
#   Request URL:    https://github.com/session
#   Request Method: POST
#   Request headers that matter: Cookie, User-Agent
url2 = 'https://github.com/session'

# Build the request body (form fields).
form_data = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': token[0],
    'login': '********',
    'password': '***********',
    'webauthn-support': 'unsupported'
}

headers1 = {
    # Referer names the page the request originates from.
    'Referer': 'https://github.com/login',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}

# Send the headers, the form body and the cookies obtained from the login page.
response2 = requests.post(url2, data=form_data, headers=headers1, cookies=login_cookies)

# Save the returned page so the result of the login attempt can be inspected.
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(response2.text)

运行后会在当前目录生成github.html文件

二、response响应

# Tour of the commonly used attributes of a requests Response object.
import requests

response = requests.get('https://www.baidu.com')

print(response.status_code)     # HTTP status code of the response

print(response.url)     # URL of the response

print(response.text)     # body decoded to text

print(response.content)     # body as raw bytes

print(response.headers)     # response headers (not the request headers)

print(response.history)     # list of earlier responses in the redirect chain, if any

print(response.cookies)     # cookies as a cookie-jar object

print(response.cookies.get_dict())     # cookies as a plain name -> value dict

print(response.cookies.items())     # cookies as a list of (name, value) tuples

print(response.encoding)     # character encoding used to decode response.text

print(response.elapsed)     # time between sending the request and receiving the response

# Download a large binary file chunk by chunk instead of loading it all into memory.
import requests

# Send a GET request to the video URL.
url = 'https://vd3.bdstatic.com/mda-ic4pfhh3ex32svqi/hd/mda-ic4pfhh3ex32svqi.mp4?auth_key=1557973824-0-0-bfb2e69bb5198ff65e18065d91b2b8c8&bcevod_channel=searchbox_feed&pd=wisenatural&abtest=all.mp4'

# stream=True postpones downloading the body until it is iterated.  Do not
# touch ``response.content`` here: reading it pulls the whole file into memory
# at once, which defeats the purpose of streaming.
response = requests.get(url, stream=True)

with open('love_for_GD.mp4', 'wb') as f:
    # iter_content() defaults to chunk_size=1 (one byte per iteration); ask for
    # larger chunks so the file is not written one byte at a time.
    for content in response.iter_content(chunk_size=8192):
        f.write(content)

三、

'''

证书验证(大部分网站都是https)

'''

# Certificate verification demos: the same request is repeated with different
# ``verify`` / ``cert`` settings.
import requests

# https = http + ssl

# Baseline request with default certificate verification.  The notes below
# describe this call as raising an error for this site.
response = requests.get('https://www.xiaohuar.com')

print(response.status_code)

# Improvement 1: avoid the error by skipping verification; a warning is still printed.
# NOTE: verify=False disables TLS certificate checking - use it for experiments only.

import requests

response = requests.get('https://www.xiaohuar.com', verify=False)

# Certificate is not verified; a warning is printed and (per the original notes) 200 is returned.

print(response.status_code)

# Improvement 2: avoid the error and also silence the warning.

import requests

import urllib3

urllib3.disable_warnings()  # turn the urllib3 warnings off

response = requests.get('https://www.xiaohuar.com', verify=False)

print(response.status_code)

# Improvement 3: send a client certificate (pseudo-code, the paths are placeholders).

# Many sites use https but can be accessed without a client certificate; in most cases it is optional.

# Zhihu, Baidu and similar sites work with or without one.

# Where it is mandatory it must be sent, e.g. sites that only grant access to selected users holding a certificate.

import requests

response = requests.get(

    'https://www.xiaohuar.com',

    cert=('/path/server.crt', '/path/key'))

print(response.status_code)

'''

超时设置

# 两种超时:float or tuple

# timeout=0.1  # 单个数值同时作为连接超时和接收数据（读取）的超时时间

# timeout=(0.1,0.2)  # 0.1代表链接超时  0.2代表接收数据的超时时间

'''

# Timeout demo: the request is given a 0.0001 second timeout.
# NOTE(review): presumably chosen this small on purpose so that the call
# times out and raises - confirm that this is the intended demonstration.
import requests

response = requests.get('https://www.baidu.com',

                        timeout=0.0001)

print(response.status_code)

'''

使用代理（重要指数*******）

# 代理设置:先发送请求给代理,然后由代理帮忙发送(封ip是常见的事情)

# 西刺代理

'''

import requests

# Map each URL scheme to the proxy that should carry it.  A dict literal keeps
# only one value per key, so the alternative 'http' entry is shown as a comment
# rather than as a duplicate key (a duplicate would be silently overwritten).
proxies = {
    # Proxy with credentials: user name and password go before the '@'.
    # 'http': 'http://tank:123@localhost:9527',
    'http': 'http://localhost:9527',
    'https': 'https://localhost:9527',
}

response = requests.get('https://www.12306.cn', proxies=proxies)
print(response.status_code)

# SOCKS proxies are supported as well; install with: pip install requests[socks]
# (user, pass, host and port below are placeholders).
proxies = {
    'http': 'socks5://user:pass@host:port',
    'https': 'socks5://user:pass@host:port'
}

response = requests.get('https://www.12306.cn', proxies=proxies)
print(response.status_code)

'''

# 认证设置

登录网站时，会弹出一个框，要求你输入用户名与密码（类似于alert），此时无法进入html页面，待授权通过后才能进入html页面。

Requests模块为我们提供了多种身份认证方式，包括基本身份认证等...

其原理指的是通过输入用户名与密码获取用户的凭证来识别用户，然后通过token对用户进行授权。

基本身份认证:

    HTTP Basic Auth是HTTP1.0提出的认证方式：客户端对于每一个realm，通过提供用户名和密码来进行认证。当认证失败时，服务器会返回401。

'''

# HTTP basic authentication demo: the same endpoint is requested without
# credentials, with an explicit HTTPBasicAuth object, and with a plain tuple.
import requests

# Use the GitHub API as the test target.

url = 'https://api.github.com/user'

HEADERS = {

    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',

}

# Test 1: no credentials are sent; the original notes expect a 401 response.

response = requests.get(url, headers=HEADERS)

print(response.status_code)  # status code of the unauthenticated request

print(response.text)

# Test 2: authenticate with requests.auth.HTTPBasicAuth; on success the user info is returned.
# NOTE(review): 'uesrname' looks like a typo of 'username' - both values are placeholders.

from requests.auth import HTTPBasicAuth

response = requests.get(url, headers=HEADERS, auth=HTTPBasicAuth('uesrname', 'pwd'))

print(response.text)

# Test 3: a (user, password) tuple passed as ``auth`` is treated as HTTPBasicAuth; on success the user info is returned.

response = requests.get(url, headers=HEADERS, auth=('*******', '**********'))

print(response.text)

'''

上传文件

'''

import requests

# Each file is opened in a ``with`` block so its handle is closed as soon as
# the upload request has been sent, instead of staying open until exit.

# Upload a text file.
with open('user.txt', 'rb') as f:
    files1 = {'file': f}
    response = requests.post('http://httpbin.org/post', files=files1)
print(response.status_code)
print(response.text)

# Upload an image file.
with open('小狗.jpg', 'rb') as f:
    files2 = {'jpg': f}
    response = requests.post('http://httpbin.org/post', files=files2)
print(response.status_code)
print(response.text)

# Upload a video file.
with open('love_for_GD.mp4', 'rb') as f:
    files3 = {'movie': f}
    response = requests.post('http://httpbin.org/post', files=files3)
print(response.status_code)
print(response.text)

四、selenium模块

from selenium import webdriver#用来驱动浏览器

import time

#调用一个动作链对象，破解滑动验证码的时候用的，可以拖动图片

from selenium.webdriver import ActionChains

#按照什么方法查找属性，By.ID,By.CSS_SELECTOR,By.CLASS_NAME

from selenium.webdriver.common.by import  By

from selenium.webdriver.common.keys import Keys#按键盘操作

#和下面的WebDriverWait一起用的，EC是expected_conditions的别名

from selenium.webdriver.support import expected_conditions as EC

#等待页面加载某些元素

from selenium.webdriver.support.wait import WebDriverWait

chrome = webdriver.Chrome()

# try/finally guarantees that the browser is shut down even if a step fails.
try:
    # Explicit wait helper: argument 1 is the driver, argument 2 the timeout in seconds.
    wait = WebDriverWait(chrome, 10)

    # 1. Open Baidu.
    chrome.get('https://www.baidu.com/')

    # 2. Locate the search input box.
    # presence_of_element_located takes ONE locator tuple:
    #   item 1: the lookup strategy (By.ID, By.CSS_SELECTOR, ...)
    #   item 2: the value to look for
    # This explanation must stay a comment: a string literal placed directly
    # in front of the tuple is parsed as a call on the string and fails with
    # "TypeError: 'str' object is not callable".
    input_tag = wait.until(
        EC.presence_of_element_located((By.ID, "kw"))
    )

    # 3. Type the search term.
    input_tag.send_keys("一拳超人")

    # 4. Press the Enter key to submit the search.
    input_tag.send_keys(Keys.ENTER)

    # Keep the result page visible for a moment.
    time.sleep(3)

# Runs no matter what happened above.
finally:
    # quit() closes every window and also stops the driver process;
    # close() would only close the current window.
    chrome.quit()

'''

访问京东主页

'''

# Start a dedicated browser session for this example instead of reusing a
# driver whose window has already been closed.
chrome = webdriver.Chrome()

try:
    # Explicit wait helper: argument 1 is the driver, argument 2 the timeout in seconds.
    wait = WebDriverWait(chrome, 10)

    # 1. Open the JD home page.
    chrome.get('https://www.jd.com')

    # 2. Locate the search input box.  presence_of_element_located takes one
    #    locator tuple: (lookup strategy, value to look for).
    input_tag = wait.until(
        EC.presence_of_element_located((By.ID, 'key'))
    )

    # 3. Type the search term.
    input_tag.send_keys('唐诗三百首')

    # 4. Locate the search button.
    button = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'button')))

    # 5. Click the search button.
    button.click()

    # Keep the result page visible for a while.
    time.sleep(50)

# Runs no matter what happened above.
finally:
    # quit() closes every window and also stops the driver process.
    chrome.quit()

'''

选择器

# 自动登陆百度

'''

from selenium import webdriver

import time

'''

===============所有方法===================

    element是查找一个标签

    elements是查找所有标签

    1、find_element_by_link_text  通过链接文本去找

    2、find_element_by_id 通过id去找

    3、find_element_by_class_name

    4、find_element_by_partial_link_text

    5、find_element_by_name

    6、find_element_by_css_selector

    7、find_element_by_tag_name

'''

# 获取驱动对象

# The ``find_element_by_*`` helpers were removed in Selenium 4.3; the
# ``find_element(By.<STRATEGY>, value)`` form works on both old and new
# releases, so it is used throughout this example.
from selenium.webdriver.common.by import By

# Create the browser driver.
driver = webdriver.Chrome()

try:
    # Log in to Baidu automatically.
    # Open the Baidu home page.
    driver.get('https://www.baidu.com')

    # Implicit wait: element lookups retry for up to 10 seconds.
    driver.implicitly_wait(10)

    # 1. By.LINK_TEXT - match the full link text, e.g.:
    #    driver.find_element(By.LINK_TEXT, '登录').click()

    # 2. By.PARTIAL_LINK_TEXT - match part of the link text.
    send_tag = driver.find_element(By.PARTIAL_LINK_TEXT, '登')
    send_tag.click()

    # 3. By.CLASS_NAME
    send_tag = driver.find_element(By.CLASS_NAME, 'tang-pass-footerBarULogin')
    send_tag.click()
    time.sleep(1)

    # 4. By.NAME
    username = driver.find_element(By.NAME, 'userName')
    username.send_keys('********')
    time.sleep(1)

    # 5. By.ID
    password = driver.find_element(By.ID, 'TANGRAM__PSP_10__password')
    password.send_keys('********')
    time.sleep(1)

    # 6. By.CSS_SELECTOR - any CSS selector; '.pass-button-submit' is the
    #    class-based alternative noted in the original.
    login = driver.find_element(By.CSS_SELECTOR, '#TANGRAM__PSP_10__submit')
    login.click()

    # 7. By.TAG_NAME - look up by tag name, e.g.:
    #    driver.find_element(By.TAG_NAME, 'div')

    # Keep the page visible for a while.
    time.sleep(10)

finally:
    # quit() closes every window and also stops the driver process.
    driver.quit()

今日作业：

'''

爬取快代理：

    1.访问快代理页面

    2.通过re模块解析并提取所有代理

    3.通过ip测试网站对爬取的代理进行测试

    4.若test_ip函数抛出异常代表代理作废，否则代理有效

    5.利用有效的代理进行代理测试

<tr>

                    <td data-title="IP">124.205.143.212</td>

                    <td data-title="PORT">40585</td>

                    <td data-title="匿名度">高匿名</td>

                    <td data-title="类型">HTTP</td>

                    <td data-title="位置">北京市北京市  鹏博士宽带</td>

                    <td data-title="响应速度">2秒</td>

                    <td data-title="最后验证时间">2019-06-17 16:30:54</td>

                </tr>

re:

    <tr>.*?<td data-title="IP">(.*?)</td>.*?<td data-title="PORT">(.*?)</td>

'''

'''

页面链接

第一页：

    https://www.kuaidaili.com/free/

第二页：

    https://www.kuaidaili.com/free/inha/2/

'''

import requests

import re

import time

headers = {

    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',

}

def get_index(url):
    """Pause for one second, then GET ``url`` and return the response.

    The request is sent with the module-level ``headers``; the pause comes
    before every request, so consecutive calls are at least a second apart.
    """
    time.sleep(1)
    return requests.get(url, headers=headers)

def parse_index(text):
    """Yield each proxy listed in ``text`` as an ``'ip:port'`` string.

    ``text`` is the HTML of a proxy listing page.  Every table row that has a
    ``data-title="IP"`` cell followed by a ``data-title="PORT"`` cell
    contributes one item, in document order.
    """
    row_pattern = '<tr>.*?<td data-title="IP">(.*?)</td>.*?<td data-title="PORT">(.*?)</td>'
    # re.S lets '.' cross newlines, because the cells sit on separate lines.
    for row in re.finditer(row_pattern, text, re.S):
        yield ':'.join(row.groups())

def test_ip(ip2):
    """Check whether the proxy ``ip2`` can reach the IP test site.

    Args:
        ip2: Proxy address as an ``'ip:port'`` string.

    Returns:
        ``ip2`` when the test site answers with HTTP 200 through the proxy;
        ``None`` when the request fails or any other status comes back.
    """
    print('测试ip: %s' % ip2)

    proxies = {'https': ip2}

    # IP test site.
    ip_url1 = 'https://www.ipip.net/'

    # Only the network call sits inside the try block.  Any failure here is
    # treated as "this proxy is unusable": it is reported and the function
    # returns None, so the caller's loop keeps going.
    try:
        response2 = requests.get(ip_url1, headers=headers, proxies=proxies, timeout=1)
    except Exception as e:
        print(e)
        return None

    # Return the parameter itself rather than a global that merely happens to
    # hold the same value when called from the main loop.
    if response2.status_code == 200:
        return ip2
    return None

# 使用代理爬取nba

def spider_nba(good_ip1):
    """Fetch the NBA China home page through the proxy ``good_ip1``.

    Prints the status code of the response followed by its body; nothing is
    returned.
    """
    nba_page = requests.get(
        'https://china.nba.com/',
        headers=headers,
        proxies={'https': good_ip1},
    )
    print(nba_page.status_code)
    print(nba_page.text)

# Script entry point: walk the listing pages, test every proxy found on them
# and use each working proxy for one crawl of the NBA site.
# NOTE(review): the loop variables below are module-level globals, so the
# functions defined above can see them - rename them with care.
if __name__ == '__main__':

    # URL template of a listing page; {} is filled with the page number.
    base_url = 'https://www.kuaidaili.com/free/inha/{}/'

    # Page numbers 1 through 2904.
    for line in range(1, 2905):

        ip_url = base_url.format(line)

        # Download the listing page.
        response = get_index(ip_url)

        # Generator of 'ip:port' strings parsed from the page.
        ip_list = parse_index(response.text)

        for ip in ip_list:

            # test_ip() gives back a falsy value for proxies that do not work.
            good_ip = test_ip(ip)

            if good_ip:

                spider_nba(good_ip)

python_day05的更多相关文章

python_day05(去爬登录的豆瓣)
# 爬豆瓣需要用cookie# 需要注意隐藏的参数,即input 里面的默认的一些参数# 需要自己注册一个账户密码import urllib.requestimport http.cookiejarf ...

随机推荐

Redis真集群安装
Redis真集群安装命令文档:http://redisdoc.com/index.html 下载:https://code.google.com/archive/p/redis/downloads ...
C# 闭包对像
主要内容: 1.描述出现的现像 2.分析其出现的原因 3.提示一.看如下一段代码及结果 class Program { static void Main(string[] args) { List& ...
VirtualBox NAT Network配置
VirtualBox NAT Network配置(OSX上的) VirtualBox的5种连接方式 NAT :虚拟机之间不能互通 NAT网络 :本文对象桥接 :一般情况下虚拟机无法设置静态IP,并且 ...
.Net轻量状态机Stateless的简单应用
对于大部分系统中流程的变更,是十分正常的事情,小到一个状态的切换,大到整个系统都是围绕业务流再走,复杂点的有工作流引擎,简单点的几个if/else收工,但是往往有那种,心有余而力不足的,比简单复杂,比 ...
一个html，3D 标签鼓励自己
效果如图: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w ...
设计模式（十一）Composite模式
Composite模式模式能够使容器与内容具有一致性,创造出递归结构.有时,与将文件夹和文件都作为目录条目看待一样,将容器和内容作为同一种东西看待,可以帮助我们方便地处理问题.在容器中既可以放入内容, ...
在vue中使用Ueditor
今天研究的主角是:UEditor UEditor是由百度WEB前端研发部开发的所见即所得的开源富文本编辑器,具有轻量.可定制.用户体验优秀等特点. 版本有很多我用的是:[1.4.3.3 PHP 版本 ...
UART和RS232/RS485的关系是什么？
串口通讯是电子工程师和嵌入式开发工程师面对的最基本问题,RS232则是其中最简单最常用的通讯方式.但是初学者往往搞不清有关的名词如UART和RS232或RS485之间是什么关系,因为它们经常被放到语句 ...
C#/.NET/.NET Core定时任务调度的方法或者组件有哪些--Timer,FluentScheduler,TaskScheduler,Gofer.NET,Coravel,Quartz.NET还是Hangfire？
原文由Rector首发于码友网之 <C#/.NET/.NET Core应用程序编程中实现定时任务调度的方法或者组件有哪些,Timer,FluentScheduler,TaskSchedule ...
[springboot 开发单体web shop] 3. 用户注册实现
目录用户注册 ## 创建数据库 ## 生成UserMapper ## 编写业务逻辑 ## 编写user service UserServiceImpl#findUserByUserName 说明 U ...

python_day05

python_day05的更多相关文章

随机推荐

热门专题