python爬虫基础_requests和bs4

这些都是笔记，还缺少详细整理，后续会更新。

下面这种方式属于入门阶段，手动成分比较多。

首先安装必要组件:

pip3 install requests

pip3 install beautifulsoup4

一、爬汽车之家

#!/usr/bin/env python

# coding:utf-8

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

NEWS_URL = "https://www.autohome.com.cn/news/"
REQUEST_TIMEOUT = 10  # seconds; requests waits indefinitely when no timeout is given

# 1. Download the page.
ret = requests.get(url=NEWS_URL, timeout=REQUEST_TIMEOUT)
ret.raise_for_status()  # stop on 4xx/5xx instead of parsing an error page
# Decode the body with the charset detected from its content
# (ret.apparent_encoding); a fixed value such as "gbk" could be set instead.
ret.encoding = ret.apparent_encoding

# 2. Parse: pick out the wanted content with BeautifulSoup.
soup = BeautifulSoup(ret.text, 'html.parser')  # the 'lxml' parser is faster if installed

# Locate the outer <div> through an attribute dictionary.  The keyword form
# must spell the class filter ``class_`` (trailing underscore), e.g.
#   soup.find(name='div', id='auto-channel-lazyload-article', class_='article-wrapper')
div = soup.find(
    name='div',
    attrs={"id": "auto-channel-lazyload-article", "class": "article-wrapper"},
)
# find() returns None when the container is not in the page; treat that as
# "no articles" rather than crashing on div.find_all.
li_list = div.find_all(name='li') if div is not None else []

for li in li_list:
    h3 = li.find(name='h3')
    if not h3:
        # <li> without a headline: skipped, as it is not an article entry.
        continue
    print(h3.text)

    a = li.find('a')
    if a is not None:
        print(a.get('href'))

    p = li.find(name='p')
    if p is not None:
        print(p.text)
    print('----->' * 20)

    # Download the thumbnail, if the entry has one.
    img = li.find(name='img')
    src = img.get('src') if img is not None else None
    if not src:
        continue
    # The file name is the part after the last '__'; when the URL has no '__'
    # fall back to the last path segment instead of raising IndexError.
    filename = src.rsplit('__', maxsplit=1)[-1].rsplit('/', maxsplit=1)[-1]
    if not filename:
        continue
    # urljoin resolves protocol-relative ('//host/x.jpg'), absolute and
    # relative image URLs against the page URL.
    img_url = urljoin(NEWS_URL, src)
    try:
        down_img = requests.get(url=img_url, timeout=REQUEST_TIMEOUT)
        down_img.raise_for_status()
    except requests.RequestException as exc:
        # One broken image must not abort the whole crawl.
        print('image download failed:', img_url, exc)
        continue
    with open(filename, 'wb') as f:
        f.write(down_img.content)

当然，for 循环中的输出部分可以按需求改为写入文件或数据库，例如下面这样把文字写入文本文件、把图片保存到目录中：

import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

NEWS_URL = "https://www.autohome.com.cn/news/"
IMG_DIR = './img'
REQUEST_TIMEOUT = 10  # seconds; requests waits indefinitely when no timeout is given

# 1. Download the page.
ret = requests.get(url=NEWS_URL, timeout=REQUEST_TIMEOUT)
ret.raise_for_status()  # stop on 4xx/5xx instead of parsing an error page
ret.encoding = ret.apparent_encoding  # decode with the charset detected from the body

# 2. Parse: pick out the wanted content with BeautifulSoup.
soup = BeautifulSoup(ret.text, 'html.parser')  # the 'lxml' parser is faster if installed

# Attribute-dictionary lookup of the article container (the keyword form
# would need ``class_`` with a trailing underscore).
div = soup.find(
    name='div',
    attrs={"id": "auto-channel-lazyload-article", "class": "article-wrapper"},
)
# find() returns None when the container is missing; treat as "no articles".
li_list = div.find_all(name='li') if div is not None else []

# The image directory must exist before the first open(..., 'wb').
os.makedirs(IMG_DIR, exist_ok=True)

# res.txt receives one record per article: title, link, summary, blank line.
with open('res.txt', 'w', encoding='utf-8') as t:
    for li in li_list:
        h3 = li.find(name='h3')
        if not h3:
            # <li> without a headline: skipped, as it is not an article entry.
            continue
        t.write(h3.text + '\n')

        a = li.find('a')
        # Tag.get takes a default, so a missing href yields '' instead of None.
        href = a.get('href', '') if a is not None else ''
        t.write(href + '\n')

        p = li.find(name='p')
        # Keep the text after the first double space; [-1] returns the whole
        # text when there is no double space instead of raising IndexError.
        txt = p.text.split('  ', 1)[-1] if p is not None else ''
        t.write(txt + '\n')
        t.write('\n')

        # Download the thumbnail, if the entry has one.
        img = li.find(name='img')
        src = img.get('src') if img is not None else None
        if not src:
            continue
        # File name: part after the last '__', falling back to the last path
        # segment when the URL contains no '__'.
        filename = src.rsplit('__', maxsplit=1)[-1].rsplit('/', maxsplit=1)[-1]
        if not filename:
            continue
        # urljoin resolves protocol-relative, absolute and relative image URLs.
        img_url = urljoin(NEWS_URL, src)
        try:
            down_img = requests.get(url=img_url, timeout=REQUEST_TIMEOUT)
            down_img.raise_for_status()
        except requests.RequestException as exc:
            # One broken image must not abort the whole crawl.
            print('image download failed:', img_url, exc)
            continue
        with open(os.path.join(IMG_DIR, filename), 'wb') as f:
            f.write(down_img.content)

二、登录抽屉

#!/usr/bin/env python

# coding:utf-8

import requests

# Always send browser-like request headers, and visit an ordinary page before
# logging in: the closer the traffic looks to a real browser, the better.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}
REQUEST_TIMEOUT = 10  # seconds; requests waits indefinitely when no timeout is given

# 1. Visit a page first to obtain a cookie (not yet authorised).
ret = requests.get(
    url="https://dig.chouti.com/all/hot/recent/1",
    headers=HEADERS,
    timeout=REQUEST_TIMEOUT,
)
# print(ret.text)
r1_cookie_dict = ret.cookies.get_dict()

# 2. Log in: send the credentials together with the unauthorised cookie from
#    step 1.  Mind the site's anti-crawler measures.
#    NOTE: "phone" and "password" are placeholders; fill in real values and
#    never commit real credentials.
response_login = requests.post(
    url="https://dig.chouti.com/login",
    data={
        "phone": "",
        "password": "wodemima",
        "oneMonth": "",
    },
    headers=HEADERS,
    cookies=r1_cookie_dict,
    timeout=REQUEST_TIMEOUT,
)
# print(response_login.text)
# The cookies returned by the login response
# (response_login.cookies.get_dict()) are not used: the vote below is sent
# with the cookie obtained in step 1.

# 3. Up-vote a link using the step-1 cookie.
r1 = requests.post(
    url="https://dig.chouti.com/link/vote?linksId=20630611",
    headers=HEADERS,
    cookies=r1_cookie_dict,
    timeout=REQUEST_TIMEOUT,
)
print(r1.text)
# Sample response:
# {"result":{"code":"9999", "message":"推荐成功", "data":{"jid":"cdu_53074732774","likedTime":"1530752755154000","lvCount":"21","nick":"aabbccdd","uvCount":"1","voteTime":"小于1分钟前"}}}

requests和bs4的几个小片段：

#!/usr/bin/env python

# coding:utf-8

import requests,re

from bs4 import BeautifulSoup

# Cheat sheet (kept as a bare string): requests.get / requests.post are
# shortcuts for requests.request(method=...), plus the arguments to memorise.
'''
requests.get(url="http://www.baidu.com")  # requests.request(method="get",url="xxx")
requests.post(url="http://www.baidu.com")  # requests.request(method="post",url="xxx")
可以传的参数：
url: 地址
params: URL中传入的参数
headers: 请求头
cookies: Cookie
data: 数据
    以上必需牢记
'''

# ``params`` is URL-encoded into the query string by requests.
ret = requests.get(
    url="https://www.baidu.com/s",
    params={"wd": "王历宏"},  # -> https://www.baidu.com/s?wd=%E7%8E%8B%E5%8E%86%E5%AE%8F
    headers={
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    },
    timeout=10,  # seconds; requests waits indefinitely when no timeout is given
)
ret.encoding = ret.apparent_encoding  # decode with the charset detected from the body
# print(ret.text)

soup = BeautifulSoup(ret.text, 'html.parser')
# The result-count element; find() returns None when the page does not
# contain it (for example when an anti-crawler page is served instead).
div = soup.find(name='span', attrs={"class": "nums_text"})
if div is not None:
    # To keep only the digits:  print("".join(re.findall(r"\d+", div.text)))
    print(div.text)
else:
    print('result count element (span.nums_text) not found')

# Sending JSON (kept as a bare string).  ``json=`` lets requests serialise the
# payload; passing ``data=json.dumps(...)`` sends the same JSON text as the raw
# body and needs ``import json``.
'''
### json参数
requests.post(
    url="http://www.baidu.com",
    # json={
    #     'name':'alex',
    #     'passwd':'123456',
    # },
    headers={},
    cookies={},
    # 如果搞不清对方是要Form_data 还是payload 就使用下面的方式。
    data=json.dumps({
        'name':'alex',
        'pwd':'123456',
    })
)
'''

# File upload: done through the ``files=`` argument (no example here yet).

# auth: HTTP authentication, i.e. the kind of login that shows up as a
# browser pop-up dialog.
from requests.auth import HTTPBasicAuth, HTTPDigestAuth

# NOTE(review): GitHub's API is believed to no longer accept account
# passwords over Basic auth (a personal access token is needed) -- confirm
# against the current GitHub documentation before relying on this call.
res = requests.get(
    'https://api.github.com/user',
    auth=HTTPBasicAuth("abc@163.com", ""),
    # Digest authentication is used the same way, with a different class:
    # auth=HTTPDigestAuth("abc@163.com", "11223344"),
    timeout=10,  # seconds; requests waits indefinitely when no timeout is given
)
print(res.text)

# timeout: how many seconds to wait for the server before giving up.
# allow_redirects: whether redirects are followed automatically.

# proxies: mapping of URL scheme (or scheme://host) to the proxy URL to use.
# Proxy URLs must include their own scheme.
'''
proxies ={
    "http":"http://61.172.249.96:80",
    "https":"http://61.185.219.126:3128",
}
ret = requests.get("http://www.proxy360.cn/Proxy",proxies=proxies)
proxies2 = {"http://10.20.1.128":"http://10.10.1.10:5323"}
'''

# A proxy dictionary combined with a proxy user name and password.
'''
from requests.auth import HTTPProxyAuth
proxy_dict={
    'http':'http://77.75.105.165',
    'https':'http://77.75.105.166'
}
auth=HTTPProxyAuth('username','mypwd')
r = requests.get("http://www.google.com",proxies=proxy_dict,auth=auth)
'''

我上交的作业，还是有不少问题。

#!/usr/bin/env python

# coding:utf-8

from getpass import getpass

import requests
from bs4 import BeautifulSoup

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
}
REQUEST_TIMEOUT = 10  # seconds; requests waits indefinitely when no timeout is given


def _input_value(soup, field):
    """Return the ``value`` attribute of ``<input name=field>``, or None if absent."""
    tag = soup.find(name="input", attrs={"name": field})
    return tag.get("value") if tag is not None else None


def _selected_option(soup, field):
    """Return the value of the selected ``<option>`` of ``<select name=field>``.

    A ``<select>`` has no "option" attribute; the chosen entry is the child
    ``<option>`` that carries the ``selected`` attribute.  Returns None when
    the select or a selected option is missing.
    """
    select = soup.find(name="select", attrs={"name": field})
    if select is None:
        return None
    option = select.find(name="option", attrs={"selected": True})
    return option.get("value") if option is not None else None


def _textarea_text(soup, field):
    """Return the text inside ``<textarea name=field>``, or None if absent.

    A ``<textarea>`` keeps its content as element text, not in a ``value``
    attribute.
    """
    area = soup.find(name="textarea", attrs={"name": field})
    return area.get_text() if area is not None else None


username = input("请输入github账号：")
pwd = getpass("请输入github密码：")  # getpass does not echo the password
print("请稍等几秒... ")

# One Session carries the headers and every cookie across all requests,
# including cookies set on intermediate redirect responses, which
# Response.cookies of the final response does not contain.
session = requests.Session()
session.headers.update(HEADERS)

# 1. Open the login page to get the initial cookies and the form token.
ret1 = session.get(url="https://github.com/login", timeout=REQUEST_TIMEOUT)
ret1.raise_for_status()
soup1 = BeautifulSoup(ret1.text, features='lxml')
token1 = _input_value(soup1, "authenticity_token")  # hidden token of the login form
if token1 is None:
    raise SystemExit("authenticity_token not found on the login page")

# 2. Submit the login form.
ret2 = session.post(
    url="https://github.com/session",
    data={
        "commit": "Sign in",
        "utf8": "✓",
        "authenticity_token": token1,
        "login": username,
        "password": pwd,
    },
    timeout=REQUEST_TIMEOUT,
)
ret2.raise_for_status()
# print(ret2.text)

# 3. The assignment asks for the personal details, so open the profile
#    settings page with the logged-in session.
ret3 = session.get(url="https://github.com/settings/profile", timeout=REQUEST_TIMEOUT)
ret3.raise_for_status()
# print(ret3.text)

# 4. Find and print the personal details.
soup3 = BeautifulSoup(ret3.text, features='lxml')
if soup3.find(name="input", attrs={"name": "user[profile_name]"}) is None:
    # The profile form is absent, e.g. because the login did not succeed.
    raise SystemExit("profile form not found; login probably failed")

user_info_name = _input_value(soup3, "user[profile_name]")
user_info_email = _selected_option(soup3, "user[profile_email]")
user_info_bio = _textarea_text(soup3, "user[profile_bio]")
user_info_url = _input_value(soup3, "user[profile_blog]")
user_info_company = _input_value(soup3, "user[profile_company]")
user_info_location = _input_value(soup3, "user[profile_location]")

print('Name: ',user_info_name)
print('Public email: ',user_info_email)
print('Bio: ',user_info_bio)
print('URL: ',user_info_url)
print('Company: ',user_info_company)
print('Location: ',user_info_location)

# Alternative through the API (kept as a bare string).
# NOTE(review): GitHub's API is believed to no longer accept account
# passwords over Basic auth -- confirm before using this variant.
'''
以下是API的方式，试过，直接得到字典。
from requests.auth import HTTPBasicAuth
res = requests.get(
    'https://api.github.com/user', auth=HTTPBasicAuth(username, pwd)
)
print(res.text)
'''

以下是老师给的指导意见，真是非常好的反馈：

1.请了解下python的pep8规范

2.你的请求头一定要写完整，不要这么暴露你的爬虫请求，这种行为是不好的习惯。

3.你代码的注释写在文档里最好了。

4.你每个请求一定要try一下，这在爬虫里很重要，你要保证你的爬虫稳定运行。

5.你的代码应该封装成函数

6.你写任何项目的时候注意下项目结构哈

7.同学作业写的很好了，其实生产中bs4还是不多的。pyquery或者路径获取的方式用的很多。

python爬虫基础_requests和bs4的更多相关文章

Python爬虫基础
前言 Python非常适合用来开发网页爬虫,理由如下: 1.抓取网页本身的接口相比与其他静态编程语言,如java,c#,c++,python抓取网页文档的接口更简洁:相比其他动态脚本语言,如perl ...
python爬虫-基础入门-python爬虫突破封锁
python爬虫-基础入门-python爬虫突破封锁 >> 相关概念 >> request概念:是从客户端向服务器发出请求,包括用户提交的信息及客户端的一些信息.客户端可通过H ...
python爬虫-基础入门-爬取整个网站《3》
python爬虫-基础入门-爬取整个网站<3> 描述: 前两章粗略的讲述了python2.python3爬取整个网站,这章节简单的记录一下python2.python3的区别 python ...
python爬虫-基础入门-爬取整个网站《2》
python爬虫-基础入门-爬取整个网站<2> 描述: 开场白已在<python爬虫-基础入门-爬取整个网站<1>>中描述过了,这里不在描述,只附上 python3 ...
python爬虫-基础入门-爬取整个网站《1》
python爬虫-基础入门-爬取整个网站<1> 描述: 使用环境:python2.7.15 ,开发工具:pycharm,现爬取一个网站页面(http://www.baidu.com)所有数 ...
python爬虫基础要学什么，有哪些适合新手的书籍与教程？
一,爬虫基础: 首先我们应该了解爬虫是个什么东西,而不是直接去学习带有代码的内容,新手小白应该花一个小时去了解爬虫是什么,再去学习带有代码的知识,这样所带来的收获是一定比你直接去学习代码内容要多很多很 ...
Python爬虫基础之认识爬虫
一.前言爬虫Spider什么的,老早就听别人说过,感觉挺高大上的东西,爬网页,爬链接~~~dos黑屏的数据刷刷刷不断地往上冒,看着就爽,漂亮的校花照片,音乐网站的歌曲,笑话.段子应有尽有,全部都过来 ...
python 爬虫基础知识一
网络爬虫(又被称为网页蜘蛛,网络机器人,在FOAF社区中间,更经常的称为网页追逐者),是一种按照一定的规则,自动的抓取万维网信息的程序或者脚本. 网络爬虫必备知识点 1. Python基础知识2. P ...
Python爬虫基础（一）——HTTP
前言因特网联系的是世界各地的计算机(通过电缆),万维网联系的是网上的各种各样资源(通过超文本链接),如静态的HTML文件,动态的软件程序······.由于万维网的存在,处于因特网中的每台计算机可以很 ...

随机推荐

object的wait()、notify()、notifyAll()、方法和Condition的await()、signal()方法
wait().notify()和notifyAll()是 Object类中的方法从这三个方法的文字描述可以知道以下几点信息: 1)wait().notify()和notifyAll()方法是本地方 ...
雷林鹏分享：jQuery EasyUI 数据网格 - 格式化列
jQuery EasyUI 数据网格 - 格式化列以下实例格式化在 easyui DataGrid 里的列数据,并使用自定义列的 formatter,如果价格小于 20 就将文本变为红色. 为了格式 ...
Facebook主页照片和封面照片的尺寸要求
为什么好好的照片上传到Facebook后效果总不理想?为了避免你的照片在上传时被压缩,建议你尽量调整一下图片大小和格式,下面一起来看看Facebook主页照片和封面照片的尺寸要求. 1. Facebo ...
vue版弹幕
效果: 下载优化版下载: https://pan.baidu.com/s/1mvKGwJsBjXd2hvvi5Rp9pA 用法: import barrage from '../components ...
jquary高级和ajax
jquary高级: 1.动画 1.三种方式显示与隐藏元素 1.默认显示和隐藏的方式 1.show([speed],[easing],[fn]):显示 [speed],[easing],[fn] spe ...
prometheus监控示例
prometheus架构图 prometheus 各组件介绍 Prometheus Server: 使用pull方式采集监控数据,在该组件上配置监控数据的采集和告警规则. Client Library ...
docker 中安装 rabbitMQ
安装rabbitMQ的命令 docker run -d --hostname my-rabbit --name rabbit -e RABBITMQ_DEFAULT_USER=admin -e RAB ...
Previous operation has not finished;run 'cleanup' if it was interrupted;Please execute the 'Cleanup' command.
今天更新文件夹时svn报错如下提示说让clean up,但是clean up又提示fail,让继续clean up,这样就陷入死循环了…… 搜了多种解决办法后找到原因:当时正在打开着svn的某个文件 ...
Firebug: 已拦截跨源请求：同源策略禁止读取位于XXX的远程资源。（原因：CORS 头缺少 'Access-Control-Allow-
第一种,就是在被请求的程序中添加HTTP头,即CORS跨域(跨域资源共享,Cross-Origin Resource Sharing) 如: Response.Headers.Add("Ac ...
mybatis源码数据库链接配置
<?xml version="1.0" encoding="UTF-8" ?> <!DOCTYPE configuration ...

python爬虫基础_requests和bs4

python爬虫基础_requests和bs4的更多相关文章

随机推荐

热门专题