requests获取所有状态码

requests默认会自动跟随301/302重定向，所以拿到的是跳转后最终响应的状态码（通常是200），而不是301/302本身。可以设置allow_redirects=False禁止自动跳转，这样就可以获取包括301/302在内的所有状态码了

import requests

# Demo script: request one URL with redirects disabled and print what came
# back, so that 301/302 responses are reported instead of being followed.
#
# Swap `url` for one of the alternatives below to reproduce each case. The
# notes are the original author's observations ("requested -> reported"):
#
# url = 'http://www.freebuf.com/news/157100.html'  # 200 -> 200
# url = 'http://www.freebuf.com/articles/database/151839.html'  # 403 -> 403
# url = 'http://www.freebuf.com/articles/database/1518391.html'  # missing page on an existing domain: 404 -> 404
# url = 'http://www.freebudfsf.com/articles/database/1518391.html'  # non-existent domain: raises (crashes if uncaught)
# url = 'https://www.douban.com/group/topic/49606658/'  # existing domain blocked by the company network: raises, same effect as a network outage
# url = 'http://10.1.75.241'  # bare IP: the http:// scheme is mandatory, otherwise requests raises
#
# 302 page: with redirects followed the reported status is 200; passing
# allow_redirects=False is what exposes the 302 itself.
url = 'http://www.freebuf.com/fevents/133225.html'

# Browser-like User-Agent sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

try:
    # requests applies no timeout unless one is given, so an unresponsive
    # server would block this script forever; bound the wait explicitly.
    response = requests.get(url, headers=headers, allow_redirects=False, timeout=10)

    # Show the URL we asked for, the URL actually requested and the URL of
    # the response, followed by the raw body and the status code.
    print('    give url:', url)
    print(' request.url:', response.request.url)
    print('response.url:', response.url)
    print(response.content)
    print(response.status_code)
except Exception as e:
    # Top-level boundary of the demo: report the failure (DNS error,
    # connection reset, timeout, malformed URL, ...) instead of crashing.
    print(e)

封装一个获取所有状态码的函数，同时实现验证返回值的方法

import requests

def get_statecode_or_errinfo(url='', timeout=10):
    """Return the HTTP status code for *url*, or the error that prevented a response.

    The request is sent with ``allow_redirects=False`` so that redirect
    responses (301/302) are reported as such rather than being followed.

    :param url: URL to request; it must include the scheme (``http://`` ...).
    :param timeout: seconds to wait for the server, passed to
        ``requests.get``. Without it requests waits indefinitely.
    :return: the integer status code when a response was received; the
        caught exception object when the request failed; a usage message
        (str) when no URL was supplied.
    """
    # Treat None the same as the empty string: neither is a usable URL.
    if not url:
        return '请输入一个url作为get_statecode_or_errinfo的参数'

    # Browser-like User-Agent sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }

    try:
        response = requests.get(
            url,
            headers=headers,
            allow_redirects=False,
            timeout=timeout,
        )
    except Exception as e:
        # Contract of this helper: never raise, hand the failure back to the
        # caller as the return value.
        return e

    return response.status_code

if __name__ == '__main__':
    # Demo: fetch one page and print several properties of the response that
    # can be used to verify what was actually returned.
    #
    # Alternative URLs for experimenting. The notes are the original author's
    # observations ("requested -> reported"):
    #
    # url = 'http://www.freebuf.com/news/157100.html'  # 200 -> 200
    # url = 'http://www.freebuf.com/fevents/133225.html'  # 302 -> 200; pass allow_redirects=False to get the 302 itself
    # url = 'http://www.freebuf.com/articles/database/151839.html'  # 403 -> 403
    # url = 'http://www.freebuf.com/articles/database/1518391.html'  # missing page on an existing domain: 404 -> 404
    # url = 'http://www.freebudfsf.com/articles/database/1518391.html'  # non-existent domain: raises; behind an Nginx proxy it returned 200
    # url = 'http://dsfs'  # non-existent domain with allow_redirects=False behind Nginx: a 304 was involved, 200 returned
    # url = 'https://www.douban.com/group/topic/49606658/'  # existing domain blocked by the company network: raises, same effect as a network outage
    # url = 'http://10.1.75.241'  # bare IP: 200 -> 200 (the http:// scheme is mandatory, otherwise requests raises)
    url = 'http://www.freebuf.com/news/171238.html'

    # Browser-like User-Agent sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }

    # Redirects are followed here; add allow_redirects=False to see 301/302.
    # requests applies no timeout unless one is given, so bound the wait.
    response = requests.get(url, headers=headers, timeout=10)

    # Status code.
    print(response.status_code)

    # Requested URL vs. URL of the response (they differ after a redirect).
    print(url)
    print(response.url)

    # Headers that were sent.
    print(response.request.headers)

    # Headers that were received.
    print(response.headers)

    # The body can be inspected with response.content (bytes),
    # response.content.decode() or response.text; set
    # response.encoding = 'utf-8' first to force how .text is decoded.
    print(response.encoding)

    # Body length in bytes.
    print(len(response.content))

说明：

反爬：
总结多种验证返回值的方式。requests
比如：检查状态码、检查url（有可能发生了跳转）、检查请求头、检查响应头、检查源码、检查源码字符串长度。
检查状态码
print (response.status_code)
检查url
print (response.url)
检查请求头
print (response.request.headers)
检查响应头
print (response.headers)
检查源码字符串长度
print (len(response.content))
检查源码
print (response.content)
print (response.content.decode())
response.encoding='utf-8'
print (response.text)
print (response.encoding)

scrapy爬虫的响应规则：

# 1、被过滤掉，不发出请求：不在允许的域名范围内

# temp['title_url'] = "https://www.baidu.com/"  # 跨域。请求发出前，url直接被过滤掉。

# temp['title_url'] = "http://open.freebuf.com/live?id=1021"  # 跨域。请求发出前，url直接被过滤掉。

# temp['title_url'] = "http://10.1.75.241"  # 请求ip地址，请求发出前，url直接被过滤掉。如果设置为允许ip网站，没有被过滤，就返回200

# 2、禁止访问

# temp['title_url'] = "http://www.freebuf.com/articles/database/151839.html"#禁止访问403，资源存在，不让访问。Ignoring non-200 response

# temp['title_url'] = "http://www.freebuf.com/articles/database/1518391.html"#禁止访问404，资源本身不存在。Ignoring non-200 response

# 3、重定向后的作为新请求

# temp['title_url'] = "http://www.freebuf.com/news/156654.html"  # 重定向301、302。会返回重定向后200的状态码

# 4、断网

# temp['title_url'] = "https://www.douban.com/group/topic/49606658/"  # 公司限制访问。[<twisted.python.failure.Failure twisted.internet.error.ConnectionLost: Connection to the other side was lost in a non-clean fashion: Connection lost.>]

# 5、没有的网站

# temp['title_url'] = "https://www.badfsdsdfsdfsdfsdddd.com/"  # 直接被过滤掉，如果没有被过滤，就返回域名解析错误：DNS lookup failed: no results for hostname lookup: www.badfsdsdfsdfsdfsdddd.com.

pass

scrapy爬虫举例

freebuf2.py

# -*- coding: utf-8 -*-

import scrapy

from scrapy_FB.items import ScrapyFb2Item

# from util.logger import Logger

# logger_freebuf2 = Logger(logname=__name__, logpath='collection_log', logformat=1, loglevel=10).getlog()

# logger_freebuf2.debug('i am debug3')

# logger_freebuf2.info('i am info3')

# logger_freebuf2.warning('i am warning3')

class Freebuf2Spider(scrapy.Spider):
    """Walk FreeBuf list pages and request every article linked from them.

    Each list page yields one request per article link (handled by
    ``parse_detail``, failures by ``err``) plus a request for the next list
    page, until a list page with no article links is reached.
    """

    name = 'freebuf2'
    allowed_domains = ['freebuf.com', 'douban.com']
    start_urls = ['http://www.freebuf.com/page/708']

    def parse(self, response):
        """Parse one list page.

        :param response: response for a URL of the form ``.../page/<n>``.
        :return: generator of ``scrapy.Request`` objects, one per article
            link, followed by the request for page ``n + 1``.
        """
        cur_url = response.url  # URL of the current list page
        # The page number is the last path segment of the list URL.
        cur_page_num = int(cur_url.rpartition('/')[-1])
        print('cur_url：%s' % cur_url)
        print('cur_page_num：%s' % cur_page_num)

        # One <a> node per article on this list page.
        node_list = response.xpath('//*[@id="timeline"]/div/div[2]/dl/dt/a[1]')
        count_node = len(node_list)
        print('len(node_list)：%s' % count_node)

        for i, node in enumerate(node_list):
            href = node.xpath('./@href').extract_first()
            if not href:
                # An anchor without a target cannot be requested; skip it
                # rather than abort the whole page.
                continue

            temp = ScrapyFb2Item()
            # extract_first() returns None when the anchor has no text.
            temp['title'] = (node.xpath('./text()').extract_first() or '').strip()
            # urljoin leaves absolute URLs untouched and resolves relative ones.
            temp['title_url'] = response.urljoin(href)

            # To reproduce the response-handling experiments, override the
            # URL of the first entry here, e.g. ``if i == 0: temp['title_url'] = ...``
            # with one of the following (notes are the author's observations):
            #   1. Filtered, never sent (outside allowed_domains):
            #      "https://www.baidu.com/"
            #      "http://open.freebuf.com/live?id=1021"
            #      "http://10.1.75.241"  (IP address; returns 200 if IPs are allowed)
            #   2. Non-200, logged as "Ignoring non-200 response":
            #      "http://www.freebuf.com/articles/database/151839.html"   (403)
            #      "http://www.freebuf.com/articles/database/1518391.html"  (404)
            #   3. Redirect 301/302, followed as a new request that returns 200:
            #      "http://www.freebuf.com/news/156654.html"
            #   4. Connection cut (blocked by the company network), reported as
            #      twisted.internet.error.ConnectionLost:
            #      "https://www.douban.com/group/topic/49606658/"
            #   5. Non-existent site: filtered; if not filtered, DNS lookup fails:
            #      "https://www.badfsdsdfsdfsdfsdddd.com/"

            temp['page_num'] = str(cur_page_num)
            temp['line_num'] = i + 1
            temp['line_total'] = str(count_node)

            yield scrapy.Request(
                temp['title_url'],
                callback=self.parse_detail,
                meta={"meta_1": temp},
                errback=self.err,
            )

        # Keep paging only while the list page still has entries; an empty
        # page ends the crawl.
        if node_list:
            next_url = 'http://www.freebuf.com/page/{}'.format(cur_page_num + 1)
            yield scrapy.Request(next_url, callback=self.parse)

    def parse_detail(self, response):
        """Print the line number with the item URL and the requested URL.

        :param response: article response; the item built in ``parse`` is
            read from ``response.meta['meta_1']``.
        """
        item = response.meta['meta_1']
        print(item['line_num'], item['title_url'])
        print(item['line_num'], response.request.url)

    def err(self, response):
        """Errback: print the URL that failed and the error message.

        :param response: despite the name this is used as a failure object
            (``getErrorMessage()`` is called on it), not as a response.
        """
        print('err:', response.request.url)
        print('err:', response.getErrorMessage())
        # Debug aid: list the attributes available on the failure object.
        print(dir(response))

requests获取所有状态码的更多相关文章

如何在使用 RemoteWebDriver 打开网页的同时获取 Http 状态码
最近一直在用Selenium这个开源项目写一些web 自动化的小玩意.本来一直运行的挺好,直到有一天突然发现资源抓取失败了,翻看日志才发现,原来本该正常打开的页面返回了504错误所以自然失败了.如何避 ...
LODOP获取打印机状态码和状态码含义测试
由于打印机千差万别,打印机执行的标准也不一样,LODOP获取的打印状态码也可能不同,安装了个打印机驱动实际测试一下,测试的打印机驱动是Brother Color Type3 Class Driver. ...
LODOP获取打印状态码和时间列表
之前有博文介绍获取打印状态码和打印状态码的含义,相关博文:LODOP获取打印机状态码和状态码含义测试.此外 ,也有获取状态码及其变化的方法,可以获取打印状态码的列表,列表包含每个状态和每个状态的时间. ...
前端如何获取http状态码400的返回值
axios.get("/check_mobile_and_sent_code",{withCredentials:true,params:{mobile:formInline.mo ...
【总结】crontab 使用脚本及直接获取HTTP状态码
一.在crontab里面计划执行的脚本,所有的命令都要写出绝对路径.因为crontab的独立的进程,可能无法直接加载环境变量. 二.在判断网站能否正常访问一般的思路: 1. 判断网站是否能够正常打开. ...
（六）获取http状态码和处理返回结果
int StatusCode = httpResponse.getStatusLine().getStatusCode(); 处理返回结果: /** * 处理返回结果 * @param respons ...
c# HttpWebResponse 各种情况下获取StatusCode状态码
捕捉网页出现404.500等会直接抛出WebException异常异常代码: (HttpWebResponse)req.GetResponse(); 当执行这段代码出现异常解决问题那如果我们想获 ...
(404) 未找到获取StatusCode状态码
异常代码: (HttpWebResponse)req.GetResponse(); 当执行这段代码出现异常解决问题那如果我们想获得错误发生时候服务器段错误页面的源代码该如何做呢? 其实非常非常简单 ...
php获取响应状态码
$ch = curl_init('http://www.jb51.net'); curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); curl_exec($ch); ...

随机推荐

hadoop2.7.1单机和伪集群的搭建-0
内容中包含 base64string 图片造成字符过多,拒绝显示
【MFC】OnInitDialog
OnInitDialog OnInitDialog是MFC的面向对象编程语言的类CDialog中的初始化成员函数名(虚函数).相当于对对话框进行初始化处理. 属性初始化成员函数名处 ...
module、applet
Each Module is developed as a standalone Windows DLL.Each module can contain one or more applets, an ...
第一章 Spring.Net介绍
1.1 概述在Java社区中Spring经历的长时间的考验,已经是一套很成熟稳定的框架.而Spring.Net 的前身是 Java版的Spring.Spring.Net是以Java版的Spring框 ...
Android Studio 引入 so 文件
1.在build.gradle中添加配置 task nativeLibsToJar(type: Zip, description: "create a jar archive of the ...
java中Date的使用情况
在开发中常使用情况. 1.将String转为date 例如"201604131630" //设置日期格式 public SimpleDateFormat sdf = new Si ...
Python cookielib 模块
什么是 cookie : 指某些网站为了辨别用户身份,进行 session 跟踪而储存在用户本地终端上的数据,通常以 txt 文件形式存储.比如你登录了淘宝,浏览器就会保存 cookie 信息,这样我 ...
linux系统输入法设置
首先是要安装了中文输入法,下面以搜狗为例. 2 从system settings 进入language support ,在keyboard input method system 中是看不到自己安装 ...
http协议详解-摘抄
引言 HTTP 是一个属于应用层的面向对象的协议,由于其简捷.快速的方式,适用于分布式超媒体信息系统.它于1990年提出,经过几年的使用与发展,得到不断地完善和扩展.目前在WWW中使用的是HTTP/ ...
C++中的枚举变量
至从C语言开始enum类型就被作为用户自定义分类有限集合常量的方法被引入到了语言当中,而且一度成为C++中定义编译期常量的唯一方法(后来在类中引入了静态整型常量).根据上面对enum类型的描述,有以下 ...

requests获取所有状态码

requests获取所有状态码的更多相关文章

随机推荐

热门专题