3.编写函数parse。注意:该函数的函数名不能改变,因为Scrapy中默认callback函数的函数名就是parse。
# 1.scrapy中post请求的发送:重写源码中的start_requests方法
# 因为源码中这样写的:for url in self.start_urls:
# yield self.make_requests_from_url(url) (make_requests_from_url方法的返回结果是Request对象)
# 2.在scrapy 框架中,会自动对cookie进行处理,可以在settings中设置不处理 COOKIES_ENABLED = False
示例代码:
import scrapy
class LoginSpider(scrapy.Spider):
    """Log in to renren.com with a POST form, then request a profile page.

    Demonstrates two points from the notes above:
    1. POST as the initial request: override ``start_requests`` so the
       first request is a ``FormRequest`` instead of the default GET
       built by ``make_requests_from_url``.
    2. Scrapy handles session cookies automatically, so the follow-up
       request in ``parse`` is made with the logged-in session.
    """
    name = 'login'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=201903160368']

    def start_requests(self):
        # All form values must be strings; this is the renren login payload.
        data = {
            "email": "15516092050",
            "icode": '',
            "origURL": "http://www.renren.com/home",
            "domain": "renren.com",
            "key_id": '1',
            "captcha_type": "web_login",
            "password": "5e088a2ee22d34dd081aac25578e67bd3a2d851cdfbcf1f0c9ab7056bd1bad62",
            "rkey": "3f4696f6fa1b89e9061868300bf11484",
            "f": "http%3A%2F%2Fwww.renren.com%2F969395731",
        }
        for url in self.start_urls:
            # FormRequest sends a POST with `data` as the form body.
            yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse)

    def parse(self, response):
        """Callback for the login response; follow to a profile page."""
        # Session cookies from the login are attached automatically by Scrapy.
        detail_url = 'http://www.renren.com/969395731'
        yield scrapy.Request(url=detail_url, callback=self.GetDetail)

    def GetDetail(self, response):
        """Print the profile page body to confirm the login worked."""
        print(response.text)
1.用scrapy爬取数据时,如果发现需要爬取的数据不在同一页面内,则必须使用请求传参(Request的meta参数)的方式在回调之间传递item,再进行持久化存储
2.示例代码
class MovieSpider(scrapy.Spider):
    """Scrape movie listings from 55xia.com, following each detail page.

    Demonstrates "request passing": when one item's data spans two pages,
    the partially-filled item is handed to the next callback through
    ``Request.meta`` and completed (and yielded) there.
    """
    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.55xia.com/']

    def parse(self, response):
        """Extract name/score from the listing page; follow each detail link."""
        div_list = response.xpath('/html/body/div[1]/div[2]/div[1]/div/div | /html/body/div[1]/div[2]/div[3]/div/div')
        for div in div_list:
            item = MoviedemoItem()
            detail_url = div.xpath('./div/div/h1/a/@href')
            if not detail_url:
                # Entry without a detail link -- nothing to follow, skip it.
                continue
            # hrefs are scheme-relative ("//..."), so prepend the scheme.
            detail_url = 'http:' + detail_url.extract_first()
            name = div.xpath('./div/div/h1/a/text()').extract_first()
            score = div.xpath('./div/div/h1/em/text()')
            # Fall back to a placeholder when no rating element exists.
            score = score.extract_first() if score else '暂无评价'
            item['name'] = name
            item['score'] = score
            print(name)
            print(score)
            # Carry the partially-filled item to the detail callback via meta.
            yield scrapy.Request(url=detail_url, callback=self.GetDetail, meta={'item': item})

    def GetDetail(self, response):
        """Fill the remaining fields (director, synopsis) and yield the item."""
        item = response.meta['item']
        direct = response.xpath('/html/body/div[1]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[1]/td[2]/a/text()').extract_first()
        detail = response.xpath('/html/body/div[1]/div/div/div[1]/div[2]/div[2]//text()').extract_first()
        item['direct'] = direct
        item['detail'] = detail
        print(direct)
        print(detail)
        yield item
1.UA池:User-Agent池
- 作用:尽可能多的将scrapy工程中的请求伪装成不同类型的浏览器身份。
- 操作流程:
1.在下载中间件中拦截请求
2.将拦截到的请求的请求头信息中的UA进行篡改伪装
3.在配置文件中开启下载中间件
# Desktop-browser User-Agent strings; per the notes above, a download
# middleware picks one at random per request to vary the spider's
# apparent browser identity.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]
2.代理池
- 作用:尽可能多的将scrapy工程中的请求的IP设置成不同的。
- 操作流程:
1.在下载中间件中拦截请求
2.将拦截到的请求的IP修改成某一代理IP
3.在配置文件中开启下载中间件
示例代码
# Proxy pools for a download middleware (this runs inside process_request,
# where `request` is the intercepted request). A proxy is chosen at random
# per request; the pool is matched to the URL's scheme.
http_proxy = ['http://91.226.35.93:53281', 'http://110.52.235.73:9999', 'http://151.3.53.246:53281']
https_proxy = ['https://106.104.168.15:8080', 'https://93.190.143.59:1080', 'https://223.27.212.41:8080']
# request.url.split(':')[0] is the scheme ('http' or 'https').
if request.url.split(':')[0] == 'http':
    request.meta['proxy'] = random.choice(http_proxy)
else:
    request.meta['proxy'] = random.choice(https_proxy)