# -*- coding: utf-8 -*-
__author__ = 'Frank Li'
import requests
import json


class HotSpider(object):
    """Crawl Douban's "hot domestic TV" mobile API and save the items as JSON lines.

    Each page of results is fetched through a shared ``requests`` session and
    appended to a text file, one JSON object per line.
    """

    # Items requested per page; must stay in sync with ``count=18`` in ``self.url``.
    page_size = 18

    def __init__(self):
        # ``{}`` is filled with the ``start`` offset of the page to fetch.
        self.url = "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_domestic_hot/items?os=android&for_mobile=1&start={}&count=18&loc_id=108288"
        self.session = requests.session()
        self.headers = {
            "Referer": "https://m.douban.com/tv/chinese",
            "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Mobile Safari/537.36",
        }

    def parse_2_list_from_str(self, url):
        """Fetch ``url`` and return the ``subject_collection_items`` list of its JSON body.

        Raises:
            requests.HTTPError: if the server answers with an error status.
            KeyError: if the JSON body has no ``subject_collection_items`` key.
        """
        # A timeout keeps a stalled connection from hanging the crawl forever.
        response = self.session.get(url, headers=self.headers, timeout=10)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        return json.loads(response.content.decode())['subject_collection_items']

    def save_as_file(self, content_list, file):
        """Append every item of ``content_list`` to ``file`` as one JSON line each."""
        with open(file, 'a', encoding='utf-8') as f:
            for content in content_list:
                # ensure_ascii=False keeps non-ASCII text readable in the file.
                f.write(json.dumps(content, ensure_ascii=False))
                f.write('\n')

    def run(self, total=500, file='hot.json'):
        """Crawl page by page until ``total`` items are covered or a page is empty.

        Args:
            total: upper bound on the number of items to request.
            file: path of the JSON-lines file the items are appended to.
        """
        # range() stops at the last offset below ``total``; the previous
        # ``num < total + 18`` condition requested pages past the cap.
        for start in range(0, total, self.page_size):
            url = self.url.format(start)
            print(url)
            items = self.parse_2_list_from_str(url)
            if not items:
                # An empty page means the collection is exhausted.
                break
            self.save_as_file(items, file)


if __name__ == '__main__':
    hot_spider = HotSpider()
    hot_spider.run()

使用 xpath 爬取正在热映的电影并保存为 json 文件

# -*- coding: utf-8 -*-
__author__ = 'Frank Li'
import requests
from lxml import etree
import json

# Page listing the movies currently playing in Changsha.
url = "https://movie.douban.com/cinema/nowplaying/changsha/"
headers = {"Referer": "https://movie.douban.com/cinema/nowplaying/changsha/",
           "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"}

# Common prefix of every field xpath used on the page.
MOVIE_LIST_XPATH = "//div[@class='mod-bd']/ul[@class='lists']"


def first_or_none(values):
    """Return the first element of ``values``, or ``None`` when it is empty."""
    return values[0] if values else None


def extract_movies(element):
    """Build one ``{'name', 'img', 'addr', 'score'}`` dict per movie in ``element``.

    Fields are read relative to each movie's title cell instead of zipping
    four page-wide lists: if any movie lacks a score, page-wide lists have
    different lengths and ``zip`` pairs scores with the wrong titles.
    A missing field is stored as ``None``.

    Args:
        element: parsed document exposing an lxml-style ``xpath`` method.
    """
    movies = []
    for title_cell in element.xpath(MOVIE_LIST_XPATH + "//li[@class='stitle']"):
        # NOTE(review): assumes the poster and srating cells are siblings of
        # the stitle cell under the same parent -- confirm against live markup.
        movies.append({
            'name': first_or_none(title_cell.xpath("a/@title")),
            'img': first_or_none(title_cell.xpath("../li[@class='poster']//img/@src")),
            'addr': first_or_none(title_cell.xpath("a/@href")),
            'score': first_or_none(title_cell.xpath(
                "../li[@class='srating']/span[@class='subject-rate']/text()")),
        })
    return movies


def save_movies(movies, path='movie.json'):
    """Print each movie and append it to ``path`` as an indented JSON object.

    The file is opened once for the whole batch rather than once per item.
    """
    with open(path, 'a', encoding='utf-8') as f:
        for item in movies:
            item_json = json.dumps(item, ensure_ascii=False, indent=2)
            print(item_json)
            f.write(item_json)
            f.write('\n')


def main():
    """Download the now-playing page, extract its movies and save them."""
    sess = requests.session()
    # Timeout and status check: fail fast instead of hanging or parsing an error page.
    response = sess.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    element = etree.HTML(response.content.decode())
    save_movies(extract_movies(element))


if __name__ == '__main__':
    main()

保存下来的 movie.json 文件

{
"name": "碟中谍6:全面瓦解",
"img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2529365085.jpg",
"addr": "https://movie.douban.com/subject/26336252/?from=playing_poster",
"score": "8.3"
}
{
"name": "阿尔法:狼伴归途",
"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2530871439.jpg",
"addr": "https://movie.douban.com/subject/26810318/?from=playing_poster",
"score": "6.5"
}
{
"name": "蚁人2:黄蜂女现身",
"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2529389608.jpg",
"addr": "https://movie.douban.com/subject/26636712/?from=playing_poster",
"score": "7.5"
}
{
"name": "传奇的诞生",
"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2531286907.jpg",
"addr": "https://movie.douban.com/subject/3073268/?from=playing_poster",
"score": "7.6"
}
{
"name": "快把我哥带走",
"img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2531080870.jpg",
"addr": "https://movie.douban.com/subject/30122633/?from=playing_poster",
"score": "7.0"
}
{
"name": "道高一丈",
"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2530863118.jpg",
"addr": "https://movie.douban.com/subject/26954268/?from=playing_poster",
"score": "5.7"
}
{
"name": "李宗伟:败者为王",
"img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2530870325.jpg",
"addr": "https://movie.douban.com/subject/27195119/?from=playing_poster",
"score": "7.1"
}
{
"name": "西虹市首富",
"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2529206747.jpg",
"addr": "https://movie.douban.com/subject/27605698/?from=playing_poster",
"score": "6.7"
}
{
"name": "一出好戏",
"img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2529571873.jpg",
"addr": "https://movie.douban.com/subject/26985127/?from=playing_poster",
"score": "7.3"
}
{
"name": "精灵旅社3:疯狂假期",
"img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2530591543.jpg",
"addr": "https://movie.douban.com/subject/26630714/?from=playing_poster",
"score": "6.9"
}
{
"name": "苏丹",
"img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2529570494.jpg",
"addr": "https://movie.douban.com/subject/26728641/?from=playing_poster",
"score": "7.0"
}
{
"name": "巨齿鲨",
"img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2530572643.jpg",
"addr": "https://movie.douban.com/subject/26426194/?from=playing_poster",
"score": "6.0"
}
{
"name": "藏北秘岭-重返无人区",
"img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2532522676.jpg",
"addr": "https://movie.douban.com/subject/30208007/?from=playing_poster",
"score": "6.2"
}
{
"name": "那些女人",
"img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2530146643.jpg",
"addr": "https://movie.douban.com/subject/26574965/?from=playing_poster",
"score": "5.3"
}
{
"name": "草戒指",
"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2531782507.jpg",
"addr": "https://movie.douban.com/subject/27204180/?from=playing_poster",
"score": "5.6"
}
{
"name": "吻隐者",
"img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2531980221.jpg",
"addr": "https://movie.douban.com/subject/26928809/?from=playing_poster",
"score": "7.6"
}
{
"name": "禹神传之寻找神力",
"img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2532781444.jpg",
"addr": "https://movie.douban.com/subject/30227727/?from=playing_poster",
"score": "6.6"
}
{
"name": "大师兄",
"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2528842218.jpg",
"addr": "https://movie.douban.com/subject/27201353/?from=playing_poster",
"score": "6.2"
}

简单多线程 图片下载

import requests
from bs4 import BeautifulSoup
import os
import threading


def _resolve_target(src, target=None, parent_dir='./img'):
    """Return the local path for ``src``: ``target`` if given, else the URL's last segment."""
    if target is None:
        target = src.split('/')[-1]
    return os.path.join(parent_dir, target)


def download_img(src, target=None):
    """Download the image at ``src`` into the ``./img`` directory.

    Args:
        src: URL of the image.
        target: optional file name to save under ``./img``. When omitted the
            last path segment of ``src`` is used. (Previously this argument
            was always overwritten and therefore ignored.)

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    parent_dir = './img'
    os.makedirs(parent_dir, exist_ok=True)
    target = _resolve_target(src, target, parent_dir)
    print(threading.current_thread(), ' start to download img: ', target)
    # The context manager releases the streamed connection even on failure.
    with requests.get(src, stream=True, timeout=10) as r:
        # Check the status before opening the file so an error page is never
        # written to disk as if it were an image.
        r.raise_for_status()
        with open(target, 'wb') as tar_file:
            for chunk in r.iter_content(chunk_size=8192):
                tar_file.write(chunk)
    print('saved {}'.format(target))


if __name__ == '__main__':
    URL = 'https://tieba.baidu.com/p/6034793219'
    html = requests.get(URL, timeout=10).text
    soup = BeautifulSoup(html, 'lxml')
    # Collect the source URL of every <img class="BDE_Image"> tag on the page.
    imgs = [tag['src'] for tag in soup.find_all('img', {'class': 'BDE_Image'})]

    # One thread per image; start them all, then wait for every download.
    threads = []
    for i, img in enumerate(imgs):
        t = threading.Thread(target=download_img, args=(img,), name='Thread-{}'.format(i))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()

python--爬取豆瓣热门国产电视剧保存为文件的更多相关文章

  1. requests库爬取豆瓣热门国产电视剧数据并保存到本地

    首先要做的就是去豆瓣网找对应的接口,这里就不赘述了,谷歌浏览器抓包即可,然后要做的就是分析返回的json数据的结构: https://movie.douban.com/j/search_subject ...

  2. 利用Python爬取豆瓣电影

    目标:使用Python爬取豆瓣电影并保存MongoDB数据库中 我们先来看一下通过浏览器的方式来筛选某些特定的电影: 我们把URL来复制出来分析分析: https://movie.douban.com ...

  3. Python爬取豆瓣指定书籍的短评

    Python爬取豆瓣指定书籍的短评 #!/usr/bin/python # coding=utf-8 import re import sys import time import random im ...

  4. Python爬取豆瓣《复仇者联盟3》评论并生成乖萌的格鲁特

    代码地址如下:http://www.demodashi.com/demo/13257.html 1. 需求说明 本项目基于Python爬虫,爬取豆瓣电影上关于复仇者联盟3的所有影评,并保存至本地文件. ...

  5. Python爬取豆瓣电影top

    Python爬取豆瓣电影top250 下面以四种方法去解析数据,前面三种以插件库来解析,第四种以正则表达式去解析. xpath pyquery beautifulsoup re 爬取信息:名称  评分 ...

  6. python爬取豆瓣首页热门栏目详细流程

    记录一下爬取豆瓣热门专栏的经过,通过这篇文章,你能学会requests,HTMLParser,json的基本使用,以及爬取网页内容的基本思路. 使用模块 1,获取豆瓣首页代码:首先我们需要访问豆瓣页面 ...

  7. python 爬取豆瓣的美剧

    pc版大概有500条记录,mobile大概是50部,只有热门的,所以少一点 url构造很简单,主要参数就是page_limit与page_start,每翻一页,start+=20即可,tag是&quo ...

  8. python爬取豆瓣电影信息数据

    题外话+ 大家好啊,最近自己在做一个属于自己的博客网站(准备辞职回家养老了,明年再战)在家里 琐事也很多, 加上自己 一回到家就懒了(主要是家里冷啊! 广东十几度,老家几度,躲在被窝瑟瑟发抖,) 由于 ...

  9. python 爬取豆瓣电影短评并wordcloud生成词云图

    最近学到数据可视化到了词云图,正好学到爬虫,各种爬网站 [实验名称] 爬取豆瓣电影<千与千寻>的评论并生成词云 1. 利用爬虫获得电影评论的文本数据 2. 处理文本数据生成词云图 第一步, ...

随机推荐

  1. sql里的正则表达式

    SQL语句还可以搭配正则表达式作为查询条件,很是有用. REGEXP_LIKE(匹配)REGEXP_INSTR (包含)REGEXP_REPLACE(替换)REGEXP_SUBSTR(提取) 表 1: ...

  2. Markdown 使用技巧

    懒得复制,直接贴网页吧 懒得复制,直接贴网页吧*2 懒得复制,直接贴网页吧*3

  3. BZOJ 2069: [POI2004]ZAW(Dijkstra + 二进制拆分)

    题意 给定一个有 \(N\) 个点 \(M\) 条边的无向图, 每条无向边 最多只能经过一次 . 对于边 \((u, v)\) , 从 \(u\) 到 \(v\) 的代价为 \(a\) , 从 \(v ...

  4. 【转】设置 vim 显示行号永久有效

    在linux环境下,vim是常用的代码查看和编辑工具.在程序编译出错时,一般会提示出错的行号,但是用vim打开的代码却不显示行号,错误语句的定位非常不便.那么怎样才能让vim显示代码的行号呢? 1 临 ...

  5. ubuntu “无法获得锁 /var/lib/dpkg/lock -open”

    在ubuntu系统终端下,用apt-get install 安装软件的时候,如果在未完成下载的情况下将终端中断,此时 apt-get进程可能没有结束.结果,如果再次运行apt-get install ...

  6. vue路由原理剖析

    单页面应用(SPA)的核心之一是: 更新视图而不重新请求页面, 实现这一点主要是两种方式: 1.Hash: 通过改变hash值 2.History: 利用history对象新特性(详情可出门左拐见:  ...

  7. 2018 ICPC 焦作网络赛 E.Jiu Yuan Wants to Eat

    题意:四个操作,区间加,区间每个数乘,区间的数变成 2^64-1-x,求区间和. 题解:2^64-1-x=(2^64-1)-x 因为模数为2^64,-x%2^64=-1*x%2^64 由负数取模的性质 ...

  8. 小白眼中的AI之~Numpy基础

      周末码一文,明天见矩阵- 其实Numpy之类的单讲特别没意思,但不稍微说下后面说实际应用又不行,所以大家就练练手吧 代码裤子: https://github.com/lotapp/BaseCode ...

  9. LinkedList(JDK1.8)源码分析

    双向循环链表 双向循环链表和双向链表的不同在于,第一个节点的pre指向最后一个节点,最后一个节点的next指向第一个节点,也形成一个"环".而LinkedList就是基于双向循环链 ...

  10. P1886 P2216 单调队列模板

    何为单调队列? 单调队列是一个队列(废话) 而且必须同时满足下标单调和值单调两个单调特性. 跟优先队列不同,优先队列直接使用堆(heap)来实现,如何删去特定下标元素?不明. 本人喜欢用单调队列存下标 ...