python--爬取豆瓣热门国产电视剧保存为文件

# -*- coding: utf-8 -*-

__author__ = 'Frank Li'

import requests

import json

class HotSpider(object):
    """Crawl Douban's "hot domestic TV" mobile collection into a JSON-lines file.

    Pages are requested from the rexxar API one after another; every item of
    every page is appended to the output file as one JSON document per line.
    """

    # Number of items per request. Must stay in sync with the ``count=18``
    # query parameter hard-coded in ``self.url``.
    page_size = 18

    # Seconds to wait for the server before giving up on a request, so a
    # stalled connection cannot hang the crawl forever.
    timeout = 10

    def __init__(self):
        # ``{}`` is filled with the offset of the first item of the page.
        self.url = "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_domestic_hot/items?os=android&for_mobile=1&start={}&count=18&loc_id=108288"
        self.session = requests.session()
        self.headers = {"Referer": "https://m.douban.com/tv/chinese",
                        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Mobile Safari/537.36"}

    def parse_2_list_from_str(self, url):
        """Fetch *url* and return the list stored under 'subject_collection_items'.

        Raises:
            requests.HTTPError: if the server answers with an error status.
            KeyError: if the JSON payload has no 'subject_collection_items' key.
        """
        response = self.session.get(url, headers=self.headers, timeout=self.timeout)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        return json.loads(response.content.decode())['subject_collection_items']

    def save_as_file(self, content_list, file):
        """Append each item of *content_list* to *file*, one JSON document per line.

        Non-ASCII characters are written as-is (UTF-8), not escaped.
        """
        with open(file, 'a', encoding='utf-8') as f:
            f.writelines(
                json.dumps(content, ensure_ascii=False) + '\n'
                for content in content_list
            )

    def run(self, total=500, file='hot.json'):
        """Crawl page after page and append the items to *file*.

        Args:
            total: upper bound on the number of items to request; crawling
                stops once the page offset reaches it.
            file: path of the output file (opened in append mode).

        The crawl also stops as soon as a page comes back empty, so no
        requests are wasted once the collection is exhausted.
        """
        start = 0
        while start < total:
            url = self.url.format(start)
            print(url)
            items = self.parse_2_list_from_str(url)
            if not items:
                # The collection has fewer items than ``total``: nothing
                # further to fetch.
                break
            self.save_as_file(items, file)
            start += self.page_size

if __name__ == '__main__':
    # Script entry point: build the spider and crawl with its default settings.
    HotSpider().run()

使用 xpath 爬取正在热映的电影保存为 json 文件

# -*- coding: utf-8 -*-

__author__ = 'Frank Li'

import requests

from lxml import etree

import json

# Scrape the "now playing" page for Changsha and append one pretty-printed
# JSON object per movie (name, poster image, detail URL, score) to movie.json.
url = "https://movie.douban.com/cinema/nowplaying/changsha/"
headers = {"Referer":"https://movie.douban.com/cinema/nowplaying/changsha/",
           "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"}

sess = requests.session()
# A timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as movie data.
response = sess.get(url, headers=headers, timeout=10)
response.raise_for_status()
html_str = response.content.decode()
element = etree.HTML(html_str)

# One title link per movie. Every other field is looked up relative to this
# link rather than collected into separate page-wide lists: zipping
# independent lists pairs scores with the wrong movies as soon as a single
# movie has no 'subject-rate' span.
movie_link_list = element.xpath("//div[@class='mod-bd']/ul[@class='lists']//li[@class='stitle']/a[@title]")


def _first(values):
    """Return the first XPath result, or None when nothing matched."""
    return values[0] if values else None


# Open the output once instead of re-opening it for every movie.
with open('movie.json', 'a', encoding='utf-8') as f:
    for link in movie_link_list:
        # NOTE(review): assumes the 'poster', 'stitle' and 'srating' <li>
        # elements of one movie are siblings under the same parent —
        # confirm against the live page markup.
        item = {
            'name': link.get('title'),
            'img': _first(link.xpath("../../li[@class='poster']//img/@src")),
            'addr': link.get('href'),
            # None (JSON null) when the movie has no rating.
            'score': _first(link.xpath("../../li[@class='srating']/span[@class='subject-rate']/text()")),
        }
        item_json = json.dumps(item, ensure_ascii=False, indent=2)
        print(item_json)
        f.write(item_json)
        f.write('\n')

保存下来的 movie.json 文件

{

  "name": "碟中谍6：全面瓦解",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2529365085.jpg",

  "addr": "https://movie.douban.com/subject/26336252/?from=playing_poster",

  "score": "8.3"

}

{

  "name": "阿尔法：狼伴归途",

  "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2530871439.jpg",

  "addr": "https://movie.douban.com/subject/26810318/?from=playing_poster",

  "score": "6.5"

}

{

  "name": "蚁人2：黄蜂女现身",

  "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2529389608.jpg",

  "addr": "https://movie.douban.com/subject/26636712/?from=playing_poster",

  "score": "7.5"

}

{

  "name": "传奇的诞生",

  "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2531286907.jpg",

  "addr": "https://movie.douban.com/subject/3073268/?from=playing_poster",

  "score": "7.6"

}

{

  "name": "快把我哥带走",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2531080870.jpg",

  "addr": "https://movie.douban.com/subject/30122633/?from=playing_poster",

  "score": "7.0"

}

{

  "name": "道高一丈",

  "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2530863118.jpg",

  "addr": "https://movie.douban.com/subject/26954268/?from=playing_poster",

  "score": "5.7"

}

{

  "name": "李宗伟：败者为王",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2530870325.jpg",

  "addr": "https://movie.douban.com/subject/27195119/?from=playing_poster",

  "score": "7.1"

}

{

  "name": "西虹市首富",

  "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2529206747.jpg",

  "addr": "https://movie.douban.com/subject/27605698/?from=playing_poster",

  "score": "6.7"

}

{

  "name": "一出好戏",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2529571873.jpg",

  "addr": "https://movie.douban.com/subject/26985127/?from=playing_poster",

  "score": "7.3"

}

{

  "name": "精灵旅社3：疯狂假期",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2530591543.jpg",

  "addr": "https://movie.douban.com/subject/26630714/?from=playing_poster",

  "score": "6.9"

}

{

  "name": "苏丹",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2529570494.jpg",

  "addr": "https://movie.douban.com/subject/26728641/?from=playing_poster",

  "score": "7.0"

}

{

  "name": "巨齿鲨",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2530572643.jpg",

  "addr": "https://movie.douban.com/subject/26426194/?from=playing_poster",

  "score": "6.0"

}

{

  "name": "藏北秘岭-重返无人区",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2532522676.jpg",

  "addr": "https://movie.douban.com/subject/30208007/?from=playing_poster",

  "score": "6.2"

}

{

  "name": "那些女人",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2530146643.jpg",

  "addr": "https://movie.douban.com/subject/26574965/?from=playing_poster",

  "score": "5.3"

}

{

  "name": "草戒指",

  "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2531782507.jpg",

  "addr": "https://movie.douban.com/subject/27204180/?from=playing_poster",

  "score": "5.6"

}

{

  "name": "吻隐者",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2531980221.jpg",

  "addr": "https://movie.douban.com/subject/26928809/?from=playing_poster",

  "score": "7.6"

}

{

  "name": "禹神传之寻找神力",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2532781444.jpg",

  "addr": "https://movie.douban.com/subject/30227727/?from=playing_poster",

  "score": "6.6"

}

{

  "name": "大师兄",

  "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2528842218.jpg",

  "addr": "https://movie.douban.com/subject/27201353/?from=playing_poster",

  "score": "6.2"

}

简单多线程图片下载

import requests

from bs4 import BeautifulSoup

import os

import threading

def download_img(src, target=None):
    """Download the image at *src* into the ``./img`` directory.

    Args:
        src: URL of the image.
        target: file name to save the image under. When omitted, the last
            path segment of *src* is used.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    parent_dir = './img'
    os.makedirs(parent_dir, exist_ok=True)

    # Honour an explicit target; only fall back to the URL's last segment
    # when the caller did not supply one.
    if target is None:
        target = src.split('/')[-1]
    target = os.path.join(parent_dir, target)

    print(threading.current_thread(), ' start to download img: ', target)

    # The context manager releases the streamed connection even if writing
    # fails; the timeout prevents a stalled server from blocking the thread.
    with requests.get(src, stream=True, timeout=10) as r:
        # Do not save an HTTP error page as if it were an image.
        r.raise_for_status()
        with open(target, 'wb') as tar_file:
            for chunk in r.iter_content(chunk_size=8192):
                tar_file.write(chunk)

    print('saved {}'.format(target))

if __name__ == '__main__':
    # Fetch the forum thread, collect the URL of every embedded post image
    # and download them concurrently, one thread per image.
    URL = 'https://tieba.baidu.com/p/6034793219'
    page = BeautifulSoup(requests.get(URL).text, 'lxml')

    imgs = [tag['src'] for tag in page.find_all('img', {'class': 'BDE_Image'})]

    threads = [
        threading.Thread(target=download_img, args=(img,), name='Thread-{}'.format(idx))
        for idx, img in enumerate(imgs)
    ]
    for worker in threads:
        worker.start()

    # Wait for every download to finish before the script exits.
    for worker in threads:
        worker.join()

python--爬取豆瓣热门国产电视剧保存为文件的更多相关文章

requests库爬取豆瓣热门国产电视剧数据并保存到本地
首先要做的就是去豆瓣网找对应的接口,这里就不赘述了,谷歌浏览器抓包即可,然后要做的就是分析返回的json数据的结构: https://movie.douban.com/j/search_subject ...
利用Python爬取豆瓣电影
目标:使用Python爬取豆瓣电影并保存MongoDB数据库中我们先来看一下通过浏览器的方式来筛选某些特定的电影: 我们把URL来复制出来分析分析: https://movie.douban.com ...
Python爬取豆瓣指定书籍的短评
Python爬取豆瓣指定书籍的短评 #!/usr/bin/python # coding=utf-8 import re import sys import time import random im ...
Python爬取豆瓣《复仇者联盟3》评论并生成乖萌的格鲁特
代码地址如下:http://www.demodashi.com/demo/13257.html 1. 需求说明本项目基于Python爬虫,爬取豆瓣电影上关于复仇者联盟3的所有影评,并保存至本地文件. ...
Python爬取豆瓣电影top
Python爬取豆瓣电影top250 下面以四种方法去解析数据,前面三种以插件库来解析,第四种以正则表达式去解析. xpath pyquery beautifulsoup re 爬取信息:名称评分 ...
python爬取豆瓣首页热门栏目详细流程
记录一下爬取豆瓣热门专栏的经过,通过这篇文章,你能学会requests,HTMLParser,json的基本使用,以及爬取网页内容的基本思路. 使用模块 1,获取豆瓣首页代码:首先我们需要访问豆瓣页面 ...
python 爬取豆瓣的美剧
pc版大概有500条记录,mobile大概是50部,只有热门的,所以少一点 url构造很简单,主要参数就是page_limit与page_start,每翻一页,start+=20即可,tag是&quo ...
python爬取豆瓣电影信息数据
题外话+ 大家好啊,最近自己在做一个属于自己的博客网站(准备辞职回家养老了,明年再战)在家里琐事也很多, 加上自己一回到家就懒了(主要是家里冷啊! 广东十几度,老家几度,躲在被窝瑟瑟发抖,) 由于 ...
python 爬取豆瓣电影短评并wordcloud生成词云图
最近学到数据可视化到了词云图,正好学到爬虫,各种爬网站 [实验名称] 爬取豆瓣电影<千与千寻>的评论并生成词云 1. 利用爬虫获得电影评论的文本数据 2. 处理文本数据生成词云图第一步, ...

随机推荐

「SPOJ6340」「BZOJ1939」ZUMA - ZUMA【记忆化搜索】
题目链接 [洛谷传送门] 题解 \(f[i][j][k]\)表示在消除了\((i,j)\),在后面加上了\(k\)个珠子的总的珠子数. 考虑三种决策:(题目给出的\(k\)在下文表示成\(K\)) 决 ...
poj 3186 Treats for the Cows(dp)
Description FJ has purchased N (1 <= N <= 2000) yummy treats for the cows who get money for gi ...
初入react-redux （基于webpack babel的react应用框架）
react这么热门的框架也不介绍了,redux是一个单向数据流的小框架,当然不只配合react,它起初是为react而配的,现在面向所有了,比如ng-redux的项目.redux做为react的标准搭 ...
数组和list互转
数组转list 方法1: String[] stringArray = { "a", "b", "c", "d", &q ...
[SDOI2011]计算器（exgcd&BSGS）
k=1:裸的快速幂k=2:xy=z+kp,直接exgcd,这个可以不用解释了,不懂的同学可以看代码 k=3:裸的BSGS 重点是k=3(BSGS学习)ax=b(mod p)求解这个同余方程只能求gcd ...
为什么管理工具里没有Internet(IIS)管理器选项
如上图,localhost页能打开了,但是管理工具里没有iis管理器,主要原因是安装iis时候没有选择web管理工具,选取安装上就有了
POJ 2299树状数组求逆序对
求逆序对最常用的方法就是树状数组了,确实,树状数组是非常优秀的一种算法.在做POJ2299时,接触到了这个算法,理解起来还是有一定难度的,那么下面我就总结一下思路: 首先:因为题目中a[i]可以到99 ...
POJ 1979 Heavy Transportation （kruskal）
Heavy Transportation Time Limit: 3000MS Memory Limit: 30000K Total Submissions:46898 Accepted: 1 ...
glob.glob 匹配文件
glob.glob以列表形式返回匹配的文件路径只有一个参数:文件的匹配规则 e.g. >>>res_home = '/vip_data_center/test_envs/train ...
jquery 前端跨域 jsonp
1.jsonp 支持get:可以跨域: 2.java 在controller中增加header: /** * 获取用户信息 */ @ResponseBody @RequestMapping(value ...

python--爬取豆瓣热门国产电视剧保存为文件

使用 xpath 爬取正在热映的电影保存为 json 文件

保存下来的 movie.json 文件

简单多线程图片下载

python--爬取豆瓣热门国产电视剧保存为文件的更多相关文章

随机推荐

热门专题

使用 xpath 爬取正在热映的电影保存为 json 文件

简单多线程图片下载