python--爬取豆瓣热门国产电视剧保存为文件

# -*- coding: utf-8 -*-

__author__ = 'Frank Li'

import requests

import json

class HotSpider(object):
    """Scrape Douban's hot domestic TV list (mobile API) into a JSON-lines file.

    Each page of the API is fetched in turn and every item of the
    ``subject_collection_items`` array is appended to the output file as one
    JSON document per line.
    """

    # Distance between consecutive ``start`` offsets; matches ``count=18``
    # in the URL template below.
    PAGE_SIZE = 18

    # Seconds to wait on the server before a request is abandoned, so a
    # stalled connection cannot hang the crawl forever.
    TIMEOUT = 10

    def __init__(self):
        # ``{}`` is replaced by the ``start`` offset of the page to fetch.
        self.url = "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_domestic_hot/items?os=android&for_mobile=1&start={}&count=18&loc_id=108288"

        # One session for the whole crawl so the connection is reused.
        self.session = requests.session()

        self.headers = {"Referer": "https://m.douban.com/tv/chinese",
                        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Mobile Safari/537.36"}

    def parse_2_list_from_str(self, url):
        """Fetch ``url`` and return its ``subject_collection_items`` list.

        Raises:
            requests.HTTPError: if the server answers with an error status.
            KeyError: if the JSON payload has no ``subject_collection_items``.
        """
        response = self.session.get(url, headers=self.headers, timeout=self.TIMEOUT)
        # Surface HTTP errors here instead of as an opaque JSON/KeyError later.
        response.raise_for_status()
        payload = json.loads(response.content.decode())
        return payload['subject_collection_items']

    def save_as_file(self, content_list, file):
        """Append every item of ``content_list`` to ``file``, one JSON object per line."""
        with open(file, 'a', encoding='utf-8') as f:
            for content in content_list:
                # ensure_ascii=False keeps Chinese text readable in the file.
                f.write(json.dumps(content, ensure_ascii=False))
                f.write('\n')

    def run(self, total=500, file='hot.json'):
        """Crawl page by page and append the results to ``file``.

        Args:
            total: upper bound on the ``start`` offset to request; one extra
                page of slack is allowed beyond it.
            file: path of the JSON-lines output file (opened in append mode).

        The crawl ends early as soon as a page comes back empty, so no
        requests are wasted once the listing is exhausted.
        """
        num = 0
        while num < total + self.PAGE_SIZE:
            url = self.url.format(num)
            print(url)
            content_list = self.parse_2_list_from_str(url)
            if not content_list:
                # An empty page means there is nothing left to fetch.
                break
            self.save_as_file(content_list, file)
            num += self.PAGE_SIZE

if __name__ == '__main__':
    # Script entry point: build a spider and crawl with its defaults.
    HotSpider().run()

使用 xpath 爬取正在热映的电影保存为 json 文件

# -*- coding: utf-8 -*-

__author__ = 'Frank Li'

import requests

from lxml import etree

import json

# Scrape the movies listed on Douban's "now playing" page for Changsha and
# append each one to movie.json as a pretty-printed JSON object.
url = "https://movie.douban.com/cinema/nowplaying/changsha/"

headers = {"Referer":"https://movie.douban.com/cinema/nowplaying/changsha/",
           "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"}

sess = requests.session()

# A timeout keeps a stalled connection from hanging the script, and
# raise_for_status stops an error page from being parsed as the movie listing.
response = sess.get(url, headers=headers, timeout=10)
response.raise_for_status()

html_str = response.content.decode()

element = etree.HTML(html_str)


def _first_match(node, path):
    """Return the first result of the relative XPath ``path`` on ``node``, or None."""
    matches = node.xpath(path)
    return matches[0] if matches else None


# Extract all fields relative to one movie node at a time.  Zipping four
# independently collected lists would silently pair a movie with its
# neighbour's data as soon as any movie lacks one field (e.g. no rating).
# NOTE(review): assumes each direct <li> child of ul.lists wraps exactly one
# movie - confirm against the live page markup.
movie_nodes = element.xpath("//div[@class='mod-bd']/ul[@class='lists']/li")

# Open the output once rather than once per movie.
with open('movie.json','a',encoding='utf-8') as f:

    for movie_node in movie_nodes:

        item = {
            'name': _first_match(movie_node, ".//li[@class='stitle']/a/@title"),
            'img': _first_match(movie_node, ".//li[@class='poster']//img/@src"),
            'addr': _first_match(movie_node, ".//li[@class='stitle']/a/@href"),
            'score': _first_match(movie_node, ".//li[@class='srating']/span[@class='subject-rate']/text()"),
        }

        # Only complete records are written; an entry missing any field is
        # skipped instead of being filled with another movie's values.
        if None in item.values():
            continue

        item_json = json.dumps(item, ensure_ascii=False, indent=2)

        print(item_json)

        f.write(item_json)

        f.write('\n')

保存下来的 movie.json 文件

{

  "name": "碟中谍6：全面瓦解",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2529365085.jpg",

  "addr": "https://movie.douban.com/subject/26336252/?from=playing_poster",

  "score": "8.3"

}

{

  "name": "阿尔法：狼伴归途",

  "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2530871439.jpg",

  "addr": "https://movie.douban.com/subject/26810318/?from=playing_poster",

  "score": "6.5"

}

{

  "name": "蚁人2：黄蜂女现身",

  "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2529389608.jpg",

  "addr": "https://movie.douban.com/subject/26636712/?from=playing_poster",

  "score": "7.5"

}

{

  "name": "传奇的诞生",

  "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2531286907.jpg",

  "addr": "https://movie.douban.com/subject/3073268/?from=playing_poster",

  "score": "7.6"

}

{

  "name": "快把我哥带走",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2531080870.jpg",

  "addr": "https://movie.douban.com/subject/30122633/?from=playing_poster",

  "score": "7.0"

}

{

  "name": "道高一丈",

  "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2530863118.jpg",

  "addr": "https://movie.douban.com/subject/26954268/?from=playing_poster",

  "score": "5.7"

}

{

  "name": "李宗伟：败者为王",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2530870325.jpg",

  "addr": "https://movie.douban.com/subject/27195119/?from=playing_poster",

  "score": "7.1"

}

{

  "name": "西虹市首富",

  "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2529206747.jpg",

  "addr": "https://movie.douban.com/subject/27605698/?from=playing_poster",

  "score": "6.7"

}

{

  "name": "一出好戏",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2529571873.jpg",

  "addr": "https://movie.douban.com/subject/26985127/?from=playing_poster",

  "score": "7.3"

}

{

  "name": "精灵旅社3：疯狂假期",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2530591543.jpg",

  "addr": "https://movie.douban.com/subject/26630714/?from=playing_poster",

  "score": "6.9"

}

{

  "name": "苏丹",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2529570494.jpg",

  "addr": "https://movie.douban.com/subject/26728641/?from=playing_poster",

  "score": "7.0"

}

{

  "name": "巨齿鲨",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2530572643.jpg",

  "addr": "https://movie.douban.com/subject/26426194/?from=playing_poster",

  "score": "6.0"

}

{

  "name": "藏北秘岭-重返无人区",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2532522676.jpg",

  "addr": "https://movie.douban.com/subject/30208007/?from=playing_poster",

  "score": "6.2"

}

{

  "name": "那些女人",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2530146643.jpg",

  "addr": "https://movie.douban.com/subject/26574965/?from=playing_poster",

  "score": "5.3"

}

{

  "name": "草戒指",

  "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2531782507.jpg",

  "addr": "https://movie.douban.com/subject/27204180/?from=playing_poster",

  "score": "5.6"

}

{

  "name": "吻隐者",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2531980221.jpg",

  "addr": "https://movie.douban.com/subject/26928809/?from=playing_poster",

  "score": "7.6"

}

{

  "name": "禹神传之寻找神力",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2532781444.jpg",

  "addr": "https://movie.douban.com/subject/30227727/?from=playing_poster",

  "score": "6.6"

}

{

  "name": "大师兄",

  "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2528842218.jpg",

  "addr": "https://movie.douban.com/subject/27201353/?from=playing_poster",

  "score": "6.2"

}

简单多线程图片下载

import requests

from bs4 import BeautifulSoup

import os

import threading

def download_img(src,target=None):
    """Download the image at ``src`` into the ``./img`` directory.

    Args:
        src: URL of the image to fetch.
        target: file name to save under ``./img``.  When omitted, the last
            path segment of ``src`` is used.

    Raises:
        requests.HTTPError: if the server answers with an error status, so an
            error page is never written to disk as if it were an image.
    """
    parent_dir = './img'

    os.makedirs(parent_dir,exist_ok=True)

    # Respect a caller-supplied name; fall back to the URL's last segment.
    if target is None:
        target = src.split('/')[-1]

    target = os.path.join(parent_dir,target)

    print(threading.current_thread(),' start to download img: ',target)

    # The context manager releases the streamed connection even on failure;
    # the timeout keeps one stalled download from blocking its thread forever.
    with requests.get(src,stream=True,timeout=10) as r:

        r.raise_for_status()

        with open(target,'wb') as tar_file:

            # Stream in moderately sized chunks so the whole image is never
            # held in memory at once.
            for chunk in r.iter_content(chunk_size=8192):

                tar_file.write(chunk)

    print('saved {}'.format(target))

if __name__ == '__main__':
    # Tieba thread whose post images are downloaded.
    URL = 'https://tieba.baidu.com/p/6034793219'

    soup = BeautifulSoup(requests.get(URL).text,'lxml')

    # Collect the source URL of every post image on the page.
    imgs = [tag['src'] for tag in soup.find_all('img',{'class':'BDE_Image'})]

    # One worker thread per image, named after its position in the list.
    threads = [
        threading.Thread(target=download_img,args=(img,),name='Thread-{}'.format(i))
        for i,img in enumerate(imgs)
    ]

    for worker in threads:
        worker.start()

    # Block until every download has finished.
    for worker in threads:
        worker.join()

python--爬取豆瓣热门国产电视剧保存为文件的更多相关文章

requests库爬取豆瓣热门国产电视剧数据并保存到本地
首先要做的就是去豆瓣网找对应的接口,这里就不赘述了,谷歌浏览器抓包即可,然后要做的就是分析返回的json数据的结构: https://movie.douban.com/j/search_subject ...
利用Python爬取豆瓣电影
目标:使用Python爬取豆瓣电影并保存MongoDB数据库中我们先来看一下通过浏览器的方式来筛选某些特定的电影: 我们把URL来复制出来分析分析: https://movie.douban.com ...
Python爬取豆瓣指定书籍的短评
Python爬取豆瓣指定书籍的短评 #!/usr/bin/python # coding=utf-8 import re import sys import time import random im ...
Python爬取豆瓣《复仇者联盟3》评论并生成乖萌的格鲁特
代码地址如下:http://www.demodashi.com/demo/13257.html 1. 需求说明本项目基于Python爬虫,爬取豆瓣电影上关于复仇者联盟3的所有影评,并保存至本地文件. ...
Python爬取豆瓣电影top
Python爬取豆瓣电影top250 下面以四种方法去解析数据,前面三种以插件库来解析,第四种以正则表达式去解析. xpath pyquery beautifulsoup re 爬取信息:名称评分 ...
python爬取豆瓣首页热门栏目详细流程
记录一下爬取豆瓣热门专栏的经过,通过这篇文章,你能学会requests,HTMLParser,json的基本使用,以及爬取网页内容的基本思路. 使用模块 1,获取豆瓣首页代码:首先我们需要访问豆瓣页面 ...
python 爬取豆瓣的美剧
pc版大概有500条记录,mobile大概是50部,只有热门的,所以少一点 url构造很简单,主要参数就是page_limit与page_start,每翻一页,start+=20即可,tag是" ...
python爬取豆瓣电影信息数据
题外话+ 大家好啊,最近自己在做一个属于自己的博客网站(准备辞职回家养老了,明年再战)在家里琐事也很多, 加上自己一回到家就懒了(主要是家里冷啊! 广东十几度,老家几度,躲在被窝瑟瑟发抖,) 由于 ...
python 爬取豆瓣电影短评并wordcloud生成词云图
最近学到数据可视化到了词云图,正好学到爬虫,各种爬网站 [实验名称] 爬取豆瓣电影<千与千寻>的评论并生成词云 1. 利用爬虫获得电影评论的文本数据 2. 处理文本数据生成词云图第一步, ...

随机推荐

Log Parser Studio 分析 IIS 日志
Log Parser Studio 分析 IIS 日志来源 https://www.cnblogs.com/lonelyxmas/p/8671336.html 软件下载地址: Log Parser ...
[luogu1486][bzoj1503][NOI2004]郁闷的出纳员【平衡树treap】
题目描述 OIER公司是一家大型专业化软件公司,有着数以万计的员工.作为一名出纳员,我的任务之一便是统计每位员工的工资.这本来是一份不错的工作,但是令人郁闷的是,我们的老板反复无常,经常调整员工的工资 ...
[poj1160][IOI2000]Post Office【动态规划】
传送门 https://vjudge.net/problem/POJ-1160#author=SCU2018 题目描述在一条水平的公路上建有n个小屋,两个小屋间的距离是它们的横坐标之差的绝对值.保证 ...
django rest framework mixins
aaarticlea/png;base64,iVBORw0KGgoAAAANSUhEUgAAAXQAAAEZCAIAAAAIa0mAAAAU/0lEQVR4nO2d247cxoGG5y3yKH6AAf
urls 管理
问题阐述:如何管理多个app下的路由分发,使得管理更加清晰? 1. 在app下创建urls.py文件 from django.conf.urls import url from django.urls ...
hdu3038How Many Answers Are Wrong(带权并查集）
题目链接:http://acm.hdu.edu.cn/showproblem.php?pid=3038 题解转载自:https://www.cnblogs.com/liyinggang/p/53270 ...
MVC之自定义过滤器(ActionFilterAttribute)
一.自定义Filter 自定义Filter需要继承ActionFilterAttribute抽象类,重写其中需要的方法,来看下ActionFilterAttribute类的方法签名. //表示所有操作 ...
关于Nginx负载均衡的5种策略
nginx的upstream目前支持的5种方式的分配 1.轮询(默认) 每个请求按时间顺序逐一分配到不同的后端服务器,如果后端服务器down掉,能自动剔除. upstream backserver { ...
斯坦福大学公开课机器学习： advice for applying machine learning | deciding what to try next(revisited)（针对高偏差、高方差问题的解决方法以及隐藏层数的选择）
针对高偏差.高方差问题的解决方法: 1.解决高方差问题的方案:增大训练样本量.缩小特征量.增大lambda值 2.解决高偏差问题的方案:增大特征量.增加多项式特征(比如x1*x2,x1的平方等等).减 ...
java操作redis集群配置[可配置密码]和工具类（比较好用）
转: java操作redis集群配置[可配置密码]和工具类 java操作redis集群配置[可配置密码]和工具类 <dependency> <groupId>red ...

python--爬取豆瓣热门国产电视剧保存为文件

使用 xpath 爬取正在热映的电影保存为 json 文件

保存下来的 movie.json 文件

简单多线程图片下载

python--爬取豆瓣热门国产电视剧保存为文件的更多相关文章

随机推荐

热门专题

使用 xpath 爬取正在热映的电影保存为 json 文件

简单多线程图片下载