pressmuSpiderr

#!/usr/bin/env python

# encoding: utf-8

import requests

from random import choice

from lxml import html

from urllib.parse import urljoin,quote

import os

import time

# Tag name -> absolute URL of that tag's listing page.
NAMEURLDIC = {}

# Video title -> absolute URL of that video's detail page.
NAMEURLDIC_L2 = {}

# Search-engine crawler User-Agent strings; one is picked at random per run.
ualist = [
    "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
    "Mozilla/5.0 (compatible; Yahoo! Slurp/3.0; http://help.yahoo.com/help/us/ysearch/slurp)",
]

ua = choice(ualist)

# The HTTP header is spelled "User-Agent" (hyphen). With an underscore the
# value is sent as an unrelated custom header and the HTTP client's default
# User-Agent is still what the server sees.
header = {"User-Agent": ua}

# Site root, used to resolve relative links.
mailurl = "https://press.mu"

# Tag index page: the entry point of the crawl.
url = "https://press.mu/tag"

# Paged listing template: (URL-quoted keyword, page number).
searc_url = "https://press.mu/search/{}?p={}"

def getpage(url, timeout=30):
    """Fetch *url* with the shared request headers.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server (connect / between reads)
            before giving up, so one stalled connection cannot hang the
            whole crawl.

    Returns:
        The ``requests.Response`` with its text encoding set to the one
        detected from the body, or ``None`` when the request fails or the
        server answers with an HTTP error status.
    """
    try:
        req = requests.get(url=url, headers=header, stream=True, timeout=timeout)
        # Treat 4xx/5xx as failures instead of handing an error page to callers.
        req.raise_for_status()
        # Decode ``.text`` with the charset detected from the body.
        req.encoding = req.apparent_encoding
    except requests.RequestException as err:
        # Only network/HTTP failures are handled here; anything else
        # (KeyboardInterrupt, programming errors) must propagate.
        print(f"request failed for {url}: {err}")
        return None
    return req

def parse(url):
    """Download *url* and parse the body into an lxml HTML tree.

    Args:
        url: Address of the page to parse.

    Returns:
        The root element produced by ``html.fromstring``, or ``None`` when
        the page could not be fetched or its body is empty/blank.
    """
    resp = getpage(url)
    # getpage signals a failed request with None.
    if resp is None:
        return None

    source = resp.text
    # Nothing to parse; return None instead of leaving the result unbound.
    if not source.strip():
        return None

    return html.fromstring(source)

def buff(url):
    """Return the response for *url* exactly as ``getpage`` produces it.

    Args:
        url: Address of the resource to download.

    Returns:
        Whatever ``getpage(url)`` returns: the response object, or ``None``
        when the request failed.
    """
    return getpage(url)

def save_file(title, url, type="m3u8"):
    """Download *url* and store it as ``./pressimg/<title>.<type>``.

    Args:
        title: Base name of the output file. Surrounding whitespace is
            stripped and characters that are path separators or invalid in
            file names on common platforms are replaced with ``_``.
        url: Address of the resource to download.
        type: File extension without the dot. (The name shadows the builtin
            but is kept for backward compatibility with keyword callers.)

    Returns:
        None. When the download fails, a message is printed and no file is
        created.
    """
    # Fetch first so that a failed download does not leave an empty file.
    resp = buff(url)
    if resp is None:
        print(f"download failed, skipped: {url}")
        return

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("pressimg", exist_ok=True)

    # A title containing a separator would otherwise point outside pressimg/.
    unsafe = '\\/:*?"<>|'
    safe_title = title.strip().translate(str.maketrans(unsafe, "_" * len(unsafe)))

    with open(f"./pressimg/{safe_title}.{type}", "wb") as fs:
        fs.write(resp.content)

def _first(values, default=None):
    """Return the first XPath result, or *default* when nothing matched."""
    return values[0] if values else default


# Step 1: collect every tag (category) link from the tag index page.
root = parse(url)

taglist = root.xpath("//section[@id='tag']/ul/li/a") if root is not None else []

for tag in taglist:
    title = _first(tag.xpath("./text()"))
    tag_link = _first(tag.xpath("./@href"))
    if title is None or tag_link is None:
        continue
    href = urljoin(mailurl, tag_link)
    NAMEURLDIC.setdefault(title, href)

# Step 2: walk every category page by page and download each video + poster.
for k, v in NAMEURLDIC.items():
    # First page of the category.
    root = parse(v)
    if root is None:
        continue

    # Number of videos in this category.
    v_count = _first(root.xpath("//p[@id='hit']/strong/text()"), "未知")

    # Highest page number shown in the pager; fall back to a single page
    # when the pager is missing or its label is not a number.
    v_max_page_num = _first(
        root.xpath("//nav[@id='pager']/ul/li[last()-1]/a/text()"), "1"
    )
    try:
        page_total = int(v_max_page_num)
    except ValueError:
        page_total = 1

    print(f'当前分类为{k}:，视频件数为：{v_count}')

    for item in range(1, page_total + 1):
        print(f"获取第{item}页")

        # Page 1 is the tree already parsed above; later pages are fetched.
        if item > 1:
            # Use the current category name ``k``. The previous code used
            # ``title``, which still held the last tag of the loop above, so
            # every category paged through that one tag's results.
            root = parse(searc_url.format(quote(k.strip()), item))
            if root is None:
                continue

        level2list = root.xpath("//section[@class='items']//h2/a")

        for level2 in level2list:
            title_level2 = _first(level2.xpath("./text()"))
            link_level2 = _first(level2.xpath("./@href"))
            if title_level2 is None or link_level2 is None:
                continue

            href_level2 = urljoin(mailurl, link_level2)
            NAMEURLDIC_L2.setdefault(title_level2, href_level2)
            print(title_level2, href_level2)

            root2 = parse(href_level2)
            if root2 is None:
                continue

            video_src = _first(root2.xpath("//div[@id='player']//video/source/@src"))
            poster_src = _first(root2.xpath("//div[@id='player']//video/@poster"))

            if video_src is not None:
                # urljoin keeps absolute URLs and resolves relative or
                # protocol-relative ("//host/path") ones against the site.
                videourl = urljoin(mailurl, video_src)
                print("videourl", videourl)
                save_file(title_level2, videourl)

            if poster_src is not None:
                imgurl = urljoin(mailurl, poster_src)
                print("imgurl", imgurl)
                # Announce the download before it starts, not after.
                print("开始下载", f"{title_level2}.jpg")
                save_file(title_level2, imgurl, "jpg")

pressmuSpiderr的更多相关文章

Thymeleaf3.0内容
Thymeleaf简介：什么是Thymeleaf？Thymeleaf是用于网站或独立应用程序的新式服务端Java模板引擎,可以处理HTML,XML,JavaScript,CSS甚至纯文本模板. Thy ...

随机推荐

LeetCode 21 ——合并两个有序链表
1. 题目 2. 解答新建一个带有哨兵结点的链表,依次比较两个有序链表的结点值,将较小值的结点插入到新链表后面.直到其中一个比较完毕,将另一个链表剩余的结点全部放到新链表最后面即可.最后,可以删除哨 ...
java设计模式之装饰器模式以及在java中作用
在JAVA I/O类库里有很多不同的功能组合情况,这些不同的功能组合都是使用装饰器模式实现的,下面以FilterInputStream为例介绍装饰器模式的使用 FilterInputStream和F ...
HDU 1693 Eat the Trees（插头DP，入门题）
Problem Description Most of us know that in the game called DotA(Defense of the Ancient), Pudge is a ...
初学者学习python2还是python3？
如果你是一个初学者,或者你以前接触过其他的编程语言,你可能不知道,在开始学习python的时候都会遇到一个让人很头疼的问题:版本问题!!是学习python2 还是学习 python3 ?这是非常让 ...
C#中的Stack的Peek操作，曝出异常
我们在遍历一个栈的时候,有时候需要判断栈顶元素,用到了Peek元素,然后再用Pop元素,但是这个时候会出现一个逻辑错误, 当用Pop删除全部栈的元素时,再用Peek就会报错, InvalidOpera ...
C语言循环结构作业总结
循环作业总结 1.1 基本要求按时交 - 有分未交 - 0分迟交一周以上 - 倒扣本次作业分数抄袭 - 0分博客作业不规范,没有Markdown语法 - 扣分泛泛而谈(最多七分) 1.2 ...
web online ide &web online editor & web online playground & web online runtime
web online ide &web online editor web online ide &web online editor & web online playgro ...
React & shit Antd
React & shit Antd https://ant.design/components/tooltip-cn/ https://ant.design/components/tag-cn ...
CSS3基础选择器
/*选择器分组:多个选择器使用同一个样式*/ h1,h2,a{ color: blue; } strong{ color: aquamarine; } /*选择器继承:body中未设置样式的会使用继承 ...
jQuery - AJAX get()和post()方法
jQuery get()和post()方法用于通过HTTP GET或POST请求从服务器请求数据. HTTP请求:GET VS POST 两种在客户端和服务器端进行请求-响应的常用方法是:GET和PO ...

pressmuSpiderr

pressmuSpiderr的更多相关文章

随机推荐

热门专题