pressmuSpiderr

#!/usr/bin/env python

# encoding: utf-8

import requests

from random import choice

from lxml import html

from urllib.parse import urljoin,quote

import os

import time

# Tag name -> tag listing URL; filled while scanning the tag index page.
NAMEURLDIC = {}

# Video title -> video detail page URL; filled while walking each tag listing.
NAMEURLDIC_L2 = {}

# Search-engine crawler user-agent strings; one is picked at random per run.
ualist = [
    "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
    "Mozilla/5.0 (compatible; Yahoo! Slurp/3.0; http://help.yahoo.com/help/us/ysearch/slurp)",
]

ua = choice(ualist)

# The HTTP header is spelled "User-Agent" (hyphen). With an underscore the
# value is sent as an unrelated header and the library's default user agent
# is still used.
header = {"User-Agent": ua}

# Site root, used as the base when resolving relative links.
mailurl = "https://press.mu"

# Tag index page that lists every category.
url = "https://press.mu/tag"

# Paged search URL template: first slot is the quoted keyword, second the page.
searc_url = "https://press.mu/search/{}?p={}"

def getpage(url):
    """Fetch *url* with the shared request headers.

    Args:
        url: Absolute URL to request.

    Returns:
        The ``requests.Response`` with ``encoding`` set to the encoding
        detected from the body, or ``None`` when the request failed.
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        req = requests.get(url=url, headers=header, stream=True, timeout=30)
        # Reading apparent_encoding pulls the body, so it can fail as well.
        req.encoding = req.apparent_encoding
    except requests.RequestException as exc:
        # Best effort: report the failure and let the caller decide.
        print(f"request failed for {url}: {exc}")
        return None
    return req

def parse(url):
    """Download *url* and parse the body into an lxml HTML tree.

    Args:
        url: Absolute URL of the page to parse.

    Returns:
        The root element produced by ``html.fromstring``, or ``None`` when
        the page could not be fetched or its body is empty.
    """
    resp = getpage(url)
    if resp is None:
        return None

    source = resp.text
    # An empty or whitespace-only body has nothing to parse.
    if not source or not source.strip():
        return None

    return html.fromstring(source)

def buff(url):
    """Fetch *url* and return the result of ``getpage`` unchanged.

    Args:
        url: Absolute URL of the resource to download.

    Returns:
        Whatever ``getpage(url)`` returns.
    """
    return getpage(url)

def save_file(title, url, type="m3u8"):
    """Download *url* and store it as ``./pressimg/<title>.<type>``.

    Args:
        title: Base name of the output file (without extension).
        url: Absolute URL of the resource to download.
        type: File extension to use; defaults to ``"m3u8"``.

    Returns:
        None. Nothing is written when the download fails.
    """
    # Download first so a failed request does not leave an empty file behind.
    resp = buff(url)
    if resp is None:
        print(f"download failed, skipping {title}.{type}")
        return

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("pressimg", exist_ok=True)

    # Path separators in a title would point outside the target directory
    # or at a directory that does not exist.
    safe_title = title.replace("/", "_").replace("\\", "_")

    with open(f'./pressimg/{safe_title}.{type}', "wb") as fs:
        fs.write(resp.content)

# Step 1: collect every tag name and its listing URL from the tag index page.
root = parse(url)

taglist = root.xpath("//section[@id='tag']/ul/li/a") if root is not None else []

for tag in taglist:
    tag_texts = tag.xpath("./text()")
    tag_hrefs = tag.xpath("./@href")
    # Skip anchors that have no text or no link instead of crashing on [0].
    if not tag_texts or not tag_hrefs:
        continue
    title = tag_texts[0]
    href = urljoin(mailurl, tag_hrefs[0])
    NAMEURLDIC.setdefault(title, href)

# Step 2: walk every page of every tag and download each video and poster.
for k, v in NAMEURLDIC.items():
    # First page of the tag listing.
    root = parse(v)
    if root is None:
        continue

    # Number of videos shown for this tag (fallback "0" when the node is absent).
    hit_texts = root.xpath("//p[@id='hit']/strong/text()")
    v_count = hit_texts[0] if hit_texts else "0"

    # Last page number, read from the second-to-last pager entry. A listing
    # without a usable pager is treated as a single page.
    pager_texts = root.xpath("//nav[@id='pager']/ul/li[last()-1]/a/text()")
    v_max_page_num = pager_texts[0] if pager_texts else "1"
    try:
        page_total = int(v_max_page_num)
    except ValueError:
        page_total = 1

    print(f'当前分类为{k}:，视频件数为：{v_count}')

    for item in range(1, page_total + 1):
        print(f"获取第{item}页")

        # Page 1 is already loaded; later pages come from the search URL.
        if item != 1:
            # Use the current tag name `k`; `title` still holds the last tag
            # seen by the collection loop above.
            root = parse(searc_url.format(quote(k.strip()), item))
            if root is None:
                continue

        level2list = root.xpath("//section[@class='items']//h2/a")

        for level2 in level2list:
            item_texts = level2.xpath("./text()")
            item_hrefs = level2.xpath("./@href")
            if not item_texts or not item_hrefs:
                continue

            title_level2 = item_texts[0]
            href_level2 = urljoin(mailurl, item_hrefs[0])
            NAMEURLDIC_L2.setdefault(title_level2, href_level2)
            print(title_level2, href_level2)

            root2 = parse(href_level2)
            if root2 is None:
                continue

            video_srcs = root2.xpath("//div[@id='player']//video/source/@src")
            poster_srcs = root2.xpath("//div[@id='player']//video/@poster")
            # Detail pages without a player have nothing to download.
            if not video_srcs or not poster_srcs:
                continue

            videourl = video_srcs[0]
            # urljoin resolves protocol-relative ("//host/x.jpg") posters to
            # https and leaves already-absolute URLs untouched.
            imgurl = urljoin(mailurl, poster_srcs[0])

            print("videourl", videourl)
            print("imgurl", imgurl)

            save_file(title_level2, videourl)
            save_file(title_level2, imgurl, "jpg")
            print("开始下载", f"{title_level2}.jpg")

pressmuSpiderr的更多相关文章

Thymeleaf3.0内容
Thymeleaf简介什么是Thymeleaf Thymeleaf是网站或者独立应用程序的新式的服务端java模板引擎,可以执行HTML,XML,JavaScript,CSS甚至纯文本模板. Thy ...

随机推荐

Django源码分析之执行入口
魔法门一般我们启动django,最简单的方法是进入project 目录,这时目录结构是这样的然后我们执行python manage.py runserver,程序就开始执行了. 那django是如 ...
Pro Git - 笔记2
Git Basics Getting a Git Repository Initializing a Repository in an Existing Directory For Linux: $ ...
vue-component=>v-on
$emit 返回 shouldPropagate,shouldPropagate 是一个布尔值,取决于父链上的是否存在该事件的监听器以及,事件处理程序返回的值.他决定 $dispatch 是否停止冒泡 ...
关于C标准
关于C标准 1. 前言本文从英文 C-FAQ (2004 年 7 月 3 日修订版) 翻译而来.本文的中文版权为朱群英和孙云所有. 本文的内容可以自由用于个人目的,但是不可以未经许可出版发行. ...
NIO--2-代码
package com.study.nio; import java.io.IOException; import java.net.InetSocketAddress; import java.ni ...
oracle 引用类型声明
洛谷 P1415 拆分数列解题报告
拆分数列题目背景 [为了响应党中央勤节俭.反铺张的精神,题目背景描述故事部分略去^-^] 题目描述给出一列数字,需要你添加任意多个逗号将其拆成若干个严格递增的数. 如果有多组解,则输出使得最后一个 ...
android OTA升级包制作
0.签名 java -Xmx2048m -jar out/host/linux-x86/framework/signapk.jar -w build/target/product/security/t ...
The xor-longest Path [Trie]
The xor-longest Path 题目描述给定一棵$n≤100 000$个点的带权树,求树上最长的异或和路径. 输入多组数据.每组数据第一行一个整数n($1≤n≤100 000$,接下 ...
POJ3159:Candies（差分约束）
Candies Time Limit: 1500MS Memory Limit: 131072K Total Submissions: 39666 Accepted: 11168 题目链接:h ...

pressmuSpiderr

pressmuSpiderr的更多相关文章

随机推荐

热门专题