python 电影下载链接爬虫

V1.0

功能：从几个比较知名的电影下载网站爬取下载链接，并自动打印出来。

代码：

# -*- coding: utf8 -*-

from bs4 import BeautifulSoup

import requests, lxml

from urllib.parse import quote

import re

def get_name():
    """Prompt for movie titles in a loop and search every supported site.

    Each title is handed to get_url_from_ygdy(), get_url_from_bttiantang()
    and get_url_from_dytt(), in that order. The first and last receive the
    title percent-encoded from its GBK bytes; bttiantang receives the raw
    title.

    Blank input is ignored. The loop ends when input is closed (EOF) or
    interrupted with Ctrl-C at the prompt.
    """
    while True:
        try:
            moviename = input('请输入要查找的电影名\n->').strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C: leave the loop instead of dying with a
            # traceback (the loop has no other exit).
            print()
            return

        if not moviename:
            # Nothing to search for; ask again.
            continue

        # GBK extends GB2312, so more titles can be encoded; characters that
        # still have no GBK form are dropped rather than raising
        # UnicodeEncodeError and killing the whole session.
        moviename_quote = quote(moviename.encode('gbk', 'ignore'))

        get_url_from_ygdy(moviename_quote)
        get_url_from_bttiantang(moviename)
        get_url_from_dytt(moviename_quote)

def get_url_from_ygdy(moviename):
    """Search s.dydytt.net (阳光电影) and print download links for each hit.

    Args:
        moviename: Search keyword, already percent-encoded (get_name() passes
            the URL-quoted GB2312/GBK bytes of the title). It is appended to
            the search URL unchanged.

    Prints a "not found" message when the result page has no
    ``<td width="55%">`` cells. Otherwise every result cell is passed to
    get_info(): page by page when a page count can be read from the
    pagination link, or just the cells of the first response when it cannot.
    """
    from urllib.parse import unquote

    baseurl = 'http://s.dydytt.net/plus/search.php?kwtype=0&keyword='
    url = baseurl + str(moviename)

    # Decode as GBK (a superset of GB2312) so characters outside GB2312 are
    # not silently dropped; get_info() already decodes detail pages as GBK.
    html = requests.get(url, timeout=10).content.decode('gbk', 'ignore')
    content = BeautifulSoup(html, 'lxml')

    # NOTE(review): these cells are presumably the pagination bar; the second
    # one is expected to hold the "last page" link -- confirm against the site.
    first_page = content.find_all('td', width="")
    movie_infos = content.find_all('td', width="55%")

    if not movie_infos:
        print('查无此电影，请检查后重试')
        return

    # get_info() compares the keyword with result titles, so give it the
    # readable title instead of the percent-encoded form used in the URL
    # (the encoded form can never contain '游戏').
    plain_name = unquote(str(moviename), encoding='gbk', errors='replace')

    print('阳光电影搜索结果:')

    # Read the page count from the pagination link, tolerating a missing
    # cell, a missing <a>, a missing href or a non-numeric PageNo value.
    page_count = 0
    if len(first_page) > 1:
        anchor = first_page[1].find('a')
        href = anchor.get('href') if anchor is not None else None
        match = re.search(r'PageNo=(\d+)', href or '')
        if match:
            page_count = int(match.group(1))

    if page_count < 1:
        # No usable pagination: the first response is all there is.
        for movie_info in movie_infos:
            get_info(movie_info, plain_name)
        return

    for page_no in range(1, page_count + 1):
        print('第', page_no, '页:')
        page_url = url + '&PageNo=' + str(page_no)
        page_html = requests.get(page_url, timeout=10).content.decode('gbk', 'ignore')
        pagecontent = BeautifulSoup(page_html, 'lxml')
        for movie_info in pagecontent.find_all('td', width='55%'):
            get_info(movie_info, plain_name)

def get_info(movie_info, name):
    """Print the title and download links of one 阳光电影 search result.

    Args:
        movie_info: BeautifulSoup tag of a result cell; its text is used as
            the title and the href of its first ``<a>`` as the detail-page
            path on www.ygdy8.com.
        name: Keyword the user searched for.

    Results whose title contains '游戏' are skipped unless the keyword itself
    contains '游戏'. Cells without a usable link are skipped silently.
    """
    anchor = movie_info.find('a')
    movie_url = anchor.get('href') if anchor is not None else None
    if not movie_url:
        # A cell without a link cannot lead to a detail page.
        return

    moviename = movie_info.text
    if '游戏' not in name and '游戏' in moviename:
        return

    print('电影名:', moviename)

    url = 'http://www.ygdy8.com' + movie_url
    html = requests.get(url, timeout=10).content.decode('gbk', 'ignore')
    info = BeautifulSoup(html, 'lxml')
    download = info.find_all('td', style="WORD-WRAP: break-word")

    # Keep only cells that actually contain a link. get_text() is used
    # because .string is None whenever the anchor has nested markup.
    links = []
    for cell in download:
        link = cell.find('a')
        if link is not None:
            links.append(link.get_text())

    print('下载链接:')
    if len(links) == 1:
        # A single link is printed bare, without numbering.
        print(links[0])
    else:
        for number, link in enumerate(links, 1):
            print('链接', number, ':', link)

    print('\n')

def get_url_from_bttiantang(moviename):
    """Search www.bttiantang.com (BT天堂) and print links for each hit.

    Args:
        moviename: Raw (not percent-encoded) title to search for.

    Prints a "not found" message when the result page has no
    ``<p class="tt cl">`` entries. Otherwise every entry is passed to
    get_movieinfo(): page by page when a page count is found in the markup,
    or just the entries of the first response when it is not.
    """
    # Percent-encode the keyword so characters such as '&', '#' or '+' in a
    # title cannot corrupt the query string.
    baseurl = 'http://www.bttiantang.com/s.php?q=' + quote(str(moviename))

    page_html = requests.get(baseurl, timeout=10).content.decode('utf8', 'ignore')

    pattern = re.compile('</b>条<b>(.*?)</b>')
    pagenum_info = re.findall(pattern, page_html)

    content = BeautifulSoup(page_html, 'lxml').find_all('p', class_="tt cl")

    if not content:
        print('查无此电影，请检查后重试')
        return

    print('BT天堂搜索结果:')

    # The captured text is expected to be the page count; fall back to the
    # first response if it is missing or not a number.
    page_count = 0
    if pagenum_info:
        try:
            page_count = int(pagenum_info[0])
        except ValueError:
            page_count = 0

    if page_count < 1:
        for each in content:
            get_movieinfo(each, moviename)
        return

    for page_no in range(1, page_count + 1):
        print('第', page_no, '页:')
        page_url = baseurl + '&PageNo=' + str(page_no)
        html = requests.get(page_url, timeout=10).content.decode('utf8', 'ignore')
        for each in BeautifulSoup(html, 'lxml').find_all('p', class_="tt cl"):
            get_movieinfo(each, moviename)

def get_movieinfo(movie_content, name):
    """Print the title and download links of one BT天堂 search result.

    Args:
        movie_content: BeautifulSoup tag of a result entry; its text is used
            as the title and the href of its first ``<a>`` as the detail page.
        name: Keyword the user searched for.

    Results whose title contains '游戏' are skipped unless the keyword itself
    contains '游戏'. Entries without a usable link are skipped silently.
    """
    from urllib.parse import urljoin

    site = 'http://www.bttiantang.com/'

    anchor = movie_content.find('a')
    href = anchor.get('href') if anchor is not None else None
    if not href:
        # Nothing to follow for this entry.
        return

    moviename = movie_content.text
    if '游戏' not in name and '游戏' in moviename:
        return

    # urljoin copes with hrefs with or without a leading slash (and with
    # absolute URLs), so the result never becomes '...com//x' or '...comx'.
    url = urljoin(site, href)

    print('电影名:', moviename)

    html = requests.get(url, timeout=10).content.decode('utf8', 'ignore')
    links = BeautifulSoup(html, 'lxml').find_all('div', class_='tinfo')

    print('下载链接:')

    number = 0
    for each in links:
        link = each.find('a')
        link_href = link.get('href') if link is not None else None
        if not link_href:
            # Skip blocks that carry no download link.
            continue
        number += 1
        print('链接' + str(number) + ':')
        print(urljoin(site, link_href))

def get_url_from_dytt(moviename):
    """Search www.dytt.com (电影淘淘) and print download links for each hit.

    Args:
        moviename: Search keyword, already percent-encoded (get_name() passes
            the URL-quoted GB2312/GBK bytes of the title). It is appended to
            the search URL unchanged.

    The first ``<p class="s1">`` element of a page is never treated as a
    result (presumably a header row -- confirm against the site), so a page
    with one such element or none is reported as "not found". Otherwise each
    remaining element is passed to get_movieinfo_from_dytt(): page by page
    when a page count is found after the "下一页" link, or just the elements
    of the first response when it is not.
    """
    from urllib.parse import unquote

    baseurl = 'http://www.dytt.com/search.asp?searchword=' + str(moviename)

    html = requests.get(baseurl, timeout=10).content.decode('gbk', 'ignore')

    pattern = re.compile('下一页.*?href.*?page=(.*?)&')
    result = re.findall(pattern, html)

    items = BeautifulSoup(html, 'lxml').find_all('p', class_='s1')

    # '<= 1' rather than '== 1': an empty list also means no results.
    if len(items) <= 1:
        print('查无此电影，请检查后重试')
        return

    # get_movieinfo_from_dytt() compares the keyword with result titles, so
    # give it the readable title instead of the percent-encoded form used in
    # the URL (the encoded form can never contain '游戏').
    plain_name = unquote(str(moviename), encoding='gbk', errors='replace')

    print('电影淘淘搜索结果:')

    # The captured text is expected to be the page count; fall back to the
    # first response if it is missing or not a number.
    page_count = 0
    if result:
        try:
            page_count = int(result[0])
        except ValueError:
            page_count = 0

    if page_count < 1:
        for item in items[1:]:
            get_movieinfo_from_dytt(item, plain_name)
        return

    for page_no in range(1, page_count + 1):
        print('第', page_no, '页:')
        url = baseurl + '&page=' + str(page_no)
        page_html = requests.get(url, timeout=10).content.decode('gbk', 'ignore')
        page_items = BeautifulSoup(page_html, 'lxml').find_all('p', class_='s1')
        for item in page_items[1:]:
            get_movieinfo_from_dytt(item, plain_name)

def get_movieinfo_from_dytt(item, name):
    """Print the title and download links of one 电影淘淘 search result.

    Args:
        item: BeautifulSoup tag of a result entry; the text of its first
            ``<a>`` is the title and its href the detail-page path on
            www.dytt.com.
        name: Keyword the user searched for.

    Results whose title contains '游戏' are skipped unless the keyword itself
    contains '游戏'. Links are looked up in the detail page in this order,
    stopping at the first kind that yields any match: ed2k links, then
    http links ending in 'torrent', then ftp links ending in 'mkv'.
    """
    anchor = item.find('a')
    href = anchor.get('href') if anchor is not None else None
    if not href:
        # Nothing to follow for this entry.
        return

    moviename = anchor.text
    movieurl = 'http://www.dytt.com' + href

    if '游戏' not in name and '游戏' in moviename:
        return

    print('电影名:', moviename)

    pagecontent = requests.get(movieurl, timeout=10).content.decode('gbk', 'ignore')

    print('下载链接:')

    # Raw strings: '\|' in a plain literal is an invalid escape sequence.
    ed2k_tails = re.findall(r'ed2k:(.*?)\|/', pagecontent)
    if ed2k_tails:
        # The capture starts right after 'ed2k:', so for a standard link it
        # already begins with '//|file|'; strip that before re-adding the
        # prefix, otherwise the printed link contains it twice.
        prefix = '//|file|'
        links = [
            'ed2k://|file|'
            + (tail[len(prefix):] if tail.startswith(prefix) else tail)
            + '|/'
            for tail in ed2k_tails
        ]
    else:
        http_tails = re.findall(r'http:(.*?)torrent', pagecontent)
        if http_tails:
            links = ['http:' + tail + 'torrent' for tail in http_tails]
        else:
            ftp_tails = re.findall(r'ftp:(.*?)mkv', pagecontent)
            links = ['ftp:' + tail + 'mkv' for tail in ftp_tails]

    for number, link in enumerate(links, 1):
        print('链接' + str(number) + ':', link)

# Script entry point: start the interactive search loop only when this file
# is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    get_name()

运行结果：

python 电影下载链接爬虫的更多相关文章

Java爬虫爬取网站电影下载链接
之前有看过一段时间爬虫,了解了爬虫的原理,以及一些实现的方法,本项目完成于半年前,一直放在那里,现在和大家分享出来. 网络爬虫简单的原理就是把程序想象成为一个小虫子,一旦进去了一个大门,这个小虫子就像 ...
使用htmlparse爬虫技术爬取电影网页的全部下载链接
昨天,我们利用webcollector爬虫技术爬取了网易云音乐17万多首歌曲,而且还包括付费的在内,如果时间允许的话,可以获取更多的音乐下来,当然,也有小伙伴留言说这样会降低国人的知识产权保护意识,诚 ...
使用htmlparser爬虫技术爬取电影网页的全部下载链接
昨天,我们利用webcollector爬虫技术爬取了网易云音乐17万多首歌曲,而且还包括付费的在内,如果时间允许的话,可以获取更多的音乐下来,当然,也有小伙伴留言说这样会降低国人的知识产权保护意识,诚 ...
一篇文章教会你利用Python网络爬虫获取电影天堂视频下载链接
[一.项目背景] 相信大家都有一种头疼的体验,要下载电影特别费劲,对吧?要一部一部的下载,而且不能直观的知道最近电影更新的状态. 今天小编以电影天堂为例,带大家更直观的去看自己喜欢的电影,并且下载下来 ...
Python 爬虫的工具列表附Github代码下载链接
Python爬虫视频教程零基础小白到scrapy爬虫高手-轻松入门 https://item.taobao.com/item.htm?spm=a1z38n.10677092.0.0.482434a6E ...
【Python项目】简单爬虫批量获取资源网站的下载链接
简单爬虫批量获取资源网站的下载链接项目链接:https://github.com/RealIvyWong/GotDownloadURL 1 由来自己在收集剧集资源的时候,这些网站的下载链接还要手动 ...
Python爬虫个人记录（二）获取fishc 课件下载链接
参考: Python爬虫个人记录(一)豆瓣250 (2017.9.6更新,通过cookie模拟登陆方法,已成功实现下载文件功能!!) 一.目的分析获取http://bbs.fishc.com/for ...
Python网络爬虫笔记（二）：链接爬虫和下载限速
(一)代码1(link_crawler()和get_links()实现链接爬虫) import urllib.request as ure import re import urllib.parse ...
Python 爬虫 Vimeo视频下载链接
python vimeo_d.py https://vimeo.com/228013581 在https://vimeo.com/上看到稀罕的视频按照上面加上视频的观看地址运行即可获得视频下载链接 ...

随机推荐

Eclipse如何生成jar包
Eclipse如何生成jar包图1 右击项目Properites,选择Android,选择Is Library,然后会编译生成jar包在bin目录下.
BZOJ 3241: [Noi2013]书法家
题目链接:http://www.lydsy.com/JudgeOnline/problem.php?id=3241 题意: 思路:把每个字母分成三部分,两个字母之间还有空的列,所以我一共设了11个状态 ...
[CF355C]Vasya and Robot（思维，贪心）
题目链接:http://codeforces.com/contest/355/problem/C 题意:1~n n个物品各重wi,现在有一个人可以从左边拿和从右边拿, 左边拿一个物品的花费是l*wi, ...
JDBC操作Oracle数据库
背景知识含义:JDBC是一种java数据库连接技术,能实现java程序对各种数据库的访问.由一组使用java语言编写的类和接口组成,这些类和接口称为JDBC API,他们位于java.sql 以及j ...
hdu 1700 Points on Cycle（坐标旋转）
http://acm.hdu.edu.cn/showproblem.php?pid=1700 Points on Cycle Time Limit: 1000/1000 MS (Java/Others ...
POJ1011 （DFS+剪枝）
Sticks Time Limit: 1000MS Memory Limit: 10000K Total Submissions: 129606 Accepted: 30388 Descrip ...
Log4j XML 配置
Xml代码 <?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE log4j:configurat ...
ASP.Net MVC4中封装CSS和js冗余代码（不让其大篇的显示在前台上）
(1)封装CSS和JS代码,使用调用的方式在前台进行调用.是开发看起来简洁和易于管理,可达到重用. 由于asp.netMVC4 框架 ,在封装js和CSS的时候,有如下规范: using Syst ...
Nginx RTMP 专题
说明: 记录器 - 记录器名称 path - 记录文件路径(recorded file path) (/tmp/rec/mystream-1389499351.flv)filename - 省略目录的 ...
Redis常用命令入门——列表类型（一级二级缓存技术）
获取列表片段 redis > LRANGE KEY_NAME START END lrange命令比较常用,返回从start到stop的所有元素的列表,start和stop都是从0开始. (1) ...

python 电影下载链接爬虫

python 电影下载链接爬虫的更多相关文章

随机推荐

热门专题