Python爬虫爬取1905电影网视频电影并存储到mysql数据库

数据获取方式：微信搜索关注【靠谱杨阅读人生】回复【电影】。
整理不易，资源付费，谢谢支持！

代码：

  1 import time

  2 import traceback

  3 import requests

  4 from lxml import etree

  5 import re

  6 from bs4 import BeautifulSoup

  7 from lxml.html.diff import end_tag

  8 import json

  9 import pymysql

 10

 11 def get1905():

 12     url='https://www.1905.com/vod/list/n_1/o3p1.html'

 13     headers={

 14         'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'

 15     }

 16     templist=[]

 17     dataRes=[]

 18     #最热

 19     #1905电影网一共有99页，每页24部电影 for1-100 输出1-99页

 20     for i in range(1,100):

 21         url_1='https://www.1905.com/vod/list/n_1/o3p'

 22         auto=str(i)

 23         url_2='.html'

 24         url=url_1+auto+url_2

 25         print(url)

 26         response = requests.get(url, headers)

 27         response.encoding = 'utf-8'

 28         page_text = response.text

 29         soup = BeautifulSoup(page_text, 'lxml')

 30         # print(page_text)

 31         movie_all = soup.find_all('div', class_="grid-2x grid-3x-md grid-6x-sm")

 32         for single in movie_all:

 33             part_html=str(single)

 34             part_soup=BeautifulSoup(part_html,'lxml')

 35             #添加名字

 36             name=part_soup.find('a')['title']

 37             templist.append(name)

 38             # print(name)

 39             #添加评分

 40             try:

 41                 score=part_soup.find('i').text

 42             except:

 43                 if(len(score)==0):

 44                     score="1905暂无评分"

 45             templist.append(score)

 46             # print(score)

 47             #添加path

 48             path=part_soup.find('a',class_="pic-pack-outer")['href']

 49             templist.append(path)

 50             # print(path)

 51             #添加state

 52             state="免费"

 53             templist.append(state)

 54             print(templist)

 55             dataRes.append(templist)

 56             templist=[]

 57         print(len(dataRes))

 58     # print(movie_all)

 59

 60     #---------------------------------------------

 61     #好评

 62     templist = []

 63     # 1905电影网一共有99页，每页24部电影 for1-100 输出1-99页

 64     for i in range(1, 100):

 65         url_1 = 'https://www.1905.com/vod/list/n_1/o4p'

 66         auto = str(i)

 67         url_2 = '.html'

 68         url = url_1 + auto + url_2

 69         print(url)

 70         response = requests.get(url, headers)

 71         response.encoding = 'utf-8'

 72         page_text = response.text

 73         soup = BeautifulSoup(page_text, 'lxml')

 74         # print(page_text)

 75         movie_all = soup.find_all('div', class_="grid-2x grid-3x-md grid-6x-sm")

 76         for single in movie_all:

 77             part_html = str(single)

 78             part_soup = BeautifulSoup(part_html, 'lxml')

 79             # 添加名字

 80             name = part_soup.find('a')['title']

 81             templist.append(name)

 82             # print(name)

 83             # 添加评分

 84             try:

 85                 score = part_soup.find('i').text

 86             except:

 87                 if (len(score) == 0):

 88                     score = "1905暂无评分"

 89             templist.append(score)

 90             # print(score)

 91             # 添加path

 92             path = part_soup.find('a', class_="pic-pack-outer")['href']

 93             templist.append(path)

 94             # print(path)

 95             # 添加state

 96             state = "免费"

 97             templist.append(state)

 98             print(templist)

 99             dataRes.append(templist)

100             templist = []

101         print(len(dataRes))

102         #---------------------------------------------

103         # 最新

104         templist = []

105         # 1905电影网一共有99页，每页24部电影 for1-100 输出1-99页

106     for i in range(1, 100):

107         url_1 = 'https://www.1905.com/vod/list/n_1/o1p'

108         auto = str(i)

109         url_2 = '.html'

110         url = url_1 + auto + url_2

111         print(url)

112         response = requests.get(url, headers)

113         response.encoding = 'utf-8'

114         page_text = response.text

115         soup = BeautifulSoup(page_text, 'lxml')

116         # print(page_text)

117         movie_all = soup.find_all('div', class_="grid-2x grid-3x-md grid-6x-sm")

118         for single in movie_all:

119             part_html = str(single)

120             part_soup = BeautifulSoup(part_html, 'lxml')

121             # 添加名字

122             name = part_soup.find('a')['title']

123             templist.append(name)

124             # print(name)

125             # 添加评分

126             try:

127                 score = part_soup.find('i').text

128             except:

129                 if (len(score) == 0):

130                     score = "1905暂无评分"

131             templist.append(score)

132             # print(score)

133             # 添加path

134             path = part_soup.find('a', class_="pic-pack-outer")['href']

135             templist.append(path)

136             # print(path)

137             # 添加state

138             state = "免费"

139             templist.append(state)

140             print(templist)

141             dataRes.append(templist)

142             templist = []

143         print(len(dataRes))

144     #去重

145     old_list = dataRes

146     new_list = []

147     for i in old_list:

148         if i not in new_list:

149             new_list.append(i)

150             print(len(new_list))

151     print("总数:     "+str(len(new_list)))

152     return new_list

def insert_1905():
    """Scrape the 1905.com movie list and insert every record into MySQL.

    Rows are written to the ``movie1905`` table.  A row that violates a
    uniqueness constraint is reported and skipped; all remaining rows are
    committed together at the end.  Any other error is printed as a traceback
    instead of being raised, and the connection is closed in every case.

    :return: None
    """
    cursor = None
    conn = None
    try:
        movies = get1905()
        print(f"{time.asctime()}开始插入1905电影数据")
        conn, cursor = get_conn()
        sql = "insert into movie1905 (id,name,score,path,state) values(%s,%s,%s,%s,%s)"
        for item in movies:
            print(item)
            # Guard each insert separately so one duplicate row does not abort
            # the whole batch.
            try:
                # NOTE(review): id is always sent as 0 - presumably the column
                # is AUTO_INCREMENT; confirm against the table schema.
                cursor.execute(sql, [0, item[0], item[1], item[2], item[3]])
            except pymysql.err.IntegrityError:
                print("重复！跳过！")
        conn.commit()  # commit the transaction (needed for insert/update/delete)
        print(f"{time.asctime()}插入1905电影数据完毕")
    except Exception:
        # ``except Exception`` rather than a bare ``except`` so Ctrl-C and
        # SystemExit still terminate the script; cleanup runs via ``finally``.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)

176

# Connect to the database and obtain a cursor
def get_conn(host="127.0.0.1", user="root", password="000429",
             db="movierankings", charset="utf8"):
    """Open a MySQL connection and create a cursor on it.

    The defaults reproduce the settings that used to be hard-coded here, so
    ``get_conn()`` behaves as before; callers may override any of them.

    NOTE(review): the default password is kept only for backward
    compatibility - credentials should not live in source code.

    :param host: database server address.
    :param user: database user name.
    :param password: password for ``user``.
    :param db: name of the database (schema) to use.
    :param charset: connection character set.
    :return: ``(connection, cursor)`` tuple.
    """
    # Create the connection
    conn = pymysql.connect(host=host,
                           user=user,
                           password=password,
                           db=db,
                           charset=charset)
    # Create the cursor (result rows are returned as tuples by default)
    cursor = conn.cursor()
    if conn is not None and cursor is not None:
        print("数据库连接成功！游标创建成功！")
    else:
        print("数据库连接失败！")
    return conn, cursor

# Close the database cursor and connection
def close_conn(conn, cursor):
    """Close the cursor and then the connection, skipping whichever is falsy.

    The connection is closed even if closing the cursor raises, so a failing
    cursor cannot leak the connection; the cursor's exception still propagates.

    :param conn: connection object with a ``close()`` method, or ``None``.
    :param cursor: cursor object with a ``close()`` method, or ``None``.
    :return: always ``1``.
    """
    try:
        if cursor:
            cursor.close()
    finally:
        if conn:
            conn.close()
    return 1

202

if __name__ == '__main__':
    # Script entry point: crawl 1905.com and store the results in MySQL.
    # To scrape without touching the database, call get1905() here instead.
    insert_1905()

运行截图：

数据库

Python爬虫爬取1905电影网视频电影并存储到mysql数据库的更多相关文章

Python爬虫爬取BT之家找电影资源
一.写在前面最近看新闻说圣城家园(SCG)倒了,之前BT天堂倒了,暴风影音也不行了,可以说看个电影越来越费力,国内大厂如企鹅和爱奇艺最近也出现一些幺蛾子,虽然目前版权意识越来越强,但是很多资源在 ...
如何利用python爬虫爬取爱奇艺VIP电影？
环境:windows python3.7 思路: 1.先选取你要爬取的电影 2.用vip解析工具解析,获取地址 3.写好脚本,下载片断 4.将片断利用电脑合成需要的python模块: ##第一 ...
Python爬虫---爬取抖音短视频
目录前言抖音爬虫制作选定网页分析网页提取id构造网址拼接数据包链接获取视频地址下载视频全部代码实现结果待解决的问题前言最近一直想要写一个抖音爬虫来批量下载抖音的短视频,但是经 ...
python爬虫–爬取煎蛋网妹子图片
前几天刚学了python网络编程,书里没什么实践项目,只好到网上找点东西做. 一直对爬虫很好奇,所以不妨从爬虫先入手吧. Python版本:3.6 这是我看的教程:Python - Jack -Cui ...
Python 爬虫爬取煎蛋网图片
今天, 试着爬取了煎蛋网的图片. 用到的包: urllib.request os 分别使用几个函数,来控制下载的图片的页数,获取图片的网页,获取网页页数以及保存图片到本地.过程简单清晰明了直接上源代 ...
python爬虫爬取煎蛋网妹子图片
import urllib.request import os def url_open(url): req = urllib.request.Request(url) req.add_header( ...
Python爬虫训练：爬取酷燃网视频数据
前言本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理项目目标爬取酷燃网视频数据 https://krcom.cn/ 环境 Py ...
利用Python网络爬虫爬取学校官网十条标题
利用Python网络爬虫爬取学校官网十条标题案例代码: # __author : "J" # date : 2018-03-06 # 导入需要用到的库文件 import urll ...
用Python爬虫爬取广州大学教务系统的成绩（内网访问）
用Python爬虫爬取广州大学教务系统的成绩(内网访问) 在进行爬取前,首先要了解: 1.什么是CSS选择器? 每一条css样式定义由两部分组成,形式如下: [code] 选择器{样式} [/code ...
Python爬虫爬取全书网小说，程序源码+程序详细分析
Python爬虫爬取全书网小说教程第一步:打开谷歌浏览器,搜索全书网,然后再点击你想下载的小说,进入图一页面后点击F12选择Network,如果没有内容按F5刷新一下点击Network之后出现如下 ...

随机推荐

Java并发编程实例--11.在线程组中处理未检查异常
第8个例子讲了如何在线程中捕捉未检查异常,本例将介绍如何在线程组中处理未检查异常. Task.java package com.dylan.thread.ch1.c11.task; import ja ...
Java并发编程实例--9.使用本地线程变量
并发程序一个重要方面就是共享数据. 这一点在继承了Thread类或实现了Runnable接口的对象中有着特殊的重要性. 如果你创建了一个实现了Runnable接口的类对象并且用这个对象开启了N个线程对 ...
C++ std::move 的一些问题
看 SO 上有一个比较奇怪的问题, When does an rvalue reference result in a move vs copy constructor and why? 问题代码: ...
SSL/TLS 资料整理
1. HTTPS详解二:SSL / TLS 工作原理和详细握手过程看到另外几篇介绍不错的文章,再次分享一下园内大佬写的, 通过一个小故事,理解 HTTPS 工作原理这篇博文已经把 SSL 的工作 ...
【libGDX】使用Mesh绘制立方体
1 前言本文主要介绍使用 Mesh 绘制立方体,读者如果对 Mesh 不太熟悉,请回顾以下内容: 使用Mesh绘制三角形使用Mesh绘制矩形使用Mesh绘制圆形在绘制立方体的过程中,主 ...
学习go语言编程之工程管理
Go命令行工具安装了Go语言的安装包后,就直接自带Go命令行工具. # 查看当前安装的Golang版本 go version # 查看go命令行工具的帮助信息 go help Go命令行工具可以完成 ...
rename重命名
[root@liuwei test]# ls 11.txt 12.txt 13.txt 14.txt 15.txt 16.txt 17.txt 18.txt 19.txt 1.txt 20.txt 2 ...
django中一些快捷函数
1.get_object_or_404() 接收两个参数,参数1为模型类,参数2为查询参数查询到对象则返回对象,查询不到则返回http404,但是不会返回模型的DoesNotExist异常示例: ...
【LeetCode剑指offer#05】回文链表的两种解法+删除链表中间节点（链表的基本操作）
回文链表给你一个单链表的头节点 head ,请你判断该链表是否为回文链表.如果是,返回 true :否则,返回 false . 示例 1: 输入:head = [1,2,2,1] 输出:true 示 ...
带你领略下iOS中OC的“alloc”源代码，让你在工作中不再迷惑
前言前面我们使用官方开源的objc源码进行了编译调试 objc4-818.2源码编译调试笔记前言为什么会想要调试源码? 苹果开源了部分源码, 但相似内容太多, 基本找不到代码间的对应关系, 如果能 ...

Python爬虫爬取1905电影网视频电影并存储到mysql数据库

代码：

运行截图：

数据库

Python爬虫爬取1905电影网视频电影并存储到mysql数据库的更多相关文章

随机推荐

热门专题