Python爬虫：爬取搜狐视频电影并存储到 MySQL 数据库

数据获取方式：微信搜索关注【靠谱杨阅读人生】回复【电影】。
整理不易，资源付费，谢谢支持。

代码：

  1 import time

  2 import traceback

  3 import requests

  4 from lxml import etree

  5 import re

  6 from bs4 import BeautifulSoup

  7 from lxml.html.diff import end_tag

  8 import json

  9 import pymysql

 10 #连接数据库  获取游标

 11 def get_conn():

 12     """

 13     :return: 连接，游标

 14     """

 15     # 创建连接

 16     conn = pymysql.connect(host="127.0.0.1",

 17                     user="root",

 18                     password="000429",

 19                     db="movierankings",

 20                     charset="utf8")

 21     # 创建游标

 22     cursor = conn.cursor()  # 执行完毕返回的结果集默认以元组显示

 23     if ((conn != None) & (cursor != None)):

 24         print("数据库连接成功！游标创建成功！")

 25     else:

 26         print("数据库连接失败！")

 27     return conn, cursor

 28 #关闭数据库连接和游标

 29 def close_conn(conn, cursor):

 30     if cursor:

 31         cursor.close()

 32     if conn:

 33         conn.close()

 34     return 1

 35

 36 def get_souhu():

 37     url='https://film.sohu.com/list_0_0_0_2_2_1_60.html?channeled=1200100000'

 38     #最新上架

 39     new_url='https://film.sohu.com/list_0_0_0_2_1_1_60.html?channeled=1200100000'

 40     #本周热播

 41     week_url='https://film.sohu.com/list_0_0_0_2_0_1_60.html?channeled=1200100000'

 42     headers={

 43         'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'

 44     }

 45

 46     #初始化list

 47     templist=[]

 48     dataRes=[]

 49     #最受好评

 50     for i in range(1,31):

 51         url_1='https://film.sohu.com/list_0_0_0_2_2_'

 52         auto=str(i)

 53         url_2='_60.html?channeled=1200100000'

 54         url=url_1+auto+url_2

 55         response = requests.get(url, headers)

 56         response.encoding = 'utf-8'

 57         page_text = response.text

 58         # etree_ = etree.HTML(page_text)

 59         # 获取所有的li

 60         soup = BeautifulSoup(page_text, 'lxml')

 61         # 标签层级选择

 62         li_list = soup.select('.movie-list>li')

 63         print(len(li_list))

 64         if(len(li_list)==0):

 65             print("最受好评爬取结束！")

 66             if(len(dataRes)!=0):

 67                 return dataRes

 68         for li in li_list:

 69             li_text=str(li)

 70             # print(li_text)

 71             li_soup=BeautifulSoup(li_text,'lxml')

 72             name=li_soup.find('div',class_="v_name_info").text

 73             #添加名字

 74             templist.append(name)

 75             # print(name)

 76             #添加评分

 77             score=li_soup.find('span',class_='v_score').text

 78             #处理评分

 79             score=score[-4:-1]

 80             templist.append(score)

 81             # print(score)

 82             #添加path

 83             path=li_soup.find('a',target="_blank")['href']

 84             templist.append(path)

 85             # print(path)

 86             #添加播放状态

 87             state="VIP"

 88             templist.append(state)

 89             print(templist)

 90             dataRes.append(templist)

 91             templist=[]

 92         print("-------------------------------------------")

 93     # print(len(dataRes))

 94

 95     # #最新上架

 96     #

 97     # templist = []

 98     # for i in range(1,31):

 99     #     url_1='https://film.sohu.com/list_0_0_0_2_1_'

100     #     auto=str(i)

101     #     url_2='_60.html?channeled=1200100000'

102     #     url=url_1+auto+url_2

103     #     response = requests.get(url, headers)

104     #     response.encoding = 'utf-8'

105     #     page_text = response.text

106     #     # etree_ = etree.HTML(page_text)

107     #     # 获取所有的li

108     #     soup = BeautifulSoup(page_text, 'lxml')

109     #     # 标签层级选择

110     #     li_list = soup.select('.movie-list>li')

111     #     print(len(li_list))

112     #     if(len(li_list)==0):

113     #         print("最新上架爬取结束！")

114     #         if(len(dataRes)!=0):

115     #             return dataRes

116     #     for li in li_list:

117     #         li_text=str(li)

118     #         # print(li_text)

119     #         li_soup=BeautifulSoup(li_text,'lxml')

120     #         name=li_soup.find('div',class_="v_name_info").text

121     #         #添加名字

122     #         templist.append(name)

123     #         # print(name)

124     #         #添加评分

125     #         score=li_soup.find('span',class_='v_score').text

126     #         #处理评分

127     #         score=score[-4:-1]

128     #         templist.append(score)

129     #         # print(score)

130     #         #添加path

131     #         path=li_soup.find('a',target="_blank")['href']

132     #         templist.append(path)

133     #         # print(path)

134     #         #添加播放状态

135     #         state="VIP"

136     #         templist.append(state)

137     #         print(templist)

138     #         dataRes.append(templist)

139     #         templist=[]

140     #     print("-------------------------------------------")

141     # # print(len(dataRes))

142     # #本周热播

143     # templist = []

144     # for i in range(1, 31):

145     #     url_1 = 'https://film.sohu.com/list_0_0_0_2_0_'

146     #     auto = str(i)

147     #     url_2 = '_60.html?channeled=1200100000'

148     #     url = url_1 + auto + url_2

149     #     response = requests.get(url, headers)

150     #     response.encoding = 'utf-8'

151     #     page_text = response.text

152     #     # etree_ = etree.HTML(page_text)

153     #     # 获取所有的li

154     #     soup = BeautifulSoup(page_text, 'lxml')

155     #     # 标签层级选择

156     #     li_list = soup.select('.movie-list>li')

157     #     print(len(li_list))

158     #     if (len(li_list) == 0):

159     #         print("本周热播爬取结束！")

160     #         if (len(dataRes) != 0):

161     #             return dataRes

162     #     for li in li_list:

163     #         li_text = str(li)

164     #         # print(li_text)

165     #         li_soup = BeautifulSoup(li_text, 'lxml')

166     #         name = li_soup.find('div', class_="v_name_info").text

167     #         # 添加名字

168     #         templist.append(name)

169     #         # print(name)

170     #         # 添加评分

171     #         score = li_soup.find('span', class_='v_score').text

172     #         # 处理评分

173     #         score = score[-4:-1]

174     #         templist.append(score)

175     #         # print(score)

176     #         # 添加path

177     #         path = li_soup.find('a', target="_blank")['href']

178     #         templist.append(path)

179     #         # print(path)

180     #         # 添加播放状态

181     #         state = "VIP"

182     #         templist.append(state)

183     #         print(templist)

184     #         dataRes.append(templist)

185     #         templist = []

186     #     print("-------------------------------------------")

187     # print(len(dataRes))

188     #list去重

189     # old_list = dataRes

190     # new_list = []

191     # for i in old_list:

192     #     if i not in new_list:

193     #         new_list.append(i)

194     # print(new_list)  # [2, 3, 4, 5, 1]

195     return dataRes

196 #插入数据库

def insert_souhu():
    """Scrape the Sohu movie list and insert each entry into ``moviesohu``.

    Rows that violate a database integrity constraint (for example a
    duplicate primary key) are skipped. Any other error is printed with its
    traceback and not re-raised. The connection and cursor are always
    closed.

    :return: ``None``.
    """
    cursor = None
    conn = None
    try:
        movies = get_souhu()
        print(f"{time.asctime()}开始插入搜狐电影数据")
        conn, cursor = get_conn()
        sql = "insert into moviesohu (id,name,score,path,state) values(%s,%s,%s,%s,%s)"
        for item in movies:
            print(item)
            # Catch integrity errors so a primary-key conflict only skips
            # that row instead of aborting the whole batch.
            try:
                # 0 is passed for id; NOTE(review): presumably relies on
                # AUTO_INCREMENT assigning the next value for 0 - confirm
                # the server's sql_mode.
                cursor.execute(sql, [0, item[0], item[1], item[2], item[3]])
            except pymysql.err.IntegrityError:
                print("重复！跳过！")
        conn.commit()  # commit the transaction for the inserts
        print(f"{time.asctime()}插入搜狐电影数据完毕")
    except Exception:
        # Top-level boundary: report the failure but let KeyboardInterrupt
        # and SystemExit propagate (a bare ``except:`` would swallow them).
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)

221

if __name__ == '__main__':
    # Script entry point: scrape the Sohu list and store it in MySQL.
    # get_iqy()    # disabled by the original author; not defined in the visible source
    # get_souhu()  # scrape only, without inserting into the database
    insert_souhu()

运行截图

数据库截图

建表语句

-- Table that receives the rows written by insert_souhu().
-- `name` is the primary key, so inserting a movie whose name already exists
-- is rejected; the Python code catches that IntegrityError and skips the row.
CREATE TABLE `moviesohu` (
  -- Auto-incrementing row number; not the primary key, only indexed (see KEY below).
  `id` INT(11) NOT NULL AUTO_INCREMENT,
  `name` VARCHAR(45) COLLATE utf8_bin NOT NULL,
  -- The scraper stores the score as a short text slice, hence VARCHAR.
  `score` VARCHAR(45) COLLATE utf8_bin NOT NULL,
  -- The href taken from the movie's link element.
  `path` VARCHAR(100) COLLATE utf8_bin NOT NULL,
  -- Play state; the scraper always writes "VIP".
  `state` VARCHAR(10) COLLATE utf8_bin NOT NULL,
  PRIMARY KEY (`name`),
  -- NOTE(review): MySQL presumably requires an AUTO_INCREMENT column to be indexed - confirm.
  KEY `id` (`id`)
) ENGINE=INNODB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COLLATE=utf8_bin;

Python爬虫爬取搜狐视频电影并存储到mysql数据库的更多相关文章

python爬虫:爬取慕课网视频
前段时间安装了一个慕课网app,发现不用注册就可以在线看其中的视频,就有了想爬取其中的视频,用来在电脑上学习.决定花两天时间用学了一段时间的python做一做.(我的新书<Python爬虫开发与 ...
Python爬虫 - 爬取百度html代码前200行
Python爬虫 - 爬取百度html代码前200行 - 改进版, 增加了对字符串的.strip()处理源代码如下: # 改进版, 增加了 .strip()方法的使用 # coding=utf-8 ...
用Python爬虫爬取广州大学教务系统的成绩（内网访问）
用Python爬虫爬取广州大学教务系统的成绩(内网访问) 在进行爬取前,首先要了解: 1.什么是CSS选择器? 每一条css样式定义由两部分组成,形式如下: [code] 选择器{样式} [/code ...
使用Python爬虫爬取网络美女图片
代码地址如下:http://www.demodashi.com/demo/13500.html 准备工作安装python3.6 略安装requests库(用于请求静态页面) pip install ...
Python爬虫|爬取喜马拉雅音频
"GOOD Python爬虫|爬取喜马拉雅音频喜马拉雅是知名的专业的音频分享平台,用户规模突破4.8亿,汇集了有声小说,有声读物,儿童睡前故事,相声小品等数亿条音频,成为国内发展最快.规模 ...
python爬虫爬取内容中，\xa0，\u3000的含义
python爬虫爬取内容中,\xa0,\u3000的含义 - CSDN博客 https://blog.csdn.net/aiwuzhi12/article/details/54866310
Python爬虫爬取全书网小说，程序源码+程序详细分析
Python爬虫爬取全书网小说教程第一步:打开谷歌浏览器,搜索全书网,然后再点击你想下载的小说,进入图一页面后点击F12选择Network,如果没有内容按F5刷新一下点击Network之后出现如下 ...
python爬虫—爬取英文名以及正则表达式的介绍
python爬虫—爬取英文名以及正则表达式的介绍爬取英文名: 一. 爬虫模块详细设计 (1)整体思路对于本次爬取英文名数据的爬虫实现,我的思路是先将A-Z所有英文名的连接爬取出来,保存在一个cs ...
一个简单的python爬虫,爬取知乎
一个简单的python爬虫,爬取知乎主要实现爬取一个收藏夹里所有问题答案下的图片文字信息暂未收录,可自行实现,比图片更简单具体代码里有详细注释,请自行阅读项目源码: # -*- cod ...
python爬虫-爬取百度图片
python爬虫-爬取百度图片(转) #!/usr/bin/python# coding=utf-8# 作者 :Y0010026# 创建时间 :2018/12/16 16:16# 文件 :spider ...

随机推荐

Java集合框架学习（八） HashMap详解
HashMap介绍 HashMap是一个基于Map的集合类,用于存储Key和Value的键值对. 通常用HashMap<Key, Value> or HashMap<K, V> ...
win32- 使用WM_NCPAINT在非客户区域绘制边框
#pragma comment(lib, "UxTheme") #include <windows.h> #include <uxtheme.h> LRES ...
如何避免Git合并远程分支时出现可读性差的日志
问题及现象当某一分支(假设为main)的本地仓库和远程仓库都基于同一个提交进行了修改,并分别创建了新的提交时,在本地执行git push origin main会提示先要执行git pull合并远程 ...
常见Python问题及解决办法
文件编码问题如果Python文件中存在中文注释,在运行时报错"SyntaxError: Non-ASCII character '\xe7' in file". 解决办法: 在文 ...
【图论#02】岛屿系列题（数量、周长、最大面积），flood fill算法的代码实现与优化
岛屿数量给你一个由 '1'(陆地)和 '0'(水)组成的的二维网格,请你计算网格中岛屿的数量. 岛屿总是被水包围,并且每座岛屿只能由水平方向和/或竖直方向上相邻的陆地连接形成. 此外,你可以假设该网 ...
【Azure 应用服务】App Service for Linux 环境中为Tomcat页面修改默认的Azure 404页面
问题描述在App Service Linux环境中,如部署Tomcat 应用后,如果访问的页面找不到,应用会返回一个由Azure生成的404页面,那么是否可以修改它呢? PS: 如果是App Ser ...
利用Linux自动编译Vivado工程
https://codetd.com/article/12458043. 利用Linux自动编译Vivado工程
Java 常用类 String类与其他结构之间的转换-----String 与基本数据类型，包装类之间的转换
1 /* 2 涉及到String类与其他结构之间的转换 3 4 */ 5 6 //String 与基本数据类型,包装类之间的转换 7 //String --->基本数据类型,包装类:调用包装类 ...
4、dubbo的高可用
1.zookeeper宕机与dubbo直连现象:zookeeper注册中心宕机,还可以消费dubbo暴露的服务. 原因: 健壮性 l 监控中心宕掉不影响使用,只是丢失部分采样数据 l 数据库宕掉后, ...
mybaits 笔记2022年8月学习笔记
mybatis整理前期准备安装必要依赖: idea开发mybatis,如果学习测试,可以在一个直接建一个空白项目,如果是用spring boot,则建议用用boot的安装捆绑方式核心依赖 or ...

Python爬虫爬取搜狐视频电影并存储到mysql数据库

代码：

运行截图

数据库截图

建表语句

Python爬虫爬取搜狐视频电影并存储到mysql数据库的更多相关文章

随机推荐

热门专题