Python3:爬取新浪、网易、今日头条、UC四大网站新闻标题及内容

以爬取相应网站的社会新闻内容为例:

一、新浪:

新浪网的新闻比较容易爬取:页面没有使用JS异步加载,新闻内容就在网页源码里,所以用BeautifulSoup直接解析源码即可。

'''
新浪新闻:http://news.sina.com.cn/society/
Date:20180920
Author:lizm
Description:获取新浪新闻
'''
import requests
from bs4 import BeautifulSoup
from urllib import request
import sys
import re
import os
from urllib.parse import urljoin

# Whitespace and punctuation stripped from a headline before it is used as a
# file name (same character classes as the original pattern, as a raw string).
_TITLE_JUNK = re.compile(r"[\s+\.\!\/_,$%^*(+\"\']+|[+<>?、~*()]+")


def getNews(title, url, m):
    """Save one Sina article (text plus inline images) under ``sys.path[0]/news``.

    The article text goes to ``news/<title>.txt``; every image found inside a
    ``div.img_wrapper`` goes to ``news/<title>/<title><index>.jpg``.

    Args:
        title: Headline text. Whitespace/punctuation is stripped and the result
            is used as the file name and image-folder name.
        url: Address of the article page.
        m: Sequence number of the article; used in the progress message and as
            a fallback file name when the cleaned title is empty.

    Returns:
        Always 0 -- both after a successful save and when the page has no
        ``div.article`` element and is skipped.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    # Send the browser User-Agent for the page as well as for the images.
    req = request.Request(url, headers=headers)
    with request.urlopen(req) as response:
        # Decode leniently so pages that are not valid UTF-8 do not abort the crawl.
        page = response.read().decode('utf-8', 'ignore')
    soup = BeautifulSoup(page, 'lxml')
    article = soup.find('div', class_='article')
    if article is None:
        return 0

    # Publication time and publishing site; any of these nodes may be missing.
    fb_date = ''
    fb_www = ''
    source = soup.find('div', 'date-source')
    if source is not None:
        if source.span is not None:
            fb_date = source.span.string or ''
        if source.a is not None:
            fb_www = source.a.string or ''

    title = _TITLE_JUNK.sub('', title or '')
    # NOTE(review): a no-op as written; possibly a full-width/ASCII colon swap
    # whose characters were normalised when the code was published -- confirm.
    title = title.replace(':', ':')
    if not title:
        # Nothing usable left of the headline: fall back to the sequence number.
        title = 'news_{0}'.format(m)

    news_dir = os.path.join(sys.path[0], 'news')
    os.makedirs(news_dir, exist_ok=True)
    filename = os.path.join(news_dir, title + '.txt')
    with open(filename, 'w', encoding='utf8') as file_object:
        file_object.write(fb_date + " " + fb_www)
        file_object.write("\n")
        file_object.write("网址:" + url)
        file_object.write("\n")
        file_object.write(title)
        file_object.write(article.get_text())

    # Images are written with absolute paths, so the process working directory
    # is left untouched.
    image_dir = os.path.join(news_dir, title)
    for i, image in enumerate(article.find_all('div', 'img_wrapper')):
        src = image.img.get('src') if image.img is not None else None
        if not src:
            continue
        os.makedirs(image_dir, exist_ok=True)
        # urljoin resolves protocol-relative ("//host/x.jpg"), root-relative and
        # absolute image addresses against the article URL.
        image_url = urljoin(url, src)
        resp = requests.get(image_url, headers=headers, timeout=30)
        if resp.status_code != 200:
            # Do not store an error page under a .jpg name.
            continue
        # Image data is binary, hence resp.content and mode 'wb'.
        with open(os.path.join(image_dir, title + str(i) + '.jpg'), 'wb') as f:
            f.write(resp.content)

    print('成功爬取第', m, '个新闻', title)
    return 0


# Fetch the society news list (the latest 162 items).
def getTitle(url):
    """Walk the Sina society index page and download every listed article.

    Each ``<li>`` inside ``ul.seo_data_list`` that contains a link is printed
    and handed to ``getNews`` together with a running counter.

    Args:
        url: Address of the index page.

    Returns:
        None.
    """
    req = request.Request(url)
    with request.urlopen(req) as response:
        page = response.read().decode('utf8')
    soup = BeautifulSoup(page, 'lxml')

    listing = soup.find('ul', class_='seo_data_list')
    if listing is None:
        # Page layout changed or the request was blocked: nothing to crawl.
        print('seo_data_list not found at', url)
        return

    y = 0
    for tag in listing.find_all('li'):
        link = tag.a
        if link is None:
            continue
        href = link.get('href')
        if not href:
            continue
        # .string is None when the anchor holds nested markup; fall back to
        # the concatenated text so a real string is always passed on.
        temp = link.string or link.get_text()
        print(y, temp, href)
        getNews(temp, href, y)
        y += 1


if __name__ == '__main__':
    url = 'http://news.sina.com.cn/society/'
    getTitle(url)

二、网易:

网易新闻的标题及内容是使用js异步加载的,单纯下载网页源代码得不到标题及内容。我们可以在浏览器开发者工具的Network面板中,从返回的js文件里找到需要的内容。这里我先用正则表达式从js文件中提取标题及其链接,再用BeautifulSoup获取每个标题对应的正文。

import re
from urllib import request
from bs4 import BeautifulSoup

# Characters removed from a headline before it is used as a file name.
_FILENAME_FORBIDDEN = str.maketrans('', '', ':"|/\\*<>?')


def download(title, url):
    """Save the text of one NetEase article to ``<title>.txt``.

    Args:
        title: Headline; the characters ``: " | / \\ * < > ?`` are removed and
            the result is used as the file name.
        url: Address of the article page.

    Returns:
        None. Pages without a ``div.post_text`` element are skipped and no
        file is written for them.
    """
    with request.urlopen(url) as resp:
        res = resp.read()
    soup = BeautifulSoup(res, 'lxml')
    tag = soup.find('div', class_='post_text')
    if tag is None:
        # Not a regular article page (e.g. gallery or video): nothing to save.
        return
    title = title.translate(_FILENAME_FORBIDDEN)
    file_name = r'D:\code\python\spider_news\NetEase_news\sociaty\\' + title + '.txt'
    # The context manager guarantees the file is flushed and closed.
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(tag.get_text())
if __name__ == '__main__':
    # JSONP feeds that carry the headline list of the society channel.
    urls = ['http://temp.163.com/special/00804KVA/cm_shehui.js?callback=data_callback',
            'http://temp.163.com/special/00804KVA/cm_shehui_02.js?callback=data_callback',
            'http://temp.163.com/special/00804KVA/cm_shehui_03.js?callback=data_callback']
    # Compiled once; the same two patterns are applied to every feed.
    title_pattern = re.compile(r'"title":"(.*?)",')
    link_pattern = re.compile(r'"tlink":"(.*?)",')
    for url in urls:
        with request.urlopen(url) as req:
            res = req.read().decode('gbk')
        news_title = title_pattern.findall(res)
        news_url = link_pattern.findall(res)
        # zip stops at the shorter list, so a feed entry that lacks one of the
        # two fields cannot cause an IndexError.
        for i, (title, link) in enumerate(zip(news_title, news_url)):
            download(title, link)
            print('正在爬取第' + str(i) + '个新闻', title)
 

三、头条:

头条的新闻和前两个网站都不一样:它的标题和链接封装在json文件中,但这个json文件的url参数是由一段js算法随机生成的,所以我们需要模拟生成这些参数,否则找不到json文件的具体url。我是通过这篇博客( http://www.jianshu.com/p/5a93673ce1c0 )才了解到url的获取方法的,它同时也解决了总是下载到重复新闻的问题。该网站自带反爬机制,需要添加cookie。至于新闻的内容,我用正则表达式只提取了其中的中文。

from urllib import request
import requests
import json
import time
import math
import hashlib
import re
from bs4 import BeautifulSoup
# Characters removed from a headline before it is used as a file name.
_FILENAME_FORBIDDEN = str.maketrans('', '', ':"|/\\*<>?')


def get_url(max_behot_time, AS, CP):
    """Build the feed API URL for the society channel.

    Args:
        max_behot_time: Paging cursor; 0 for the first page, afterwards the
            value returned by ``get_item``.
        AS: First signature value from ``get_ASCP``.
        CP: Second signature value from ``get_ASCP``.

    Returns:
        The complete request URL as a string.
    """
    url = ('https://www.toutiao.com/api/pc/feed/?category=news_society&utm_source=toutiao&widen=1'
           '&max_behot_time={0}'
           '&max_behot_time_tmp={0}'
           '&tadrequire=true'
           '&as={1}'
           '&cp={2}').format(max_behot_time, AS, CP)
    return url


def get_ASCP(now=None):
    """Compute the ``as``/``cp`` request signature from a Unix timestamp.

    The timestamp's upper-case hex digits are interleaved with the first and
    last five characters of the upper-case MD5 hex digest of its decimal form.

    Args:
        now: Timestamp in seconds. Defaults to the current time; pass a fixed
            value to get a reproducible signature.

    Returns:
        Tuple ``(AS, CP)`` of two 15-character strings. A fixed fallback pair
        is returned when the hex timestamp is not exactly 8 digits long.
    """
    t = int(math.floor(time.time())) if now is None else int(now)
    e = hex(t).upper()[2:]
    i = hashlib.md5(str(t).encode(encoding='utf-8')).hexdigest().upper()
    if len(e) != 8:
        return '479BB4B7254C150', '7E0AC8874BB0985'
    n = i[0:5]
    a = i[-5:]
    s = ''.join(n[o] + e[o] for o in range(5))
    r = ''.join(e[o + 3] + a[o] for o in range(5))
    # NOTE(review): other published versions of this algorithm use the prefix
    # 'A1' here; 'AL' is kept exactly as found -- confirm against the site.
    AS = 'AL' + s + e[-3:]
    CP = e[0:3] + r + 'E1'
    return AS, CP


def download(title, news_url):
    """Save the Chinese text of one Toutiao article to ``<title>.txt``.

    Args:
        title: Headline; the characters ``: " | / \\ * < > ?`` are removed and
            the result is used as the file name.
        news_url: Address of the article page.

    Returns:
        0 when the page cannot be fetched or has no ``content:`` field;
        None after the file has been written.
    """
    from urllib.error import HTTPError

    try:
        req = request.urlopen(news_url)
    except HTTPError:
        # urlopen raises for non-2xx answers, so the status check below would
        # never see them; treat them the same way as a non-200 status.
        return 0
    with req:
        if req.getcode() != 200:
            return 0
        res = req.read().decode('utf-8')

    pat1 = r'content:(.*?),'
    pat2 = re.compile('[\u4e00-\u9fa5]+')
    result1 = re.findall(pat1, res)
    if len(result1) == 0:
        return 0
    print(result1)
    result2 = re.findall(pat2, str(result1))
    # Drop repeated fragments while keeping first-seen order.
    result3 = list(dict.fromkeys(result2))

    title = title.translate(_FILENAME_FORBIDDEN)
    # Explicit UTF-8 so the write does not depend on the platform default
    # encoding (the sibling crawlers in this article also write UTF-8).
    with open(r'D:\code\python\spider_news\Toutiao_news\society\\' + title + '.txt', 'w',
              encoding='utf-8') as file_object:
        file_object.write('\t\t\t\t')
        file_object.write(title)
        file_object.write('\n')
        file_object.write('该新闻地址:')
        file_object.write(news_url)
        file_object.write('\n')
        for i in result3:
            file_object.write(i)
            file_object.write('\n')


def get_item(url):
    """Fetch one page of the feed and download every non-advert article in it.

    Args:
        url: Feed API URL produced by ``get_url``.

    Returns:
        The ``max_behot_time`` cursor from the response's ``next`` object, to
        be used for the following page.
    """
    cookies = {'tt_webid': ''}
    # A timeout keeps one stalled connection from hanging the whole crawl.
    wbdata = requests.get(url, cookies=cookies, timeout=30)
    wbdata2 = json.loads(wbdata.text)
    data = wbdata2['data']
    for news in data:
        title = news['title']
        news_url = news['source_url']
        news_url = 'https://www.toutiao.com' + news_url
        print(title, news_url)
        if 'ad_label' in news:
            # Adverts are listed in the feed but are not articles.
            print(news['ad_label'])
            continue
        download(title, news_url)
    next_data = wbdata2['next']
    next_max_behot_time = next_data['max_behot_time']
    return next_max_behot_time


if __name__ == '__main__':
    refresh = 50
    # The first request uses cursor 0; each later one continues from the
    # cursor returned by the previous page.
    max_behot_time = 0
    for x in range(0, refresh + 1):
        print('第{0}次:'.format(x))
        AS, CP = get_ASCP()
        url = get_url(max_behot_time, AS, CP)
        max_behot_time = get_item(url)
 

四、UC

UC和新浪差不多,没有太复杂的反爬虫,直接解析爬取就好。

from bs4 import BeautifulSoup
from urllib import request

# Characters removed from a headline before it is used as a file name.
_FILENAME_FORBIDDEN = str.maketrans('', '', ':"|/\\*<>?')


def download(title, url):
    """Save the text of one UC article to ``<title>.txt``.

    Args:
        title: Headline; the characters ``: " | / \\ * < > ?`` are removed and
            the result is used as the file name.
        url: Address of the article page.

    Returns:
        0 when the page has no ``div.sm-article-content`` element;
        None after the file has been written.
    """
    req = request.Request(url)
    with request.urlopen(req) as response:
        page = response.read().decode('utf-8')
    soup = BeautifulSoup(page, 'lxml')
    tag = soup.find('div', class_='sm-article-content')
    if tag is None:
        return 0
    title = title.translate(_FILENAME_FORBIDDEN)
    with open(r'D:\code\python\spider_news\UC_news\society\\' + title + '.txt', 'w', encoding='utf-8') as file_object:
        file_object.write('\t\t\t\t')
        file_object.write(title)
        file_object.write('\n')
        file_object.write('该新闻地址:')
        file_object.write(url)
        file_object.write('\n')
        file_object.write(tag.get_text())


if __name__ == '__main__':
    # The index URL is identical on every pass, so remember what has been
    # downloaded already and only fetch articles that are new in this pass.
    seen = set()
    for i in range(0, 7):
        url = 'https://news.uc.cn/c_shehui/'
        with request.urlopen(url) as res:
            req = res.read().decode('utf-8')
        soup = BeautifulSoup(req, 'lxml')
        for x in soup.find_all('div', class_='txt-area-title'):
            link = x.a
            if link is None:
                continue
            href = link.get('href')
            if not href:
                continue
            news_url = 'https://news.uc.cn' + href
            if news_url in seen:
                continue
            seen.add(news_url)
            # .string is None when the anchor holds nested markup; fall back
            # to the concatenated text so download() receives a real string.
            news_title = link.string or link.get_text()
            print(news_title, news_url)
            download(news_title, news_url)
 

Python3:爬取新浪、网易、今日头条、UC四大网站新闻标题及内容的更多相关文章

  1. selenium+BeautifulSoup+phantomjs爬取新浪新闻

    一 下载phantomjs,把phantomjs.exe的文件路径加到环境变量中,也可以phantomjs.exe拷贝到一个已存在的环境变量路径中,比如我用的anaconda,我把phantomjs. ...

  2. python3爬虫-爬取新浪新闻首页所有新闻标题

    准备工作:安装requests和BeautifulSoup4.打开cmd,输入如下命令 pip install requests pip install BeautifulSoup4 打开我们要爬取的 ...

  3. python3使用requests爬取新浪热门微博

    微博登录的实现代码来源:https://gist.github.com/mrluanma/3621775 相关环境 使用的python3.4,发现配置好环境后可以直接使用pip easy_instal ...

  4. Python 爬虫实例(7)—— 爬取 新浪军事新闻

    我们打开新浪新闻,看到页面如下,首先去爬取一级 url,图片中蓝色圆圈部分 第二zh张图片,显示需要分页, 源代码: # coding:utf-8 import json import redis i ...

  5. 网站爬取-案例三:今日头条抓取(ajax抓取JS数据)

    今日头条这类的网站制作,从数据形式,CSS样式都是通过数据接口的样式来决定的,所以它的抓取方法和其他网页的抓取方法不太一样,对它的抓取需要抓取后台传来的JSON数据,先来看一下今日头条的源码结构:我们 ...

  6. python2.7 爬虫初体验爬取新浪国内新闻_20161130

    python2.7 爬虫初学习 模块:BeautifulSoup requests 1.获取新浪国内新闻标题 2.获取新闻url 3.还没想好,想法是把第2步的url 获取到下载网页源代码 再去分析源 ...

  7. python爬取新浪股票数据—绘图【原创分享】

    目标:不做蜡烛图,只用折线图绘图,绘出四条线之间的关系. 注:未使用接口,仅爬虫学习,不做任何违法操作. """ 新浪财经,爬取历史股票数据 ""&q ...

  8. 【python3】爬取新浪的栏目分类

    目标地址: http://www.sina.com.cn/ 查看源代码,分析: 1 整个分类 在 div main-nav 里边包含 2 分组情况:1,4一组 . 2,3一组 . 5 一组 .6一组 ...

  9. xpath爬取新浪天气

    参考资料: http://cuiqingcai.com/1052.html http://cuiqingcai.com/2621.html http://www.cnblogs.com/jixin/p ...

随机推荐

  1. Java编程基本概念

    1.标识符 ①用于给变量.类和方法命名(类名首字母大写,变量和方法名首字母小写并遵循驼峰原则)②标识符的命名规范: ■标识符必须以字母.下划线和美元符$开头. ■标识符其他部分可以是字母.下划线.美元 ...

  2. Xcode模版生成文件头部注释

    在使用Xcode创建工程或者新建类的时候,顶部都会有一些xcode帮我们生成的注释 //// MySingletonClass.h// 单例模式//// Created by mark on 15/8 ...

  3. poj_1037 动态规划+字典序第k大

    题目大意 给定n个数字,规定一种 cute 排序:序列中的数字大小为严格的波浪形,即 a[0] > a[1] < a[2] > a[3] < .... 或者 a[0] < ...

  4. SenchaTouch调用纯数字键盘

    items:[ { itemId:"phoneNumber", xtype: "textfield", component:{xtype:"input ...

  5. 高中生的IT之路-1.2离开校园

    记得那是07年夏季的一天,高考成绩出来之后,班主任老师通知大家回学校报考志愿. 那天我刚到学校会议室,我还没来得及和同学见面就被班主任喊过去了,把志愿表递给我了我,我当时连仔细看那张志愿表都没看,随手 ...

  6. Thinkphp --- 去掉index.php

    这里我使用的面板是宝塔,操作的 apche: 具体的配置可以参考这里: https://www.cnblogs.com/fangziffff123/p/7588782.html 首先是:Thinkph ...

  7. RMAN概述及其体系结构

    1 Recovery Manager(RMAN)特性 是一种用于集备份(backup).还原(restore)和恢复(recover)数据库于一体的Oracle 工具,支持命令行及图形界面操作 能够备 ...

  8. 三维凸包求重心到面的最短距离(HDU4273)

    http://acm.hdu.edu.cn/showproblem.php?pid=4273 Rescue Time Limit: 2000/1000 MS (Java/Others)    Memo ...

  9. linux .zip 解压命令集

    zip: 压缩: zip [-AcdDfFghjJKlLmoqrSTuvVwXyz$][-b <工作目录>][-ll][-n <字尾字符串>][-t <日期时间>] ...

  10. Python大数据:jieba 中文分词,词频统计

    # -*- coding: UTF-8 -*- import sys import numpy as np import pandas as pd import jieba import jieba. ...