Scraping YY Rating Information
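The script below pages through the ratingdog.cn search API, and for every issuer it finds it fetches the detail record from the matching GetIssuerInfo endpoint (IssuerType 1001 for industrial/产业 issuers, 1002 for LGFV/城投 issuers), then appends the fields to a CSV file per issuer type.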
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : 爬取YY评级基本信息.py
# @Author: lattesea
# @Date : 2019/10/7
# @Desc :
import requests
import json
import csv
from fake_useragent import UserAgent
import time
import random


class YYpingjiSpider(object):
    def __init__(self):
        # Paged search endpoint; {} takes the record offset.
        self.url = 'https://api.ratingdog.cn/v1/search?limit=10&offset={}&type=3&qtext=&filter=%7B%7D&_=1570391570681'
        # Issuer detail endpoints: IssuerType 1001 = industrial (产业), 1002 = LGFV (城投).
        self.url2 = 'https://api.ratingdog.cn/v1/GetIssuerInfo?IssuerID={}&IssuerType=1001'
        self.url3 = 'https://api.ratingdog.cn/v1/GetIssuerInfo?IssuerID={}&IssuerType=1002'

    def get_headers(self):
        # A fresh random User-Agent per request makes the traffic look less uniform.
        ua = UserAgent()
        headers = {
            "Accept": "application/json, text/plain, */*",
            "Origin": "https://www.ratingdog.cn",
            "Referer": "https://www.ratingdog.cn/",
            "Sec-Fetch-Mode": "cors",
            "User-Agent": ua.random
        }
        return headers

    def parse_IssuerID_IssuerType(self, url):
        # One page of search results; collect (IssuerID, IssuerType) pairs.
        IssuerID_list = []
        html_json = requests.get(url=url, headers=self.get_headers()).text
        html_py = json.loads(html_json)
        for i in html_py['rows']:
            IssuerID_list.append((i['IssuerID'], i['IssuerType']))
        print(IssuerID_list)
        return IssuerID_list

    def parse_basic_message_1002(self, IssuerID):
        # Detail record for a 城投 (LGFV) issuer.
        url = self.url3.format(IssuerID)
        html_json = requests.get(url=url, headers=self.get_headers()).text
        html_py = json.loads(html_json)
        rows = html_py['rows']  # 'rows' is a single dict here, so no loop is needed
        ext = rows['CtExtendInfo']
        basic_message = {
            'IssuerName': rows['IssuerName'],
            'CorporateRating': rows['CorporateRating'],
            'RatingAgency': rows['RatingAgency'],
            'Holder': rows['Holder'],
            'Industry': rows['Industry'],
            'Nature': rows['Nature'],
            'YYRating': rows['YYRating'],
            'IssuerType': rows['IssuerType'],
            'CreditAnalysis': rows['CreditAnalysis'],
            'PlatformImportance': ext['PlatformImportance'],
            'PrincipalBusiness': ext['PrincipalBusiness'],
            'GDP': ext['GDP'],
            'Revenue': ext['Revenue'],
            'YYRatio': ext['YYRatio'],
            'IssuerCity': ext['IssuerCity'],
            'ADLevel': ext['ADLevel'],
        }
        print(basic_message)
        return basic_message

    def parse_basic_message_1001(self, IssuerID):
        # Detail record for a 产业 (industrial) issuer.
        url = self.url2.format(IssuerID)
        html_json = requests.get(url=url, headers=self.get_headers()).text
        html_py = json.loads(html_json)
        rows = html_py['rows']  # 'rows' is a single dict here, so no loop is needed
        ext = rows['CyExtendInfo']
        basic_message = {
            'IssuerName': rows['IssuerName'],
            'CorporateRating': rows['CorporateRating'],
            'RatingAgency': rows['RatingAgency'],
            'Holder': rows['Holder'],
            'Industry': rows['Industry'],
            'Nature': rows['Nature'],
            'YYRating': rows['YYRating'],
            'IssuerType': rows['IssuerType'],
            'CreditAnalysis': rows['CreditAnalysis'],
            'YYIndustry': ext['YYIndustry'],
            'YYIndustryId': ext['YYIndustryId'],
            'IndustrylStatus': ext['IndustrylStatus'],
            'ShareholderBackground': ext['ShareholderBackground'],
            'OperatingStatus': ext['OperatingStatus'],
            'FinancialStatus': ext['FinancialStatus'],
            'Focus': ext['Focus'],
        }
        print(basic_message)
        return basic_message

    def save_csv_1001(self, result):
        keyword_list1 = ['IssuerName', 'CorporateRating', 'RatingAgency', 'Holder', 'Industry', 'Nature', 'YYRating',
                         'IssuerType', 'CreditAnalysis', 'YYIndustry', 'YYIndustryId', 'IndustrylStatus',
                         'ShareholderBackground', 'OperatingStatus', 'FinancialStatus', 'Focus']
        # utf-8 keeps the Chinese field values intact regardless of platform default codec.
        with open('1001.csv', 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, keyword_list1)
            writer.writerow(result)

    def save_csv_1002(self, result):
        # The original field list repeated 'PrincipalBusiness'; it is listed once here.
        keyword_list2 = ['IssuerName', 'CorporateRating', 'RatingAgency', 'Holder', 'Industry', 'Nature', 'YYRating',
                         'IssuerType', 'CreditAnalysis', 'PlatformImportance', 'PrincipalBusiness',
                         'GDP', 'Revenue', 'YYRatio', 'IssuerCity', 'ADLevel']
        with open('1002.csv', 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, keyword_list2)
            writer.writerow(result)

    def run(self):
        # 4631 issuers at crawl time; the search URL requests 10 per page (limit=10),
        # so the offset steps by 10 (the original stepped by 20, which skips every other page).
        for i in range(0, 4631, 10):
            url = self.url.format(i)
            IssuerID_IssuerType = self.parse_IssuerID_IssuerType(url)
            for j in IssuerID_IssuerType:
                if j[1] == '产业':
                    result = self.parse_basic_message_1001(j[0])
                    self.save_csv_1001(result)
                elif j[1] == '城投':
                    result = self.parse_basic_message_1002(j[0])
                    self.save_csv_1002(result)
                # Random delay between detail requests; the site bans accounts that hit it too fast.
                time.sleep(random.uniform(1, 4))


if __name__ == '__main__':
    spider = YYpingjiSpider()
    spider.run()
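One rough edge the script leaves open: both save_csv methods append rows with DictWriter but never write a header row, so the CSVs come out headerless. A minimal sketch of one way to handle it, writing the header only when the file is new or empty (the helper name write_row_with_header is made up for illustration, not part of the original script):

import csv
import os


def write_row_with_header(path, fieldnames, row):
    """Append one row; write the header line first if the file is new or empty."""
    need_header = not os.path.exists(path) or os.path.getsize(path) == 0
    with open(path, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames)
        if need_header:
            writer.writeheader()
        writer.writerow(row)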
The main pitfall with this site is that too high a request rate will get your account banned.
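Given that, the random sleep alone is fragile: one throttled response still kills the run. A hedged sketch of a politer fetch helper with retries and exponential backoff on errors or HTTP 429/403 (the name polite_get and its parameters are illustrative assumptions, not part of the original script):

import random
import time

import requests


def polite_get(url, headers, max_retries=3, base_delay=2.0):
    """GET with exponential backoff; returns parsed JSON, or None after max_retries."""
    for attempt in range(max_retries):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            if resp.status_code in (403, 429):  # likely throttled or blocked
                time.sleep(base_delay * 2 ** attempt + random.uniform(0, 1))
                continue
            resp.raise_for_status()
            return resp.json()
        except requests.RequestException:
            # Network error or bad status: back off, then retry.
            time.sleep(base_delay * 2 ** attempt + random.uniform(0, 1))
    return None

Swapping the bare requests.get calls in the spider for this helper would let a long crawl survive transient throttling instead of dying mid-run.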