爬取沪深A股数据

首先从东方财富网获取股票代码

再从网易财经下载股票历史数据

import requests

import random

from bs4 import BeautifulSoup as bs

import time

#import redis

import re

import json

def get_stock_names():
    """Scrape stock name/code labels from Eastmoney into Redis and a text file.

    Downloads the Eastmoney stock-list page, finds every ``<a>`` element inside
    the ``<ul>`` lists of the ``div#table_wrapper-table`` container, and for
    each link text:

    * right-pushes it onto the Redis list ``stock_names`` (db 1), and
    * writes it as one line of ``stock_names.txt`` in the current working
      directory (the file is overwritten on every run).

    Returns:
        None.

    Raises:
        ImportError: if the ``redis`` package is not installed.
        AttributeError: if the expected container ``div`` is not in the page.
        requests.RequestException: on network failure or timeout.
    """
    # Imported locally because the module-level ``import redis`` is commented
    # out; without this the ``redis.from_url`` call below raises NameError.
    import redis

    # NOTE(review): the Redis URL (including its password) is hard-coded;
    # consider moving it to configuration.
    rds = redis.from_url('redis://:666666@192.168.3.98:6379', db=1, decode_responses=True)   # connect to redis db1
    url = "http://quote.eastmoney.com/stocklist.html"
    headers = {
            'Referer': 'http://quote.eastmoney.com/center/gridlist.html',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
    # A timeout keeps the script from hanging forever on a stalled connection.
    # NOTE(review): the original comment said the site is GBK-encoded, yet the
    # bytes are decoded as UTF-8 — confirm which encoding the page really uses.
    response = requests.get(url, headers=headers, timeout=30).content.decode('utf-8')
    soup = bs(response, 'lxml')
    all_ul = soup.find('div', id='table_wrapper-table').find_all('ul')   # the <ul> lists holding the stock links
    # The file is only written, never read back, so plain 'w' is sufficient.
    with open('stock_names.txt', 'w', encoding='utf-8') as f:
        for ul in all_ul:
            for a in ul.find_all('a'):          # every <a> under this <ul>
                rds.rpush('stock_names', a.text)       # append the link text to the right end of the Redis list
                f.write(a.text + '\n')

def _netease_code(stock_code):
    """Return the code to use in the NetEase download URL, or None to skip.

    Per the rules this script was written against: Shanghai stocks start with
    6 or 9 and are prefixed with '0'; Shenzhen stocks start with 0, 2 or 3 and
    are prefixed with '1'. Codes starting with 201/202/203/204 are funds and
    are skipped, as is every other prefix.
    """
    if stock_code.startswith(('6', '9')):
        return '0' + stock_code
    if stock_code.startswith(('0', '2', '3')):
        if stock_code.startswith(('201', '202', '203', '204')):
            return None
        return '1' + stock_code
    return None


def get_data(stocklist, outfile=r'D:\PycharmProjects\web_scraping\stockdata'):
    """Download historical trading data from NetEase Finance, one CSV per stock.

    For every accepted stock the function first loads the stock's history page
    to read the default start/end dates from its ``date_start_type`` and
    ``date_end_type`` inputs, pauses 1-2 seconds, then downloads the CSV for
    that date range and writes it to ``<outfile>/<stock_code>.csv``.

    Failures are handled per stock: any exception is printed and the loop moves
    on to the next entry, so one bad stock does not abort the whole run.

    Args:
        stocklist: iterable of ``(stock_code, stock_name)`` pairs, where
            ``stock_code`` is a string of digits.
        outfile: directory that receives the CSV files. Despite the name it is
            a directory, not a file; it is created if it does not exist.

    Returns:
        None.
    """
    headers = {
        'Referer': 'http://quotes.money.163.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    for stock_code, stock_name in stocklist:
        try:
            stock_code_new = _netease_code(stock_code)
            if stock_code_new is None:
                continue  # fund or unsupported prefix: nothing to download

            # Create the target directory on demand so the open() below cannot
            # fail merely because the folder is missing.
            os.makedirs(outfile, exist_ok=True)

            # The history page uses the plain code; only the download service
            # needs the market-prefixed one.
            stock_url = 'http://quotes.money.163.com/trade/lsjysj_{}.html'.format(stock_code)
            page = requests.get(stock_url, headers=headers, timeout=30)
            page.raise_for_status()
            soup = bs(page.text, 'lxml')
            start_time = soup.find('input', {'name': 'date_start_type'}).get('value').replace('-', '')  # start date as YYYYMMDD
            end_time = soup.find('input', {'name': 'date_end_type'}).get('value').replace('-', '')  # end date as YYYYMMDD
            time.sleep(random.choice([1, 2]))  # wait 1-2 seconds between the two requests
            download_url = "http://quotes.money.163.com/service/chddata.html?code={}&start={}&end={}&fields=TCLOSE;HIGH;LOW;TOPEN;LCLOSE;CHG;PCHG;TURNOVER;VOTURNOVER;VATURNOVER;TCAP;MCAP".format(stock_code_new, start_time, end_time)
            data = requests.get(download_url, headers=headers, timeout=30)
            # Do not save an HTTP error page as if it were CSV data.
            data.raise_for_status()
            # os.path.join instead of a hard-coded '\\' so this also works
            # outside Windows.
            csv_path = os.path.join(outfile, '{}.csv'.format(stock_code))
            with open(csv_path, 'wb') as f:
                for chunk in data.iter_content(chunk_size=10000):  # write the body in chunks
                    if chunk:
                        f.write(chunk)
            print("{}数据已下载".format(stock_code))
        except Exception as e:
            # Deliberately broad: report the failure and keep going.
            print("{}({})数据下载报错".format(stock_name, stock_code))
            print(e)

import os

# 获取目录下所有文件，绝对路径

# 方法一

def file_name(file_dir):
    """Return the paths of all ``.jpeg`` files found anywhere under *file_dir*.

    The tree is traversed with ``os.walk``; each returned path is the walked
    directory joined with the file name. Only the exact, case-sensitive
    extension ``.jpeg`` matches.

    Args:
        file_dir: root directory to search.

    Returns:
        A list of matching file paths (empty when nothing matches).
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(file_dir)
        for name in names
        if os.path.splitext(name)[1] == '.jpeg'
    ]

# 方法二

def listdir(path, list_name):
    """Recursively collect ``.jpeg`` file paths under *path* into *list_name*.

    Args:
        path: directory to scan.
        list_name: list that is extended in place with every matching path.

    Returns:
        None; results are delivered through *list_name*.
    """
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if os.path.isdir(full_path):
            # os.listdir does not look into sub-directories, so recurse.
            listdir(full_path, list_name)
            continue
        _stem, extension = os.path.splitext(full_path)
        if extension == '.jpeg':
            list_name.append(full_path)

# Paginated Eastmoney quote-list endpoint; ``{}`` receives the 1-based page
# number. The URL is assembled from adjacent string literals so that no newline
# or indentation whitespace ends up inside the query string (a triple-quoted
# literal spanning several lines embeds both).
list_url = (
    'http://1.push2.eastmoney.com/api/qt/clist/get'
    '?cb=jQuery112405721872315676919_1566176986516&pn={}'
    '&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3'
    '&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2'
    '&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,'
    'f23,f24,f25,f22,f11,f62,f128,f136,f115,f152'
    '&_=1566176986517'
)

# [code, name] pairs. Original author's note: 3770 stocks, and only codes
# starting with '0', '3' or '6' were returned.
stocklist = []
# Number of pages to request; presumably sized for 20 rows per page (pz=20).
max_page = 189

for i in range(max_page):
    url = list_url.format(i + 1)
    response = requests.get(url, timeout=30).content.decode('utf-8')
    # The endpoint answers with JSONP, i.e. ``callback({...});``. Take what lies
    # between the first '(' and the last ')' so this does not depend on the
    # exact callback name or on how the response ends.
    json_text = json.loads(response[response.index('(') + 1:response.rindex(')')])
    page_data = json_text.get('data')
    if not page_data:
        # No data block — presumably past the last page; stop instead of
        # failing on a None subscript.
        break
    for fi in page_data['diff']:
        # f12 / f14 are consumed by get_data as stock code / stock name.
        stocklist.append([fi['f12'], fi['f14']])

# Download the historical data for every collected stock.
get_data(stocklist, outfile=r'D:\PycharmProjects\web_scraping\stockdata')

参考资料：

爬虫：爬取股票历史交易数据

爬取东方财富股票信息网

Python爬虫（5）：比Selenium快100倍的方法爬东方财富网财务报表

爬取沪深a股数据的更多相关文章

[python]初试页面抓取——抓取沪深股市交易龙虎榜数据
[python]抓取沪深股市交易龙虎榜数据 python 3.5.0下运行没做自动建立files文件夹,需要手动在py文件目录下建立files文件夹后运行 #coding=utf-8 import ...
python 爬取天猫美的评论数据
笔者最近迷上了数据挖掘和机器学习,要做数据分析首先得有数据才行.对于我等平民来说,最廉价的获取数据的方法,应该是用爬虫在网络上爬取数据了.本文记录一下笔者爬取天猫某商品的全过程,淘宝上面的店铺也是类似 ...
爬虫（二）Python网络爬虫相关基础概念、爬取get请求的页面数据
什么是爬虫爬虫就是通过编写程序模拟浏览器上网,然后让其去互联网上抓取数据的过程. 哪些语言可以实现爬虫 1.php:可以实现爬虫.php被号称是全世界最优美的语言(当然是其自己号称的,就是王婆 ...
Python网络爬虫第三弹《爬取get请求的页面数据》
一.urllib库 urllib是Python自带的一个用于爬虫的库,其主要作用就是可以通过代码模拟浏览器发送请求.其常被用到的子模块在Python3中的为urllib.request和urllib. ...
Python爬虫《爬取get请求的页面数据》
一.urllib库 urllib是Python自带的一个用于爬虫的库,其主要作用就是可以通过代码模拟浏览器发送请求.其常被用到的子模块在Python3中的为urllib.request和urllib. ...
02. 爬取get请求的页面数据
目录 02. 爬取get请求的页面数据一.urllib库二.由易到难的爬虫程序: 02. 爬取get请求的页面数据一.urllib库 urllib是Python自带的一个用于爬虫的库,其主要作用 ...
Python3爬取王者官方网站英雄数据
爬取王者官方网站英雄数据众所周知,王者荣耀已经成为众多人们喜爱的一款休闲娱乐手游,今天就利用python3 爬虫技术爬取官方网站上的几十个英雄的资料,包括官方给出的人物定位,英雄名称,技能名称,CD ...
python爬虫爬取get请求的页面数据代码样例
废话不多说,上代码 #!/usr/bin/env python # -*- coding:utf-8 -*- # 导包 import urllib.request import urllib.pars ...
python网络爬虫第三弹(<爬取get请求的页面数据>)
一.urllib库 urllib是python自带的一个用于爬虫的库,其主要作用就是通过代码模拟浏览器发送请求,其常被用到的子模块在 python3中的为urllib.request 和 urllib ...

随机推荐

Prometheus学习系列（九）之Prometheus 联盟、迁移
前言本文来自Prometheus官网手册和 Prometheus简介 FEDERATION 允许Prometheus服务器从另一台Prometheus服务器抓取选定的时间序列. 一,用例联盟有不 ...
HALCON数据类型和C#对应数据类型的对比
摘要:HALCON数据类型:Iconic Variables(图形变量).Control Variables(控制变量).在C#中,图形变量用HObject声明,控制变量用HTuple声明.(halc ...
C# 调用OpenCVSharp报错“尝试读取或写入受保护的内存。这通常指示其他内存已损坏”
一.描述问题当托管代码调用非托管代码的时候,经常会出现如下报错:“尝试读取或写入受保护的内存.这通常指示其他内存已损坏”. 二.原因分析由于非托管代码的内存指针的回收是由非托管代码自身手动完成的, ...
HeadFirst设计模式<1>
HeadFirst设计模式<1> 1 策略模式鸭子飞行和嘎嘎叫策略 2 工厂模式简单工厂工厂方法抽象工厂简单工厂简单的pizza工厂通过一个工厂类的方法,创建和返回对象实例原 ...
asp.net允许跨域配置web.config
<configuration> <system.webServer> <modules> <add name="CultureAwareHttpMo ...
微信小程序连接低功率蓝牙控制单片机上硬件设备
1.软件部分介绍微信小程序是一种新的应用,用户不需要下载应用只用通过扫二维码或者打开链接就能使用,使用完后不需要卸载,直接关闭就行了.微信在2017年初推出微信小程序开发环境.任何企业,媒体,个人都 ...
【Java基础】String 相关知识点总结
String 相关知识点总结字符串的不可变性概述 String 被声明为 final,因此它不可继承在 Java8 中,String 内部使用 char 数组存储数据 public final ...
Redis集群模式下的redis-py-cluster方式读写测试
与MySQL主从复制,从节点可以分担部分读压力不一样,甚至可以增加slave或者slave的slave来分担读压力,Redis集群中的从节点,默认是不分担读请求的,从节点只作为主节点的备份,仅负责故障 ...
梁敬彬老师的《收获，不止SQL优化》，关于如何缩短SQL调优时间，给出了三个步骤，
梁敬彬老师的<收获,不止SQL优化>,关于如何缩短SQL调优时间,给出了三个步骤, 1. 先获取有助调优的数据库整体信息 2. 快速获取SQL运行台前信息 3. 快速获取SQL关联幕后信息 ...
CSS3 更改字体被选中样式
CSS3 更改字体被选中样式

爬取沪深a股数据

爬取沪深a股数据的更多相关文章

随机推荐

热门专题