CFDA

cfda数据抓取

1.网站数据是加密的,需要浏览器进行数据解析

2.网址url有js加密

3.PhantomJS无法解析数据, chrome无法获取数据,所以最终选择用Firefox浏览器

import pymysql

import time

import uuid

from lxml import etree

import logging

from selenium import webdriver

import threading

import queue

import re

# Send INFO-and-above records to shengchan.log; mode "w" truncates the file
# at the start of every run.
logging.basicConfig(
    level=logging.INFO,
    filename='shengchan.log',
    filemode="w",
)

class App1Spider(object):
    """Selenium-driven scraper for the CFDA drug-manufacturer listing.

    The spider opens the listing index in headless Firefox, pages through the
    result list with the site's ``devPage`` JavaScript function, opens each
    row's detail view, extracts the licence fields with XPath and stores them
    in a MySQL table, one row per record.
    """

    # Table that receives one row per scraped record.
    TABLE = 'cfda_drug_company20181205'

    # Listing entry point.
    INDEX_URL = "http://app1.sfda.gov.cn/datasearch/face3/base.jsp?tableId=34&tableName=TABLE34&title=%D2%A9%C6%B7%C9%FA%B2%FA%C6%F3%D2%B5&bcId=118103348874362715907884020353"

    # Paging defaults (previously hard-coded inside main()).
    ROWS_PER_PAGE = 15
    LAST_PAGE = 519
    # NOTE(review): the original only scraped details when ``page > 511``;
    # this looks like a resume point left over from an interrupted run.
    # Pass ``detail_from_page=1`` to main() to scrape every page.
    DETAIL_FROM_PAGE = 512

    # Sleep durations in seconds.
    INDEX_WAIT = 3    # after requesting the index page
    PAGE_WAIT = 2     # after every in-page JavaScript navigation
    RETRY_WAIT = 30   # back-off after a failed index load

    # XPath of the value cell in row ``%d`` of the detail table.
    _CELL_XPATH = '//*[@id="content"]/div/div/table[1]/tbody/tr[%d]/td[2]/text()'

    # (database column, detail-table row, fallback when the cell is missing),
    # listed in the column order of the INSERT statement.
    _FIELDS = (
        ('number', 2, '00000000'),              # manufacturer number
        ('manufactureAddress', 11, ''),         # production address
        ('manufactureRange', 12, ''),           # production scope
        ('certificateDate', 13, '2018-01-01'),  # issue date
        ('validityDate', 14, '2018-01-01'),     # valid until
        ('certificateOrgan', 15, ''),           # issuing authority
        ('Signer', 16, ''),                     # signer
        ('superviseAgency', 17, ''),            # routine supervision agency
        ('superviser', 18, ''),                 # routine supervision staff
        ('socialCreditCode', 3, ''),            # social credit / organisation code
        ('reportTel', 19, ''),                  # complaint hotline
        ('comment', 20, ''),                    # remarks
        ('classificationCode', 4, ''),          # classification code
        ('province', 5, ''),                    # province
        ('companyName', 6, ''),                 # company name
        ('legalPeople', 7, ''),                 # legal representative
        ('companyResponsioner', 8, ''),         # person in charge
        ('qualityResponsioner', 9, ''),         # quality manager
        ('registerAddress', 10, ''),            # registered address
    )

    def __init__(self, host='', port=3306, database='', user='', password=''):
        """Connect to MySQL and start a headless Firefox session.

        :param host: MySQL host name.
        :param port: MySQL TCP port (3306 is the MySQL default).
        :param database: schema that contains ``TABLE``.
        :param user: MySQL user name.
        :param password: MySQL password.
        """
        self.db = pymysql.connect(host=host, port=port, database=database,
                                  user=user, password=password, charset='utf8')
        self.cursor = self.db.cursor()

        self.options = webdriver.FirefoxOptions()
        self.options.add_argument('--headless')
        # Firefox counterparts of the Chrome/Blink-only switches the original
        # passed (window-size, blink-settings=imagesEnabled=false).
        self.options.add_argument('--width=1440')
        self.options.add_argument('--height=900')
        # 2 = block image loading.
        self.options.set_preference('permissions.default.image', 2)
        try:
            # ``options=`` replaces the deprecated ``firefox_options=`` keyword.
            self.browser = webdriver.Firefox(options=self.options)
        except Exception:
            # Do not leak the DB connection when the browser cannot start.
            self.db.close()
            raise

    def close(self):
        """Quit the browser and close the database connection.

        Errors raised while shutting down are logged and otherwise ignored so
        that both resources are always attempted.
        """
        browser = getattr(self, 'browser', None)
        if browser is not None:
            try:
                browser.quit()
            except Exception:
                logging.exception("!-- error to quit browser --!")
        db = getattr(self, 'db', None)
        if db is not None:
            try:
                db.close()
            except Exception:
                logging.exception("!-- error to close database --!")

    def main(self, start_page=1, last_page=None, detail_from_page=None):
        """Run the crawl and release the browser/database when it ends.

        The index page is (re)loaded until it succeeds. Whenever a page jump
        fails the index is reloaded and the crawl resumes one page before the
        failing one (never below the first page); rows that are already stored
        are skipped by parse_detail().

        :param start_page: first page to visit (1-based).
        :param last_page: last page to visit, inclusive; defaults to
            ``LAST_PAGE``.
        :param detail_from_page: first page whose detail rows are scraped;
            earlier pages are only paged through. Defaults to
            ``DETAIL_FROM_PAGE``.
        :return: None, once ``last_page`` has been processed.
        """
        if last_page is None:
            last_page = self.LAST_PAGE
        if detail_from_page is None:
            detail_from_page = self.DETAIL_FROM_PAGE
        first_page = max(start_page, 1)
        next_page = first_page
        try:
            while True:
                browser = self.go_index()
                if not browser:
                    # go_index() has already slept before reporting failure.
                    continue
                failed_page = self._crawl_pages(browser, next_page, last_page,
                                                detail_from_page)
                if failed_page is None:
                    # Every page was processed: stop instead of restarting.
                    return None
                # Step back one page, but never before the first page (the
                # original could reach page 0 and then negative numbers).
                next_page = max(failed_page - 1, first_page)
        finally:
            self.close()

    def _crawl_pages(self, browser, first_page, last_page, detail_from_page):
        """Visit pages first_page..last_page; return the page that failed, or None."""
        for page in range(first_page, last_page + 1):
            browser = self.go_page(browser, page)
            if not browser:
                return page
            if page < detail_from_page:
                continue
            for row in range(self.ROWS_PER_PAGE):
                detail_html = self.go_detail(browser, row)
                if not detail_html:
                    # No (more) usable rows on this page.
                    break
                self.parse_detail(detail_html, self._record_id(page, row))
        return None

    def _record_id(self, page, row):
        """Return the 1-based sequence number of ``row`` (0-based) on ``page``."""
        return (page - 1) * self.ROWS_PER_PAGE + row + 1

    def go_index(self):
        """Load the listing index page.

        :return: the browser object when the expected marker text is found in
            the page, otherwise None (after sleeping ``RETRY_WAIT`` seconds).
        """
        try:
            self.browser.get(self.INDEX_URL)
            time.sleep(self.INDEX_WAIT)
            html = self.browser.page_source
        except Exception:
            logging.exception("!-- error to get index page --!")
            time.sleep(self.RETRY_WAIT)
            return None

        # Marker text expected in a successfully loaded index page.
        if re.search(r"管理局--数据查询", html):
            return self.browser

        logging.info("!-- error to get index page --!")
        time.sleep(self.RETRY_WAIT)
        return None

    def go_page(self, browser, page):
        """Jump to a page of the result list.

        :param browser: browser object currently showing the listing.
        :param page: page number to jump to.
        :return: the browser object when the page reports the requested page
            number, otherwise None.
        """
        print("!-- start page %s --!" % page)
        go_page_js = 'location.href="javascript:devPage(%s)";' % page
        try:
            browser.execute_script(go_page_js)
            # Give Firefox time to finish loading the page.
            time.sleep(self.PAGE_WAIT)
            html = browser.page_source
        except Exception:
            print("!-- error to go page %s --!" % page)
            logging.exception("!-- error to go page %s --!", page)
            return None

        # The pager text "第 N 页" ("page N") confirms the jump happened.
        if re.search(r"第 %s 页" % page, html):
            logging.info("!-- success to go page %s --!" % page)
            return browser

        logging.info("!-- error to go page %s --!" % page)
        return None

    def go_detail(self, browser, number):
        """Open one row's detail view, capture its HTML and return to the list.

        :param browser: browser object currently showing a result page.
        :param number: 0-based index of the link inside ``#content`` to click.
        :return: the detail page HTML, or None when the detail view could not
            be opened (for example when the page has fewer links than
            ``number``).
        """
        print("!-- go detail %s --!" % number)
        go_detail_js = ("var div=document.getElementById('content');"
                        "var c=div.getElementsByTagName('a')[{detail_num}].click();"
                        ).format(detail_num=number)
        return_list_js = 'location.href = "javascript:viewList();"'

        try:
            browser.execute_script(go_detail_js)
            time.sleep(self.PAGE_WAIT)
            detail_html = browser.page_source
        except Exception:
            # A missing link makes the script throw; report "no detail"
            # instead of aborting the whole crawl.
            print("!-- error to get detail --! %s" % number)
            logging.exception("!-- error to get detail --! %s", number)
            return None

        # The detail view carries a "back to list" link calling viewList().
        if not re.search(r"javascript:viewList", detail_html):
            print("!-- error to get detail --! %s" % number)
            return None

        try:
            browser.execute_script(return_list_js)
            time.sleep(self.PAGE_WAIT)
        except Exception:
            # The HTML is already captured, so still hand it back.
            logging.exception("!-- error to return to list --! %s", number)
        return detail_html

    def _extract_fields(self, detail_html):
        """Return ``{column: value}`` for every entry of ``_FIELDS``.

        A cell that is absent or has no text yields that field's fallback.
        """
        tree = etree.HTML(detail_html) if detail_html else None
        record = {}
        for column, row, default in self._FIELDS:
            texts = tree.xpath(self._CELL_XPATH % row) if tree is not None else []
            record[column] = texts[0].strip() if texts else default
        return record

    def parse_detail(self, detail_html, id):
        """Extract the licence fields from a detail page and store them.

        Nothing is written when a row with ``numbers == id`` already exists.
        Values are passed to the driver as query parameters, so they are
        stored exactly as scraped (the earlier ``'`` -> ``‘`` substitution,
        which only existed to survive string-built SQL, is gone).

        :param detail_html: HTML of the detail page.
        :param id: sequence number of the record, stored in column ``numbers``.
        :return: None
        """
        record = self._extract_fields(detail_html)
        record_id = int(id)

        already_stored = self.cursor.execute(
            "select id from " + self.TABLE + " where numbers = %s",
            (record_id,))
        if already_stored:
            return

        columns = [column for column, _, _ in self._FIELDS]
        values = [record[column] for column in columns] + [record_id]
        columns.append('numbers')
        insert_sql = "insert into {}({}) values({})".format(
            self.TABLE, ', '.join(columns), ', '.join(['%s'] * len(columns)))
        try:
            self.cursor.execute(insert_sql, values)
            self.db.commit()
        except Exception as e:
            print('id:%s   e:%s' % (id, e))
            try:
                # Drop the failed statement so later inserts start clean.
                self.db.rollback()
            except Exception:
                logging.exception("!-- error to rollback --! %s", id)

if __name__ == '__main__':
    # Script entry point: build the spider and start crawling.
    spider = App1Spider()
    spider.main()

CFDA的更多相关文章

JS base64 加密和后台 base64解密（防止中文乱码）
直接上代码 1,js(2个文件,网上找的) 不要觉的长,直接复制下来就OK //UnicodeAnsi.js文件 //把Unicode转成Ansi和把Ansi转换成Unicode function ...
OpenGL阴影，Shadow Volumes（附源程序，使用 VCGlib ）
实验平台:Win7,VS2010 先上结果截图: 本文是我前一篇博客:OpenGL阴影,Shadow Mapping(附源程序)的下篇,描述两个最常用的阴影技术中的第二个,Shadow Volu ...
C++ stringstream
C++ 引入了ostringstream.istringstream.stringstream这三个类,这三个类包含在sstream.h头文件中.三个类中 1)istringstream类用于执行C+ ...
基于nodejs实现js后端化处理
今天D哥给我提了个问题,"用php执行过js没"?咋一听,没戏~~毕竟常规情况下,js是依赖浏览器运行的.想在php后端采集的同时利用js运行结果并传递给php使用,没戏! 然后回 ...
uva 10129 play on words——yhx
aaarticlea/png;base64,iVBORw0KGgoAAAANSUhEUgAABNUAAANeCAYAAAA1BjiHAAAgAElEQVR4nOydabWsuhaFywIasIAHJK
第一部分 CLR基础：第2章生成、打包、部署和管理应用程序及类型
2.1.NET Framework部署目标 Microsoft Windows多年来因不稳定和复杂而口碑不佳.造成的原因:1.应用程序都使用来自微软和厂商的动态链接库(dynamic-link lib ...
Michael Kors - Wikipedia, the free encyclopedia
Michael Kors - Wikipedia, the free encyclopedia Michael Kors From Wikipedia, the free encyclopedia ...
Html5模拟通讯录人员排序（sen.js）
// JavaScript Document var PY_Json_Str = ""; var PY_Str_1 = ""; var PY_Str_2 = & ...
爬虫之scrapy-redis
redis分布式部署 scrapy框架是否可以自己实现分布式? 不可以原因有两点其一:因为多台机器上部署的scrapy会各自拥有各自的调度器,这样就使得多台机器无法分配start_urls列表中的u ...

随机推荐

Linux netstat命令查看并发连接数
netstat -n | awk '/^tcp/ {++S[$NF]} END {for(a in S) print a, S[a]}' 解释: 返回结果示例: LAST_ACK 5 (正在等待处理的 ...
w3cschool脚本算法编程实战课程
部分源码==>https://github.com/calamus0427/commonJS 翻转字符串算法挑战 function reverseString(str) { str = str. ...
Running a jupyter notebook server
你也许需要服务器运行jupyter notebook 阿里云: https://yq.aliyun.com/articles/98527 关于更安全的证书访问: http://jupyter-note ...
anchor values list
学习笔记：python3，代码。小例子习作（2017）
http://www.cnblogs.com/qq21270/p/7634025.html 学习笔记:python3,一些基本语句(一些基础语法的代码,被挪到这里了) 日期和时间操作 http://b ...
18.纯 CSS 创作 404 文字变形为 NON 文字的交互特效
原文地址:https://segmentfault.com/a/1190000014818274 感想: positon:absolute 和 :hover HTML代码: <!-- < ...
《算法》第五章部分程序 part 8
▶ 书中第五章部分程序,包括在加上自己补充的代码,适用于基因序列的 2-Bit 压缩算法,行程长压缩算法,Huffman 压缩算法,LZW 压缩算法 ● 适用于基因序列的 2-Bit 压缩算法 pac ...
maven 常用插件3
转载:http://www.cnblogs.com/dennyzhangdd/p/5831112.html 1.根据项目类型打包:jar/war打包插件. 首先看<packaging>ja ...
div的全屏与退出全屏
div的全屏与退出全屏作用:将div全屏与退出全屏,一般播放器使用较多. html按钮: <button onclick="showFull();"> 全屏 < ...
彻底关闭Windows Defender丨Win10
关闭Windows Defender Win10正式版怎么关闭windows defender 首先关闭windows defender,因重启电脑后win10 会自动重启defender,所以需要禁 ...

CFDA

cfda数据抓取

1.网站数据是加密的,需要浏览器进行数据解析

2.网址url有js加密

3.PhantomJS无法解析数据, chrome无法获取数据,所以最终选择用Firefox浏览器

CFDA的更多相关文章

随机推荐

热门专题