Python 爬取外文期刊论文信息（机械仪表工业）

NSTL国家科技图书文献中心 2017 机械仪表工业所有期刊论文信息

代码比较随意，不要介意

第一步，爬取所有期刊链接

#coding=utf-8

import time

from selenium import webdriver

from lxml import etree

from pymongo import MongoClient

# Step 1: walk every result page of the CLC "TH" journal listing and store
# each journal link as {"url": <href>} in the "journal_urls" collection.
client = MongoClient("IP", 27017)
db = client["nstl"]
collection = db["journal_urls"]
db.authenticate("", "")

# Maximum number of one-second polls to wait for a page turn. Without a
# bound the script would hang forever if a click did not change the page.
PAGE_TURN_TIMEOUT = 60

driver = webdriver.Chrome(executable_path=r"D:\chromedriver_win32\chromedriver.exe")
try:
    driver.get('https://www.nstl.gov.cn/facade/search/clcSearch.do?&lan=eng&clc=TH')

    # The total page count is read once from the first result page
    # (47 pages when the script was written).
    first_page = etree.HTML(driver.page_source)
    total_pages = first_page.xpath("//span[@id='totalPages1']/text()")
    if not total_pages:
        raise RuntimeError("page counter 'totalPages1' not found on the listing page")
    count = int(total_pages[0])

    for i in range(count):
        html = driver.page_source
        tree = etree.HTML(html)

        # Store every journal link found on the current page.
        # NOTE(review): Collection.insert is deprecated in PyMongo 3 and
        # removed in PyMongo 4; switch to insert_one when upgrading.
        for href in tree.xpath("//div[@class='s2listtd2']/span/a/@href"):
            collection.insert({'url': href})

        # The last page has no successor to click.
        if i == count - 1:
            break

        # Page numbers are 1-based, so the next page is labelled i + 2.
        driver.find_element_by_xpath('//div[@id="page"]/div//a[text()="%s"]' % str(i + 2)).click()

        # Treat the page turn as done once the page source differs from the
        # page that was just scraped.
        for _ in range(PAGE_TURN_TIMEOUT):
            time.sleep(1)
            if driver.page_source != html:
                break
        else:
            raise RuntimeError(
                "page %d did not load within %d seconds" % (i + 2, PAGE_TURN_TIMEOUT)
            )
finally:
    # quit() ends the whole browser session, also when an error occurred.
    driver.quit()

第二步，爬取每个期刊中所有2017年论文链接

#coding=utf-8

import requests

from pymongo import MongoClient

from lxml import etree

from selenium import webdriver

import time

# Step 2: for every stored journal link, open the journal page, step through
# its 2017 issues and store each article link as {"url": <href>} in the
# "journalArticle2017_urls" collection.
client = MongoClient("IP", 27017)
db = client["nstl"]
collection1 = db["journal_urls"]
collection2 = db["journalArticle2017_urls"]
db.authenticate("", "")

# Maximum number of one-second polls to wait for the next issue to load.
ISSUE_LOAD_TIMEOUT = 60

driver = webdriver.Chrome(executable_path=r"D:\chromedriver_win32\chromedriver.exe")
try:
    for item in collection1.find({}, {"url": 1, "_id": 0}):
        # The stored href is sliced before navigation; presumably this strips
        # a fixed-length wrapper around the real URL -- TODO confirm.
        driver.get(item['url'][29:-4])
        tree = etree.HTML(driver.page_source)

        # When a 2018 section exists, the 2017 section has to be opened by
        # clicking on it first.
        if tree.xpath("//div[@id='year_2018']"):
            driver.find_element_by_xpath("//div[@id='year_2017']").click()
            time.sleep(1)
            driver.find_element_by_xpath("//div[@id='volumeUl_2017']/div[@class='ltreebom2']").click()
            time.sleep(1)
            # Re-parse after the clicks so the issue list below reflects the
            # opened 2017 section rather than the page as first loaded.
            tree = etree.HTML(driver.page_source)

        # Number of 2017 issues listed for this journal.
        issue_count = len(tree.xpath("//div[@id='volumeUl_2017']//div[@class='ltreebom3']/a"))

        for i in range(1, issue_count + 1):
            wen_html = driver.page_source
            wen_tree = etree.HTML(wen_html)

            # Store the article links of the issue currently shown. The query
            # must run on the freshly parsed page (wen_tree); using the tree
            # parsed before the loop would re-store the first issue each time.
            # NOTE(review): Collection.insert is deprecated in PyMongo 3 and
            # removed in PyMongo 4; switch to insert_one when upgrading.
            for href in wen_tree.xpath("//div[@class='s2listtd2']/a/@href"):
                collection2.insert({'url': href})

            # The last issue has no successor to click.
            if i == issue_count:
                break

            # Open the next issue; if it cannot be clicked, give up on this
            # journal. Exception (not a bare except) keeps Ctrl-C working.
            try:
                driver.find_element_by_xpath("//div[@id='volumeUl_2017']//div[@class='ltreebom3'][%s]" % str(i + 1)).click()
            except Exception:
                break

            # Treat the issue as loaded once the page source has changed.
            for _ in range(ISSUE_LOAD_TIMEOUT):
                time.sleep(1)
                if driver.page_source != wen_html:
                    break
            else:
                # The page never changed: stop here instead of storing the
                # same issue again, and move on to the next journal.
                break
finally:
    # quit() ends the whole browser session, also when an error occurred.
    driver.quit()

第三步，爬取论文信息详情页源码

#coding=utf-8

import requests

from pymongo import MongoClient

from lxml import etree

from selenium import webdriver

import time

# Step 3: open the detail page of every stored article link in the browser
# and store the rendered page source as {"html": <source>} in the
# "journalArticle2017_codes" collection.
client = MongoClient("IP", 27017)
db = client["nstl"]
collection = db["journalArticle2017_urls"]
collection1 = db["journalArticle2017_codes"]
db.authenticate("", "")

# Maximum number of one-second polls to wait for the article data to appear.
DETAIL_LOAD_TIMEOUT = 100

driver = webdriver.Chrome(executable_path=r"D:\chromedriver_win32\chromedriver.exe")
try:
    for item in collection.find({}, {"url": 1, "_id": 0}):
        href = item['url']
        # Both query parameters are cut out of the stored href by fixed
        # offsets; presumably the href is a fixed-format wrapper around the
        # record number and document type -- TODO confirm.
        url = ("https://www.nstl.gov.cn/facade/search/toFullView.do?checkedSEQNO="
               + href[23:-11] + "&subDocType=" + href[-8:-3])

        # The page is fetched with a real browser. (An earlier variant using
        # requests.post had been left commented out here.)
        driver.get(url)

        # Poll until the page shows the article title, which is the element
        # the parsing step reads first. The previous check compared against
        # an undefined variable and raised NameError on the first article.
        html = driver.page_source
        for _ in range(DETAIL_LOAD_TIMEOUT):
            time.sleep(1)
            html = driver.page_source
            tree = etree.HTML(html) if html else None
            if tree is not None and tree.xpath("//span[@name='title']/text()"):
                break

        # Store whatever source was obtained, even after a timeout.
        # NOTE(review): Collection.insert is deprecated in PyMongo 3 and
        # removed in PyMongo 4; switch to insert_one when upgrading.
        collection1.insert({'html': html})
finally:
    # quit() ends the whole browser session, also when an error occurred.
    driver.quit()

第四步，解析源码

#coding=utf-8

from pymongo import MongoClient

from lxml import etree

# Step 4: parse every stored detail page and write one structured document
# per article into the "journalArticle2017_data" collection.
client = MongoClient("IP", 27017)
db = client["nstl"]
collection1 = db["journalArticle2017_codes"]
collection2 = db["journalArticle2017_data"]
db.authenticate("", "")

# A field is located through the <div> holding its label; the value is read
# from the elements following that <div>.
TEXT_XPATH = "//div[text()='%s']/following-sibling::*/text()"
# Keywords are rendered as one link per keyword.
LINK_XPATH = "//div[text()='%s']/following-sibling::*/span/a/text()"

# (output key, label on the page, keep every match?) in output order.
LABELLED_FIELDS = (
    ('organization', u'【作者单位】：', False),
    ('journal_name', u'【刊名】：', False),
    ('issn', u'【ISSN】：', False),
    ('publication_year', u'【出版年】：', False),
    ('volume', u'【卷】：', False),
    ('issue', u'【期】：', False),
    ('page_start', u'【起页】：', False),
    ('page_end', u'【止页】：', False),
    ('page_count', u'【总页数】：', False),
    ('clc', u'【分类号】：', False),
    ('keywords', u'【关键词】：', True),
    ('language', u'【语种】：', False),
    ('summary', u'【文摘】：', False),
)

# Loop over every stored page source and extract the article fields.
for item in collection1.find({}, {"html": 1, "_id": 0}):
    html = item.get("html")
    tree = etree.HTML(html) if html else None
    if tree is None:
        # Nothing parseable was stored for this article.
        continue

    title = tree.xpath("//span[@name='title']/text()")
    if not title:
        # A page without a title did not load the article data; skip it
        # instead of aborting the whole run with an IndexError.
        continue

    dc = {'title': title[0]}

    # All author names are kept as a list.
    author = tree.xpath("//a[starts-with(@href,'javascript:searchByAuthor')]/text()")
    if author:
        dc['author'] = author

    for key, label, keep_all in LABELLED_FIELDS:
        values = tree.xpath((LINK_XPATH if keep_all else TEXT_XPATH) % label)
        if not values:
            # Fields missing from the page are left out of the document.
            continue
        # Keywords keep every match (like author) so none are dropped;
        # every other field keeps its first text node.
        dc[key] = values if keep_all else values[0]

    # NOTE(review): Collection.insert is deprecated in PyMongo 3 and removed
    # in PyMongo 4; switch to insert_one when upgrading.
    collection2.insert(dc)

Python 爬取外文期刊论文信息（机械仪表工业）的更多相关文章

用Python爬取智联招聘信息做职业规划
上学期在实验室发表时写了一个爬取智联招聘信息的爬虫. 操作流程大致分为:信息爬取——数据结构化——存入数据库——所需技能等分词统计——数据可视化 1.数据爬取 job = "通信工程师" ...
python爬取 “得到” App 电子书信息
前言文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理. 作者: 静觅崔庆才 PS:如有需要Python学习资料的小伙伴可以加点击下 ...
Python爬取房天下二手房信息
一.相关知识 BeautifulSoup4使用 python将信息写入csv import csv with open("11.csv","w") as csv ...
python爬取北京政府信件信息01
python爬取,找到目标地址,开始研究网页代码格式,于是就开始根据之前学的知识进行爬取,出师不利啊,一开始爬取就出现了个问题,这是之前是没有遇到过的,明明地址没问题,就是显示网页不存在,于是就在百度 ...
Python 爬取赶集网租房信息
代码已久,有可能需要调整 #coding:utf-8 from bs4 import BeautifulSoup #有这个bs4不用正则也可以定位要爬取的内容了 from urlparse impor ...
利用python爬取贝壳网租房信息
最近准备换房子,在网站上寻找各种房源信息,看得眼花缭乱,于是想着能否将基本信息汇总起来便于查找,便用python将基本信息爬下来放到excel,这样一来就容易搜索了. 1. 利用lxml中的xpath ...
python爬取实习僧招聘信息字体反爬
参考博客:http://www.cnblogs.com/eastonliu/p/9925652.html 实习僧招聘的网站采用了字体反爬,在页面上显示正常,查看源码关键信息乱码,如下图所示: 查看网页 ...
Python爬取简书主页信息
主要学习如何通过抓包工具分析简书的Ajax加载,有时间再写一个Multithread proxy spider提升效率. 1. 关键点: 使用单线程爬取,未登录,爬取简书主页Ajax加载的内容.主要有 ...
Python爬取B站视频信息
该文内容已失效,现已实现scrapy+scrapy-splash来爬取该网站视频及用户信息,由于B站的反爬封IP,以及网上的免费代理IP绝大部分失效,无法实现一个可靠的IP代理池,免费代理网站又是各种 ...

随机推荐

centos 7 开机启动配置
centos 7 开机启动 1 开机启动配置文件位于/usr/lib/systemd/system/ 2 nginx的配置[Unit]Description=nginx - high performa ...
SocketCluster
官网地址:https://socketcluster.io/ SocketCluster的组成部分,即运行一个SocketCluster服务器,它在服务器生成的进程 1.主进程(Server.js)一 ...
Aras Innovator DB备份与还原
错误信息确认到该问题是因为孤立帐号的问题,在解决孤立帐号之前,可以通过语句查看,另外,还原了DB后,系统不会自动创建原来的登陆帐号的,需要手动新增登陆帐号 #查看孤立帐号列表exec sp_chan ...
Linux操作命令（二）
本次实验将介绍 Linux 命令中 mkdir.rm.mv.cp.cat.nl 命令的用法. 1.mkdir mkdir命令用来创建指定名称的目录,要求创建目录的用户在当前目录中具有写权限,并且指定的 ...
POJ 1258 + POJ 1287 【最小生成树裸题/矩阵建图】
Farmer John has been elected mayor of his town! One of his campaign promises was to bring internet c ...
jsp页面中获取session中的值
Jsp中获取Session: session是jsp的内置对象,所以你可以直接写在jsp的 <% session.setAttribute("a", b); //把b放到se ...
Unity游戏开发之C#快速入门
C#是微软团队在开发.NET框架时开发的,它的构想接近于C.C++,也和JAVA十分相似,有许多强大的编程功能. 个人感受是C#吸收了众多编程语言的优点,从中可以看到C.C++.Java.Javasc ...
洛谷——P1104 生日
P1104 生日题目描述 cjf君想调查学校OI组每个同学的生日,并按照从大到小的顺序排序.但cjf君最近作业很多,没有时间,所以请你帮她排序. 输入输出格式输入格式: 有2行, 第1行为OI组总 ...
HTML5 form内button
突然发现一件奇怪的事: 在html5中, form内button的type不是submit, 但是单击的时候它自己就提交表单了. 查了一下后问题解决: 给button加上type="button"即可.
BZOJ 4327 JSOI2012 玄武密码（后缀自动机）
[题目链接] http://www.lydsy.com/JudgeOnline/problem.php?id=4327 [题目大意] 求每个子串在母串中的最长匹配 [题解] 对母串建立后缀自动机,用每 ...

Python 爬取外文期刊论文信息（机械 仪表工业）

Python 爬取外文期刊论文信息（机械 仪表工业）的更多相关文章

随机推荐

热门专题

Python 爬取外文期刊论文信息（机械仪表工业）

Python 爬取外文期刊论文信息（机械仪表工业）的更多相关文章