Python 自用代码（知网会议论文网页源代码清洗）

#coding=utf-8

from pymongo import MongoClient

from lxml import etree

import requests

# Exact text of the <p> node that introduces the institution links. It is
# interpolated into an XPath text()='...' equality test in standard_dict(), so
# the surrounding CR/LF and spaces must match the page source character for
# character.
jigou = u"\r\n      【机构】\r\n      "

# Exact text of the <p> node that introduces the author links; used the same
# way as jigou, so its whitespace is significant as well.
zuozhe = u"\r\n        【作者】\r\n          "

# 获取数据库

def get_db():
    """Connect to the local MongoDB server and return the ``cnki`` database.

    The connection targets localhost:27017 and authenticates against the
    ``cnki`` database with the credentials hard-coded below (placeholders
    in this copy of the script).

    NOTE(review): ``Database.authenticate`` no longer exists in PyMongo 4;
    credentials would have to be passed to ``MongoClient`` there. Confirm the
    installed driver version before upgrading.
    """
    database = MongoClient('localhost', 27017).cnki
    database.authenticate("用户名","密码")
    return database

# 获取第num条数据

def get_data(table, num):
    """Return the page source stored at (or after) the ``num``-th record.

    Records are visited in the order ``table.find`` yields them, counted
    from 1. If the ``num``-th record has no ``html`` value (missing key or
    empty value), the scan keeps going and the first later record that does
    have one is returned instead.

    Args:
        table: Collection-like object exposing ``find(filter, projection)``.
        num: 1-based position of the record to fetch.

    Returns:
        The ``html`` value of the selected record, or None when ``num`` is
        smaller than 1 or no record at/after that position carries html.
    """
    # Positions are 1-based, so nothing can ever match a smaller number.
    if num < 1:
        return None
    for position, item in enumerate(table.find({}, {"html":1,"_id":0}), 1):
        if position < num:
            continue
        # .get() replaces dict.has_key(), which only exists on Python 2.
        html = item.get('html')
        if html:
            return html
    return None

# 列表首元素转字符串

def list_str(list):
    """Return the first element of ``list``, or "" when it is empty."""
    if len(list) == 0:
        return ""
    return list[0]

# 作者英文名，机构英文名

def en_ls(list, length1, length2):
    """Split the English author line into author names and institution names.

    Only the first element of ``list`` is used. The "【Author】" marker and
    CR/LF sequences are removed, the text is stripped and split on ";".
    The split must produce exactly ``length1 + length2 + 1`` pieces; the
    final piece is discarded (presumably the empty string after a trailing
    ";" -- confirm against real pages).

    Args:
        list: Text nodes of the English author paragraph (may be empty).
        length1: Number of authors found on the page.
        length2: Number of institutions found on the page.

    Returns:
        ``(authors, institutions)`` as ";"-joined strings, or ``("", "")``
        when ``list`` is empty or the piece count does not match.
    """
    nothing = ("", "")
    if len(list) == 0:
        return nothing

    cleaned = list[0].replace(u"【Author】","").replace("\r\n","")
    pieces = cleaned.strip().split(";")
    if len(pieces) != length1 + length2 + 1:
        return nothing

    authors = pieces[:length1]
    institutions = pieces[length1:-1]
    return list2str(authors), list2str(institutions)

def hyxx(list):
    """Extract the conference metadata fields from labelled text nodes.

    Each item is checked against the labels below in order; the first label
    contained in the item decides which field it fills. The label and any
    CR/LF sequences are removed and the rest is stripped. For the organiser
    field the "、" separators are additionally turned into ";". When several
    items carry the same label, the last one wins.

    Args:
        list: Text nodes taken from the page's summary list (may be empty).

    Returns:
        A 6-tuple of strings: (proceedings title, conference name,
        conference date, conference place, classification number,
        organiser). Fields that were not found are "" -- including the
        classification number, which previously defaulted to an empty list
        and so had a different type from every other field.
    """
    organiser_label = u"【主办单位】"
    # Order matters twice: it is the match priority and the tuple layout.
    labels = (
        u"【会议录名称】",
        u"【会议名称】",
        u"【会议时间】",
        u"【会议地点】",
        u"【分类号】",
        organiser_label,
    )
    fields = [""] * len(labels)

    for item in list:
        for position, label in enumerate(labels):
            if label not in item:
                continue
            value = item.replace(label, "")
            if label == organiser_label:
                # Organisers are listed with the ideographic comma; store
                # them ";"-separated like the other multi-valued fields.
                value = value.replace(u"、", ";")
            fields[position] = value.replace("\r\n", "").strip()
            break

    return tuple(fields)

# 列表转字符串

def list2str(list):
    """Join the strings in ``list`` with ";"; an empty list gives ""."""
    if len(list) == 0:
        return ""
    return ";".join(list)

# 构造论文入库字典

def standard_dict(html):
    """Build the cleaned record for one conference-paper page.

    The page source is parsed with ``etree.HTML`` and each field is pulled
    out with a fixed XPath query, then normalised by the module's helpers
    (``list_str``, ``list2str``, ``en_ls``, ``hyxx``).

    Args:
        html: Page source of a single paper detail page.

    Returns:
        A dict with the keys ``title``, ``title_eng``, ``author``,
        ``organization``, ``author_eng``, ``organization_eng``, ``summary``,
        ``summary_eng``, ``keywords``, ``keywords_eng``,
        ``proceeding_title``, ``conference_title``, ``conference_date``,
        ``conference_place``, ``huiyflh`` and ``conference_org``.
    """
    tree = etree.HTML(html)

    # Run the author / institution link queries once; each result provides
    # both the joined string and the count that en_ls() needs.
    authors = tree.xpath("//p[text()='%s']/a/text()" % zuozhe)
    organizations = tree.xpath("//p[text()='%s']/a/text()" % jigou)

    dc = {}
    # Chinese and foreign-language titles
    dc["title"] = list_str(tree.xpath("//span[@id='chTitle']/text()"))
    dc["title_eng"] = list_str(tree.xpath("//span[@id='enTitle']/text()"))
    # Authors and institutions, ";"-joined
    dc["author"] = list2str(authors)
    dc["organization"] = list2str(organizations)
    # English author and institution names share one paragraph; the counts
    # tell en_ls() where to split it.
    dc["author_eng"], dc["organization_eng"] = en_ls(
        tree.xpath("//p[@id='au_en']/text()"), len(authors), len(organizations))
    # Abstracts
    dc["summary"] = list_str(tree.xpath("//span[@id='ChDivSummary']/text()"))
    dc["summary_eng"] = list_str(tree.xpath("//span[@id='EnChDivSummary']/text()"))
    # Keywords: first span holds the Chinese ones, second span the English ones
    dc["keywords"] = list2str(tree.xpath("//div[@class='keywords']/span[1]/a/text()"))
    dc["keywords_eng"] = list2str(tree.xpath("//div[@class='keywords']/span[2]/a/text()"))
    # Conference metadata
    (dc["proceeding_title"],
     dc["conference_title"],
     dc["conference_date"],
     dc["conference_place"],
     dc["huiyflh"],
     dc["conference_org"]) = hyxx(tree.xpath("//div[@class='summary']/ul/li/text()"))

    # Fallback: when no proceedings title was found among the plain text
    # nodes, take the text of the first link in the first summary list.
    if dc["proceeding_title"]=="":
        dc["proceeding_title"] = list_str(tree.xpath("//div[@class='summary']/ul[1]/li/a/text()"))

    return dc

# 主函数

def main():
    """Clean every stored page and write the results to a second collection.

    Reads the ``html`` field of each document in ``conference``, skips
    documents whose ``html`` is missing or empty, converts the rest with
    ``standard_dict`` and inserts the resulting dict into
    ``conference_cleaned``.

    NOTE(review): ``Collection.insert`` is deprecated in PyMongo 3 and gone
    in PyMongo 4 (``insert_one`` replaces it); kept as-is because the
    installed driver version is not visible from this file.
    """
    db = get_db()
    source = db.conference
    target = db.conference_cleaned

    for item in source.find({}, {"html":1,"_id":0}):
        # .get() replaces dict.has_key(), which only exists on Python 2.
        html = item.get('html')
        if not html:
            continue
        target.insert(standard_dict(html))

if __name__ == '__main__':

    # Run the full cleaning pass over the whole collection.
    main()

    # The commented-out code below cleans one specific record, for testing

    # db = get_db()

    # collection=db.conference

    # data = get_data(collection, 1)

    # dc = standard_dict(data)

    # for k,v in dc.items():

    #     print k,v

Python 自用代码（知网会议论文网页源代码清洗）的更多相关文章

Python 自用代码（某方标准类网页源代码清洗）
用于mongodb中“标准”数据的清洗,数据为网页源代码,须从中提取: 标准名称,标准外文名称,标准编号,发布单位,发布日期,状态,实施日期,开本页数,采用关系,中图分类号,中国标准分类号,国际标准分 ...
python爬取中国知网部分论文信息
爬取指定主题的论文,并以相关度排序. #!/usr/bin/python3 # -*- coding: utf-8 -*- import requests import linecache impor ...
Python 自用代码（递归清洗采标情况）
将‘ISO 3408-1-2006,MOD ISO 3408-2-1991,MOD ISO 3408-3-2006,MOD’类似格式字符串存为: [{'code': 'ISO 3408-1-200 ...
Python 自用代码（调整日期格式）
2017年6月28日 to 2017-06-282017年10月27日 to 2017-10-272017年12月1日 to 2017-12-012017年7月1日 to 2017-07-01 #co ...
Python 自用代码（拆分txt文件）
现有一个28G的txt文件,里面每一行是一个分词过的专利全文文档,一共370多万行.我需要把它按每五万行为单位做成一个json文件,格式大致如下: [{"id":"100 ...
Python 自用代码（scrapy多级页面(三级页面)爬虫）
2017-03-28 入职接到的第一个小任务,scrapy多级页面爬虫,从来没写过爬虫,也没学过scrapy,甚至连xpath都没用过,最后用了将近一周才搞定.肯定有很多low爆的地方,希望大家可以给 ...
论文查重知网万方 paperpass
相信各个即将毕业的学生或在岗需要评职称.发论文的职场人士,论文检测都是必不可少的一道程序.面对市场上五花八门的检测软件,到底该如何选择?选择查重后到底该如何修改?现在就做一个知识的普及.其中对于中国的 ...
如何将知网下载的caj文件转换为pdf文件
一.问题描述: 最近在知网搜索论文的时候,经常遇到有的论文没有pdf文件的情况,但不得不吐槽我觉得知网做的阅读器确实是有点烂.所以想将caj文件转化为pdf文件,找到了一个比较好的方法,所以希望记录一 ...
Python开源爬虫项目代码：抓取淘宝、京东、QQ、知网数据--转
数据来源:数据挖掘入门与实战公众号: datadw scrapy_jingdong[9]- 京东爬虫.基于scrapy的京东网站爬虫,保存格式为csv.[9]: https://github.co ...

随机推荐

PL/SQL 09 包 package
--定义包头 create or replace package 包名as 变量.常量声明; 函数声明; 过程声明;end; --定义包体 create or replace package b ...
Oracle基础 03 回滚表空间 undo
--查询默认的undo表空间 select name,value from v$parameterwhere name like '%undo%'; --创建 undotbs2 表空间 create ...
iOS开发贝塞尔曲线UIBezierPath（后记）
使用CAShapeLayer与UIBezierPath可以实现不在view的drawRect方法中就画出一些想要的图形 . 1:UIBezierPath: UIBezierPath是在 UIKit 中 ...
spring 声明式事务中try catch捕获异常
原文:http://heroliuxun.iteye.com/blog/848122 今天遇到了一个这个问题最近遇到这样的问题,使用spring时,在业务层需要捕获异常(特殊需要),当前一般情况下不 ...
移动开发之css3实现背景渐变效果
前段时间由于手机项目需要实现一个背景渐变功能, 开始是想切个小背景图平铺下, 后来想到css3可以实现,如是用下面的代码就实现了. .sec_case_list li span{ backgro ...
创建ProcessEngine
activiti流程引擎是通过activiti.cfg.xml文件配置的(这并不符合Spring构建流程引擎的编码风格). ProcessEngine processEngine = ProcessE ...
学习PHPCMS需要掌握的函数
路径:phpcms\libs\classes\model.class.php /** * 执行sql查询 * @param $where 查询条件[例`name`='$name'] * @param ...
SQL 插入多行数据语句整理
参考别人的,希望对大家有用. 1.只是插入简单的有限行数据时用: insert 要插入的表名(列名1,列名2,....) select '列名1需要的数据','列名2需要的数据',... union ...
(15)python 数据库连接
python连接mysql两种方法一.python官网提供的 MySQL-python 软件下载地址 https://pypi.python.org/pypi/MySQL-python/1.2.5 ...
差分+树状数组线段树【P2357】守墓人
题目描述-->p2357 守墓人敲了一遍线段树,水过. 树状数组分析主要思路: 差分简单介绍一下差分(详细概念太麻烦,看下面. 给定一个数组 7 8 6 5 1 8 18 20 35 // ...

Python 自用代码（知网会议论文网页源代码清洗）

Python 自用代码（知网会议论文网页源代码清洗）的更多相关文章

随机推荐

热门专题