Python 自用代码（某方标准类网页源代码清洗）

用于清洗 MongoDB 中的“标准”数据。数据为网页源代码，须从中提取以下字段：

标准名称,标准外文名称,标准编号,发布单位,发布日期,状态,实施日期,开本页数,采用关系,中图分类号,中国标准分类号,国际标准分类号,国别,关键词,摘要,替代标准。

提取后组成字典存入另一集合。

#coding=utf-8

from pymongo import MongoClient

from lxml import etree

import requests

# Field labels used to locate values in the page: standard_dict() matches each
# one against the text of a <span>. They are looked up by position, so the
# order of this list must not change.
s = [
    u'标准编号：',        # 0  standard number
    u'发布单位：',        # 1  publishing department
    u'发布日期：',        # 2  release date
    u'状态：',            # 3  state
    u'实施日期：',        # 4  enforcement date
    u'开本页数：',        # 5  format / pages
    u'采用关系：',        # 6  adoption relation
    u'中图分类号：',      # 7  CLC number
    u'中国标准分类号：',  # 8  CCS number
    u'国际标准分类号：',  # 9  ICS number
    u'国别：',            # 10 country
    u'关键词：',          # 11 keywords
    u'摘要：',            # 12 summary
]

# 获取数据库

def get_db():
    """Open a MongoDB connection and return the authenticated ``wanfang`` database.

    NOTE(review): the host, user name and password literals look like
    placeholders - confirm and replace them before running.
    """
    wanfang = MongoClient('IP', 27017).wanfang
    wanfang.authenticate("用户名", "密码")
    return wanfang

# 获取第num条数据

def get_data(table, num):
    """Return the page source stored in the ``num``-th document of ``table``.

    Documents are counted from 1 in the order ``table.find`` yields them.
    If the ``num``-th document has no (or an empty) ``content`` field, the
    scan continues and the first later document with non-empty content is
    returned instead, which is how this function has always behaved.

    :param table: collection-like object exposing ``find(query, projection)``.
    :param num: 1-based position of the wanted document.
    :return: the ``content`` value, or ``None`` when ``num`` is below 1 or no
        document at or after position ``num`` has content.
    """
    if num < 1:
        # The original counter started at 1 and only grew, so it never matched.
        return None
    cursor = table.find({}, {"content": 1, "_id": 0})
    for position, item in enumerate(cursor, 1):
        if position < num:
            continue
        # dict.get replaces the Python-2-only has_key() plus truthiness check.
        content = item.get('content')
        if content:
            return content
    return None

# 列表转字符串

def list_str(list):
    """Return the first element of ``list``, or ``""`` when it is empty."""
    if len(list) == 0:
        return ""
    return list[0]

# 提取分类号

def code_ls(list):
    """Split the first text of ``list`` into classification codes.

    The text is split on whitespace and every token containing a half-width
    or full-width parenthesis is dropped.

    :param list: list of strings; only ``list[0]`` is used.
    :return: list of remaining tokens, or ``""`` when ``list`` is empty (the
        empty-string result is kept for compatibility with existing callers).
    """
    if len(list) == 0:
        return ""
    # The full-width parentheses are unicode literals, like the other Chinese
    # literals in this file, so that testing them against unicode tokens never
    # triggers an implicit ASCII decode on Python 2.
    brackets = ("(", ")", u"（", u"）")
    return [token for token in list[0].split()
            if not any(mark in token for mark in brackets)]

# 构造关键词列表

def keywords_ls(list):
    """Return ``list`` unchanged when it has items, otherwise ``""``."""
    return list if len(list) != 0 else ""

# 替代标准

def replace_str(replace):
    """Return the first text of ``replace`` without its five leading characters.

    Every element is stripped and has ``"\\r\\n"`` removed; only the first
    cleaned element is used.  ``""`` is returned when ``replace`` is ``""``
    or has no elements.
    """
    if replace == "":
        return ""
    cleaned = [piece.strip().replace("\r\n", "") for piece in replace]
    if not cleaned:
        return ""
    # Drop the first five characters of the first text.
    # NOTE(review): presumably these are the "替代标准：" label - confirm against a page.
    head = cleaned[0]
    return head[5:]

# 提取摘要

def summary_str(list):
    """Return the first text of ``list`` unless it starts with ``"<"``.

    :param list: list of strings; only ``list[0]`` is inspected.
    :return: ``list[0]`` when it does not begin with ``"<"``, otherwise
        ``""``.  An empty ``list`` also gives ``""``.
    """
    if len(list) == 0:
        return ""
    first = list[0]
    # startswith() instead of first[0]: an empty first string used to raise
    # IndexError; it is now returned as-is (i.e. "").
    if first.startswith("<"):
        return ""
    return first

# 调整日期格式

def date_str(list):
    """Convert a date such as ``2017年6月28日`` to ``2017-06-28``.

    Single-digit months and days are zero-padded; ``年`` and ``月`` become
    ``-`` and ``日`` is removed.

    :param list: list of strings; only ``list[0]`` is read.  The list is no
        longer modified (the previous version overwrote ``list[0]``).
    :return: the reformatted date, or ``""`` when ``list`` is empty.
    """
    if len(list) == 0:
        return ""
    text = list[0]
    year = text.find(u'年')
    month = text.find(u'月')
    day = text.find(u'日')
    # A gap of 2 between two markers means exactly one digit sits between them.
    # Both gaps are measured on the original text, before any padding.
    if month - year == 2:
        text = text.replace(u"年", u"年0")
    if day - month == 2:
        text = text.replace(u"月", u"月0")
    return text.replace(u"日", "").replace(u"月", "-").replace(u"年", "-")

# 调整采标格式

def adopted_ls(string, ls):
    """Parse adoption relations such as ``"ISO 1-2006,MOD ISO 2-1991,MOD"``.

    For every comma found, a dict is appended to ``ls`` whose ``"code"`` is
    the stripped text before the comma and whose ``"type"`` is the three
    characters after it.  Parsing then resumes right after those three
    characters.

    :param string: text to parse.
    :param ls: accumulator list; it is extended in place and returned.
    :return: ``ls``.
    """
    remaining = string
    comma = remaining.find(',')
    while comma != -1:
        entry = {}
        entry["code"] = remaining[:comma].strip()
        entry["type"] = remaining[comma + 1:comma + 4]
        ls.append(entry)
        remaining = remaining[comma + 4:]
        comma = remaining.find(',')
    return ls

# 构造标准入库字典

def standard_dict(html):
    """Build the dict stored for one standard from its page source.

    The page is parsed with ``etree.HTML``.  The titles come from ``<h1>`` and
    ``<h2>``; most other fields are found through the ``<span>`` whose text
    equals the matching label in ``s``.

    :param html: page source of one standard.
    :return: dict with the keys ``title``, ``title_eng``, ``standard_number``,
        ``publishing_department``, ``release_date``, ``state``,
        ``enforcement_date``, ``pages``, ``adopted``, ``clc``, ``ccs``,
        ``ics``, ``country``, ``keywords``, ``summary`` and ``replace_for``.
    """
    tree = etree.HTML(html)

    def sibling_text(label):
        # Text directly inside the elements that follow the label's <span>.
        return tree.xpath("//span[text()='%s']/following-sibling::*/text()" % (label))

    def sibling_child_text(label):
        # Text of the children of the elements that follow the label's <span>.
        return tree.xpath("//span[text()='%s']/following-sibling::*/child::*/text()" % (label))

    record = {}
    # Chinese and foreign-language titles
    record["title"] = list_str(tree.xpath("//h1/text()"))
    record["title_eng"] = list_str(tree.xpath("//h2/text()"))
    # Plain single-value fields and dates
    record["standard_number"] = list_str(sibling_text(s[0]))
    record["publishing_department"] = list_str(sibling_text(s[1]))
    record["release_date"] = date_str(sibling_text(s[2]))
    record["state"] = list_str(sibling_text(s[3]))
    record["enforcement_date"] = date_str(sibling_text(s[4]))
    record["pages"] = list_str(sibling_text(s[5]))
    # Adoption relation: parsed into a list of {"code", "type"} dicts
    record["adopted"] = adopted_ls(list_str(sibling_text(s[6])), [])
    # Classification numbers (CLC, CCS, ICS)
    record["clc"] = code_ls(sibling_text(s[7]))
    record["ccs"] = code_ls(sibling_child_text(s[8]))
    record["ics"] = code_ls(sibling_text(s[9]))
    record["country"] = list_str(sibling_text(s[10]))
    record["keywords"] = keywords_ls(sibling_child_text(s[11]))
    # The summary text follows the parent of the label's <span>
    record["summary"] = summary_str(
        tree.xpath("//span[text()='%s']/parent::*/following-sibling::*/text()" % (s[12])))
    # Replaced standard: all text below the children of div#replaceStandard
    record["replace_for"] = replace_str(
        tree.xpath("//div[@id='replaceStandard']//child::*//text()"))
    return record

# 主函数

def main():
    """Clean every document of ``standard`` into ``standard_cleaned``.

    Each document of the ``standard`` collection that has a non-empty
    ``content`` field is parsed with ``standard_dict`` and the resulting dict
    is inserted into ``standard_cleaned``.  Documents without content are
    skipped.
    """
    db = get_db()
    source = db.standard
    target = db.standard_cleaned
    for item in source.find({}, {"content": 1, "_id": 0}):
        # dict.get replaces the Python-2-only has_key() plus truthiness check.
        content = item.get('content')
        if content:
            target.insert(standard_dict(content))

# Script entry point: run the full cleaning job.
if __name__ == '__main__':

    main()

    # The commented-out code below cleans one specific record, for testing

    # db = get_db()

    # collection=db.standard

    # collection2 = db.standard_cleaned

    # data = get_data(collection, 8)

    # dc = standard_dict(data)

    # collection2.insert(dc)

    # for k,v in dc.items():

    #     print k,v

    # # The commented-out code below tests summary extraction on a live page

    # data = requests.get('http://d.wanfangdata.com.cn/Standard/ISO%208528-5-2013')

    # dc = standard_dict(data.text)

    # for k,v in dc.items():

    #     print k,v

    # # The commented-out code below tests the date reformatting

    # l1 = [u"2017年6月28日"]

    # l2 = [u"2017年10月27日"]

    # l3 = [u"2017年12月1日"]

    # l4 = [u"2017年7月1日"]

    # print date_str(l1)

    # print date_str(l2)

    # print date_str(l3)

    # print date_str(l4)

Python 自用代码（某方标准类网页源代码清洗）的更多相关文章

Python 自用代码（知网会议论文网页源代码清洗）
#coding=utf-8 from pymongo import MongoClient from lxml import etree import requests jigou = u" ...
Python 自用代码（递归清洗采标情况）
将‘ISO 3408-1-2006,MOD ISO 3408-2-1991,MOD ISO 3408-3-2006,MOD’类似格式字符串存为: [{'code': 'ISO 3408-1-200 ...
Python 自用代码（调整日期格式）
2017年6月28日 to 2017-06-28；2017年10月27日 to 2017-10-27；2017年12月1日 to 2017-12-01；2017年7月1日 to 2017-07-01 #co ...
Python 自用代码（拆分txt文件）
现有一个28G的txt文件,里面每一行是一个分词过的专利全文文档,一共370多万行.我需要把它按每五万行为单位做成一个json文件,格式大致如下: [{"id":"100 ...
Python 自用代码（scrapy多级页面(三级页面)爬虫）
2017-03-28 入职接到的第一个小任务,scrapy多级页面爬虫,从来没写过爬虫,也没学过scrapy,甚至连xpath都没用过,最后用了将近一周才搞定.肯定有很多low爆的地方,希望大家可以给 ...
（Python）自动生成代码（方法一）
在写某个平台的自动化脚本时,笔者把全部的操作都封装到了两个类中,page.py和commonpage.py: page.py部分代码: class BasePage(object): ''' 页面基础 ...
python代码规范与标准库参考
python代码规范与标准库参考 python代码规范参考文献: http://www.runoob.com/w3cnote/google-python-styleguide.html https:/ ...
【代码笔记】Java基础：Java的方法和类
面向过程与面向对象都是我们编程中,编写程序的一种思维方式.例如:公司打扫卫生(擦玻璃.扫地.拖地.倒垃圾等), 按照面向过程的程序设计方式会思考“打扫卫生我该怎么做,然后一件件的完成”,最后把公司卫生 ...
python开发面向对象基础：接口类&抽象类&多态&钻石继承
一,接口类继承有两种用途: 一:继承基类的方法,并且做出自己的改变或者扩展(代码重用) 二:声明某个子类兼容于某基类,定义一个接口类Interface,接口类中定义了一些接口名(就是函数名)且并未实 ...

随机推荐

Linux core dump file详解
Linux core dump file详解 http://www.cnblogs.com/langqi250/archive/2013/03/05/2944931.html
操作MySQL数据库相关代码
注意事项: 1.导入驱动包,如我导的是mysql-connector-java-5.1.26-bin.jar 2.修改下面代码中的用户名和密码 3.其中URL为"jdbc:mysql://数 ...
一次Ubuntu下的排雷记录
起因某天,发现一台服务器上出现了一个大量占用cpu资源的进程.尝试手动杀掉,但很快就会自动重新创建新的进程. 追查用命令lsof -p 10316 查看其文件路径: 该进程文件夹/proc/103 ...
[ Openstack ] Openstack-Mitaka 高可用之 Pacemaker+corosync+pcs 高可用集群
目录 Openstack-Mitaka 高可用之概述 Openstack-Mitaka 高可用之环境初始化 Openstack-Mitaka 高可用之 Mariadb-Galera集群 ...
【计算机网络】wireshark抓包分析1
学习计算机网络很久了,但总是局限于书本知识,感觉get不到重点.经师兄建议用wireshark抓包分析看看. 我自己以前并没有做过抓包分析,所以这篇博文可能会有很多错误,只是我自己的一个记录,路过的亲 ...
Spring boot 文件路径读取异常
在开发代码中,有一段需要获取resources目录下的一个配置文件(这里写作test.xml). 这段代码在ide中没有任何问题,但是一打成jar包发布到线上,这段代码就会报找不到对应文件的错误. 按 ...
LISTVIEW 消息结构宏
如果是要画的话,用CreateWindowEx创建指定 WC_LISTVIEW window class 关于其消息如下: LVM_APPROXIMATEVIEWRECT LVM_ARRANGE ...
厦门大学XMUNET+ ubuntu连接方法
编辑连接输入用户名和密码就好了 ubuntu16.04
SQL查询中in和exists的区别分析
select * from A where id in (select id from B); select * from A where exists (select 1 from B where ...
【树链剖分（区间线段树）】BZOJ4196-[NOI2015]软件包管理
[题目大意] 如果软件包A依赖软件包B,那么安装软件包A以前,必须先安装软件包B.同时,如果想要卸载软件包B,则必须卸载软件包A.而且,由于你之前的工作,除0号软件包以外,在你的管理器当中的软件包都会 ...

Python 自用代码（某方标准类网页源代码清洗）

Python 自用代码（某方标准类网页源代码清洗）的更多相关文章

随机推荐

热门专题