Python 自用代码（某方标准类网页源代码清洗）

用于清洗 MongoDB 中的“标准”数据。数据为网页源代码，须从中提取以下字段：

标准名称,标准外文名称,标准编号,发布单位,发布日期,状态,实施日期,开本页数,采用关系,中图分类号,中国标准分类号,国际标准分类号,国别,关键词,摘要,替代标准。

提取后组成字典存入另一集合。

#coding=utf-8

from pymongo import MongoClient

from lxml import etree

import requests

# Field labels as matched against the text of <span> elements by
# standard_dict(); each label ends with a full-width colon.  The labels are
# looked up by index, so the order of this list must not change.
s = [
    u'标准编号：',        # 0  standard number
    u'发布单位：',        # 1  issuing organisation
    u'发布日期：',        # 2  release date
    u'状态：',            # 3  status
    u'实施日期：',        # 4  implementation date
    u'开本页数：',        # 5  format / page count
    u'采用关系：',        # 6  adoption relationship
    u'中图分类号：',      # 7  Chinese Library Classification number
    u'中国标准分类号：',  # 8  Chinese Standard Classification number
    u'国际标准分类号：',  # 9  International Classification for Standards number
    u'国别：',            # 10 country
    u'关键词：',          # 11 keywords
    u'摘要：',            # 12 abstract
]

# 获取数据库

def get_db():
    """Open a MongoDB connection and return the authenticated ``wanfang`` database.

    The host name and the credentials below appear to be placeholders
    ('IP', user name, password) and have to be replaced before running.
    """
    connection = MongoClient('IP', 27017)
    database = connection["wanfang"]
    # NOTE(review): Database.authenticate() only exists in older PyMongo
    # releases -- confirm the driver version this script is run with.
    database.authenticate("用户名", "密码")
    return database

# 获取第num条数据

def get_data(table, num):
    """Return the ``content`` field of the num-th document in *table*.

    Documents are counted from 1 in the order the cursor yields them.  If the
    num-th document has no ``content`` (or an empty one), the scan keeps going
    and the first later document with non-empty content is returned instead.

    :param table: collection-like object whose ``find(filter, projection)``
        returns an iterable of dict-like documents.
    :param num: 1-based integer position of the wanted document.
    :return: the content value, or ``None`` when nothing suitable is found
        (including when *num* is smaller than 1).
    """
    if num < 1:
        # Positions start at 1, so nothing can ever match.
        return None
    cursor = table.find({}, {"content": 1, "_id": 0})
    for position, item in enumerate(cursor, 1):
        if position < num:
            continue
        # .get() replaces dict.has_key(), which no longer exists in Python 3.
        content = item.get('content')
        if content:
            return content
    return None

# 列表转字符串

def list_str(list):
    """Return the first element of *list*, or "" when *list* is empty."""
    return list[0] if len(list) != 0 else ""

# 提取分类号

def code_ls(list):
    """Extract classification codes from the first text node in *list*.

    The first element is split on whitespace and every token containing a
    half-width or full-width parenthesis is dropped; the remaining tokens are
    returned in their original order.

    :param list: list of text nodes (only the first one is used).
    :return: list of code tokens, or "" when *list* is empty.
    """
    if len(list) == 0:
        return ""
    # Unicode literals, like the other Chinese literals in this file, so the
    # membership test is well defined for unicode text on Python 2 as well.
    brackets = (u"(", u")", u"（", u"）")
    return [token for token in list[0].split()
            if not any(mark in token for mark in brackets)]

# 构造关键词列表

def keywords_ls(list):
    """Return *list* itself when it has entries, otherwise ""."""
    if len(list) == 0:
        return ""
    return list

# 替代标准

def replace_str(replace):
    """Return the first text node of *replace*, cleaned, minus its first 5 characters.

    Every entry is stripped of surrounding whitespace and has embedded
    "\\r\\n" sequences removed; the first cleaned entry, with its leading five
    characters cut off, is the result.  An empty string or an empty sequence
    yields "".
    """
    if replace == "":
        return ""
    cleaned = []
    for text in replace:
        cleaned.append(text.strip().replace("\r\n", ""))
    if len(cleaned) == 0:
        return ""
    # presumably the first five characters are a fixed label on the page --
    # verify against real data
    return cleaned[0][5:]

# 提取摘要

def summary_str(list):
    """Return the abstract text taken from the first entry of *list*.

    :param list: list of text nodes (only the first one is used).
    :return: the first entry, or "" when *list* is empty or the first entry
        begins with "<".
    """
    if len(list) == 0:
        return ""
    first = list[0]
    # startswith() instead of first[0]: an empty first entry used to raise
    # IndexError; it is now returned as the empty string it is.
    if first.startswith("<"):
        return ""
    return first

# 调整日期格式

def date_str(list):
    """Convert a date such as u"2017年6月28日" into u"2017-06-28".

    Single-digit months and days are zero-padded.  Only the first entry of
    *list* is used, and the list itself is left untouched.

    :param list: list of text nodes holding the date in its first entry.
    :return: the reformatted date, or "" when *list* is empty.
    """
    if len(list) == 0:
        return ""
    # Work on a local copy so the caller's list is not modified.
    text = list[0]
    year = text.find(u'年')
    month = text.find(u'月')
    day = text.find(u'日')
    # A gap of exactly two positions between the markers means there is a
    # single digit in between, which needs a leading zero.
    if month - year == 2:
        text = text.replace(u"年", u"年0")
    if day - month == 2:
        text = text.replace(u"月", u"月0")
    return text.replace(u"日", "").replace(u"月", "-").replace(u"年", "-")

# 调整采标格式

def adopted_ls(string, ls):
    """Parse an adoption string into ``{"code": ..., "type": ...}`` entries.

    Input such as 'ISO 3408-1-2006,MOD ISO 3408-2-1991,MOD' is consumed one
    comma at a time: the text before the comma (stripped) becomes ``code`` and
    the three characters after it become ``type``.  Entries are appended to
    *ls*, which is also the return value; text after the last comma-delimited
    entry is ignored.
    """
    remaining = string
    while True:
        comma = remaining.find(',')
        if comma == -1:
            return ls
        entry = {}
        entry["code"] = remaining[:comma].strip()
        entry["type"] = remaining[comma + 1:comma + 4]
        ls.append(entry)
        # Skip the comma and the three-character type before continuing.
        remaining = remaining[comma + 4:]

# 构造标准入库字典

def standard_dict(html):
    """Parse the HTML source of one standard's page into a dict of fields.

    Most fields are located through a <span> whose text equals one of the
    labels in the module-level list ``s``; the raw XPath result is then
    normalised by the helper functions defined in this module.

    :param html: page source handed to ``etree.HTML``.
    :return: dict with the keys title, title_eng, standard_number,
        publishing_department, release_date, state, enforcement_date, pages,
        adopted, clc, ccs, ics, country, keywords, summary and replace_for.
    """
    tree = etree.HTML(html)

    # XPath templates; %s is filled with a label from ``s``.
    sibling = "//span[text()='%s']/following-sibling::*/text()"
    sibling_child = "//span[text()='%s']/following-sibling::*/child::*/text()"
    parent_sibling = "//span[text()='%s']/parent::*/following-sibling::*/text()"

    def labelled(template, index):
        # Text nodes found for the label stored at s[index].
        return tree.xpath(template % (s[index]))

    record = {}
    # Title and foreign-language title
    record["title"] = list_str(tree.xpath("//h1/text()"))
    record["title_eng"] = list_str(tree.xpath("//h2/text()"))
    # Standard number, issuing organisation, release date
    record["standard_number"] = list_str(labelled(sibling, 0))
    record["publishing_department"] = list_str(labelled(sibling, 1))
    record["release_date"] = date_str(labelled(sibling, 2))
    # Status, implementation date, format / page count
    record["state"] = list_str(labelled(sibling, 3))
    record["enforcement_date"] = date_str(labelled(sibling, 4))
    record["pages"] = list_str(labelled(sibling, 5))
    # Adoption relationship
    record["adopted"] = adopted_ls(list_str(labelled(sibling, 6)), [])
    # Classification numbers (CLC, CCS, ICS)
    record["clc"] = code_ls(labelled(sibling, 7))
    record["ccs"] = code_ls(labelled(sibling_child, 8))
    record["ics"] = code_ls(labelled(sibling, 9))
    # Country, keywords, abstract
    record["country"] = list_str(labelled(sibling, 10))
    record["keywords"] = keywords_ls(labelled(sibling_child, 11))
    record["summary"] = summary_str(labelled(parent_sibling, 12))
    # Replaced standard
    record["replace_for"] = replace_str(
        tree.xpath("//div[@id='replaceStandard']//child::*//text()"))
    return record

# 主函数

def main():
    """Clean every document of ``standard`` into ``standard_cleaned``.

    Each source document that has a non-empty ``content`` field is parsed with
    standard_dict() and the resulting dict is inserted into the target
    collection; documents without content are skipped.
    """
    db = get_db()
    source = db.standard
    target = db.standard_cleaned
    for item in source.find({}, {"content": 1, "_id": 0}):
        # .get() replaces dict.has_key(), which no longer exists in Python 3.
        content = item.get('content')
        if content:
            # NOTE(review): Collection.insert() only exists in older PyMongo
            # releases (insert_one() is its successor) -- confirm the driver
            # version before changing this call.
            target.insert(standard_dict(content))

# Script entry point: run the full cleaning pass when executed directly.
if __name__ == '__main__':

    main()

    # The code below is for testing the cleaning of one specific record

    # db = get_db()

    # collection=db.standard

    # collection2 = db.standard_cleaned

    # data = get_data(collection, 8)

    # dc = standard_dict(data)

    # collection2.insert(dc)

    # for k,v in dc.items():

    #     print k,v

    # # The code below is for testing the extraction of the abstract

    # data = requests.get('http://d.wanfangdata.com.cn/Standard/ISO%208528-5-2013')

    # dc = standard_dict(data.text)

    # for k,v in dc.items():

    #     print k,v

    # # The code below is for testing the date-format conversion

    # l1 = [u"2017年6月28日"]

    # l2 = [u"2017年10月27日"]

    # l3 = [u"2017年12月1日"]

    # l4 = [u"2017年7月1日"]

    # print date_str(l1)

    # print date_str(l2)

    # print date_str(l3)

    # print date_str(l4)

Python 自用代码（某方标准类网页源代码清洗）的更多相关文章

Python 自用代码（知网会议论文网页源代码清洗）
#coding=utf-8 from pymongo import MongoClient from lxml import etree import requests jigou = u" ...
Python 自用代码（递归清洗采标情况）
将‘ISO 3408-1-2006,MOD ISO 3408-2-1991,MOD ISO 3408-3-2006,MOD’类似格式字符串存为: [{'code': 'ISO 3408-1-200 ...
Python 自用代码（调整日期格式）
2017年6月28日 to 2017-06-28；2017年10月27日 to 2017-10-27；2017年12月1日 to 2017-12-01；2017年7月1日 to 2017-07-01 #co ...
Python 自用代码（拆分txt文件）
现有一个28G的txt文件,里面每一行是一个分词过的专利全文文档,一共370多万行.我需要把它按每五万行为单位做成一个json文件,格式大致如下: [{"id":"100 ...
Python 自用代码（scrapy多级页面(三级页面)爬虫）
2017-03-28 入职接到的第一个小任务,scrapy多级页面爬虫,从来没写过爬虫,也没学过scrapy,甚至连xpath都没用过,最后用了将近一周才搞定.肯定有很多low爆的地方,希望大家可以给 ...
（Python）自动生成代码（方法一）
在写某个平台的自动化脚本时,笔者把全部的操作都封装到了两个类中,page.py和commonpage.py: page.py部分代码: class BasePage(object): ''' 页面基础 ...
python代码规范与标准库参考
python代码规范与标准库参考 python代码规范参考文献: http://www.runoob.com/w3cnote/google-python-styleguide.html https:/ ...
【代码笔记】Java基础：Java的方法和类
面向过程与面向对象都是我们编程中,编写程序的一种思维方式.例如:公司打扫卫生(擦玻璃.扫地.拖地.倒垃圾等), 按照面向过程的程序设计方式会思考“打扫卫生我该怎么做,然后一件件的完成”,最后把公司卫生 ...
python开发面向对象基础：接口类&抽象类&多态&钻石继承
一,接口类继承有两种用途: 一:继承基类的方法,并且做出自己的改变或者扩展(代码重用) 二:声明某个子类兼容于某基类,定义一个接口类Interface,接口类中定义了一些接口名(就是函数名)且并未实 ...

随机推荐

操作MySQL数据库相关代码
注意事项: 1.导入驱动包,如我导的是mysql-connector-java-5.1.26-bin.jar 2.修改下面代码中的用户名和密码 3.其中URL为"jdbc:mysql://数 ...
JSTL c:url
c:url 标签 jstl 实例代码和用法. <c:url>标记格式化成一个字符串格式的URL,并将其存储到变量中.这个标签会在必要时自动执行URL重写. var属性指定的变量将包 ...
Webpack指南（一）：安装，创建项目，配置文件，开发环境以及问题汇总
Webpack是一个现代 JavaScript 应用程序的静态模块打包器(module bundler).当 webpack 处理应用程序时,它会递归地构建一个依赖关系图(dependency gra ...
MMSEG 中文分词算法翻译
算法原文位于:http://technology.chtsai.org/mmseg/ http://www.360doc.com/content/13/0217/15/11619026_2661428 ...
k8s的service
1.service简介本节开始学习 Service.我们不应该期望 Kubernetes Pod 是健壮的,而是要假设 Pod 中的容器很可能因为各种原因发生故障而死掉.Deployment 等 c ...
jquery禁用select和取消禁用
$("#id").attr("disabled","disabled"); $("#id").removeAttr(&q ...
System Center VMM请注意不同语言版本的差异
在私有云的项目中,经常需要判断System Center一些组件的连接是否OK. 我这里有开发,和测试两个环境,开发是英文版的System Center VMM,测试用的是中文版的System Cen ...
codeforces Round #441 A Trip For Meal【思路/模拟】
A. Trip For Meal time limit per test 1 second memory limit per test 512 megabytes input standard inp ...
使用Nginx+uWSGI部署Django项目
1.linux安装python3环境参考链接:https://www.cnblogs.com/zzqit/p/10087680.html 2.安装uwsgi pip3 install uwsgi l ...
洛谷——P1405 苦恼的小明
P1405 苦恼的小明题目描述黄小明和他的合伙人想要创办一所英语培训机构,注册的时候要填一张个人情况的表格,在身高一栏小明犯了愁. 身高要求精确到厘米,但小明实在太高了,无法在纸上填下这么长的数字 ...

Python 自用代码（某方标准类网页源代码清洗）

Python 自用代码（某方标准类网页源代码清洗）的更多相关文章

随机推荐

热门专题