import os
import re
import urllib.request

from bs4 import BeautifulSoup as bs

'''
Scrape EDIFACT message definitions from http://www.stylusstudio.com/edifact/.
Written for directory releases 95B-96B; the __main__ block below walks 00A-04B
(a single release used to be set by hand, e.g. year = '97A', ss = "./data/%s/" % year).

Failed page requests are handled by retrying until urlopen succeeds:

    resp = None
    while resp is None:
        try:
            resp = urllib.request.urlopen("http://baidu.com")
        except:
            pass
'''
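# The retry loop above is repeated inline in every scraping function below.
# A minimal reusable sketch (an addition, not used by the original code;
# `max_tries` is hypothetical) -- retrying forever, as the inline loops do,
# spins indefinitely on a permanently dead URL:
def fetch_html(url, encoding='UTF-8', max_tries=None):
    """Fetch `url`, retrying on failure, and decode it with `encoding`."""
    tries = 0
    while True:
        try:
            return urllib.request.urlopen(url).read().decode(encoding)
        except OSError:  # urllib.error.URLError subclasses OSError
            tries += 1
            if max_tries is not None and tries >= max_tries:
                raise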
def b0_trmd(year, ss):
    # Scrape the message index for one directory release. Writes the raw cell
    # text (trmd1), a "TAG,year" list (b0) and a "TAG,name" list (trmd), and
    # returns the list of message tags found.
    if not os.path.exists(ss):
        os.makedirs(ss)
    p1 = r"^([A-Z]{6})"  # message tags are six capital letters, e.g. APERAK
    url = "http://www.stylusstudio.com/edifact/D%s/messages.htm" % year
    resp = None
    while resp is None:  # retry until the request succeeds
        try:
            resp = urllib.request.urlopen(url)
        except:
            pass
    data = resp.read().decode('cp852')
    soup = bs(data, 'html.parser')
    segment11 = soup.find_all('table')  # ResultSet of every table on the page
    # The message list is the first table; drop its leading cell.
    segment1 = segment11[0].find_all('td')[1:]
    f2 = open(ss + 'trmd1%s.txt' % year, 'a', encoding='utf-8')
    f3 = open(ss + 'b0%s.txt' % year, 'a', encoding='utf-8')
    f4 = open(ss + 'trmd%s.txt' % year, 'a', encoding='utf-8')
    pattern1 = re.compile(p1)
    tag_list = []
    for item in segment1:
        # .string is the cell's text only while the cell has a single child;
        # a multi-child cell marks the end of the message list, so stop there.
        str1 = item.get_text()
        if item.string is None:
            break
        matcher1 = re.findall(pattern1, str1)
        if matcher1:  # cell holds a message tag
            f3.write(matcher1[0] + ',' + year + '\n')
            tag_list.append(matcher1[0])
            f4.write(matcher1[0] + ',')
        else:  # the following cell holds the message name
            f4.write(str1 + '\n')
        f2.writelines(str1 + '\n')
    f2.close()
    f3.close()
    f4.close()
    return tag_list
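# Hypothetical usage sketch (the tag values are illustrative, not verified
# output for any particular release):
#   tags = b0_trmd('95B', './data/95B/')
#   # writes ./data/95B/trmd195B.txt, b095B.txt and trmd95B.txt
#   # tags is e.g. ['APERAK', 'CODECO', ...] -- one entry per message page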
def test1(code_tag, year, ss):
    # Scrape one message page and dump its segment tree as plain text.
    url = "http://www.stylusstudio.com/edifact/D%s/%s.htm" % (year, code_tag)
    resp = None
    while resp is None:
        try:
            resp = urllib.request.urlopen(url)
        except:
            pass
    data = resp.read().decode('UTF-8')
    soup = bs(data, 'html.parser')
    segment11 = soup.find_all('table')
    # The segment tree is the seventh table on the page (index 6).
    segment1 = segment11[6].find_all('tr')
    f2 = open(ss + 'text1%s%s.txt' % (year, code_tag), 'a', encoding='cp852')
    for item in segment1:
        '''
        A tree row looks like:
        <tr class="FrameTreeFont"><td><span class="FrameDrawFont">│
        <span class="FrameHideFont">─</span>│<span class="FrameHideFont">─</span>├─</span>
        <a class="FrameItemFont" href="DAM_.htm" target="classFrame" title="Damage">DAM</a>
        Damage</td><td align="right"><span class="FrameDetailFont"> ×1 
        </span></td><td><span class="FrameDetailFont">(M)</span></td></tr>

        which get_text() flattens to:
        │─│─├─DAM Damage ×1 (M)
        (item.stripped_strings would instead yield
        ['│', '─', '│', '─', '├─', 'DAM', 'Damage', '×1', '(M)'])
        '''
        str12 = item.get_text()
        f2.writelines(str12 + '\n')
    f2.close()
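# The hard-coded segment11[6] assumes the tree is always the seventh table.
# A throwaway sketch (not called by the pipeline) to re-check the index if
# the site layout shifts:
def find_tree_table(url):
    """Print each table's index next to a snippet of its text."""
    data = urllib.request.urlopen(url).read().decode('UTF-8')
    for idx, table in enumerate(bs(data, 'html.parser').find_all('table')):
        print(idx, table.get_text()[:60].replace('\n', ' '))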
def test2(code_tag, year, ss):
    # Re-parse the tree text: prefix every line with the number of the segment
    # group it belongs to. Nesting depth d shows up as 2*d leading non-word
    # drawing characters (│ ─ ├ └), so depth d is matched by ^\W{2d}(\w.+).
    patterns = [re.compile(r"^\W{%d}(\w.+)\n" % (2 * d)) for d in range(1, 9)]
    # {1,2} captures one or two digits greedily; the original alternation
    # [0-9]|[0-9][0-9] tried the single digit first and captured only the
    # '1' of e.g. "Segment Group 12".
    pattern9 = re.compile(r"Segment\sGroup\s([0-9]{1,2})")
    f1 = open(ss + 'text1%s%s.txt' % (year, code_tag), 'r', encoding='cp852')
    f2 = open(ss + 'text2%s%s.txt' % (year, code_tag), 'a', encoding='utf-8')
    # listp[d] holds the number of the segment group last opened at depth d;
    # lines at depth d+1 are its children. Nine slots: the original list had
    # eight, so listp[8] raised IndexError on trees nested eight levels deep.
    listp = [0] * 9
    for line in f1.readlines():
        matcher9 = re.findall(pattern9, line)
        for depth, pattern in enumerate(patterns, start=1):
            matcher = re.findall(pattern, line)
            if matcher:  # a line matches exactly one depth pattern
                f2.write('SG' + str(listp[depth - 1]) + ' ' + matcher[0] + '\n')
                if matcher9:  # this line opens a new group at this depth
                    listp[depth] = matcher9[0]
                break
    f2.close()
    f1.close()
    # Second pass: the group headers themselves still read "Segment Group N";
    # shorten them to "SGN" so test3 sees a uniform three-character label.
    f3 = open(ss + 'text3%s%s.txt' % (year, code_tag), 'w', encoding='utf-8')
    f4 = open(ss + 'text2%s%s.txt' % (year, code_tag), 'r', encoding='utf-8')
    for line1 in f4.readlines():
        f3.write(line1.replace("Segment Group ", "SG"))
    f4.close()
    f3.close()
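# Worked example for the depth patterns (values from the sample row above):
#   line = '│─│─├─DAM Damage ×1 (M)\n'
#   re.findall(r"^\W{6}(\w.+)\n", line)  # -> ['DAM Damage ×1 (M)'], depth 3
#   re.findall(r"^\W{4}(\w.+)\n", line)  # -> [] -- only one depth matches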
def test3(code_tag, year, ss):
    # Pull (group, segment, repeat count, mandatory/conditional) tuples out of
    # the SG-prefixed tree; return the number of lines read, which test5 uses
    # as its paragraph budget.
    f5 = open(ss + 'text4%s%s.txt' % (year, code_tag), 'a', encoding='utf-8')
    f6 = open(ss + 'text3%s%s.txt' % (year, code_tag), 'r', encoding='utf-8')
    p10 = r"(^\w{3})\s(\w{3}).+×([0-9]{1,5})\s\((\w)\)$"
    pattern10 = re.compile(p10)
    i = 0
    for line2 in f6.readlines():
        i = i + 1
        matcher10 = re.findall(pattern10, line2)
        if matcher10:
            f5.write(str(matcher10[0]) + '\n')
    f5.close()
    f6.close()
    return i
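# Worked example for p10: a (hypothetical) text3 line
#   'SG0 UNH Message header ×1 (M)\n'
# yields [('SG0', 'UNH', '1', 'M')] -- group, segment, count, M/C status.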
def test4(code_tag, year, ss):
    # Scrape the same message page again, this time dumping every <p> element
    # (the per-segment descriptions) as plain text.
    url = "http://www.stylusstudio.com/edifact/D%s/%s.htm" % (year, code_tag)
    resp = None
    while resp is None:
        try:
            resp = urllib.request.urlopen(url)
        except:
            pass
    data = resp.read().decode('UTF-8')
    soup = bs(data, 'html.parser')
    segment11 = soup.find_all('p')
    f2 = open(ss + 'text5%s%s.txt' % (year, code_tag), 'a', encoding='utf-8')
    for item in segment11:
        str12 = item.get_text()
        f2.writelines(str12 + '\n')
    f2.close()
def test5(code_tag, num, year, ss):
    # Filter the <p> dump down to the per-segment description paragraphs.
    f7 = open(ss + 'text6%s%s.txt' % (year, code_tag), 'a', encoding='utf-8')
    f8 = open(ss + 'text5%s%s.txt' % (year, code_tag), 'r', encoding='utf-8')
    # p1 spots the first real description ("A service segment ..."); p2 then
    # keeps every later paragraph except known boilerplate notes.
    p1 = r"(^A\sservice\ssegment.+\n)"
    p2 = r"(^(?!Information.+\:|Note|It\sis\srecommended\sthat\swhere|ID\sshould\sbe\sspecified|All\sother\ssegments|A\sgroup\sof\ssegments\sthat\scontains\sa\sline\sitem\sand\sits\srelated\sinformation.+should\sbe\sconsigned.).+\n)"
    pattern1 = re.compile(p1)
    pattern2 = re.compile(p2)
    flag = 0  # 0: before the first description; 1/2: collecting
    i = num   # budget: one paragraph per tree row counted by test3
    for line3 in f8.readlines():
        matcher1 = re.findall(pattern1, line3)
        matcher2 = re.findall(pattern2, line3)
        if matcher1 and flag == 0:
            f7.write(matcher1[0])
            flag = 1
            i = i - 1
            if i == 0:
                break
            continue
        if matcher2 and (flag == 1 or flag == 2):
            f7.write(matcher2[0])
            flag = 2
            i = i - 1
            continue
    f7.close()
    f8.close()
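# Sketch of the filter on hypothetical paragraphs (the UNH wording below is
# the usual EDIFACT phrasing, not verified against every directory):
#   'A service segment starting and uniquely identifying a message.\n'
#       -> matched by p1; starts collection (flag 0 -> 1)
#   'Note: continued on the next page.\n'
#       -> rejected by p2's negative look-ahead (the "Note" alternative)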
def join(code_tag, year, ss):
    # Merge the structure tuples (text4) with the descriptions (text6) into
    # one CSV row per segment.
    f1 = open(ss + 'text6%s%s.txt' % (year, code_tag), 'r', encoding='utf-8')
    f2 = open(ss + 'text4%s%s.txt' % (year, code_tag), 'r', encoding='utf-8')
    list_note = []
    for line1 in f1:
        list_note.append(line1)
    f1.close()
    # text4 lines are stringified tuples like ('SG0', 'UNH', '1', 'M');
    # the four patterns pick the fields back out positionally.
    p11 = r"^\W{2}(\w{3}).+\n"                                      # group label
    p12 = r"^\W{2}\w{3}\W{2}\s\W(\w{3}).+\n"                        # segment code
    p13 = r"^\W{2}\w{3}\W{2}\s\W\w{3}\W{2}\s\W([0-9]{1,5})\W.+\n"   # repeat count
    p14 = r"\W{2}\w{3}\W{2}\s\W\w{3}\W{2}\s\W.+(C|M)"               # C/M status
    f2_w = open(ss + 'b1%s%s.txt' % (year, code_tag), 'a', encoding='utf-8')
    f3_w = open(ss + 'b1%s.csv' % year, 'a', encoding='utf-8')
    i = 0
    pattern11 = re.compile(p11)
    pattern12 = re.compile(p12)
    pattern13 = re.compile(p13)
    pattern14 = re.compile(p14)
    # The first CSV column is reserved and left empty for every row.
    pos = [''] * 385
    for line2 in f2:
        matcher11 = re.findall(pattern11, line2)
        matcher12 = re.findall(pattern12, line2)
        matcher13 = re.findall(pattern13, line2)
        matcher14 = re.findall(pattern14, line2)
        try:
            str11 = "%s,%s,%s,%s,%s,%s,%s,\"%s\"\n" % (
                pos[i], code_tag, matcher12[0], matcher11[0], year,
                matcher14[0], matcher13[0], list_note[i].strip('\n'))
            i = i + 1
            f2_w.write(str11)
            f3_w.write(str11)
        except Exception:
            # A tuple line and its note went out of step; stop this message.
            print("---error---")
            break
    f2_w.close()
    f3_w.close()
    f2.close()
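# Hypothetical output row (columns: pos, message, segment, group, release,
# status, count, description; all values illustrative):
#   ,CODECO,UNH,SG0,95B,M,1,"A service segment starting and uniquely identifying a message."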
def test():
    # Scratch helper: fetch one page (D95B CODECO) and dump its text locally.
    filename = './codeco.txt'  # raw page could be saved here (write disabled)
    url = "http://www.stylusstudio.com/edifact/D95B/CODECO.htm"
    resp = urllib.request.urlopen(url)
    data = resp.read().decode('UTF-8')
    f2 = open('./text.txt', 'a')
    soup = bs(data, 'html.parser')
    segment1 = soup.find_all('h4')
    segment2 = soup.find_all('p')
    for item in segment1:
        f2.writelines(str([text for text in item.stripped_strings]) + '\n')
    for item in segment2:
        f2.writelines(str([text for text in item.stripped_strings]) + '\n')
    f2.close()
if __name__ == '__main__':
    # '97A', '97B', '98A', '98B', '99A', '99B' were handled in an earlier run.
    year1 = ['00A', '00B', '01A', '01B', '01C', '02A', '02B', '03A', '03B', '04A', '04B']
    for year in year1:
        ss = "./data/%s/" % year
        tag = b0_trmd(year, ss)
        print(tag)
        for i in range(len(tag)):
            test1(tag[i], year, ss)
            test2(tag[i], year, ss)
            num = test3(tag[i], year, ss)
            test4(tag[i], year, ss)
            test5(tag[i], num, year, ss)
            join(tag[i], year, ss)
            print("------%s-----ok" % i)
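# Per-tag file pipeline, for reference:
#   text1 (raw tree) -> text2/text3 (SG-prefixed tree) -> text4 (tuples)
#   text5 (all <p> text) -> text6 (filtered descriptions)
#   text4 + text6 -> b1<year><tag>.txt and the combined b1<year>.csv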
