import urllib.request
import re
import os
from bs4 import BeautifulSoup as bs

# year = '97A'
# ss = "./data/%s/" % year
'''
Adapted for scraping the 95B-96B release pages.
'''
'''
Work around failed page requests by retrying until one succeeds:
    resp = None
    while resp is None:
        try:
            resp = urllib.request.urlopen("http://baidu.com")
        except Exception:
            pass
'''
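# The snippet above is repeated verbatim in several functions below. As a
# minimal sketch (a hypothetical helper, not called by the original script),
# the retries can be bounded and spaced out instead of spinning forever:
import time

def fetch(url, retries=5, delay=2.0):
    """Fetch `url`, retrying up to `retries` times before giving up."""
    for _ in range(retries):
        try:
            return urllib.request.urlopen(url)
        except Exception:
            time.sleep(delay)  # back off briefly before the next attempt
    raise RuntimeError("failed to fetch %s after %d attempts" % (url, retries))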
def b0_trmd(year, ss):
    if not os.path.exists(ss):
        os.makedirs(ss)
    p1 = r"^([A-Z]{6})"  # EDIFACT message codes are six capital letters
    url = "http://www.stylusstudio.com/edifact/D%s/messages.htm" % year
    resp = None
    while resp is None:  # retry until the request succeeds
        try:
            resp = urllib.request.urlopen(url)
        except Exception:
            pass
    data = resp.read().decode('cp852')
    soup = bs(data, 'html.parser')
    segment11 = soup.find_all('table')  # ResultSet of every table on the page
    segment1 = segment11[0].find_all('td')[1:]  # cells of the message table, minus the header cell
    f2 = open(ss + 'trmd1%s.txt' % year, 'a', encoding='utf-8')
    f3 = open(ss + 'b0%s.txt' % year, 'a', encoding='utf-8')
    f4 = open(ss + 'trmd%s.txt' % year, 'a', encoding='utf-8')
    pattern1 = re.compile(p1)
    tag_list = []
    for item in segment1:
        # .string returns a tag's text only when the tag contains no nested
        # tags (or exactly one); otherwise it is None.
        str1 = item.get_text()
        if item.string is None:
            break
        matcher1 = re.findall(pattern1, str1)
        if matcher1:
            f3.write(matcher1[0] + ',' + year + '\n')
            tag_list.append(matcher1[0])
            f4.write(matcher1[0] + ',')
        else:
            f4.write(str1 + '\n')
        f2.writelines(str1 + '\n')
    f2.close()
    f3.close()
    f4.close()
    return tag_list
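# b0<year>.txt thus collects one "CODE,year" line per message (for example a
# row such as "APERAK,00A" — illustrative, not taken from a real run), and the
# returned tag_list drives the per-message scraping in __main__.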
def test1(code_tag, year, ss):
    url = "http://www.stylusstudio.com/edifact/D%s/%s.htm" % (year, code_tag)
    resp = None
    while resp is None:
        try:
            resp = urllib.request.urlopen(url)
        except Exception:
            pass
    data = resp.read().decode('UTF-8')
    soup = bs(data, 'html.parser')
    segment11 = soup.find_all('table')
    segment1 = segment11[6].find_all('tr')  # the 7th table holds the segment tree
    f2 = open(ss + 'text1%s%s.txt' % (year, code_tag), 'a', encoding='cp852')
    for item in segment1:
        '''
        Each row looks like:
        <tr class="FrameTreeFont"><td><span class="FrameDrawFont">│
        <span class="FrameHideFont">─</span>│<span class="FrameHideFont">─</span>├─</span>
        <a class="FrameItemFont" href="DAM_.htm" target="classFrame" title="Damage">DAM</a>
        Damage</td><td align="right"><span class="FrameDetailFont"> ×1 
        </span></td><td><span class="FrameDetailFont">(M)</span></td></tr>

        get_text() flattens it to plain text:
            │─│─├─DAM Damage ×1 (M)
        and item.stripped_strings yields the pieces as a list:
            ['│', '─', '│', '─', '├─', 'DAM', 'Damage', '×1', '(M)']
        soup.get_text("|") would join the pieces with '|';
        soup.get_text("|", strip=True) additionally strips the leading and
        trailing whitespace of each piece.
        '''
        str12 = item.get_text()
        f2.writelines(str12 + '\n')
    f2.close()
def test2(code_tag, year, ss):
    # p1..p8 match tree lines at nesting depths 1..8; each level is indented
    # by two more box-drawing characters (│ ─ ├ └), matched here with \W.
    p1 = r"^\W{2}(\w.+)\n"
    p2 = r"^\W{4}(\w.+)\n"
    p3 = r"^\W{6}(\w.+)\n"
    p4 = r"^\W{8}(\w.+)\n"
    p5 = r"^\W{10}(\w.+)\n"
    p6 = r"^\W{12}(\w.+)\n"
    p7 = r"^\W{14}(\w.+)\n"
    p8 = r"^\W{16}(\w.+)\n"
    # p9 captures the number of a "Segment Group N" line (one or two digits).
    p9 = r"Segment\sGroup\s(?:([0-9]|[0-9][0-9]))"
    pattern1 = re.compile(p1)
    pattern2 = re.compile(p2)
    pattern3 = re.compile(p3)
    pattern4 = re.compile(p4)
    pattern5 = re.compile(p5)
    pattern6 = re.compile(p6)
    pattern7 = re.compile(p7)
    pattern8 = re.compile(p8)
    pattern9 = re.compile(p9)
    f1 = open(ss + 'text1%s%s.txt' % (year, code_tag), 'r', encoding='cp852')
    f2 = open(ss + 'text2%s%s.txt' % (year, code_tag), 'a', encoding='utf-8')
    # listp records the parent segment-group number for each depth. Nine
    # slots are needed: a group line at depth 8 writes the parent for depth 9.
    listp = [0] * 9
    for line in f1.readlines():
        matcher1 = re.findall(pattern1, line)
        matcher2 = re.findall(pattern2, line)
        matcher3 = re.findall(pattern3, line)
        matcher4 = re.findall(pattern4, line)
        matcher5 = re.findall(pattern5, line)
        matcher6 = re.findall(pattern6, line)
        matcher7 = re.findall(pattern7, line)
        matcher8 = re.findall(pattern8, line)
        matcher9 = re.findall(pattern9, line)
        if matcher1:
            f2.write('SG' + str(listp[0]) + ' ' + matcher1[0] + '\n')
            if matcher9:
                listp[1] = matcher9[0]
        if matcher2:
            f2.write('SG' + str(listp[1]) + ' ' + matcher2[0] + '\n')
            if matcher9:
                listp[2] = matcher9[0]
        if matcher3:
            f2.write('SG' + str(listp[2]) + ' ' + matcher3[0] + '\n')
            if matcher9:
                listp[3] = matcher9[0]
        if matcher4:
            f2.write('SG' + str(listp[3]) + ' ' + matcher4[0] + '\n')
            if matcher9:
                listp[4] = matcher9[0]
        if matcher5:
            f2.write('SG' + str(listp[4]) + ' ' + matcher5[0] + '\n')
            if matcher9:
                listp[5] = matcher9[0]
        if matcher6:
            f2.write('SG' + str(listp[5]) + ' ' + matcher6[0] + '\n')
            if matcher9:
                listp[6] = matcher9[0]
        if matcher7:
            f2.write('SG' + str(listp[6]) + ' ' + matcher7[0] + '\n')
            if matcher9:
                listp[7] = matcher9[0]
        if matcher8:
            f2.write('SG' + str(listp[7]) + ' ' + matcher8[0] + '\n')
            if matcher9:
                listp[8] = matcher9[0]
    f2.close()
    f1.close()
    # Second pass: shorten "Segment Group N" to "SGN".
    f3 = open(ss + 'text3%s%s.txt' % (year, code_tag), 'w', encoding='utf-8')
    f4 = open(ss + 'text2%s%s.txt' % (year, code_tag), 'r', encoding='utf-8')
    for line1 in f4.readlines():
        f3.write(line1.replace("Segment Group ", "SG"))
    f4.close()
    f3.close()
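# The eight nearly identical branches in test2 differ only in depth. A more
# compact version of the same idea could look like the sketch below; this
# helper (write_tree_lines) is hypothetical and is not called by the script.
def write_tree_lines(lines, out, max_depth=8):
    """Prefix each tree line with the SG number of its parent group."""
    group_re = re.compile(r"Segment\sGroup\s([0-9]{1,2})")
    parents = [0] * (max_depth + 1)  # parent group number per depth
    for line in lines:
        for depth in range(1, max_depth + 1):
            m = re.findall(r"^\W{%d}(\w.+)\n" % (2 * depth), line)
            if m:
                out.write('SG' + str(parents[depth - 1]) + ' ' + m[0] + '\n')
                g = group_re.findall(line)
                if g:
                    # this line opens a group: record it as the parent
                    # of lines one level deeper
                    parents[depth] = g[0]
                break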
def test3(code_tag, year, ss):
    f5 = open(ss + 'text4%s%s.txt' % (year, code_tag), 'a', encoding='utf-8')
    f6 = open(ss + 'text3%s%s.txt' % (year, code_tag), 'r', encoding='utf-8')
    # Capture the group tag, the segment code, the repeat count (after ×)
    # and the mandatory/conditional flag in parentheses.
    p10 = r"(^\w{3})\s(\w{3}).+×([0-9]|[0-9]{2}|[0-9]{3}|[0-9]{4}|[0-9]{5})\s\((\w)\)$"
    pattern10 = re.compile(p10)
    i = 0
    for line2 in f6.readlines():
        i = i + 1
        matcher10 = re.findall(pattern10, line2)
        if matcher10:
            f5.write(str(matcher10[0]) + '\n')
    f5.close()
    f6.close()
    return i  # total number of lines read; bounds test5 below
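# For illustration: a text3 line such as "SG2 DAM Damage ×1 (M)" (a made-up
# sample in the shape test2 produces) yields the tuple
# ('SG2', 'DAM', '1', 'M'), which join() later splits back apart.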
def test4(code_tag, year, ss):
    url = "http://www.stylusstudio.com/edifact/D%s/%s.htm" % (year, code_tag)
    resp = None
    while resp is None:
        try:
            resp = urllib.request.urlopen(url)
        except Exception:
            pass
    data = resp.read().decode('UTF-8')
    soup = bs(data, 'html.parser')
    segment11 = soup.find_all('p')  # the <p> elements hold the segment notes
    f2 = open(ss + 'text5%s%s.txt' % (year, code_tag), 'a', encoding='utf-8')
    for item in segment11:
        str12 = item.get_text()  # each paragraph as plain text
        f2.writelines(str12 + '\n')
    f2.close()
def test5(code_tag, num, year, ss):
    f7 = open(ss + 'text6%s%s.txt' % (year, code_tag), 'a', encoding='utf-8')
    f8 = open(ss + 'text5%s%s.txt' % (year, code_tag), 'r', encoding='utf-8')
    # p1 matches the first "A service segment ..." note; p2 then accepts every
    # following note line that is not one of the known noise lines.
    p1 = r"(^A\sservice\ssegment.+\n)"
    p2 = r"(^(?!Information.+\:|Note|It\sis\srecommended\sthat\swhere|ID\sshould\sbe\sspecified|All\sother\ssegments|A\sgroup\sof\ssegments\sthat\scontains\sa\sline\sitem\sand\sits\srelated\sinformation.+should\sbe\sconsigned.).+\n)"
    pattern1 = re.compile(p1)
    pattern2 = re.compile(p2)
    flag = 0
    i = num  # copy at most `num` note lines (one per tree line)
    for line3 in f8.readlines():
        matcher1 = re.findall(pattern1, line3)
        matcher2 = re.findall(pattern2, line3)
        if matcher1 and flag == 0:
            f7.write(matcher1[0])
            flag = 1
            i = i - 1
            if i == 0:
                break
            continue
        if matcher2 and (flag == 1 or flag == 2):
            f7.write(matcher2[0])
            flag = 2
            i = i - 1
            continue
    f7.close()
    f8.close()

def join(code_tag, year, ss):
    f1 = open(ss + 'text6%s%s.txt' % (year, code_tag), 'r', encoding='utf-8')
    f2 = open(ss + 'text4%s%s.txt' % (year, code_tag), 'r', encoding='utf-8')
    list_note = []
    for line1 in f1:
        list_note.append(line1)
    f1.close()
    # text4 lines are tuples rendered as strings, e.g. ('SG2', 'DAM', '1', 'M');
    # the patterns below pull the four fields back out of that representation.
    p11 = r"^\W{2}(\w{3}).+\n"                    # group tag
    p12 = r"^\W{2}\w{3}\W{2}\s\W(\w{3}).+\n"      # segment code
    p13 = r"^\W{2}\w{3}\W{2}\s\W\w{3}\W{2}\s\W([0-9]|[0-9]{2}|[0-9]{3}|[0-9]{4}|[0-9]{5})\W.+\n"  # repeat count
    p14 = r"\W{2}\w{3}\W{2}\s\W\w{3}\W{2}\s\W.+(C|M)"  # conditional/mandatory flag
    f2_w = open(ss + 'b1%s%s.txt' % (year, code_tag), 'a', encoding='utf-8')
    f3_w = open(ss + 'b1%s.csv' % year, 'a', encoding='utf-8')
    i = 0
    pattern11 = re.compile(p11)
    pattern12 = re.compile(p12)
    pattern13 = re.compile(p13)
    pattern14 = re.compile(p14)
    pos = [''] * 385  # placeholder first column, one empty slot per possible row
    for line2 in f2:
        matcher11 = re.findall(pattern11, line2)
        matcher12 = re.findall(pattern12, line2)
        matcher13 = re.findall(pattern13, line2)
        matcher14 = re.findall(pattern14, line2)
        try:
            str11 = "%s,%s,%s,%s,%s,%s,%s,\"%s\"\n" % (
                pos[i], code_tag, matcher12[0], matcher11[0], year,
                matcher14[0], matcher13[0], list_note[i].strip('\n'))
            i = i + 1
            f2_w.write(str11)
            f3_w.write(str11)
        except Exception:
            # a tree line without a matching note (or vice versa): stop here
            print("---error---")
            break
    f2_w.close()
    f3_w.close()
    f2.close()

def test():
    # One-off helper used during development: scrape a single page and save
    # the extracted text locally.
    filename = './codeco.txt'  # local copy (the write is commented out below)
    url = "http://www.stylusstudio.com/edifact/D95B/CODECO.htm"
    resp = urllib.request.urlopen(url)
    data = resp.read().decode('UTF-8')
    # f1 = open(filename, 'w')
    # f1.write(data)
    f2 = open('./text.txt', 'a')
    soup = bs(data, 'html.parser')
    segment1 = soup.find_all('h4')
    segment2 = soup.find_all('p')
    for item in segment1:
        f2.writelines(str([text for text in item.stripped_strings]) + '\n')
    for item in segment2:
        f2.writelines(str([text for text in item.stripped_strings]) + '\n')
    f2.close()
# Building a GET query with urllib.parse.urlencode:
# data = {'word': 'Jecvay Notes'}
# url_values = urllib.parse.urlencode(data)
# full_url = "http://www.baidu.com/s?" + url_values
# data = urllib.request.urlopen(full_url).read().decode('UTF-8')
# print(data)
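# A runnable version of the commented sketch above; the helper name and the
# search site are illustrative only, and it is not called by the script.
def search_url(word):
    """Return a search URL for `word`, built with urllib.parse.urlencode."""
    import urllib.parse
    return "http://www.baidu.com/s?" + urllib.parse.urlencode({'word': word})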
if __name__ == '__main__':
    # Earlier releases already scraped: '97A','97B','98A','98B','99A','99B'
    year1 = ['00A', '00B', '01A', '01B', '01C', '02A', '02B', '03A', '03B', '04A', '04B']
    for j in range(len(year1)):
        year = year1[j]
        ss = "./data/%s/" % year
        tag = b0_trmd(year, ss)
        print(tag)
        for i in range(len(tag)):
            test1(tag[i], year, ss)
            test2(tag[i], year, ss)
            num = test3(tag[i], year, ss)
            test4(tag[i], year, ss)
            test5(tag[i], num, year, ss)
            join(tag[i], year, ss)
            print("------%s-----ok" % i)
    # str1 = 'APERAK'
    # join(str1)
