import urllib.request
import re
import os
from bs4 import BeautifulSoup as bs

# year = '97A'
# ss = "./data/%s/" % year
'''
Adapted for scraping the 95B-96B release pages.
'''
'''
Work around failed page requests by retrying until one succeeds:
    resp = None
    while resp is None:
        try:
            resp = urllib.request.urlopen("http://baidu.com")
        except Exception:
            pass
'''
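# The snippet above is repeated verbatim in several functions below. As a
# minimal sketch (a hypothetical helper, not called by the original script),
# the retries can be bounded and spaced out instead of spinning forever:
import time

def fetch(url, retries=5, delay=2.0):
    """Fetch `url`, retrying up to `retries` times before giving up."""
    for _ in range(retries):
        try:
            return urllib.request.urlopen(url)
        except Exception:
            time.sleep(delay)  # back off briefly before the next attempt
    raise RuntimeError("failed to fetch %s after %d attempts" % (url, retries))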
def b0_trmd(year, ss):
    if not os.path.exists(ss):
        os.makedirs(ss)
    p1 = r"^([A-Z]{6})"  # EDIFACT message codes are six capital letters
    url = "http://www.stylusstudio.com/edifact/D%s/messages.htm" % year
    resp = None
    while resp is None:  # retry until the request succeeds
        try:
            resp = urllib.request.urlopen(url)
        except Exception:
            pass
    data = resp.read().decode('cp852')
    soup = bs(data, 'html.parser')
    segment11 = soup.find_all('table')  # ResultSet of every table on the page
    segment1 = segment11[0].find_all('td')[1:]  # cells of the message table, minus the header cell
    f2 = open(ss + 'trmd1%s.txt' % year, 'a', encoding='utf-8')
    f3 = open(ss + 'b0%s.txt' % year, 'a', encoding='utf-8')
    f4 = open(ss + 'trmd%s.txt' % year, 'a', encoding='utf-8')
    pattern1 = re.compile(p1)
    tag_list = []
    for item in segment1:
        # .string returns a tag's text only when the tag contains no nested
        # tags (or exactly one); otherwise it is None.
        str1 = item.get_text()
        if item.string is None:
            break
        matcher1 = re.findall(pattern1, str1)
        if matcher1:
            f3.write(matcher1[0] + ',' + year + '\n')
            tag_list.append(matcher1[0])
            f4.write(matcher1[0] + ',')
        else:
            f4.write(str1 + '\n')
        f2.writelines(str1 + '\n')
    f2.close()
    f3.close()
    f4.close()
    return tag_list
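# b0<year>.txt thus collects one "CODE,year" line per message (for example a
# row such as "APERAK,00A" — illustrative, not taken from a real run), and the
# returned tag_list drives the per-message scraping in __main__.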
def test1(code_tag, year, ss):
    url = "http://www.stylusstudio.com/edifact/D%s/%s.htm" % (year, code_tag)
    resp = None
    while resp is None:
        try:
            resp = urllib.request.urlopen(url)
        except Exception:
            pass
    data = resp.read().decode('UTF-8')
    soup = bs(data, 'html.parser')
    segment11 = soup.find_all('table')
    segment1 = segment11[6].find_all('tr')  # the 7th table holds the segment tree
    f2 = open(ss + 'text1%s%s.txt' % (year, code_tag), 'a', encoding='cp852')
    for item in segment1:
        '''
        Each row looks like:
        <tr class="FrameTreeFont"><td><span class="FrameDrawFont">│
        <span class="FrameHideFont">─</span>│<span class="FrameHideFont">─</span>├─</span>
        <a class="FrameItemFont" href="DAM_.htm" target="classFrame" title="Damage">DAM</a>
        Damage</td><td align="right"><span class="FrameDetailFont"> ×1 
        </span></td><td><span class="FrameDetailFont">(M)</span></td></tr>

        get_text() flattens it to plain text:
            │─│─├─DAM Damage ×1 (M)
        and item.stripped_strings yields the pieces as a list:
            ['│', '─', '│', '─', '├─', 'DAM', 'Damage', '×1', '(M)']
        soup.get_text("|") would join the pieces with '|';
        soup.get_text("|", strip=True) additionally strips the leading and
        trailing whitespace of each piece.
        '''
        str12 = item.get_text()
        f2.writelines(str12 + '\n')
    f2.close()
def test2(code_tag, year, ss):
    # p1..p8 match tree lines at nesting depths 1..8; each level is indented
    # by two more box-drawing characters (│ ─ ├ └), matched here with \W.
    p1 = r"^\W{2}(\w.+)\n"
    p2 = r"^\W{4}(\w.+)\n"
    p3 = r"^\W{6}(\w.+)\n"
    p4 = r"^\W{8}(\w.+)\n"
    p5 = r"^\W{10}(\w.+)\n"
    p6 = r"^\W{12}(\w.+)\n"
    p7 = r"^\W{14}(\w.+)\n"
    p8 = r"^\W{16}(\w.+)\n"
    # p9 captures the number of a "Segment Group N" line (one or two digits).
    p9 = r"Segment\sGroup\s(?:([0-9]|[0-9][0-9]))"
    pattern1 = re.compile(p1)
    pattern2 = re.compile(p2)
    pattern3 = re.compile(p3)
    pattern4 = re.compile(p4)
    pattern5 = re.compile(p5)
    pattern6 = re.compile(p6)
    pattern7 = re.compile(p7)
    pattern8 = re.compile(p8)
    pattern9 = re.compile(p9)
    f1 = open(ss + 'text1%s%s.txt' % (year, code_tag), 'r', encoding='cp852')
    f2 = open(ss + 'text2%s%s.txt' % (year, code_tag), 'a', encoding='utf-8')
    # listp records the parent segment-group number for each depth. Nine
    # slots are needed: a group line at depth 8 writes the parent for depth 9.
    listp = [0] * 9
    for line in f1.readlines():
        matcher1 = re.findall(pattern1, line)
        matcher2 = re.findall(pattern2, line)
        matcher3 = re.findall(pattern3, line)
        matcher4 = re.findall(pattern4, line)
        matcher5 = re.findall(pattern5, line)
        matcher6 = re.findall(pattern6, line)
        matcher7 = re.findall(pattern7, line)
        matcher8 = re.findall(pattern8, line)
        matcher9 = re.findall(pattern9, line)
        if matcher1:
            f2.write('SG' + str(listp[0]) + ' ' + matcher1[0] + '\n')
            if matcher9:
                listp[1] = matcher9[0]
        if matcher2:
            f2.write('SG' + str(listp[1]) + ' ' + matcher2[0] + '\n')
            if matcher9:
                listp[2] = matcher9[0]
        if matcher3:
            f2.write('SG' + str(listp[2]) + ' ' + matcher3[0] + '\n')
            if matcher9:
                listp[3] = matcher9[0]
        if matcher4:
            f2.write('SG' + str(listp[3]) + ' ' + matcher4[0] + '\n')
            if matcher9:
                listp[4] = matcher9[0]
        if matcher5:
            f2.write('SG' + str(listp[4]) + ' ' + matcher5[0] + '\n')
            if matcher9:
                listp[5] = matcher9[0]
        if matcher6:
            f2.write('SG' + str(listp[5]) + ' ' + matcher6[0] + '\n')
            if matcher9:
                listp[6] = matcher9[0]
        if matcher7:
            f2.write('SG' + str(listp[6]) + ' ' + matcher7[0] + '\n')
            if matcher9:
                listp[7] = matcher9[0]
        if matcher8:
            f2.write('SG' + str(listp[7]) + ' ' + matcher8[0] + '\n')
            if matcher9:
                listp[8] = matcher9[0]
    f2.close()
    f1.close()
    # Second pass: shorten "Segment Group N" to "SGN".
    f3 = open(ss + 'text3%s%s.txt' % (year, code_tag), 'w', encoding='utf-8')
    f4 = open(ss + 'text2%s%s.txt' % (year, code_tag), 'r', encoding='utf-8')
    for line1 in f4.readlines():
        f3.write(line1.replace("Segment Group ", "SG"))
    f4.close()
    f3.close()
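# The eight nearly identical branches in test2 differ only in depth. A more
# compact version of the same idea could look like the sketch below; this
# helper (write_tree_lines) is hypothetical and is not called by the script.
def write_tree_lines(lines, out, max_depth=8):
    """Prefix each tree line with the SG number of its parent group."""
    group_re = re.compile(r"Segment\sGroup\s([0-9]{1,2})")
    parents = [0] * (max_depth + 1)  # parent group number per depth
    for line in lines:
        for depth in range(1, max_depth + 1):
            m = re.findall(r"^\W{%d}(\w.+)\n" % (2 * depth), line)
            if m:
                out.write('SG' + str(parents[depth - 1]) + ' ' + m[0] + '\n')
                g = group_re.findall(line)
                if g:
                    # this line opens a group: record it as the parent
                    # of lines one level deeper
                    parents[depth] = g[0]
                break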
def test3(code_tag, year, ss):
    f5 = open(ss + 'text4%s%s.txt' % (year, code_tag), 'a', encoding='utf-8')
    f6 = open(ss + 'text3%s%s.txt' % (year, code_tag), 'r', encoding='utf-8')
    # Capture the group tag, the segment code, the repeat count (after ×)
    # and the mandatory/conditional flag in parentheses.
    p10 = r"(^\w{3})\s(\w{3}).+×([0-9]|[0-9]{2}|[0-9]{3}|[0-9]{4}|[0-9]{5})\s\((\w)\)$"
    pattern10 = re.compile(p10)
    i = 0
    for line2 in f6.readlines():
        i = i + 1
        matcher10 = re.findall(pattern10, line2)
        if matcher10:
            f5.write(str(matcher10[0]) + '\n')
    f5.close()
    f6.close()
    return i  # total number of lines read; bounds test5 below
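# For illustration: a text3 line such as "SG2 DAM Damage ×1 (M)" (a made-up
# sample in the shape test2 produces) yields the tuple
# ('SG2', 'DAM', '1', 'M'), which join() later splits back apart.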
def test4(code_tag, year, ss):
    url = "http://www.stylusstudio.com/edifact/D%s/%s.htm" % (year, code_tag)
    resp = None
    while resp is None:
        try:
            resp = urllib.request.urlopen(url)
        except Exception:
            pass
    data = resp.read().decode('UTF-8')
    soup = bs(data, 'html.parser')
    segment11 = soup.find_all('p')  # the <p> elements hold the segment notes
    f2 = open(ss + 'text5%s%s.txt' % (year, code_tag), 'a', encoding='utf-8')
    for item in segment11:
        str12 = item.get_text()  # each paragraph as plain text
        f2.writelines(str12 + '\n')
    f2.close()
def test5(code_tag, num, year, ss):
    f7 = open(ss + 'text6%s%s.txt' % (year, code_tag), 'a', encoding='utf-8')
    f8 = open(ss + 'text5%s%s.txt' % (year, code_tag), 'r', encoding='utf-8')
    # p1 matches the first "A service segment ..." note; p2 then accepts every
    # following note line that is not one of the known noise lines.
    p1 = r"(^A\sservice\ssegment.+\n)"
    p2 = r"(^(?!Information.+\:|Note|It\sis\srecommended\sthat\swhere|ID\sshould\sbe\sspecified|All\sother\ssegments|A\sgroup\sof\ssegments\sthat\scontains\sa\sline\sitem\sand\sits\srelated\sinformation.+should\sbe\sconsigned.).+\n)"
    pattern1 = re.compile(p1)
    pattern2 = re.compile(p2)
    flag = 0
    i = num  # copy at most `num` note lines (one per tree line)
    for line3 in f8.readlines():
        matcher1 = re.findall(pattern1, line3)
        matcher2 = re.findall(pattern2, line3)
        if matcher1 and flag == 0:
            f7.write(matcher1[0])
            flag = 1
            i = i - 1
            if i == 0:
                break
            continue
        if matcher2 and (flag == 1 or flag == 2):
            f7.write(matcher2[0])
            flag = 2
            i = i - 1
            continue
    f7.close()
    f8.close()

def join(code_tag, year, ss):
    f1 = open(ss + 'text6%s%s.txt' % (year, code_tag), 'r', encoding='utf-8')
    f2 = open(ss + 'text4%s%s.txt' % (year, code_tag), 'r', encoding='utf-8')
    list_note = []
    for line1 in f1:
        list_note.append(line1)
    f1.close()
    # text4 lines are tuples rendered as strings, e.g. ('SG2', 'DAM', '1', 'M');
    # the patterns below pull the four fields back out of that representation.
    p11 = r"^\W{2}(\w{3}).+\n"                    # group tag
    p12 = r"^\W{2}\w{3}\W{2}\s\W(\w{3}).+\n"      # segment code
    p13 = r"^\W{2}\w{3}\W{2}\s\W\w{3}\W{2}\s\W([0-9]|[0-9]{2}|[0-9]{3}|[0-9]{4}|[0-9]{5})\W.+\n"  # repeat count
    p14 = r"\W{2}\w{3}\W{2}\s\W\w{3}\W{2}\s\W.+(C|M)"  # conditional/mandatory flag
    f2_w = open(ss + 'b1%s%s.txt' % (year, code_tag), 'a', encoding='utf-8')
    f3_w = open(ss + 'b1%s.csv' % year, 'a', encoding='utf-8')
    i = 0
    pattern11 = re.compile(p11)
    pattern12 = re.compile(p12)
    pattern13 = re.compile(p13)
    pattern14 = re.compile(p14)
    pos = [''] * 385  # placeholder first column, one empty slot per possible row
    for line2 in f2:
        matcher11 = re.findall(pattern11, line2)
        matcher12 = re.findall(pattern12, line2)
        matcher13 = re.findall(pattern13, line2)
        matcher14 = re.findall(pattern14, line2)
        try:
            str11 = "%s,%s,%s,%s,%s,%s,%s,\"%s\"\n" % (
                pos[i], code_tag, matcher12[0], matcher11[0], year,
                matcher14[0], matcher13[0], list_note[i].strip('\n'))
            i = i + 1
            f2_w.write(str11)
            f3_w.write(str11)
        except Exception:
            # a tree line without a matching note (or vice versa): stop here
            print("---error---")
            break
    f2_w.close()
    f3_w.close()
    f2.close()

def test():
    # One-off helper used during development: scrape a single page and save
    # the extracted text locally.
    filename = './codeco.txt'  # local copy (the write is commented out below)
    url = "http://www.stylusstudio.com/edifact/D95B/CODECO.htm"
    resp = urllib.request.urlopen(url)
    data = resp.read().decode('UTF-8')
    # f1 = open(filename, 'w')
    # f1.write(data)
    f2 = open('./text.txt', 'a')
    soup = bs(data, 'html.parser')
    segment1 = soup.find_all('h4')
    segment2 = soup.find_all('p')
    for item in segment1:
        f2.writelines(str([text for text in item.stripped_strings]) + '\n')
    for item in segment2:
        f2.writelines(str([text for text in item.stripped_strings]) + '\n')
    f2.close()
# Building a GET query with urllib.parse.urlencode:
# data = {'word': 'Jecvay Notes'}
# url_values = urllib.parse.urlencode(data)
# full_url = "http://www.baidu.com/s?" + url_values
# data = urllib.request.urlopen(full_url).read().decode('UTF-8')
# print(data)
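# A runnable version of the commented sketch above; the helper name and the
# search site are illustrative only, and it is not called by the script.
def search_url(word):
    """Return a search URL for `word`, built with urllib.parse.urlencode."""
    import urllib.parse
    return "http://www.baidu.com/s?" + urllib.parse.urlencode({'word': word})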
if __name__ == '__main__':
    # Earlier releases already scraped: '97A','97B','98A','98B','99A','99B'
    year1 = ['00A', '00B', '01A', '01B', '01C', '02A', '02B', '03A', '03B', '04A', '04B']
    for j in range(len(year1)):
        year = year1[j]
        ss = "./data/%s/" % year
        tag = b0_trmd(year, ss)
        print(tag)
        for i in range(len(tag)):
            test1(tag[i], year, ss)
            test2(tag[i], year, ss)
            num = test3(tag[i], year, ss)
            test4(tag[i], year, ss)
            test5(tag[i], num, year, ss)
            join(tag[i], year, ss)
            print("------%s-----ok" % i)
    # str1 = 'APERAK'
    # join(str1)
