2017-11-11 Sa Oct Spider

4:33 PM

Again.

Firstly test liburl:

# -*- coding: utf-8 -*-

import json
import datetime
import HTMLParser
import urlparse
import urllib
import urllib2
import cookielib
import string
import re
import sys
import threading
import os
import tempfile
from bs4 import BeautifulSoup
from prettytable import PrettyTable reload(sys)
sys.setdefaultencoding("utf-8") def openWithBrowser(filename):
os.system('python -m webbrowser "{}"'.format(filename)) name = 'xxx'
no = 'xxx' hosturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
posturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
cj = cookielib.LWPCookieJar()
cookie_support = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)
h = urllib2.urlopen(hosturl) headers = {
'User-Agent' : 'Mozilla/5.0 (iPad; U; CPU OS 3_2_1 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Mobile/7B405',
'Referer' : 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
} idx = name + ' ' + no postData = {
'__VIEWSTATE' : '/wEPDwULLTE3NzI1OTE3OTFkZJk4xBOpTGvHILGFeCbFQfQQv9dbWzdoB6AOexN4BTx0',
'__EVENTVALIDATION' : '/wEWBQLMmfO1BgL7uPQdAt765bwOAsaZ0ZUMApn3i+sBQi9nlVqoFrBfAjkxtVAWnUBZPnKm6VON7F01iBJzBXw=',
'name' : name,
'pwd' : '12345',
'btnchange' : '登录',
'xuehao' : no
} postData = urllib.urlencode(postData)
request = urllib2.Request(posturl, postData, headers)
response = urllib2.urlopen(request, timeout=5) with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as f:
f.write(response.read())
openWithBrowser(f.name)

Good. Nothing changed. Them apply the table.

5:09 PM

# -*- coding: utf-8 -*-

import json
import datetime
import HTMLParser
import urlparse
import urllib
import urllib2
import cookielib
import string
import re
import sys
import threading
import os
import tempfile
from bs4 import BeautifulSoup
from prettytable import PrettyTable reload(sys)
sys.setdefaultencoding("utf-8") def openWithBrowser(filename):
os.system('python -m webbrowser "{}"'.format(filename)) version = datetime.datetime.now().strftime("%y-%m-%d %a %b %H-%M-%S result")
os.mkdir(version) hosturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
posturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
cj = cookielib.LWPCookieJar()
cookie_support = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)
h = urllib2.urlopen(hosturl) headers = {
'User-Agent' : 'Mozilla/5.0 (iPad; U; CPU OS 3_2_1 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Mobile/7B405',
'Referer' : 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
} def get(name, no):
global hosturl, posturl, cj, cookie_support, opener, h, headers postData = {
'__VIEWSTATE' : '/wEPDwULLTE3NzI1OTE3OTFkZJk4xBOpTGvHILGFeCbFQfQQv9dbWzdoB6AOexN4BTx0',
'__EVENTVALIDATION' : '/wEWBQLMmfO1BgL7uPQdAt765bwOAsaZ0ZUMApn3i+sBQi9nlVqoFrBfAjkxtVAWnUBZPnKm6VON7F01iBJzBXw=',
'name' : name,
'pwd' : '12345',
'btnchange' : '登录',
'xuehao' : no
} postData = urllib.urlencode(postData)
request = urllib2.Request(posturl, postData, headers)
response = urllib2.urlopen(request, timeout=5) with open('{}/{}.html'.format(version, no), 'w') as f:
f.write(response.read().replace('<head>', '<head><meta charset="utf-8">')) with open('result_utf8.csv', "rb") as f:
print version
for line in f:
(name, no, x1, x2) = line.split(',')
try:
get(name, no)
except:
pass

It took some time to output to Chinese filename. Gave up eventually. It even raised exception when I printed name (Chinese) to the console (decode stuff).

Then I'd write a reporter.

6:41 PM

# -*- coding: utf-8 -*-

import json
import datetime
import HTMLParser
import urlparse
import urllib
import urllib2
import cookielib
import string
import re
import sys
import threading
import os
import tempfile
from bs4 import BeautifulSoup
from prettytable import PrettyTable
import Tkinter reload(sys)
sys.setdefaultencoding("utf-8") csv = [line.split(',') for line in open('result_utf8.csv')] def getname(no):
for i in csv:
if i[1] == no:
return i[0]
return '' def getcourse(filename):
s = open(filename).read()
i = s.find('退选') if i != -1:
trbegin = s.find('<tr>', i) # s[trbegin...] e.g.
# <tr>
# <td width="10%">
# <a id="GridView1_ctl02_LinkButton1" href="ja
# vascript:__doPostBack('GridView1$ctl02$LinkButton1','')">348</a>
#
# </td><td>12</td><td>生物培优班</td><td>xxx</td><td>&n
# bsp;</td><td width="10%">
# <a id="GridView1_ctl02_LinkButton2" href="ja
# vascript:__doPostBack('GridView1$ctl02$LinkButton2','')">退选</a>
# </td> trend = s.find('</tr>', trbegin) read = 0
res = '' i = trbegin
while i < trend:
if s[i] == '<':
while s[i] != '>':
i += 1
i += 1
continue end = False
while s[i] != '<':
if s[i] == '&':
end = True
break
res += s[i]
i += 1 if end:
break res += ' ' res2 = ''
i = 0
while i < len(res) and not (res[i] in "0123456789"):
i += 1 while i < len(res):
if res[i] == '\n':
i += 1
else:
res2 += res[i]
if res[i] == ' ':
while i < len(res) and res[i] == ' ':
i += 1
else:
i += 1 return res2
return '' def report():
with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as f:
wd = workdir.get() os.chdir(wd)
f.write('<head><meta charset="utf-8"></head>')
f.write('<h1>Spider report</h1>')
f.write('<p><b>Version {}</b></p>'.format(wd))
f.write('<table>') for i in os.listdir('.'):
(no, x1) = i.split('.')
name = getname(no)
s = getcourse(i)
f.write('<tr><th>{}</th><th>{}</th><td>{}</td></tr>'.format(no, name, s)) os.system('python -m webbrowser {}'.format(f.name))
os.chdir('..') gui = Tkinter.Tk()
workdir = Tkinter.StringVar()
Tkinter.Button(gui, text="Report", command=report).pack(side=Tkinter.LEFT)
Tkinter.Entry(gui, textvariable=workdir, width=40).pack(side=Tkinter.LEFT)
gui.mainloop()

2017-11-11 Sa Oct Spider的更多相关文章

  1. 2017年11月Dyn365/CRM用户社区活动报名

    UG是全球最大Dynamics的用户组织,由最终用户自发组织,由行业有经验的专家自愿贡献知识和经验的非营利机构,与会人员本着务实中立的态度,不进行推介产品,服务以及其他营销行为.在美国,微软Dynam ...

  2. WPS 表格筛选两列相同数据-完美-2017年11月1日更新

    应用: 1.选出A列中的数据是否在B列中出现过: 2.筛选出某一批序号在一个表格里面的位置(整批找出) 3.其实还有其他很多应用,难描述出来... ... A列中有几百的名字,本人想帅选出B列中的名字 ...

  3. 2017年11月GitHub上最热门的Java项目出炉

    2017年11月GitHub上最热门的Java项目出炉~ 一起来看看这些项目你使用过哪些呢? 1分布式 RPC 服务框架 dubbohttps://github.com/alibaba/dubbo S ...

  4. 2017.11.11 B201 练习题思路及解题方法

    2017.11.11 B201 练习题思路及解题方法 题目类型及涵盖知识点 本次总共有6道题目,都属于MISC分类的题目,涵盖的知识点有 信息隐藏 暴力破解 音轨,摩斯电码 gif修改,base64原 ...

  5. 【主席树维护mex】 【SG函数递推】 Problem H. Cups and Beans 2017.8.11

    Problem H. Cups and Beans 2017.8.11 原题: There are N cups numbered 0 through N − 1. For each i(1 ≤ i ...

  6. 2017-11-11 Sa Oct 消参

    2017-11-11 Sa Oct 消参 Prior versions: 2017-11-04 Sa Oct 消参 2017-11-10 Fr Oct 消参 2017-11-04 Sa $ P(-3, ...

  7. NOIp 11.11/12

    最后一场比较正式的NOIp模拟赛,写一发小总结.题目没什么好说的,大部分很简单,先贴一下代码. 1111 T1 //string //by Cydiater //2016.11.11 #include ...

  8. 11.11光棍节工作心得——github/MVP

    11.11光棍节工作心得 1.根据scrum meeting thirdday中前辈的指导进行学习 我在博客中贴了链接,竟然TrackBack引来了原博主,

  9. 下面程序的输出结果是____ A:11,10 B:11,11 C:10,10 D:10,11 int x=10; int y=x++; printf("%d,%d",(x++,y),y++);

    下面程序的输出结果是____ A:11,10 B:11,11 C:10,10 D:10,11 int x=10; int y=x++; printf("%d,%d",(x++,y) ...

随机推荐

  1. Java面试问题汇总

    转一些面试经验 刚看到下面这份面试清单,从个人的开发面试经历看,里面总结的大部分内容还是很不错的.年后想跳槽的朋友可以选取里面的问题准备一下. GitHub上的面试总结帖 Interview-Note ...

  2. [JAVA]字节数组流

    import java.io.*; public class ByteArrayStream { public static void main(String[] args) { byte[] dat ...

  3. Hibernate运行原生sql并将查询的结果转化为对象

    原生SQL查询执行的控制是通过SQLQuery接口进行的,通过执行Session.createSQLQuery()获取这个接口.下面来描述如何使用这个API进行查询.标量查询(Scalar queri ...

  4. 视频信号中xyz的提取

    视频信号中xyz的提取 `timescale 1ns / 1ps /////////////////////////////////////////////////////////////////// ...

  5. vmware上虚拟机:Network error: Connection refused 排查

    问题分析 vmware配置的fedora虚拟机, 主机能ping通虚拟机, 虚拟机也能ping通主机.但是用PUTTY连接虚拟机的时候出现 Network error: Connection refu ...

  6. Python3 标准库学习

    python3.5.6 官方文档  https://docs.python.org/3.5/library/index.html 1.介绍 2.内置函数 3.内置常量 3.1常数添加的 site模块 ...

  7. VirtualApk 插件入门小试

    1 官方资料 滴滴开源Android插件方案:VirtualAPK 2 宿主App集成方法 (1)在整个工程的build.gradle中添加依赖 dependencies { classpath 'c ...

  8. 学会学习:高效学习方式(使用vscode-snippet有感)

    入职以来我们团队一直都在使用vscode编辑器,后来也有人开始使用webstorm.很久之前我突然为每天重复的编写.vue文件里面的export.<script lang="scss& ...

  9. IE6设置li的float:left,不能自适应宽的解决方法

    原文地址:https://blog.csdn.net/u012299002/article/details/50589453 做个divcss页面,发现在IE6下,设置了li的float:left,L ...

  10. ubuntu16.04 install qtcreator

    1. 安装相关软件,搭建环境 sudo apt install qt-creator sudo apt install qt5-default source python35/bin/activate ...