Spidering and Modeling Email Data(week4&5)

Mailing List - Gmane

  • Crawl the archive of a mailing list
  • Do some analysis / cleanup
  • Visualize the data as word cloud and lines

code segment

gmane.py

import sqlite3
import time
import ssl
import urllib.request, urllib.parse, urllib.error
from urllib.parse import urljoin
from urllib.parse import urlparse
import re
from datetime import datetime, timedelta # Not all systems have this so conditionally define parser
try:
import dateutil.parser as parser
except:
pass def parsemaildate(md) :
# See if we have dateutil
try:
pdate = parser.parse(tdate)
test_at = pdate.isoformat()
return test_at
except:
pass # Non-dateutil version - we try our best pieces = md.split()
notz = " ".join(pieces[:4]).strip() # Try a bunch of format variations - strptime() is *lame*
dnotz = None
for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S',
'%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S',
'%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M' ] :
try:
dnotz = datetime.strptime(notz, form)
break
except:
continue if dnotz is None :
# print 'Bad Date:',md
return None iso = dnotz.isoformat() tz = "+0000"
try:
tz = pieces[4]
ival = int(tz) # Only want numeric timezone values
if tz == '-0000' : tz = '+0000'
tzh = tz[:3]
tzm = tz[3:]
tz = tzh+":"+tzm
except:
pass return iso+tz # Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE conn = sqlite3.connect('content.sqlite')
cur = conn.cursor() baseurl = "http://mbox.dr-chuck.net/sakai.devel/" cur.execute('''CREATE TABLE IF NOT EXISTS Messages
(id INTEGER UNIQUE, email TEXT, sent_at TEXT,
subject TEXT, headers TEXT, body TEXT)''') # Pick up where we left off
start = None
cur.execute('SELECT max(id) FROM Messages' )
try:
row = cur.fetchone()
if row is None :
start = 0
else:
start = row[0]
except:
start = 0 if start is None : start = 0 many = 0
count = 0
fail = 0
while True:
if ( many < 1 ) :
sval = input('How many messages:')
if ( len(sval) < 1 ) : break
many = int(sval) start = start + 1
cur.execute('SELECT id FROM Messages WHERE id=?', (start,) )
try:
row = cur.fetchone()
if row is not None : continue
except:
row = None many = many - 1
url = baseurl + str(start) + '/' + str(start + 1) text = "None"
try:
# Open with a timeout of 30 seconds
document = urllib.request.urlopen(url, None, 30, context=ctx)
text = document.read().decode()
if document.getcode() != 200 :
print("Error code=",document.getcode(), url)
break
except KeyboardInterrupt:
print('')
print('Program interrupted by user...')
break
except Exception as e:
print("Unable to retrieve or parse page",url)
print("Error",e)
fail = fail + 1
if fail > 5 : break
continue print(url,len(text))
count = count + 1 if not text.startswith("From "):
print(text)
print("Did not find From ")
fail = fail + 1
if fail > 5 : break
continue pos = text.find("\n\n")
if pos > 0 :
hdr = text[:pos]
body = text[pos+2:]
else:
print(text)
print("Could not find break between headers and body")
fail = fail + 1
if fail > 5 : break
continue email = None
x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr)
if len(x) == 1 :
email = x[0];
email = email.strip().lower()
email = email.replace("<","")
else:
x = re.findall('\nFrom: (\S+@\S+)\n', hdr)
if len(x) == 1 :
email = x[0];
email = email.strip().lower()
email = email.replace("<","") date = None
y = re.findall('\Date: .*, (.*)\n', hdr)
if len(y) == 1 :
tdate = y[0]
tdate = tdate[:26]
try:
sent_at = parsemaildate(tdate)
except:
print(text)
print("Parse fail",tdate)
fail = fail + 1
if fail > 5 : break
continue subject = None
z = re.findall('\Subject: (.*)\n', hdr)
if len(z) == 1 : subject = z[0].strip().lower(); # Reset the fail counter
fail = 0
print(" ",email,sent_at,subject)
cur.execute('''INSERT OR IGNORE INTO Messages (id, email, sent_at, subject, headers, body)
VALUES ( ?, ?, ?, ?, ?, ? )''', ( start, email, sent_at, subject, hdr, body))
if count % 50 == 0 : conn.commit()
if count % 100 == 0 : time.sleep(1) conn.commit()
cur.close()

gmodel.py

import sqlite3
import time
import re
import zlib
from datetime import datetime, timedelta # Not all systems have this
try:
import dateutil.parser as parser
except:
pass dnsmapping = dict()
mapping = dict() def fixsender(sender,allsenders=None) :
global dnsmapping
global mapping
if sender is None : return None
sender = sender.strip().lower()
sender = sender.replace('<','').replace('>','') # Check if we have a hacked gmane.org from address
if allsenders is not None and sender.endswith('gmane.org') :
pieces = sender.split('-')
realsender = None
for s in allsenders:
if s.startswith(pieces[0]) :
realsender = sender
sender = s
# print(realsender, sender)
break
if realsender is None :
for s in mapping:
if s.startswith(pieces[0]) :
realsender = sender
sender = mapping[s]
# print(realsender, sender)
break
if realsender is None : sender = pieces[0] mpieces = sender.split("@")
if len(mpieces) != 2 : return sender
dns = mpieces[1]
x = dns
pieces = dns.split(".")
if dns.endswith(".edu") or dns.endswith(".com") or dns.endswith(".org") or dns.endswith(".net") :
dns = ".".join(pieces[-2:])
else:
dns = ".".join(pieces[-3:])
# if dns != x : print(x,dns)
# if dns != dnsmapping.get(dns,dns) : print(dns,dnsmapping.get(dns,dns))
dns = dnsmapping.get(dns,dns)
return mpieces[0] + '@' + dns def parsemaildate(md) :
# See if we have dateutil
try:
pdate = parser.parse(md)
test_at = pdate.isoformat()
return test_at
except:
pass # Non-dateutil version - we try our best pieces = md.split()
notz = " ".join(pieces[:4]).strip() # Try a bunch of format variations - strptime() is *lame*
dnotz = None
for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S',
'%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S',
'%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M' ] :
try:
dnotz = datetime.strptime(notz, form)
break
except:
continue if dnotz is None :
# print('Bad Date:',md)
return None iso = dnotz.isoformat() tz = "+0000"
try:
tz = pieces[4]
ival = int(tz) # Only want numeric timezone values
if tz == '-0000' : tz = '+0000'
tzh = tz[:3]
tzm = tz[3:]
tz = tzh+":"+tzm
except:
pass return iso+tz # Parse out the info...
def parseheader(hdr, allsenders=None):
if hdr is None or len(hdr) < 1 : return None
sender = None
x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr)
if len(x) >= 1 :
sender = x[0]
else:
x = re.findall('\nFrom: (\S+@\S+)\n', hdr)
if len(x) >= 1 :
sender = x[0] # normalize the domain name of Email addresses
sender = fixsender(sender, allsenders) date = None
y = re.findall('\nDate: .*, (.*)\n', hdr)
sent_at = None
if len(y) >= 1 :
tdate = y[0]
tdate = tdate[:26]
try:
sent_at = parsemaildate(tdate)
except Exception as e:
# print('Date ignored ',tdate, e)
return None subject = None
z = re.findall('\nSubject: (.*)\n', hdr)
if len(z) >= 1 : subject = z[0].strip().lower() guid = None
z = re.findall('\nMessage-ID: (.*)\n', hdr)
if len(z) >= 1 : guid = z[0].strip().lower() if sender is None or sent_at is None or subject is None or guid is None :
return None
return (guid, sender, subject, sent_at) conn = sqlite3.connect('index.sqlite')
cur = conn.cursor() cur.execute('''DROP TABLE IF EXISTS Messages ''')
cur.execute('''DROP TABLE IF EXISTS Senders ''')
cur.execute('''DROP TABLE IF EXISTS Subjects ''')
cur.execute('''DROP TABLE IF EXISTS Replies ''') cur.execute('''CREATE TABLE IF NOT EXISTS Messages
(id INTEGER PRIMARY KEY, guid TEXT UNIQUE, sent_at INTEGER,
sender_id INTEGER, subject_id INTEGER,
headers BLOB, body BLOB)''')
cur.execute('''CREATE TABLE IF NOT EXISTS Senders
(id INTEGER PRIMARY KEY, sender TEXT UNIQUE)''')
cur.execute('''CREATE TABLE IF NOT EXISTS Subjects
(id INTEGER PRIMARY KEY, subject TEXT UNIQUE)''')
cur.execute('''CREATE TABLE IF NOT EXISTS Replies
(from_id INTEGER, to_id INTEGER)''') conn_1 = sqlite3.connect('mapping.sqlite')
cur_1 = conn_1.cursor() cur_1.execute('''SELECT old,new FROM DNSMapping''')
for message_row in cur_1 :
dnsmapping[message_row[0].strip().lower()] = message_row[1].strip().lower() mapping = dict()
cur_1.execute('''SELECT old,new FROM Mapping''')
for message_row in cur_1 :
old = fixsender(message_row[0])
new = fixsender(message_row[1])
mapping[old] = fixsender(new) # Done with mapping.sqlite
conn_1.close() # Open the main content (Read only)
conn_1 = sqlite3.connect('file:content.sqlite?mode=ro', uri=True)
cur_1 = conn_1.cursor() allsenders = list()
cur_1.execute('''SELECT email FROM Messages''')
for message_row in cur_1 :
sender = fixsender(message_row[0])
if sender is None : continue
if 'gmane.org' in sender : continue
if sender in allsenders: continue
allsenders.append(sender) print("Loaded allsenders",len(allsenders),"and mapping",len(mapping),"dns mapping",len(dnsmapping)) cur_1.execute('''SELECT headers, body, sent_at
FROM Messages ORDER BY sent_at''') senders = dict()
subjects = dict()
guids = dict() count = 0 for message_row in cur_1 :
hdr = message_row[0]
parsed = parseheader(hdr, allsenders)
if parsed is None: continue
(guid, sender, subject, sent_at) = parsed # Apply the sender mapping
sender = mapping.get(sender,sender) count = count + 1
if count % 250 == 1 : print(count,sent_at, sender)
# print(guid, sender, subject, sent_at) if 'gmane.org' in sender:
print("Error in sender ===", sender) sender_id = senders.get(sender,None)
subject_id = subjects.get(subject,None)
guid_id = guids.get(guid,None) if sender_id is None :
cur.execute('INSERT OR IGNORE INTO Senders (sender) VALUES ( ? )', ( sender, ) )
conn.commit()
cur.execute('SELECT id FROM Senders WHERE sender=? LIMIT 1', ( sender, ))
try:
row = cur.fetchone()
sender_id = row[0]
senders[sender] = sender_id
except:
print('Could not retrieve sender id',sender)
break
if subject_id is None :
cur.execute('INSERT OR IGNORE INTO Subjects (subject) VALUES ( ? )', ( subject, ) )
conn.commit()
cur.execute('SELECT id FROM Subjects WHERE subject=? LIMIT 1', ( subject, ))
try:
row = cur.fetchone()
subject_id = row[0]
subjects[subject] = subject_id
except:
print('Could not retrieve subject id',subject)
break
# print(sender_id, subject_id)
cur.execute('INSERT OR IGNORE INTO Messages (guid,sender_id,subject_id,sent_at,headers,body) VALUES ( ?,?,?,datetime(?),?,? )',
( guid, sender_id, subject_id, sent_at,
zlib.compress(message_row[0].encode()), zlib.compress(message_row[1].encode())) )
conn.commit()
cur.execute('SELECT id FROM Messages WHERE guid=? LIMIT 1', ( guid, ))
try:
row = cur.fetchone()
message_id = row[0]
guids[guid] = message_id
except:
print('Could not retrieve guid id',guid)
break cur.close()
cur_1.close()

gbasic.py

import sqlite3
import time
import zlib howmany = int(input("How many to dump? ")) conn = sqlite3.connect('index.sqlite')
cur = conn.cursor() cur.execute('SELECT id, sender FROM Senders')
senders = dict()
for message_row in cur :
senders[message_row[0]] = message_row[1] cur.execute('SELECT id, subject FROM Subjects')
subjects = dict()
for message_row in cur :
subjects[message_row[0]] = message_row[1] # cur.execute('SELECT id, guid,sender_id,subject_id,headers,body FROM Messages')
cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages')
messages = dict()
for message_row in cur :
messages[message_row[0]] = (message_row[1],message_row[2],message_row[3],message_row[4]) print("Loaded messages=",len(messages),"subjects=",len(subjects),"senders=",len(senders)) sendcounts = dict()
sendorgs = dict()
for (message_id, message) in list(messages.items()):
sender = message[1]
sendcounts[sender] = sendcounts.get(sender,0) + 1
pieces = senders[sender].split("@")
if len(pieces) != 2 : continue
dns = pieces[1]
sendorgs[dns] = sendorgs.get(dns,0) + 1 print('')
print('Top',howmany,'Email list participants') x = sorted(sendcounts, key=sendcounts.get, reverse=True)
for k in x[:howmany]:
print(senders[k], sendcounts[k])
if sendcounts[k] < 10 : break print('')
print('Top',howmany,'Email list organizations') x = sorted(sendorgs, key=sendorgs.get, reverse=True)
for k in x[:howmany]:
print(k, sendorgs[k])
if sendorgs[k] < 10 : break

Coursera课程笔记----P4E.Capstone----Week 4&5的更多相关文章

  1. Coursera课程笔记----P4E.Capstone----Week 6&7

    Visualizing Email Data(Week 6&7) code segment gword.py import sqlite3 import time import zlib im ...

  2. Coursera课程笔记----P4E.Capstone----Week 2&3

    Building a Search Engine(week 2&3) Search Engine Architecture Web Crawling Index Building Search ...

  3. 操作系统学习笔记----进程/线程模型----Coursera课程笔记

    操作系统学习笔记----进程/线程模型----Coursera课程笔记 进程/线程模型 0. 概述 0.1 进程模型 多道程序设计 进程的概念.进程控制块 进程状态及转换.进程队列 进程控制----进 ...

  4. Coursera课程笔记----C++程序设计----Week3

    类和对象(Week 3) 内联成员函数和重载成员函数 内联成员函数 inline + 成员函数 整个函数题出现在类定义内部 class B{ inline void func1(); //方式1 vo ...

  5. Coursera课程笔记----Write Professional Emails in English----Week 3

    Introduction and Announcement Emails (Week 3) Overview of Introduction & Announcement Emails Bas ...

  6. Coursera课程笔记----Write Professional Emails in English----Week 1

    Get to Know Basic Email Writing Structures(Week 1) Introduction to Course Email and Editing Basics S ...

  7. Coursera课程笔记----C程序设计进阶----Week 5

    指针(二) (Week 5) 字符串与指针 指向数组的指针 int a[10]; int *p; p = a; 指向字符串的指针 指向字符串的指针变量 char a[10]; char *p; p = ...

  8. Coursera课程笔记----Write Professional Emails in English----Week 5

    Culture Matters(Week 5) High/Low Context Communication High Context Communication The Middle East, A ...

  9. Coursera课程笔记----Write Professional Emails in English----Week 4

    Request and Apology Emails(Week 4) How to Write Request Emails Write more POLITELY & SINCERELUY ...

随机推荐

  1. yum 下载全量依赖 rpm 包及离线安装(终极解决方案)

    目录 简介 验证环境 查看依赖包 方案一(推荐):repotrack 方案二:yumdownloader 方案三:yum 的 downloadonly 插件 离线安装 rpm 参考资料 简介 通常生产 ...

  2. ADO.NET 事务控制

    在ADO.NET 中,可以使用Connection 和Transaction 对象来控制事务.若要执行事务,请执行下列操作: 1.调用Connection 对象的BeginTransaction 方法 ...

  3. 控件:DataGridView列类型

    DataGridView的列的类型提供有多种,包括有: (1)DataGridViewTextBoxColumn(文本列,默认的情况下就是这种) (2)DataGridViewComboBoxColu ...

  4. Daily Scrum 1/18/2016

    Yandong & Zhaoyang: Prepare bug bash slides for Beta release; Dong & Fuchen:Prepare demo for ...

  5. 💕《给产品经理讲JVM》:垃圾收集器

    前言 在上篇中,我们把 JVM 中的垃圾收集算法有了一个大概的了解,又是一个阴雨连绵的周末,宅在家里的我们又开始了新一轮的学习: 产品大大:上周末我们说了垃圾收集算法,下面是不是要讲一下这些算法的应用 ...

  6. PHP xml 外部实体注入漏洞学习

    XML与xxe注入基础知识 1.XMl定义 XML由3个部分构成,它们分别是:文档类型定义(Document Type Definition,DTD),即XML的布局语言:可扩展的样式语言(Exten ...

  7. eclipse git 文件状态 及git分支的创建与合并与删除

    eclipse里面Git文件状态及图标展示   EGit会出现如下图标,其对应状态及意义如下:      1)忽略[ ignored ]:仓库认为该文件不存在(如bin目录,不需要关注).通过右键Te ...

  8. [linux][mysql] 命令更改表结构:添加、删除、修改字段、调整字段顺序

    原文出处:http://www.phpernote.com/MySQL/1120.html 查看表结构: desc tabl_name; show columns fromtable_name: 常用 ...

  9. [PHP]听说随机数mt_rand()比rand()速度快,闲的无聊测试了一下!

    废话不说上码 //microtime() 函数返回当前 Unix 时间戳的微秒数.//当设置为 TRUE 时,规定函数应该返回一个浮点数,否则返回一个字符串.默认为 FALSE. <?php h ...

  10. python 基础篇 匿名函数

    匿名函数基础 首先,什么是匿名函数呢?以下是匿名函数的格式: lambda argument1, argument2,... argumentN : expression 我们可以看到,匿名函数的关键 ...