Visualizing Email Data (Week 6&7)

code segment

gword.py

import sqlite3
import time
import zlib
import string

# Punctuation and digits are stripped from every subject before it is split
# into words, so tokens such as "Re:" or "2010" never reach the word cloud.
STRIP_TABLE = str.maketrans('', '', string.punctuation + '1234567890')


def count_words(texts, min_length=4):
    """Count how often each word occurs across an iterable of subject lines.

    Each text has punctuation and digits removed, is stripped and lowercased,
    and is then split on whitespace. Words shorter than ``min_length``
    characters are ignored.

    Returns a dict mapping word -> number of occurrences.
    """
    counts = dict()
    for text in texts:
        cleaned = text.translate(STRIP_TABLE).strip().lower()
        for word in cleaned.split():
            if len(word) < min_length:
                continue
            counts[word] = counts.get(word, 0) + 1
    return counts


def top_words(counts, limit=100):
    """Return up to ``limit`` words ordered from most to least frequent."""
    return sorted(counts, key=counts.get, reverse=True)[:limit]


def font_size(count, lowest, highest, smallsize=20, bigsize=80):
    """Map a count onto a font size in ``smallsize .. smallsize + bigsize``.

    With the defaults the sizes are spread across 20-100 based on the count.
    When every word has the same count (``highest == lowest``) there is
    nothing to spread, so the smallest size is returned instead of dividing
    by zero.
    """
    if highest == lowest:
        return smallsize
    fraction = (count - lowest) / float(highest - lowest)
    return int((fraction * bigsize) + smallsize)


def render_gword(counts, limit=100, smallsize=20, bigsize=80):
    """Build the JavaScript text assigned to ``gword`` for the top words.

    The result has the form ``gword = [{text: 'w', size: N},\\n...\\n];\\n``.
    An empty ``counts`` yields an empty array literal.
    """
    words = top_words(counts, limit)
    highest = max((counts[w] for w in words), default=None)
    lowest = min((counts[w] for w in words), default=None)
    entries = [
        "{text: '" + w + "', size: "
        + str(font_size(counts[w], lowest, highest, smallsize, bigsize)) + "}"
        for w in words
    ]
    return "gword = [" + ",\n".join(entries) + "\n];\n"


def main(db_path='index.sqlite', out_path='gword.js'):
    """Read subjects from the database and write the word-cloud data file.

    Every row of ``Messages`` contributes its subject once, so a subject
    shared by many messages is counted once per message.
    """
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()

        cur.execute('SELECT id, subject FROM Subjects')
        subjects = dict()
        for message_row in cur:
            subjects[message_row[0]] = message_row[1]

        cur.execute('SELECT subject_id FROM Messages')
        texts = [subjects[message_row[0]] for message_row in cur]
    finally:
        # Release the database even if a query or lookup fails.
        conn.close()

    counts = count_words(texts)

    words = top_words(counts)
    highest = max((counts[w] for w in words), default=None)
    lowest = min((counts[w] for w in words), default=None)
    print('Range of counts:', highest, lowest)

    with open(out_path, 'w') as fhand:
        fhand.write(render_gword(counts))

    print("Output written to", out_path)
    print("Open gword.htm in a browser to see the vizualization")


if __name__ == "__main__":
    main()

gline.py

import sqlite3
import time
import zlib


def sender_domain(address):
    """Return the part of ``address`` after the '@'.

    Returns None when the address does not contain exactly one '@', so the
    caller can skip it.
    """
    pieces = address.split("@")
    if len(pieces) != 2:
        return None
    return pieces[1]


def top_organizations(domains, limit=10):
    """Return up to ``limit`` domains ordered by how often they occur."""
    sendorgs = dict()
    for dns in domains:
        sendorgs[dns] = sendorgs.get(dns, 0) + 1
    return sorted(sendorgs, key=sendorgs.get, reverse=True)[:limit]


def monthly_counts(records, orgs):
    """Count messages per (month, domain) for the selected organizations.

    ``records`` is an iterable of ``(domain, sent_at)`` pairs; the month is
    the first seven characters of ``sent_at``. Records whose domain is not in
    ``orgs`` are ignored, as are records with no ``sent_at`` value (slicing
    None would otherwise raise).

    Returns ``(months, counts)`` where ``months`` is the sorted list of months
    seen and ``counts`` maps ``(month, domain)`` to a message count.
    """
    wanted = set(orgs)
    counts = dict()
    for dns, sent_at in records:
        if dns not in wanted:
            continue
        if sent_at is None:
            continue
        key = (sent_at[:7], dns)
        counts[key] = counts.get(key, 0) + 1
    months = sorted({month for month, _ in counts})
    return months, counts


def render_gline(orgs, months, counts):
    """Build the JavaScript text assigned to ``gline``.

    The first row is the header ``['Month', org1, org2, ...]``; each later row
    is a month followed by one count per organization, with 0 where no
    messages were counted.
    """
    header = "gline = [ ['Month'" + "".join(",'" + org + "'" for org in orgs) + "]"
    rows = []
    for month in months:
        values = "".join("," + str(counts.get((month, org), 0)) for org in orgs)
        rows.append(",\n['" + month + "'" + values + "]")
    return header + "".join(rows) + "\n];\n"


def main(db_path='index.sqlite', out_path='gline.js'):
    """Read senders and messages from the database and write the chart data."""
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()

        cur.execute('SELECT id, sender FROM Senders')
        senders = dict()
        for message_row in cur:
            senders[message_row[0]] = message_row[1]

        cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages')
        messages = dict()
        for message_row in cur:
            messages[message_row[0]] = (message_row[1], message_row[2], message_row[3], message_row[4])
    finally:
        # Release the database even if a query fails.
        conn.close()

    print("Loaded messages=", len(messages), "senders=", len(senders))

    # Resolve each message to (sender domain, sent_at) once, dropping
    # senders whose address has no usable domain.
    records = []
    for message in messages.values():
        dns = sender_domain(senders[message[1]])
        if dns is None:
            continue
        records.append((dns, message[3]))

    # Pick the top organizations by total message count.
    orgs = top_organizations(dns for dns, _ in records)
    print("Top 10 Organizations")
    print(orgs)

    months, counts = monthly_counts(records, orgs)

    with open(out_path, 'w') as fhand:
        fhand.write(render_gline(orgs, months, counts))

    print("Output written to", out_path)
    print("Open gline.htm to visualize the data")


if __name__ == "__main__":
    main()

Coursera课程笔记----P4E.Capstone----Week 6&7的更多相关文章

  1. Coursera课程笔记----P4E.Capstone----Week 4&5

    Spidering and Modeling Email Data(week4&5) Mailing List - Gmane Crawl the archive of a mailing l ...

  2. Coursera课程笔记----P4E.Capstone----Week 2&3

    Building a Search Engine(week 2&3) Search Engine Architecture Web Crawling Index Building Search ...

  3. 操作系统学习笔记----进程/线程模型----Coursera课程笔记

    操作系统学习笔记----进程/线程模型----Coursera课程笔记 进程/线程模型 0. 概述 0.1 进程模型 多道程序设计 进程的概念.进程控制块 进程状态及转换.进程队列 进程控制----进 ...

  4. Coursera课程笔记----C++程序设计----Week3

    类和对象(Week 3) 内联成员函数和重载成员函数 内联成员函数 inline + 成员函数 整个函数题出现在类定义内部 class B{ inline void func1(); //方式1 vo ...

  5. Coursera课程笔记----Write Professional Emails in English----Week 3

    Introduction and Announcement Emails (Week 3) Overview of Introduction & Announcement Emails Bas ...

  6. Coursera课程笔记----Write Professional Emails in English----Week 1

    Get to Know Basic Email Writing Structures(Week 1) Introduction to Course Email and Editing Basics S ...

  7. Coursera课程笔记----C程序设计进阶----Week 5

    指针(二) (Week 5) 字符串与指针 指向数组的指针 int a[10]; int *p; p = a; 指向字符串的指针 指向字符串的指针变量 char a[10]; char *p; p = ...

  8. Coursera课程笔记----Write Professional Emails in English----Week 5

    Culture Matters(Week 5) High/Low Context Communication High Context Communication The Middle East, A ...

  9. Coursera课程笔记----Write Professional Emails in English----Week 4

    Request and Apology Emails(Week 4) How to Write Request Emails Write more POLITELY & SINCERELY ...

随机推荐

  1. PHP代码审计(初级篇)

    一.常见的PHP框架 1.zendframwork: (ZF)是Zend公司推出的一套PHP开发框架 功能非常的强大,是一个重量级的框架,ZF 用 100%面向对象编码实现. ZF 的组件结构独一无二 ...

  2. Three.js如何选中外部模型

    1.问题 three.js中模型选中使用的是射线法,根据摄像机角度,鼠标点击位置和模型选中的distance参数判断来选中模型.对于原生的矢量模型完全没有问题,但是当遇到导入的外部模型,如obj.st ...

  3. Personal Photo Management Application

    Customer Problems & Needs People may take a large number of photos and their phone don't have en ...

  4. 3. css百度制作字体图片

    http://fontstore.baidu.com/static/editor/index.html?qq-pf-to=pcqq.group

  5. [php]微信测试号调取access_token,自定义菜单以及被动响应消息

    <?php /**自己写的 */ $wechatObj = new wechatCallbackapiTest(); $wechatObj->valid(); $wechatObj-> ...

  6. 【考试总结】欢乐模拟赛_Day1

    \(T1\) 题目描述 给出一个 \(n × n\) 的, 元素为自然数的矩阵. 这个矩阵有许许多多个子矩阵, 定义它的所有子矩阵形成的集合为 \(S\) . 对于一个矩阵 \(k\) , 定义 \( ...

  7. pytorch 去除维度为1的维度

    out.squeeze(dim=1) out.squeeze_(dim=1)

  8. pytorch 中word embedding 词向量的使用

  9. word2sequence 把字符串转换数字编码

    地址:http://ai.stanford.edu/~amaas/data/sentiment/,这是一份包含了5万条流行电影的评论数据,其中训练集25000条,测试集25000条. 1.准备数据 d ...

  10. 不是广告--如何学Java,我说点不太一样的学习方式

    首先声明,这篇文章不是卖课程.介绍培训班的广告. 最近有不少读者通过微信问我:小白应该怎么学好 Java? 提问的人里有在校大学生.有刚参加工作的.有想转行做程序员的,还有一部分是最近找工作不顺的. ...