Neural networks for spam detection: an LSTM or a CNN (1-D convolution) both work well [the original code had problems]

The script below loads the Enron-Spam corpus (expected under ../data/mail/enron1 through enron4), extracts bag-of-words, TF-IDF, or word-index-sequence features, and trains Naive Bayes, SVM, MLP, CNN, and LSTM classifiers for comparison.
from sklearn.feature_extraction.text import CountVectorizer
import os
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm
from sklearn.feature_extraction.text import TfidfTransformer
import tensorflow as tf
import tflearn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_1d, global_max_pool
from tflearn.layers.merge_ops import merge
from tflearn.layers.estimator import regression
from tflearn.data_utils import to_categorical, pad_sequences
from sklearn.neural_network import MLPClassifier

max_features = 500          # vocabulary size for the bag-of-words features
max_document_length = 1024  # maximum word-sequence length for the CNN/RNN input

def load_one_file(filename):
    # Read one mail file into a single string, dropping line breaks.
    x = ""
    with open(filename) as f:
        for line in f:
            line = line.strip('\n')
            line = line.strip('\r')
            x += line
    return x

def load_files_from_dir(rootdir):
    # Load every regular file under rootdir as one document string.
    x = []
    for name in os.listdir(rootdir):
        path = os.path.join(rootdir, name)
        if os.path.isfile(path):
            x.append(load_one_file(path))
    return x

def load_all_files():
    # Load ham and spam from the first four Enron-Spam folders.
    ham = []
    spam = []
    for i in range(1, 5):
        path = "../data/mail/enron%d/ham/" % i
        print("Load %s" % path)
        ham += load_files_from_dir(path)
        path = "../data/mail/enron%d/spam/" % i
        print("Load %s" % path)
        spam += load_files_from_dir(path)
    return ham, spam

def get_features_by_wordbag():
    # Bag-of-words features: raw term counts over a max_features-word vocabulary.
    ham, spam = load_all_files()
    x = ham + spam
    y = [0] * len(ham) + [1] * len(spam)
    vectorizer = CountVectorizer(decode_error='ignore',
                                 strip_accents='ascii',
                                 max_features=max_features,
                                 stop_words='english',
                                 max_df=1.0,
                                 min_df=1)
    print(vectorizer)
    x = vectorizer.fit_transform(x)
    x = x.toarray()
    return x, y

def show_different_max_features():
    # Plot GaussianNB accuracy as the bag-of-words vocabulary size grows.
    global max_features
    a = []
    b = []
    for i in range(1000, 20000, 2000):
        max_features = i
        print("max_features=%d" % i)
        x, y = get_features_by_wordbag()
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)
        gnb = GaussianNB()
        gnb.fit(x_train, y_train)
        y_pred = gnb.predict(x_test)
        score = metrics.accuracy_score(y_test, y_pred)
        a.append(max_features)
        b.append(score)
    plt.plot(a, b, 'r', label="GaussianNB")
    plt.xlabel("max_features")
    plt.ylabel("metrics.accuracy_score")
    plt.title("metrics.accuracy_score VS max_features")
    plt.legend()
    plt.show()

def do_nb_wordbag(x_train, x_test, y_train, y_test):
print "NB and wordbag"
gnb = GaussianNB()
gnb.fit(x_train,y_train)
y_pred=gnb.predict(x_test)
print metrics.accuracy_score(y_test, y_pred)
print metrics.confusion_matrix(y_test, y_pred) def do_svm_wordbag(x_train, x_test, y_train, y_test):
print "SVM and wordbag"
clf = svm.SVC()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print metrics.accuracy_score(y_test, y_pred)
print metrics.confusion_matrix(y_test, y_pred) def get_features_by_wordbag_tfidf():
    # TF-IDF features computed on top of binary term-occurrence vectors.
    ham, spam = load_all_files()
    x = ham + spam
    y = [0] * len(ham) + [1] * len(spam)
    vectorizer = CountVectorizer(binary=True,
                                 decode_error='ignore',
                                 strip_accents='ascii',
                                 max_features=max_features,
                                 stop_words='english',
                                 max_df=1.0,
                                 min_df=1)
    print(vectorizer)
    x = vectorizer.fit_transform(x)
    x = x.toarray()
    transformer = TfidfTransformer(smooth_idf=False)
    print(transformer)
    tfidf = transformer.fit_transform(x)
    x = tfidf.toarray()
    return x, y

def do_cnn_wordbag(trainX, testX, trainY, testY):
    # 1-D CNN over word-index sequences; expects features from get_features_by_tf.
    global max_document_length
    print("CNN and tf")
    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)
    # Building convolutional network: three parallel conv_1d branches with
    # window sizes 3, 4 and 5, concatenated and then max-pooled over time.
    network = input_data(shape=[None, max_document_length], name='input')
    network = tflearn.embedding(network, input_dim=1000000, output_dim=128)
    branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
    branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
    branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
    network = merge([branch1, branch2, branch3], mode='concat', axis=1)
    network = tf.expand_dims(network, 2)
    network = global_max_pool(network)
    network = dropout(network, 0.8)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy', name='target')
    # Training
    model = tflearn.DNN(network, tensorboard_verbose=0)
    model.fit(trainX, trainY,
              n_epoch=5, shuffle=True, validation_set=(testX, testY),
              show_metric=True, batch_size=100, run_id="spam")

def do_rnn_wordbag(trainX, testX, trainY, testY):
    # Single-layer LSTM over word-index sequences; same input format as the CNN.
    global max_document_length
    print("RNN and wordbag")
    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)
    # Network building
    net = tflearn.input_data([None, max_document_length])
    net = tflearn.embedding(net, input_dim=10240000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')
    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
              batch_size=10, run_id="spm-run", n_epoch=5)

def do_dnn_wordbag(x_train, x_test, y_train, y_test):
print "DNN and wordbag" # Building deep neural network
clf = MLPClassifier(solver='lbfgs',
alpha=1e-5,
hidden_layer_sizes = (5, 2),
random_state = 1)
print clf
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print metrics.accuracy_score(y_test, y_pred)
print metrics.confusion_matrix(y_test, y_pred) def get_features_by_tf():
    # Word-index-sequence features: each mail becomes a fixed-length sequence
    # of vocabulary indices, which the CNN/RNN embedding layers expect.
    global max_document_length
    ham, spam = load_all_files()
    x = ham + spam
    y = [0] * len(ham) + [1] * len(spam)
    vp = tflearn.data_utils.VocabularyProcessor(max_document_length=max_document_length,
                                                min_frequency=0,
                                                vocabulary=None,
                                                tokenizer_fn=None)
    x = vp.fit_transform(x, unused_y=None)
    x = np.array(list(x))
    return x, y
print "Hello spam-mail"
#print "get_features_by_wordbag"
#x,y=get_features_by_wordbag()
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 0) #print "get_features_by_wordbag_tfidf"
#x,y=get_features_by_wordbag_tfidf()
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 0)
#NB
#do_nb_wordbag(x_train, x_test, y_train, y_test)
#show_diffrent_max_features() #SVM
#do_svm_wordbag(x_train, x_test, y_train, y_test) #DNN
#do_dnn_wordbag(x_train, x_test, y_train, y_test) print "get_features_by_tf"
x,y=get_features_by_wordbag()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 0)
#CNN
do_cnn_wordbag(x_train, x_test, y_train, y_test) #RNN
#do_rnn_wordbag(x_train, x_test, y_train, y_test)
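To try the LSTM instead of the 1-D CNN, comment out the do_cnn_wordbag call and uncomment do_rnn_wordbag; both consume the same padded word-index sequences from get_features_by_tf.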
When you write your own detection algorithm, remember to compare several algorithms against each other as well; a minimal comparison sketch follows.
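A minimal sketch of such a comparison, assuming the fixed functions above are in scope and reusing one shared train/test split (the helper name run_comparison is mine, not from the original post):

def run_comparison():
    # Train each bag-of-words model on the same split so the scores are comparable.
    x, y = get_features_by_wordbag()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)
    models = [("GaussianNB", GaussianNB()),
              ("SVM", svm.SVC()),
              ("MLP", MLPClassifier(solver='lbfgs', alpha=1e-5,
                                    hidden_layer_sizes=(5, 2), random_state=1))]
    for name, clf in models:
        clf.fit(x_train, y_train)
        acc = metrics.accuracy_score(y_test, clf.predict(x_test))
        print("%s accuracy: %.4f" % (name, acc))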