from sklearn.feature_extraction.text import CountVectorizer
import os
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm
from sklearn.feature_extraction.text import TfidfTransformer
import tensorflow as tf
import tflearn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_1d, global_max_pool
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.merge_ops import merge
from tflearn.layers.estimator import regression
from tflearn.data_utils import to_categorical, pad_sequences
from sklearn.neural_network import MLPClassifier
from tflearn.layers.normalization import local_response_normalization
from tensorflow.contrib import learn max_features=500
max_document_length=1024 def load_one_file(filename):
x=""
with open(filename) as f:
for line in f:
line=line.strip('\n')
line = line.strip('\r')
x+=line
return x def load_files_from_dir(rootdir):
x=[]
list = os.listdir(rootdir)
for i in range(0, len(list)):
path = os.path.join(rootdir, list[i])
if os.path.isfile(path):
v=load_one_file(path)
x.append(v)
return x def load_all_files():
ham=[]
spam=[]
for i in range(1,5):
path="../data/mail/enron%d/ham/" % i
print "Load %s" % path
ham+=load_files_from_dir(path)
path="../data/mail/enron%d/spam/" % i
print "Load %s" % path
spam+=load_files_from_dir(path)
return ham,spam def get_features_by_wordbag():
ham, spam=load_all_files()
x=ham+spam
y=[0]*len(ham)+[1]*len(spam)
vectorizer = CountVectorizer(
decode_error='ignore',
strip_accents='ascii',
max_features=max_features,
stop_words='english',
max_df=1.0,
min_df=1 )
print vectorizer
x=vectorizer.fit_transform(x)
x=x.toarray()
return x,y def show_diffrent_max_features():
global max_features
a=[]
b=[]
for i in range(1000,20000,2000):
max_features=i
print "max_features=%d" % i
x, y = get_features_by_wordbag()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)
gnb = GaussianNB()
gnb.fit(x_train, y_train)
y_pred = gnb.predict(x_test)
score=metrics.accuracy_score(y_test, y_pred)
a.append(max_features)
b.append(score)
plt.plot(a, b, 'r')
plt.xlabel("max_features")
plt.ylabel("metrics.accuracy_score")
plt.title("metrics.accuracy_score VS max_features")
plt.legend()
plt.show() def do_nb_wordbag(x_train, x_test, y_train, y_test):
print "NB and wordbag"
gnb = GaussianNB()
gnb.fit(x_train,y_train)
y_pred=gnb.predict(x_test)
print metrics.accuracy_score(y_test, y_pred)
print metrics.confusion_matrix(y_test, y_pred) def do_svm_wordbag(x_train, x_test, y_train, y_test):
print "SVM and wordbag"
clf = svm.SVC()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print metrics.accuracy_score(y_test, y_pred)
print metrics.confusion_matrix(y_test, y_pred) def get_features_by_wordbag_tfidf():
ham, spam=load_all_files()
x=ham+spam
y=[0]*len(ham)+[1]*len(spam)
vectorizer = CountVectorizer(binary=True,
decode_error='ignore',
strip_accents='ascii',
max_features=max_features,
stop_words='english',
max_df=1.0,
min_df=1 )
print vectorizer
x=vectorizer.fit_transform(x)
x=x.toarray()
transformer = TfidfTransformer(smooth_idf=False)
print transformer
tfidf = transformer.fit_transform(x)
x = tfidf.toarray()
return x,y def do_cnn_wordbag(trainX, testX, trainY, testY):
global max_document_length
print "CNN and tf" trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2) # Building convolutional network
network = input_data(shape=[None,max_document_length], name='input')
network = tflearn.embedding(network, input_dim=1000000, output_dim=128)
branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
network = merge([branch1, branch2, branch3], mode='concat', axis=1)
network = tf.expand_dims(network, 2)
network = global_max_pool(network)
network = dropout(network, 0.8)
network = fully_connected(network, 2, activation='softmax')
network = regression(network, optimizer='adam', learning_rate=0.001,
loss='categorical_crossentropy', name='target')
# Training
model = tflearn.DNN(network, tensorboard_verbose=0)
model.fit(trainX, trainY,
n_epoch=5, shuffle=True, validation_set=(testX, testY),
show_metric=True, batch_size=100,run_id="spam") def do_rnn_wordbag(trainX, testX, trainY, testY):
global max_document_length
print "RNN and wordbag" trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2) # Network building
net = tflearn.input_data([None, max_document_length])
net = tflearn.embedding(net, input_dim=10240000, output_dim=128)
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
loss='categorical_crossentropy') # Training
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
batch_size=10,run_id="spm-run",n_epoch=5) def do_dnn_wordbag(x_train, x_test, y_train, y_testY):
print "DNN and wordbag" # Building deep neural network
clf = MLPClassifier(solver='lbfgs',
alpha=1e-5,
hidden_layer_sizes = (5, 2),
random_state = 1)
print clf
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print metrics.accuracy_score(y_test, y_pred)
print metrics.confusion_matrix(y_test, y_pred) def get_features_by_tf():
global max_document_length
x=[]
y=[]
ham, spam=load_all_files()
x=ham+spam
y=[0]*len(ham)+[1]*len(spam)
vp=tflearn.data_utils.VocabularyProcessor(max_document_length=max_document_length,
min_frequency=0,
vocabulary=None,
tokenizer_fn=None)
x=vp.fit_transform(x, unused_y=None)
x=np.array(list(x))
return x,y if __name__ == "__main__":
print "Hello spam-mail"
#print "get_features_by_wordbag"
#x,y=get_features_by_wordbag()
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 0) #print "get_features_by_wordbag_tfidf"
#x,y=get_features_by_wordbag_tfidf()
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 0)
#NB
#do_nb_wordbag(x_train, x_test, y_train, y_test)
#show_diffrent_max_features() #SVM
#do_svm_wordbag(x_train, x_test, y_train, y_test) #DNN
#do_dnn_wordbag(x_train, x_test, y_train, y_test) print "get_features_by_tf"
x,y=get_features_by_wordbag()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 0)
#CNN
do_cnn_wordbag(x_train, x_test, y_train, y_test) #RNN
#do_rnn_wordbag(x_train, x_test, y_train, y_test)

自己写检测算法的时候也记得多个算法比较下

使用神经网络-垃圾邮件检测-LSTM或者CNN(一维卷积)效果都不错【代码有问题,pass】的更多相关文章

  1. Atitti 文本分类  以及 垃圾邮件 判断原理 以及贝叶斯算法的应用解决方案

    Atitti 文本分类  以及 垃圾邮件 判断原理 以及贝叶斯算法的应用解决方案 1.1. 七.什么是贝叶斯过滤器?1 1.2. 八.建立历史资料库2 1.3. 十.联合概率的计算3 1.4. 十一. ...

  2. 基于Python的邮件检测工具

    邮件快速检测工具 概要介绍 mmpi,是一款使用python实现的开源邮件快速检测工具库,基于community框架设计开发.mmpi支持对邮件头.邮件正文.邮件附件的解析检测,并输出json检测报告 ...

  3. CNN实现垃圾邮件分类(行大小不一致要补全)

    以下是利用卷积神经网络对某一个句子的处理结构图 我们从上图可知,将一句话转化成一个矩阵.我们看到该句话有6个单词和一个标点符号,所以我们可以将该矩阵设置为7行,对于列的话每个单词可以用什么样的数值表示 ...

  4. 数据挖掘、目标检测中的cnn和cn---卷积网络和卷积神经网络

    content 概述 文字识别系统LeNet-5 简化的LeNet-5系统 卷积神经网络的实现问题 深度神经网路已经在语音识别,图像识别等领域取得前所未有的成功.本人在多年之前也曾接触过神经网络.本系 ...

  5. 【深度学习系列】PaddlePaddle垃圾邮件处理实战(二)

    PaddlePaddle垃圾邮件处理实战(二) 前文回顾   在上篇文章中我们讲了如何用支持向量机对垃圾邮件进行分类,auc为73.3%,本篇讲继续讲如何用PaddlePaddle实现邮件分类,将深度 ...

  6. 如何基于TensorFlow使用LSTM和CNN实现时序分类任务

    https://www.jiqizhixin.com/articles/2017-09-12-5 By 蒋思源2017年9月12日 09:54 时序数据经常出现在很多领域中,如金融.信号处理.语音识别 ...

  7. Deep Learning模型之:CNN卷积神经网络(一)深度解析CNN

    http://m.blog.csdn.net/blog/wu010555688/24487301 本文整理了网上几位大牛的博客,详细地讲解了CNN的基础结构与核心思想,欢迎交流. [1]Deep le ...

  8. postfix反垃圾邮件说明

    参考地址:http://guailele.blog.51cto.com/1156442/780223 1.打开 smtp 的认证模块 在/etc/postfix/main.cf文件最后加上:   sm ...

  9. postfix疯狂外发垃圾邮件

    分析 一.查找main.cf配置文件 localhost# find / -name main.cf /etc/postfix/main.cf 二.打开/etc/postfix/main.cf来看看. ...

随机推荐

  1. damn selenium

    Selenium+Python [ˈpaɪθən] 0.Selenium安装 pip install selenium 1.打开了浏览器,后边什么都不干了 需要将浏览器驱动放置在环境变量的目录下. 2 ...

  2. 管窥python语法

    刚接触python,mark下所见所得: 1.Python调用底层API,可在任何platform上运行,包括Windows.Mac.Unix: 2.用#符号对代码或语句进行注释,#后的代码不被编译: ...

  3. css3 flex 详解,可以实现div内容水平垂直居中

    先说一下flex一系列属性: 一.flex-direction: (元素排列方向) ※ flex-direction:row (横向从左到右排列==左对齐) ※ flex-direction:row- ...

  4. 【图文】Excel中vlookup函数的使用方法

    今天统计数据,用到了Excel中vlookup函数,第一次使用当然少不了百度,经过反复研究后,算是解决了问题,现整理成文档. 一.实现效果 Sheet1 Sheet2   注:上图中sheet1商品条 ...

  5. Online ML那点事>-

    一:译自wiki:    KeyWord:标签反馈; Survey: online machine learning is a model of induction that learns one i ...

  6. css 添加阴影

    添加阴影,分为内阴影和外阴影. inset:内阴影. 不写默认外阴影. box-shadow: 水平位移  垂直位移  模糊半径 #box-shadow{ -moz-box-shadow:5px 5p ...

  7. C#结束Explorer进程

    private void Form1_Load(object sender, EventArgs e) { Process[] processes = Process.GetProcesses();/ ...

  8. ZBrush 2018软件安装激活教程一点通

    Zbrush下载地址:https://pixologic.com/CD 安装教程:(此CD代码仅有效一次,一旦此代码被使用,您将收到一封包含你账户信息的电子邮件.请把那封电子邮件保存在你的记录里.) ...

  9. Select, Poll,Epoll

    Date: 2019-06-19 Author: Sun 1. Select ​ select最早于1983年出现在4.2BSD中,它通过一个select()系统调用来监视多个文件描述符的数组,当se ...

  10. 转载:jquery 对 Json 的各种遍历

    概述 JSON(javascript Object Notation) 是一种轻量级的数据交换格式,采用完全独立于语言的文本格式,是理想的数据交换格式.同时,JSON是 JavaScript 原生格式 ...