使用神经网络-垃圾邮件检测-LSTM或者CNN（一维卷积）效果都不错【代码有问题，pass】

from sklearn.feature_extraction.text import CountVectorizer

import os

from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split

from sklearn import metrics

import matplotlib.pyplot as plt

import numpy as np

from sklearn import svm

from sklearn.feature_extraction.text import TfidfTransformer

import tensorflow as tf

import tflearn

from tflearn.layers.core import input_data, dropout, fully_connected

from tflearn.layers.conv import conv_1d, global_max_pool

from tflearn.layers.conv import conv_2d, max_pool_2d

from tflearn.layers.merge_ops import merge

from tflearn.layers.estimator import regression

from tflearn.data_utils import to_categorical, pad_sequences

from sklearn.neural_network import MLPClassifier

from tflearn.layers.normalization import local_response_normalization

from tensorflow.contrib import learn

max_features=500

max_document_length=1024

def load_one_file(filename):

    x=""

    with open(filename) as f:

        for line in f:

            line=line.strip('\n')

            line = line.strip('\r')

            x+=line

    return x

def load_files_from_dir(rootdir):

    x=[]

    list = os.listdir(rootdir)

    for i in range(0, len(list)):

        path = os.path.join(rootdir, list[i])

        if os.path.isfile(path):

            v=load_one_file(path)

            x.append(v)

    return x

def load_all_files():

    ham=[]

    spam=[]

    for i in range(1,5):

        path="../data/mail/enron%d/ham/" % i

        print "Load %s" % path

        ham+=load_files_from_dir(path)

        path="../data/mail/enron%d/spam/" % i

        print "Load %s" % path

        spam+=load_files_from_dir(path)

    return ham,spam

def get_features_by_wordbag():

    ham, spam=load_all_files()

    x=ham+spam

    y=[0]*len(ham)+[1]*len(spam)

    vectorizer = CountVectorizer(

                                 decode_error='ignore',

                                 strip_accents='ascii',

                                 max_features=max_features,

                                 stop_words='english',

                                 max_df=1.0,

                                 min_df=1 )

    print vectorizer

    x=vectorizer.fit_transform(x)

    x=x.toarray()

    return x,y

def show_diffrent_max_features():

    global max_features

    a=[]

    b=[]

    for i in range(1000,20000,2000):

        max_features=i

        print "max_features=%d" % i

        x, y = get_features_by_wordbag()

        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)

        gnb = GaussianNB()

        gnb.fit(x_train, y_train)

        y_pred = gnb.predict(x_test)

        score=metrics.accuracy_score(y_test, y_pred)

        a.append(max_features)

        b.append(score)

        plt.plot(a, b, 'r')

    plt.xlabel("max_features")

    plt.ylabel("metrics.accuracy_score")

    plt.title("metrics.accuracy_score VS max_features")

    plt.legend()

    plt.show()

def do_nb_wordbag(x_train, x_test, y_train, y_test):

    print "NB and wordbag"

    gnb = GaussianNB()

    gnb.fit(x_train,y_train)

    y_pred=gnb.predict(x_test)

    print metrics.accuracy_score(y_test, y_pred)

    print metrics.confusion_matrix(y_test, y_pred)

def do_svm_wordbag(x_train, x_test, y_train, y_test):

    print "SVM and wordbag"

    clf = svm.SVC()

    clf.fit(x_train, y_train)

    y_pred = clf.predict(x_test)

    print metrics.accuracy_score(y_test, y_pred)

    print metrics.confusion_matrix(y_test, y_pred)

def get_features_by_wordbag_tfidf():

    ham, spam=load_all_files()

    x=ham+spam

    y=[0]*len(ham)+[1]*len(spam)

    vectorizer = CountVectorizer(binary=True,

                                 decode_error='ignore',

                                 strip_accents='ascii',

                                 max_features=max_features,

                                 stop_words='english',

                                 max_df=1.0,

                                 min_df=1 )

    print vectorizer

    x=vectorizer.fit_transform(x)

    x=x.toarray()

    transformer = TfidfTransformer(smooth_idf=False)

    print transformer

    tfidf = transformer.fit_transform(x)

    x = tfidf.toarray()

    return  x,y

def do_cnn_wordbag(trainX, testX, trainY, testY):

    global max_document_length

    print "CNN and tf"

    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)

    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)

    # Converting labels to binary vectors

    trainY = to_categorical(trainY, nb_classes=2)

    testY = to_categorical(testY, nb_classes=2)

    # Building convolutional network

    network = input_data(shape=[None,max_document_length], name='input')

    network = tflearn.embedding(network, input_dim=1000000, output_dim=128)

    branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")

    branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")

    branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")

    network = merge([branch1, branch2, branch3], mode='concat', axis=1)

    network = tf.expand_dims(network, 2)

    network = global_max_pool(network)

    network = dropout(network, 0.8)

    network = fully_connected(network, 2, activation='softmax')

    network = regression(network, optimizer='adam', learning_rate=0.001,

                         loss='categorical_crossentropy', name='target')

    # Training

    model = tflearn.DNN(network, tensorboard_verbose=0)

    model.fit(trainX, trainY,

              n_epoch=5, shuffle=True, validation_set=(testX, testY),

              show_metric=True, batch_size=100,run_id="spam")

def do_rnn_wordbag(trainX, testX, trainY, testY):

    global max_document_length

    print "RNN and wordbag"

    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)

    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)

    # Converting labels to binary vectors

    trainY = to_categorical(trainY, nb_classes=2)

    testY = to_categorical(testY, nb_classes=2)

    # Network building

    net = tflearn.input_data([None, max_document_length])

    net = tflearn.embedding(net, input_dim=10240000, output_dim=128)

    net = tflearn.lstm(net, 128, dropout=0.8)

    net = tflearn.fully_connected(net, 2, activation='softmax')

    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,

                             loss='categorical_crossentropy')

    # Training

    model = tflearn.DNN(net, tensorboard_verbose=0)

    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,

              batch_size=10,run_id="spm-run",n_epoch=5)

def do_dnn_wordbag(x_train, x_test, y_train, y_testY):

    print "DNN and wordbag"

    # Building deep neural network

    clf = MLPClassifier(solver='lbfgs',

                        alpha=1e-5,

                        hidden_layer_sizes = (5, 2),

                        random_state = 1)

    print  clf

    clf.fit(x_train, y_train)

    y_pred = clf.predict(x_test)

    print metrics.accuracy_score(y_test, y_pred)

    print metrics.confusion_matrix(y_test, y_pred)

def  get_features_by_tf():

    global  max_document_length

    x=[]

    y=[]

    ham, spam=load_all_files()

    x=ham+spam

    y=[0]*len(ham)+[1]*len(spam)

    vp=tflearn.data_utils.VocabularyProcessor(max_document_length=max_document_length,

                                              min_frequency=0,

                                              vocabulary=None,

                                              tokenizer_fn=None)

    x=vp.fit_transform(x, unused_y=None)

    x=np.array(list(x))

    return x,y

if __name__ == "__main__":

    print "Hello spam-mail"

    #print "get_features_by_wordbag"

    #x,y=get_features_by_wordbag()

    #x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 0)

    #print "get_features_by_wordbag_tfidf"

    #x,y=get_features_by_wordbag_tfidf()

    #x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 0)

    #NB

    #do_nb_wordbag(x_train, x_test, y_train, y_test)

    #show_diffrent_max_features()

    #SVM

    #do_svm_wordbag(x_train, x_test, y_train, y_test)

    #DNN

    #do_dnn_wordbag(x_train, x_test, y_train, y_test)

    print "get_features_by_tf"

    x,y=get_features_by_wordbag()

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 0)

    #CNN

    do_cnn_wordbag(x_train, x_test, y_train, y_test)

    #RNN

    #do_rnn_wordbag(x_train, x_test, y_train, y_test)

自己写检测算法的时候也记得多个算法比较下

使用神经网络-垃圾邮件检测-LSTM或者CNN（一维卷积）效果都不错【代码有问题，pass】的更多相关文章

Atitti 文本分类以及垃圾邮件判断原理以及贝叶斯算法的应用解决方案
Atitti 文本分类以及垃圾邮件判断原理以及贝叶斯算法的应用解决方案 1.1. 七.什么是贝叶斯过滤器?1 1.2. 八.建立历史资料库2 1.3. 十.联合概率的计算3 1.4. 十一. ...
基于Python的邮件检测工具
邮件快速检测工具概要介绍 mmpi,是一款使用python实现的开源邮件快速检测工具库,基于community框架设计开发.mmpi支持对邮件头.邮件正文.邮件附件的解析检测,并输出json检测报告 ...
CNN实现垃圾邮件分类(行大小不一致要补全)
以下是利用卷积神经网络对某一个句子的处理结构图我们从上图可知,将一句话转化成一个矩阵.我们看到该句话有6个单词和一个标点符号,所以我们可以将该矩阵设置为7行,对于列的话每个单词可以用什么样的数值表示 ...
数据挖掘、目标检测中的cnn和cn---卷积网络和卷积神经网络
content 概述文字识别系统LeNet-5 简化的LeNet-5系统卷积神经网络的实现问题深度神经网路已经在语音识别,图像识别等领域取得前所未有的成功.本人在多年之前也曾接触过神经网络.本系 ...
【深度学习系列】PaddlePaddle垃圾邮件处理实战（二）
PaddlePaddle垃圾邮件处理实战(二) 前文回顾在上篇文章中我们讲了如何用支持向量机对垃圾邮件进行分类,auc为73.3%,本篇讲继续讲如何用PaddlePaddle实现邮件分类,将深度 ...
如何基于TensorFlow使用LSTM和CNN实现时序分类任务
https://www.jiqizhixin.com/articles/2017-09-12-5 By 蒋思源2017年9月12日 09:54 时序数据经常出现在很多领域中,如金融.信号处理.语音识别 ...
Deep Learning模型之：CNN卷积神经网络（一）深度解析CNN
http://m.blog.csdn.net/blog/wu010555688/24487301 本文整理了网上几位大牛的博客,详细地讲解了CNN的基础结构与核心思想,欢迎交流. [1]Deep le ...
postfix反垃圾邮件说明
参考地址:http://guailele.blog.51cto.com/1156442/780223 1.打开 smtp 的认证模块在/etc/postfix/main.cf文件最后加上: sm ...
postfix疯狂外发垃圾邮件
分析一.查找main.cf配置文件 localhost# find / -name main.cf /etc/postfix/main.cf 二.打开/etc/postfix/main.cf来看看. ...

随机推荐

4.Projects and Scenes介绍
1.Project 一个项目是由一系列的文件(如图片.音频.几何).场景以及vzp文件组成.这些文件被导入到项目对应的文件夹中.项目外部资源在场景中被使用后,会导入项目中,除非该资源被标记为外部引用. ...
CI中的url相关函数以及路由设置和伪静态技术
当使用CI框架进行开发时,我们的一些数据传递的URL不应该写死,可以使用如下方法:比如说我们需要表单提交一个数据: 1.在controller控制器中我们需要先创建一个加载helper和视图的方法: ...
洛谷P2607 [ZJOI2008]骑士(树形dp)
题目描述 Z国的骑士团是一个很有势力的组织,帮会中汇聚了来自各地的精英.他们劫富济贫,惩恶扬善,受到社会各界的赞扬. 最近发生了一件可怕的事情,邪恶的Y国发动了一场针对Z国的侵略战争.战火绵延五百里, ...
Activity创建时布局文件的实现原理
setContenView(R.id.activity)实现原理 1.底层框架根据布局ID找到布局文件. 2.底层框架解析此布局文件(pull解析). 3.底层框架通过反射构建布局文件中的元素对象(E ...
Jenkins介绍-安装-部署...
1.背景大师Martin Fowler对持续集成是这样定义的:持续集成是一种软件开发实践,即团队开发成员经常集成他们的工作,通常每个成员每天至少集成一次,也就意味着每天可能会发生多次集成. ...
ASP.NET 微信公众平台模板消息推送功能完整开发
最近公众平台的用户提出了新需求,他们希望当收到新的邮件或者日程的时候,公众平台能主动推送一条提醒给用户.看了看平台提供的接口,似乎只有[模板消息]能尽量满足这一需求,但不得不说微信提供的实例太少,而且 ...
epoll的实现与深入思考
提契纸上得来终觉浅,绝知此事要躬行. 正文前段时间写了一篇epoll的学习文章,但没有自己的心得总觉得比较肤浅,花了一些时间补充一个epoll的实例,并浅析一下过程中遇到的问题. 上epoll_s ...
VTK读取序列化图像
vtk获取内存中图像数据原文链接:http://blog.csdn.net/zmy3376365/article/details/7717721 内存中有段图片数据 ,使用VTK来读入,然后就可以 ...
Java学习笔记2——数据类型和转换
前提知识: 1字节=8bit:1bit以一个二极管表示,代表2个状态(0或者1):2bit代表22即4种状态(00,01,10,11),8bit即是28即256种状态,16bit即是65536种状态. ...
计蒜客阿里天池的新任务—简单（ KMP水）
链接:传送门思路:KMP模板题,直接生成 S 串,然后匹配一下 P 串在 S 串出现的次数,注意处理嵌套的情况即可,嵌套的情况即 S = "aaaaaa" ,P = " ...

使用神经网络-垃圾邮件检测-LSTM或者CNN（一维卷积）效果都不错【代码有问题，pass】

使用神经网络-垃圾邮件检测-LSTM或者CNN（一维卷积）效果都不错【代码有问题，pass】的更多相关文章

随机推荐

热门专题