# bayes: naive Bayes text classifier (spam/ham e-mail demo)

from numpy import *

import time

# Wall-clock timestamp taken when the module starts executing.
# NOTE(review): nothing in the visible code reads this value afterwards.
starttime = time.time()

def loadDataSet():
    """Return the built-in toy corpus together with its labels.

    Returns:
        A ``(postingList, classVec)`` pair: six posts, each already split
        into a list of word strings, and a parallel list holding one integer
        label (0 or 1) per post.
    """
    raw_posts = (
        'my dog has flea problems help please',
        'maybe not take him to dog park stupid',
        'my dalmation is so cute I love him',
        'stop posting stupid worthless garbage',
        'mr licks ate my steak how to stop him',
        'quit buying worthless dog food stupid',
    )
    labels = [0, 1, 0, 1, 0, 1]
    # Splitting on whitespace rebuilds fresh token lists on every call.
    return [post.split() for post in raw_posts], labels

def createVocabList(dataSet):
    """Collect every distinct word that occurs in any document.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable words
            (for example the posting list returned by ``loadDataSet``).

    Returns:
        A list holding each distinct word exactly once. The order is the
        iteration order of the underlying set and is therefore unspecified.
    """
    vocabulary = set()
    for tokens in dataSet:
        # Fold this document's words into the running union.
        vocabulary = vocabulary.union(set(tokens))
    return list(vocabulary)

def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a set-of-words vector over ``vocabList``.

    Every position starts at 1.0 and gets one more (2.0) when the
    corresponding vocabulary word occurs in the document, so no entry is
    ever zero.

    Args:
        vocabList: Sequence of vocabulary words; fixes the vector layout.
        inputSet: Iterable of the (hashable) words of one document.

    Returns:
        A list of floats, one per entry of ``vocabList``: 2.0 if that word is
        in ``inputSet``, otherwise 1.0.
    """
    present = set(inputSet)
    known = set(vocabList)

    # Warn about document words the vocabulary cannot represent. The warning
    # used to fire for every vocabulary word that was merely absent from the
    # document, which contradicted its own text.
    for word in inputSet:
        if word not in known:
            print("the word: %s is not in my Vocabulary!" % word)

    # Set membership replaces the repeated vocabList.index() scans.
    return [2.0 if word in present else 1.0 for word in vocabList]

def txt2trainxy(filename1, filename2):
    """Load two folders of e-mails under ``email/`` and vectorise them.

    Each file contributes one document. Its first line is lower-cased, split
    on runs of non-word characters, and tokens of one or two characters are
    dropped.

    NOTE(review): only the first line of each file is used, as before; if
    the e-mails span several lines the rest is ignored -- confirm that this
    is intended.

    Args:
        filename1: Name of the first sub-folder of ``email/``; also used as
            the label of every document found in it.
        filename2: Name of the second sub-folder / label.

    Returns:
        A tuple ``(trainxws, trainxwb, trainy, trainx, fulltext)``:
        ``trainxws`` holds set-of-words vectors (2.0 if the word occurs in
        the document, else 1.0), ``trainxwb`` holds bag-of-words vectors
        (occurrence count + 1.0), ``trainy`` the label of each document,
        ``trainx`` the token list of each document, and ``fulltext`` the
        sorted vocabulary that fixes the vector layout.
    """
    import re
    from collections import Counter
    from os import listdir
    from os.path import join

    # '\W+' rather than '\W*': a pattern that can match the empty string
    # makes re.split cut between every character on Python 3.7+.
    reg = re.compile(r'\W+')

    # step 1: loading data...
    print("step 1: loading data...")

    trainx = []
    trainy = []
    for label in (filename1, filename2):
        folder = join('email', label)
        for name in listdir(folder):
            # The with-block closes the handle; readline() yields '' for an
            # empty file instead of raising IndexError.
            with open(join(folder, name)) as handle:
                first_line = handle.readline()
            tokens = [tok for tok in reg.split(first_line.lower()) if len(tok) > 2]
            trainx.append(tokens)
            # Appending the label directly also works for folder names that
            # contain whitespace.
            trainy.append(label)

    # Sorted so that the vector layout is reproducible from run to run.
    fulltext = sorted(set(tok for tokens in trainx for tok in tokens))

    trainxws = []  # set of words
    trainxwb = []  # bag of words
    for tokens in trainx:
        counts = Counter(tokens)
        trainxws.append([2.0 if word in counts else 1.0 for word in fulltext])
        trainxwb.append([counts[word] + 1.0 for word in fulltext])

    return trainxws, trainxwb, trainy, trainx, fulltext

def testx2vec(testx, fulltext):

    # set of words

    testxws = [list(set(testx)).count(strg) + 1.0 for strg in fulltext] #

    # bag of words

    testxwb = [testx.count(strg) + 1.0 for strg in fulltext] #

    for word in testx:

        if word not in fulltext:

            print "the word: %s is not in my fulltext!" % word

    return testxws, testxwb

def bayes(testx, trainx, trainy, fulltext):
    """Classify one document vector with a naive Bayes model fitted on the spot.

    The per-class word probabilities are re-estimated from ``trainx`` and
    ``trainy`` on every call. For each class the word vectors of its
    documents are summed and divided by their grand total plus 2.0. The score
    of a class is the dot product of the log word probabilities with
    ``testx``, plus the log prior of the class. Progress and diagnostics are
    printed along the way.

    NOTE(review): the arithmetic is kept exactly as it was. In particular
    ``testx`` is used as a weight vector, so with the 1.0/2.0 encodings
    produced elsewhere in this file even absent words contribute to the
    score -- confirm that this is the intended model.

    Relies on ``array`` and ``log`` from the file-level ``from numpy import *``.

    Args:
        testx: Numeric vector of the document to classify, aligned with
            ``fulltext``.
        trainx: Training vectors, one equal-length row per document.
        trainy: Label of each training row (hashable values).
        fulltext: Vocabulary aligned with the vector columns; only used to
            print the most probable words of each class.

    Returns:
        The label from ``trainy`` with the highest score.

    Raises:
        ValueError: If ``trainx`` or ``trainy`` is empty.
    """
    if len(trainx) == 0 or len(trainy) == 0:
        raise ValueError("bayes() needs at least one training document")

    print("---Getting Prob...")
    # Freeze the label order once so every per-class list below lines up.
    labels = list(set(trainy))
    doc_total = len(trainy)
    counts = array(trainx, dtype=float)

    # Row indices of the training documents that carry each label.
    rows_by_label = [[row for row in range(doc_total) if trainy[row] == label]
                     for label in labels]
    logproby = array([log(len(rows) / float(doc_total)) for rows in rows_by_label])

    # Per class: summed word vector divided by (grand total + 2.0).
    probx = array([counts[rows].sum(axis=0) / (counts[rows].sum() + 2.0)
                   for rows in rows_by_label])
    logprobx = log(probx)

    print("---Printing Prob...")
    # argsort() orders small to big, so negate to get the largest first.
    # Looping over the classes removes the former assumption of exactly two.
    top_words = (-probx).argsort()[:, :5]
    for label, best in zip(labels, top_words):
        print([fulltext[i] for i in best])
        print(label)

    # step 4: showing the result...
    print("---Showing the result...")
    weights = array(testx, dtype=float)
    sumlogpxws = (logprobx * weights).sum(axis=1)
    sumlogpxyws = sumlogpxws + logproby
    print(sumlogpxws)
    print((probx * weights).sum(axis=1))

    bestyws = labels[sumlogpxyws.argmax()]
    # Single pre-formatted argument keeps the output identical on Python 2 and 3.
    print("---From set of words:  %s" % (bestyws,))
    return bestyws

def main(test_size=20):
    """Run the spam/ham demo end to end.

    Loads the e-mails from ``email/spam`` and ``email/ham``, holds out a
    random subset of the set-of-words vectors, classifies each held-out
    vector with ``bayes`` trained on the remainder, prints the error rate,
    and finally classifies one hand-written example.

    NOTE(review): the vocabulary is built from all e-mails before the split,
    so held-out documents still contribute words to it.

    Args:
        test_size: Number of e-mails to hold out (default 20, the value that
            used to be hard-coded). Capped at one less than the number of
            loaded e-mails so that training data always remains.

    Raises:
        ValueError: If fewer than two e-mails were loaded.
    """
    import random

    # step 1: loading data...
    trainxws, _, trainy, _, fulltext = txt2trainxy('spam', 'ham')
    print(fulltext)

    # step 2: training...
    print("step 2: training...")
    # Nothing to precompute: bayes() estimates the probabilities on each call.

    # step 3: testing...
    print("step 3: testing...")
    print("---Preparing testdata...")
    total = len(trainy)
    if total < 2:
        raise ValueError(
            "need at least two e-mails to hold out a test set, got %d" % total)
    # random.sample raises when asked for more items than exist.
    testid = random.sample(range(total), min(test_size, total - 1))
    held_out = set(testid)  # O(1) membership for the split below
    testxxx = [trainxws[i] for i in testid]
    testyyy = [trainy[i] for i in testid]
    testtrainxws = [trainxws[i] for i in range(total) if i not in held_out]
    testtrainy = [trainy[i] for i in range(total) if i not in held_out]

    print("---Testing now...")
    errorcount = 0
    p = len(testid)
    for vector, expected in zip(testxxx, testyyy):
        if bayes(vector, testtrainxws, testtrainy, fulltext) != expected:
            errorcount += 1
    print(errorcount)
    print(p)
    # Guard the division for test_size=0.
    errorrate = errorcount / float(p) if p else 0.0
    # Single pre-formatted argument keeps the output identical on Python 2 and 3.
    print("---Errorrate is:  %s" % errorrate)

    # step 4: showing the result
    print("step 4: using...")
    testx = ['love', 'my', 'dalmation']
    print("the testx is:  %s" % (testx,))
    print("---Changing testx into vector...")
    testxws, _ = testx2vec(testx, fulltext)
    bayes(testxws, testtrainxws, testtrainy, fulltext)

# Run the demo only when executed as a script, so that importing this module
# for its helpers does not trigger file I/O and a full evaluation run.
if __name__ == "__main__":
    main()

# Disabled demo for the toy data set helpers (loadDataSet, createVocabList,
# setOfWords2Vec). It is wrapped in a bare string literal, so it is evaluated
# as an expression and discarded; none of it runs.
# NOTE(review): as written it passes the raw word list testEntry1 to bayes(),
# which multiplies its first argument against numeric log-probabilities --
# convert it with setOfWords2Vec before re-enabling. testEntry2 is unused.
"""

trainx, trainy = loadDataSet()

fulltext = createVocabList(trainx)

print fulltext

print setOfWords2Vec(fulltext, trainx[0])

trainxws = []

for t in trainx:

    trainxws.append(setOfWords2Vec(fulltext, t))

testEntry1 = ['love', 'my', 'dalmation']

testEntry2 = ['stupid', 'garbage']

bayes(testEntry1, trainxws, trainy, fulltext)

"""

bayes的更多相关文章

【十大经典数据挖掘算法】Naïve Bayes
[十大经典数据挖掘算法]系列 C4.5 K-Means SVM Apriori EM PageRank AdaBoost kNN Naïve Bayes CART 朴素贝叶斯(Naïve Bayes) ...
最大似然判别法和Bayes公式判别法
最大似然判别法 Bayes公式判别法
[Machine Learning & Algorithm] 朴素贝叶斯算法（Naive Bayes）
生活中很多场合需要用到分类,比如新闻分类.病人分类等等. 本文介绍朴素贝叶斯分类器(Naive Bayes classifier),它是一种简单有效的常用分类算法. 一.病人分类的例子让我从一个例子 ...
Spark MLlib 之 Naive Bayes
1.前言: Naive Bayes(朴素贝叶斯)是一个简单的多类分类算法,该算法的前提是假设各特征之间是相互独立的.Naive Bayes 训练主要是为每一个特征,在给定的标签的条件下,计算每个特征在 ...
基于Bayes和KNN的newsgroup 18828文本分类器的Python实现
向@yangliuy大牛学习NLP,这篇博客是数据挖掘-基于贝叶斯算法及KNN算法的newsgroup18828文本分类器的JAVA实现(上)的Python实现.入门为主,没有太多自己的东西. 1. ...
Microsoft Naive Bayes 算法——三国人物身份划分
Microsoft朴素贝叶斯是SSAS中最简单的算法,通常用作理解数据基本分组的起点.这类处理的一般特征就是分类.这个算法之所以称为“朴素”,是因为所有属性的重要性是一样的,没有谁比谁更高.贝叶斯之名 ...
Naive Bayes理论与实践
Naive Bayes: 简单有效的常用分类算法,典型用途:垃圾邮件分类假设:给定目标值时属性之间相互条件独立同样,先验概率的贝叶斯估计是优点: 1. 无监督学习的一种,实现简单,没有迭代,学习 ...
[ML] Naive Bayes for Text Classification
TF-IDF Algorithm From http://www.ruanyifeng.com/blog/2013/03/tf-idf.html Chapter 1, 知道了"词频" ...
朴素贝叶斯方法（Naive Bayes Method）
朴素贝叶斯是一种很简单的分类方法,之所以称之为朴素,是因为它有着非常强的前提条件-其所有特征都是相互独立的,是一种典型的生成学习算法.所谓生成学习算法,是指由训练数据学习联合概率分布P(X,Y ...
数据挖掘十大经典算法(9) 朴素贝叶斯分类器 Naive Bayes
贝叶斯分类器贝叶斯分类器的分类原理是通过某对象的先验概率,利用贝叶斯公式计算出其后验概率,即该对象属于某一类的概率,选择具有最大后验概率的类作为该对象所属的类.眼下研究较多的贝叶斯分类器主要有四种, ...

随机推荐

COJ 0332 The Flash
传送门:http://oj.cnuschool.org.cn/oj/home/problem.htm?problemID=302 The Flash 难度级别:B: 运行时间限制:1000ms: 运行 ...
【转】Unable to execute dex: Java heap space 解决方案（如何为eclipse.int 添加内存）
原文网址:http://blog.csdn.net/zengyangtech/article/details/7003379 欢迎转载,转载请注明 http://blog.csdn.net/zengy ...
在 Windows 下远程桌面连接 Linux - XManager 篇
XManager是一个简单易用的高性能的运行在Windows平台上的X-Server软件,而Gnome和KDE就是X-Client,Linux下的X-Server则为Xorg.它能把远端Unix/Li ...
转：有关Java泛型的类型擦除（type erasing）
转载自:拈花微笑自从Java 5引入泛型之后,Java与C++对于泛型不同的实现的优劣便一直是饭后的谈资.在我之前的很多training中,当讲到Java泛型时总是会和C++的实现比较,一般得出的结 ...
jQuery表单验证以及将表单序列化为json对象小练习
jquery表单验证(非实时验证),同时,将表单序列化为json对象提交表单. <!DOCTYPE html> <html lang="en"> <h ...
[网络] SOCKET， TCP/UDP, HTTP, FTP
(一)TCP/UDP,SOCKET,HTTP,FTP简析 TCP/IP是个协议组,可分为三个层次:网络层.传输层和应用层: 网络层:IP协议.ICMP协议.ARP协议.RARP协议和BOOTP协议传 ...
webform repeater
repeater:由模板构成,解析后模板就不存在了需要指定数据源进行数据绑定 List<Fruit> list = new FruitDA().Select(); ...
hdu 2123
#include <iostream> using namespace std; int main() { int i,t,n,j,k,f; cin>>t; while(t-- ...
sessionstorage，localstorage和cookie之间的区别
sessionStorage 和 localStorage 是HTML5 Web Storage API 提供的,可以方便的在web请求之间保存数据.有了本地数据,就可以避免数据在浏览器和服务器间不必 ...
junit测试用例加载spring配置文件
junit加载pom引用项目的xml配置文件,如果定义了<beans profile="dev">,必须在测试用例类上面加上标记 @ActiveProfiles(" ...

bayes

bayes的更多相关文章

随机推荐

热门专题