kNN1

 # -*- coding: utf-8 -*-
 """
 kNN: k-nearest neighbors
 """

 import numpy as np # multi-dimensional arrays, linear algebra operations
 import matplotlib.pyplot as plt

 # 1. Scatter plot of the two known groups
 plt.scatter(1.2, 1.1) # group A
 plt.scatter(1.0, 1.0)
 plt.scatter(1.8, 0.8) # group B
 plt.scatter(2, 0.9)

 plt.scatter(1.6, 0.85, color='r') # point to classify (unknown group)
 plt.show()

 # 2. Create the data and define functions
 p1 = [1.2, 1.1] # group A
 p2 = [1.0, 1.0]
 p3 = [1.8, 0.8] # group B
 p4 = [2, 0.9]
 category = ['A','A','B','B'] # class labels of the known groups (y variable)
 p5 = [1.6, 0.85] # point to classify

 # data-generation function
 def data_set():
     # convert to numpy arrays for vectorized (linear algebra) operations
     know_group = np.array([p1, p2, p3, p4]) # known groups
     not_know_group = np.array(p5) # unknown point
     class_category = np.array(category) # ground-truth labels
     return know_group, not_know_group, class_category

 know_group, not_know_group, class_category = data_set()
 print('known groups')
 """
 [[1.2 1.1]
  [1.  1. ]
  [1.8 0.8]
  [2.  0.9]]
 """
 print(know_group)
 print('unknown point')
 print(not_know_group) #[1.6  0.85]

 print('labels')
 print(class_category) #['A' 'A' 'B' 'B']

 # Euclidean distance pipeline:
 # difference (-) -> square (**) -> sum -> square root (sqrt)
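 # For points p = (p1, p2) and q = (q1, q2) this computes
 # d(p, q) = sqrt((p1 - q1)**2 + (p2 - q2)**2);
 # numpy broadcasting subtracts the 1-D point from every row of the 4x2 array.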

 diff = know_group - not_know_group # 2D - 1D (numpy broadcasting)
 print('difference=\n', diff)
 """
 difference=
  [[-0.4   0.25]
  [-0.6   0.15]
  [ 0.2  -0.05]
  [ 0.4   0.05]]
 """

 sq_diff = diff ** 2
 sq_sum = sq_diff.sum(axis=1) # row-wise sum
 print(sq_sum) #[0.2225 0.3825 0.0425 0.1625]
 distance = np.sqrt(sq_sum)
 print(distance) #[0.47169906 0.61846584 0.20615528 0.40311289]
 # distance ranks: [3 4 1 2]; with k=3 the vote is B(2) > A(1)
 print(class_category) #['A' 'A' 'B' 'B']

 def classify(know, not_know, cate, k):
     # 1. Euclidean distance
     diff = know - not_know
     sq_diff = diff ** 2
     sq_sum = sq_diff.sum(axis=1)
     distance = np.sqrt(sq_sum)

     # 2. sort distances in ascending order -> indices
     sortDist = distance.argsort() # sort -> index
     #print(sortDist) #[2 3 0 1]

     # 3. count the k nearest neighbors
     class_result = {} # empty dict
     for i in range(k): # 0~2
         key = cate[sortDist[i]] # i=0 -> 'B'
         class_result[key] = class_result.get(key, 0) + 1
     return class_result

 # call the function
 class_result = classify(know_group, not_know_group, class_category, 3)
 print(class_result) #{'B': 2, 'A': 1}

 # vote function: majority class among the k neighbors
 def class_vote(class_result):
     return max(class_result, key=class_result.get)

 vote_result = class_vote(class_result)
 print('classification result=', vote_result) # classification result= B

kNN Class

 # -*- coding: utf-8 -*-
 """
 class-based implementation of the kNN functions above
 """

 import numpy as np
 from Step01_kNN import data_set

 know_group, not_know_group, class_category = data_set()

 # class = the classify and vote functions above, bundled as methods
 class kNNclassify:   

     # 1. nearest neighbors
     def classify(self, know, not_know, cate, k):
         # Euclidean distance
         diff = know - not_know
         sq_diff = diff ** 2
         sq_sum = sq_diff.sum(axis=1)
         distance = np.sqrt(sq_sum)

         # 2. sort distances in ascending order -> indices
         sortDist = distance.argsort() # sort -> index
         #print(sortDist) #[2 3 0 1]

         # 3. count the k nearest neighbors (k=3)
         self.class_result = {} # empty dict
         for i in range(k): # 0~2
             key = cate[sortDist[i]] # i=0 -> 'B'
             self.class_result[key] = self.class_result.get(key, 0) + 1

     # vote function
     def class_vote(self):
         return max(self.class_result, key=self.class_result.get)

 # create a class instance
 obj = kNNclassify() # constructor

 # object member: self.class_result
 obj.classify(know_group, not_know_group, class_category, 3)

 vote_result = obj.class_vote()
 print('kNN classification result=', vote_result) # kNN classification result= B
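The instance stores its last vote in self.class_result, so it can be reused for other query points. A hypothetical example (the point [1.1, 1.05] is illustrative, not from the original):

 new_point = np.array([1.1, 1.05]) # hypothetical query point
 obj.classify(know_group, new_point, class_category, 3)
 print(obj.class_vote()) # nearest three are A, A, B -> 'A'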

NB

 # -*- coding: utf-8 -*-
 """
 statistical classifier - Naive Bayes (NB)
 """
 import pandas as pd
 from sklearn import model_selection # train/test split
 from sklearn.naive_bayes import GaussianNB 

 iris=pd.read_csv("../data/iris.csv")
 print(iris.head())
 """
    Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
 0           5.1          3.5           1.4          0.2  setosa
 1           4.9          3.0           1.4          0.2  setosa
 2           4.7          3.2           1.3          0.2  setosa
 3           4.6          3.1           1.5          0.2  setosa
 4           5.0          3.6           1.4          0.2  setosa
 """

 #2. select the x and y variables
 cols = list(iris.columns)
 x_cols = cols[:4] # X: columns 1~4 (continuous)
 y_cols = cols[-1] # y: column 5 (categorical)

 #3. train/test split
 print(iris.shape) #(150, 5)
 train_iris, test_iris = model_selection.train_test_split(iris, test_size=0.3, random_state=123)
 print(train_iris.shape)#(105, 5)
 print(test_iris.shape)#(45, 5)

 #4. build the model on the train set
 obj = GaussianNB() # model object
 model = obj.fit(train_iris[x_cols], train_iris[y_cols])

 #5. evaluate the model
 pred = model.predict(test_iris[x_cols]) # predicted y
 Y = test_iris[y_cols] # ground truth

 #confusion matrix
 matrix=pd.crosstab(pred,Y)
 print(matrix)
 """
 Species     setosa  versicolor  virginica
 row_0
 setosa          18           0          0
 versicolor       0          10          2
 virginica        0           0         15
 """

 # .ix was removed from pandas; use positional indexing (.iloc) instead
 acc = (matrix.iloc[0,0] + matrix.iloc[1,1] + matrix.iloc[2,2]) / len(Y)
 print('classification accuracy=', acc) # classification accuracy= 0.9555555555555556
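The accuracy summed off the crosstab diagonal can be cross-checked with sklearn.metrics. A minimal sketch (not part of the original script):

 from sklearn.metrics import accuracy_score, confusion_matrix

 print(confusion_matrix(Y, pred)) # rows: actual, columns: predicted
 print(accuracy_score(Y, pred))   # expected: 0.9555...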

SVM

 # -*- coding: utf-8 -*-
 """
 SVM Model
 """
 import pandas as pd
 from sklearn import model_selection # train/test split
 from sklearn import svm #model

 iris=pd.read_csv("../data/iris.csv")
 print(iris.head())
 """
    Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
 0           5.1          3.5           1.4          0.2  setosa
 1           4.9          3.0           1.4          0.2  setosa
 2           4.7          3.2           1.3          0.2  setosa
 3           4.6          3.1           1.5          0.2  setosa
 4           5.0          3.6           1.4          0.2  setosa
 """

 #2. select the x and y variables
 cols = list(iris.columns)
 x_cols = cols[:4] # X: columns 1~4 (continuous)
 y_cols = cols[-1] # y: column 5 (categorical)

 #3. train/test split
 print(iris.shape) #(150, 5)
 train_iris, test_iris = model_selection.train_test_split(iris, test_size=0.3, random_state=123)
 print(train_iris.shape)#(105, 5)
 print(test_iris.shape)#(45, 5)

 #4. model - SVM
 obj = svm.SVC()
 model = obj.fit(train_iris[x_cols], train_iris[y_cols])

 #5. evaluate the model
 pred = model.predict(test_iris[x_cols])
 Y = test_iris[y_cols]

 #confusion matrix
 matrix=pd.crosstab(pred,Y)
 print(matrix)
 """
 Species     setosa  versicolor  virginica
 row_0
 setosa          18           0          0
 versicolor       0          10          1
 virginica        0           0         16
 """

 acc = (matrix.iloc[0,0] + matrix.iloc[1,1] + matrix.iloc[2,2]) / len(Y)
 print('classification accuracy=', acc) # classification accuracy= 0.9777777777777777
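SVC was used above with its default settings; the kernel and regularization strength can be tuned with a grid search. A sketch under assumed settings (the parameter grid below is illustrative, not from the original):

 from sklearn.model_selection import GridSearchCV

 # hypothetical grid; the values are illustrative
 param_grid = {'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 10]}
 grid = GridSearchCV(svm.SVC(), param_grid, cv=5)
 grid.fit(train_iris[x_cols], train_iris[y_cols])
 print(grid.best_params_, grid.best_score_)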

spam_train_test

 # -*- coding: utf-8 -*-
 """
 NB vs SVM
 - data set: sparse matrix
 - file name: ../data/spam_tran_test.npy
 """
 from sklearn.naive_bayes import GaussianNB
 from sklearn import svm
 import numpy as np
 import pandas as pd

 #1. file loading
 # allow_pickle=True is required on NumPy >= 1.16.3 to load object arrays
 X_train, X_test, y_train, y_test = np.load("../data/spam_tran_test.npy", allow_pickle=True)
 print(X_train.shape) #(3901, 4000)
 print(X_test.shape) #(1673, 4000)
 print(type(y_train))#<class 'list'>

 # list -> numpy conversion for vectorized operations
 y_train = np.array(y_train)
 y_test = np.array(y_test)
 print(type(y_train)) #<class 'numpy.ndarray'>

 #2. build the NB model
 obj =GaussianNB()
 nb_model=obj.fit(X_train,y_train)

 pred=nb_model.predict(X_test)
 Y=y_test

 matrix=pd.crosstab(pred,Y)
 print("nb matrix\n",matrix)
 """
  col_0     0(ham)    1(spam)
 row_0
 0      1264   28
 1       167  214
 """
 acc = (matrix.iloc[0,0] + matrix.iloc[1,1]) / len(Y)
 print("NB acc=", acc) #NB acc= 0.8834429169157203

 #2) precision: of the samples predicted yes, how many are actually yes
 precision = matrix.iloc[1,1] / (matrix.iloc[1,0] + matrix.iloc[1,1])
 print("precision=", precision) #precision= 0.5616797900262467

 #3) recall: of the samples actually yes, how many are predicted yes
 recall = matrix.iloc[1,1] / (matrix.iloc[0,1] + matrix.iloc[1,1])
 print("recall=", recall) #recall= 0.8842975206611571

 #4) f1 score: harmonic mean of precision and recall
 f1_score = 2 * (precision * recall) / (precision + recall)
 print('f1_score=', f1_score) #f1_score= 0.6869983948635634
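The hand-computed metrics can be cross-checked against sklearn.metrics. A minimal sketch (imported under a namespace so it does not shadow the f1_score variable above; label 1 marks spam, matching the crosstab):

 from sklearn import metrics

 print(metrics.accuracy_score(Y, pred))   # expected: 0.8834...
 print(metrics.precision_score(Y, pred))  # expected: 0.5616...
 print(metrics.recall_score(Y, pred))     # expected: 0.8842...
 print(metrics.f1_score(Y, pred))         # expected: 0.6869...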

 #3. SVM model
 svm_obj = svm.SVC(kernel='linear') # kernel function
 svm_model=svm_obj.fit(X_train,y_train)

 svm_pred=svm_model.predict(X_test)
 svm_Y=y_test

 svm_matrix=pd.crosstab(svm_pred,svm_Y)
 print("svm matrix\n",svm_matrix)

 """
 svm matrix
  col_0     0    1
 row_0
 0      1428   36
 1         3  206
 """

 svm_acc = (svm_matrix.iloc[0,0] + svm_matrix.iloc[1,1]) / len(svm_Y)
 print("svm acc=", svm_acc) #svm acc= 0.976688583383144

sms_spam_data

 # -*- coding: utf-8 -*-
 """
 Created on Sat Feb 23 15:52:23 2019

 @author: 502-03
 """

 from sklearn.naive_bayes import GaussianNB
 from sklearn import svm
 import numpy as np
 import pandas as pd

 #1. file loading
 # allow_pickle=True is required on NumPy >= 1.16.3 to load object arrays
 X_train, X_test, y_train, y_test = np.load("../data/sms_spam_data.npy", allow_pickle=True)
 print(X_train.shape) #(4446, 6000)
 print(X_test.shape) #(1112, 6000)
 print(type(y_train))#<class 'pandas.core.series.Series'>

 # build the NB model
 obj=GaussianNB()
 nb_model=obj.fit(X_train,y_train)
 nb_pred=nb_model.predict(X_test)
 nb_Y=y_test

 nb_tab=pd.crosstab(nb_pred,nb_Y)
 print("nb_tab=\n",nb_tab)
 """
 nb_tab=
  type   ham  spam
 row_0
 ham    812    10
 spam   156   134
 """
 nb_acc = (nb_tab.iloc[0,0] + nb_tab.iloc[1,1]) / len(nb_Y)
 print("nb_acc=", nb_acc) #nb_acc= 0.8507194244604317

 #svm
 obj=svm.SVC(kernel='linear')
 svc_model=obj.fit(X_train,y_train)
 svc_pred=svc_model.predict(X_test)
 svc_Y=y_test

 svc_tab=pd.crosstab(svc_pred,svc_Y)
 print("svc_tab=\n",svc_tab)
 """
 svc_tab=
  type   ham  spam
 row_0
 ham    964    20
 spam     4   124
 """
 svc_acc = (svc_tab.iloc[0,0] + svc_tab.iloc[1,1]) / len(svc_Y)
 print("svc_acc=", svc_acc) #svc_acc= 0.9784172661870504

 precision = svc_tab.iloc[1,1] / (svc_tab.iloc[1,0] + svc_tab.iloc[1,1])
 print("precision", precision) #precision 0.96875

 recall = svc_tab.iloc[1,1] / (svc_tab.iloc[0,1] + svc_tab.iloc[1,1])
 print("recall", recall) #recall 0.8611111111111112

 f1_score = 2 * (precision * recall) / (precision + recall)
 print("f1_score", f1_score) #f1_score 0.911764705882353
