kNN1

 # -*- coding: utf-8 -*-
 """
 kNN : 최근접 이웃
 """

 import numpy as np # 다차원배열, 선형대수 연산
 import matplotlib.pyplot as plt

 # 1. Scatter plot of the two known groups and the point to classify
 labelled_points = [
     (1.2, 1.1),  # group A
     (1.0, 1.0),  # group A
     (1.8, 0.8),  # group B
     (2, 0.9),  # group B
 ]
 for x_coord, y_coord in labelled_points:
     # One scatter call per labelled point.
     plt.scatter(x_coord, y_coord)

 # The point to classify (group unknown) is drawn in red.
 plt.scatter(1.6, 0.85, color='r')
 plt.show()

 # 2. Sample data and the helper that packages it
 p1, p2 = [1.2, 1.1], [1.0, 1.0]  # group A
 p3, p4 = [1.8, 0.8], [2, 0.9]  # group B
 category = ['A', 'A', 'B', 'B']  # class labels of p1..p4, in order (the y variable)
 p5 = [1.6, 0.85]  # point to classify


 def data_set():
     """Return the module-level sample data as NumPy arrays.

     Returns:
         A tuple ``(know_group, not_know_group, class_category)``:
         the four labelled points stacked row by row, the point to
         classify, and the labels of the four labelled points.
     """
     # Converted to NumPy so callers can use vectorised arithmetic.
     labelled = np.array([p1, p2, p3, p4])
     unlabelled = np.array(p5)
     labels = np.array(category)
     return labelled, unlabelled, labels

 know_group, not_know_group, class_category = data_set()

 print('알려진 집단')
 # Expected output:
 # [[1.2 1.1]
 #  [1.  1. ]
 #  [1.8 0.8]
 #  [2.  0.9]]
 print(know_group)
 print('알려지지 않은 집단')
 print(not_know_group)  # [1.6  0.85]

 print('정답')
 print(class_category)  # ['A' 'A' 'B' 'B']

 # Euclidean distance, step by step:
 # difference (-) -> square (**) -> sum -> square root (sqrt)

 # 2-D array minus 1-D array: the point is subtracted from every row.
 diff = np.subtract(know_group, not_know_group)
 print('차=\n', diff)
 # Expected output (after the label line):
 #  [[-0.4   0.25]
 #  [-0.6   0.15]
 #  [ 0.2  -0.05]
 #  [ 0.4   0.05]]

 sq_diff = np.square(diff)
 sq_sum = np.sum(sq_diff, axis=1)  # row-wise total
 print(sq_sum)  # [0.2225 0.3825 0.0425 0.1625]
 distance = np.sqrt(sq_sum)
 print(distance)  # [0.47169906 0.61846584 0.20615528 0.40311289]
 # Distance ranks are [3 4 1 2]; with k=3 the neighbours are B, B, A, so B (2) beats A (1).
 print(class_category)  # ['A' 'A' 'B' 'B']

 def classfy(know, not_know, cate, k):
     """Count the class labels of the ``k`` nearest neighbours of a point.

     Distances are Euclidean. The tally is built nearest-first, so the
     insertion order of the returned dict reflects neighbour proximity.

     Args:
         know: 2-D array-like of labelled points, one row per point.
         not_know: 1-D array-like, the point to classify; it must have as
             many values as a row of ``know``.
         cate: Sequence of labels where ``cate[i]`` is the label of
             ``know[i]``.
         k: Number of nearest neighbours to count. A value larger than
             the number of labelled points counts all of them; a value
             of zero or less counts none.

     Returns:
         A dict mapping each label found among the neighbours to the
         number of neighbours carrying it.
     """
     # Accept plain lists as well as arrays.
     know = np.asarray(know)
     not_know = np.asarray(not_know)

     # 1. Euclidean distance from the target point to every labelled point.
     diff = know - not_know
     distance = np.sqrt((diff ** 2).sum(axis=1))

     # 2. Indices of the labelled points, nearest first. A stable sort keeps
     #    equally distant points in their original order, so ties are
     #    resolved deterministically.
     nearest_first = distance.argsort(kind="stable")

     # 3. Tally the labels of the k nearest neighbours. Slicing (rather than
     #    indexing position by position) cannot run past the end when k
     #    exceeds the number of labelled points; max() keeps a negative k
     #    from being read as "all but the last |k|".
     class_result = {}  # empty dict: label -> count
     for idx in nearest_first[:max(k, 0)]:
         key = cate[idx]
         class_result[key] = class_result.get(key, 0) + 1
     return class_result

 # Call the function: tally the labels of the 3 nearest neighbours of the unknown point
 class_result=classfy(know_group,not_know_group,class_category,3)
 print(class_result) #{'B': 2, 'A': 1}

 # Voting function
 def class_vote(class_result):
     """Return the label with the highest count in ``class_result``.

     Args:
         class_result: Dict mapping label -> neighbour count.

     Returns:
         The label whose count is largest; when several labels share the
         largest count, the one that comes first in the dict wins.
     """
     winner, _count = max(class_result.items(), key=lambda item: item[1])
     return winner

 # Pick the most frequent label among the neighbours as the prediction
 vote_result=class_vote(class_result)
 print("분류결과=",vote_result)# expected label: B

kNN Class

 # -*- coding: utf-8 -*-
 """
 class 구현
 """

 import numpy as np
 from Step01_kNN import data_set

 # Fetch the three values returned by data_set(), imported from Step01_kNN
 # (presumably the labelled points, the point to classify and the labels - confirm against that module)
 know_group,not_know_group,class_category=data_set()

 #class =Func1+Func2+Func3
 class kNNclassify:
     """k-nearest-neighbour classifier that keeps its latest tally on the instance.

     Typical use: call ``classfy`` to count the labels of the nearest
     neighbours, then ``class_vote`` to obtain the winning label.

     Attributes are documented inline in ``__init__``.
     """

     def __init__(self):
         # Label -> count tally of the most recent classfy() call. Created
         # here so the attribute exists even before classfy() has run.
         self.class_result = {}

     # 1. Nearest neighbours
     def classfy(self, know, not_know, cate, k):
         """Count the labels of the ``k`` nearest neighbours and store the tally.

         Args:
             know: 2-D array-like of labelled points, one row per point.
             not_know: 1-D array-like, the point to classify; it must
                 have as many values as a row of ``know``.
             cate: Sequence of labels where ``cate[i]`` is the label of
                 ``know[i]``.
             k: Number of nearest neighbours to count. A value larger
                 than the number of labelled points counts all of them;
                 a value of zero or less counts none.

         Returns:
             The tally dict (label -> count), which is also stored in
             ``self.class_result``.
         """
         # Accept plain lists as well as arrays.
         know = np.asarray(know)
         not_know = np.asarray(not_know)

         # Euclidean distance from the target point to every labelled point.
         diff = know - not_know
         distance = np.sqrt((diff ** 2).sum(axis=1))

         # 2. Indices sorted nearest first; the stable sort keeps equally
         #    distant points in their original order.
         nearest_first = distance.argsort(kind="stable")

         # 3. Tally the k nearest labels. Slicing cannot run past the end
         #    when k exceeds the number of labelled points, and max() keeps
         #    a negative k from selecting "all but the last |k|".
         tally = {}  # empty dict: label -> count
         for idx in nearest_first[:max(k, 0)]:
             key = cate[idx]
             tally[key] = tally.get(key, 0) + 1
         self.class_result = tally
         return tally

     # Voting function
     def class_vote(self):
         """Return the label with the highest count in the stored tally.

         When several labels share the highest count, the one that was
         counted first (i.e. belongs to the nearer neighbour) wins.

         Raises:
             ValueError: If the tally is empty, e.g. ``classfy`` has not
                 been called yet or was called with ``k <= 0``.
         """
         if not self.class_result:
             raise ValueError("no neighbours counted; call classfy() with k >= 1 first")
         return max(self.class_result, key=self.class_result.get)

 # Create the classifier object
 obj=kNNclassify() # constructor call

 # classfy() stores its tally on the object as obj.class_result
 obj.classfy(know_group,not_know_group,class_category,3)

 vote_result=obj.class_vote()
 print('kNN 분류결과=',vote_result)# expected label: B

NB

 # -*- coding: utf-8 -*-
 """
 통계적 분류기 - NB
 """
 import pandas as pd
 from sklearn import model_selection#train/test
 from sklearn.naive_bayes import GaussianNB 

 # 1. Load the iris data set
 iris=pd.read_csv("../data/iris.csv")
 print(iris.head())
 # Expected output:
 #    Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
 # 0           5.1          3.5           1.4          0.2  setosa
 # 1           4.9          3.0           1.4          0.2  setosa
 # 2           4.7          3.2           1.3          0.2  setosa
 # 3           4.6          3.1           1.5          0.2  setosa
 # 4           5.0          3.6           1.4          0.2  setosa

 # 2. Choose the x and y variables
 cols=list(iris.columns)
 x_cols=cols[:4]  # X: columns 1-4 (continuous)
 y_cols=cols[-1]  # y: column 5 (categorical)

 # 3. Train/test split
 iris_df=iris
 print(iris_df.shape)  # (150, 5)
 train_iris,test_iris=model_selection.train_test_split(iris_df,test_size=0.3,random_state=123)
 print(train_iris.shape)  # (105, 5)
 print(test_iris.shape)  # (45, 5)

 # 4. Fit the model on the training set
 obj=GaussianNB()  # estimator object
 model=obj.fit(train_iris[x_cols],train_iris[y_cols])

 # 5. Evaluate the model on the test set
 pred=model.predict(test_iris[x_cols])  # predicted y
 Y = test_iris[y_cols]  # true labels

 # Confusion matrix: rows are predictions, columns are true labels
 matrix=pd.crosstab(pred,Y)
 print(matrix)
 # Expected output:
 # Species     setosa  versicolor  virginica
 # row_0
 # setosa          18           0          0
 # versicolor       0          10          2
 # virginica        0           0         15

 # Accuracy = correct predictions (the diagonal) / number of test rows.
 # DataFrame.ix was removed in pandas 1.0; .iloc is the positional accessor.
 acc= (matrix.iloc[0,0]+matrix.iloc[1,1]+matrix.iloc[2,2])/len(Y)
 print('분류정확도=',acc)  # 0.9555555555555556

SVM

 # -*- coding: utf-8 -*-
 """
 SVM Model
 """
 import pandas as pd
 from sklearn import model_selection#train/test
 from sklearn import svm #model

 # 1. Load the iris data set
 iris=pd.read_csv("../data/iris.csv")
 print(iris.head())
 # Expected output:
 #    Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
 # 0           5.1          3.5           1.4          0.2  setosa
 # 1           4.9          3.0           1.4          0.2  setosa
 # 2           4.7          3.2           1.3          0.2  setosa
 # 3           4.6          3.1           1.5          0.2  setosa
 # 4           5.0          3.6           1.4          0.2  setosa

 # 2. Choose the x and y variables
 cols=list(iris.columns)
 x_cols=cols[:4]  # X: columns 1-4 (continuous)
 y_cols=cols[-1]  # y: column 5 (categorical)

 # 3. Train/test split
 iris_df=iris
 print(iris_df.shape)  # (150, 5)
 train_iris,test_iris=model_selection.train_test_split(iris_df,test_size=0.3,random_state=123)
 print(train_iris.shape)  # (105, 5)
 print(test_iris.shape)  # (45, 5)

 # 4. Fit the SVM model on the training set
 obj=svm.SVC()
 model=obj.fit(train_iris[x_cols],train_iris[y_cols])

 # 5. Evaluate the model on the test set
 pred=model.predict(test_iris[x_cols])  # predicted y
 Y=test_iris[y_cols]  # true labels

 # Confusion matrix: rows are predictions, columns are true labels
 matrix=pd.crosstab(pred,Y)
 print(matrix)
 # Expected output:
 # Species     setosa  versicolor  virginica
 # row_0
 # setosa          18           0          0
 # versicolor       0          10          1
 # virginica        0           0         16

 # Accuracy = correct predictions (the diagonal) / number of test rows.
 # DataFrame.ix was removed in pandas 1.0; .iloc is the positional accessor.
 acc= (matrix.iloc[0,0]+matrix.iloc[1,1]+matrix.iloc[2,2])/len(Y)
 print('분류정확도=',acc)  # 0.9777777777777777

spam_train_test

 # -*- coding: utf-8 -*-
 """
 NB vs SVM
 -data set :sparse matrix 이용
 -file name:../data/spam_tran_test.npy
 """
 from sklearn.naive_bayes import GaussianNB
 from sklearn import svm
 import numpy as np
 import pandas as pd

 # 1. Load the data file
 # NOTE(review): the file is unpacked into four differently shaped objects, so
 # it is presumably a pickled object array. NumPy >= 1.16.3 refuses to load
 # those unless allow_pickle=True. Unpickling can execute arbitrary code, so
 # only load files from a trusted source.
 X_train,X_test,y_train,y_test=np.load("../data/spam_tran_test.npy",allow_pickle=True)
 print(X_train.shape)  # (3901, 4000)
 print(X_test.shape)  # (1673, 4000)
 print(type(y_train))  # <class 'list'>

 # list -> NumPy array, so the labels support vectorised operations
 y_train=np.array(y_train)
 y_test=np.array(y_test)
 print(type(y_train))  # <class 'numpy.ndarray'>

 # 2. Naive Bayes model
 obj =GaussianNB()
 nb_model=obj.fit(X_train,y_train)

 pred=nb_model.predict(X_test)
 Y=y_test

 # Confusion matrix: rows are predictions, columns are true labels
 matrix=pd.crosstab(pred,Y)
 print("nb matrix\n",matrix)
 # Expected output (0 = ham, 1 = spam):
 #  col_0     0     1
 # row_0
 # 0      1264   28
 # 1       167  214

 # 1) Accuracy: correct predictions (the diagonal) / number of test rows.
 # DataFrame.ix was removed in pandas 1.0; .iloc is the positional accessor.
 acc=(matrix.iloc[0,0]+matrix.iloc[1,1])/len(Y)
 print("NB acc=",acc)  # 0.8834429169157203

 # 2) Precision: of the rows predicted 1, the share that are truly 1
 precision=matrix.iloc[1,1]/(matrix.iloc[1,0]+matrix.iloc[1,1])
 print("정확률=",precision)  # 0.5616797900262467

 # 3) Recall: of the rows that are truly 1, the share predicted 1
 recall=matrix.iloc[1,1]/(matrix.iloc[0,1]+matrix.iloc[1,1])
 print("재현률=",recall)  # 0.8842975206611571

 # 4) F1 score: harmonic mean of precision and recall
 f1_score=2 * (precision*recall)/(precision+recall)
 print('f1_score=',f1_score)  # 0.6869983948635634

 # 3. SVM model
 svm_obj =svm.SVC(kernel='linear')  # linear kernel function
 svm_model=svm_obj.fit(X_train,y_train)

 svm_pred=svm_model.predict(X_test)
 svm_Y=y_test

 svm_matrix=pd.crosstab(svm_pred,svm_Y)
 print("svm matrix\n",svm_matrix)

 # Expected output:
 # svm matrix
 #  col_0     0    1
 # row_0
 # 0      1428   36
 # 1         3  206

 svm_acc=(svm_matrix.iloc[0,0]+svm_matrix.iloc[1,1])/len(svm_Y)
 print("svm acc=",svm_acc)  # 0.976688583383144
 print("svm acc=",svm_acc) #svm acc= 0.976688583383144

sms_spam_data

 # -*- coding: utf-8 -*-
 """
 Created on Sat Feb 23 15:52:23 2019

 @author: 502-03
 """

 from sklearn.naive_bayes import GaussianNB
 from sklearn import svm
 import numpy as np
 import pandas as pd

 # 1. Load the data file
 # NOTE(review): the file is unpacked into four differently shaped objects, so
 # it is presumably a pickled object array. NumPy >= 1.16.3 refuses to load
 # those unless allow_pickle=True. Unpickling can execute arbitrary code, so
 # only load files from a trusted source.
 X_train,X_test,y_train,y_test=np.load("../data/sms_spam_data.npy",allow_pickle=True)
 print(X_train.shape)  # (4446, 6000)
 print(X_test.shape)  # (1112, 6000)
 print(type(y_train))  # <class 'pandas.core.series.Series'>

 # 2. Naive Bayes model
 obj=GaussianNB()
 nb_model=obj.fit(X_train,y_train)
 nb_pred=nb_model.predict(X_test)
 nb_Y=y_test

 # Confusion matrix: rows are predictions, columns are true labels
 nb_tab=pd.crosstab(nb_pred,nb_Y)
 print("nb_tab=\n",nb_tab)
 # Expected output:
 # nb_tab=
 #  type   ham  spam
 # row_0
 # ham    812    10
 # spam   156   134

 # Accuracy: correct predictions (the diagonal) / number of test rows.
 # DataFrame.ix was removed in pandas 1.0; .iloc is the positional accessor.
 nb_acc=(nb_tab.iloc[0,0]+nb_tab.iloc[1,1])/len(nb_Y)
 print("nb_acc=",nb_acc)  # 0.8507194244604317

 # 3. SVM model with a linear kernel
 obj=svm.SVC(kernel='linear')
 svc_model=obj.fit(X_train,y_train)
 svc_pred=svc_model.predict(X_test)
 svc_Y=y_test

 svc_tab=pd.crosstab(svc_pred,svc_Y)
 print("svc_tab=\n",svc_tab)
 # Expected output:
 # svc_tab=
 #  type   ham  spam
 # row_0
 # ham    964    20
 # spam     4   124

 svc_acc=(svc_tab.iloc[0,0]+svc_tab.iloc[1,1])/len(svc_Y)
 print("svc_acc=",svc_acc)  # 0.9784172661870504

 # Precision: of the rows predicted spam, the share that are truly spam
 precision=svc_tab.iloc[1,1]/(svc_tab.iloc[1,0]+svc_tab.iloc[1,1])
 print("정확률",precision)  # 0.96875

 # Recall: of the rows that are truly spam, the share predicted spam
 recall=svc_tab.iloc[1,1]/(svc_tab.iloc[0,1]+svc_tab.iloc[1,1])
 print("재현률",recall)  # 0.8611111111111112

 # F1 score: harmonic mean of precision and recall
 f1_score=2* (precision * recall)/(precision + recall)
 print("f1_score",f1_score)  # 0.911764705882353

 precision=svc_tab.ix[1,1]/(svc_tab.ix[1,0]+svc_tab.ix[1,1])
 print("정확률",precision)#정확률 0.96875

 recall=svc_tab.ix[1,1]/(svc_tab.ix[0,1]+svc_tab.ix[1,1])
 print("재현률",recall)#재현률 0.8611111111111112

 f1_score=2* (precision * recall)/(precision + recall)
 print("f1_score",f1_score)#f1_score 0.911764705882353

Classification的更多相关文章

  1. W3School-CSS 分类 (Classification) 实例

    CSS 分类 (Classification) 实例 CSS 实例 CSS 背景实例 CSS 文本实例 CSS 字体(font)实例 CSS 边框(border)实例 CSS 外边距 (margin) ...

  2. Large Margin DAGs for Multiclass Classification

    Abstract We present a new learning architecture: the Decision Directed Acyclic Graph (DDAG), which i ...

  3. 《ImageNet Classification with Deep Convolutional Neural Networks》 剖析

    《ImageNet Classification with Deep Convolutional Neural Networks》 剖析 CNN 领域的经典之作, 作者训练了一个面向数量为 ...

  4. 自然语言23_Text Classification with NLTK

    QQ:231469242 欢迎喜欢nltk朋友交流 https://www.pythonprogramming.net/text-classification-nltk-tutorial/?compl ...

  5. MATLAB 图像分类 Image Category Classification Using Bag of Features

    使用MATLAB实现图像的识别,这是MATLAB官网上面的例子,学习一下. http://cn.mathworks.com/help/vision/examples/image-category-cl ...

  6. Galaxy Classification

    10.3 Data Preparation After removing a large number of the columns from the raw SDSS dataset, introd ...

  7. Kaiju: Fast and sensitive taxonomic classification for metagenomics

    Kaiju: Fast and sensitive taxonomic classification for  metagenomics   问题描述:However, nucleotide comp ...

  8. 《Automatic Face Classification of Cushing’s Syndrome in Women – A Novel Screening Approach》学习笔记

    《针对女性库欣综合征患者的自动面部分类-一种新颖的筛查方法》 Abstract 目的:库兴氏综合征对身体造成相当大的伤害如果不及时治疗,还经常是诊断的时间太长.在这项研究中,我们旨在测试面 ...

  9. [CS231n-CNN] Image classification and the data-driven approach, k-nearest neighbor, Linear classification I

    课程主页:http://cs231n.stanford.edu/ Task: Challenges: _________________________________________________ ...

  10. [ML] Naive Bayes for Text Classification

    TF-IDF Algorithm From http://www.ruanyifeng.com/blog/2013/03/tf-idf.html Chapter 1, 知道了"词频" ...

随机推荐

  1. IO多路复用和local概念

    一.local 在多个线程之间使用threading.local对象,可以实现多个线程之间的数据隔离 import time import random from threading import T ...

  2. opencv 边缘检测原理

    只是实现一下,暂不考虑效率 import cv2 as cv import numpy as np import math # 从源码层面实现边缘检测 img = cv.imread('../imag ...

  3. 【设计模式】【应用】使用模板方法设计模式、策略模式 处理DAO中的增删改查

    原文:使用模板方法设计模式.策略模式 处理DAO中的增删改查 关于模板模式和策略模式参考前面的文章. 分析 在dao中,我们经常要做增删改查操作,如果每个对每个业务对象的操作都写一遍,代码量非常庞大. ...

  4. OpenLayers学习笔记(九)— 限制地图显示范围

    openlayers 3 地图上限制地图显示及拖动范围,坐标系是4326转3857,中心经纬度精确到小数点后六位,减少误差 GitHub:八至 作者:狐狸家的鱼 本文链接:ol3-限制地图显示及拖动范 ...

  5. zkclient中包引用不对,导致NoSuchMethodError

    nidonglin commented on 31 Oct 2014 Exception in thread "main" java.lang.NoSuchMethodError: ...

  6. (二叉树 BFS) leetcode102. Binary Tree Level Order Traversal

    Given a binary tree, return the level order traversal of its nodes' values. (ie, from left to right, ...

  7. Pandas系列(一)-Series详解

    一.初始Series Series 是一个带有 名称 和索引的一维数组,既然是数组,肯定要说到的就是数组中的元素类型,在 Series 中包含的数据类型可以是整数.浮点.字符串.Python对象等. ...

  8. Docker:私有仓库registry [十一]

    一.运行docker私有仓库 安装registry docker run -d -p 5000:5000 --restart=always --name registry -v /opt/myregi ...

  9. [简洁]JavaScript中添加、移除、移动、复制、创建和查找节点元素

    查找: document.getElementsByTagName通过标签名获取元素,不论有多少个都返回元素集合. document.getElementsByClassName通过类名获取元素,同上 ...

  10. centos7安装notepadqq

    这是在centos7 上发表的第一篇博文 对linux系统陌生,折腾了一天,安装好了搜狗输入法.相关文章也不少,但照着一步一步来,都没有成功.最后照着这篇弄成了: ****** 安装notepadd+ ...