kNN1

 # -*- coding: utf-8 -*-
 """
 kNN : k-nearest neighbors
 """

 import numpy as np # multidimensional arrays, linear-algebra operations
 import matplotlib.pyplot as plt

 # 1. scatter plot of the two known groups
 plt.scatter(1.2, 1.1) # group A
 plt.scatter(1.0, 1.0)
 plt.scatter(1.8, 0.8) # group B
 plt.scatter(2, 0.9)

 plt.scatter(1.6, 0.85, color='r') # point to classify (unknown group)
 plt.show()

 # 2. data creation and function definition
 p1 = [1.2, 1.1] # group A
 p2 = [1.0, 1.0]
 p3 = [1.8, 0.8] # group B
 p4 = [2, 0.9]
 category = ['A','A','B','B'] # class labels of the known groups (Y variable)
 p5 = [1.6, 0.85] # point to classify

 # function that builds the data set
 def data_set():
     # convert to NumPy arrays for linear-algebra operations
     know_group = np.array([p1, p2, p3, p4]) # known groups
     not_know_group = np.array(p5) # unknown point
     class_category = np.array(category) # answers (class labels)
     return know_group, not_know_group, class_category

 know_group, not_know_group, class_category = data_set()
 print('known groups')
 """
 [[1.2 1.1]
  [1.  1. ]
  [1.8 0.8]
  [2.  0.9]]
 """
 print(know_group)
 print('unknown point')
 print(not_know_group) # [1.6  0.85]

 print('class labels')
 print(class_category) # ['A' 'A' 'B' 'B']

 # Euclidean distance pipeline:
 # difference (-) -> square (**) -> sum (sum) -> square root (sqrt)

 diff = know_group - not_know_group # 2-D array minus 1-D array (broadcasting)
 print('difference=\n', diff)
 """
 difference=
  [[-0.4   0.25]
  [-0.6   0.15]
  [ 0.2  -0.05]
  [ 0.4   0.05]]
 """

 sq_diff = diff ** 2
 sq_sum = sq_diff.sum(axis=1) # row-wise sums
 print(sq_sum) # [0.2225 0.3825 0.0425 0.1625]
 distance = np.sqrt(sq_sum)
 print(distance) # [0.47169906 0.61846584 0.20615528 0.40311289]
 # distance ranks: [3 4 1 2]; with k=3 the vote is B(2) > A(1)
 print(class_category) # ['A' 'A' 'B' 'B']
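
As a quick check, the whole difference/square/sum/sqrt pipeline collapses into a single np.linalg.norm call:

 # equivalent one-liner: L2 norms of the row-wise differences
 print(np.linalg.norm(know_group - not_know_group, axis=1))
 # [0.47169906 0.61846584 0.20615528 0.40311289]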

 def classify(know, not_know, cate, k):
     # 1. Euclidean distance calculation
     diff = know - not_know
     sq_diff = diff ** 2
     sq_sum = sq_diff.sum(axis=1)
     distance = np.sqrt(sq_sum)

     # 2. sort distances in ascending order -> indices
     sortDist = distance.argsort() # sort -> index
     #print(sortDist) # [2 3 0 1]

     # 3. count the labels of the k nearest neighbors
     class_result = {} # empty dict
     for i in range(k): # 0~2
         key = cate[sortDist[i]] # i=0 -> 'B'
         class_result[key] = class_result.get(key, 0) + 1
     return class_result

 # call the function
 class_result = classify(know_group, not_know_group, class_category, 3)
 print(class_result) # {'B': 2, 'A': 1}

 # vote function
 def class_vote(class_result):
     return max(class_result, key=class_result.get)

 vote_result = class_vote(class_result)
 print("classification result=", vote_result) # classification result= B

kNN Class

 # -*- coding: utf-8 -*-
 """
 class implementation of kNN
 """

 import numpy as np
 from Step01_kNN import data_set

 know_group, not_know_group, class_category = data_set()

 # class = the functions above (classify + vote) bundled together
 class kNNclassify:

     # 1. nearest neighbors
     def classify(self, know, not_know, cate, k):
         # Euclidean distance calculation
         diff = know - not_know
         sq_diff = diff ** 2
         sq_sum = sq_diff.sum(axis=1)
         distance = np.sqrt(sq_sum)

         # 2. sort distances in ascending order -> indices
         sortDist = distance.argsort() # sort -> index
         #print(sortDist) # [2 3 0 1]

         # 3. count the labels of the k nearest neighbors (k=3)
         self.class_result = {} # empty dict
         for i in range(k): # 0~2
             key = cate[sortDist[i]] # i=0 -> 'B'
             self.class_result[key] = self.class_result.get(key, 0) + 1

     # vote function
     def class_vote(self):
         return max(self.class_result, key=self.class_result.get)

 # create a class instance
 obj = kNNclassify() # constructor

 # object member : self.class_result
 obj.classify(know_group, not_know_group, class_category, 3)

 vote_result = obj.class_vote()
 print('kNN classification result=', vote_result) # kNN classification result= B
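
For reference, the counting loop and the vote can be collapsed with collections.Counter from the standard library; a minimal sketch over the same data_set() arrays:

 from collections import Counter

 def classify_counter(know, not_know, cate, k):
     # Euclidean distances from the unknown point to every known point
     distance = np.sqrt(((know - not_know) ** 2).sum(axis=1))
     # count the labels of the k nearest points and return the majority
     votes = Counter(cate[i] for i in distance.argsort()[:k])
     return votes.most_common(1)[0][0]

 print(classify_counter(know_group, not_know_group, class_category, 3)) # B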

NB

 # -*- coding: utf-8 -*-
 """
 Statistical classifier - Naive Bayes (NB)
 """
 import pandas as pd
 from sklearn import model_selection # train/test split
 from sklearn.naive_bayes import GaussianNB 

 iris=pd.read_csv("../data/iris.csv")
 print(iris.head())
 """
    Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
 0           5.1          3.5           1.4          0.2  setosa
 1           4.9          3.0           1.4          0.2  setosa
 2           4.7          3.2           1.3          0.2  setosa
 3           4.6          3.1           1.5          0.2  setosa
 4           5.0          3.6           1.4          0.2  setosa
 """

 # 2. select the x and y variables
 cols = list(iris.columns)
 x_cols = cols[:4] # X: columns 1~4 (continuous)
 y_cols = cols[-1] # y: column 5 (categorical)

 # 3. train/test split
 iris_df = iris
 print(iris_df.shape) # (150, 5)
 train_iris, test_iris = model_selection.train_test_split(iris_df, test_size=0.3, random_state=123)
 print(train_iris.shape) # (105, 5)
 print(test_iris.shape) # (45, 5)

 # 4. build the model on the train set
 obj = GaussianNB() # model object
 model = obj.fit(train_iris[x_cols], train_iris[y_cols])

 # 5. evaluate the model
 pred = model.predict(test_iris[x_cols]) # predicted Y
 Y = test_iris[y_cols] # true labels

 #confusion matrix
 matrix=pd.crosstab(pred,Y)
 print(matrix)
 """
 Species     setosa  versicolor  virginica
 row_0
 setosa          18           0          0
 versicolor       0          10          2
 virginica        0           0         15
 """

 acc = (matrix.iloc[0,0] + matrix.iloc[1,1] + matrix.iloc[2,2]) / len(Y) # .iloc: .ix was removed in recent pandas
 print('accuracy=', acc) # accuracy= 0.9555555555555556
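
The accuracy can also be computed with sklearn.metrics instead of indexing the crosstab by hand, which avoids misaligned cells when some class is never predicted; a minimal sketch reusing pred and Y:

 from sklearn import metrics

 # fraction of matching labels; same value as the diagonal sum above
 print('accuracy=', metrics.accuracy_score(Y, pred))
 # confusion matrix with true labels as rows (the transpose of pd.crosstab(pred, Y))
 print(metrics.confusion_matrix(Y, pred))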

SVM

 # -*- coding: utf-8 -*-
 """
 SVM Model
 """
 import pandas as pd
 from sklearn import model_selection # train/test split
 from sklearn import svm #model

 iris=pd.read_csv("../data/iris.csv")
 print(iris.head())
 """
    Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
 0           5.1          3.5           1.4          0.2  setosa
 1           4.9          3.0           1.4          0.2  setosa
 2           4.7          3.2           1.3          0.2  setosa
 3           4.6          3.1           1.5          0.2  setosa
 4           5.0          3.6           1.4          0.2  setosa
 """

 # 2. select the x and y variables
 cols = list(iris.columns)
 x_cols = cols[:4] # X: columns 1~4 (continuous)
 y_cols = cols[-1] # y: column 5 (categorical)

 # 3. train/test split
 iris_df = iris
 print(iris_df.shape) # (150, 5)
 train_iris, test_iris = model_selection.train_test_split(iris_df, test_size=0.3, random_state=123)
 print(train_iris.shape) # (105, 5)
 print(test_iris.shape) # (45, 5)

 # 4. model - SVM
 obj = svm.SVC()
 model = obj.fit(train_iris[x_cols], train_iris[y_cols])

 # 5. evaluate the model
 pred = model.predict(test_iris[x_cols])
 Y = test_iris[y_cols]

 #confusion matrix
 matrix=pd.crosstab(pred,Y)
 print(matrix)
 """
 Species     setosa  versicolor  virginica
 row_0
 setosa          18           0          0
 versicolor       0          10          1
 virginica        0           0         16
 """

 acc = (matrix.iloc[0,0] + matrix.iloc[1,1] + matrix.iloc[2,2]) / len(Y) # .iloc: .ix was removed in recent pandas
 print('accuracy=', acc) # accuracy= 0.9777777777777777
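
SVMs are sensitive to feature scale, so standardizing the inputs is common practice; a minimal sketch with StandardScaler in a Pipeline (the iris features are already on similar scales, so the effect here should be small):

 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import StandardScaler
 from sklearn import metrics

 # scale each feature to zero mean / unit variance before the SVM
 pipe = Pipeline([('scale', StandardScaler()), ('svc', svm.SVC())])
 pipe.fit(train_iris[x_cols], train_iris[y_cols])
 print('scaled accuracy=', metrics.accuracy_score(Y, pipe.predict(test_iris[x_cols])))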

spam_train_test

 # -*- coding: utf-8 -*-
 """
 NB vs SVM
 - data set: sparse document-term matrix
 - file name: ../data/spam_tran_test.npy
 """
 from sklearn.naive_bayes import GaussianNB
 from sklearn import svm
 import numpy as np
 import pandas as pd

 # 1. file loading
 # note: NumPy >= 1.16.3 requires allow_pickle=True to load object arrays
 X_train, X_test, y_train, y_test = np.load("../data/spam_tran_test.npy", allow_pickle=True)
 print(X_train.shape) # (3901, 4000)
 print(X_test.shape) # (1673, 4000)
 print(type(y_train)) # <class 'list'>

 # list -> NumPy array, for linear-algebra operations
 y_train = np.array(y_train)
 y_test = np.array(y_test)
 print(type(y_train)) # <class 'numpy.ndarray'>

 # 2. build the NB model
 obj = GaussianNB()
 nb_model = obj.fit(X_train, y_train)

 pred = nb_model.predict(X_test)
 Y = y_test

 matrix = pd.crosstab(pred, Y)
 print("nb matrix\n", matrix)
 """
 col_0  0(ham)  1(spam)
 row_0
 0        1264       28
 1         167      214
 """
 acc = (matrix.iloc[0,0] + matrix.iloc[1,1]) / len(Y) # .iloc: .ix was removed in recent pandas
 print("NB acc=", acc) # NB acc= 0.8834429169157203

 # 2) precision: of the cases predicted yes, how many are actually yes
 precision = matrix.iloc[1,1] / (matrix.iloc[1,0] + matrix.iloc[1,1])
 print("precision=", precision) # precision= 0.5616797900262467

 # 3) recall: of the actual yes cases, how many are predicted yes
 recall = matrix.iloc[1,1] / (matrix.iloc[0,1] + matrix.iloc[1,1])
 print("recall=", recall) # recall= 0.8842975206611571

 # 4) f1 score: harmonic mean of precision and recall
 f1_score = 2 * (precision * recall) / (precision + recall)
 print('f1_score=', f1_score) # f1_score= 0.6869983948635634

 # 3. SVM model
 svm_obj = svm.SVC(kernel='linear') # kernel function
 svm_model = svm_obj.fit(X_train, y_train)

 svm_pred = svm_model.predict(X_test)
 svm_Y = y_test

 svm_matrix=pd.crosstab(svm_pred,svm_Y)
 print("svm matrix\n",svm_matrix)

 """
 svm matrix
  col_0     0    1
 row_0
 0      1428   36
 1         3  206
 """

 svm_acc = (svm_matrix.iloc[0,0] + svm_matrix.iloc[1,1]) / len(svm_Y)
 print("svm acc=", svm_acc) # svm acc= 0.976688583383144

sms_spam_data

 # -*- coding: utf-8 -*-
 """
 Created on Sat Feb 23 15:52:23 2019

 @author: 502-03
 """

 from sklearn.naive_bayes import GaussianNB
 from sklearn import svm
 import numpy as np
 import pandas as pd

 # 1. file loading
 # note: NumPy >= 1.16.3 requires allow_pickle=True to load object arrays
 X_train, X_test, y_train, y_test = np.load("../data/sms_spam_data.npy", allow_pickle=True)
 print(X_train.shape) # (4446, 6000)
 print(X_test.shape) # (1112, 6000)
 print(type(y_train)) # <class 'pandas.core.series.Series'>

 # build the NB model
 obj = GaussianNB()
 nb_model = obj.fit(X_train, y_train)
 nb_pred = nb_model.predict(X_test)
 nb_Y = y_test

 nb_tab = pd.crosstab(nb_pred, nb_Y)
 print("nb_tab=\n", nb_tab)
 """
 nb_tab=
  type   ham  spam
 row_0
 ham    812    10
 spam   156   134
 """
 nb_acc = (nb_tab.iloc[0,0] + nb_tab.iloc[1,1]) / len(nb_Y) # .iloc: .ix was removed in recent pandas
 print("nb_acc=", nb_acc) # nb_acc= 0.8507194244604317

 # SVM model
 obj = svm.SVC(kernel='linear')
 svc_model = obj.fit(X_train, y_train)
 svc_pred = svc_model.predict(X_test)
 svc_Y = y_test

 svc_tab = pd.crosstab(svc_pred, svc_Y)
 print("svc_tab=\n", svc_tab)
 """
 svc_tab=
  type   ham  spam
 row_0
 ham    964    20
 spam     4   124
 """
 svc_acc = (svc_tab.iloc[0,0] + svc_tab.iloc[1,1]) / len(svc_Y)
 print("svc_acc=", svc_acc) # svc_acc= 0.9784172661870504

 precision = svc_tab.iloc[1,1] / (svc_tab.iloc[1,0] + svc_tab.iloc[1,1])
 print("precision", precision) # precision 0.96875

 recall = svc_tab.iloc[1,1] / (svc_tab.iloc[0,1] + svc_tab.iloc[1,1])
 print("recall", recall) # recall 0.8611111111111112

 f1_score = 2 * (precision * recall) / (precision + recall)
 print("f1_score", f1_score) # f1_score 0.911764705882353
