kNN1

 # -*- coding: utf-8 -*-
 """
 kNN : 최근접 이웃
 """

 import numpy as np # 다차원배열, 선형대수 연산
 import matplotlib.pyplot as plt

 # 1. 알려진 두 집단 x,y 산점도 시각화
 # Draw every labelled sample with its own scatter call: the first two
 # points belong to group A, the last two to group B.
 for x_coord, y_coord in ((1.2, 1.1), (1.0, 1.0), (1.8, 0.8), (2, 0.9)):
     plt.scatter(x_coord, y_coord)

 # The point to classify (group unknown) is drawn in red.
 plt.scatter(1.6, 0.85, color='r')
 plt.show()

 # 2. DATA 생성과 함수 정의
 # Labelled samples as [x, y] pairs, two per group.
 p1, p2 = [1.2, 1.1], [1.0, 1.0]  # group A
 p3, p4 = [1.8, 0.8], [2, 0.9]  # group B
 # Class labels (the Y variable) for p1..p4, in the same order.
 category = ['A', 'A', 'B', 'B']
 # The point whose group is to be predicted.
 p5 = [1.6, 0.85]

 # data 생성 함수 정의
 def data_set():
     """Return the sample data converted to NumPy arrays.

     Returns:
         A 3-tuple ``(known, unknown, labels)`` built from the module-level
         lists: ``known`` stacks p1..p4 row by row, ``unknown`` is p5, and
         ``labels`` holds the entries of ``category``.
     """
     # NumPy arrays are needed so that the distance arithmetic can be
     # written with element-wise operators.
     labelled_points = np.array([p1, p2, p3, p4])
     query_point = np.array(p5)
     labels = np.array(category)
     return labelled_points, query_point, labels

 know_group, not_know_group, class_category = data_set()

 # Known (labelled) samples. Expected output:
 # [[1.2 1.1]
 #  [1.  1. ]
 #  [1.8 0.8]
 #  [2.  0.9]]
 print('알려진 집단')
 print(know_group)

 # Sample to classify.
 print('알려지지 않은 집단')
 print(not_know_group)  # [1.6  0.85]

 # Ground-truth labels of the known samples.
 print('정답')
 print(class_category)  # ['A' 'A' 'B' 'B']

 # Euclidean distance, one step at a time:
 # difference (-) -> square (**) -> sum -> square root (sqrt)

 # 2-D array minus 1-D array: the query point is subtracted from every row.
 diff = np.subtract(know_group, not_know_group)
 print('차=\n', diff)
 # Expected output:
 # 차=
 #  [[-0.4   0.25]
 #  [-0.6   0.15]
 #  [ 0.2  -0.05]
 #  [ 0.4   0.05]]

 sq_diff = np.square(diff)
 sq_sum = np.sum(sq_diff, axis=1)  # sum across each row
 print(sq_sum)  # [0.2225 0.3825 0.0425 0.1625]
 distance = np.sqrt(sq_sum)
 print(distance)  # [0.47169906 0.61846584 0.20615528 0.40311289]
 # Distance ranks are [3 4 1 2]; with k=3 the neighbours vote B (2) > A (1).
 print(class_category)  # ['A' 'A' 'B' 'B']

 def classfy(know, not_know, cate, k):
     """Count the class labels of the k known samples nearest to a point.

     Args:
         know: 2-D array-like of known samples, one sample per row.
         not_know: 1-D array-like point to classify; it is subtracted from
             every row of ``know``.
         cate: sequence of labels where ``cate[i]`` is the label of
             ``know[i]``.
         k: number of nearest neighbours to count. A value larger than the
             number of known samples uses all of them; a value below 1
             yields an empty result.

     Returns:
         dict mapping each label found among the neighbours to its count,
         with keys inserted in nearest-first order.
     """
     # Accept plain lists as well as arrays.
     know = np.asarray(know)
     not_know = np.asarray(not_know)

     # 1. Euclidean distance from the query point to every known sample.
     distance = np.sqrt(((know - not_know) ** 2).sum(axis=1))

     # 2. Indices of the samples ordered by ascending distance. A stable
     #    sort keeps equidistant samples in their original order.
     sortDist = distance.argsort(kind="stable")  # e.g. [2 3 0 1]

     # 3. Tally the labels of the nearest neighbours. Slicing (instead of
     #    indexing with range(k)) cannot run past the end of the array.
     class_result = {}  # label -> number of votes
     for idx in sortDist[:max(k, 0)]:
         key = cate[idx]
         class_result[key] = class_result.get(key, 0) + 1
     return class_result

 #함수 호출
 # Count the labels of the 3 nearest known samples to the unknown point.
 class_result=classfy(know_group,not_know_group,class_category,3)
 print(class_result) # expected output: {'B': 2, 'A': 1}

 #vot 함수
 def class_vote(class_result):
     """Return the label that received the most votes.

     Args:
         class_result: dict mapping label -> vote count.

     Returns:
         The key with the largest count; on a tie, the key that comes
         first in the dict's iteration order.
     """
     winner, _votes = max(class_result.items(), key=lambda item: item[1])
     return winner

 # Pick the label with the most votes among the counted neighbours.
 vote_result=class_vote(class_result)
 print("분류결과=",vote_result)# expected output: 분류결과= B

kNN Class

 # -*- coding: utf-8 -*-
 """
 class 구현
 """

 import numpy as np
 from Step01_kNN import data_set

 # Fetch the known samples, the point to classify and the labels from the
 # imported data_set() helper.
 know_group,not_know_group,class_category=data_set()

 #class =Func1+Func2+Func3
 class kNNclassify:
     """k-nearest-neighbour classifier that keeps its vote tally on the instance.

     Call ``classfy`` first to fill ``class_result``, then ``class_vote``
     to read the winning label.
     """

     def __init__(self):
         # label -> number of votes; defined up front so the attribute
         # always exists, even before classfy() has been called.
         self.class_result = {}

     def classfy(self, know, not_know, cate, k):
         """Tally the labels of the k known samples nearest to a point.

         Args:
             know: 2-D array-like of known samples, one sample per row.
             not_know: 1-D array-like point to classify; it is subtracted
                 from every row of ``know``.
             cate: sequence of labels where ``cate[i]`` is the label of
                 ``know[i]``.
             k: number of nearest neighbours to count. A value larger than
                 the number of known samples uses all of them; a value
                 below 1 leaves the tally empty.

         The result is stored in ``self.class_result`` (replacing any
         previous tally); nothing is returned.
         """
         # Accept plain lists as well as arrays.
         know = np.asarray(know)
         not_know = np.asarray(not_know)

         # 1. Euclidean distance from the query point to every known sample.
         distance = np.sqrt(((know - not_know) ** 2).sum(axis=1))

         # 2. Indices ordered by ascending distance, e.g. [2 3 0 1]. A
         #    stable sort keeps equidistant samples in their original order.
         sortDist = distance.argsort(kind="stable")

         # 3. Count the labels of the nearest neighbours. Slicing cannot
         #    run past the end of the array when k exceeds the sample count.
         tally = {}
         for idx in sortDist[:max(k, 0)]:
             key = cate[idx]
             tally[key] = tally.get(key, 0) + 1
         self.class_result = tally

     def class_vote(self):
         """Return the label with the most votes in ``self.class_result``.

         Raises:
             ValueError: if the tally is empty (raised by ``max``).
         """
         return max(self.class_result, key=self.class_result.get)

 #class object 생성
 obj=kNNclassify() # create the classifier object

 # classfy() stores the neighbour label counts on obj.class_result
 obj.classfy(know_group,not_know_group,class_category,3)

 # class_vote() reads that stored tally and returns the majority label
 vote_result=obj.class_vote()
 print('kNN 분류결과=',vote_result)# expected output: kNN 분류결과= B

NB

 # -*- coding: utf-8 -*-
 """
 통계적 분류기 - NB
 """
 import pandas as pd
 from sklearn import model_selection#train/test
 from sklearn.naive_bayes import GaussianNB 

 # 1. Load the iris data set.
 iris = pd.read_csv("../data/iris.csv")
 print(iris.head())
 # Expected output:
 #    Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
 # 0           5.1          3.5           1.4          0.2  setosa
 # 1           4.9          3.0           1.4          0.2  setosa
 # 2           4.7          3.2           1.3          0.2  setosa
 # 3           4.6          3.1           1.5          0.2  setosa
 # 4           5.0          3.6           1.4          0.2  setosa

 # 2. Choose the x / y variables.
 cols = list(iris.columns)
 x_cols = cols[:4]  # X: columns 1-4 (continuous)
 y_cols = cols[-1]  # y: column 5 (categorical)

 # 3. Train/test split.
 iris_df = iris
 print(iris_df.shape)  # (150, 5)
 train_iris, test_iris = model_selection.train_test_split(
     iris_df, test_size=0.3, random_state=123)
 print(train_iris.shape)  # (105, 5)
 print(test_iris.shape)  # (45, 5)

 # 4. Fit the model on the training set.
 obj = GaussianNB()
 model = obj.fit(train_iris[x_cols], train_iris[y_cols])

 # 5. Evaluate the model on the test set.
 pred = model.predict(test_iris[x_cols])  # predicted labels
 Y = test_iris[y_cols]  # true labels

 # Confusion matrix: rows are predictions, columns are true labels.
 matrix = pd.crosstab(pred, Y)
 print(matrix)
 # Expected output:
 # Species     setosa  versicolor  virginica
 # row_0
 # setosa          18           0          0
 # versicolor       0          10          2
 # virginica        0           0         15

 # Accuracy = diagonal of the confusion matrix / number of test samples.
 # DataFrame.ix was removed in pandas 1.0; .iloc reads the same cells by
 # position.
 acc = (matrix.iloc[0, 0] + matrix.iloc[1, 1] + matrix.iloc[2, 2]) / len(Y)
 print('분류정확도=', acc)  # 분류정확도= 0.9555555555555556

SVM

 # -*- coding: utf-8 -*-
 """
 SVM Model
 """
 import pandas as pd
 from sklearn import model_selection#train/test
 from sklearn import svm #model

 # 1. Load the iris data set.
 iris = pd.read_csv("../data/iris.csv")
 print(iris.head())
 # Expected output:
 #    Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
 # 0           5.1          3.5           1.4          0.2  setosa
 # 1           4.9          3.0           1.4          0.2  setosa
 # 2           4.7          3.2           1.3          0.2  setosa
 # 3           4.6          3.1           1.5          0.2  setosa
 # 4           5.0          3.6           1.4          0.2  setosa

 # 2. Choose the x / y variables.
 cols = list(iris.columns)
 x_cols = cols[:4]  # X: columns 1-4 (continuous)
 y_cols = cols[-1]  # y: column 5 (categorical)

 # 3. Train/test split.
 iris_df = iris
 print(iris_df.shape)  # (150, 5)
 train_iris, test_iris = model_selection.train_test_split(
     iris_df, test_size=0.3, random_state=123)
 print(train_iris.shape)  # (105, 5)
 print(test_iris.shape)  # (45, 5)

 # 4. Fit the SVM model on the training set.
 obj = svm.SVC()
 model = obj.fit(train_iris[x_cols], train_iris[y_cols])

 # 5. Evaluate the model on the test set.
 pred = model.predict(test_iris[x_cols])  # predicted labels
 Y = test_iris[y_cols]  # true labels

 # Confusion matrix: rows are predictions, columns are true labels.
 matrix = pd.crosstab(pred, Y)
 print(matrix)
 # Expected output:
 # Species     setosa  versicolor  virginica
 # row_0
 # setosa          18           0          0
 # versicolor       0          10          1
 # virginica        0           0         16

 # Accuracy = diagonal of the confusion matrix / number of test samples.
 # DataFrame.ix was removed in pandas 1.0; .iloc reads the same cells by
 # position.
 acc = (matrix.iloc[0, 0] + matrix.iloc[1, 1] + matrix.iloc[2, 2]) / len(Y)
 print('분류정확도=', acc)  # 분류정확도= 0.9777777777777777

spam_train_test

 # -*- coding: utf-8 -*-
 """
 NB vs SVM
 -data set :sparse matrix 이용
 -file name:../data/spam_tran_test.npy
 """
 from sklearn.naive_bayes import GaussianNB
 from sklearn import svm
 import numpy as np
 import pandas as pd

 #1.file Loading
 # 1. Load the file.
 # The file holds Python objects (y_train comes back as a list), so NumPy
 # refuses to load it unless allow_pickle=True is given.
 # NOTE(review): unpickling can execute arbitrary code - only load this
 # file from a trusted source.
 X_train, X_test, y_train, y_test = np.load(
     "../data/spam_tran_test.npy", allow_pickle=True)
 print(X_train.shape)  # (3901, 4000)
 print(X_test.shape)  # (1673, 4000)
 print(type(y_train))  # <class 'list'>

 # Convert the label lists to NumPy arrays.
 y_train = np.array(y_train)
 y_test = np.array(y_test)
 print(type(y_train))  # <class 'numpy.ndarray'>

 # 2. Naive Bayes model.
 obj = GaussianNB()
 nb_model = obj.fit(X_train, y_train)

 pred = nb_model.predict(X_test)
 Y = y_test

 # Confusion matrix: rows are predictions, columns are true labels.
 matrix = pd.crosstab(pred, Y)
 print("nb matrix\n", matrix)
 # Expected output (0 = ham, 1 = spam):
 #  col_0     0    1
 # row_0
 # 0      1264   28
 # 1       167  214

 # DataFrame.ix was removed in pandas 1.0; .iloc reads the same cells by
 # position.
 # 1) Accuracy: correct predictions / all test samples.
 acc = (matrix.iloc[0, 0] + matrix.iloc[1, 1]) / len(Y)
 print("NB acc=", acc)  # NB acc= 0.8834429169157203

 # 2) Precision: of the samples predicted as spam, the share that is spam.
 precision = matrix.iloc[1, 1] / (matrix.iloc[1, 0] + matrix.iloc[1, 1])
 print("정확률=", precision)  # 정확률= 0.5616797900262467

 # 3) Recall: of the true spam samples, the share predicted as spam.
 recall = matrix.iloc[1, 1] / (matrix.iloc[0, 1] + matrix.iloc[1, 1])
 print("재현률=", recall)  # 재현률= 0.8842975206611571

 # 4) F1 score: harmonic mean of precision and recall.
 f1_score = 2 * (precision * recall) / (precision + recall)
 print('f1_score=', f1_score)  # f1_score= 0.6869983948635634

 # 3. SVM model with a linear kernel.
 svm_obj = svm.SVC(kernel='linear')
 svm_model = svm_obj.fit(X_train, y_train)

 svm_pred = svm_model.predict(X_test)
 svm_Y = y_test

 svm_matrix = pd.crosstab(svm_pred, svm_Y)
 print("svm matrix\n", svm_matrix)
 # Expected output:
 # svm matrix
 #  col_0     0    1
 # row_0
 # 0      1428   36
 # 1         3  206

 svm_acc = (svm_matrix.iloc[0, 0] + svm_matrix.iloc[1, 1]) / len(svm_Y)
 print("svm acc=", svm_acc)  # svm acc= 0.976688583383144
 print("svm acc=",svm_acc) #svm acc= 0.976688583383144

sms_spam_data

 # -*- coding: utf-8 -*-
 """
 Created on Sat Feb 23 15:52:23 2019

 @author: 502-03
 """

 from sklearn.naive_bayes import GaussianNB
 from sklearn import svm
 import numpy as np
 import pandas as pd

 #1.file Loading
 # 1. Load the file.
 # The file holds Python objects (y_train comes back as a pandas Series),
 # so NumPy refuses to load it unless allow_pickle=True is given.
 # NOTE(review): unpickling can execute arbitrary code - only load this
 # file from a trusted source.
 X_train, X_test, y_train, y_test = np.load(
     "../data/sms_spam_data.npy", allow_pickle=True)
 print(X_train.shape)  # (4446, 6000)
 print(X_test.shape)  # (1112, 6000)
 print(type(y_train))  # <class 'pandas.core.series.Series'>

 # 2. Naive Bayes model.
 obj = GaussianNB()
 nb_model = obj.fit(X_train, y_train)
 nb_pred = nb_model.predict(X_test)
 nb_Y = y_test

 # Confusion matrix: rows are predictions, columns are true labels.
 nb_tab = pd.crosstab(nb_pred, nb_Y)
 print("nb_tab=\n", nb_tab)
 # Expected output:
 # nb_tab=
 #  type   ham  spam
 # row_0
 # ham    812    10
 # spam   156   134

 # DataFrame.ix was removed in pandas 1.0; .iloc reads the same cells by
 # position.
 nb_acc = (nb_tab.iloc[0, 0] + nb_tab.iloc[1, 1]) / len(nb_Y)
 print("nb_acc=", nb_acc)  # nb_acc= 0.8507194244604317

 # 3. SVM model with a linear kernel.
 obj = svm.SVC(kernel='linear')
 svc_model = obj.fit(X_train, y_train)
 svc_pred = svc_model.predict(X_test)
 svc_Y = y_test

 svc_tab = pd.crosstab(svc_pred, svc_Y)
 print("svc_tab=\n", svc_tab)
 # Expected output:
 # svc_tab=
 #  type   ham  spam
 # row_0
 # ham    964    20
 # spam     4   124

 # Accuracy: correct predictions / all test samples.
 svc_acc = (svc_tab.iloc[0, 0] + svc_tab.iloc[1, 1]) / len(svc_Y)
 print("svc_acc=", svc_acc)  # svc_acc= 0.9784172661870504

 # Precision: of the samples predicted as spam, the share that is spam.
 precision = svc_tab.iloc[1, 1] / (svc_tab.iloc[1, 0] + svc_tab.iloc[1, 1])
 print("정확률", precision)  # 정확률 0.96875

 # Recall: of the true spam samples, the share predicted as spam.
 recall = svc_tab.iloc[1, 1] / (svc_tab.iloc[0, 1] + svc_tab.iloc[1, 1])
 print("재현률", recall)  # 재현률 0.8611111111111112

 # F1 score: harmonic mean of precision and recall.
 f1_score = 2 * (precision * recall) / (precision + recall)
 print("f1_score", f1_score)  # f1_score 0.911764705882353

 precision=svc_tab.ix[1,1]/(svc_tab.ix[1,0]+svc_tab.ix[1,1])
 print("정확률",precision)#정확률 0.96875

 recall=svc_tab.ix[1,1]/(svc_tab.ix[0,1]+svc_tab.ix[1,1])
 print("재현률",recall)#재현률 0.8611111111111112

 f1_score=2* (precision * recall)/(precision + recall)
 print("f1_score",f1_score)#f1_score 0.911764705882353

Classification的更多相关文章

  1. W3School-CSS 分类 (Classification) 实例

    CSS 分类 (Classification) 实例 CSS 实例 CSS 背景实例 CSS 文本实例 CSS 字体(font)实例 CSS 边框(border)实例 CSS 外边距 (margin) ...

  2. Large Margin DAGs for Multiclass Classification

    Abstract We present a new learning architecture: the Decision Directed Acyclic Graph (DDAG), which i ...

  3. 《ImageNet Classification with Deep Convolutional Neural Networks》 剖析

    <ImageNet Classification with Deep Convolutional Neural Networks> 剖析 CNN 领域的经典之作, 作者训练了一个面向数量为 ...

  4. 自然语言23_Text Classification with NLTK

    QQ:231469242 欢迎喜欢nltk朋友交流 https://www.pythonprogramming.net/text-classification-nltk-tutorial/?compl ...

  5. MATLAB 图像分类 Image Category Classification Using Bag of Features

    使用MATLAB实现图像的识别,这是MATLAB官网上面的例子,学习一下. http://cn.mathworks.com/help/vision/examples/image-category-cl ...

  6. Galaxy Classification

    10.3 Data Preparation After removing a large number of the columns from the raw SDSS dataset, introd ...

  7. Kaiju: Fast and sensitive taxonomic classification for metagenomics

    Kaiju: Fast and sensitive taxonomic classification for  metagenomics   问题描述:However, nucleotide comp ...

  8. 《Automatic Face Classification of Cushing’s Syndrome in Women – A Novel Screening Approach》学习笔记

    <针对女性库欣综合征患者的自动面部分类-一种新颖的筛查方法> Abstract 目的:库兴氏综合征对身体造成相当大的伤害如果不及时治疗,还经常是诊断的时间太长.在这项研究中,我们旨在测试面 ...

  9. [CS231n-CNN] Image classification and the data-driven approach, k-nearest neighbor, Linear classification I

    课程主页:http://cs231n.stanford.edu/ Task: Challenges: _________________________________________________ ...

  10. [ML] Naive Bayes for Text Classification

    TF-IDF Algorithm From http://www.ruanyifeng.com/blog/2013/03/tf-idf.html Chapter 1, 知道了"词频" ...

随机推荐

  1. 部分安卓机型1px边框无法显示解决方法

    实践发现css样式中使用1px solid 在部分安卓机型无法显示. 网上找到解决方案:使用行内样式设置边框.

  2. HTML编辑器KindEditor

    KindEditor 是一套开源的在线HTML编辑器,主要用于让用户在网站上获得所见即所得编辑效果,开发人员可以用 KindEditor 把传统的多行文本输入框(textarea)替换为可视化的富文本 ...

  3. Redux Todos Example

    此项目模板是使用Create React App构建的,它提供了一种简单的方法来启动React项目而无需构建配置. 使用Create-React-App构建的项目包括对ES6语法的支持,以及几种非官方 ...

  4. Vue+koa2开发一款全栈小程序(6.个人中心)

    1.用户信息的获取和展示 1.初始化数据库 cd到server目录下,执行 node tools/initdb.js 登录mysql控制界面,查看初始化以后生成的表 show databases; u ...

  5. GWAS群体分层 (Population stratification):利用plink对基因型进行PCA

    一.为什么要做祖先成分的PCA? GWAS研究时经常碰到群体分层的现象,即该群体的祖先来源多样性,我们知道的,不同群体SNP频率不一样,导致后面做关联分析的时候可能出现假阳性位点(不一定是显著信号位点 ...

  6. 深入理解pthread_cond_wait、pthread_cond_signal

    ===============================man pthread_cond_wait的解释========================== LINUX环境下多线程编程肯定会遇到 ...

  7. Vue(小案例_vue+axios仿手机app)_图片列表操作

    一.前言 1.让图片还没有被完全加载出来的时候给用户提示                                       2.图片查看器 二.主要内容 1.让图片还没有被完全加载出来的时候 ...

  8. vmware(1):vmware中的bridge、nat、host-only的区别

    VMWare提供了三种工作模式,它们是bridged(桥接模式).NAT(网络地址转换模式)和host-only(主机模式) bridged(桥接模式) 在这种模式下,VMWare虚拟出来的操作系统就 ...

  9. 四大解析器(BeautifulSoup、PyQuery、lxml、正则)性能比较

    用标题中的四种方式解析网页,比较其解析速度.当然比较结果数值与电脑配置,python版本都有关系,但总体差别不会很大. 下面是我的结果,lxml xpath最快,bs4最慢 ==== Python v ...

  10. Django admin修改密码

    django的admin用户被我多动症一样的测试,给密码弄丢了,需要重置. 从数据库重置的可能性为0,因为django对于密码有保护策略.考虑从运行程序的地方进行重置: 1.在程序的文件夹下,执行这样 ...