kNN1

 # -*- coding: utf-8 -*-
 """
 kNN : 최근접 이웃
 """

 import numpy as np # 다차원배열, 선형대수 연산
 import matplotlib.pyplot as plt

 # 1. 알려진 두 집단 x,y 산점도 시각화
 plt.scatter(1.2, 1.1) # A 집단
 plt.scatter(1.0, 1.0)
 plt.scatter(1.8, 0.8) # B 집단
 plt.scatter(2, 0.9)

 plt.scatter(1.6, 0.85, color='r') # 분류대상(알려지지 않은 집단)
 plt.show()

 # 2. DATA 생성과 함수 정의
 p1 = [1.2, 1.1] # A 집단
 p2 = [1.0, 1.0]
 p3 = [1.8, 0.8] # B 집단
 p4 = [2, 0.9]
 category = ['A','A','B','B'] # 알려진 집단 분류범주(Y변수)
 p5 = [1.6, 0.85] # 분류대상 

 # data 생성 함수 정의
 def data_set():
     # 선형대수 연산 : numpy형 변환
     know_group = np.array([p1, p2, p3, p4]) # 알려진 집단
     not_know_group = np.array(p5) # 알려지지 않은 집단
     class_category = np.array(category) # 정답(분류범주)
     return know_group,not_know_group,class_category 

 know_group,not_know_group,class_category=data_set()
 print('알려진 집단')
 """
 [[1.2 1.1]
  [1.  1. ]
  [1.8 0.8]
  [2.  0.9]]
 """
 print(know_group)
 print('알려지지 않은 집단')
 print(not_know_group) #[1.6  0.85]

 print('정답')
 print(class_category) #['A' 'A' 'B' 'B']

 #
 #차(-) -> 자곱(**) -> 합(sum) -> 제곱근(sqrt)

 diff=know_group-not_know_group #2차원 -1차원
 print('차=\n',diff)
 """
 차=
  [[-0.4   0.25]
  [-0.6   0.15]
  [ 0.2  -0.05]
  [ 0.4   0.05]]
 """

 sq_diff = diff ** 2
 sq_sum = sq_diff.sum(axis=1) #행단위 합계
 print(sq_sum) #[0.2225 0.3825 0.0425 0.1625]
 distance=np.sqrt(sq_sum)
 print(distance) #[0.47169906 0.61846584 0.20615528 0.40311289]
 #[3 4 1 2]거리  k=3 (B(2)>A(1))
 print(class_category)#['A' 'A' 'B' 'B']

 def classfy(know,not_know,cate,k):
     #유클리드인 거리계산식
     diff=know-not_know
     sq_diff = diff ** 2
     sq_sum = sq_diff.sum(axis=1)
     distance=np.sqrt(sq_sum)

     #2.가장 가까운 거리 오름차순 정렬 -> index
     sortDist=distance.argsort() #sort->index
     #print(sortDist) #[2 3 0 1]

     #3.최근접 이윳
     class_result={} #빈 set
     for i in range(k):#0~2
         key = cate[sortDist[i]] #i=0 -> 'B'
         class_result[key]=class_result.get(key,0)+1
     return class_result

 #함수 호출
 class_result=classfy(know_group,not_know_group,class_category,3)
 print(class_result) #{'B': 2, 'A': 1}

 #vot 함수
 def class_vote(class_result):
     return max(class_result,key=class_result.get)

 vote_result=class_vote(class_result)
 print("분류결과=",vote_result)#분류결과= B

kNN Class

 # -*- coding: utf-8 -*-
 """
 class 구현
 """

 import numpy as np
 from Step01_kNN import data_set

 know_group,not_know_group,class_category=data_set()

 #class =Func1+Func2+Func3
 class kNNclassify:   

     #1.최근접 이웃
     def classfy(self,know,not_know,cate,k):
         #유클리드인 거리계산식
         diff=know-not_know
         sq_diff = diff ** 2
         sq_sum = sq_diff.sum(axis=1)
         distance=np.sqrt(sq_sum)

         #2.가장 가까운 거리 오름차순 정렬 -> index
         sortDist=distance.argsort() #sort->index
         #print(sortDist) #[2 3 0 1]

         #3.최근접 이윳(k=3)
         self.class_result={} #빈 set
         for i in range(k):#0~2
             key = cate[sortDist[i]] #i=0 -> 'B'
             self.class_result[key]=self.class_result.get(key,0)+1

     #vot 함수
     def class_vote(self):
         return max(self.class_result,key=self.class_result.get)

 #class object 생성
 obj=kNNclassify() #생성자

 #objext.menber : self.class_result
 obj.classfy(know_group,not_know_group,class_category,3)

 vote_result=obj.class_vote()
 print('kNN 분류결과=',vote_result)#kNN 분류결과= B

NB

 # -*- coding: utf-8 -*-
 """
 통계적 분류기 - NB
 """
 import pandas as pd
 from sklearn import model_selection#train/test
 from sklearn.naive_bayes import GaussianNB 

 iris=pd.read_csv("../data/iris.csv")
 print(iris.head())
 """
    Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
 0           5.1          3.5           1.4          0.2  setosa
 1           4.9          3.0           1.4          0.2  setosa
 2           4.7          3.2           1.3          0.2  setosa
 3           4.6          3.1           1.5          0.2  setosa
 4           5.0          3.6           1.4          0.2  setosa
 """

 #2. x,y 변수 선정
 cols=list(iris.columns)
 x_cols=cols[:4] #X:1~4(연속형)
 y_cols=cols[-1] #y:5(범주형)

 #3.train/test split
 iris_df=iris
 print(iris_df.shape)#(150, 5)
 train_iris,test_iris=model_selection.train_test_split(iris_df,test_size=0.3,random_state=123)
 print(train_iris.shape)#(105, 5)
 print(test_iris.shape)#(45, 5)

 #4. model생성 train set
 obj=GaussianNB() #object
 model=obj.fit(train_iris[x_cols],train_iris[y_cols])

 #5.model 평가
 pred=model.predict(test_iris[x_cols]) #Y예측
 Y = test_iris[y_cols] #정답

 #confusion matrix
 matrix=pd.crosstab(pred,Y)
 print(matrix)
 """
 Species     setosa  versicolor  virginica
 row_0
 setosa          18           0          0
 versicolor       0          10          2
 virginica        0           0         15
 """

 acc= (matrix.ix[0,0]+matrix.ix[1,1]+matrix.ix[2,2])/len(Y)
 print('분류정확도=',acc)#분류정확도= 0.9555555555555556

SVM

 # -*- coding: utf-8 -*-
 """
 SVM Model
 """
 import pandas as pd
 from sklearn import model_selection#train/test
 from sklearn import svm #model

 iris=pd.read_csv("../data/iris.csv")
 print(iris.head())
 """
    Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
 0           5.1          3.5           1.4          0.2  setosa
 1           4.9          3.0           1.4          0.2  setosa
 2           4.7          3.2           1.3          0.2  setosa
 3           4.6          3.1           1.5          0.2  setosa
 4           5.0          3.6           1.4          0.2  setosa
 """

 #2. x,y 변수 선정
 cols=list(iris.columns)
 x_cols=cols[:4] #X:1~4(연속형)
 y_cols=cols[-1] #y:5(범주형)

 #3.train/test split
 iris_df=iris
 print(iris_df.shape)#(150, 5)
 train_iris,test_iris=model_selection.train_test_split(iris_df,test_size=0.3,random_state=123)
 print(train_iris.shape)#(105, 5)
 print(test_iris.shape)#(45, 5)

 #4.model -SVM
 obj=svm.SVC()
 model=obj.fit(train_iris[x_cols],train_iris[y_cols])

 #5.model 평가
 pred=model.predict(test_iris[x_cols])
 Y=test_iris[y_cols]

 #confusion matrix
 matrix=pd.crosstab(pred,Y)
 print(matrix)
 """
 Species     setosa  versicolor  virginica
 row_0
 setosa          18           0          0
 versicolor       0          10          1
 virginica        0           0         16
 """

 acc= (matrix.ix[0,0]+matrix.ix[1,1]+matrix.ix[2,2])/len(Y)
 print('분류정확도=',acc)#분류정확도= 0.9777777777777777

spam_train_test

 # -*- coding: utf-8 -*-
 """
 NB vs SWM
 -data set :sparse matrix 이용
 -file name:../data/spam_tran_test.npy
 """
 from sklearn.naive_bayes import GaussianNB
 from sklearn import svm
 import numpy as np
 import pandas as pd

 #1.file Loading
 X_train,X_test,y_train,y_test=np.load("../data/spam_tran_test.npy")
 print(X_train.shape) #(3901, 4000)
 print(X_test.shape) #(1673, 4000)
 print(type(y_train))#<class 'list'>

 #list -> numpy형변환: 선형대수 연산
 y_train=np.array(y_train)
 y_test=np.array(y_test)
 print(type(y_train))#<class 'numpy.ndarray'> 선형대수 하기위해서

 #2.NB model생성
 obj =GaussianNB()
 nb_model=obj.fit(X_train,y_train)

 pred=nb_model.predict(X_test)
 Y=y_test

 matrix=pd.crosstab(pred,Y)
 print("nb matrix\n",matrix)
 """
  col_0     0(ham)    1(spam)
 row_0
 0      1264   28
 1       167  214
 """
 acc=(matrix.ix[0,0]+matrix.ix[1,1])/len(Y)
 print("NB acc=",acc) #NB acc= 0.8834429169157203

 #2) 정확률:예측치 yes-> 실제값 yes
 precision=matrix.ix[1,1]/(matrix.ix[1,0]+matrix.ix[1,1])
 print("정확률=",precision)#정확률= 0.5616797900262467

 #3) 재현률:실제값yes -> 예측치 yes
 recall=matrix.ix[1,1]/(matrix.ix[0,1]+matrix.ix[1,1])
 print("재현률=",recall)#재현률= 0.8842975206611571

 #4) f1 score:precision,recall
 f1_score=2 * (precision*recall)/(precision+recall)
 print('f1_score=',f1_score)#f1_score= 0.6869983948635634

 #3.SVM model
 svm_obj =svm.SVC(kernel='linear')#kenel 함수
 svm_model=svm_obj.fit(X_train,y_train)

 svm_pred=svm_model.predict(X_test)
 svm_Y=y_test

 svm_matrix=pd.crosstab(svm_pred,svm_Y)
 print("svm matrix\n",svm_matrix)

 """
 svm matrix
  col_0     0    1
 row_0
 0      1428   36
 1         3  206
 """

 svm_acc=(svm_matrix.ix[0,0]+svm_matrix.ix[1,1])/len(svm_Y)
 print("svm acc=",svm_acc) #svm acc= 0.976688583383144

sms_spam_data

 # -*- coding: utf-8 -*-
 """
 Created on Sat Feb 23 15:52:23 2019

 @author: 502-03
 """

 from sklearn.naive_bayes import GaussianNB
 from sklearn import svm
 import numpy as np
 import pandas as pd

 #1.file Loading
 X_train,X_test,y_train,y_test=np.load("../data/sms_spam_data.npy")
 print(X_train.shape) #(4446, 6000)
 print(X_test.shape) #(1112, 6000)
 print(type(y_train))#<class 'pandas.core.series.Series'>

 #NB model 생성
 obj=GaussianNB()
 nb_model=obj.fit(X_train,y_train)
 nb_pred=nb_model.predict(X_test)
 nb_Y=y_test

 nb_tab=pd.crosstab(nb_pred,nb_Y)
 print("nb_tab=\n",nb_tab)
 """
 nb_tab=
  type   ham  spam
 row_0
 ham    812    10
 spam   156   134
 """
 nb_acc=(nb_tab.ix[0,0]+nb_tab.ix[1,1])/len(nb_Y)
 print("nb_acc=",nb_acc) #nb_acc= 0.8507194244604317

 #svm
 obj=svm.SVC(kernel='linear')
 svc_model=obj.fit(X_train,y_train)
 svc_pred=svc_model.predict(X_test)
 svc_Y=y_test

 svc_tab=pd.crosstab(svc_pred,svc_Y)
 print("svc_tab=\n",svc_tab)
 """
 svc_tab=
  type   ham  spam
 row_0
 ham    964    20
 spam     4   124
 """
 svc_acc=(svc_tab.ix[0,0]+svc_tab.ix[1,1])/len(svc_Y)
 print("svc_acc=",svc_acc) #svc_acc= 0.9784172661870504

 precision=svc_tab.ix[1,1]/(svc_tab.ix[1,0]+svc_tab.ix[1,1])
 print("정확률",precision)#정확률 0.96875

 recall=svc_tab.ix[1,1]/(svc_tab.ix[0,1]+svc_tab.ix[1,1])
 print("재현률",recall)#재현률 0.8611111111111112

 f1_score=2* (precision * recall)/(precision + recall)
 print("f1_score",f1_score)#f1_score 0.911764705882353

Classification的更多相关文章

W3School-CSS 分类 (Classification) 实例
CSS 分类 (Classification) 实例 CSS 实例 CSS 背景实例 CSS 文本实例 CSS 字体(font)实例 CSS 边框(border)实例 CSS 外边距 (margin) ...
Large Margin DAGs for Multiclass Classification
Abstract We present a new learning architecture: the Decision Directed Acyclic Graph (DDAG), which i ...
《ImageNet Classification with Deep Convolutional Neural Networks》剖析
<ImageNet Classification with Deep Convolutional Neural Networks> 剖析 CNN 领域的经典之作, 作者训练了一个面向数量为 ...
自然语言23_Text Classification with NLTK
QQ:231469242 欢迎喜欢nltk朋友交流 https://www.pythonprogramming.net/text-classification-nltk-tutorial/?compl ...
MATLAB 图像分类 Image Category Classification Using Bag of Features
使用MATLAB实现图像的识别,这是MATLAB官网上面的例子,学习一下. http://cn.mathworks.com/help/vision/examples/image-category-cl ...
Galaxy Classification
10.3 Data Preparation After removing a large number of the columns from the raw SDSS dataset, introd ...
Kaiju: Fast and sensitive taxonomic classification for metagenomics
Kaiju: Fast and sensitive taxonomic classification for metagenomics 问题描述:However, nucleotide comp ...
《Automatic Face Classification of Cushing’s Syndrome in Women – A Novel Screening Approach》学习笔记
<针对女性库欣综合征患者的自动面部分类-一种新颖的筛查方法> Abstract 目的:库兴氏综合征对身体造成相当大的伤害如果不及时治疗,还经常是诊断的时间太长.在这项研究中,我们旨在测试面 ...
[CS231n-CNN] Image classification and the data-driven approach, k-nearest neighbor, Linear classification I
课程主页:http://cs231n.stanford.edu/ Task: Challenges: _________________________________________________ ...
[ML] Naive Bayes for Text Classification
TF-IDF Algorithm From http://www.ruanyifeng.com/blog/2013/03/tf-idf.html Chapter 1, 知道了"词频" ...

随机推荐

三菱FX系列PLC教程
标题日期点击第一章:可编程控制器概论 2014-11-04 1401 1-0 课程概述 2014-11-05 192237 1-1 PLC的定义功能与特点 2014-11-05 16 ...
Ninja编译过程分析
在Android N的系统上,初次使用了Ninja的编译系统.对于Ninja,最初的印象是用在了Chromium open source code的编译中,在chromium的编译环境中,使用ninj ...
python中assert()函数的使用
关于assert()函数的使用,主要参考博客https://blog.csdn.net/qq_37119902/article/details/79637578 assert函数主要是用来声明某个函数 ...
Spotlight--你不得不用的Mac查询利器
世界上有两种Mac用户:一种是经常使用Spotlight的,另一种是忽略Spotlight的.如果你是第二种用户,那么你需要改变.Mac所有方面的使用场景,都会随着Spotlight而变得更快.你只需 ...
Docker：基础介绍 [一]
一.Docker介绍 Docker是Docker.lnc公司开源的一个基于LXC技术之上构建的Container容器引擎,源代码托管在Github上,基于Go语言并遵从Apache2.0协议开源 Do ...
jQuery使用（十四）：extend()方法
浅层克隆深层克隆扩展方法一.extend的基本使用语法: $.extend( target [, object1 ] [, objectN ] ) $.extend( [deep ], tar ...
HP Z620 Windows 7 系统安装（含磁盘阵列）
由于HP Z620 做了Raid 5磁盘阵列,导致安装系统时,系统加载不了磁盘的驱动,无法将系统安装到硬盘上,正确的方法是:下载SATA驱动,在需要加载驱动的地方,利用另一个U盘,“浏览”解压好的驱动 ...
QGE 在齐次 Besov 空间中的准则
在 [Zhang, Zujin. On the blow-up criterion for the quasi-geostrophic equations in homogeneous Besov s ...
sublime text3格式化html,css,js代码
需要安装HTML/CSS/JS prettify插件. 安装步骤:首选项 -> Package Control -> Install Package -> HTML-CSS-JS P ...
【OpenGL】搭建opgl环境
*GLFW+GLEW环境. 工具: GLFW库(下载) GLEW库 cMake软件(下载) 用cMake编译GLFW和GLEW成vs工程文件包,运行得到编译后文件. 在编译后文件夹中找到各个必需文件, ...

Classification

kNN1

kNN Class

NB

SVM

spam_train_test

sms_spam_data

Classification的更多相关文章

随机推荐

热门专题