Classification
kNN1
# -*- coding: utf-8 -*-
"""
kNN: k-nearest neighbors
"""
import numpy as np               # multidimensional arrays, linear-algebra operations
import matplotlib.pyplot as plt

# 1. Scatter plot of the two known groups and the point to classify
plt.scatter(1.2, 1.1)                # group A
plt.scatter(1.0, 1.0)
plt.scatter(1.8, 0.8)                # group B
plt.scatter(2, 0.9)
plt.scatter(1.6, 0.85, color='r')    # point to classify (unknown group)
plt.show()
# 2. Create the data and define a function
p1 = [1.2, 1.1]    # group A
p2 = [1.0, 1.0]
p3 = [1.8, 0.8]    # group B
p4 = [2, 0.9]
category = ['A', 'A', 'B', 'B']    # class labels of the known groups (Y variable)
p5 = [1.6, 0.85]                   # point to classify

# data-creation function
def data_set():
    # convert to numpy arrays for linear-algebra operations
    know_group = np.array([p1, p2, p3, p4])    # known groups
    not_know_group = np.array(p5)              # unknown point
    class_category = np.array(category)        # labels (class categories)
    return know_group, not_know_group, class_category

know_group, not_know_group, class_category = data_set()
print('known groups')
"""
[[1.2  1.1 ]
 [1.   1.  ]
 [1.8  0.8 ]
 [2.   0.9 ]]
"""
print(know_group)
print('unknown point')
print(not_know_group)    # [1.6  0.85]
print('labels')
print(class_category)    # ['A' 'A' 'B' 'B']
# distance: difference (-) -> square (**) -> sum -> square root (sqrt)
diff = know_group - not_know_group    # 2-D minus 1-D (broadcasting)
print('diff=\n', diff)
"""
diff=
 [[-0.4   0.25]
 [-0.6   0.15]
 [ 0.2  -0.05]
 [ 0.4   0.05]]
"""
sq_diff = diff ** 2
sq_sum = sq_diff.sum(axis=1)    # row-wise sum
print(sq_sum)      # [0.2225 0.3825 0.0425 0.1625]
distance = np.sqrt(sq_sum)
print(distance)    # [0.47169906 0.61846584 0.20615528 0.40311289]
# distance ranks: [3 4 1 2]; with k=3 the vote is B(2) > A(1)
print(class_category)    # ['A' 'A' 'B' 'B']
def classify(know, not_know, cate, k):
    # 1. Euclidean distance
    diff = know - not_know
    sq_diff = diff ** 2
    sq_sum = sq_diff.sum(axis=1)
    distance = np.sqrt(sq_sum)
    # 2. sort distances in ascending order -> indices
    sortDist = distance.argsort()    # sort -> index
    # print(sortDist)  # [2 3 0 1]
    # 3. count the k nearest neighbors
    class_result = {}    # empty dict
    for i in range(k):   # 0~2
        key = cate[sortDist[i]]    # i=0 -> 'B'
        class_result[key] = class_result.get(key, 0) + 1
    return class_result

# call the function
class_result = classify(know_group, not_know_group, class_category, 3)
print(class_result)    # {'B': 2, 'A': 1}

# vote function: the majority class wins
def class_vote(class_result):
    return max(class_result, key=class_result.get)

vote_result = class_vote(class_result)
print("classification result =", vote_result)    # classification result = B
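For a sanity check, the same three-neighbor vote can be reproduced with scikit-learn's KNeighborsClassifier — a minimal sketch, assuming scikit-learn is installed (it is not used in the original script):

# Sanity check with scikit-learn's built-in kNN (assumes scikit-learn is installed).
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)    # k=3, Euclidean distance by default
knn.fit(know_group, class_category)          # fit on the known groups
print(knn.predict([not_know_group]))         # expected: ['B']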
kNN Class
# -*- coding: utf-8 -*-
"""
kNN implemented as a class
"""
import numpy as np
from Step01_kNN import data_set

know_group, not_know_group, class_category = data_set()

# class = the functions above, bundled together
class kNNclassify:
    # 1. count the k nearest neighbors
    def classify(self, know, not_know, cate, k):
        # Euclidean distance
        diff = know - not_know
        sq_diff = diff ** 2
        sq_sum = sq_diff.sum(axis=1)
        distance = np.sqrt(sq_sum)
        # 2. sort distances in ascending order -> indices
        sortDist = distance.argsort()    # sort -> index
        # print(sortDist)  # [2 3 0 1]
        # 3. nearest neighbors (k=3)
        self.class_result = {}    # empty dict
        for i in range(k):        # 0~2
            key = cate[sortDist[i]]    # i=0 -> 'B'
            self.class_result[key] = self.class_result.get(key, 0) + 1
    # vote function
    def class_vote(self):
        return max(self.class_result, key=self.class_result.get)

# create a class object
obj = kNNclassify()    # constructor
# object member: self.class_result
obj.classify(know_group, not_know_group, class_category, 3)
vote_result = obj.class_vote()
print('kNN classification result =', vote_result)    # kNN classification result = B
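The counting loop inside classify can also be expressed with the standard library's collections.Counter — a sketch of an equivalent helper (same logic; the name classify_counter is hypothetical):

# Equivalent neighbor count with collections.Counter (standard library).
from collections import Counter

def classify_counter(know, not_know, cate, k):
    distance = np.sqrt(((know - not_know) ** 2).sum(axis=1))    # Euclidean distance
    nearest = distance.argsort()[:k]                            # indices of the k nearest points
    return Counter(cate[nearest])                               # label -> count

print(classify_counter(know_group, not_know_group, class_category, 3))
# Counter({'B': 2, 'A': 1})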
NB
# -*- coding: utf-8 -*-
"""
Statistical classifier - Naive Bayes (NB)
"""
import pandas as pd
from sklearn import model_selection    # train/test split
from sklearn.naive_bayes import GaussianNB

iris = pd.read_csv("../data/iris.csv")
print(iris.head())
"""
   Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa
"""
# 2. select the x and y variables
cols = list(iris.columns)
x_cols = cols[:4]    # X: columns 1~4 (continuous)
y_cols = cols[-1]    # y: column 5 (categorical)

# 3. train/test split
iris_df = iris
print(iris_df.shape)    # (150, 5)
train_iris, test_iris = model_selection.train_test_split(
    iris_df, test_size=0.3, random_state=123)
print(train_iris.shape)    # (105, 5)
print(test_iris.shape)     # (45, 5)

# 4. build the model on the train set
obj = GaussianNB()
model = obj.fit(train_iris[x_cols], train_iris[y_cols])

# 5. evaluate the model
pred = model.predict(test_iris[x_cols])    # predicted Y
Y = test_iris[y_cols]                      # ground truth

# confusion matrix
matrix = pd.crosstab(pred, Y)
print(matrix)
"""
Species     setosa  versicolor  virginica
row_0
setosa          18           0          0
versicolor       0          10          2
virginica        0           0         15
"""
# DataFrame.ix was removed in pandas 1.0; use .iloc for positional indexing
acc = (matrix.iloc[0, 0] + matrix.iloc[1, 1] + matrix.iloc[2, 2]) / len(Y)
print('accuracy =', acc)    # accuracy = 0.9555555555555556
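The diagonal-sum accuracy can be cross-checked with sklearn.metrics — a minimal sketch using the same pred and Y as above:

# Cross-check the accuracy with sklearn.metrics.
from sklearn.metrics import accuracy_score
print('accuracy =', accuracy_score(Y, pred))    # expected: 0.9555...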
SVM
# -*- coding: utf-8 -*-
"""
SVM Model
"""
import pandas as pd
from sklearn import model_selection    # train/test split
from sklearn import svm                # model

iris = pd.read_csv("../data/iris.csv")
print(iris.head())
"""
   Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa
"""
# 2. select the x and y variables
cols = list(iris.columns)
x_cols = cols[:4]    # X: columns 1~4 (continuous)
y_cols = cols[-1]    # y: column 5 (categorical)

# 3. train/test split
iris_df = iris
print(iris_df.shape)    # (150, 5)
train_iris, test_iris = model_selection.train_test_split(
    iris_df, test_size=0.3, random_state=123)
print(train_iris.shape)    # (105, 5)
print(test_iris.shape)     # (45, 5)

# 4. model - SVM
obj = svm.SVC()
model = obj.fit(train_iris[x_cols], train_iris[y_cols])

# 5. evaluate the model
pred = model.predict(test_iris[x_cols])
Y = test_iris[y_cols]

# confusion matrix
matrix = pd.crosstab(pred, Y)
print(matrix)
"""
Species     setosa  versicolor  virginica
row_0
setosa          18           0          0
versicolor       0          10          1
virginica        0           0         16
"""
# DataFrame.ix was removed in pandas 1.0; use .iloc for positional indexing
acc = (matrix.iloc[0, 0] + matrix.iloc[1, 1] + matrix.iloc[2, 2]) / len(Y)
print('accuracy =', acc)    # accuracy = 0.9777777777777777
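svm.SVC() uses the RBF kernel by default, and the kernel is a hyperparameter worth varying. A hedged sketch trying a linear kernel on the same split (the resulting accuracy depends on the data and scikit-learn version):

# Sketch: compare a linear kernel against the default RBF kernel above.
linear_model = svm.SVC(kernel='linear').fit(train_iris[x_cols], train_iris[y_cols])
linear_pred = linear_model.predict(test_iris[x_cols])
print('linear kernel accuracy =', (linear_pred == Y).mean())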
spam_train_test
# -*- coding: utf-8 -*-
"""
NB vs SVM
 - data set: sparse matrix
 - file name: ../data/spam_tran_test.npy
"""
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
import numpy as np
import pandas as pd

# 1. file loading
# allow_pickle=True is required to load object arrays in NumPy >= 1.16.3
X_train, X_test, y_train, y_test = np.load("../data/spam_tran_test.npy",
                                           allow_pickle=True)
print(X_train.shape)    # (3901, 4000)
print(X_test.shape)     # (1673, 4000)
print(type(y_train))    # <class 'list'>

# list -> numpy array conversion: for linear-algebra operations
y_train = np.array(y_train)
y_test = np.array(y_test)
print(type(y_train))    # <class 'numpy.ndarray'>

# 2. NB model
obj = GaussianNB()
nb_model = obj.fit(X_train, y_train)
pred = nb_model.predict(X_test)
Y = y_test
matrix = pd.crosstab(pred, Y)
print("nb matrix\n", matrix)
"""
col_0  0(ham)  1(spam)
row_0
0        1264       28
1         167      214
"""
# 1) accuracy (DataFrame.ix was removed in pandas 1.0; use .iloc)
acc = (matrix.iloc[0, 0] + matrix.iloc[1, 1]) / len(Y)
print("NB acc =", acc)    # NB acc= 0.8834429169157203

# 2) precision: of the predicted yes, how many are actually yes
precision = matrix.iloc[1, 1] / (matrix.iloc[1, 0] + matrix.iloc[1, 1])
print("precision =", precision)    # precision= 0.5616797900262467

# 3) recall: of the actual yes, how many are predicted yes
recall = matrix.iloc[1, 1] / (matrix.iloc[0, 1] + matrix.iloc[1, 1])
print("recall =", recall)    # recall= 0.8842975206611571

# 4) F1 score: harmonic mean of precision and recall
f1_score = 2 * (precision * recall) / (precision + recall)
print('f1_score =', f1_score)    # f1_score= 0.6869983948635634

# 3. SVM model
svm_obj = svm.SVC(kernel='linear')    # kernel function
svm_model = svm_obj.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)
svm_Y = y_test
svm_matrix = pd.crosstab(svm_pred, svm_Y)
print("svm matrix\n", svm_matrix)
"""
svm matrix
col_0     0    1
row_0
0      1428   36
1         3  206
"""
svm_acc = (svm_matrix.iloc[0, 0] + svm_matrix.iloc[1, 1]) / len(svm_Y)
print("svm acc =", svm_acc)    # svm acc= 0.976688583383144
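The hand-computed precision, recall, and F1 for the NB model can be verified against sklearn.metrics — a minimal sketch (the import is aliased because the script already defines a variable named f1_score):

# Verify the manual metrics with sklearn.metrics (pred/Y are the NB results above).
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score as sk_f1    # aliased: f1_score is a variable above

print('precision =', precision_score(Y, pred, pos_label=1))
print('recall    =', recall_score(Y, pred, pos_label=1))
print('f1        =', sk_f1(Y, pred, pos_label=1))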
sms_spam_data
# -*- coding: utf-8 -*-
"""
Created on Sat Feb 23 15:52:23 2019
@author: 502-03
"""
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
import numpy as np
import pandas as pd

# 1. file loading
# allow_pickle=True is required to load object arrays in NumPy >= 1.16.3
X_train, X_test, y_train, y_test = np.load("../data/sms_spam_data.npy",
                                           allow_pickle=True)
print(X_train.shape)    # (4446, 6000)
print(X_test.shape)     # (1112, 6000)
print(type(y_train))    # <class 'pandas.core.series.Series'>

# NB model
obj = GaussianNB()
nb_model = obj.fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)
nb_Y = y_test
nb_tab = pd.crosstab(nb_pred, nb_Y)
print("nb_tab=\n", nb_tab)
"""
nb_tab=
type   ham  spam
row_0
ham    812    10
spam   156   134
"""
# DataFrame.ix was removed in pandas 1.0; use .iloc for positional indexing
nb_acc = (nb_tab.iloc[0, 0] + nb_tab.iloc[1, 1]) / len(nb_Y)
print("nb_acc =", nb_acc)    # nb_acc= 0.8507194244604317

# SVM model
obj = svm.SVC(kernel='linear')
svc_model = obj.fit(X_train, y_train)
svc_pred = svc_model.predict(X_test)
svc_Y = y_test
svc_tab = pd.crosstab(svc_pred, svc_Y)
print("svc_tab=\n", svc_tab)
"""
svc_tab=
type   ham  spam
row_0
ham    964    20
spam     4   124
"""
svc_acc = (svc_tab.iloc[0, 0] + svc_tab.iloc[1, 1]) / len(svc_Y)
print("svc_acc =", svc_acc)    # svc_acc= 0.9784172661870504

precision = svc_tab.iloc[1, 1] / (svc_tab.iloc[1, 0] + svc_tab.iloc[1, 1])
print("precision", precision)    # precision 0.96875
recall = svc_tab.iloc[1, 1] / (svc_tab.iloc[0, 1] + svc_tab.iloc[1, 1])
print("recall", recall)    # recall 0.8611111111111112
f1_score = 2 * (precision * recall) / (precision + recall)
print("f1_score", f1_score)    # f1_score 0.911764705882353
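GaussianNB assumes continuous, normally distributed features, which is a poor match for sparse word-count text data; MultinomialNB is the naive Bayes variant usually recommended for text. A hedged sketch on the same split (the actual score depends on the data; this assumes the features are non-negative counts or TF-IDF weights):

# Sketch: MultinomialNB is typically better suited than GaussianNB to
# non-negative word-count/TF-IDF features (an assumption about this data set).
from sklearn.naive_bayes import MultinomialNB

mnb_model = MultinomialNB().fit(X_train, y_train)
mnb_pred = mnb_model.predict(X_test)
print('MultinomialNB acc =', (mnb_pred == y_test).mean())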