『Sklearn』数据划分方法
原理介绍
K折交叉验证:
KFold,GroupKFold,StratifiedKFold,

留一法:
LeaveOneGroupOut,LeavePGroupsOut,LeaveOneOut,LeavePOut,

随机划分法:
ShuffleSplit,GroupShuffleSplit,StratifiedShuffleSplit,

代码实现
流程:
实例化分类器 -> 迭代器迭代组[.split()]
KFold(n_splits=2)
#KFold
import numpy as np
from sklearn.model_selection import KFold
X=np.array([[1,2],[3,4],[5,6],[7,8],[9,10],[11,12]])
y=np.array([1,2,3,4,5,6])
kf=KFold(n_splits=2) # 定义分成几个组
# kf.get_n_splits(X) # 查询分成几个组
print(kf)
for train_index,test_index in kf.split(X):
print("Train Index:",train_index,",Test Index:",test_index)
X_train,X_test=X[train_index],X[test_index]
y_train,y_test=y[train_index],y[test_index]
#print(X_train,X_test,y_train,y_test)
GroupKFold(n_splits=2)
# GroupKFold,不是很懂这个划分方法
import numpy as np
from sklearn.model_selection import GroupKFold
X=np.array([[1,2],[3,4],[5,6],[7,8],[9,10],[11,12]])
y=np.array([1,2,3,4,5,6])
groups=np.array([1,2,3,4,5,6])
group_kfold=GroupKFold(n_splits=2)
group_kfold.get_n_splits(X,y,groups)
print(group_kfold)
for train_index,test_index in group_kfold.split(X,y,groups):
print("Train Index:",train_index,",Test Index:",test_index)
X_train,X_test=X[train_index],X[test_index]
y_train,y_test=y[train_index],y[test_index]
#print(X_train,X_test,y_train,y_test) #GroupKFold(n_splits=2)
#Train Index: [0 2 4] ,Test Index: [1 3 5]
#Train Index: [1 3 5] ,Test Index: [0 2 4]
StratifiedKFold(n_splits=3)
# stratifiedKFold:保证训练集中每一类的比例是相同的(尽量)
import numpy as np
from sklearn.model_selection import StratifiedKFold
X=np.array([[1,2],[3,4],[5,6],[7,8],[9,10],[11,12]])
y=np.array([1,1,1,2,2,2])
skf=StratifiedKFold(n_splits=3)
skf.get_n_splits(X,y)
print(skf)
for train_index,test_index in skf.split(X,y):
print("Train Index:",train_index,",Test Index:",test_index)
X_train,X_test=X[train_index],X[test_index]
y_train,y_test=y[train_index],y[test_index]
#print(X_train,X_test,y_train,y_test) #StratifiedKFold(n_splits=3, random_state=None, shuffle=False)
#Train Index: [1 2 4 5] ,Test Index: [0 3]
#Train Index: [0 2 3 5] ,Test Index: [1 4]
LeaveOneOut()
# leaveOneOut:测试集就留下一个
import numpy as np
from sklearn.model_selection import LeaveOneOut
X=np.array([[1,2],[3,4],[5,6],[7,8],[9,10],[11,12]])
y=np.array([1,2,3,4,5,6])
loo=LeaveOneOut()
loo.get_n_splits(X)
print(loo)
for train_index,test_index in loo.split(X,y):
print("Train Index:",train_index,",Test Index:",test_index)
X_train,X_test=X[train_index],X[test_index]
y_train,y_test=y[train_index],y[test_index]
#print(X_train,X_test,y_train,y_test)
#LeaveOneOut()
#Train Index: [1 2 3 4 5] ,Test Index: [0]
#Train Index: [0 2 3 4 5] ,Test Index: [1]
#Train Index: [0 1 3 4 5] ,Test Index: [2]
#Train Index: [0 1 2 4 5] ,Test Index: [3]
#Train Index: [0 1 2 3 5] ,Test Index: [4]
#Train Index: [0 1 2 3 4] ,Test Index: [5]
LeavePOut(p=3)
LeavePOut:测试集留下P个
import numpy as np
from sklearn.model_selection import LeavePOut
X=np.array([[1,2],[3,4],[5,6],[7,8],[9,10],[11,12]])
y=np.array([1,2,3,4,5,6])
lpo=LeavePOut(p=3)
lpo.get_n_splits(X)
print(lpo)
for train_index,test_index in lpo.split(X,y):
print("Train Index:",train_index,",Test Index:",test_index)
X_train,X_test=X[train_index],X[test_index]
y_train,y_test=y[train_index],y[test_index]
#print(X_train,X_test,y_train,y_test) #LeavePOut(p=3)
#Train Index: [3 4 5] ,Test Index: [0 1 2]
#Train Index: [2 4 5] ,Test Index: [0 1 3]
#Train Index: [2 3 5] ,Test Index: [0 1 4]
#Train Index: [2 3 4] ,Test Index: [0 1 5]
#Train Index: [1 4 5] ,Test Index: [0 2 3]
#Train Index: [1 3 5] ,Test Index: [0 2 4]
#Train Index: [1 3 4] ,Test Index: [0 2 5]
#Train Index: [1 2 5] ,Test Index: [0 3 4]
#Train Index: [1 2 4] ,Test Index: [0 3 5]
#Train Index: [1 2 3] ,Test Index: [0 4 5]
#Train Index: [0 4 5] ,Test Index: [1 2 3]
#Train Index: [0 3 5] ,Test Index: [1 2 4]
#Train Index: [0 3 4] ,Test Index: [1 2 5]
#Train Index: [0 2 5] ,Test Index: [1 3 4]
#Train Index: [0 2 4] ,Test Index: [1 3 5]
#Train Index: [0 2 3] ,Test Index: [1 4 5]
#Train Index: [0 1 5] ,Test Index: [2 3 4]
#Train Index: [0 1 4] ,Test Index: [2 3 5]
#Train Index: [0 1 3] ,Test Index: [2 4 5]
#Train Index: [0 1 2] ,Test Index: [3 4 5]
ShuffleSplit(n_splits=3,test_size=.25,random_state=0)
# ShuffleSplit 把数据集打乱顺序,然后划分测试集和训练集,训练集额和测试集的比例随机选定,
# 训练集和测试集的比例的和可以小于1
import numpy as np
from sklearn.model_selection import ShuffleSplit
X=np.array([[1,2],[3,4],[5,6],[7,8],[9,10],[11,12]])
y=np.array([1,2,3,4,5,6])
rs=ShuffleSplit(n_splits=3,test_size=.25,random_state=0)
rs.get_n_splits(X)
print(rs)
for train_index,test_index in rs.split(X,y):
print("Train Index:",train_index,",Test Index:",test_index)
X_train,X_test=X[train_index],X[test_index]
y_train,y_test=y[train_index],y[test_index]
#print(X_train,X_test,y_train,y_test)
print("==============================")
rs=ShuffleSplit(n_splits=3,train_size=.5,test_size=.25,random_state=0)
rs.get_n_splits(X)
print(rs)
for train_index,test_index in rs.split(X,y):
print("Train Index:",train_index,",Test Index:",test_index) #ShuffleSplit(n_splits=3, random_state=0, test_size=0.25, train_size=None)
#Train Index: [1 3 0 4] ,Test Index: [5 2]
#Train Index: [4 0 2 5] ,Test Index: [1 3]
#Train Index: [1 2 4 0] ,Test Index: [3 5]
#==============================
#ShuffleSplit(n_splits=3, random_state=0, test_size=0.25, train_size=0.5)
#Train Index: [1 3 0] ,Test Index: [5 2]
#Train Index: [4 0 2] ,Test Index: [1 3]
#Train Index: [1 2 4] ,Test Index: [3 5]
StratifiedShuffleSplit(n_splits=3,test_size=.5,random_state=0)
# StratifiedShuffleSplitShuffleSplit 把数据集打乱顺序,然后划分测试集和训练集,
# 训练集额和测试集的比例随机选定,训练集和测试集的比例的和可以小于1,但是还要保证训练集中各类所占的比例是一样的 import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
X=np.array([[1,2],[3,4],[5,6],[7,8],[9,10],[11,12]])
y=np.array([1,2,1,2,1,2])
sss=StratifiedShuffleSplit(n_splits=3,test_size=.5,random_state=0)
sss.get_n_splits(X,y)
print(sss)
for train_index,test_index in sss.split(X,y):
print("Train Index:",train_index,",Test Index:",test_index)
X_train,X_test=X[train_index],X[test_index]
y_train,y_test=y[train_index],y[test_index]
#print(X_train,X_test,y_train,y_test) #StratifiedShuffleSplit(n_splits=3, random_state=0, test_size=0.5,train_size=None)
#Train Index: [5 4 1] ,Test Index: [3 2 0]
#Train Index: [5 2 3] ,Test Index: [0 4 1]
#Train Index: [5 0 4] ,Test Index: [3 1 2]
『Sklearn』数据划分方法的更多相关文章
- 『Sklearn』特征向量化处理
『Kaggle』分类任务_决策树&集成模型&DataFrame向量化操作 1 2 3 4 5 6 7 8 9 '''特征提取器''' from sklearn.feature_extr ...
- 『Sklearn』框架自带数据集接口
自带数据集类型如下: # 自带小型数据集# sklearn.datasets.load_<name># 在线下载数据集# sklearn.datasets.fetch_<name&g ...
- JS 中通过对象关联实现『继承』
JS 中继承其实是种委托,而不是传统面向对象中的复制父类到子类,只是通过原型链将要做的事委托给父类. 下面介绍通过对象关联来实现『继承』的方法: Foo = { // 需要提供一个 init 方法来初 ...
- 『Python』__getattr__()特殊方法
self的认识 & __getattr__()特殊方法 将字典调用方式改为通过属性查询的一个小class, class Dict(dict): def __init__(self, **kw) ...
- 『TensorFlow』模型保存和载入方法汇总
『TensorFlow』第七弹_保存&载入会话_霸王回马 一.TensorFlow常规模型加载方法 保存模型 tf.train.Saver()类,.save(sess, ckpt文件目录)方法 ...
- 『转载』hadoop2.x常用端口、定义方法及默认端口
『转载』hadoop2.x常用端口.定义方法及默认端口 1.问题导读 DataNode的http服务的端口.ipc服务的端口分别是哪个? NameNode的http服务的端口.ipc服务的端口分别是哪 ...
- 『计算机视觉』Mask-RCNN_推断网络终篇:使用detect方法进行推断
一.detect和build 前面多节中我们花了大量笔墨介绍build方法的inference分支,这节我们看看它是如何被调用的. 在dimo.ipynb中,涉及model的操作我们简单进行一下汇总, ...
- 『TensorFlow』读书笔记_降噪自编码器
『TensorFlow』降噪自编码器设计 之前学习过的代码,又敲了一遍,新的收获也还是有的,因为这次注释写的比较详尽,所以再次记录一下,具体的相关知识查阅之前写的文章即可(见上面链接). # Aut ...
- 『AngularJS』$location 服务
项目中关于 $location的用法 简介 $location服务解析在浏览器地址栏中的URL(基于window.location)并且让URL在你的应用中可用.改变在地址栏中的URL会作用到$loc ...
随机推荐
- python 采坑总结 调用键盘事件后导致键盘失灵的可能原因
在练习python封装键盘事件的时候,实现一个keyDown和keyUp的功能: @staticmethod def keyDown(keyName): #按下按键 ...
- HTML5代码规范
HTML5代码规范html标签里面等号两边不要留空格在IE下可能会识别不了html5等号前后可以使用空格,但仍不推荐使用. HTML 代码约定很多 Web 开发人员对 HTML 的代码规范知之甚少.在 ...
- 20145333茹翔 Exp5 Adobe阅读器漏洞攻击
20145333茹翔 Exp5 Adobe阅读器漏洞攻击 实验过程 主机为kali的ip地址为:192.168.1.111.靶机windows xp 的ip地址为:192.168.1.110 使用命令 ...
- HTML切换页面IE版本
<!--[if !IE]><!--> 除IE外都可识别 <!--<![endif]--><!--[if IE]> 所有的IE可识别 <![e ...
- CSS 常用语法与盒模型分析
CSS基础知识 CSS规则由两个主要的部分构成:选择器,以及一条或者多条声明 selector { property: value; property: value; ... property: va ...
- SPOJ LAS(BFS)题解
题目:VJ 思路: BFS+回溯,但是要剪枝,看了dalao的题解,超时+WA无数发,终于过了 #include<cstdio> #include<cstring> #incl ...
- C#中的编译开关
c#不支持宏定义,只支持编译开关. ======================================我想加上#define xxx就编译a代码,否则编译b代码,像这样的:#if xxxa ...
- ACM-ICPC 2018 南京赛区网络预赛 L. Magical Girl Haze 最短路+分层图
类似题解 There are NN cities in the country, and MM directional roads from uu to v(1\le u, v\le n)v(1≤u, ...
- BZOJ3942: [Usaco2015 Feb]Censoring 栈+KMP
Description Farmer John has purchased a subscription to Good Hooveskeeping magazine for his cows, so ...
- 瞎折腾之Webhooks
之前听学长介绍过webhooks,也知道有这个东西,但没有真正的用于项目部署,长久以来一直过着“刀耕火种”的生活......长久以来,都是这么更新代码的: 由于之前做的项目刚刚上线,需要对其进行持续的 ...