My recent work has involved sklearn, mainly its classification functionality, so here are a few notes.

Classification with sklearn roughly involves three topics:

I. Classifiers

II. Feature selection

III. Model selection

I. Classifiers (Classification)

Example 1: plot_classifier_comparison.py

# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

h = .02  # step size in the mesh

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
         "Random Forest", "AdaBoost", "Naive Bayes",
         "Linear Discriminant Analysis", "Quadratic Discriminant Analysis"]
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()]

X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable
            ]

figure = plt.figure(figsize=(27, 9))
i = 1
# iterate over datasets
for ds in datasets:
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot also the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   alpha=0.6)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title(name)
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1

figure.subplots_adjust(left=.02, right=.98)
plt.show()
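
One detail in the inner loop is worth calling out: not every classifier exposes decision_function (GaussianNB, for instance, only provides predict_proba), which is why the example branches on hasattr before colouring the mesh. Below is a minimal standalone sketch of that same branch; the dataset and query points are chosen purely for illustration:

import numpy as np
from sklearn.datasets import make_moons
from sklearn.naive_bayes import GaussianNB

X, y = make_moons(noise=0.3, random_state=0)
clf = GaussianNB().fit(X, y)

points = np.array([[0.0, 0.0], [1.0, -0.5]])  # two arbitrary query points
if hasattr(clf, "decision_function"):
    Z = clf.decision_function(points)    # signed distance from the boundary
else:
    Z = clf.predict_proba(points)[:, 1]  # probability of the positive class
print(Z)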

II. Feature Selection

This mainly involves the following module:
    >>> sklearn.feature_selection

Example 1: feature_selection_pipeline.py

from sklearn.datasets import samples_generator
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn import svm
from sklearn.pipeline import make_pipeline

# generate data
X, y = samples_generator.make_classification(
    n_features=20, n_informative=3, n_redundant=0, n_classes=4,
    n_clusters_per_class=2)

# two steps:
# 1) an ANOVA filter that keeps the 3 best features
anova_filter = SelectKBest(f_regression, k=3)
# 2) an SVM classifier
clf = svm.SVC(kernel='linear')

# chain them into a single estimator
anova_svm = make_pipeline(anova_filter, clf)
anova_svm.fit(X, y)
anova_svm.predict(X)
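
The pipeline hides which three of the twenty features the ANOVA filter actually kept. A hedged sketch for inspecting them, assuming make_pipeline's auto-generated step name 'selectkbest' (print anova_svm.named_steps to confirm the name in your version):

# Assumes anova_svm has been fit as above; 'selectkbest' is the
# auto-generated step name and may differ across versions.
mask = anova_svm.named_steps['selectkbest'].get_support()
print("selected feature indices:", mask.nonzero()[0])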

Example 2: plot_rbm_logistic_classification.py

# Authors: Yann N. Dauphin, Vlad Niculae, Gabriel Synnaeve
# License: BSD

import numpy as np
import matplotlib.pyplot as plt

from scipy.ndimage import convolve
from sklearn import linear_model, datasets, metrics
from sklearn.cross_validation import train_test_split
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline

###############################################################################
# Setting up

def nudge_dataset(X, Y):
    """
    This produces a dataset 5 times bigger than the original one,
    by moving the 8x8 images in X around by 1px to left, right, down, up
    """
    direction_vectors = [
        [[0, 1, 0],
         [0, 0, 0],
         [0, 0, 0]],

        [[0, 0, 0],
         [1, 0, 0],
         [0, 0, 0]],

        [[0, 0, 0],
         [0, 0, 1],
         [0, 0, 0]],

        [[0, 0, 0],
         [0, 0, 0],
         [0, 1, 0]]]

    shift = lambda x, w: convolve(x.reshape((8, 8)), mode='constant',
                                  weights=w).ravel()
    X = np.concatenate([X] +
                       [np.apply_along_axis(shift, 1, X, vector)
                        for vector in direction_vectors])
    Y = np.concatenate([Y for _ in range(5)], axis=0)
    return X, Y

# Load Data
digits = datasets.load_digits()
X = np.asarray(digits.data, 'float32')
X, Y = nudge_dataset(X, digits.target)
X = (X - np.min(X, 0)) / (np.max(X, 0) + 0.0001)  # 0-1 scaling

X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.2,
                                                    random_state=0)

# Models we will use
logistic = linear_model.LogisticRegression()
rbm = BernoulliRBM(random_state=0, verbose=True)

classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

###############################################################################
# Training

# Hyper-parameters. These were set by cross-validation,
# using a GridSearchCV. Here we are not performing cross-validation to
# save time.
rbm.learning_rate = 0.06
rbm.n_iter = 20
# More components tend to give better prediction performance, but larger
# fitting time
rbm.n_components = 100
logistic.C = 6000.0

# Training RBM-Logistic Pipeline
classifier.fit(X_train, Y_train)

# Training Logistic regression
logistic_classifier = linear_model.LogisticRegression(C=100.0)
logistic_classifier.fit(X_train, Y_train)

###############################################################################
# Evaluation

print()
print("Logistic regression using RBM features:\n%s\n" % (
    metrics.classification_report(
        Y_test,
        classifier.predict(X_test))))

print("Logistic regression using raw pixel features:\n%s\n" % (
    metrics.classification_report(
        Y_test,
        logistic_classifier.predict(X_test))))

###############################################################################
# Plotting

plt.figure(figsize=(4.2, 4))
for i, comp in enumerate(rbm.components_):
    plt.subplot(10, 10, i + 1)
    plt.imshow(comp.reshape((8, 8)), cmap=plt.cm.gray_r,
               interpolation='nearest')
    plt.xticks(())
    plt.yticks(())
plt.suptitle('100 components extracted by RBM', fontsize=16)
plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)

plt.show()
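
The hyper-parameter comment above says the fixed values were originally found with GridSearchCV. Because the RBM and the logistic regression sit inside a Pipeline, such a search would address their parameters with the step__parameter naming convention. A minimal sketch under that assumption; the grid here is deliberately tiny and illustrative, not the grid the authors used:

from sklearn.grid_search import GridSearchCV

# step names come from Pipeline(steps=[('rbm', ...), ('logistic', ...)])
param_grid = {
    'rbm__learning_rate': [0.01, 0.06],
    'rbm__n_components': [50, 100],
    'logistic__C': [1000.0, 6000.0],
}
search = GridSearchCV(classifier, param_grid)
search.fit(X_train, Y_train)
print(search.best_params_)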

III. Model Selection

This mainly involves the following two modules:

>>> sklearn.grid_search

>>> sklearn.cross_validation
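
The earlier examples already used train_test_split from sklearn.cross_validation; the same module also provides k-fold scoring directly. A minimal sketch, assuming the same older module layout used throughout these notes:

from sklearn.cross_validation import cross_val_score
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

digits = load_digits()
clf = RandomForestClassifier(n_estimators=20)
# 5-fold cross-validation: one accuracy score per fold
scores = cross_val_score(clf, digits.data, digits.target, cv=5)
print(scores.mean(), scores.std())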

Example 1: randomized_search.py

import numpy as np

from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

# get some data
digits = load_digits()
X, y = digits.data, digits.target

# build a classifier
clf = RandomForestClassifier(n_estimators=20)


# Utility function to report best scores
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")


# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)

start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidate"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)

# use a full grid over all parameters
param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(X, y)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.grid_scores_)))
report(grid_search.grid_scores_)
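
Note the budget difference: the randomized search evaluates only the 20 sampled candidates, while the full grid evaluates every combination, 2 × 3 × 3 × 3 × 2 × 2 = 216 settings. Comparing the two report outputs shows how close the much cheaper random sample gets to the exhaustive search.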

Example 2: grid_search_text_feature_extraction.py

from pprint import pprint
from time import time
import logging

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline

print(__doc__)

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

###############################################################################
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
]
# Uncomment the following to do the analysis on all the categories
#categories = None

print("Loading 20 newsgroups dataset for categories:")
print(categories)

data = fetch_20newsgroups(subset='train', categories=categories)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
print()

###############################################################################
# define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    #'clf__n_iter': (10, 50, 80),
}

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(data.data, data.target)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
