# daal4py Decision Forest Classification Training example Serialization

import daal4py as d4p
import numpy as np
import pickle
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split


def get_mnist(random_state=777):
    """Load MNIST and return the training portion as daal4py-ready arrays.

    The data set is split into 60000 training and 10000 test rows; only the
    training rows are returned.

    Parameters
    ----------
    random_state : int, optional
        Seed for the shuffle performed by ``train_test_split``. The split is
        seeded so that it is reproducible: any script that evaluates the
        trained model must split with the same seed, otherwise its "test"
        rows overlap the rows the model was trained on.

    Returns
    -------
    tuple
        ``(data, labels)`` where both are C-contiguous ``float32`` arrays and
        ``labels`` is reshaped to a single column.
    """
    # NOTE(review): fetch_mldata appears to have been deprecated and later
    # removed from scikit-learn (fetch_openml is its successor) -- confirm
    # against the installed scikit-learn version.
    mnist = fetch_mldata('MNIST original')
    X_train, _, y_train, _ = train_test_split(
        mnist.data, mnist.target,
        train_size=60000, test_size=10000,
        random_state=random_state)
    data = np.ascontiguousarray(X_train, dtype=np.float32)
    # One label per row, stored as a column vector.
    labels = np.ascontiguousarray(y_train, dtype=np.float32).reshape(-1, 1)
    return data, labels


# serialized model can be used only by daal4py with pickle
def pickle_serialization(result, file='df_result.pkl'):
    """Pickle ``result`` into ``file``.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten.
    """
    with open(file, 'wb') as sink:
        pickle.Pickler(sink).dump(result)


# universal native DAAL model serialization. Can be used in all DAAL interfaces C++/Java/pydaal/daal4py
def native_serialization(result, file='native_result.txt'):
    """Write the state buffer of ``result`` to ``file``.

    Parameters
    ----------
    result : object
        Object whose ``__getstate__()`` returns the serialized buffer
        (assumed to be bytes-like, since it is written to a binary file).
    file : str, optional
        Destination path; an existing file is overwritten.
    """
    daal_buff = result.__getstate__()
    # The context manager guarantees the buffer is flushed and the handle
    # closed even if the write fails.
    with open(file, "wb") as out:
        out.write(daal_buff)


if __name__ == "__main__":
    data, labels = get_mnist()

    # 'fptype' parameter should be the same type as input numpy arrays to achieve the best performance
    # (no data conversion in this case)
    train = d4p.decision_forest_classification_training(
        10, fptype='float', nTrees=100, minObservationsInLeafNode=1,
        engine=d4p.engines_mt19937(seed=777), bootstrap=True)
    result = train.compute(data, labels)

    # serialize model to file
    pickle_serialization(result)
    native_serialization(result)

  

Python 预测

import daal4py as d4p

import numpy as np
import pickle
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split


def get_mnist_test(random_state=777):
    """Load MNIST and return the test portion as daal4py-ready arrays.

    The data set is split into 60000 training and 10000 test rows; only the
    test rows are returned.

    Parameters
    ----------
    random_state : int, optional
        Seed for the shuffle performed by ``train_test_split``. It must match
        the seed used when the model was trained; with a different (or no)
        seed the returned rows overlap the training rows and the reported
        accuracy is inflated.

    Returns
    -------
    tuple
        ``(pdata, plabels)`` where both are C-contiguous ``float32`` arrays
        and ``plabels`` is reshaped to a single column.
    """
    # NOTE(review): fetch_mldata appears to have been deprecated and later
    # removed from scikit-learn (fetch_openml is its successor) -- confirm
    # against the installed scikit-learn version.
    mnist = fetch_mldata('MNIST original')
    _, X_test, _, y_test = train_test_split(
        mnist.data, mnist.target,
        train_size=60000, test_size=10000,
        random_state=random_state)
    pdata = np.ascontiguousarray(X_test, dtype=np.float32)
    plabels = np.ascontiguousarray(y_test, dtype=np.float32).reshape(-1, 1)
    return pdata, plabels


def checkAccuracy(plabels, prediction):
    """Return the fraction of entries in ``prediction`` equal to ``plabels``.

    Both arguments are flattened before comparison, so column vectors and
    flat sequences are accepted.

    Raises
    ------
    ValueError
        If ``plabels`` is empty or the two inputs differ in length.
    """
    expected = np.asarray(plabels).ravel()
    predicted = np.asarray(prediction).ravel()
    if expected.size == 0:
        raise ValueError("checkAccuracy requires at least one label")
    if expected.size != predicted.size:
        raise ValueError(
            "label/prediction count mismatch: %d vs %d"
            % (expected.size, predicted.size))
    mismatches = int(np.count_nonzero(expected != predicted))
    return 1 - mismatches / expected.size


def pickle_deserialization(file='df_result.pkl'):
    """Load and return the object pickled in ``file``."""
    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only load files that were produced by a trusted training run.
    with open(file, 'rb') as inp:
        return pickle.load(inp)


def native_deserialization(file='native_result.txt'):
    """Rebuild a training result from the state buffer stored in ``file``.

    An empty ``decision_forest_classification_training_result`` is created
    and its state is restored from the file contents via ``__setstate__``.
    """
    daal_result = d4p.decision_forest_classification_training_result()
    # Read through a context manager so the handle is always closed.
    with open(file, "rb") as inp:
        daal_buff = inp.read()
    daal_result.__setstate__(daal_buff)
    return daal_result


if __name__ == "__main__":
    nClasses = 10
    pdata, plabels = get_mnist_test()

    # deserialize model
    deserialized_result_pickle = pickle_deserialization()
    deserialized_result_native = native_deserialization()

    # now predict using the deserialized model from the training above, fptype is float as input data
    predict_algo = d4p.decision_forest_classification_prediction(nClasses, fptype='float')

    # just set pickle-obtained model into compute
    predict_result = predict_algo.compute(pdata, deserialized_result_pickle.model)
    print("\nAccuracy:", checkAccuracy(plabels, predict_result.prediction))

    # the same result as above. just set native-obtained model into compute
    predict_result = predict_algo.compute(pdata, deserialized_result_native.model)
    print("\nAccuracy:", checkAccuracy(plabels, predict_result.prediction))

C++ 使用该 daal4py 模型:

/**
 * <a name="DAAL-EXAMPLE-CPP-DF_CLS_DENSE_BATCH"></a>
 * \example df_cls_dense_batch.cpp
 */

#include "daal.h"
#include "service.h"
#include "stdio.h"

using namespace std;
using namespace daal;
using namespace daal::algorithms;
using namespace daal::algorithms::decision_forest::classification;

/* Input data set parameters */
/* CSV file passed to loadData() as the feature data file */
const string testDatasetFileName = "../data/batch/mnist_test_data.csv";
/* CSV file passed to loadData() as the labels (ground truth) file */
const string labels = "../data/batch/mnist_test_labels.csv";

const size_t nFeatures = 784; /* Number of features in training and testing data sets */
const size_t nClasses = 10; /* Number of classes */

/* Forward declarations */
/* NOTE(review): testModel() is declared here but neither defined nor called in the visible source */
void testModel();
/* Fills pData and pDependentVar from the two CSV files (defined below main) */
void loadData(const std::string& dataFileName, const std::string& labelsFileName, NumericTablePtr& pData, NumericTablePtr& pDependentVar);
void check_accuracy(NumericTablePtr prediction, NumericTablePtr testGroundTruth); int main(int argc, char *argv[])
{
checkArguments(argc, argv, 2, &labels, &testDatasetFileName); /* Deserialization */
size_t size = 0;
byte * buffer = NULL;
FILE * pFile;
size_t result; pFile = fopen ( "../data/batch/native_result.txt" , "rb" );
if (pFile==NULL)
{
fputs ("File error",stderr);
exit (1);
} // obtain file size:
fseek (pFile , 0 , SEEK_END);
size = ftell (pFile);
std::cout << "size: " << size << "\n";
rewind(pFile); // allocate memory to contain the whole file:
buffer = (byte*) malloc (sizeof(byte)*size);
if (buffer == NULL)
{
fputs ("Memory error",stderr);
exit (2);
} // copy the file into the buffer:
result = fread (buffer,1,size,pFile);
if (result != size)
{
fputs ("Reading error",stderr);
exit (3);
}
/* the result buffer is now loaded in the buffer. */ /* Create a data archive to deserialize the numeric table */
OutputDataArchive out_dataArch(buffer, size);
free (buffer);
fclose (pFile); /* needed for result allocation */
training::Batch<> train(nClasses);
train.getResult()->deserialize(out_dataArch); /* Create Numeric Tables for testing data and ground truth values */
NumericTablePtr testData;
NumericTablePtr testGroundTruth; loadData(testDatasetFileName, labels, testData, testGroundTruth);
/* Create an algorithm object to predict values of decision forest classification */
prediction::Batch<> algorithm(nClasses); /* Pass a testing data set and the trained model to the algorithm */
algorithm.input.set(classifier::prediction::data, testData);
/* set deserialized model */
algorithm.input.set(classifier::prediction::model, train.getResult()->get(classifier::training::model)); /* Predict values of decision forest classification */
algorithm.compute(); /* Retrieve the algorithm results */
NumericTablePtr prediction = algorithm.getResult()->get(classifier::prediction::prediction);
printNumericTable(prediction, "Prediction results (first 10 rows):", 10);
printNumericTable(testGroundTruth, "Ground truth (first 10 rows):", 10); check_accuracy(prediction, testGroundTruth); return 0;
} void check_accuracy(NumericTablePtr prediction, NumericTablePtr testGroundTruth)
{
/* check accuracy */
BlockDescriptor<double> blockPr;
prediction->getBlockOfRows(0, prediction->getNumberOfRows(), readOnly, blockPr); double* valueP = (blockPr.getBlockPtr()); BlockDescriptor<double> blockGT;
testGroundTruth->getBlockOfRows(0, testGroundTruth->getNumberOfRows(), readOnly, blockGT); double* valueG = (blockGT.getBlockPtr()); size_t count = 0;
for(size_t i = 0; i < testGroundTruth->getNumberOfRows(); i++)
{
if(valueG[i] != valueP[i])
count++;
}
testGroundTruth->releaseBlockOfRows(blockGT);
prediction->releaseBlockOfRows(blockPr);
cout << "accuracy: " << 1- double(count)/double(testGroundTruth->getNumberOfRows()) << "\n";
} void loadData(const std::string& dataFileName,const std::string& labelsFileName, NumericTablePtr& pData, NumericTablePtr& pDependentVar)
{
/* Initialize FileDataSource<CSVFeatureManager> to retrieve the input data from a .csv file */
FileDataSource<CSVFeatureManager> trainDataSource(dataFileName,
DataSource::notAllocateNumericTable,
DataSource::doDictionaryFromContext); FileDataSource<CSVFeatureManager> trainLabels(labelsFileName,
DataSource::notAllocateNumericTable,
DataSource::doDictionaryFromContext); /* Create Numeric Tables for training data and dependent variables */
pData.reset(new HomogenNumericTable<>(nFeatures, 0, NumericTable::notAllocate));
pDependentVar.reset(new HomogenNumericTable<>(1, 0, NumericTable::notAllocate)); /* Retrieve the data from input file */
trainDataSource.loadDataBlock(pData.get());
trainLabels.loadDataBlock(pDependentVar.get());
NumericTableDictionaryPtr pDictionary = pData->getDictionarySharedPtr();
}

  

daal4py 随机森林模型训练mnist并保存模型给C++ daal predict使用的更多相关文章

  1. [Python] 波士顿房价的7种模型(线性拟合、二次多项式、Ridge、Lasso、SVM、决策树、随机森林)的训练效果对比

    目录 1. 载入数据 列解释Columns: 2. 数据分析 2.1 预处理 2.2 可视化 3. 训练模型 3.1 线性拟合 3.2 多项式回归(二次) 3.3 脊回归(Ridge Regressi ...

  2. 搭建简单模型训练MNIST数据集

    # -*- coding = utf-8 -*- # @Time : 2021/3/16 # @Author : pistachio # @File : test1.py # @Software : ...

  3. 随机森林RF、XGBoost、GBDT和LightGBM的原理和区别

    目录 1.基本知识点介绍 2.各个算法原理 2.1 随机森林 -- RandomForest 2.2 XGBoost算法 2.3 GBDT算法(Gradient Boosting Decision T ...

  4. RandomForest随机森林总结

    1.随机森林原理介绍 随机森林,指的是利用多棵树对样本进行训练并预测的一种分类器.该分类器最早由Leo Breiman和Adele Cutler提出,并被注册成了商标.简单来说,随机森林就是由多棵CA ...

  5. 机器学习之路:python 集成分类器 随机森林分类RandomForestClassifier 梯度提升决策树分类GradientBoostingClassifier 预测泰坦尼克号幸存者

    python3 学习使用随机森林分类器 梯度提升决策树分类 的api,并将他们和单一决策树预测结果做出对比 附上我的git,欢迎大家来参考我其他分类器的代码: https://github.com/l ...

  6. OpenCV:使用 随机森林与GBDT

    随机森林顾名思义,是用随机的方式建立一个森林.简单来说,随机森林就是由多棵CART(Classification And Regression Tree)构成的.对于每棵树,它们使用的训练集是从总的训 ...

  7. 基于opencv的RandomForest随机森林

    2.OpenCV函数使用 OpenCV提供了随机森林的相关类和函数.具体使用方法如下: (1)首先利用CvRTParams定义自己的参数,其格式如下 CvRTParams::CvRTParams(in ...

  8. kaggle数据挖掘竞赛初步--Titanic<随机森林&特征重要性>

    完整代码: https://github.com/cindycindyhi/kaggle-Titanic 特征工程系列: Titanic系列之原始数据分析和数据处理 Titanic系列之数据变换 Ti ...

  9. R语言︱决策树族——随机森林算法

    每每以为攀得众山小,可.每每又切实来到起点,大牛们,缓缓脚步来俺笔记葩分享一下吧,please~ --------------------------- 笔者寄语:有一篇<有监督学习选择深度学习 ...

随机推荐

  1. 【BZOJ】1875: [SDOI2009]HH去散步

    题目链接:http://www.lydsy.com/JudgeOnline/problem.php?id=1875 注意的是路径不可以重复,所以这题把边看成点.每一条无向边拆成两条有向边. 令${F[ ...

  2. 力扣(LeetCode) 35. 搜索插入位置

    给定一个排序数组和一个目标值,在数组中找到目标值,并返回其索引.如果目标值不存在于数组中,返回它将会被按顺序插入的位置. 你可以假设数组中无重复元素. 示例 1: 输入: [1,3,5,6], 5 输 ...

  3. 《剑指offer》第五十五题(平衡二叉树)

    // 面试题55(二):平衡二叉树 // 题目:输入一棵二叉树的根结点,判断该树是不是平衡二叉树.如果某二叉树中 // 任意结点的左右子树的深度相差不超过1,那么它就是一棵平衡二叉树. #includ ...

  4. vue-cli3快速创建项目

    文档:https://cli.vuejs.org/zh/guide/ 条件: npm 更至最新 node >=8.9 1.全局安装 npm install -g @vue/cli 或 yarn ...

  5. Windows上搭建Flume运行环境

    1.如果没有安装过Java环境,则需首先安装JDK. 可参考<Windows上搭建Kafka运行环境>中的搭建环境安装JDK部分 2.官方下载Flume(当前为apache-flume-1 ...

  6. 批标准化 Batch Normalization

    2018-12-05 20:28:15 在机器学习领域有一个很重要的假设,即独立同分布假设,也就是说训练集和测试集是满足相同分布的,这是通过训练数据获得的模型能够在测试集获得好的效果的一个基本保障.而 ...

  7. MySQL Connector/J

    5.1 Developer Guide 1. MysQL为由Java语言编程的客户端程序提供连接:MySQL Connector/J,这是一个实现Java Database Connectivity( ...

  8. 练习:将从表读出来的时间戳除以1000(java读时间戳会多出3个000)用jackson包 实现

    练习:将从表读出来的时间戳除以1000(java读时间戳会多出3个000)jackson包 实现 entity @Entity @DynamicUpdate //自动更新日期 @Data //get/ ...

  9. java8新特性: lambda表达式:直接获得某个list/array/对象里面的字段集合

    java8新特性: lambda表达式:直接获得某个list/array/对象里面的字段集合 比如,我有一张表: entity Category.java service CategoryServic ...

  10. 雷林鹏分享:jQuery EasyUI 树形菜单 - 树形网格动态加载

    jQuery EasyUI 树形菜单 - 树形网格动态加载 动态加载树形网格有助于从服务器上加载部分的行数据,避免加载大型数据的长时间等待.本教程将向您展示如何创建带有动态加载特性的树形网格(Tree ...