随机森林是决策树的集合。随机森林通过组合许多决策树来降低过拟合的风险。spark.ml 的随机森林实现同时支持连续特征和分类特征,可用于二分类、多分类以及回归。

导入包

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.Row
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.Column
import org.apache.spark.sql.DataFrameReader
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.Encoder
import org.apache.spark.sql.DataFrameStatFunctions
import org.apache.spark.sql.functions._
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.feature.{ IndexToString, StringIndexer, VectorIndexer }
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.{ RandomForestClassificationModel, RandomForestClassifier }
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator }

导入源数据

// Column legend for the data set below:
// affairs:       frequency of extramarital affairs during the past year
// gender:        gender
// age:           age
// yearsmarried:  number of years married
// children:      whether the couple has children
// religiousness: degree of religiousness (5-point scale; 1 = opposed, 5 = very religious)
// education:     level of education
// occupation:    occupation code (7 categories, reverse-numbered;
//                NOTE(review): confirm the exact classification scheme against the data set's documentation)
// rating:        self-rating of the marriage (5-point scale; 1 = very unhappy, 5 = very happy)

// Create (or reuse) the SparkSession. This statement must sit on its own line:
// when it trails the comment above, it is commented out and `spark` is never defined.
val spark = SparkSession.builder().appName("Spark Random Forest Classifier").config("spark.some.config.option", "some-value").getOrCreate()

// For implicit conversions like converting RDDs to DataFrames
import spark.implicits._

// In-memory sample data, one tuple per respondent, in the column order of the legend above.
// Integer literals are widened to Double by the explicit element type.
val dataList: List[(Double, String, Double, Double, String, Double, Double, Double, Double)] = List(
(0, "male", 37, 10, "no", 3, 18, 7, 4),
(0, "female", 27, 4, "no", 4, 14, 6, 4),
(0, "female", 32, 15, "yes", 1, 12, 1, 4),
(0, "male", 57, 15, "yes", 5, 18, 6, 5),
(0, "male", 22, 0.75, "no", 2, 17, 6, 3),
(0, "female", 32, 1.5, "no", 2, 17, 5, 5),
(0, "female", 22, 0.75, "no", 2, 12, 1, 3),
(0, "male", 57, 15, "yes", 2, 14, 4, 4),
(0, "female", 32, 15, "yes", 4, 16, 1, 2),
(0, "male", 22, 1.5, "no", 4, 14, 4, 5),
(0, "male", 37, 15, "yes", 2, 20, 7, 2),
(0, "male", 27, 4, "yes", 4, 18, 6, 4),
(0, "male", 47, 15, "yes", 5, 17, 6, 4),
(0, "female", 22, 1.5, "no", 2, 17, 5, 4),
(0, "female", 27, 4, "no", 4, 14, 5, 4),
(0, "female", 37, 15, "yes", 1, 17, 5, 5),
(0, "female", 37, 15, "yes", 2, 18, 4, 3),
(0, "female", 22, 0.75, "no", 3, 16, 5, 4),
(0, "female", 22, 1.5, "no", 2, 16, 5, 5),
(0, "female", 27, 10, "yes", 2, 14, 1, 5),
(0, "female", 22, 1.5, "no", 2, 16, 5, 5),
(0, "female", 22, 1.5, "no", 2, 16, 5, 5),
(0, "female", 27, 10, "yes", 4, 16, 5, 4),
(0, "female", 32, 10, "yes", 3, 14, 1, 5),
(0, "male", 37, 4, "yes", 2, 20, 6, 4),
(0, "female", 22, 1.5, "no", 2, 18, 5, 5),
(0, "female", 27, 7, "no", 4, 16, 1, 5),
(0, "male", 42, 15, "yes", 5, 20, 6, 4),
(0, "male", 27, 4, "yes", 3, 16, 5, 5),
(0, "female", 27, 4, "yes", 3, 17, 5, 4),
(0, "male", 42, 15, "yes", 4, 20, 6, 3),
(0, "female", 22, 1.5, "no", 3, 16, 5, 5),
(0, "male", 27, 0.417, "no", 4, 17, 6, 4),
(0, "female", 42, 15, "yes", 5, 14, 5, 4),
(0, "male", 32, 4, "yes", 1, 18, 6, 4),
(0, "female", 22, 1.5, "no", 4, 16, 5, 3),
(0, "female", 42, 15, "yes", 3, 12, 1, 4),
(0, "female", 22, 4, "no", 4, 17, 5, 5),
(0, "male", 22, 1.5, "yes", 1, 14, 3, 5),
(0, "female", 22, 0.75, "no", 3, 16, 1, 5),
(0, "male", 32, 10, "yes", 5, 20, 6, 5),
(0, "male", 52, 15, "yes", 5, 18, 6, 3),
(0, "female", 22, 0.417, "no", 5, 14, 1, 4),
(0, "female", 27, 4, "yes", 2, 18, 6, 1),
(0, "female", 32, 7, "yes", 5, 17, 5, 3),
(0, "male", 22, 4, "no", 3, 16, 5, 5),
(0, "female", 27, 7, "yes", 4, 18, 6, 5),
(0, "female", 42, 15, "yes", 2, 18, 5, 4),
(0, "male", 27, 1.5, "yes", 4, 16, 3, 5),
(0, "male", 42, 15, "yes", 2, 20, 6, 4),
(0, "female", 22, 0.75, "no", 5, 14, 3, 5),
(0, "male", 32, 7, "yes", 2, 20, 6, 4),
(0, "male", 27, 4, "yes", 5, 20, 6, 5),
(0, "male", 27, 10, "yes", 4, 20, 6, 4),
(0, "male", 22, 4, "no", 1, 18, 5, 5),
(0, "female", 37, 15, "yes", 4, 14, 3, 1),
(0, "male", 22, 1.5, "yes", 5, 16, 4, 4),
(0, "female", 37, 15, "yes", 4, 17, 1, 5),
(0, "female", 27, 0.75, "no", 4, 17, 5, 4),
(0, "male", 32, 10, "yes", 4, 20, 6, 4),
(0, "female", 47, 15, "yes", 5, 14, 7, 2),
(0, "male", 37, 10, "yes", 3, 20, 6, 4),
(0, "female", 22, 0.75, "no", 2, 16, 5, 5),
(0, "male", 27, 4, "no", 2, 18, 4, 5),
(0, "male", 32, 7, "no", 4, 20, 6, 4),
(0, "male", 42, 15, "yes", 2, 17, 3, 5),
(0, "male", 37, 10, "yes", 4, 20, 6, 4),
(0, "female", 47, 15, "yes", 3, 17, 6, 5),
(0, "female", 22, 1.5, "no", 5, 16, 5, 5),
(0, "female", 27, 1.5, "no", 2, 16, 6, 4),
(0, "female", 27, 4, "no", 3, 17, 5, 5),
(0, "female", 32, 10, "yes", 5, 14, 4, 5),
(0, "female", 22, 0.125, "no", 2, 12, 5, 5),
(0, "male", 47, 15, "yes", 4, 14, 4, 3),
(0, "male", 32, 15, "yes", 1, 14, 5, 5),
(0, "male", 27, 7, "yes", 4, 16, 5, 5),
(0, "female", 22, 1.5, "yes", 3, 16, 5, 5),
(0, "male", 27, 4, "yes", 3, 17, 6, 5),
(0, "female", 22, 1.5, "no", 3, 16, 5, 5),
(0, "male", 57, 15, "yes", 2, 14, 7, 2),
(0, "male", 17.5, 1.5, "yes", 3, 18, 6, 5),
(0, "male", 57, 15, "yes", 4, 20, 6, 5),
(0, "female", 22, 0.75, "no", 2, 16, 3, 4),
(0, "male", 42, 4, "no", 4, 17, 3, 3),
(0, "female", 22, 1.5, "yes", 4, 12, 1, 5),
(0, "female", 22, 0.417, "no", 1, 17, 6, 4),
(0, "female", 32, 15, "yes", 4, 17, 5, 5),
(0, "female", 27, 1.5, "no", 3, 18, 5, 2),
(0, "female", 22, 1.5, "yes", 3, 14, 1, 5),
(0, "female", 37, 15, "yes", 3, 14, 1, 4),
(0, "female", 32, 15, "yes", 4, 14, 3, 4),
(0, "male", 37, 10, "yes", 2, 14, 5, 3),
(0, "male", 37, 10, "yes", 4, 16, 5, 4),
(0, "male", 57, 15, "yes", 5, 20, 5, 3),
(0, "male", 27, 0.417, "no", 1, 16, 3, 4),
(0, "female", 42, 15, "yes", 5, 14, 1, 5),
(0, "male", 57, 15, "yes", 3, 16, 6, 1),
(0, "male", 37, 10, "yes", 1, 16, 6, 4),
(0, "male", 37, 15, "yes", 3, 17, 5, 5),
(0, "male", 37, 15, "yes", 4, 20, 6, 5),
(0, "female", 27, 10, "yes", 5, 14, 1, 5),
(0, "male", 37, 10, "yes", 2, 18, 6, 4),
(0, "female", 22, 0.125, "no", 4, 12, 4, 5),
(0, "male", 57, 15, "yes", 5, 20, 6, 5),
(0, "female", 37, 15, "yes", 4, 18, 6, 4),
(0, "male", 22, 4, "yes", 4, 14, 6, 4),
(0, "male", 27, 7, "yes", 4, 18, 5, 4),
(0, "male", 57, 15, "yes", 4, 20, 5, 4),
(0, "male", 32, 15, "yes", 3, 14, 6, 3),
(0, "female", 22, 1.5, "no", 2, 14, 5, 4),
(0, "female", 32, 7, "yes", 4, 17, 1, 5),
(0, "female", 37, 15, "yes", 4, 17, 6, 5),
(0, "female", 32, 1.5, "no", 5, 18, 5, 5),
(0, "male", 42, 10, "yes", 5, 20, 7, 4),
(0, "female", 27, 7, "no", 3, 16, 5, 4),
(0, "male", 37, 15, "no", 4, 20, 6, 5),
(0, "male", 37, 15, "yes", 4, 14, 3, 2),
(0, "male", 32, 10, "no", 5, 18, 6, 4),
(0, "female", 22, 0.75, "no", 4, 16, 1, 5),
(0, "female", 27, 7, "yes", 4, 12, 2, 4),
(0, "female", 27, 7, "yes", 2, 16, 2, 5),
(0, "female", 42, 15, "yes", 5, 18, 5, 4),
(0, "male", 42, 15, "yes", 4, 17, 5, 3),
(0, "female", 27, 7, "yes", 2, 16, 1, 2),
(0, "female", 22, 1.5, "no", 3, 16, 5, 5),
(0, "male", 37, 15, "yes", 5, 20, 6, 5),
(0, "female", 22, 0.125, "no", 2, 14, 4, 5),
(0, "male", 27, 1.5, "no", 4, 16, 5, 5),
(0, "male", 32, 1.5, "no", 2, 18, 6, 5),
(0, "male", 27, 1.5, "no", 2, 17, 6, 5),
(0, "female", 27, 10, "yes", 4, 16, 1, 3),
(0, "male", 42, 15, "yes", 4, 18, 6, 5),
(0, "female", 27, 1.5, "no", 2, 16, 6, 5),
(0, "male", 27, 4, "no", 2, 18, 6, 3),
(0, "female", 32, 10, "yes", 3, 14, 5, 3),
(0, "female", 32, 15, "yes", 3, 18, 5, 4),
(0, "female", 22, 0.75, "no", 2, 18, 6, 5),
(0, "female", 37, 15, "yes", 2, 16, 1, 4),
(0, "male", 27, 4, "yes", 4, 20, 5, 5),
(0, "male", 27, 4, "no", 1, 20, 5, 4),
(0, "female", 27, 10, "yes", 2, 12, 1, 4),
(0, "female", 32, 15, "yes", 5, 18, 6, 4),
(0, "male", 27, 7, "yes", 5, 12, 5, 3),
(0, "male", 52, 15, "yes", 2, 18, 5, 4),
(0, "male", 27, 4, "no", 3, 20, 6, 3),
(0, "male", 37, 4, "yes", 1, 18, 5, 4),
(0, "male", 27, 4, "yes", 4, 14, 5, 4),
(0, "female", 52, 15, "yes", 5, 12, 1, 3),
(0, "female", 57, 15, "yes", 4, 16, 6, 4),
(0, "male", 27, 7, "yes", 1, 16, 5, 4),
(0, "male", 37, 7, "yes", 4, 20, 6, 3),
(0, "male", 22, 0.75, "no", 2, 14, 4, 3),
(0, "male", 32, 4, "yes", 2, 18, 5, 3),
(0, "male", 37, 15, "yes", 4, 20, 6, 3),
(0, "male", 22, 0.75, "yes", 2, 14, 4, 3),
(0, "male", 42, 15, "yes", 4, 20, 6, 3),
(0, "female", 52, 15, "yes", 5, 17, 1, 1),
(0, "female", 37, 15, "yes", 4, 14, 1, 2),
(0, "male", 27, 7, "yes", 4, 14, 5, 3),
(0, "male", 32, 4, "yes", 2, 16, 5, 5),
(0, "female", 27, 4, "yes", 2, 18, 6, 5),
(0, "female", 27, 4, "yes", 2, 18, 5, 5),
(0, "male", 37, 15, "yes", 5, 18, 6, 5),
(0, "female", 47, 15, "yes", 5, 12, 5, 4),
(0, "female", 32, 10, "yes", 3, 17, 1, 4),
(0, "female", 27, 1.5, "yes", 4, 17, 1, 2),
(0, "female", 57, 15, "yes", 2, 18, 5, 2),
(0, "female", 22, 1.5, "no", 4, 14, 5, 4),
(0, "male", 42, 15, "yes", 3, 14, 3, 4),
(0, "male", 57, 15, "yes", 4, 9, 2, 2),
(0, "male", 57, 15, "yes", 4, 20, 6, 5),
(0, "female", 22, 0.125, "no", 4, 14, 4, 5),
(0, "female", 32, 10, "yes", 4, 14, 1, 5),
(0, "female", 42, 15, "yes", 3, 18, 5, 4),
(0, "female", 27, 1.5, "no", 2, 18, 6, 5),
(0, "male", 32, 0.125, "yes", 2, 18, 5, 2),
(0, "female", 27, 4, "no", 3, 16, 5, 4),
(0, "female", 27, 10, "yes", 2, 16, 1, 4),
(0, "female", 32, 7, "yes", 4, 16, 1, 3),
(0, "female", 37, 15, "yes", 4, 14, 5, 4),
(0, "female", 42, 15, "yes", 5, 17, 6, 2),
(0, "male", 32, 1.5, "yes", 4, 14, 6, 5),
(0, "female", 32, 4, "yes", 3, 17, 5, 3),
(0, "female", 37, 7, "no", 4, 18, 5, 5),
(0, "female", 22, 0.417, "yes", 3, 14, 3, 5),
(0, "female", 27, 7, "yes", 4, 14, 1, 5),
(0, "male", 27, 0.75, "no", 3, 16, 5, 5),
(0, "male", 27, 4, "yes", 2, 20, 5, 5),
(0, "male", 32, 10, "yes", 4, 16, 4, 5),
(0, "male", 32, 15, "yes", 1, 14, 5, 5),
(0, "male", 22, 0.75, "no", 3, 17, 4, 5),
(0, "female", 27, 7, "yes", 4, 17, 1, 4),
(0, "male", 27, 0.417, "yes", 4, 20, 5, 4),
(0, "male", 37, 15, "yes", 4, 20, 5, 4),
(0, "female", 37, 15, "yes", 2, 14, 1, 3),
(0, "male", 22, 4, "yes", 1, 18, 5, 4),
(0, "male", 37, 15, "yes", 4, 17, 5, 3),
(0, "female", 22, 1.5, "no", 2, 14, 4, 5),
(0, "male", 52, 15, "yes", 4, 14, 6, 2),
(0, "female", 22, 1.5, "no", 4, 17, 5, 5),
(0, "male", 32, 4, "yes", 5, 14, 3, 5),
(0, "male", 32, 4, "yes", 2, 14, 3, 5),
(0, "female", 22, 1.5, "no", 3, 16, 6, 5),
(0, "male", 27, 0.75, "no", 2, 18, 3, 3),
(0, "female", 22, 7, "yes", 2, 14, 5, 2),
(0, "female", 27, 0.75, "no", 2, 17, 5, 3),
(0, "female", 37, 15, "yes", 4, 12, 1, 2),
(0, "female", 22, 1.5, "no", 1, 14, 1, 5),
(0, "female", 37, 10, "no", 2, 12, 4, 4),
(0, "female", 37, 15, "yes", 4, 18, 5, 3),
(0, "female", 42, 15, "yes", 3, 12, 3, 3),
(0, "male", 22, 4, "no", 2, 18, 5, 5),
(0, "male", 52, 7, "yes", 2, 20, 6, 2),
(0, "male", 27, 0.75, "no", 2, 17, 5, 5),
(0, "female", 27, 4, "no", 2, 17, 4, 5),
(0, "male", 42, 1.5, "no", 5, 20, 6, 5),
(0, "male", 22, 1.5, "no", 4, 17, 6, 5),
(0, "male", 22, 4, "no", 4, 17, 5, 3),
(0, "female", 22, 4, "yes", 1, 14, 5, 4),
(0, "male", 37, 15, "yes", 5, 20, 4, 5),
(0, "female", 37, 10, "yes", 3, 16, 6, 3),
(0, "male", 42, 15, "yes", 4, 17, 6, 5),
(0, "female", 47, 15, "yes", 4, 17, 5, 5),
(0, "male", 22, 1.5, "no", 4, 16, 5, 4),
(0, "female", 32, 10, "yes", 3, 12, 1, 4),
(0, "female", 22, 7, "yes", 1, 14, 3, 5),
(0, "female", 32, 10, "yes", 4, 17, 5, 4),
(0, "male", 27, 1.5, "yes", 2, 16, 2, 4),
(0, "male", 37, 15, "yes", 4, 14, 5, 5),
(0, "male", 42, 4, "yes", 3, 14, 4, 5),
(0, "female", 37, 15, "yes", 5, 14, 5, 4),
(0, "female", 32, 7, "yes", 4, 17, 5, 5),
(0, "female", 42, 15, "yes", 4, 18, 6, 5),
(0, "male", 27, 4, "no", 4, 18, 6, 4),
(0, "male", 22, 0.75, "no", 4, 18, 6, 5),
(0, "male", 27, 4, "yes", 4, 14, 5, 3),
(0, "female", 22, 0.75, "no", 5, 18, 1, 5),
(0, "female", 52, 15, "yes", 5, 9, 5, 5),
(0, "male", 32, 10, "yes", 3, 14, 5, 5),
(0, "female", 37, 15, "yes", 4, 16, 4, 4),
(0, "male", 32, 7, "yes", 2, 20, 5, 4),
(0, "female", 42, 15, "yes", 3, 18, 1, 4),
(0, "male", 32, 15, "yes", 1, 16, 5, 5),
(0, "male", 27, 4, "yes", 3, 18, 5, 5),
(0, "female", 32, 15, "yes", 4, 12, 3, 4),
(0, "male", 22, 0.75, "yes", 3, 14, 2, 4),
(0, "female", 22, 1.5, "no", 3, 16, 5, 3),
(0, "female", 42, 15, "yes", 4, 14, 3, 5),
(0, "female", 52, 15, "yes", 3, 16, 5, 4),
(0, "male", 37, 15, "yes", 5, 20, 6, 4),
(0, "female", 47, 15, "yes", 4, 12, 2, 3),
(0, "male", 57, 15, "yes", 2, 20, 6, 4),
(0, "male", 32, 7, "yes", 4, 17, 5, 5),
(0, "female", 27, 7, "yes", 4, 17, 1, 4),
(0, "male", 22, 1.5, "no", 1, 18, 6, 5),
(0, "female", 22, 4, "yes", 3, 9, 1, 4),
(0, "female", 22, 1.5, "no", 2, 14, 1, 5),
(0, "male", 42, 15, "yes", 2, 20, 6, 4),
(0, "male", 57, 15, "yes", 4, 9, 2, 4),
(0, "female", 27, 7, "yes", 2, 18, 1, 5),
(0, "female", 22, 4, "yes", 3, 14, 1, 5),
(0, "male", 37, 15, "yes", 4, 14, 5, 3),
(0, "male", 32, 7, "yes", 1, 18, 6, 4),
(0, "female", 22, 1.5, "no", 2, 14, 5, 5),
(0, "female", 22, 1.5, "yes", 3, 12, 1, 3),
(0, "male", 52, 15, "yes", 2, 14, 5, 5),
(0, "female", 37, 15, "yes", 2, 14, 1, 1),
(0, "female", 32, 10, "yes", 2, 14, 5, 5),
(0, "male", 42, 15, "yes", 4, 20, 4, 5),
(0, "female", 27, 4, "yes", 3, 18, 4, 5),
(0, "male", 37, 15, "yes", 4, 20, 6, 5),
(0, "male", 27, 1.5, "no", 3, 18, 5, 5),
(0, "female", 22, 0.125, "no", 2, 16, 6, 3),
(0, "male", 32, 10, "yes", 2, 20, 6, 3),
(0, "female", 27, 4, "no", 4, 18, 5, 4),
(0, "female", 27, 7, "yes", 2, 12, 5, 1),
(0, "male", 32, 4, "yes", 5, 18, 6, 3),
(0, "female", 37, 15, "yes", 2, 17, 5, 5),
(0, "male", 47, 15, "no", 4, 20, 6, 4),
(0, "male", 27, 1.5, "no", 1, 18, 5, 5),
(0, "male", 37, 15, "yes", 4, 20, 6, 4),
(0, "female", 32, 15, "yes", 4, 18, 1, 4),
(0, "female", 32, 7, "yes", 4, 17, 5, 4),
(0, "female", 42, 15, "yes", 3, 14, 1, 3),
(0, "female", 27, 7, "yes", 3, 16, 1, 4),
(0, "male", 27, 1.5, "no", 3, 16, 4, 2),
(0, "male", 22, 1.5, "no", 3, 16, 3, 5),
(0, "male", 27, 4, "yes", 3, 16, 4, 2),
(0, "female", 27, 7, "yes", 3, 12, 1, 2),
(0, "female", 37, 15, "yes", 2, 18, 5, 4),
(0, "female", 37, 7, "yes", 3, 14, 4, 4),
(0, "male", 22, 1.5, "no", 2, 16, 5, 5),
(0, "male", 37, 15, "yes", 5, 20, 5, 4),
(0, "female", 22, 1.5, "no", 4, 16, 5, 3),
(0, "female", 32, 10, "yes", 4, 16, 1, 5),
(0, "male", 27, 4, "no", 2, 17, 5, 3),
(0, "female", 22, 0.417, "no", 4, 14, 5, 5),
(0, "female", 27, 4, "no", 2, 18, 5, 5),
(0, "male", 37, 15, "yes", 4, 18, 5, 3),
(0, "male", 37, 10, "yes", 5, 20, 7, 4),
(0, "female", 27, 7, "yes", 2, 14, 4, 2),
(0, "male", 32, 4, "yes", 2, 16, 5, 5),
(0, "male", 32, 4, "yes", 2, 16, 6, 4),
(0, "male", 22, 1.5, "no", 3, 18, 4, 5),
(0, "female", 22, 4, "yes", 4, 14, 3, 4),
(0, "female", 17.5, 0.75, "no", 2, 18, 5, 4),
(0, "male", 32, 10, "yes", 4, 20, 4, 5),
(0, "female", 32, 0.75, "no", 5, 14, 3, 3),
(0, "male", 37, 15, "yes", 4, 17, 5, 3),
(0, "male", 32, 4, "no", 3, 14, 4, 5),
(0, "female", 27, 1.5, "no", 2, 17, 3, 2),
(0, "female", 22, 7, "yes", 4, 14, 1, 5),
(0, "male", 47, 15, "yes", 5, 14, 6, 5),
(0, "male", 27, 4, "yes", 1, 16, 4, 4),
(0, "female", 37, 15, "yes", 5, 14, 1, 3),
(0, "male", 42, 4, "yes", 4, 18, 5, 5),
(0, "female", 32, 4, "yes", 2, 14, 1, 5),
(0, "male", 52, 15, "yes", 2, 14, 7, 4),
(0, "female", 22, 1.5, "no", 2, 16, 1, 4),
(0, "male", 52, 15, "yes", 4, 12, 2, 4),
(0, "female", 22, 0.417, "no", 3, 17, 1, 5),
(0, "female", 22, 1.5, "no", 2, 16, 5, 5),
(0, "male", 27, 4, "yes", 4, 20, 6, 4),
(0, "female", 32, 15, "yes", 4, 14, 1, 5),
(0, "female", 27, 1.5, "no", 2, 16, 3, 5),
(0, "male", 32, 4, "no", 1, 20, 6, 5),
(0, "male", 37, 15, "yes", 3, 20, 6, 4),
(0, "female", 32, 10, "no", 2, 16, 6, 5),
(0, "female", 32, 10, "yes", 5, 14, 5, 5),
(0, "male", 37, 1.5, "yes", 4, 18, 5, 3),
(0, "male", 32, 1.5, "no", 2, 18, 4, 4),
(0, "female", 32, 10, "yes", 4, 14, 1, 4),
(0, "female", 47, 15, "yes", 4, 18, 5, 4),
(0, "female", 27, 10, "yes", 5, 12, 1, 5),
(0, "male", 27, 4, "yes", 3, 16, 4, 5),
(0, "female", 37, 15, "yes", 4, 12, 4, 2),
(0, "female", 27, 0.75, "no", 4, 16, 5, 5),
(0, "female", 37, 15, "yes", 4, 16, 1, 5),
(0, "female", 32, 15, "yes", 3, 16, 1, 5),
(0, "female", 27, 10, "yes", 2, 16, 1, 5),
(0, "male", 27, 7, "no", 2, 20, 6, 5),
(0, "female", 37, 15, "yes", 2, 14, 1, 3),
(0, "male", 27, 1.5, "yes", 2, 17, 4, 4),
(0, "female", 22, 0.75, "yes", 2, 14, 1, 5),
(0, "male", 22, 4, "yes", 4, 14, 2, 4),
(0, "male", 42, 0.125, "no", 4, 17, 6, 4),
(0, "male", 27, 1.5, "yes", 4, 18, 6, 5),
(0, "male", 27, 7, "yes", 3, 16, 6, 3),
(0, "female", 52, 15, "yes", 4, 14, 1, 3),
(0, "male", 27, 1.5, "no", 5, 20, 5, 2),
(0, "female", 27, 1.5, "no", 2, 16, 5, 5),
(0, "female", 27, 1.5, "no", 3, 17, 5, 5),
(0, "male", 22, 0.125, "no", 5, 16, 4, 4),
(0, "female", 27, 4, "yes", 4, 16, 1, 5),
(0, "female", 27, 4, "yes", 4, 12, 1, 5),
(0, "female", 47, 15, "yes", 2, 14, 5, 5),
(0, "female", 32, 15, "yes", 3, 14, 5, 3),
(0, "male", 42, 7, "yes", 2, 16, 5, 5),
(0, "male", 22, 0.75, "no", 4, 16, 6, 4),
(0, "male", 27, 0.125, "no", 3, 20, 6, 5),
(0, "male", 32, 10, "yes", 3, 20, 6, 5),
(0, "female", 22, 0.417, "no", 5, 14, 4, 5),
(0, "female", 47, 15, "yes", 5, 14, 1, 4),
(0, "female", 32, 10, "yes", 3, 14, 1, 5),
(0, "male", 57, 15, "yes", 4, 17, 5, 5),
(0, "male", 27, 4, "yes", 3, 20, 6, 5),
(0, "female", 32, 7, "yes", 4, 17, 1, 5),
(0, "female", 37, 10, "yes", 4, 16, 1, 5),
(0, "female", 32, 10, "yes", 1, 18, 1, 4),
(0, "female", 22, 4, "no", 3, 14, 1, 4),
(0, "female", 27, 7, "yes", 4, 14, 3, 2),
(0, "male", 57, 15, "yes", 5, 18, 5, 2),
(0, "male", 32, 7, "yes", 2, 18, 5, 5),
(0, "female", 27, 1.5, "no", 4, 17, 1, 3),
(0, "male", 22, 1.5, "no", 4, 14, 5, 5),
(0, "female", 22, 1.5, "yes", 4, 14, 5, 4),
(0, "female", 32, 7, "yes", 3, 16, 1, 5),
(0, "female", 47, 15, "yes", 3, 16, 5, 4),
(0, "female", 22, 0.75, "no", 3, 16, 1, 5),
(0, "female", 22, 1.5, "yes", 2, 14, 5, 5),
(0, "female", 27, 4, "yes", 1, 16, 5, 5),
(0, "male", 52, 15, "yes", 4, 16, 5, 5),
(0, "male", 32, 10, "yes", 4, 20, 6, 5),
(0, "male", 47, 15, "yes", 4, 16, 6, 4),
(0, "female", 27, 7, "yes", 2, 14, 1, 2),
(0, "female", 22, 1.5, "no", 4, 14, 4, 5),
(0, "female", 32, 10, "yes", 2, 16, 5, 4),
(0, "female", 22, 0.75, "no", 2, 16, 5, 4),
(0, "female", 22, 1.5, "no", 2, 16, 5, 5),
(0, "female", 42, 15, "yes", 3, 18, 6, 4),
(0, "female", 27, 7, "yes", 5, 14, 4, 5),
(0, "male", 42, 15, "yes", 4, 16, 4, 4),
(0, "female", 57, 15, "yes", 3, 18, 5, 2),
(0, "male", 42, 15, "yes", 3, 18, 6, 2),
(0, "female", 32, 7, "yes", 2, 14, 1, 2),
(0, "male", 22, 4, "no", 5, 12, 4, 5),
(0, "female", 22, 1.5, "no", 1, 16, 6, 5),
(0, "female", 22, 0.75, "no", 1, 14, 4, 5),
(0, "female", 32, 15, "yes", 4, 12, 1, 5),
(0, "male", 22, 1.5, "no", 2, 18, 5, 3),
(0, "male", 27, 4, "yes", 5, 17, 2, 5),
(0, "female", 27, 4, "yes", 4, 12, 1, 5),
(0, "male", 42, 15, "yes", 5, 18, 5, 4),
(0, "male", 32, 1.5, "no", 2, 20, 7, 3),
(0, "male", 57, 15, "no", 4, 9, 3, 1),
(0, "male", 37, 7, "no", 4, 18, 5, 5),
(0, "male", 52, 15, "yes", 2, 17, 5, 4),
(0, "male", 47, 15, "yes", 4, 17, 6, 5),
(0, "female", 27, 7, "no", 2, 17, 5, 4),
(0, "female", 27, 7, "yes", 4, 14, 5, 5),
(0, "female", 22, 4, "no", 2, 14, 3, 3),
(0, "male", 37, 7, "yes", 2, 20, 6, 5),
(0, "male", 27, 7, "no", 4, 12, 4, 3),
(0, "male", 42, 10, "yes", 4, 18, 6, 4),
(0, "female", 22, 1.5, "no", 3, 14, 1, 5),
(0, "female", 22, 4, "yes", 2, 14, 1, 3),
(0, "female", 57, 15, "no", 4, 20, 6, 5),
(0, "male", 37, 15, "yes", 4, 14, 4, 3),
(0, "female", 27, 7, "yes", 3, 18, 5, 5),
(0, "female", 17.5, 10, "no", 4, 14, 4, 5),
(0, "male", 22, 4, "yes", 4, 16, 5, 5),
(0, "female", 27, 4, "yes", 2, 16, 1, 4),
(0, "female", 37, 15, "yes", 2, 14, 5, 1),
(0, "female", 22, 1.5, "no", 5, 14, 1, 4),
(0, "male", 27, 7, "yes", 2, 20, 5, 4),
(0, "male", 27, 4, "yes", 4, 14, 5, 5),
(0, "male", 22, 0.125, "no", 1, 16, 3, 5),
(0, "female", 27, 7, "yes", 4, 14, 1, 4),
(0, "female", 32, 15, "yes", 5, 16, 5, 3),
(0, "male", 32, 10, "yes", 4, 18, 5, 4),
(0, "female", 32, 15, "yes", 2, 14, 3, 4),
(0, "female", 22, 1.5, "no", 3, 17, 5, 5),
(0, "male", 27, 4, "yes", 4, 17, 4, 4),
(0, "female", 52, 15, "yes", 5, 14, 1, 5),
(0, "female", 27, 7, "yes", 2, 12, 1, 2),
(0, "female", 27, 7, "yes", 3, 12, 1, 4),
(0, "female", 42, 15, "yes", 2, 14, 1, 4),
(0, "female", 42, 15, "yes", 4, 14, 5, 4),
(0, "male", 27, 7, "yes", 4, 14, 3, 3),
(0, "male", 27, 7, "yes", 2, 20, 6, 2),
(0, "female", 42, 15, "yes", 3, 12, 3, 3),
(0, "male", 27, 4, "yes", 3, 16, 3, 5),
(0, "female", 27, 7, "yes", 3, 14, 1, 4),
(0, "female", 22, 1.5, "no", 2, 14, 4, 5),
(0, "female", 27, 4, "yes", 4, 14, 1, 4),
(0, "female", 22, 4, "no", 4, 14, 5, 5),
(0, "female", 22, 1.5, "no", 2, 16, 4, 5),
(0, "male", 47, 15, "no", 4, 14, 5, 4),
(0, "male", 37, 10, "yes", 2, 18, 6, 2),
(0, "male", 37, 15, "yes", 3, 17, 5, 4),
(0, "female", 27, 4, "yes", 2, 16, 1, 4),
(3, "male", 27, 1.5, "no", 3, 18, 4, 4),
(3, "female", 27, 4, "yes", 3, 17, 1, 5),
(7, "male", 37, 15, "yes", 5, 18, 6, 2),
(12, "female", 32, 10, "yes", 3, 17, 5, 2),
(1, "male", 22, 0.125, "no", 4, 16, 5, 5),
(1, "female", 22, 1.5, "yes", 2, 14, 1, 5),
(12, "male", 37, 15, "yes", 4, 14, 5, 2),
(7, "female", 22, 1.5, "no", 2, 14, 3, 4),
(2, "male", 37, 15, "yes", 2, 18, 6, 4),
(3, "female", 32, 15, "yes", 4, 12, 3, 2),
(1, "female", 37, 15, "yes", 4, 14, 4, 2),
(7, "female", 42, 15, "yes", 3, 17, 1, 4),
(12, "female", 42, 15, "yes", 5, 9, 4, 1),
(12, "male", 37, 10, "yes", 2, 20, 6, 2),
(12, "female", 32, 15, "yes", 3, 14, 1, 2),
(3, "male", 27, 4, "no", 1, 18, 6, 5),
(7, "male", 37, 10, "yes", 2, 18, 7, 3),
(7, "female", 27, 4, "no", 3, 17, 5, 5),
(1, "male", 42, 15, "yes", 4, 16, 5, 5),
(1, "female", 47, 15, "yes", 5, 14, 4, 5),
(7, "female", 27, 4, "yes", 3, 18, 5, 4),
(1, "female", 27, 7, "yes", 5, 14, 1, 4),
(12, "male", 27, 1.5, "yes", 3, 17, 5, 4),
(12, "female", 27, 7, "yes", 4, 14, 6, 2),
(3, "female", 42, 15, "yes", 4, 16, 5, 4),
(7, "female", 27, 10, "yes", 4, 12, 7, 3),
(1, "male", 27, 1.5, "no", 2, 18, 5, 2),
(1, "male", 32, 4, "no", 4, 20, 6, 4),
(1, "female", 27, 7, "yes", 3, 14, 1, 3),
(3, "female", 32, 10, "yes", 4, 14, 1, 4),
(3, "male", 27, 4, "yes", 2, 18, 7, 2),
(1, "female", 17.5, 0.75, "no", 5, 14, 4, 5),
(1, "female", 32, 10, "yes", 4, 18, 1, 5),
(7, "female", 32, 7, "yes", 2, 17, 6, 4),
(7, "male", 37, 15, "yes", 2, 20, 6, 4),
(7, "female", 37, 10, "no", 1, 20, 5, 3),
(12, "female", 32, 10, "yes", 2, 16, 5, 5),
(7, "male", 52, 15, "yes", 2, 20, 6, 4),
(7, "female", 42, 15, "yes", 1, 12, 1, 3),
(1, "male", 52, 15, "yes", 2, 20, 6, 3),
(2, "male", 37, 15, "yes", 3, 18, 6, 5),
(12, "female", 22, 4, "no", 3, 12, 3, 4),
(12, "male", 27, 7, "yes", 1, 18, 6, 2),
(1, "male", 27, 4, "yes", 3, 18, 5, 5),
(12, "male", 47, 15, "yes", 4, 17, 6, 5),
(12, "female", 42, 15, "yes", 4, 12, 1, 1),
(7, "male", 27, 4, "no", 3, 14, 3, 4),
(7, "female", 32, 7, "yes", 4, 18, 4, 5),
(1, "male", 32, 0.417, "yes", 3, 12, 3, 4),
(3, "male", 47, 15, "yes", 5, 16, 5, 4),
(12, "male", 37, 15, "yes", 2, 20, 5, 4),
(7, "male", 22, 4, "yes", 2, 17, 6, 4),
(1, "male", 27, 4, "no", 2, 14, 4, 5),
(7, "female", 52, 15, "yes", 5, 16, 1, 3),
(1, "male", 27, 4, "no", 3, 14, 3, 3),
(1, "female", 27, 10, "yes", 4, 16, 1, 4),
(1, "male", 32, 7, "yes", 3, 14, 7, 4),
(7, "male", 32, 7, "yes", 2, 18, 4, 1),
(3, "male", 22, 1.5, "no", 1, 14, 3, 2),
(7, "male", 22, 4, "yes", 3, 18, 6, 4),
(7, "male", 42, 15, "yes", 4, 20, 6, 4),
(2, "female", 57, 15, "yes", 1, 18, 5, 4),
(7, "female", 32, 4, "yes", 3, 18, 5, 2),
(1, "male", 27, 4, "yes", 1, 16, 4, 4),
(7, "male", 32, 7, "yes", 4, 16, 1, 4),
(2, "male", 57, 15, "yes", 1, 17, 4, 4),
(7, "female", 42, 15, "yes", 4, 14, 5, 2),
(7, "male", 37, 10, "yes", 1, 18, 5, 3),
(3, "male", 42, 15, "yes", 3, 17, 6, 1),
(1, "female", 52, 15, "yes", 3, 14, 4, 4),
(2, "female", 27, 7, "yes", 3, 17, 5, 3),
(12, "male", 32, 7, "yes", 2, 12, 4, 2),
(1, "male", 22, 4, "no", 4, 14, 2, 5),
(3, "male", 27, 7, "yes", 3, 18, 6, 4),
(12, "female", 37, 15, "yes", 1, 18, 5, 5),
(7, "female", 32, 15, "yes", 3, 17, 1, 3),
(7, "female", 27, 7, "no", 2, 17, 5, 5),
(1, "female", 32, 7, "yes", 3, 17, 5, 3),
(1, "male", 32, 1.5, "yes", 2, 14, 2, 4),
(12, "female", 42, 15, "yes", 4, 14, 1, 2),
(7, "male", 32, 10, "yes", 3, 14, 5, 4),
(7, "male", 37, 4, "yes", 1, 20, 6, 3),
(1, "female", 27, 4, "yes", 2, 16, 5, 3),
(12, "female", 42, 15, "yes", 3, 14, 4, 3),
(1, "male", 27, 10, "yes", 5, 20, 6, 5),
(12, "male", 37, 10, "yes", 2, 20, 6, 2),
(12, "female", 27, 7, "yes", 1, 14, 3, 3),
(3, "female", 27, 7, "yes", 4, 12, 1, 2),
(3, "male", 32, 10, "yes", 2, 14, 4, 4),
(12, "female", 17.5, 0.75, "yes", 2, 12, 1, 3),
(12, "female", 32, 15, "yes", 3, 18, 5, 4),
(2, "female", 22, 7, "no", 4, 14, 4, 3),
(1, "male", 32, 7, "yes", 4, 20, 6, 5),
(7, "male", 27, 4, "yes", 2, 18, 6, 2),
(1, "female", 22, 1.5, "yes", 5, 14, 5, 3),
(12, "female", 32, 15, "no", 3, 17, 5, 1),
(12, "female", 42, 15, "yes", 2, 12, 1, 2),
(7, "male", 42, 15, "yes", 3, 20, 5, 4),
(12, "male", 32, 10, "no", 2, 18, 4, 2),
(12, "female", 32, 15, "yes", 3, 9, 1, 1),
(7, "male", 57, 15, "yes", 5, 20, 4, 5),
(12, "male", 47, 15, "yes", 4, 20, 6, 4),
(2, "female", 42, 15, "yes", 2, 17, 6, 3),
(12, "male", 37, 15, "yes", 3, 17, 6, 3),
(12, "male", 37, 15, "yes", 5, 17, 5, 2),
(7, "male", 27, 10, "yes", 2, 20, 6, 4),
(2, "male", 37, 15, "yes", 2, 16, 5, 4),
(12, "female", 32, 15, "yes", 1, 14, 5, 2),
(7, "male", 32, 10, "yes", 3, 17, 6, 3),
(2, "male", 37, 15, "yes", 4, 18, 5, 1),
(7, "female", 27, 1.5, "no", 2, 17, 5, 5),
(3, "female", 47, 15, "yes", 2, 17, 5, 2),
(12, "male", 37, 15, "yes", 2, 17, 5, 4),
(12, "female", 27, 4, "no", 2, 14, 5, 5),
(2, "female", 27, 10, "yes", 4, 14, 1, 5),
(1, "female", 22, 4, "yes", 3, 16, 1, 3),
(12, "male", 52, 7, "no", 4, 16, 5, 5),
(2, "female", 27, 4, "yes", 1, 16, 3, 5),
(7, "female", 37, 15, "yes", 2, 17, 6, 4),
(2, "female", 27, 4, "no", 1, 17, 3, 1),
(12, "female", 17.5, 0.75, "yes", 2, 12, 3, 5),
(7, "female", 32, 15, "yes", 5, 18, 5, 4),
(7, "female", 22, 4, "no", 1, 16, 3, 5),
(2, "male", 32, 4, "yes", 4, 18, 6, 4),
(1, "female", 22, 1.5, "yes", 3, 18, 5, 2),
(3, "female", 42, 15, "yes", 2, 17, 5, 4),
(1, "male", 32, 7, "yes", 4, 16, 4, 4),
(12, "male", 37, 15, "no", 3, 14, 6, 2),
(1, "male", 42, 15, "yes", 3, 16, 6, 3),
(1, "male", 27, 4, "yes", 1, 18, 5, 4),
(2, "male", 37, 15, "yes", 4, 20, 7, 3),
(7, "male", 37, 15, "yes", 3, 20, 6, 4),
(3, "male", 22, 1.5, "no", 2, 12, 3, 3),
(3, "male", 32, 4, "yes", 3, 20, 6, 2),
(2, "male", 32, 15, "yes", 5, 20, 6, 5),
(12, "female", 52, 15, "yes", 1, 18, 5, 5),
(12, "male", 47, 15, "no", 1, 18, 6, 5),
(3, "female", 32, 15, "yes", 4, 16, 4, 4),
(7, "female", 32, 15, "yes", 3, 14, 3, 2),
(7, "female", 27, 7, "yes", 4, 16, 1, 2),
(12, "male", 42, 15, "yes", 3, 18, 6, 2),
(7, "female", 42, 15, "yes", 2, 14, 3, 2),
(12, "male", 27, 7, "yes", 2, 17, 5, 4),
(3, "male", 32, 10, "yes", 4, 14, 4, 3),
(7, "male", 47, 15, "yes", 3, 16, 4, 2),
(1, "male", 22, 1.5, "yes", 1, 12, 2, 5),
(7, "female", 32, 10, "yes", 2, 18, 5, 4),
(2, "male", 32, 10, "yes", 2, 17, 6, 5),
(2, "male", 22, 7, "yes", 3, 18, 6, 2),
(1, "female", 32, 15, "yes", 3, 14, 1, 5))

// Turn the tuple list into a DataFrame with named columns
// (toDF on a local collection relies on spark.implicits._).
val data = dataList.toDF("affairs", "gender", "age", "yearsmarried", "children", "religiousness", "education", "occupation", "rating")

随机森林建模

// Register the raw DataFrame as a temp view so it can be queried with SQL.
data.createOrReplaceTempView("data")

// Encode the non-numeric columns (and the target) as numbers:
//   label    = 0 when affairs = 0, otherwise 1
//   gender   = 0 for 'female', otherwise 1
//   children = 0 for 'no', otherwise 1
val labelWhere = "case when affairs=0 then 0 else cast(1 as double) end as label"
val genderWhere = "case when gender='female' then 0 else cast(1 as double) end as gender"
val childrenWhere = "case when children='no' then 0 else cast(1 as double) end as children"

// Each definition needs its own line; fused onto one line they do not compile.
val dataLabelDF = spark.sql(s"select $labelWhere, $genderWhere,age,yearsmarried,$childrenWhere,religiousness,education,occupation,rating from data")
val featuresArray = Array("gender", "age", "yearsmarried", "children", "religiousness", "education", "occupation", "rating")

// Assemble the feature columns into a single vector column named "features".
val assembler = new VectorAssembler().setInputCols(featuresArray).setOutputCol("features")
val vecDF: DataFrame = assembler.transform(dataLabelDF)
vecDF.show(10, truncate = false)

// Split the data into training and test sets (30% held out for testing).
// No seed is passed, so the split differs from run to run.
val Array(trainingDF, testDF) = vecDF.randomSplit(Array(0.7, 0.3))

// Index the labels, adding metadata to the label column.
// Fitted on the full data set (vecDF) rather than on the training split only.
val labelIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(vecDF)
//labelIndexer.transform(vecDF).show(10, truncate = false)

// Automatically identify categorical features and index them.
// Features with more than 5 distinct values are treated as continuous.
val featureIndexer = new VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").setMaxCategories(5).fit(vecDF)
//featureIndexer.transform(vecDF).show(10, truncate = false)

// Configure the random forest classifier (10 trees) on the indexed columns.
val rf = new RandomForestClassifier().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setNumTrees(10)

// Convert indexed predictions back to the original label values.
val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)

// Chain indexers and forest in a Pipeline.
val pipeline = new Pipeline().setStages(Array(labelIndexer, featureIndexer, rf, labelConverter))

// Train model. This also runs the indexers.
val model = pipeline.fit(trainingDF)

// Show every parameter value of the fitted random forest stage.
// The result is only echoed when run in the REPL; in a compiled program it is discarded.
model.stages(2).extractParamMap()

// Make predictions on the held-out test set.
val predictions = model.transform(testDF)

// Select example rows to display.
predictions.select("predictedLabel", "label", "features").show(10, false)

// Compare (prediction, true label) and compute the test error as 1 - accuracy.
val evaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("accuracy")
val accuracy = evaluator.evaluate(predictions)
println("Test Error = " + (1.0 - accuracy))

// Index 2 in stages(2) corresponds to `rf` in the pipeline's stage array, so the
// fitted stage is cast to RandomForestClassificationModel to print the learned trees.
val rfModel = model.stages(2).asInstanceOf[RandomForestClassificationModel]
println("Learned classification forest model:\n" + rfModel.toDebugString)

代码执行结果

vecDF.show(10, truncate = false)
+-----+------+----+------------+--------+-------------+---------+----------+------+------------------------------------+
|label|gender|age |yearsmarried|children|religiousness|education|occupation|rating|features |
+-----+------+----+------------+--------+-------------+---------+----------+------+------------------------------------+
|0.0 |1.0 |37.0|10.0 |0.0 |3.0 |18.0 |7.0 |4.0 |[1.0,37.0,10.0,0.0,3.0,18.0,7.0,4.0]|
|0.0 |0.0 |27.0|4.0 |0.0 |4.0 |14.0 |6.0 |4.0 |[0.0,27.0,4.0,0.0,4.0,14.0,6.0,4.0] |
|0.0 |0.0 |32.0|15.0 |1.0 |1.0 |12.0 |1.0 |4.0 |[0.0,32.0,15.0,1.0,1.0,12.0,1.0,4.0]|
|0.0 |1.0 |57.0|15.0 |1.0 |5.0 |18.0 |6.0 |5.0 |[1.0,57.0,15.0,1.0,5.0,18.0,6.0,5.0]|
|0.0 |1.0 |22.0|0.75 |0.0 |2.0 |17.0 |6.0 |3.0 |[1.0,22.0,0.75,0.0,2.0,17.0,6.0,3.0]|
|0.0 |0.0 |32.0|1.5 |0.0 |2.0 |17.0 |5.0 |5.0 |[0.0,32.0,1.5,0.0,2.0,17.0,5.0,5.0] |
|0.0 |0.0 |22.0|0.75 |0.0 |2.0 |12.0 |1.0 |3.0 |[0.0,22.0,0.75,0.0,2.0,12.0,1.0,3.0]|
|0.0 |1.0 |57.0|15.0 |1.0 |2.0 |14.0 |4.0 |4.0 |[1.0,57.0,15.0,1.0,2.0,14.0,4.0,4.0]|
|0.0 |0.0 |32.0|15.0 |1.0 |4.0 |16.0 |1.0 |2.0 |[0.0,32.0,15.0,1.0,4.0,16.0,1.0,2.0]|
|0.0 |1.0 |22.0|1.5 |0.0 |4.0 |14.0 |4.0 |5.0 |[1.0,22.0,1.5,0.0,4.0,14.0,4.0,5.0] |
+-----+------+----+------------+--------+-------------+---------+----------+------+------------------------------------+
only showing top 10 rows

// 将数据分为训练和测试集(30%进行测试)
val Array(trainingDF, testDF) = vecDF.randomSplit(Array(0.7, 0.3))
trainingDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: double, gender: double ... 8 more fields]
testDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: double, gender: double ... 8 more fields] // 索引标签,将元数据添加到标签列中
val labelIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(vecDF)
labelIndexer: org.apache.spark.ml.feature.StringIndexerModel = strIdx_37df210602df
//labelIndexer.transform(vecDF).show(10, truncate = false) // 自动识别分类的特征,并对它们进行索引
// 具有大于5个不同的值的特征被视为连续。
val featureIndexer = new VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").setMaxCategories(5).fit(vecDF)
featureIndexer: org.apache.spark.ml.feature.VectorIndexerModel = vecIdx_9595c228f520
//featureIndexer.transform(vecDF).show(10, truncate = false) // 训练随机森林模型
val rf = new RandomForestClassifier().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setNumTrees(10)
rf: org.apache.spark.ml.classification.RandomForestClassifier = rfc_d0e7623d0b10 // 将索引标签转换回原始标签
val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
labelConverter: org.apache.spark.ml.feature.IndexToString = idxToStr_32d6938f2c94 // Chain indexers and forest in a Pipeline.
val pipeline = new Pipeline().setStages(Array(labelIndexer, featureIndexer, rf, labelConverter))
pipeline: org.apache.spark.ml.Pipeline = pipeline_97716da42fed // Train model. This also runs the indexers.
val model = pipeline.fit(trainingDF)
model: org.apache.spark.ml.PipelineModel = pipeline_97716da42fed // 输出随机森林模型的全部参数值
model.stages(2).extractParamMap()
res10: org.apache.spark.ml.param.ParamMap =
{
rfc_0d830180d598-cacheNodeIds: false,
rfc_0d830180d598-checkpointInterval: 10,
rfc_0d830180d598-featureSubsetStrategy: auto,
rfc_0d830180d598-featuresCol: indexedFeatures,
rfc_0d830180d598-impurity: gini,
rfc_0d830180d598-labelCol: indexedLabel,
rfc_0d830180d598-maxBins: 32,
rfc_0d830180d598-maxDepth: 5,
rfc_0d830180d598-maxMemoryInMB: 256,
rfc_0d830180d598-minInfoGain: 0.0,
rfc_0d830180d598-minInstancesPerNode: 1,
rfc_0d830180d598-predictionCol: prediction,
rfc_0d830180d598-probabilityCol: probability,
rfc_0d830180d598-rawPredictionCol: rawPrediction,
rfc_0d830180d598-seed: 207336481,
rfc_0d830180d598-subsamplingRate: 1.0
} // 作出预测
val predictions = model.transform(testDF)
predictions: org.apache.spark.sql.DataFrame = [label: double, gender: double ... 14 more fields]
predictions.select("predictedLabel", "label", "features").show(10,false)
+--------------+-----+-------------------------------------+
|predictedLabel|label|features |
+--------------+-----+-------------------------------------+
|0.0 |0.0 |[0.0,22.0,0.125,0.0,4.0,12.0,4.0,5.0]|
|0.0 |0.0 |[0.0,22.0,0.125,0.0,4.0,14.0,4.0,5.0]|
|0.0 |0.0 |[0.0,22.0,0.417,0.0,1.0,17.0,6.0,4.0]|
|0.0 |0.0 |[0.0,22.0,0.417,0.0,4.0,14.0,5.0,5.0]|
|0.0 |0.0 |[0.0,22.0,0.417,1.0,3.0,14.0,3.0,5.0]|
|0.0 |0.0 |[0.0,22.0,0.75,0.0,5.0,18.0,1.0,5.0] |
|0.0 |0.0 |[0.0,22.0,1.5,0.0,1.0,14.0,1.0,5.0] |
|0.0 |0.0 |[0.0,22.0,1.5,0.0,4.0,16.0,5.0,3.0] |
|0.0 |0.0 |[0.0,22.0,1.5,0.0,4.0,17.0,5.0,5.0] |
|0.0 |0.0 |[0.0,22.0,1.5,1.0,3.0,12.0,1.0,3.0] |
+--------------+-----+-------------------------------------+
only showing top 10 rows // 选择(预测标签,实际标签),并计算测试误差
val evaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("accuracy")
evaluator: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = mcEval_13a195abc422
val accuracy = evaluator.evaluate(predictions)
accuracy: Double = 0.7365591397849462
println("Test Error = " + (1.0 - accuracy))
Test Error = 0.26344086021505375
// 这里的stages(2)中的“2”对应pipeline中的“rf”,将model强制转换为RandomForestClassificationModel类型
val rfModel = model.stages(2).asInstanceOf[RandomForestClassificationModel]
rfModel: org.apache.spark.ml.classification.RandomForestClassificationModel = RandomForestClassificationModel (uid=rfc_f7bb5e488533) with 10 trees
println("Learned classification forest model:\n" + rfModel.toDebugString)
Learned classification forest model:
RandomForestClassificationModel (uid=rfc_f7bb5e488533) with 10 trees
Tree 0 (weight 1.0):
If (feature 2 <= 1.5)
If (feature 5 <= 12.0)
If (feature 6 <= 1.0)
Predict: 0.0
Else (feature 6 > 1.0)
If (feature 2 <= 0.125)
Predict: 0.0
Else (feature 2 > 0.125)
Predict: 1.0
Else (feature 5 > 12.0)
If (feature 0 in {0.0})
If (feature 5 <= 16.0)
Predict: 0.0
Else (feature 5 > 16.0)
If (feature 1 <= 22.0)
Predict: 0.0
Else (feature 1 > 22.0)
Predict: 0.0
Else (feature 0 not in {0.0})
If (feature 2 <= 0.75)
If (feature 4 in {0.0,1.0,2.0,4.0})
Predict: 0.0
Else (feature 4 not in {0.0,1.0,2.0,4.0})
Predict: 0.0
Else (feature 2 > 0.75)
If (feature 1 <= 22.0)
Predict: 0.0
Else (feature 1 > 22.0)
Predict: 1.0
Else (feature 2 > 1.5)
If (feature 1 <= 42.0)
If (feature 1 <= 27.0)
If (feature 5 <= 16.0)
If (feature 6 <= 5.0)
Predict: 0.0
Else (feature 6 > 5.0)
Predict: 1.0
Else (feature 5 > 16.0)
If (feature 4 in {3.0})
Predict: 0.0
Else (feature 4 not in {3.0})
Predict: 0.0
Else (feature 1 > 27.0)
If (feature 4 in {0.0,3.0,4.0})
If (feature 2 <= 4.0)
Predict: 1.0
Else (feature 2 > 4.0)
Predict: 0.0
Else (feature 4 not in {0.0,3.0,4.0})
If (feature 6 <= 4.0)
Predict: 0.0
Else (feature 6 > 4.0)
Predict: 1.0
Else (feature 1 > 42.0)
If (feature 4 in {2.0,4.0})
Predict: 0.0
Else (feature 4 not in {2.0,4.0})
If (feature 4 in {0.0})
Predict: 1.0
Else (feature 4 not in {0.0})
If (feature 3 in {0.0})
Predict: 0.0
Else (feature 3 not in {0.0})
Predict: 0.0
Tree 1 (weight 1.0):
If (feature 7 in {0.0,2.0,4.0})
If (feature 7 in {0.0})
If (feature 1 <= 42.0)
If (feature 4 in {1.0})
Predict: 0.0
Else (feature 4 not in {1.0})
Predict: 1.0
Else (feature 1 > 42.0)
Predict: 0.0
Else (feature 7 not in {0.0})
If (feature 1 <= 17.5)
If (feature 4 in {3.0})
Predict: 0.0
Else (feature 4 not in {3.0})
Predict: 1.0
Else (feature 1 > 17.5)
If (feature 0 in {0.0})
If (feature 4 in {1.0,3.0,4.0})
Predict: 0.0
Else (feature 4 not in {1.0,3.0,4.0})
Predict: 0.0
Else (feature 0 not in {0.0})
If (feature 6 <= 2.0)
Predict: 1.0
Else (feature 6 > 2.0)
Predict: 0.0
Else (feature 7 not in {0.0,2.0,4.0})
If (feature 3 in {0.0})
If (feature 5 <= 14.0)
If (feature 4 in {1.0,3.0})
Predict: 0.0
Else (feature 4 not in {1.0,3.0})
If (feature 0 in {0.0})
Predict: 0.0
Else (feature 0 not in {0.0})
Predict: 1.0
Else (feature 5 > 14.0)
If (feature 0 in {0.0})
Predict: 0.0
Else (feature 0 not in {0.0})
If (feature 4 in {0.0,2.0,3.0,4.0})
Predict: 0.0
Else (feature 4 not in {0.0,2.0,3.0,4.0})
Predict: 1.0
Else (feature 3 not in {0.0})
If (feature 5 <= 12.0)
If (feature 0 in {1.0})
Predict: 0.0
Else (feature 0 not in {1.0})
If (feature 6 <= 1.0)
Predict: 0.0
Else (feature 6 > 1.0)
Predict: 0.0
Else (feature 5 > 12.0)
If (feature 4 in {0.0,2.0,3.0,4.0})
If (feature 1 <= 47.0)
Predict: 0.0
Else (feature 1 > 47.0)
Predict: 1.0
Else (feature 4 not in {0.0,2.0,3.0,4.0})
If (feature 1 <= 22.0)
Predict: 1.0
Else (feature 1 > 22.0)
Predict: 0.0
Tree 2 (weight 1.0):
If (feature 7 in {0.0})
If (feature 4 in {1.0})
Predict: 0.0
Else (feature 4 not in {1.0})
If (feature 6 <= 5.0)
If (feature 1 <= 42.0)
Predict: 1.0
Else (feature 1 > 42.0)
Predict: 0.0
Else (feature 6 > 5.0)
Predict: 0.0
Else (feature 7 not in {0.0})
If (feature 5 <= 16.0)
If (feature 7 in {1.0})
If (feature 6 <= 4.0)
If (feature 2 <= 7.0)
Predict: 0.0
Else (feature 2 > 7.0)
Predict: 1.0
Else (feature 6 > 4.0)
Predict: 1.0
Else (feature 7 not in {1.0})
If (feature 3 in {1.0})
If (feature 1 <= 17.5)
Predict: 1.0
Else (feature 1 > 17.5)
Predict: 0.0
Else (feature 3 not in {1.0})
If (feature 0 in {0.0})
Predict: 0.0
Else (feature 0 not in {0.0})
Predict: 0.0
Else (feature 5 > 16.0)
If (feature 3 in {0.0})
If (feature 4 in {4.0})
Predict: 0.0
Else (feature 4 not in {4.0})
If (feature 5 <= 18.0)
Predict: 0.0
Else (feature 5 > 18.0)
Predict: 0.0
Else (feature 3 not in {0.0})
If (feature 4 in {0.0,3.0,4.0})
If (feature 7 in {2.0})
Predict: 0.0
Else (feature 7 not in {2.0})
Predict: 0.0
Else (feature 4 not in {0.0,3.0,4.0})
If (feature 6 <= 4.0)
Predict: 0.0
Else (feature 6 > 4.0)
Predict: 1.0
Tree 3 (weight 1.0):
If (feature 3 in {0.0})
If (feature 7 in {3.0})
Predict: 0.0
Else (feature 7 not in {3.0})
If (feature 2 <= 10.0)
If (feature 4 in {2.0,3.0,4.0})
If (feature 4 in {4.0})
Predict: 0.0
Else (feature 4 not in {4.0})
Predict: 0.0
Else (feature 4 not in {2.0,3.0,4.0})
If (feature 7 in {0.0,2.0,4.0})
Predict: 0.0
Else (feature 7 not in {0.0,2.0,4.0})
Predict: 1.0
Else (feature 2 > 10.0)
Predict: 1.0
Else (feature 3 not in {0.0})
If (feature 6 <= 2.0)
If (feature 5 <= 16.0)
If (feature 7 in {0.0,1.0,2.0,4.0})
If (feature 4 in {0.0,1.0,3.0,4.0})
Predict: 0.0
Else (feature 4 not in {0.0,1.0,3.0,4.0})
Predict: 1.0
Else (feature 7 not in {0.0,1.0,2.0,4.0})
If (feature 1 <= 22.0)
Predict: 0.0
Else (feature 1 > 22.0)
Predict: 0.0
Else (feature 5 > 16.0)
If (feature 7 in {0.0,1.0,3.0})
Predict: 0.0
Else (feature 7 not in {0.0,1.0,3.0})
Predict: 1.0
Else (feature 6 > 2.0)
If (feature 4 in {0.0,3.0,4.0})
If (feature 7 in {0.0,2.0,3.0,4.0})
If (feature 4 in {3.0,4.0})
Predict: 0.0
Else (feature 4 not in {3.0,4.0})
Predict: 0.0
Else (feature 7 not in {0.0,2.0,3.0,4.0})
If (feature 6 <= 4.0)
Predict: 0.0
Else (feature 6 > 4.0)
Predict: 1.0
Else (feature 4 not in {0.0,3.0,4.0})
If (feature 1 <= 22.0)
If (feature 5 <= 14.0)
Predict: 1.0
Else (feature 5 > 14.0)
Predict: 1.0
Else (feature 1 > 22.0)
If (feature 6 <= 6.0)
Predict: 0.0
Else (feature 6 > 6.0)
Predict: 1.0
Tree 4 (weight 1.0):
If (feature 7 in {0.0,2.0,4.0})
If (feature 7 in {0.0})
If (feature 6 <= 5.0)
If (feature 3 in {0.0})
Predict: 0.0
Else (feature 3 not in {0.0})
If (feature 4 in {2.0,4.0})
Predict: 1.0
Else (feature 4 not in {2.0,4.0})
Predict: 1.0
Else (feature 6 > 5.0)
Predict: 0.0
Else (feature 7 not in {0.0})
If (feature 2 <= 1.5)
If (feature 5 <= 12.0)
If (feature 2 <= 0.125)
Predict: 0.0
Else (feature 2 > 0.125)
Predict: 0.0
Else (feature 5 > 12.0)
If (feature 1 <= 17.5)
Predict: 1.0
Else (feature 1 > 17.5)
Predict: 0.0
Else (feature 2 > 1.5)
If (feature 2 <= 7.0)
If (feature 4 in {1.0,3.0,4.0})
Predict: 0.0
Else (feature 4 not in {1.0,3.0,4.0})
Predict: 0.0
Else (feature 2 > 7.0)
If (feature 5 <= 16.0)
Predict: 0.0
Else (feature 5 > 16.0)
Predict: 0.0
Else (feature 7 not in {0.0,2.0,4.0})
If (feature 5 <= 12.0)
Predict: 0.0
Else (feature 5 > 12.0)
If (feature 4 in {0.0,3.0,4.0})
If (feature 1 <= 47.0)
If (feature 1 <= 22.0)
Predict: 0.0
Else (feature 1 > 22.0)
Predict: 0.0
Else (feature 1 > 47.0)
Predict: 1.0
Else (feature 4 not in {0.0,3.0,4.0})
If (feature 1 <= 27.0)
If (feature 3 in {0.0})
Predict: 0.0
Else (feature 3 not in {0.0})
Predict: 0.0
Else (feature 1 > 27.0)
If (feature 5 <= 14.0)
Predict: 1.0
Else (feature 5 > 14.0)
Predict: 1.0
Tree 5 (weight 1.0):
If (feature 7 in {0.0})
If (feature 1 <= 42.0)
If (feature 6 <= 4.0)
Predict: 1.0
Else (feature 6 > 4.0)
If (feature 4 in {1.0})
Predict: 0.0
Else (feature 4 not in {1.0})
Predict: 1.0
Else (feature 1 > 42.0)
Predict: 0.0
Else (feature 7 not in {0.0})
If (feature 2 <= 1.5)
If (feature 4 in {0.0,2.0,3.0})
If (feature 1 <= 22.0)
If (feature 0 in {0.0})
Predict: 0.0
Else (feature 0 not in {0.0})
Predict: 0.0
Else (feature 1 > 22.0)
Predict: 0.0
Else (feature 4 not in {0.0,2.0,3.0})
If (feature 1 <= 17.5)
If (feature 6 <= 4.0)
Predict: 1.0
Else (feature 6 > 4.0)
Predict: 0.0
Else (feature 1 > 17.5)
If (feature 0 in {0.0})
Predict: 0.0
Else (feature 0 not in {0.0})
Predict: 0.0
Else (feature 2 > 1.5)
If (feature 6 <= 5.0)
If (feature 5 <= 17.0)
If (feature 7 in {2.0,4.0})
Predict: 0.0
Else (feature 7 not in {2.0,4.0})
Predict: 0.0
Else (feature 5 > 17.0)
If (feature 6 <= 1.0)
Predict: 0.0
Else (feature 6 > 1.0)
Predict: 0.0
Else (feature 6 > 5.0)
If (feature 4 in {0.0,3.0,4.0})
If (feature 7 in {3.0,4.0})
Predict: 0.0
Else (feature 7 not in {3.0,4.0})
Predict: 0.0
Else (feature 4 not in {0.0,3.0,4.0})
If (feature 6 <= 6.0)
Predict: 0.0
Else (feature 6 > 6.0)
Predict: 0.0
Tree 6 (weight 1.0):
If (feature 4 in {0.0,3.0,4.0})
If (feature 5 <= 12.0)
If (feature 7 in {1.0,2.0,3.0,4.0})
Predict: 0.0
Else (feature 7 not in {1.0,2.0,3.0,4.0})
If (feature 6 <= 3.0)
Predict: 0.0
Else (feature 6 > 3.0)
Predict: 1.0
Else (feature 5 > 12.0)
If (feature 7 in {0.0,1.0,2.0})
If (feature 6 <= 1.0)
If (feature 7 in {0.0,2.0})
Predict: 0.0
Else (feature 7 not in {0.0,2.0})
Predict: 0.0
Else (feature 6 > 1.0)
If (feature 1 <= 37.0)
Predict: 1.0
Else (feature 1 > 37.0)
Predict: 0.0
Else (feature 7 not in {0.0,1.0,2.0})
If (feature 1 <= 17.5)
If (feature 4 in {3.0})
Predict: 0.0
Else (feature 4 not in {3.0})
Predict: 1.0
Else (feature 1 > 17.5)
If (feature 6 <= 4.0)
Predict: 0.0
Else (feature 6 > 4.0)
Predict: 0.0
Else (feature 4 not in {0.0,3.0,4.0})
If (feature 7 in {0.0,4.0})
If (feature 5 <= 12.0)
If (feature 2 <= 0.125)
Predict: 0.0
Else (feature 2 > 0.125)
If (feature 1 <= 17.5)
Predict: 1.0
Else (feature 1 > 17.5)
Predict: 0.0
Else (feature 5 > 12.0)
If (feature 7 in {0.0})
If (feature 1 <= 42.0)
Predict: 1.0
Else (feature 1 > 42.0)
Predict: 0.0
Else (feature 7 not in {0.0})
If (feature 2 <= 1.5)
Predict: 0.0
Else (feature 2 > 1.5)
Predict: 0.0
Else (feature 7 not in {0.0,4.0})
If (feature 6 <= 4.0)
If (feature 7 in {3.0})
If (feature 0 in {0.0})
Predict: 0.0
Else (feature 0 not in {0.0})
Predict: 0.0
Else (feature 7 not in {3.0})
If (feature 5 <= 16.0)
Predict: 0.0
Else (feature 5 > 16.0)
Predict: 1.0
Else (feature 6 > 4.0)
If (feature 6 <= 6.0)
If (feature 3 in {0.0})
Predict: 0.0
Else (feature 3 not in {0.0})
Predict: 1.0
Else (feature 6 > 6.0)
If (feature 5 <= 18.0)
Predict: 1.0
Else (feature 5 > 18.0)
Predict: 0.0
Tree 7 (weight 1.0):
If (feature 7 in {0.0,2.0,4.0})
If (feature 2 <= 1.5)
If (feature 4 in {1.0,2.0,3.0})
If (feature 1 <= 17.5)
Predict: 1.0
Else (feature 1 > 17.5)
Predict: 0.0
Else (feature 4 not in {1.0,2.0,3.0})
If (feature 5 <= 14.0)
If (feature 0 in {0.0})
Predict: 0.0
Else (feature 0 not in {0.0})
Predict: 1.0
Else (feature 5 > 14.0)
Predict: 0.0
Else (feature 2 > 1.5)
If (feature 7 in {0.0,2.0})
If (feature 4 in {1.0,3.0,4.0})
If (feature 5 <= 16.0)
Predict: 0.0
Else (feature 5 > 16.0)
Predict: 0.0
Else (feature 4 not in {1.0,3.0,4.0})
If (feature 6 <= 5.0)
Predict: 1.0
Else (feature 6 > 5.0)
Predict: 0.0
Else (feature 7 not in {0.0,2.0})
If (feature 4 in {0.0,1.0,3.0})
If (feature 1 <= 42.0)
Predict: 0.0
Else (feature 1 > 42.0)
Predict: 0.0
Else (feature 4 not in {0.0,1.0,3.0})
If (feature 5 <= 16.0)
Predict: 0.0
Else (feature 5 > 16.0)
Predict: 0.0
Else (feature 7 not in {0.0,2.0,4.0})
If (feature 2 <= 0.75)
Predict: 0.0
Else (feature 2 > 0.75)
If (feature 4 in {4.0})
If (feature 6 <= 5.0)
If (feature 1 <= 37.0)
Predict: 1.0
Else (feature 1 > 37.0)
Predict: 0.0
Else (feature 6 > 5.0)
Predict: 0.0
Else (feature 4 not in {4.0})
If (feature 5 <= 12.0)
If (feature 1 <= 27.0)
Predict: 0.0
Else (feature 1 > 27.0)
Predict: 0.0
Else (feature 5 > 12.0)
If (feature 7 in {1.0})
Predict: 1.0
Else (feature 7 not in {1.0})
Predict: 0.0
Tree 8 (weight 1.0):
If (feature 5 <= 16.0)
If (feature 4 in {0.0,1.0})
If (feature 0 in {0.0})
If (feature 2 <= 0.75)
If (feature 1 <= 17.5)
Predict: 1.0
Else (feature 1 > 17.5)
Predict: 0.0
Else (feature 2 > 0.75)
If (feature 6 <= 4.0)
Predict: 0.0
Else (feature 6 > 4.0)
Predict: 0.0
Else (feature 0 not in {0.0})
If (feature 5 <= 12.0)
Predict: 1.0
Else (feature 5 > 12.0)
If (feature 7 in {2.0,4.0})
Predict: 0.0
Else (feature 7 not in {2.0,4.0})
Predict: 0.0
Else (feature 4 not in {0.0,1.0})
If (feature 7 in {0.0,2.0,3.0,4.0})
If (feature 1 <= 22.0)
If (feature 6 <= 3.0)
Predict: 0.0
Else (feature 6 > 3.0)
Predict: 0.0
Else (feature 1 > 22.0)
If (feature 6 <= 6.0)
Predict: 0.0
Else (feature 6 > 6.0)
Predict: 1.0
Else (feature 7 not in {0.0,2.0,3.0,4.0})
If (feature 1 <= 42.0)
If (feature 6 <= 4.0)
Predict: 0.0
Else (feature 6 > 4.0)
Predict: 1.0
Else (feature 1 > 42.0)
Predict: 0.0
Else (feature 5 > 16.0)
If (feature 5 <= 18.0)
If (feature 4 in {3.0})
If (feature 7 in {1.0,2.0,3.0})
Predict: 0.0
Else (feature 7 not in {1.0,2.0,3.0})
If (feature 6 <= 5.0)
Predict: 0.0
Else (feature 6 > 5.0)
Predict: 0.0
Else (feature 4 not in {3.0})
If (feature 2 <= 0.75)
Predict: 0.0
Else (feature 2 > 0.75)
If (feature 3 in {0.0})
Predict: 0.0
Else (feature 3 not in {0.0})
Predict: 1.0
Else (feature 5 > 18.0)
If (feature 1 <= 27.0)
If (feature 7 in {3.0})
If (feature 3 in {0.0})
Predict: 0.0
Else (feature 3 not in {0.0})
Predict: 1.0
Else (feature 7 not in {3.0})
If (feature 2 <= 4.0)
Predict: 0.0
Else (feature 2 > 4.0)
Predict: 1.0
Else (feature 1 > 27.0)
If (feature 6 <= 5.0)
If (feature 6 <= 4.0)
Predict: 0.0
Else (feature 6 > 4.0)
Predict: 0.0
Else (feature 6 > 5.0)
If (feature 4 in {3.0,4.0})
Predict: 0.0
Else (feature 4 not in {3.0,4.0})
Predict: 0.0
Tree 9 (weight 1.0):
If (feature 5 <= 16.0)
If (feature 6 <= 2.0)
If (feature 1 <= 42.0)
If (feature 6 <= 1.0)
If (feature 5 <= 9.0)
Predict: 1.0
Else (feature 5 > 9.0)
Predict: 0.0
Else (feature 6 > 1.0)
If (feature 1 <= 27.0)
Predict: 0.0
Else (feature 1 > 27.0)
Predict: 1.0
Else (feature 1 > 42.0)
Predict: 0.0
Else (feature 6 > 2.0)
If (feature 1 <= 27.0)
If (feature 5 <= 14.0)
If (feature 6 <= 3.0)
Predict: 0.0
Else (feature 6 > 3.0)
Predict: 0.0
Else (feature 5 > 14.0)
Predict: 0.0
Else (feature 1 > 27.0)
If (feature 4 in {1.0,2.0,4.0})
If (feature 5 <= 9.0)
Predict: 0.0
Else (feature 5 > 9.0)
Predict: 0.0
Else (feature 4 not in {1.0,2.0,4.0})
If (feature 7 in {2.0,3.0,4.0})
Predict: 0.0
Else (feature 7 not in {2.0,3.0,4.0})
Predict: 1.0
Else (feature 5 > 16.0)
If (feature 6 <= 4.0)
If (feature 4 in {3.0})
Predict: 0.0
Else (feature 4 not in {3.0})
If (feature 1 <= 42.0)
If (feature 3 in {0.0})
Predict: 0.0
Else (feature 3 not in {0.0})
Predict: 0.0
Else (feature 1 > 42.0)
Predict: 1.0
Else (feature 6 > 4.0)
If (feature 4 in {3.0,4.0})
If (feature 1 <= 37.0)
If (feature 3 in {0.0})
Predict: 0.0
Else (feature 3 not in {0.0})
Predict: 0.0
Else (feature 1 > 37.0)
If (feature 1 <= 42.0)
Predict: 0.0
Else (feature 1 > 42.0)
Predict: 0.0
Else (feature 4 not in {3.0,4.0})
If (feature 4 in {0.0,2.0})
If (feature 7 in {0.0,1.0,2.0})
Predict: 1.0
Else (feature 7 not in {0.0,1.0,2.0})
Predict: 1.0
Else (feature 4 not in {0.0,2.0})
If (feature 0 in {0.0})
Predict: 0.0
Else (feature 0 not in {0.0})
Predict: 0.0

随机森林模型调优

// Random forest hyper-parameter tuning with k-fold cross-validation.
// Uses `featuresArray` and `dataLabelDF`, which are defined earlier in this file.

// Assemble the individual feature columns into a single "features" vector column.
val assembler = new VectorAssembler().setInputCols(featuresArray).setOutputCol("features")
val vecDF: DataFrame = assembler.transform(dataLabelDF)
vecDF.show(10, truncate = false)

// Split the data into training and test sets (30% held out for testing).
val Array(trainingDF, testDF) = vecDF.randomSplit(Array(0.7, 0.3))

// Index the label column, adding metadata to it.
// Note: the indexer is fitted on vecDF (the full data set), not only on the training split.
val labelIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(vecDF)
//labelIndexer.transform(vecDF).show(10, truncate = false)

// Automatically identify categorical features and index them.
// Features with more than 5 distinct values are treated as continuous.
val featureIndexer = new VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").setMaxCategories(5).fit(vecDF)
//featureIndexer.transform(vecDF).show(10, truncate = false)

// Random forest estimator; its hyper-parameters are supplied by the grid below.
val rf = new RandomForestClassifier().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures")

// Convert indexed predictions back to the original label values.
val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)

// Chain indexers and forest in a Pipeline.
val pipeline = new Pipeline().setStages(Array(labelIndexer, featureIndexer, rf, labelConverter))

// Parameter grid.
//   impurity              impurity measure used for split selection
//   maxBins               maximum number of bins used to discretize continuous features
//   maxDepth              maximum depth of each tree
//   minInfoGain           minimum information gain required for a node to be split (>= 0)
//   minInstancesPerNode   minimum number of instances each node must contain (>= 1)
//   numTrees              number of trees in the forest
//   featureSubsetStrategy number of features considered for a split at each tree node;
//                         many values are supported, see the official documentation
//   subsamplingRate       fraction of the training data given to each tree, in (0, 1]
//   maxMemoryInMB         if too small, only 1 node is split per iteration and its
//                         aggregates may exceed this size
//   checkpointInterval    checkpoint interval (>= 1), or -1 to disable checkpointing;
//                         e.g. 10 means the cache is checkpointed every 10 iterations
//   cacheNodeIds          if false, the algorithm passes trees to executors to match
//                         instances with nodes; if true, it caches the node id of every
//                         instance, which can speed up training of deeper trees
//                         (default = false)
//   seed                  random seed
//
// NOTE: this grid expands to 2^10 * 3^2 = 9216 parameter combinations; with 5 folds the
// cross-validator below fits the pipeline 46,080 times. Trim the grid before running it
// on anything but a toy data set.
val paramGrid = new ParamGridBuilder()
  .addGrid(rf.impurity, Array("entropy", "gini"))
  .addGrid(rf.maxBins, Array(32, 64))
  .addGrid(rf.maxDepth, Array(5, 7, 10))
  .addGrid(rf.minInfoGain, Array(0.0, 0.5, 1.0))
  .addGrid(rf.minInstancesPerNode, Array(10, 20))
  .addGrid(rf.numTrees, Array(20, 50))
  .addGrid(rf.featureSubsetStrategy, Array("auto", "sqrt"))
  .addGrid(rf.subsamplingRate, Array(0.8, 1.0))
  .addGrid(rf.maxMemoryInMB, Array(256, 512))
  .addGrid(rf.checkpointInterval, Array(10, 20))
  .addGrid(rf.cacheNodeIds, Array(false, true))
  .addGrid(rf.seed, Array(123456L, 111L))
  .build()

// Evaluator: compares (prediction, true label) and reports accuracy.
// Both indexedLabel and prediction are index-encoded, so they can be compared directly.
val classEvaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("accuracy")

// Configure 5-fold cross-validation over the pipeline and the grid.
val cv = new CrossValidator().setEstimator(pipeline).setEvaluator(classEvaluator).setEstimatorParamMaps(paramGrid).setNumFolds(5)

// Run cross-validation and select the best parameter combination.
val cvModel = cv.fit(trainingDF)

// Inspect the fitted cross-validator model. Each expression is on its own line so that
// none of them is swallowed by a preceding end-of-line comment.
cvModel.extractParamMap()                // all parameters
// cvModel.avgMetrics.length == cvModel.getEstimatorParamMaps.length;
// the elements of the two arrays correspond one-to-one.
cvModel.avgMetrics.length
cvModel.avgMetrics                       // average metric for each parameter combination
cvModel.getEstimatorParamMaps.length
cvModel.getEstimatorParamMaps            // the parameter combinations that were tried
cvModel.getEvaluator.extractParamMap()   // evaluator parameters
cvModel.getEvaluator.isLargerBetter      // whether a larger metric value is better
cvModel.getNumFolds                      // number of cross-validation folds

//################################
// Test the model on the held-out split.
val predictDF: DataFrame = cvModel.transform(testDF).selectExpr(
  //"race","poverty","smoke","alcohol","agemth","ybirth","yschool","pc3mth", "features",
  "predictedLabel", "label", "features")
predictDF.show(20, truncate = false)

Spark2 Random Forests 随机森林的更多相关文章

  1. R语言之Random Forest随机森林

    什么是随机森林? 随机森林就是通过集成学习的思想将多棵树集成的一种算法,它的基本单元是决策树,而它的本质属于机器学习的一大分支——集成学习(Ensemble Learning)方法.随机森林的名称中有 ...

  2. 机器学习中的算法(1)-决策树模型组合之随机森林与GBDT

    版权声明: 本文由LeftNotEasy发布于http://leftnoteasy.cnblogs.com, 本文可以被全部的转载或者部分使用,但请注明出处,如果有问题,请联系wheeleast@gm ...

  3. 机器学习中的算法——决策树模型组合之随机森林与GBDT

    前言: 决策树这种算法有着很多良好的特性,比如说训练时间复杂度较低,预测的过程比较快速,模型容易展示(容易将得到的决策树做成图片展示出来)等.但是同时,单决策树又有一些不好的地方,比如说容易over- ...

  4. paper 84:机器学习算法--随机森林

    http://www.cnblogs.com/wentingtu/archive/2011/12/13/2286212.html中一些内容 基础内容: 这里只是准备简单谈谈基础的内容,主要参考一下别人 ...

  5. 决策树模型组合之(在线)随机森林与GBDT

    前言: 决策树这种算法有着很多良好的特性,比如说训练时间复杂度较低,预测的过程比较快速,模型容易展示(容易将得到的决策树做成图片展示出来)等.但是同时, 单决策树又有一些不好的地方,比如说容易over ...

  6. 机器学习中的算法-决策树模型组合之随机森林与GBDT

    机器学习中的算法(1)-决策树模型组合之随机森林与GBDT 版权声明: 本文由LeftNotEasy发布于http://leftnoteasy.cnblogs.com, 本文可以被全部的转载或者部分使 ...

  7. 随机森林与GBDT

    前言: 决策树这种算法有着很多良好的特性,比如说训练时间复杂度较低,预测的过程比较快速,模型容易展示(容易将得到的决策树做成图片展示出来)等.但是同时,单决策树又有一些不好的地方,比如说容易over- ...

  8. 决策树模型组合之随机森林与GBDT

    版权声明: 本文由LeftNotEasy发布于http://leftnoteasy.cnblogs.com, 本文可以被全部的转载或者部分使用,但请注明出处,如果有问题,请联系wheeleast@gm ...

  9. 决策树模型组合之随机森林与GBDT(转)

    版权声明: 本文由LeftNotEasy发布于http://leftnoteasy.cnblogs.com, 本文可以被全部的转载或者部分使用,但请注明出处,如果有问题,请联系wheeleast@gm ...

随机推荐

  1. C# SpinLock实现

    关于SpinLock自旋锁网上已经有很多说明,这里也copy了一部分,我这里主要关注微软的实现,学习人家的实现方式. 如果由于垃圾回收,基于对象的锁对象开销太高,可以使用SpinLock结构..NET ...

  2. vs2010编译错误(报错:LINK : fatal error LNK1123: 转换到 COFF 期间失败: 文件无效或损坏)

    报错:LINK : fatal error LNK1123: 转换到 COFF 期间失败: 文件无效或损坏 1> 这段时间忙于看文献,没用过VS了. 今天用着用着就报错了: LINK : fat ...

  3. SPLIT_STR

    CREATE DEFINER=`root`@`%` FUNCTION `vir`.`SPLIT_STR`( x VARCHAR(1000), delim VARCHAR(12), pos INT) R ...

  4. 关于VIM自动缩进失效(filetype indent on无效)的详细分析

    关于VIM自动缩进失效(filetype indent on无效)的详细分析 set filetype=xml filetype indent on 执行对齐命令:ggvG

  5. 关于JAVA 中的Configuration类

    properties文件是Java平台默认的配置文件格式,其优点是格式清晰,简单易懂,使用commons-configuration读取properties文件也比较简单,代码如下: 基本用法: 1. ...

  6. [Done]SnowFlake生成Long类型主键返回前台过长导致精度缺失的问题

    问题描述: 在开发过程中,项目的主键生成器是SnowFlake,其生成的long主键是28位, 但是js中Long的最大值:https://blog.csdn.net/sunmerZeal/artic ...

  7. Qt下多线程日之类

    刚google到了,晚上回去试一下! 代码地址 https://gitorious.org/cutelogger/cutelogger/source/e3c2745c6c5f38896f87472e0 ...

  8. Oracle之外键(Foreign Key)使用方法具体解释(二)- 级联删除(DELETE CASCADE)

    Oracle外键(Foreign Key)之级联删除(DELETE CASCADE) 目标 演示样例解说怎样在Oracle外键中使用级联删除 什么是级联删除(DELETE CASCADE)? 级联删除 ...

  9. Docker 使用Docker知识简易部署一个LNMP平台

    1.自定义网络 docker network create lnmp 2.创建Mysql数据库容器(这里我们首先得创建一个mysql-vol数据卷) docker volume create mysq ...

  10. FFmpeg: AVFormatContext 结构体分析

    AVFormatContext 结构体分析这个结构体描述了一个媒体文件或媒体流的构成和基本信息.这是FFMpeg中最为基本的一个结构,是其他所有结构的根,是一个多媒体文件或流的根本抽象.主要成员释义: ...