随机森林是决策树的集合。 随机森林结合许多决策树,以减少过度拟合的风险。 spark.ml实现支持随机森林,使用连续和分类特征,做二分类和多分类以及回归。

导入包

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.Row
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.Column
import org.apache.spark.sql.DataFrameReader
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.Encoder
import org.apache.spark.sql.DataFrameStatFunctions
import org.apache.spark.sql.functions._ import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.feature.{ IndexToString, StringIndexer, VectorIndexer }
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.{ RandomForestClassificationModel, RandomForestClassifier }
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator }

导入源数据

// affairs:一年来婚外情的频率
// gender:性别
// age:年龄
// yearsmarried:婚龄
// children:是否有小孩
// religiousness:宗教信仰程度(5分制,1分表示反对,5分表示非常信仰)
// education:学历
// occupation:职业(逆向编号的戈登7种分类)
// rating:对婚姻的自我评分(5分制,1表示非常不幸福,5表示非常幸福) val spark = SparkSession.builder().appName("Spark Random Forest Classifier").config("spark.some.config.option", "some-value").getOrCreate() // For implicit conversions like converting RDDs to DataFrames
import spark.implicits._ val dataList: List[(Double, String, Double, Double, String, Double, Double, Double, Double)] = List(
(0, "male", 37, 10, "no", 3, 18, 7, 4),
(0, "female", 27, 4, "no", 4, 14, 6, 4),
(0, "female", 32, 15, "yes", 1, 12, 1, 4),
(0, "male", 57, 15, "yes", 5, 18, 6, 5),
(0, "male", 22, 0.75, "no", 2, 17, 6, 3),
(0, "female", 32, 1.5, "no", 2, 17, 5, 5),
(0, "female", 22, 0.75, "no", 2, 12, 1, 3),
(0, "male", 57, 15, "yes", 2, 14, 4, 4),
(0, "female", 32, 15, "yes", 4, 16, 1, 2),
(0, "male", 22, 1.5, "no", 4, 14, 4, 5),
(0, "male", 37, 15, "yes", 2, 20, 7, 2),
(0, "male", 27, 4, "yes", 4, 18, 6, 4),
(0, "male", 47, 15, "yes", 5, 17, 6, 4),
(0, "female", 22, 1.5, "no", 2, 17, 5, 4),
(0, "female", 27, 4, "no", 4, 14, 5, 4),
(0, "female", 37, 15, "yes", 1, 17, 5, 5),
(0, "female", 37, 15, "yes", 2, 18, 4, 3),
(0, "female", 22, 0.75, "no", 3, 16, 5, 4),
(0, "female", 22, 1.5, "no", 2, 16, 5, 5),
(0, "female", 27, 10, "yes", 2, 14, 1, 5),
(0, "female", 22, 1.5, "no", 2, 16, 5, 5),
(0, "female", 22, 1.5, "no", 2, 16, 5, 5),
(0, "female", 27, 10, "yes", 4, 16, 5, 4),
(0, "female", 32, 10, "yes", 3, 14, 1, 5),
(0, "male", 37, 4, "yes", 2, 20, 6, 4),
(0, "female", 22, 1.5, "no", 2, 18, 5, 5),
(0, "female", 27, 7, "no", 4, 16, 1, 5),
(0, "male", 42, 15, "yes", 5, 20, 6, 4),
(0, "male", 27, 4, "yes", 3, 16, 5, 5),
(0, "female", 27, 4, "yes", 3, 17, 5, 4),
(0, "male", 42, 15, "yes", 4, 20, 6, 3),
(0, "female", 22, 1.5, "no", 3, 16, 5, 5),
(0, "male", 27, 0.417, "no", 4, 17, 6, 4),
(0, "female", 42, 15, "yes", 5, 14, 5, 4),
(0, "male", 32, 4, "yes", 1, 18, 6, 4),
(0, "female", 22, 1.5, "no", 4, 16, 5, 3),
(0, "female", 42, 15, "yes", 3, 12, 1, 4),
(0, "female", 22, 4, "no", 4, 17, 5, 5),
(0, "male", 22, 1.5, "yes", 1, 14, 3, 5),
(0, "female", 22, 0.75, "no", 3, 16, 1, 5),
(0, "male", 32, 10, "yes", 5, 20, 6, 5),
(0, "male", 52, 15, "yes", 5, 18, 6, 3),
(0, "female", 22, 0.417, "no", 5, 14, 1, 4),
(0, "female", 27, 4, "yes", 2, 18, 6, 1),
(0, "female", 32, 7, "yes", 5, 17, 5, 3),
(0, "male", 22, 4, "no", 3, 16, 5, 5),
(0, "female", 27, 7, "yes", 4, 18, 6, 5),
(0, "female", 42, 15, "yes", 2, 18, 5, 4),
(0, "male", 27, 1.5, "yes", 4, 16, 3, 5),
(0, "male", 42, 15, "yes", 2, 20, 6, 4),
(0, "female", 22, 0.75, "no", 5, 14, 3, 5),
(0, "male", 32, 7, "yes", 2, 20, 6, 4),
(0, "male", 27, 4, "yes", 5, 20, 6, 5),
(0, "male", 27, 10, "yes", 4, 20, 6, 4),
(0, "male", 22, 4, "no", 1, 18, 5, 5),
(0, "female", 37, 15, "yes", 4, 14, 3, 1),
(0, "male", 22, 1.5, "yes", 5, 16, 4, 4),
(0, "female", 37, 15, "yes", 4, 17, 1, 5),
(0, "female", 27, 0.75, "no", 4, 17, 5, 4),
(0, "male", 32, 10, "yes", 4, 20, 6, 4),
(0, "female", 47, 15, "yes", 5, 14, 7, 2),
(0, "male", 37, 10, "yes", 3, 20, 6, 4),
(0, "female", 22, 0.75, "no", 2, 16, 5, 5),
(0, "male", 27, 4, "no", 2, 18, 4, 5),
(0, "male", 32, 7, "no", 4, 20, 6, 4),
(0, "male", 42, 15, "yes", 2, 17, 3, 5),
(0, "male", 37, 10, "yes", 4, 20, 6, 4),
(0, "female", 47, 15, "yes", 3, 17, 6, 5),
(0, "female", 22, 1.5, "no", 5, 16, 5, 5),
(0, "female", 27, 1.5, "no", 2, 16, 6, 4),
(0, "female", 27, 4, "no", 3, 17, 5, 5),
(0, "female", 32, 10, "yes", 5, 14, 4, 5),
(0, "female", 22, 0.125, "no", 2, 12, 5, 5),
(0, "male", 47, 15, "yes", 4, 14, 4, 3),
(0, "male", 32, 15, "yes", 1, 14, 5, 5),
(0, "male", 27, 7, "yes", 4, 16, 5, 5),
(0, "female", 22, 1.5, "yes", 3, 16, 5, 5),
(0, "male", 27, 4, "yes", 3, 17, 6, 5),
(0, "female", 22, 1.5, "no", 3, 16, 5, 5),
(0, "male", 57, 15, "yes", 2, 14, 7, 2),
(0, "male", 17.5, 1.5, "yes", 3, 18, 6, 5),
(0, "male", 57, 15, "yes", 4, 20, 6, 5),
(0, "female", 22, 0.75, "no", 2, 16, 3, 4),
(0, "male", 42, 4, "no", 4, 17, 3, 3),
(0, "female", 22, 1.5, "yes", 4, 12, 1, 5),
(0, "female", 22, 0.417, "no", 1, 17, 6, 4),
(0, "female", 32, 15, "yes", 4, 17, 5, 5),
(0, "female", 27, 1.5, "no", 3, 18, 5, 2),
(0, "female", 22, 1.5, "yes", 3, 14, 1, 5),
(0, "female", 37, 15, "yes", 3, 14, 1, 4),
(0, "female", 32, 15, "yes", 4, 14, 3, 4),
(0, "male", 37, 10, "yes", 2, 14, 5, 3),
(0, "male", 37, 10, "yes", 4, 16, 5, 4),
(0, "male", 57, 15, "yes", 5, 20, 5, 3),
(0, "male", 27, 0.417, "no", 1, 16, 3, 4),
(0, "female", 42, 15, "yes", 5, 14, 1, 5),
(0, "male", 57, 15, "yes", 3, 16, 6, 1),
(0, "male", 37, 10, "yes", 1, 16, 6, 4),
(0, "male", 37, 15, "yes", 3, 17, 5, 5),
(0, "male", 37, 15, "yes", 4, 20, 6, 5),
(0, "female", 27, 10, "yes", 5, 14, 1, 5),
(0, "male", 37, 10, "yes", 2, 18, 6, 4),
(0, "female", 22, 0.125, "no", 4, 12, 4, 5),
(0, "male", 57, 15, "yes", 5, 20, 6, 5),
(0, "female", 37, 15, "yes", 4, 18, 6, 4),
(0, "male", 22, 4, "yes", 4, 14, 6, 4),
(0, "male", 27, 7, "yes", 4, 18, 5, 4),
(0, "male", 57, 15, "yes", 4, 20, 5, 4),
(0, "male", 32, 15, "yes", 3, 14, 6, 3),
(0, "female", 22, 1.5, "no", 2, 14, 5, 4),
(0, "female", 32, 7, "yes", 4, 17, 1, 5),
(0, "female", 37, 15, "yes", 4, 17, 6, 5),
(0, "female", 32, 1.5, "no", 5, 18, 5, 5),
(0, "male", 42, 10, "yes", 5, 20, 7, 4),
(0, "female", 27, 7, "no", 3, 16, 5, 4),
(0, "male", 37, 15, "no", 4, 20, 6, 5),
(0, "male", 37, 15, "yes", 4, 14, 3, 2),
(0, "male", 32, 10, "no", 5, 18, 6, 4),
(0, "female", 22, 0.75, "no", 4, 16, 1, 5),
(0, "female", 27, 7, "yes", 4, 12, 2, 4),
(0, "female", 27, 7, "yes", 2, 16, 2, 5),
(0, "female", 42, 15, "yes", 5, 18, 5, 4),
(0, "male", 42, 15, "yes", 4, 17, 5, 3),
(0, "female", 27, 7, "yes", 2, 16, 1, 2),
(0, "female", 22, 1.5, "no", 3, 16, 5, 5),
(0, "male", 37, 15, "yes", 5, 20, 6, 5),
(0, "female", 22, 0.125, "no", 2, 14, 4, 5),
(0, "male", 27, 1.5, "no", 4, 16, 5, 5),
(0, "male", 32, 1.5, "no", 2, 18, 6, 5),
(0, "male", 27, 1.5, "no", 2, 17, 6, 5),
(0, "female", 27, 10, "yes", 4, 16, 1, 3),
(0, "male", 42, 15, "yes", 4, 18, 6, 5),
(0, "female", 27, 1.5, "no", 2, 16, 6, 5),
(0, "male", 27, 4, "no", 2, 18, 6, 3),
(0, "female", 32, 10, "yes", 3, 14, 5, 3),
(0, "female", 32, 15, "yes", 3, 18, 5, 4),
(0, "female", 22, 0.75, "no", 2, 18, 6, 5),
(0, "female", 37, 15, "yes", 2, 16, 1, 4),
(0, "male", 27, 4, "yes", 4, 20, 5, 5),
(0, "male", 27, 4, "no", 1, 20, 5, 4),
(0, "female", 27, 10, "yes", 2, 12, 1, 4),
(0, "female", 32, 15, "yes", 5, 18, 6, 4),
(0, "male", 27, 7, "yes", 5, 12, 5, 3),
(0, "male", 52, 15, "yes", 2, 18, 5, 4),
(0, "male", 27, 4, "no", 3, 20, 6, 3),
(0, "male", 37, 4, "yes", 1, 18, 5, 4),
(0, "male", 27, 4, "yes", 4, 14, 5, 4),
(0, "female", 52, 15, "yes", 5, 12, 1, 3),
(0, "female", 57, 15, "yes", 4, 16, 6, 4),
(0, "male", 27, 7, "yes", 1, 16, 5, 4),
(0, "male", 37, 7, "yes", 4, 20, 6, 3),
(0, "male", 22, 0.75, "no", 2, 14, 4, 3),
(0, "male", 32, 4, "yes", 2, 18, 5, 3),
(0, "male", 37, 15, "yes", 4, 20, 6, 3),
(0, "male", 22, 0.75, "yes", 2, 14, 4, 3),
(0, "male", 42, 15, "yes", 4, 20, 6, 3),
(0, "female", 52, 15, "yes", 5, 17, 1, 1),
(0, "female", 37, 15, "yes", 4, 14, 1, 2),
(0, "male", 27, 7, "yes", 4, 14, 5, 3),
(0, "male", 32, 4, "yes", 2, 16, 5, 5),
(0, "female", 27, 4, "yes", 2, 18, 6, 5),
(0, "female", 27, 4, "yes", 2, 18, 5, 5),
(0, "male", 37, 15, "yes", 5, 18, 6, 5),
(0, "female", 47, 15, "yes", 5, 12, 5, 4),
(0, "female", 32, 10, "yes", 3, 17, 1, 4),
(0, "female", 27, 1.5, "yes", 4, 17, 1, 2),
(0, "female", 57, 15, "yes", 2, 18, 5, 2),
(0, "female", 22, 1.5, "no", 4, 14, 5, 4),
(0, "male", 42, 15, "yes", 3, 14, 3, 4),
(0, "male", 57, 15, "yes", 4, 9, 2, 2),
(0, "male", 57, 15, "yes", 4, 20, 6, 5),
(0, "female", 22, 0.125, "no", 4, 14, 4, 5),
(0, "female", 32, 10, "yes", 4, 14, 1, 5),
(0, "female", 42, 15, "yes", 3, 18, 5, 4),
(0, "female", 27, 1.5, "no", 2, 18, 6, 5),
(0, "male", 32, 0.125, "yes", 2, 18, 5, 2),
(0, "female", 27, 4, "no", 3, 16, 5, 4),
(0, "female", 27, 10, "yes", 2, 16, 1, 4),
(0, "female", 32, 7, "yes", 4, 16, 1, 3),
(0, "female", 37, 15, "yes", 4, 14, 5, 4),
(0, "female", 42, 15, "yes", 5, 17, 6, 2),
(0, "male", 32, 1.5, "yes", 4, 14, 6, 5),
(0, "female", 32, 4, "yes", 3, 17, 5, 3),
(0, "female", 37, 7, "no", 4, 18, 5, 5),
(0, "female", 22, 0.417, "yes", 3, 14, 3, 5),
(0, "female", 27, 7, "yes", 4, 14, 1, 5),
(0, "male", 27, 0.75, "no", 3, 16, 5, 5),
(0, "male", 27, 4, "yes", 2, 20, 5, 5),
(0, "male", 32, 10, "yes", 4, 16, 4, 5),
(0, "male", 32, 15, "yes", 1, 14, 5, 5),
(0, "male", 22, 0.75, "no", 3, 17, 4, 5),
(0, "female", 27, 7, "yes", 4, 17, 1, 4),
(0, "male", 27, 0.417, "yes", 4, 20, 5, 4),
(0, "male", 37, 15, "yes", 4, 20, 5, 4),
(0, "female", 37, 15, "yes", 2, 14, 1, 3),
(0, "male", 22, 4, "yes", 1, 18, 5, 4),
(0, "male", 37, 15, "yes", 4, 17, 5, 3),
(0, "female", 22, 1.5, "no", 2, 14, 4, 5),
(0, "male", 52, 15, "yes", 4, 14, 6, 2),
(0, "female", 22, 1.5, "no", 4, 17, 5, 5),
(0, "male", 32, 4, "yes", 5, 14, 3, 5),
(0, "male", 32, 4, "yes", 2, 14, 3, 5),
(0, "female", 22, 1.5, "no", 3, 16, 6, 5),
(0, "male", 27, 0.75, "no", 2, 18, 3, 3),
(0, "female", 22, 7, "yes", 2, 14, 5, 2),
(0, "female", 27, 0.75, "no", 2, 17, 5, 3),
(0, "female", 37, 15, "yes", 4, 12, 1, 2),
(0, "female", 22, 1.5, "no", 1, 14, 1, 5),
(0, "female", 37, 10, "no", 2, 12, 4, 4),
(0, "female", 37, 15, "yes", 4, 18, 5, 3),
(0, "female", 42, 15, "yes", 3, 12, 3, 3),
(0, "male", 22, 4, "no", 2, 18, 5, 5),
(0, "male", 52, 7, "yes", 2, 20, 6, 2),
(0, "male", 27, 0.75, "no", 2, 17, 5, 5),
(0, "female", 27, 4, "no", 2, 17, 4, 5),
(0, "male", 42, 1.5, "no", 5, 20, 6, 5),
(0, "male", 22, 1.5, "no", 4, 17, 6, 5),
(0, "male", 22, 4, "no", 4, 17, 5, 3),
(0, "female", 22, 4, "yes", 1, 14, 5, 4),
(0, "male", 37, 15, "yes", 5, 20, 4, 5),
(0, "female", 37, 10, "yes", 3, 16, 6, 3),
(0, "male", 42, 15, "yes", 4, 17, 6, 5),
(0, "female", 47, 15, "yes", 4, 17, 5, 5),
(0, "male", 22, 1.5, "no", 4, 16, 5, 4),
(0, "female", 32, 10, "yes", 3, 12, 1, 4),
(0, "female", 22, 7, "yes", 1, 14, 3, 5),
(0, "female", 32, 10, "yes", 4, 17, 5, 4),
(0, "male", 27, 1.5, "yes", 2, 16, 2, 4),
(0, "male", 37, 15, "yes", 4, 14, 5, 5),
(0, "male", 42, 4, "yes", 3, 14, 4, 5),
(0, "female", 37, 15, "yes", 5, 14, 5, 4),
(0, "female", 32, 7, "yes", 4, 17, 5, 5),
(0, "female", 42, 15, "yes", 4, 18, 6, 5),
(0, "male", 27, 4, "no", 4, 18, 6, 4),
(0, "male", 22, 0.75, "no", 4, 18, 6, 5),
(0, "male", 27, 4, "yes", 4, 14, 5, 3),
(0, "female", 22, 0.75, "no", 5, 18, 1, 5),
(0, "female", 52, 15, "yes", 5, 9, 5, 5),
(0, "male", 32, 10, "yes", 3, 14, 5, 5),
(0, "female", 37, 15, "yes", 4, 16, 4, 4),
(0, "male", 32, 7, "yes", 2, 20, 5, 4),
(0, "female", 42, 15, "yes", 3, 18, 1, 4),
(0, "male", 32, 15, "yes", 1, 16, 5, 5),
(0, "male", 27, 4, "yes", 3, 18, 5, 5),
(0, "female", 32, 15, "yes", 4, 12, 3, 4),
(0, "male", 22, 0.75, "yes", 3, 14, 2, 4),
(0, "female", 22, 1.5, "no", 3, 16, 5, 3),
(0, "female", 42, 15, "yes", 4, 14, 3, 5),
(0, "female", 52, 15, "yes", 3, 16, 5, 4),
(0, "male", 37, 15, "yes", 5, 20, 6, 4),
(0, "female", 47, 15, "yes", 4, 12, 2, 3),
(0, "male", 57, 15, "yes", 2, 20, 6, 4),
(0, "male", 32, 7, "yes", 4, 17, 5, 5),
(0, "female", 27, 7, "yes", 4, 17, 1, 4),
(0, "male", 22, 1.5, "no", 1, 18, 6, 5),
(0, "female", 22, 4, "yes", 3, 9, 1, 4),
(0, "female", 22, 1.5, "no", 2, 14, 1, 5),
(0, "male", 42, 15, "yes", 2, 20, 6, 4),
(0, "male", 57, 15, "yes", 4, 9, 2, 4),
(0, "female", 27, 7, "yes", 2, 18, 1, 5),
(0, "female", 22, 4, "yes", 3, 14, 1, 5),
(0, "male", 37, 15, "yes", 4, 14, 5, 3),
(0, "male", 32, 7, "yes", 1, 18, 6, 4),
(0, "female", 22, 1.5, "no", 2, 14, 5, 5),
(0, "female", 22, 1.5, "yes", 3, 12, 1, 3),
(0, "male", 52, 15, "yes", 2, 14, 5, 5),
(0, "female", 37, 15, "yes", 2, 14, 1, 1),
(0, "female", 32, 10, "yes", 2, 14, 5, 5),
(0, "male", 42, 15, "yes", 4, 20, 4, 5),
(0, "female", 27, 4, "yes", 3, 18, 4, 5),
(0, "male", 37, 15, "yes", 4, 20, 6, 5),
(0, "male", 27, 1.5, "no", 3, 18, 5, 5),
(0, "female", 22, 0.125, "no", 2, 16, 6, 3),
(0, "male", 32, 10, "yes", 2, 20, 6, 3),
(0, "female", 27, 4, "no", 4, 18, 5, 4),
(0, "female", 27, 7, "yes", 2, 12, 5, 1),
(0, "male", 32, 4, "yes", 5, 18, 6, 3),
(0, "female", 37, 15, "yes", 2, 17, 5, 5),
(0, "male", 47, 15, "no", 4, 20, 6, 4),
(0, "male", 27, 1.5, "no", 1, 18, 5, 5),
(0, "male", 37, 15, "yes", 4, 20, 6, 4),
(0, "female", 32, 15, "yes", 4, 18, 1, 4),
(0, "female", 32, 7, "yes", 4, 17, 5, 4),
(0, "female", 42, 15, "yes", 3, 14, 1, 3),
(0, "female", 27, 7, "yes", 3, 16, 1, 4),
(0, "male", 27, 1.5, "no", 3, 16, 4, 2),
(0, "male", 22, 1.5, "no", 3, 16, 3, 5),
(0, "male", 27, 4, "yes", 3, 16, 4, 2),
(0, "female", 27, 7, "yes", 3, 12, 1, 2),
(0, "female", 37, 15, "yes", 2, 18, 5, 4),
(0, "female", 37, 7, "yes", 3, 14, 4, 4),
(0, "male", 22, 1.5, "no", 2, 16, 5, 5),
(0, "male", 37, 15, "yes", 5, 20, 5, 4),
(0, "female", 22, 1.5, "no", 4, 16, 5, 3),
(0, "female", 32, 10, "yes", 4, 16, 1, 5),
(0, "male", 27, 4, "no", 2, 17, 5, 3),
(0, "female", 22, 0.417, "no", 4, 14, 5, 5),
(0, "female", 27, 4, "no", 2, 18, 5, 5),
(0, "male", 37, 15, "yes", 4, 18, 5, 3),
(0, "male", 37, 10, "yes", 5, 20, 7, 4),
(0, "female", 27, 7, "yes", 2, 14, 4, 2),
(0, "male", 32, 4, "yes", 2, 16, 5, 5),
(0, "male", 32, 4, "yes", 2, 16, 6, 4),
(0, "male", 22, 1.5, "no", 3, 18, 4, 5),
(0, "female", 22, 4, "yes", 4, 14, 3, 4),
(0, "female", 17.5, 0.75, "no", 2, 18, 5, 4),
(0, "male", 32, 10, "yes", 4, 20, 4, 5),
(0, "female", 32, 0.75, "no", 5, 14, 3, 3),
(0, "male", 37, 15, "yes", 4, 17, 5, 3),
(0, "male", 32, 4, "no", 3, 14, 4, 5),
(0, "female", 27, 1.5, "no", 2, 17, 3, 2),
(0, "female", 22, 7, "yes", 4, 14, 1, 5),
(0, "male", 47, 15, "yes", 5, 14, 6, 5),
(0, "male", 27, 4, "yes", 1, 16, 4, 4),
(0, "female", 37, 15, "yes", 5, 14, 1, 3),
(0, "male", 42, 4, "yes", 4, 18, 5, 5),
(0, "female", 32, 4, "yes", 2, 14, 1, 5),
(0, "male", 52, 15, "yes", 2, 14, 7, 4),
(0, "female", 22, 1.5, "no", 2, 16, 1, 4),
(0, "male", 52, 15, "yes", 4, 12, 2, 4),
(0, "female", 22, 0.417, "no", 3, 17, 1, 5),
(0, "female", 22, 1.5, "no", 2, 16, 5, 5),
(0, "male", 27, 4, "yes", 4, 20, 6, 4),
(0, "female", 32, 15, "yes", 4, 14, 1, 5),
(0, "female", 27, 1.5, "no", 2, 16, 3, 5),
(0, "male", 32, 4, "no", 1, 20, 6, 5),
(0, "male", 37, 15, "yes", 3, 20, 6, 4),
(0, "female", 32, 10, "no", 2, 16, 6, 5),
(0, "female", 32, 10, "yes", 5, 14, 5, 5),
(0, "male", 37, 1.5, "yes", 4, 18, 5, 3),
(0, "male", 32, 1.5, "no", 2, 18, 4, 4),
(0, "female", 32, 10, "yes", 4, 14, 1, 4),
(0, "female", 47, 15, "yes", 4, 18, 5, 4),
(0, "female", 27, 10, "yes", 5, 12, 1, 5),
(0, "male", 27, 4, "yes", 3, 16, 4, 5),
(0, "female", 37, 15, "yes", 4, 12, 4, 2),
(0, "female", 27, 0.75, "no", 4, 16, 5, 5),
(0, "female", 37, 15, "yes", 4, 16, 1, 5),
(0, "female", 32, 15, "yes", 3, 16, 1, 5),
(0, "female", 27, 10, "yes", 2, 16, 1, 5),
(0, "male", 27, 7, "no", 2, 20, 6, 5),
(0, "female", 37, 15, "yes", 2, 14, 1, 3),
(0, "male", 27, 1.5, "yes", 2, 17, 4, 4),
(0, "female", 22, 0.75, "yes", 2, 14, 1, 5),
(0, "male", 22, 4, "yes", 4, 14, 2, 4),
(0, "male", 42, 0.125, "no", 4, 17, 6, 4),
(0, "male", 27, 1.5, "yes", 4, 18, 6, 5),
(0, "male", 27, 7, "yes", 3, 16, 6, 3),
(0, "female", 52, 15, "yes", 4, 14, 1, 3),
(0, "male", 27, 1.5, "no", 5, 20, 5, 2),
(0, "female", 27, 1.5, "no", 2, 16, 5, 5),
(0, "female", 27, 1.5, "no", 3, 17, 5, 5),
(0, "male", 22, 0.125, "no", 5, 16, 4, 4),
(0, "female", 27, 4, "yes", 4, 16, 1, 5),
(0, "female", 27, 4, "yes", 4, 12, 1, 5),
(0, "female", 47, 15, "yes", 2, 14, 5, 5),
(0, "female", 32, 15, "yes", 3, 14, 5, 3),
(0, "male", 42, 7, "yes", 2, 16, 5, 5),
(0, "male", 22, 0.75, "no", 4, 16, 6, 4),
(0, "male", 27, 0.125, "no", 3, 20, 6, 5),
(0, "male", 32, 10, "yes", 3, 20, 6, 5),
(0, "female", 22, 0.417, "no", 5, 14, 4, 5),
(0, "female", 47, 15, "yes", 5, 14, 1, 4),
(0, "female", 32, 10, "yes", 3, 14, 1, 5),
(0, "male", 57, 15, "yes", 4, 17, 5, 5),
(0, "male", 27, 4, "yes", 3, 20, 6, 5),
(0, "female", 32, 7, "yes", 4, 17, 1, 5),
(0, "female", 37, 10, "yes", 4, 16, 1, 5),
(0, "female", 32, 10, "yes", 1, 18, 1, 4),
(0, "female", 22, 4, "no", 3, 14, 1, 4),
(0, "female", 27, 7, "yes", 4, 14, 3, 2),
(0, "male", 57, 15, "yes", 5, 18, 5, 2),
(0, "male", 32, 7, "yes", 2, 18, 5, 5),
(0, "female", 27, 1.5, "no", 4, 17, 1, 3),
(0, "male", 22, 1.5, "no", 4, 14, 5, 5),
(0, "female", 22, 1.5, "yes", 4, 14, 5, 4),
(0, "female", 32, 7, "yes", 3, 16, 1, 5),
(0, "female", 47, 15, "yes", 3, 16, 5, 4),
(0, "female", 22, 0.75, "no", 3, 16, 1, 5),
(0, "female", 22, 1.5, "yes", 2, 14, 5, 5),
(0, "female", 27, 4, "yes", 1, 16, 5, 5),
(0, "male", 52, 15, "yes", 4, 16, 5, 5),
(0, "male", 32, 10, "yes", 4, 20, 6, 5),
(0, "male", 47, 15, "yes", 4, 16, 6, 4),
(0, "female", 27, 7, "yes", 2, 14, 1, 2),
(0, "female", 22, 1.5, "no", 4, 14, 4, 5),
(0, "female", 32, 10, "yes", 2, 16, 5, 4),
(0, "female", 22, 0.75, "no", 2, 16, 5, 4),
(0, "female", 22, 1.5, "no", 2, 16, 5, 5),
(0, "female", 42, 15, "yes", 3, 18, 6, 4),
(0, "female", 27, 7, "yes", 5, 14, 4, 5),
(0, "male", 42, 15, "yes", 4, 16, 4, 4),
(0, "female", 57, 15, "yes", 3, 18, 5, 2),
(0, "male", 42, 15, "yes", 3, 18, 6, 2),
(0, "female", 32, 7, "yes", 2, 14, 1, 2),
(0, "male", 22, 4, "no", 5, 12, 4, 5),
(0, "female", 22, 1.5, "no", 1, 16, 6, 5),
(0, "female", 22, 0.75, "no", 1, 14, 4, 5),
(0, "female", 32, 15, "yes", 4, 12, 1, 5),
(0, "male", 22, 1.5, "no", 2, 18, 5, 3),
(0, "male", 27, 4, "yes", 5, 17, 2, 5),
(0, "female", 27, 4, "yes", 4, 12, 1, 5),
(0, "male", 42, 15, "yes", 5, 18, 5, 4),
(0, "male", 32, 1.5, "no", 2, 20, 7, 3),
(0, "male", 57, 15, "no", 4, 9, 3, 1),
(0, "male", 37, 7, "no", 4, 18, 5, 5),
(0, "male", 52, 15, "yes", 2, 17, 5, 4),
(0, "male", 47, 15, "yes", 4, 17, 6, 5),
(0, "female", 27, 7, "no", 2, 17, 5, 4),
(0, "female", 27, 7, "yes", 4, 14, 5, 5),
(0, "female", 22, 4, "no", 2, 14, 3, 3),
(0, "male", 37, 7, "yes", 2, 20, 6, 5),
(0, "male", 27, 7, "no", 4, 12, 4, 3),
(0, "male", 42, 10, "yes", 4, 18, 6, 4),
(0, "female", 22, 1.5, "no", 3, 14, 1, 5),
(0, "female", 22, 4, "yes", 2, 14, 1, 3),
(0, "female", 57, 15, "no", 4, 20, 6, 5),
(0, "male", 37, 15, "yes", 4, 14, 4, 3),
(0, "female", 27, 7, "yes", 3, 18, 5, 5),
(0, "female", 17.5, 10, "no", 4, 14, 4, 5),
(0, "male", 22, 4, "yes", 4, 16, 5, 5),
(0, "female", 27, 4, "yes", 2, 16, 1, 4),
(0, "female", 37, 15, "yes", 2, 14, 5, 1),
(0, "female", 22, 1.5, "no", 5, 14, 1, 4),
(0, "male", 27, 7, "yes", 2, 20, 5, 4),
(0, "male", 27, 4, "yes", 4, 14, 5, 5),
(0, "male", 22, 0.125, "no", 1, 16, 3, 5),
(0, "female", 27, 7, "yes", 4, 14, 1, 4),
(0, "female", 32, 15, "yes", 5, 16, 5, 3),
(0, "male", 32, 10, "yes", 4, 18, 5, 4),
(0, "female", 32, 15, "yes", 2, 14, 3, 4),
(0, "female", 22, 1.5, "no", 3, 17, 5, 5),
(0, "male", 27, 4, "yes", 4, 17, 4, 4),
(0, "female", 52, 15, "yes", 5, 14, 1, 5),
(0, "female", 27, 7, "yes", 2, 12, 1, 2),
(0, "female", 27, 7, "yes", 3, 12, 1, 4),
(0, "female", 42, 15, "yes", 2, 14, 1, 4),
(0, "female", 42, 15, "yes", 4, 14, 5, 4),
(0, "male", 27, 7, "yes", 4, 14, 3, 3),
(0, "male", 27, 7, "yes", 2, 20, 6, 2),
(0, "female", 42, 15, "yes", 3, 12, 3, 3),
(0, "male", 27, 4, "yes", 3, 16, 3, 5),
(0, "female", 27, 7, "yes", 3, 14, 1, 4),
(0, "female", 22, 1.5, "no", 2, 14, 4, 5),
(0, "female", 27, 4, "yes", 4, 14, 1, 4),
(0, "female", 22, 4, "no", 4, 14, 5, 5),
(0, "female", 22, 1.5, "no", 2, 16, 4, 5),
(0, "male", 47, 15, "no", 4, 14, 5, 4),
(0, "male", 37, 10, "yes", 2, 18, 6, 2),
(0, "male", 37, 15, "yes", 3, 17, 5, 4),
(0, "female", 27, 4, "yes", 2, 16, 1, 4),
(3, "male", 27, 1.5, "no", 3, 18, 4, 4),
(3, "female", 27, 4, "yes", 3, 17, 1, 5),
(7, "male", 37, 15, "yes", 5, 18, 6, 2),
(12, "female", 32, 10, "yes", 3, 17, 5, 2),
(1, "male", 22, 0.125, "no", 4, 16, 5, 5),
(1, "female", 22, 1.5, "yes", 2, 14, 1, 5),
(12, "male", 37, 15, "yes", 4, 14, 5, 2),
(7, "female", 22, 1.5, "no", 2, 14, 3, 4),
(2, "male", 37, 15, "yes", 2, 18, 6, 4),
(3, "female", 32, 15, "yes", 4, 12, 3, 2),
(1, "female", 37, 15, "yes", 4, 14, 4, 2),
(7, "female", 42, 15, "yes", 3, 17, 1, 4),
(12, "female", 42, 15, "yes", 5, 9, 4, 1),
(12, "male", 37, 10, "yes", 2, 20, 6, 2),
(12, "female", 32, 15, "yes", 3, 14, 1, 2),
(3, "male", 27, 4, "no", 1, 18, 6, 5),
(7, "male", 37, 10, "yes", 2, 18, 7, 3),
(7, "female", 27, 4, "no", 3, 17, 5, 5),
(1, "male", 42, 15, "yes", 4, 16, 5, 5),
(1, "female", 47, 15, "yes", 5, 14, 4, 5),
(7, "female", 27, 4, "yes", 3, 18, 5, 4),
(1, "female", 27, 7, "yes", 5, 14, 1, 4),
(12, "male", 27, 1.5, "yes", 3, 17, 5, 4),
(12, "female", 27, 7, "yes", 4, 14, 6, 2),
(3, "female", 42, 15, "yes", 4, 16, 5, 4),
(7, "female", 27, 10, "yes", 4, 12, 7, 3),
(1, "male", 27, 1.5, "no", 2, 18, 5, 2),
(1, "male", 32, 4, "no", 4, 20, 6, 4),
(1, "female", 27, 7, "yes", 3, 14, 1, 3),
(3, "female", 32, 10, "yes", 4, 14, 1, 4),
(3, "male", 27, 4, "yes", 2, 18, 7, 2),
(1, "female", 17.5, 0.75, "no", 5, 14, 4, 5),
(1, "female", 32, 10, "yes", 4, 18, 1, 5),
(7, "female", 32, 7, "yes", 2, 17, 6, 4),
(7, "male", 37, 15, "yes", 2, 20, 6, 4),
(7, "female", 37, 10, "no", 1, 20, 5, 3),
(12, "female", 32, 10, "yes", 2, 16, 5, 5),
(7, "male", 52, 15, "yes", 2, 20, 6, 4),
(7, "female", 42, 15, "yes", 1, 12, 1, 3),
(1, "male", 52, 15, "yes", 2, 20, 6, 3),
(2, "male", 37, 15, "yes", 3, 18, 6, 5),
(12, "female", 22, 4, "no", 3, 12, 3, 4),
(12, "male", 27, 7, "yes", 1, 18, 6, 2),
(1, "male", 27, 4, "yes", 3, 18, 5, 5),
(12, "male", 47, 15, "yes", 4, 17, 6, 5),
(12, "female", 42, 15, "yes", 4, 12, 1, 1),
(7, "male", 27, 4, "no", 3, 14, 3, 4),
(7, "female", 32, 7, "yes", 4, 18, 4, 5),
(1, "male", 32, 0.417, "yes", 3, 12, 3, 4),
(3, "male", 47, 15, "yes", 5, 16, 5, 4),
(12, "male", 37, 15, "yes", 2, 20, 5, 4),
(7, "male", 22, 4, "yes", 2, 17, 6, 4),
(1, "male", 27, 4, "no", 2, 14, 4, 5),
(7, "female", 52, 15, "yes", 5, 16, 1, 3),
(1, "male", 27, 4, "no", 3, 14, 3, 3),
(1, "female", 27, 10, "yes", 4, 16, 1, 4),
(1, "male", 32, 7, "yes", 3, 14, 7, 4),
(7, "male", 32, 7, "yes", 2, 18, 4, 1),
(3, "male", 22, 1.5, "no", 1, 14, 3, 2),
(7, "male", 22, 4, "yes", 3, 18, 6, 4),
(7, "male", 42, 15, "yes", 4, 20, 6, 4),
(2, "female", 57, 15, "yes", 1, 18, 5, 4),
(7, "female", 32, 4, "yes", 3, 18, 5, 2),
(1, "male", 27, 4, "yes", 1, 16, 4, 4),
(7, "male", 32, 7, "yes", 4, 16, 1, 4),
(2, "male", 57, 15, "yes", 1, 17, 4, 4),
(7, "female", 42, 15, "yes", 4, 14, 5, 2),
(7, "male", 37, 10, "yes", 1, 18, 5, 3),
(3, "male", 42, 15, "yes", 3, 17, 6, 1),
(1, "female", 52, 15, "yes", 3, 14, 4, 4),
(2, "female", 27, 7, "yes", 3, 17, 5, 3),
(12, "male", 32, 7, "yes", 2, 12, 4, 2),
(1, "male", 22, 4, "no", 4, 14, 2, 5),
(3, "male", 27, 7, "yes", 3, 18, 6, 4),
(12, "female", 37, 15, "yes", 1, 18, 5, 5),
(7, "female", 32, 15, "yes", 3, 17, 1, 3),
(7, "female", 27, 7, "no", 2, 17, 5, 5),
(1, "female", 32, 7, "yes", 3, 17, 5, 3),
(1, "male", 32, 1.5, "yes", 2, 14, 2, 4),
(12, "female", 42, 15, "yes", 4, 14, 1, 2),
(7, "male", 32, 10, "yes", 3, 14, 5, 4),
(7, "male", 37, 4, "yes", 1, 20, 6, 3),
(1, "female", 27, 4, "yes", 2, 16, 5, 3),
(12, "female", 42, 15, "yes", 3, 14, 4, 3),
(1, "male", 27, 10, "yes", 5, 20, 6, 5),
(12, "male", 37, 10, "yes", 2, 20, 6, 2),
(12, "female", 27, 7, "yes", 1, 14, 3, 3),
(3, "female", 27, 7, "yes", 4, 12, 1, 2),
(3, "male", 32, 10, "yes", 2, 14, 4, 4),
(12, "female", 17.5, 0.75, "yes", 2, 12, 1, 3),
(12, "female", 32, 15, "yes", 3, 18, 5, 4),
(2, "female", 22, 7, "no", 4, 14, 4, 3),
(1, "male", 32, 7, "yes", 4, 20, 6, 5),
(7, "male", 27, 4, "yes", 2, 18, 6, 2),
(1, "female", 22, 1.5, "yes", 5, 14, 5, 3),
(12, "female", 32, 15, "no", 3, 17, 5, 1),
(12, "female", 42, 15, "yes", 2, 12, 1, 2),
(7, "male", 42, 15, "yes", 3, 20, 5, 4),
(12, "male", 32, 10, "no", 2, 18, 4, 2),
(12, "female", 32, 15, "yes", 3, 9, 1, 1),
(7, "male", 57, 15, "yes", 5, 20, 4, 5),
(12, "male", 47, 15, "yes", 4, 20, 6, 4),
(2, "female", 42, 15, "yes", 2, 17, 6, 3),
(12, "male", 37, 15, "yes", 3, 17, 6, 3),
(12, "male", 37, 15, "yes", 5, 17, 5, 2),
(7, "male", 27, 10, "yes", 2, 20, 6, 4),
(2, "male", 37, 15, "yes", 2, 16, 5, 4),
(12, "female", 32, 15, "yes", 1, 14, 5, 2),
(7, "male", 32, 10, "yes", 3, 17, 6, 3),
(2, "male", 37, 15, "yes", 4, 18, 5, 1),
(7, "female", 27, 1.5, "no", 2, 17, 5, 5),
(3, "female", 47, 15, "yes", 2, 17, 5, 2),
(12, "male", 37, 15, "yes", 2, 17, 5, 4),
(12, "female", 27, 4, "no", 2, 14, 5, 5),
(2, "female", 27, 10, "yes", 4, 14, 1, 5),
(1, "female", 22, 4, "yes", 3, 16, 1, 3),
(12, "male", 52, 7, "no", 4, 16, 5, 5),
(2, "female", 27, 4, "yes", 1, 16, 3, 5),
(7, "female", 37, 15, "yes", 2, 17, 6, 4),
(2, "female", 27, 4, "no", 1, 17, 3, 1),
(12, "female", 17.5, 0.75, "yes", 2, 12, 3, 5),
(7, "female", 32, 15, "yes", 5, 18, 5, 4),
(7, "female", 22, 4, "no", 1, 16, 3, 5),
(2, "male", 32, 4, "yes", 4, 18, 6, 4),
(1, "female", 22, 1.5, "yes", 3, 18, 5, 2),
(3, "female", 42, 15, "yes", 2, 17, 5, 4),
(1, "male", 32, 7, "yes", 4, 16, 4, 4),
(12, "male", 37, 15, "no", 3, 14, 6, 2),
(1, "male", 42, 15, "yes", 3, 16, 6, 3),
(1, "male", 27, 4, "yes", 1, 18, 5, 4),
(2, "male", 37, 15, "yes", 4, 20, 7, 3),
(7, "male", 37, 15, "yes", 3, 20, 6, 4),
(3, "male", 22, 1.5, "no", 2, 12, 3, 3),
(3, "male", 32, 4, "yes", 3, 20, 6, 2),
(2, "male", 32, 15, "yes", 5, 20, 6, 5),
(12, "female", 52, 15, "yes", 1, 18, 5, 5),
(12, "male", 47, 15, "no", 1, 18, 6, 5),
(3, "female", 32, 15, "yes", 4, 16, 4, 4),
(7, "female", 32, 15, "yes", 3, 14, 3, 2),
(7, "female", 27, 7, "yes", 4, 16, 1, 2),
(12, "male", 42, 15, "yes", 3, 18, 6, 2),
(7, "female", 42, 15, "yes", 2, 14, 3, 2),
(12, "male", 27, 7, "yes", 2, 17, 5, 4),
(3, "male", 32, 10, "yes", 4, 14, 4, 3),
(7, "male", 47, 15, "yes", 3, 16, 4, 2),
(1, "male", 22, 1.5, "yes", 1, 12, 2, 5),
(7, "female", 32, 10, "yes", 2, 18, 5, 4),
(2, "male", 32, 10, "yes", 2, 17, 6, 5),
(2, "male", 22, 7, "yes", 3, 18, 6, 2),
(1, "female", 32, 15, "yes", 3, 14, 1, 5)) val data = dataList.toDF("affairs", "gender", "age", "yearsmarried", "children", "religiousness", "education", "occupation", "rating")

随机森林建模

data.createOrReplaceTempView("data") 

// 字符类型转换成数值
val labelWhere = "case when affairs=0 then 0 else cast(1 as double) end as label"
val genderWhere = "case when gender='female' then 0 else cast(1 as double) end as gender"
val childrenWhere = "case when children='no' then 0 else cast(1 as double) end as children" val dataLabelDF = spark.sql(s"select $labelWhere, $genderWhere,age,yearsmarried,$childrenWhere,religiousness,education,occupation,rating from data") val featuresArray = Array("gender", "age", "yearsmarried", "children", "religiousness", "education", "occupation", "rating") // 字段转换成特征向量
val assembler = new VectorAssembler().setInputCols(featuresArray).setOutputCol("features")
val vecDF: DataFrame = assembler.transform(dataLabelDF)
vecDF.show(10, truncate = false) // 将数据分为训练和测试集(30%进行测试)
val Array(trainingDF, testDF) = vecDF.randomSplit(Array(0.7, 0.3)) // 索引标签,将元数据添加到标签列中
val labelIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(vecDF)
//labelIndexer.transform(vecDF).show(10, truncate = false) // 自动识别分类的特征,并对它们进行索引
// 具有大于5个不同的值的特征被视为连续。
val featureIndexer = new VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").setMaxCategories(5).fit(vecDF)
//featureIndexer.transform(vecDF).show(10, truncate = false) // 训练随机森林模型
val rf = new RandomForestClassifier().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setNumTrees(10) // 将索引标签转换回原始标签
val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels) // Chain indexers and forest in a Pipeline.
val pipeline = new Pipeline().setStages(Array(labelIndexer, featureIndexer, rf, labelConverter)) // Train model. This also runs the indexers.
val model = pipeline.fit(trainingDF) // 输出随机森林模型的全部参数值
model.stages(2).extractParamMap() // 作出预测
val predictions = model.transform(testDF) // Select example rows to display.
predictions.select("predictedLabel", "label", "features").show(10, false) // 选择(预测标签,实际标签),并计算测试误差
val evaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("accuracy")
val accuracy = evaluator.evaluate(predictions)
println("Test Error = " + (1.0 - accuracy)) // 这里的stages(2)中的“2”对应pipeline中的“rf”,将model强制转换为RandomForestClassificationModel类型
val rfModel = model.stages(2).asInstanceOf[RandomForestClassificationModel]
println("Learned classification forest model:\n" + rfModel.toDebugString)

代码执行结果

vecDF.show(10, truncate = false)
+-----+------+----+------------+--------+-------------+---------+----------+------+------------------------------------+
|label|gender|age |yearsmarried|children|religiousness|education|occupation|rating|features |
+-----+------+----+------------+--------+-------------+---------+----------+------+------------------------------------+
|0.0 |1.0 |37.0|10.0 |0.0 |3.0 |18.0 |7.0 |4.0 |[1.0,37.0,10.0,0.0,3.0,18.0,7.0,4.0]|
|0.0 |0.0 |27.0|4.0 |0.0 |4.0 |14.0 |6.0 |4.0 |[0.0,27.0,4.0,0.0,4.0,14.0,6.0,4.0] |
|0.0 |0.0 |32.0|15.0 |1.0 |1.0 |12.0 |1.0 |4.0 |[0.0,32.0,15.0,1.0,1.0,12.0,1.0,4.0]|
|0.0 |1.0 |57.0|15.0 |1.0 |5.0 |18.0 |6.0 |5.0 |[1.0,57.0,15.0,1.0,5.0,18.0,6.0,5.0]|
|0.0 |1.0 |22.0|0.75 |0.0 |2.0 |17.0 |6.0 |3.0 |[1.0,22.0,0.75,0.0,2.0,17.0,6.0,3.0]|
|0.0 |0.0 |32.0|1.5 |0.0 |2.0 |17.0 |5.0 |5.0 |[0.0,32.0,1.5,0.0,2.0,17.0,5.0,5.0] |
|0.0 |0.0 |22.0|0.75 |0.0 |2.0 |12.0 |1.0 |3.0 |[0.0,22.0,0.75,0.0,2.0,12.0,1.0,3.0]|
|0.0 |1.0 |57.0|15.0 |1.0 |2.0 |14.0 |4.0 |4.0 |[1.0,57.0,15.0,1.0,2.0,14.0,4.0,4.0]|
|0.0 |0.0 |32.0|15.0 |1.0 |4.0 |16.0 |1.0 |2.0 |[0.0,32.0,15.0,1.0,4.0,16.0,1.0,2.0]|
|0.0 |1.0 |22.0|1.5 |0.0 |4.0 |14.0 |4.0 |5.0 |[1.0,22.0,1.5,0.0,4.0,14.0,4.0,5.0] |
+-----+------+----+------------+--------+-------------+---------+----------+------+------------------------------------+
only showing top 10 rows // 将数据分为训练和测试集(30%进行测试)
val Array(trainingDF, testDF) = vecDF.randomSplit(Array(0.7, 0.3))
trainingDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: double, gender: double ... 8 more fields]
testDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: double, gender: double ... 8 more fields] // 索引标签,将元数据添加到标签列中
val labelIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(vecDF)
labelIndexer: org.apache.spark.ml.feature.StringIndexerModel = strIdx_37df210602df
//labelIndexer.transform(vecDF).show(10, truncate = false) // 自动识别分类的特征,并对它们进行索引
// 具有大于5个不同的值的特征被视为连续。
val featureIndexer = new VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").setMaxCategories(5).fit(vecDF)
featureIndexer: org.apache.spark.ml.feature.VectorIndexerModel = vecIdx_9595c228f520
//featureIndexer.transform(vecDF).show(10, truncate = false) // 训练随机森林模型
val rf = new RandomForestClassifier().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setNumTrees(10)
rf: org.apache.spark.ml.classification.RandomForestClassifier = rfc_d0e7623d0b10 // 将索引标签转换回原始标签
val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
labelConverter: org.apache.spark.ml.feature.IndexToString = idxToStr_32d6938f2c94 // Chain indexers and forest in a Pipeline.
val pipeline = new Pipeline().setStages(Array(labelIndexer, featureIndexer, rf, labelConverter))
pipeline: org.apache.spark.ml.Pipeline = pipeline_97716da42fed // Train model. This also runs the indexers.
val model = pipeline.fit(trainingDF)
model: org.apache.spark.ml.PipelineModel = pipeline_97716da42fed // 输出随机森林模型的全部参数值
model.stages(2).extractParamMap()
res10: org.apache.spark.ml.param.ParamMap =
{
rfc_0d830180d598-cacheNodeIds: false,
rfc_0d830180d598-checkpointInterval: 10,
rfc_0d830180d598-featureSubsetStrategy: auto,
rfc_0d830180d598-featuresCol: indexedFeatures,
rfc_0d830180d598-impurity: gini,
rfc_0d830180d598-labelCol: indexedLabel,
rfc_0d830180d598-maxBins: 32,
rfc_0d830180d598-maxDepth: 5,
rfc_0d830180d598-maxMemoryInMB: 256,
rfc_0d830180d598-minInfoGain: 0.0,
rfc_0d830180d598-minInstancesPerNode: 1,
rfc_0d830180d598-predictionCol: prediction,
rfc_0d830180d598-probabilityCol: probability,
rfc_0d830180d598-rawPredictionCol: rawPrediction,
rfc_0d830180d598-seed: 207336481,
rfc_0d830180d598-subsamplingRate: 1.0
} // 作出预测
val predictions = model.transform(testDF)
predictions: org.apache.spark.sql.DataFrame = [label: double, gender: double ... 14 more fields] predictions.select("predictedLabel", "label", "features").show(10,false)
+--------------+-----+-------------------------------------+
|predictedLabel|label|features |
+--------------+-----+-------------------------------------+
|0.0 |0.0 |[0.0,22.0,0.125,0.0,4.0,12.0,4.0,5.0]|
|0.0 |0.0 |[0.0,22.0,0.125,0.0,4.0,14.0,4.0,5.0]|
|0.0 |0.0 |[0.0,22.0,0.417,0.0,1.0,17.0,6.0,4.0]|
|0.0 |0.0 |[0.0,22.0,0.417,0.0,4.0,14.0,5.0,5.0]|
|0.0 |0.0 |[0.0,22.0,0.417,1.0,3.0,14.0,3.0,5.0]|
|0.0 |0.0 |[0.0,22.0,0.75,0.0,5.0,18.0,1.0,5.0] |
|0.0 |0.0 |[0.0,22.0,1.5,0.0,1.0,14.0,1.0,5.0] |
|0.0 |0.0 |[0.0,22.0,1.5,0.0,4.0,16.0,5.0,3.0] |
|0.0 |0.0 |[0.0,22.0,1.5,0.0,4.0,17.0,5.0,5.0] |
|0.0 |0.0 |[0.0,22.0,1.5,1.0,3.0,12.0,1.0,3.0] |
+--------------+-----+-------------------------------------+
only showing top 10 rows // 选择(预测标签,实际标签),并计算测试误差
val evaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("accuracy")
evaluator: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = mcEval_13a195abc422 val accuracy = evaluator.evaluate(predictions)
accuracy: Double = 0.7365591397849462 println("Test Error = " + (1.0 - accuracy))
Test Error = 0.26344086021505375 // 这里的stages(2)中的“2”对应pipeline中的“rf”,将model强制转换为RandomForestClassificationModel类型
val rfModel = model.stages(2).asInstanceOf[RandomForestClassificationModel]
rfModel: org.apache.spark.ml.classification.RandomForestClassificationModel = RandomForestClassificationModel (uid=rfc_f7bb5e488533) with 10 trees println("Learned classification forest model:\n" + rfModel.toDebugString)
Learned classification forest model:
RandomForestClassificationModel (uid=rfc_f7bb5e488533) with 10 trees
Tree 0 (weight 1.0):
If (feature 2 <= 1.5)
If (feature 5 <= 12.0)
If (feature 6 <= 1.0)
Predict: 0.0
Else (feature 6 > 1.0)
If (feature 2 <= 0.125)
Predict: 0.0
Else (feature 2 > 0.125)
Predict: 1.0
Else (feature 5 > 12.0)
If (feature 0 in {0.0})
If (feature 5 <= 16.0)
Predict: 0.0
Else (feature 5 > 16.0)
If (feature 1 <= 22.0)
Predict: 0.0
Else (feature 1 > 22.0)
Predict: 0.0
Else (feature 0 not in {0.0})
If (feature 2 <= 0.75)
If (feature 4 in {0.0,1.0,2.0,4.0})
Predict: 0.0
Else (feature 4 not in {0.0,1.0,2.0,4.0})
Predict: 0.0
Else (feature 2 > 0.75)
If (feature 1 <= 22.0)
Predict: 0.0
Else (feature 1 > 22.0)
Predict: 1.0
Else (feature 2 > 1.5)
If (feature 1 <= 42.0)
If (feature 1 <= 27.0)
If (feature 5 <= 16.0)
If (feature 6 <= 5.0)
Predict: 0.0
Else (feature 6 > 5.0)
Predict: 1.0
Else (feature 5 > 16.0)
If (feature 4 in {3.0})
Predict: 0.0
Else (feature 4 not in {3.0})
Predict: 0.0
Else (feature 1 > 27.0)
If (feature 4 in {0.0,3.0,4.0})
If (feature 2 <= 4.0)
Predict: 1.0
Else (feature 2 > 4.0)
Predict: 0.0
Else (feature 4 not in {0.0,3.0,4.0})
If (feature 6 <= 4.0)
Predict: 0.0
Else (feature 6 > 4.0)
Predict: 1.0
Else (feature 1 > 42.0)
If (feature 4 in {2.0,4.0})
Predict: 0.0
Else (feature 4 not in {2.0,4.0})
If (feature 4 in {0.0})
Predict: 1.0
Else (feature 4 not in {0.0})
If (feature 3 in {0.0})
Predict: 0.0
Else (feature 3 not in {0.0})
Predict: 0.0
Tree 1 (weight 1.0):
If (feature 7 in {0.0,2.0,4.0})
If (feature 7 in {0.0})
If (feature 1 <= 42.0)
If (feature 4 in {1.0})
Predict: 0.0
Else (feature 4 not in {1.0})
Predict: 1.0
Else (feature 1 > 42.0)
Predict: 0.0
Else (feature 7 not in {0.0})
If (feature 1 <= 17.5)
If (feature 4 in {3.0})
Predict: 0.0
Else (feature 4 not in {3.0})
Predict: 1.0
Else (feature 1 > 17.5)
If (feature 0 in {0.0})
If (feature 4 in {1.0,3.0,4.0})
Predict: 0.0
Else (feature 4 not in {1.0,3.0,4.0})
Predict: 0.0
Else (feature 0 not in {0.0})
If (feature 6 <= 2.0)
Predict: 1.0
Else (feature 6 > 2.0)
Predict: 0.0
Else (feature 7 not in {0.0,2.0,4.0})
If (feature 3 in {0.0})
If (feature 5 <= 14.0)
If (feature 4 in {1.0,3.0})
Predict: 0.0
Else (feature 4 not in {1.0,3.0})
If (feature 0 in {0.0})
Predict: 0.0
Else (feature 0 not in {0.0})
Predict: 1.0
Else (feature 5 > 14.0)
If (feature 0 in {0.0})
Predict: 0.0
Else (feature 0 not in {0.0})
If (feature 4 in {0.0,2.0,3.0,4.0})
Predict: 0.0
Else (feature 4 not in {0.0,2.0,3.0,4.0})
Predict: 1.0
Else (feature 3 not in {0.0})
If (feature 5 <= 12.0)
If (feature 0 in {1.0})
Predict: 0.0
Else (feature 0 not in {1.0})
If (feature 6 <= 1.0)
Predict: 0.0
Else (feature 6 > 1.0)
Predict: 0.0
Else (feature 5 > 12.0)
If (feature 4 in {0.0,2.0,3.0,4.0})
If (feature 1 <= 47.0)
Predict: 0.0
Else (feature 1 > 47.0)
Predict: 1.0
Else (feature 4 not in {0.0,2.0,3.0,4.0})
If (feature 1 <= 22.0)
Predict: 1.0
Else (feature 1 > 22.0)
Predict: 0.0
Tree 2 (weight 1.0):
If (feature 7 in {0.0})
If (feature 4 in {1.0})
Predict: 0.0
Else (feature 4 not in {1.0})
If (feature 6 <= 5.0)
If (feature 1 <= 42.0)
Predict: 1.0
Else (feature 1 > 42.0)
Predict: 0.0
Else (feature 6 > 5.0)
Predict: 0.0
Else (feature 7 not in {0.0})
If (feature 5 <= 16.0)
If (feature 7 in {1.0})
If (feature 6 <= 4.0)
If (feature 2 <= 7.0)
Predict: 0.0
Else (feature 2 > 7.0)
Predict: 1.0
Else (feature 6 > 4.0)
Predict: 1.0
Else (feature 7 not in {1.0})
If (feature 3 in {1.0})
If (feature 1 <= 17.5)
Predict: 1.0
Else (feature 1 > 17.5)
Predict: 0.0
Else (feature 3 not in {1.0})
If (feature 0 in {0.0})
Predict: 0.0
Else (feature 0 not in {0.0})
Predict: 0.0
Else (feature 5 > 16.0)
If (feature 3 in {0.0})
If (feature 4 in {4.0})
Predict: 0.0
Else (feature 4 not in {4.0})
If (feature 5 <= 18.0)
Predict: 0.0
Else (feature 5 > 18.0)
Predict: 0.0
Else (feature 3 not in {0.0})
If (feature 4 in {0.0,3.0,4.0})
If (feature 7 in {2.0})
Predict: 0.0
Else (feature 7 not in {2.0})
Predict: 0.0
Else (feature 4 not in {0.0,3.0,4.0})
If (feature 6 <= 4.0)
Predict: 0.0
Else (feature 6 > 4.0)
Predict: 1.0
Tree 3 (weight 1.0):
If (feature 3 in {0.0})
If (feature 7 in {3.0})
Predict: 0.0
Else (feature 7 not in {3.0})
If (feature 2 <= 10.0)
If (feature 4 in {2.0,3.0,4.0})
If (feature 4 in {4.0})
Predict: 0.0
Else (feature 4 not in {4.0})
Predict: 0.0
Else (feature 4 not in {2.0,3.0,4.0})
If (feature 7 in {0.0,2.0,4.0})
Predict: 0.0
Else (feature 7 not in {0.0,2.0,4.0})
Predict: 1.0
Else (feature 2 > 10.0)
Predict: 1.0
Else (feature 3 not in {0.0})
If (feature 6 <= 2.0)
If (feature 5 <= 16.0)
If (feature 7 in {0.0,1.0,2.0,4.0})
If (feature 4 in {0.0,1.0,3.0,4.0})
Predict: 0.0
Else (feature 4 not in {0.0,1.0,3.0,4.0})
Predict: 1.0
Else (feature 7 not in {0.0,1.0,2.0,4.0})
If (feature 1 <= 22.0)
Predict: 0.0
Else (feature 1 > 22.0)
Predict: 0.0
Else (feature 5 > 16.0)
If (feature 7 in {0.0,1.0,3.0})
Predict: 0.0
Else (feature 7 not in {0.0,1.0,3.0})
Predict: 1.0
Else (feature 6 > 2.0)
If (feature 4 in {0.0,3.0,4.0})
If (feature 7 in {0.0,2.0,3.0,4.0})
If (feature 4 in {3.0,4.0})
Predict: 0.0
Else (feature 4 not in {3.0,4.0})
Predict: 0.0
Else (feature 7 not in {0.0,2.0,3.0,4.0})
If (feature 6 <= 4.0)
Predict: 0.0
Else (feature 6 > 4.0)
Predict: 1.0
Else (feature 4 not in {0.0,3.0,4.0})
If (feature 1 <= 22.0)
If (feature 5 <= 14.0)
Predict: 1.0
Else (feature 5 > 14.0)
Predict: 1.0
Else (feature 1 > 22.0)
If (feature 6 <= 6.0)
Predict: 0.0
Else (feature 6 > 6.0)
Predict: 1.0
Tree 4 (weight 1.0):
If (feature 7 in {0.0,2.0,4.0})
If (feature 7 in {0.0})
If (feature 6 <= 5.0)
If (feature 3 in {0.0})
Predict: 0.0
Else (feature 3 not in {0.0})
If (feature 4 in {2.0,4.0})
Predict: 1.0
Else (feature 4 not in {2.0,4.0})
Predict: 1.0
Else (feature 6 > 5.0)
Predict: 0.0
Else (feature 7 not in {0.0})
If (feature 2 <= 1.5)
If (feature 5 <= 12.0)
If (feature 2 <= 0.125)
Predict: 0.0
Else (feature 2 > 0.125)
Predict: 0.0
Else (feature 5 > 12.0)
If (feature 1 <= 17.5)
Predict: 1.0
Else (feature 1 > 17.5)
Predict: 0.0
Else (feature 2 > 1.5)
If (feature 2 <= 7.0)
If (feature 4 in {1.0,3.0,4.0})
Predict: 0.0
Else (feature 4 not in {1.0,3.0,4.0})
Predict: 0.0
Else (feature 2 > 7.0)
If (feature 5 <= 16.0)
Predict: 0.0
Else (feature 5 > 16.0)
Predict: 0.0
Else (feature 7 not in {0.0,2.0,4.0})
If (feature 5 <= 12.0)
Predict: 0.0
Else (feature 5 > 12.0)
If (feature 4 in {0.0,3.0,4.0})
If (feature 1 <= 47.0)
If (feature 1 <= 22.0)
Predict: 0.0
Else (feature 1 > 22.0)
Predict: 0.0
Else (feature 1 > 47.0)
Predict: 1.0
Else (feature 4 not in {0.0,3.0,4.0})
If (feature 1 <= 27.0)
If (feature 3 in {0.0})
Predict: 0.0
Else (feature 3 not in {0.0})
Predict: 0.0
Else (feature 1 > 27.0)
If (feature 5 <= 14.0)
Predict: 1.0
Else (feature 5 > 14.0)
Predict: 1.0
Tree 5 (weight 1.0):
If (feature 7 in {0.0})
If (feature 1 <= 42.0)
If (feature 6 <= 4.0)
Predict: 1.0
Else (feature 6 > 4.0)
If (feature 4 in {1.0})
Predict: 0.0
Else (feature 4 not in {1.0})
Predict: 1.0
Else (feature 1 > 42.0)
Predict: 0.0
Else (feature 7 not in {0.0})
If (feature 2 <= 1.5)
If (feature 4 in {0.0,2.0,3.0})
If (feature 1 <= 22.0)
If (feature 0 in {0.0})
Predict: 0.0
Else (feature 0 not in {0.0})
Predict: 0.0
Else (feature 1 > 22.0)
Predict: 0.0
Else (feature 4 not in {0.0,2.0,3.0})
If (feature 1 <= 17.5)
If (feature 6 <= 4.0)
Predict: 1.0
Else (feature 6 > 4.0)
Predict: 0.0
Else (feature 1 > 17.5)
If (feature 0 in {0.0})
Predict: 0.0
Else (feature 0 not in {0.0})
Predict: 0.0
Else (feature 2 > 1.5)
If (feature 6 <= 5.0)
If (feature 5 <= 17.0)
If (feature 7 in {2.0,4.0})
Predict: 0.0
Else (feature 7 not in {2.0,4.0})
Predict: 0.0
Else (feature 5 > 17.0)
If (feature 6 <= 1.0)
Predict: 0.0
Else (feature 6 > 1.0)
Predict: 0.0
Else (feature 6 > 5.0)
If (feature 4 in {0.0,3.0,4.0})
If (feature 7 in {3.0,4.0})
Predict: 0.0
Else (feature 7 not in {3.0,4.0})
Predict: 0.0
Else (feature 4 not in {0.0,3.0,4.0})
If (feature 6 <= 6.0)
Predict: 0.0
Else (feature 6 > 6.0)
Predict: 0.0
Tree 6 (weight 1.0):
If (feature 4 in {0.0,3.0,4.0})
If (feature 5 <= 12.0)
If (feature 7 in {1.0,2.0,3.0,4.0})
Predict: 0.0
Else (feature 7 not in {1.0,2.0,3.0,4.0})
If (feature 6 <= 3.0)
Predict: 0.0
Else (feature 6 > 3.0)
Predict: 1.0
Else (feature 5 > 12.0)
If (feature 7 in {0.0,1.0,2.0})
If (feature 6 <= 1.0)
If (feature 7 in {0.0,2.0})
Predict: 0.0
Else (feature 7 not in {0.0,2.0})
Predict: 0.0
Else (feature 6 > 1.0)
If (feature 1 <= 37.0)
Predict: 1.0
Else (feature 1 > 37.0)
Predict: 0.0
Else (feature 7 not in {0.0,1.0,2.0})
If (feature 1 <= 17.5)
If (feature 4 in {3.0})
Predict: 0.0
Else (feature 4 not in {3.0})
Predict: 1.0
Else (feature 1 > 17.5)
If (feature 6 <= 4.0)
Predict: 0.0
Else (feature 6 > 4.0)
Predict: 0.0
Else (feature 4 not in {0.0,3.0,4.0})
If (feature 7 in {0.0,4.0})
If (feature 5 <= 12.0)
If (feature 2 <= 0.125)
Predict: 0.0
Else (feature 2 > 0.125)
If (feature 1 <= 17.5)
Predict: 1.0
Else (feature 1 > 17.5)
Predict: 0.0
Else (feature 5 > 12.0)
If (feature 7 in {0.0})
If (feature 1 <= 42.0)
Predict: 1.0
Else (feature 1 > 42.0)
Predict: 0.0
Else (feature 7 not in {0.0})
If (feature 2 <= 1.5)
Predict: 0.0
Else (feature 2 > 1.5)
Predict: 0.0
Else (feature 7 not in {0.0,4.0})
If (feature 6 <= 4.0)
If (feature 7 in {3.0})
If (feature 0 in {0.0})
Predict: 0.0
Else (feature 0 not in {0.0})
Predict: 0.0
Else (feature 7 not in {3.0})
If (feature 5 <= 16.0)
Predict: 0.0
Else (feature 5 > 16.0)
Predict: 1.0
Else (feature 6 > 4.0)
If (feature 6 <= 6.0)
If (feature 3 in {0.0})
Predict: 0.0
Else (feature 3 not in {0.0})
Predict: 1.0
Else (feature 6 > 6.0)
If (feature 5 <= 18.0)
Predict: 1.0
Else (feature 5 > 18.0)
Predict: 0.0
Tree 7 (weight 1.0):
If (feature 7 in {0.0,2.0,4.0})
If (feature 2 <= 1.5)
If (feature 4 in {1.0,2.0,3.0})
If (feature 1 <= 17.5)
Predict: 1.0
Else (feature 1 > 17.5)
Predict: 0.0
Else (feature 4 not in {1.0,2.0,3.0})
If (feature 5 <= 14.0)
If (feature 0 in {0.0})
Predict: 0.0
Else (feature 0 not in {0.0})
Predict: 1.0
Else (feature 5 > 14.0)
Predict: 0.0
Else (feature 2 > 1.5)
If (feature 7 in {0.0,2.0})
If (feature 4 in {1.0,3.0,4.0})
If (feature 5 <= 16.0)
Predict: 0.0
Else (feature 5 > 16.0)
Predict: 0.0
Else (feature 4 not in {1.0,3.0,4.0})
If (feature 6 <= 5.0)
Predict: 1.0
Else (feature 6 > 5.0)
Predict: 0.0
Else (feature 7 not in {0.0,2.0})
If (feature 4 in {0.0,1.0,3.0})
If (feature 1 <= 42.0)
Predict: 0.0
Else (feature 1 > 42.0)
Predict: 0.0
Else (feature 4 not in {0.0,1.0,3.0})
If (feature 5 <= 16.0)
Predict: 0.0
Else (feature 5 > 16.0)
Predict: 0.0
Else (feature 7 not in {0.0,2.0,4.0})
If (feature 2 <= 0.75)
Predict: 0.0
Else (feature 2 > 0.75)
If (feature 4 in {4.0})
If (feature 6 <= 5.0)
If (feature 1 <= 37.0)
Predict: 1.0
Else (feature 1 > 37.0)
Predict: 0.0
Else (feature 6 > 5.0)
Predict: 0.0
Else (feature 4 not in {4.0})
If (feature 5 <= 12.0)
If (feature 1 <= 27.0)
Predict: 0.0
Else (feature 1 > 27.0)
Predict: 0.0
Else (feature 5 > 12.0)
If (feature 7 in {1.0})
Predict: 1.0
Else (feature 7 not in {1.0})
Predict: 0.0
Tree 8 (weight 1.0):
If (feature 5 <= 16.0)
If (feature 4 in {0.0,1.0})
If (feature 0 in {0.0})
If (feature 2 <= 0.75)
If (feature 1 <= 17.5)
Predict: 1.0
Else (feature 1 > 17.5)
Predict: 0.0
Else (feature 2 > 0.75)
If (feature 6 <= 4.0)
Predict: 0.0
Else (feature 6 > 4.0)
Predict: 0.0
Else (feature 0 not in {0.0})
If (feature 5 <= 12.0)
Predict: 1.0
Else (feature 5 > 12.0)
If (feature 7 in {2.0,4.0})
Predict: 0.0
Else (feature 7 not in {2.0,4.0})
Predict: 0.0
Else (feature 4 not in {0.0,1.0})
If (feature 7 in {0.0,2.0,3.0,4.0})
If (feature 1 <= 22.0)
If (feature 6 <= 3.0)
Predict: 0.0
Else (feature 6 > 3.0)
Predict: 0.0
Else (feature 1 > 22.0)
If (feature 6 <= 6.0)
Predict: 0.0
Else (feature 6 > 6.0)
Predict: 1.0
Else (feature 7 not in {0.0,2.0,3.0,4.0})
If (feature 1 <= 42.0)
If (feature 6 <= 4.0)
Predict: 0.0
Else (feature 6 > 4.0)
Predict: 1.0
Else (feature 1 > 42.0)
Predict: 0.0
Else (feature 5 > 16.0)
If (feature 5 <= 18.0)
If (feature 4 in {3.0})
If (feature 7 in {1.0,2.0,3.0})
Predict: 0.0
Else (feature 7 not in {1.0,2.0,3.0})
If (feature 6 <= 5.0)
Predict: 0.0
Else (feature 6 > 5.0)
Predict: 0.0
Else (feature 4 not in {3.0})
If (feature 2 <= 0.75)
Predict: 0.0
Else (feature 2 > 0.75)
If (feature 3 in {0.0})
Predict: 0.0
Else (feature 3 not in {0.0})
Predict: 1.0
Else (feature 5 > 18.0)
If (feature 1 <= 27.0)
If (feature 7 in {3.0})
If (feature 3 in {0.0})
Predict: 0.0
Else (feature 3 not in {0.0})
Predict: 1.0
Else (feature 7 not in {3.0})
If (feature 2 <= 4.0)
Predict: 0.0
Else (feature 2 > 4.0)
Predict: 1.0
Else (feature 1 > 27.0)
If (feature 6 <= 5.0)
If (feature 6 <= 4.0)
Predict: 0.0
Else (feature 6 > 4.0)
Predict: 0.0
Else (feature 6 > 5.0)
If (feature 4 in {3.0,4.0})
Predict: 0.0
Else (feature 4 not in {3.0,4.0})
Predict: 0.0
Tree 9 (weight 1.0):
If (feature 5 <= 16.0)
If (feature 6 <= 2.0)
If (feature 1 <= 42.0)
If (feature 6 <= 1.0)
If (feature 5 <= 9.0)
Predict: 1.0
Else (feature 5 > 9.0)
Predict: 0.0
Else (feature 6 > 1.0)
If (feature 1 <= 27.0)
Predict: 0.0
Else (feature 1 > 27.0)
Predict: 1.0
Else (feature 1 > 42.0)
Predict: 0.0
Else (feature 6 > 2.0)
If (feature 1 <= 27.0)
If (feature 5 <= 14.0)
If (feature 6 <= 3.0)
Predict: 0.0
Else (feature 6 > 3.0)
Predict: 0.0
Else (feature 5 > 14.0)
Predict: 0.0
Else (feature 1 > 27.0)
If (feature 4 in {1.0,2.0,4.0})
If (feature 5 <= 9.0)
Predict: 0.0
Else (feature 5 > 9.0)
Predict: 0.0
Else (feature 4 not in {1.0,2.0,4.0})
If (feature 7 in {2.0,3.0,4.0})
Predict: 0.0
Else (feature 7 not in {2.0,3.0,4.0})
Predict: 1.0
Else (feature 5 > 16.0)
If (feature 6 <= 4.0)
If (feature 4 in {3.0})
Predict: 0.0
Else (feature 4 not in {3.0})
If (feature 1 <= 42.0)
If (feature 3 in {0.0})
Predict: 0.0
Else (feature 3 not in {0.0})
Predict: 0.0
Else (feature 1 > 42.0)
Predict: 1.0
Else (feature 6 > 4.0)
If (feature 4 in {3.0,4.0})
If (feature 1 <= 37.0)
If (feature 3 in {0.0})
Predict: 0.0
Else (feature 3 not in {0.0})
Predict: 0.0
Else (feature 1 > 37.0)
If (feature 1 <= 42.0)
Predict: 0.0
Else (feature 1 > 42.0)
Predict: 0.0
Else (feature 4 not in {3.0,4.0})
If (feature 4 in {0.0,2.0})
If (feature 7 in {0.0,1.0,2.0})
Predict: 1.0
Else (feature 7 not in {0.0,1.0,2.0})
Predict: 1.0
Else (feature 4 not in {0.0,2.0})
If (feature 0 in {0.0})
Predict: 0.0
Else (feature 0 not in {0.0})
Predict: 0.0

随机森林模型调优

// 字段转换成特征向量
val assembler = new VectorAssembler().setInputCols(featuresArray).setOutputCol("features")
val vecDF: DataFrame = assembler.transform(dataLabelDF)
vecDF.show(10, truncate = false) // 将数据分为训练和测试集(30%进行测试)
val Array(trainingDF, testDF) = vecDF.randomSplit(Array(0.7, 0.3)) // 索引标签,将元数据添加到标签列中
val labelIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(vecDF)
//labelIndexer.transform(vecDF).show(10, truncate = false) // 自动识别分类的特征,并对它们进行索引
// 具有大于5个不同的值的特征被视为连续。
val featureIndexer = new VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").setMaxCategories(5).fit(vecDF)
//featureIndexer.transform(vecDF).show(10, truncate = false) // 训练随机森林模型
val rf = new RandomForestClassifier().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures") // 将索引标签转换回原始标签
val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels) // Chain indexers and forest in a Pipeline.
val pipeline = new Pipeline().setStages(Array(labelIndexer, featureIndexer, rf, labelConverter)) // 设置参数网格
//impurity 不纯度
//maxBins 离散化"连续特征"的最大划分数
//maxDepth 树的最大深度
//minInfoGain 一个节点分裂的最小信息增益,值为[0,1]
//minInstancesPerNode 每个节点包含的最小样本数 >=1
//numTrees 树的数量
//featureSubsetStrategy // 在每个树节点处分割的特征数,参数值比较多,详细的请参考官方文档
//SubsamplingRate(1.0) 给每棵树分配“学习数据”的比例,范围(0, 1]
//maxMemoryInMB 如果太小,则每次迭代将拆分1个节点,其聚合可能超过此大小。
//checkpointInterval 设置检查点间隔(> = 1)或禁用检查点(-1)。 例如 10意味着,每10次迭代,缓存将获得检查点。
//cacheNodeIds 如果为false,则算法将树传递给执行器以将实例与节点匹配。 如果为true,算法将缓存每个实例的节点ID。 缓存可以加速更大深度的树的训练。 用户可以通过设置checkpointInterval来设置检查或禁用缓存的频率。(default = false)
//seed 种子
val paramGrid = new ParamGridBuilder()
.addGrid(rf.impurity, Array("entropy", "gini"))
.addGrid(rf.maxBins, Array(32, 64))
.addGrid(rf.maxDepth, Array(5, 7, 10))
.addGrid(rf.minInfoGain, Array(0, 0.5, 1))
.addGrid(rf.minInstancesPerNode, Array(10, 20))
.addGrid(rf.numTrees, Array(20, 50))
.addGrid(rf.featureSubsetStrategy, Array("auto", "sqrt"))
.addGrid(rf.subsamplingRate, Array(0.8, 1))
.addGrid(rf.maxMemoryInMB, Array(256, 512))
.addGrid(rf.checkpointInterval, Array(10, 20))
.addGrid(rf.cacheNodeIds, Array(false, true))
.addGrid(rf.seed, Array(123456L, 111L))
.build() // 选择(预测标签,实际标签),并计算测试误差。indexedLabel与prediction都是索引化的,因此可以直接比较
val classEvaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("accuracy") // 设置交叉验证
val cv = new CrossValidator().setEstimator(pipeline).setEvaluator(classEvaluator).setEstimatorParamMaps(paramGrid).setNumFolds(5) // 执行交叉验证,并选择出最好的参数集合
val cvModel = cv.fit(trainingDF) // 查看全部参数
cvModel.extractParamMap()
// cvModel.avgMetrics.length=cvModel.getEstimatorParamMaps.length
// cvModel.avgMetrics与cvModel.getEstimatorParamMaps中的元素一一对应
cvModel.avgMetrics.length
cvModel.avgMetrics // 参数对应的平均度量 cvModel.getEstimatorParamMaps.length
cvModel.getEstimatorParamMaps // 参数组合的集合 cvModel.getEvaluator.extractParamMap() // 评估的参数 cvModel.getEvaluator.isLargerBetter // 评估的度量值是大的好,还是小的好 ,根据评估度量,系统会自动识别
cvModel.getNumFolds // 交叉验证的折数 //################################
// 测试模型
val predictDF: DataFrame = cvModel.transform(testDF).selectExpr(
//"race","poverty","smoke","alcohol","agemth","ybirth","yschool","pc3mth", "features",
"predictedLabel", "label", "features")
predictDF.show(20, false)

Spark2 Random Forests 随机森林的更多相关文章

  1. R语言之Random Forest随机森林

    什么是随机森林? 随机森林就是通过集成学习的思想将多棵树集成的一种算法,它的基本单元是决策树,而它的本质属于机器学习的一大分支——集成学习(Ensemble Learning)方法.随机森林的名称中有 ...

  2. 机器学习中的算法(1)-决策树模型组合之随机森林与GBDT

    版权声明: 本文由LeftNotEasy发布于http://leftnoteasy.cnblogs.com, 本文可以被全部的转载或者部分使用,但请注明出处,如果有问题,请联系wheeleast@gm ...

  3. 机器学习中的算法——决策树模型组合之随机森林与GBDT

    前言: 决策树这种算法有着很多良好的特性,比如说训练时间复杂度较低,预测的过程比较快速,模型容易展示(容易将得到的决策树做成图片展示出来)等.但是同时,单决策树又有一些不好的地方,比如说容易over- ...

  4. paper 84:机器学习算法--随机森林

    http://www.cnblogs.com/wentingtu/archive/2011/12/13/2286212.html中一些内容 基础内容: 这里只是准备简单谈谈基础的内容,主要参考一下别人 ...

  5. 决策树模型组合之(在线)随机森林与GBDT

    前言: 决策树这种算法有着很多良好的特性,比如说训练时间复杂度较低,预测的过程比较快速,模型容易展示(容易将得到的决策树做成图片展示出来)等.但是同时, 单决策树又有一些不好的地方,比如说容易over ...

  6. 机器学习中的算法-决策树模型组合之随机森林与GBDT

    机器学习中的算法(1)-决策树模型组合之随机森林与GBDT 版权声明: 本文由LeftNotEasy发布于http://leftnoteasy.cnblogs.com, 本文可以被全部的转载或者部分使 ...

  7. 随机森林与GBDT

    前言: 决策树这种算法有着很多良好的特性,比如说训练时间复杂度较低,预测的过程比较快速,模型容易展示(容易将得到的决策树做成图片展示出来)等.但是同时,单决策树又有一些不好的地方,比如说容易over- ...

  8. 决策树模型组合之随机森林与GBDT

    版权声明: 本文由LeftNotEasy发布于http://leftnoteasy.cnblogs.com, 本文可以被全部的转载或者部分使用,但请注明出处,如果有问题,请联系wheeleast@gm ...

  9. 决策树模型组合之随机森林与GBDT(转)

    版权声明: 本文由LeftNotEasy发布于http://leftnoteasy.cnblogs.com, 本文可以被全部的转载或者部分使用,但请注明出处,如果有问题,请联系wheeleast@gm ...

随机推荐

  1. maven学习二(dependencies)

    在前面一篇文章maven学习一(HelloWorld工程)已经对maven有了基本介绍,本文开始介绍maven依赖,通过如何如何增加log4j来学习maven的dependencies no-depe ...

  2. 奇怪吸引子---Lorenz

    奇怪吸引子是混沌学的重要组成理论,用于演化过程的终极状态,具有如下特征:终极性.稳定性.吸引性.吸引子是一个数学概念,描写运动的收敛类型.它是指这样的一个集合,当时间趋于无穷大时,在任何一个有界集上出 ...

  3. 如何用C#动态编译、执行代码

    在开始之前,先熟悉几个类及部分属性.方法:CSharpCodeProvider.ICodeCompiler.CompilerParameters.CompilerResults.Assembly. 一 ...

  4. 微信小程序无法获取UnionId的情况及处理

    问题背景:做了微信小程序,一切都还正常,但是最后体验版放出去时,却发现很多用户无法绑定用户,后台返回:参数非法.经过多方排查,发现是微信拿到的code请求返回的数据里没有UnionId,也就是接口返回 ...

  5. js-BootstrapValidator简单使用

    本例使用版本 <!-- 新 Bootstrap 核心 CSS 文件 --> <link href="http://cdn.static.runoob.com/libs/bo ...

  6. 【ZH奶酪】如何用Python实现编辑距离?

    1. 什么是编辑距离? 编辑距离(Edit Distance),又称Levenshtein距离,是指两个字串之间,由一个转成另一个所需的最少编辑操作次数.许可的编辑操作包括将一个字符替换成另一个字符, ...

  7. hihocoder第238周:杨氏矩阵的个数

    题目链接 问题描述 给定一个N行M列的矩阵,往里面填入$1-N\times M$个数字,使得这个矩阵每行.每列都满足递增.问:有多少种填法? 问题分析 这个问题很难,如果能够直接想到,那就是天才了. ...

  8. 基于CentOS搭建基于 ZIPKIN 的数据追踪系统

    系统要求:CentOS 7.2 64 位操作系统 配置 Java 环境 安装 JDK Zipkin 使用 Java8 -openjdk* -y 安装完成后,查看是否安装成功: java -versio ...

  9. Effective Java 第三版——45. 明智审慎地使用Stream

    Tips <Effective Java, Third Edition>一书英文版已经出版,这本书的第二版想必很多人都读过,号称Java四大名著之一,不过第二版2009年出版,到现在已经将 ...

  10. react-无状态组件

    import React, { Component } from "react"; //import PostItem from "./PostItem"; / ...