lakala GradientBoostedTrees

/**

  * Created by lkl on 2017/12/6.

  */

import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

import org.apache.spark.mllib.linalg.Vectors

import org.apache.spark.mllib.regression.LabeledPoint

import org.apache.spark.mllib.tree.GradientBoostedTrees

import org.apache.spark.mllib.tree.configuration.BoostingStrategy

import org.apache.spark.sql.hive.HiveContext

import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable.ArrayBuffer

/**
  * Spark driver that trains an MLlib gradient-boosted-trees classifier on a Hive table and
  * writes evaluation reports (precision, recall, F-measure, PR curve, ROC curve, areas under
  * both curves, error rate and the model dump) as text files under `hdfs://ns1/tmp/<yyyyMMdd>/`.
  *
  * Adapted from http://blog.csdn.net/xubo245/article/details/51499643
  *
  * NOTE(review): every numeric literal in this file had been lost (`args.length!=`,
  * `splits()`, `setNumIterations()`, ...), so it did not compile. The values used below were
  * reconstructed from the surrounding comments and from the Spark MLlib GBT example; confirm
  * them (especially the column indices and the boosting parameters) against the real job.
  */
object GradientBoostingClassificationForLK {

  /**
    * Entry point.
    *
    * @param args exactly three values are required by the usage check (database name, table
    *             name, model run time according to the usage message). They are currently
    *             NOT read: database and table are hard-coded and the date is today's date.
    */
  def main(args: Array[String]): Unit = {

    // Validate the command line before a SparkContext is created, so a bad invocation does
    // not start (and then abandon) a Spark application. Exit status is non-zero because this
    // is a usage error. NOTE(review): the original exit code was lost; 1 is assumed.
    if (args.length != 3) {
      println("请输入参数：trainingData对应的库名、表名、模型运行时间")
      System.exit(1)
    }

    val conf = new SparkConf().setAppName("GradientBoostingClassificationForLK")
    val sc = new SparkContext(conf)

    try {
      val hc = new HiveContext(sc)

      // Source table and output location. The command-line values are deliberately ignored
      // here, exactly as in the original, which had `args(0..2)` commented out.
      val database = "lkl_card_score"
      val table = "overdue_result_all_new_woe"
      val date = new java.text.SimpleDateFormat("yyyyMMdd").format(new java.util.Date())
      val outputDir = s"hdfs://ns1/tmp/$date"

      // Column layout of the table.
      // NOTE(review): reconstructed from the original comment "drop the label and contact
      // fields" -- assumes column 0 is the Int label and column 1 is the contact field, so
      // features start at column 2. Verify against the table schema.
      val labelColumn = 0
      val firstFeatureColumn = 2

      // Build the RDD[LabeledPoint] data set. `.rdd` makes the row-wise map explicit so the
      // code does not depend on DataFrame.map returning an RDD.
      val data = hc.sql(s"select * from $database.$table").rdd.map { row =>
        val features = (firstFeatureColumn until row.size).flatMap { i =>
          if (row.isNullAt(i)) {
            Some(0.0) // missing value -> 0.0
          } else {
            row.get(i) match {
              case v: Int    => Some(v.toDouble)
              case v: Double => Some(v)
              case v: Long   => Some(v.toDouble)
              case _: String => Some(0.0) // strings carry no numeric signal here
              case _         => None      // any other column type is skipped, as before
            }
          }
        }
        LabeledPoint(row.getInt(labelColumn), Vectors.dense(features.toArray))
      }

      // Split the data into training and test sets (30% held out for testing).
      val splits = data.randomSplit(Array(0.7, 0.3))
      val (trainingData, testData) = (splits(0), splits(1))

      // Train a GradientBoostedTrees model.
      // The defaultParams for Classification use LogLoss by default.
      // NOTE(review): 3 iterations / 2 classes / depth 5 are the values of the Spark MLlib
      // example this code follows; the originals were lost. Use more iterations in practice.
      val boostingStrategy = BoostingStrategy.defaultParams("Classification")
      boostingStrategy.setNumIterations(3)
      boostingStrategy.treeStrategy.setNumClasses(2)
      boostingStrategy.treeStrategy.setMaxDepth(5)
      // categoricalFeaturesInfo is left at its default; an empty map means all features are
      // treated as continuous.
      //boostingStrategy.treeStrategy.setCategoricalFeaturesInfo(Map[Int, Int]())

      val model = GradientBoostedTrees.train(trainingData, boostingStrategy)

      // Evaluate the model on the held-out instances.
      // The pair order is (prediction, label): BinaryClassificationMetrics expects
      // (score, label) and the text written below labels the first field "predicts".
      // The original built (label, prediction), which swapped the two roles.
      // Cached because it is the input of every report below.
      val predictionAndLabels = testData.map { point =>
        (model.predict(point.features), point.label)
      }.cache()

      predictionAndLabels
        .map(x => s"predicts: ${x._1}--> labels:${x._2}")
        .saveAsTextFile(s"$outputDir/predictionAndLabels")

      //===================================================================
      // Evaluate the model with BinaryClassificationMetrics.
      val metrics = new BinaryClassificationMetrics(predictionAndLabels)

      // Precision by threshold
      metrics.precisionByThreshold
        .map { case (t, p) => s"Threshold: ${t}Precision:$p" }
        .saveAsTextFile(s"$outputDir/precision")

      // Recall by threshold
      metrics.recallByThreshold
        .map { case (t, r) => s"Threshold: ${t}Recall:$r" }
        .saveAsTextFile(s"$outputDir/recall")

      // F-measure by threshold (beta = 1).
      // If a threshold has to be chosen, F1 is the most suitable of the three metrics: the
      // threshold with the largest F1 is the best one.
      metrics.fMeasureByThreshold
        .map(x => s"Threshold: ${x._1}--> F-score:${x._2}--> Beta = 1")
        .saveAsTextFile(s"$outputDir/f1Score")

      // Precision-Recall curve
      metrics.pr
        .map(x => s"Recall: ${x._1}--> Precision: ${x._2}")
        .saveAsTextFile(s"$outputDir/prc")

      // AUPRC: area under the precision-recall curve
      val auPRC = metrics.areaUnderPR
      sc.makeRDD(Seq(s"Area under precision-recall curve = $auPRC"))
        .saveAsTextFile(s"$outputDir/auPRC")

      // ROC curve
      metrics.roc
        .map(x => s"FalsePositiveRate:${x._1}--> Recall: ${x._2}")
        .saveAsTextFile(s"$outputDir/roc")

      // AUC: area under the ROC curve
      val auROC = metrics.areaUnderROC
      sc.makeRDD(Seq(s"Area under ROC = $auROC")).saveAsTextFile(s"$outputDir/auROC")
      println(s"Area under ROC = $auROC")

      // Fraction of test instances whose prediction differs from the label. The report text
      // calls it "Mean Squared Error"; the wording is kept so existing consumers of the file
      // are not broken, but the value is a misclassification rate.
      val testErr =
        predictionAndLabels.filter(r => r._1 != r._2).count.toDouble / predictionAndLabels.count()
      sc.makeRDD(Seq(s"Test Mean Squared Error = $testErr"))
        .saveAsTextFile(s"$outputDir/testErr")

      sc.makeRDD(Seq(s"Learned regression tree model: ${model.toDebugString}"))
        .saveAsTextFile(s"$outputDir/GBDTclassification")
    } finally {
      // Release cluster resources whether the job succeeded or failed.
      sc.stop()
    }
  }

}

lakala GradientBoostedTrees的更多相关文章

lakala反欺诈建模实际应用代码GBDT监督学习
/** * Created by lkl on 2018/1/16. */ import org.apache.spark.mllib.evaluation.BinaryClassificationM ...
lakala proportion轨迹分析代码
/** * Created by lkl on 2017/12/7. */ import breeze.numerics.abs import org.apache.spark.sql.SQLCont ...
决策树和基于决策树的集成方法（DT,RF,GBDT,XGBT）复习总结
摘要: 1.算法概述 2.算法推导 3.算法特性及优缺点 4.注意事项 5.实现和具体例子内容: 1.算法概述 1.1 决策树(DT)是一种基本的分类和回归方法.在分类问题中它可以认为是if-the ...
《Spark 官方文档》机器学习库（MLlib）指南
spark-2.0.2 机器学习库(MLlib)指南 MLlib是Spark的机器学习(ML)库.旨在简化机器学习的工程实践工作,并方便扩展到更大规模.MLlib由一些通用的学习算法和工具组成,包括分 ...
ORACLE11G常用函数
1 单值函数 1.1 日期函数 1.1.1 Round [舍入到最接近的日期](day:舍入到最接近的星期日) select sysdate S1, round(sysdate) S2 , round ...
决策树和基于决策树的集成方法（DT,RF,GBDT,XGB）复习总结
摘要: 1.算法概述 2.算法推导 3.算法特性及优缺点 4.注意事项 5.实现和具体例子内容: 1.算法概述 1.1 决策树(DT)是一种基本的分类和回归方法.在分类问题中它可以认为是if-the ...
MLlib--GBDT算法
转载请标明出处http://www.cnblogs.com/haozhengfei/p/8b9cb1875288d9f6cfc2f5a9b2f10eac.html GBDT算法江湖传言:GBDT算法 ...
spark MLlib Classification and regression 学习
二分类:SVMs,logistic regression,decision trees,random forests,gradient-boosted trees,naive Bayes 多分类: ...
Oracle分析函数及常用函数： over(),rank()over()作用及用法--分区(分组)求和& 不连续/连续排名
(1) 函数: over()的作用及用法: -- 分区(分组)求和. sum() over( partition by column1 order by column2 )主要用来对某个字 ...

随机推荐

第21章 RTX 低功耗之睡眠模式
低功耗是 MCU 的一项非常重要的指标,比如某些可穿戴的设备,其携带的电量有限,如果整个电路消耗的电量特别大的话,就会经常出现电量不足的情况,影响用户体验. 本章节为大家讲解 M3/4的低功耗方式之睡 ...
html页面去掉滚动条
有时候特别需要,个别网页要去掉横向滚动条和竖向滚动条,那该怎么去掉呢,很简单,看代码: 让竖条没有: <body style=`overflow:-Scroll;overflow-y:hidde ...
JAVA-JSP内置对象之out对象
相关资料:<21天学通Java Web开发> out对象1.out对象用来向网页输出信息. 方法返回值方法说 ...
java基础篇---文件上传（commons-FileUpload组件）
上一篇讲解了smartupload组件上传,那么这一篇我们讲解commons-FileUpload组件上传 FileUpload是Apache组织(www.apache.org)提供的免费的上传组件, ...
[转]mysql写注释的几种方法
原文地址:https://www.cnblogs.com/JiangLe/p/6897403.html MySQL的注释风格总的来说有三种.它们分别是 1.单行注释可以用"#" s ...
Android—— Fragment 真正的完全解析（上）（转）
转载请标明出处:http://blog.csdn.net/lmj623565791/article/details/37970961 自从Fragment出现,曾经有段时间,感觉大家谈什么都能跟Fra ...
git 忽略已经添加到版本库的文件
第一步: 指令:git rm -r --cached YOUR_PATH YOUR_PATH 即你的文件,-r 指定了递归所有的子文件夹. 第二步: 修改项目根目录下的 .gitignore 文件, ...
solr报错 ERROR SolrDispatchFilter null:ClientAbortException: java.net.SocketException: Broken pipe 原因是nginx截断了请求
[root@localhost nginx]# lltotal 36drwx------. 2 www root 4096 Aug 13 13:25 client_body_tempdrwxr-xr- ...
RGB转灰度图的几种算法
https://blog.csdn.net/cool1949/article/details/6649429 方法一: 对于彩色转灰度,有一个很著名的心理学公式: Gray = R*0.299 ...
Spring Cloud / Spring Boot There was an unexpected error (type=Unauthorized, status=401). Full authentication is required to access this resource.
访问EndPoint时会出现没有权限 There was an unexpected error (type=Unauthorized, status=401). Full authenticat ...

lakala GradientBoostedTrees

lakala GradientBoostedTrees的更多相关文章

随机推荐

热门专题