lakala GradientBoostedTrees

/**

  * Created by lkl on 2017/12/6.

  */

import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

import org.apache.spark.mllib.linalg.Vectors

import org.apache.spark.mllib.regression.LabeledPoint

import org.apache.spark.mllib.tree.GradientBoostedTrees

import org.apache.spark.mllib.tree.configuration.BoostingStrategy

import org.apache.spark.sql.hive.HiveContext

import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable.ArrayBuffer

/**
  * Trains a gradient-boosted-trees binary classifier (Spark MLlib) on a Hive table and
  * writes the evaluation reports as text files under `hdfs://ns1/tmp/<date>/`.
  *
  * Command line:
  *   - no arguments: uses the built-in default database/table and today's date (yyyyMMdd);
  *   - three arguments: `<database> <table> <date>`;
  *   - anything else: prints the usage message and exits with a non-zero status.
  *
  * Each report is written with `saveAsTextFile`, so the target directories must not
  * already exist (NOTE(review): re-running with the same date is expected to fail on the
  * first existing output directory - confirm this is acceptable).
  */
object GradientBoostingClassificationForLK {

  // Reference: http://blog.csdn.net/xubo245/article/details/51499643

  /** Hive database read when no command-line arguments are given. */
  private val DefaultDatabase = "lkl_card_score"

  /** Hive table read when no command-line arguments are given. */
  private val DefaultTable = "overdue_result_all_new_woe"

  // NOTE(review): the integer literals for the settings below were missing from the
  // recovered source. The values are reconstructions and must be confirmed:
  //   - column indices follow the original comment "exclude the label and contact fields"
  //     (assumed layout: column 0 = label, column 1 = contact, features from column 2);
  //   - boosting parameters follow the Spark MLlib GBT classification example.
  private val LabelColumn = 0
  private val FirstFeatureColumn = 2
  private val NumIterations = 3 // Note: use more iterations in practice.
  private val NumClasses = 2
  private val MaxDepth = 5

  /**
    * Converts one raw column value into a feature.
    *
    * Every numeric type (Int, Long, Double, Float, Short, decimals, ...) is converted to
    * Double. Nulls, strings and any other non-numeric value become 0.0, so every row
    * yields a feature vector of the same length.
    */
  private def toFeature(value: Any): Double = value match {
    case n: java.lang.Number => n.doubleValue()
    case _                   => 0.0
  }

  /**
    * Program entry point.
    *
    * @param args either empty, or `database`, `table`, `date` (used in the output path)
    */
  def main(args: Array[String]): Unit = {
    // Validate arguments before starting a SparkContext so a usage error is cheap.
    val (database, table, date) = args match {
      case Array(db, tbl, runDate) =>
        (db, tbl, runDate)
      case Array() =>
        val today = new java.text.SimpleDateFormat("yyyyMMdd").format(new java.util.Date())
        (DefaultDatabase, DefaultTable, today)
      case _ =>
        println("请输入参数：trainingData对应的库名、表名、模型运行时间")
        sys.exit(1)
    }

    val conf = new SparkConf().setAppName("GradientBoostingClassificationForLK")
    val sc = new SparkContext(conf)
    try {
      run(sc, database, table, date)
    } finally {
      sc.stop()
    }
  }

  /**
    * Loads the data set, trains the model and writes all evaluation outputs.
    *
    * @param sc       active Spark context
    * @param database Hive database containing the training table
    * @param table    Hive table to read; see the column-layout note on the constants above
    * @param date     sub-directory name under `hdfs://ns1/tmp/` for this run's outputs
    */
  private def run(sc: SparkContext, database: String, table: String, date: String): Unit = {
    val hc = new HiveContext(sc)
    val outputDir = s"hdfs://ns1/tmp/$date"
    def out(name: String): String = s"$outputDir/$name"

    // Build RDD[LabeledPoint]: label from the label column, features from the remaining ones.
    val data = hc.sql(s"select * from $database.$table").rdd.map { row =>
      val features = Array.range(FirstFeatureColumn, row.size).map(i => toFeature(row.get(i)))
      LabeledPoint(row.getInt(LabelColumn).toDouble, Vectors.dense(features))
    }

    // Split the data into training and test sets (30% held out for testing).
    val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))

    // Train a GradientBoostedTrees model.
    // The defaultParams for Classification use LogLoss by default.
    val boostingStrategy = BoostingStrategy.defaultParams("Classification")
    boostingStrategy.setNumIterations(NumIterations)
    boostingStrategy.treeStrategy.setNumClasses(NumClasses)
    boostingStrategy.treeStrategy.setMaxDepth(MaxDepth)
    // categoricalFeaturesInfo is left at its default: all features are treated as continuous.

    val model = GradientBoostedTrees.train(trainingData, boostingStrategy)

    // BinaryClassificationMetrics expects (score, label) pairs, so the prediction comes first.
    // Cached because it feeds every report below.
    val predictionAndLabels = testData.map { point =>
      (model.predict(point.features), point.label)
    }.cache()

    predictionAndLabels
      .map { case (prediction, label) => "predicts: " + prediction + "--> labels:" + label }
      .saveAsTextFile(out("predictionAndLabels"))

    // Evaluate the model with BinaryClassificationMetrics.
    val metrics = new BinaryClassificationMetrics(predictionAndLabels)

    // Precision by threshold
    metrics.precisionByThreshold
      .map { case (t, p) => "Threshold: " + t + "Precision:" + p }
      .saveAsTextFile(out("precision"))

    // Recall by threshold
    metrics.recallByThreshold
      .map { case (t, r) => "Threshold: " + t + "Recall:" + r }
      .saveAsTextFile(out("recall"))

    // F-measure by threshold (beta = 1). When a threshold has to be chosen, the one with
    // the highest F1 is the natural pick among precision, recall and F1.
    metrics.fMeasureByThreshold
      .map { case (t, f) => "Threshold: " + t + "--> F-score:" + f + "--> Beta = 1" }
      .saveAsTextFile(out("f1Score"))

    // Precision-recall curve
    metrics.pr
      .map { case (r, p) => "Recall: " + r + "--> Precision: " + p }
      .saveAsTextFile(out("prc"))

    // Area under the precision-recall curve
    val auPRC = metrics.areaUnderPR
    sc.makeRDD(Seq("Area under precision-recall curve = " + auPRC)).saveAsTextFile(out("auPRC"))

    // ROC curve
    metrics.roc
      .map { case (fpr, tpr) => "FalsePositiveRate:" + fpr + "--> Recall: " + tpr }
      .saveAsTextFile(out("roc"))

    // Area under the ROC curve
    val auROC = metrics.areaUnderROC
    sc.makeRDD(Seq("Area under ROC = " + auROC)).saveAsTextFile(out("auROC"))
    println("Area under ROC = " + auROC)

    // Fraction of test points whose predicted class differs from the label.
    val testErr =
      predictionAndLabels.filter { case (prediction, label) => prediction != label }
        .count.toDouble / testData.count()
    sc.makeRDD(Seq("Test Error = " + testErr)).saveAsTextFile(out("testErr"))

    sc.makeRDD(Seq("Learned classification GBT model: " + model.toDebugString))
      .saveAsTextFile(out("GBDTclassification"))
  }

}

lakala GradientBoostedTrees的更多相关文章

lakala反欺诈建模实际应用代码GBDT监督学习
/** * Created by lkl on 2018/1/16. */ import org.apache.spark.mllib.evaluation.BinaryClassificationM ...
lakala proportion轨迹分析代码
/** * Created by lkl on 2017/12/7. */ import breeze.numerics.abs import org.apache.spark.sql.SQLCont ...
决策树和基于决策树的集成方法（DT,RF,GBDT,XGBT）复习总结
摘要: 1.算法概述 2.算法推导 3.算法特性及优缺点 4.注意事项 5.实现和具体例子内容: 1.算法概述 1.1 决策树(DT)是一种基本的分类和回归方法.在分类问题中它可以认为是if-the ...
《Spark 官方文档》机器学习库（MLlib）指南
spark-2.0.2 机器学习库(MLlib)指南 MLlib是Spark的机器学习(ML)库.旨在简化机器学习的工程实践工作,并方便扩展到更大规模.MLlib由一些通用的学习算法和工具组成,包括分 ...
ORACLE11G常用函数
1 单值函数 1.1 日期函数 1.1.1 Round [舍入到最接近的日期](day:舍入到最接近的星期日) select sysdate S1, round(sysdate) S2 , round ...
决策树和基于决策树的集成方法（DT,RF,GBDT,XGB）复习总结
摘要: 1.算法概述 2.算法推导 3.算法特性及优缺点 4.注意事项 5.实现和具体例子内容: 1.算法概述 1.1 决策树(DT)是一种基本的分类和回归方法.在分类问题中它可以认为是if-the ...
MLlib--GBDT算法
转载请标明出处http://www.cnblogs.com/haozhengfei/p/8b9cb1875288d9f6cfc2f5a9b2f10eac.html GBDT算法江湖传言:GBDT算法 ...
spark MLlib Classification and regression 学习
二分类:SVMs,logistic regression,decision trees,random forests,gradient-boosted trees,naive Bayes 多分类: ...
Oracle分析函数及常用函数： over(),rank()over()作用及用法--分区(分组)求和& 不连续/连续排名
(1) 函数: over()的作用及用法: -- 分区(分组)求和. sum() over( partition by column1 order by column2 )主要用来对某个字 ...

随机推荐

Nvidia显卡安装驱动
首先要知道需要安装哪个类型的显卡驱动,可以使用如下指令查看 sudo add-apt-repository ppa:graphics-drivers在#current那一栏中找到显卡驱动型号,使用的是 ...
Spring boot 报错 Unable to start EmbeddedWebApplicationContext due to missing EmbeddedServletContainerFactory bean.
在实际开发中修改别人的代码,发现了这个报错,后来发现是因为pom.xml里面只要将注释掉的部分注释掉就好了.
C#如何调用R
1. 先在R中安装rscproxy库 > install.packages("rscproxy") > library(rscproxy) 2. 到这个网站http://rcom.univ ...
PostgreSQL LIKE 查询效率提升实验<转>
一.未做索引的查询效率作为对比,先对未索引的查询做测试 EXPLAIN ANALYZE select * from gallery_map where author = '曹志耘'; QUERY P ...
uboot下emmc内容烧写（拷贝）步骤
一.目的:嵌入式开发板,通过emmc上的内核文件加载启动linux操作系统,以及存放其他程序文件.需要将所需文件先写入emmc中. 二.总体步骤是:uboot启动后,进入linux下,将emmc分区并 ...
MVC个人网站开发笔记-150302
上传图片参考这篇文章:http://www.cnblogs.com/kissdodog/archive/2012/12/15/2819025.html 调用ajaxFileUpload,控制器里面编 ...
kafka生产消费原理笔记
一.什么是kafka Kafka是最初由Linkedin公司开发,是一个分布式.支持分区的(partition).多副本的(replica),基于zookeeper协调的分布式消息系统,它的最大的特性 ...
jsp标准动作
JSP标准动作元素的使用格式为:<jsp:标记名>,它採用严格的xml标签语法来表示.这些jsp标签动作元素是在用户请求阶段运行的,这些标准动作元素是内置在jsp文件里的,所以能够直接使用 ...
百度地图Api进阶教程-弹出信息窗口5.html
<!DOCTYPE html> <html> <head> <meta name="viewport" content="ini ...
Web API（七）：Basic基础认证
1.WebApi中为什么需要身份认证我们在使用WebApi的时候,都是通过URL去获取数据.也就是说,任何人只要知道了URL地址,就能随意的访问后台的服务接口,就可以访问或者修改数据库数据了,这样就 ...

lakala GradientBoostedTrees

lakala GradientBoostedTrees的更多相关文章

随机推荐

热门专题