scala-spark练手--dataframe数据可视化初稿
成品:http://www.cnblogs.com/drawwindows/p/5640606.html 初稿:
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{Logging, SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, Row, SaveMode, _}
import com.alibaba.fastjson.{JSON, JSONObject}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import scala.collection.mutable.ArrayBuffer

/**
 * Collects per-table statistics through a HiveContext and writes them as JSON
 * below `/user/<user>/mlaas/tableStatistic/<tableName>`.
 */
object DataFrameVisiualize extends Logging {

/**
 * Runs [[run]] once for every entry of the "targetType" array in `params`
 * whose own "targetType" field equals "dataset" (case-insensitive).
 *
 * @param hiveContext context used to query the tables
 * @param params      job parameters; each matching entry's "targetName" is used
 *                    as a key into `params` to look up the table name, and
 *                    "user" supplies the user part of the output path
 */
def runforstatistic(hiveContext: HiveContext, params: JSONObject): Unit = {
  val targets = params.getJSONArray("targetType")
  for (i <- 0 until targets.size()) {
    val target = targets.getJSONObject(i)
    if ("dataset".equalsIgnoreCase(target.getString("targetType"))) {
      val tableName = params.getString(target.getString("targetName"))
      val user = params.getString("user")
      run(hiveContext, tableName, user)
    }
  }
}

/**
 * Computes and persists all statistics for one table:
 *  - `<parent>/schemajson`  : output of `desc <table>`
 *  - `<parent>/recordsjson` : the first 100 records
 *  - `<parent>/jsonObj`     : one JSON document with every statistic
 * where `<parent>` is `/user/<user>/mlaas/tableStatistic/<tableName>`.
 *
 * @param hiveContext context used to query the table
 * @param tableName   table to analyse
 * @param user        user whose HDFS directory receives the output
 */
def run(hiveContext: HiveContext, tableName: String, user: String): Unit = {
  val pathParent = s"/user/$user/mlaas/tableStatistic/$tableName"

  // 0. Fetch the table's schema information and persist it.
  val schemadf = hiveContext.sql("desc " + tableName)
  schemadf.write.mode(SaveMode.Overwrite).format("json").save(pathParent + "/schemajson")

  // 1. Load the table into a DataFrame.
  val df = hiveContext.sql("select * from " + tableName)
  // 2. describe() the DataFrame; every column it reports is treated as numeric.
  val dfdesc = df.describe()
  // 4. Split the columns into numeric (mathColArr) and the rest (strColArr).
  val mathColArr = dfdesc.columns.filter(!_.equalsIgnoreCase("summary"))
  val (colMin, colMax, colMean, colStddev, colMedian) = getDesfromDF(dfdesc, mathColArr)
  val allColArr = df.columns
  val strColArr = allColArr.filter(!_.equalsIgnoreCase("summary")).diff(mathColArr)

  saveRecords(hiveContext, tableName, 100, pathParent + "/recordsjson")

  val jsonobj = getAllStatistics(hiveContext, tableName, allColArr, strColArr, mathColArr, 10, colMin, colMax)
  jsonobj.put("colMin", colMin)
  jsonobj.put("colMax", colMax)
  jsonobj.put("colMean", colMean)
  jsonobj.put("colStddev", colStddev)
  jsonobj.put("colMedian", colMedian)
  val jsonStr = jsonobj.toString

  val fs = FileSystem.get(new Configuration())
  val hdfsOutStream = fs.create(new Path(pathParent + "/jsonObj"))
  try {
    // writeBytes() keeps only the low byte of each char and would mangle any
    // non-ASCII text (e.g. string-column values); write explicit UTF-8 instead.
    hdfsOutStream.write(jsonStr.getBytes(java.nio.charset.StandardCharsets.UTF_8))
    hdfsOutStream.flush()
  } finally {
    // Release the stream even when the write fails.
    hdfsOutStream.close()
  }
  // The FileSystem itself is left open, as before (fs.close() was commented
  // out) - presumably because FileSystem.get may return a shared instance.
}

/**
 * Saves the first `num` records of `tableName` as JSON at `filePath`,
 * overwriting existing output.
 *
 * @param hiveContext context used to query the table
 * @param tableName   table to sample
 * @param num         maximum number of records to save
 * @param filePath    destination path of the JSON output
 */
def saveRecords(hiveContext: HiveContext, tableName: String, num: Int, filePath: String) : Unit = {
  hiveContext.sql(s"select * from $tableName limit $num")
    .write.mode(SaveMode.Overwrite).format("json").save(filePath)
}
/**
 * Computes every statistic that is not already provided by describe() and
 * returns them in one JSON object with the keys "totalrows", "colUnique",
 * "colMissing", "mathColMapQuartile", "mathColMapHistogram" and
 * "strColMapHistogram".
 *
 * All histogram/quartile queries are unioned into a single DataFrame, collected
 * once, and the resulting rows are then distributed per column.
 *
 * @param hiveContext context used to run the queries
 * @param tableName   table to analyse
 * @param allColArr   every column of the table
 * @param strColArr   columns treated as non-numeric (top-`partNum` value counts)
 * @param mathColArr  columns treated as numeric (quartiles + bucketed histogram)
 * @param partNum     number of histogram buckets / top values kept per column
 * @param colMin      minimum per numeric column
 * @param colMax      maximum per numeric column
 * @return the populated JSON object
 */
def getAllStatistics(hiveContext: HiveContext, tableName: String, allColArr: Array[String], strColArr: Array[String], mathColArr: Array[String], partNum: Int, colMin: java.util.HashMap[String, Double], colMax: java.util.HashMap[String, Double]) :
JSONObject = {
  val jsonobj = new JSONObject()

  // One pass over the table: distinct count and null count per column, plus row total.
  val perColumnAggs = allColArr.map { col =>
    s"count(distinct($col)) as unique_$col , sum(case when $col is null then 1 else 0 end) as missing_$col, "
  }.mkString
  val df = hiveContext.sql(s"select ${perColumnAggs}sum(1) as totalrows from $tableName")

  val colUnique = new java.util.HashMap[String, Long] // distinct values per column
  val colMissing = new java.util.HashMap[String, Long] // null values per column
  df.take(1).foreach { row =>
    jsonobj.put("totalrows", row.getAs[Long]("totalrows"))
    allColArr.foreach { col =>
      colUnique.put(col, row.getAs[Long]("unique_" + col))
      colMissing.put(col, row.getAs[Long]("missing_" + col))
    }
  }

  val dfArr = ArrayBuffer[DataFrame]()

  // Top-partNum value counts of every string column. Skipped when there are no
  // string columns: the inner sub-query would be empty and the SQL invalid.
  if (strColArr.nonEmpty) {
    val perColumnCounts = strColArr.map { col =>
      s"""
      SELECT 'StrHistogram_$col' AS colName, $col AS value, COUNT(1) AS num
      FROM $tableName
      GROUP BY $col """
    }.mkString(" UNION ALL ")
    val strHistogramSql = s"""
      SELECT tta.colName, tta.value, tta.num
      FROM (
      SELECT ta.colName, ta.value, ta.num, ROW_NUMBER() OVER (PARTITION BY ta.colName ORDER BY ta.num DESC) AS row
      FROM (
      $perColumnCounts
      ) ta
      ) tta
      WHERE tta.row <= $partNum
      """
    dfArr.append(hiveContext.sql(strHistogramSql))
  }

  // Quartiles and bucketed histogram of every numeric column.
  for (col <- mathColArr) {
    val df1 = hiveContext.sql(s"select 'Quartile_$col' as colName, ntil, max($col) as num from (select $col, ntile(4) OVER (order by $col)as ntil from $tableName) tt group by ntil ")
    log.info("col is :" + col + ", min is :" + colMin.get(col) + ", max is : " + colMax.get(col))
    // Go through toString before toDouble: the stored value may not really be
    // a Double at runtime, and unboxing it directly throws ClassCastException.
    val min = colMin.get(col).toString.toDouble
    val max = colMax.get(col).toString.toDouble
    val df2 = getHistogramMathDF(col, hiveContext, tableName, min, max, partNum)
    dfArr.append(df1)
    dfArr.append(df2)
  }

  // reduce() throws on an empty buffer, so a table without any column yields no rows.
  val allRows: Array[Row] =
    if (dfArr.isEmpty) Array.empty[Row] else dfArr.reduce(_.unionAll(_)).collect()

  val mathColMapQuartile = new java.util.HashMap[String, Array[java.util.HashMap[String,Long]]] // quartiles
  val mathColMapHistogram = new java.util.HashMap[String, Array[java.util.HashMap[String,Long]]] // numeric histogram
  val strColMapHistogram = new java.util.HashMap[String, Array[java.util.HashMap[String,Long]]] // string histogram
  val (mathColMapQuartile1, mathColMapHistogram1, strColMapHistogram1) = readRows(allRows)

  // A column without any result row (e.g. an empty table) has no entry in the
  // maps returned by readRows; report it as an empty array instead of failing.
  def asArray(found: ArrayBuffer[java.util.HashMap[String, Long]]): Array[java.util.HashMap[String, Long]] =
    Option(found).fold(Array.empty[java.util.HashMap[String, Long]])(_.toArray[java.util.HashMap[String, Long]])

  for (col <- strColArr) {
    strColMapHistogram.put(col, asArray(strColMapHistogram1.get(col)))
  }
  for (col <- mathColArr) {
    mathColMapQuartile.put(col, asArray(mathColMapQuartile1.get(col)))
    mathColMapHistogram.put(col, asArray(mathColMapHistogram1.get(col)))
  }

  jsonobj.put("mathColMapQuartile", mathColMapQuartile)
  jsonobj.put("mathColMapHistogram", mathColMapHistogram)
  jsonobj.put("strColMapHistogram", strColMapHistogram)
  jsonobj.put("colUnique", colUnique)
  jsonobj.put("colMissing", colMissing)
  jsonobj
}
/**
 * Distributes collected statistic rows into three per-column maps, chosen by
 * the prefix of the row's "colName" field:
 *  - "Quartile..."      -> first element of the result
 *  - "MathHistogram..." -> second element
 *  - "StrHistogram..."  -> third element
 * The map key is the part of "colName" after the first underscore. Each row
 * becomes a single-entry map from field 1 (as text) to field 2 (as Long).
 * Rows with any other prefix are ignored.
 *
 * @param rows rows exposing a "colName" field followed by a value field
 *             (index 1) and a numeric field (index 2)
 * @return (quartiles, numeric histograms, string histograms)
 */
def readRows(rows: Array[Row]) : (java.util.HashMap[String, ArrayBuffer[java.util.HashMap[String,Long]]] , java.util.HashMap[String, ArrayBuffer[java.util.HashMap[String,Long]]], java.util.HashMap[String, ArrayBuffer[java.util.HashMap[String,Long]]])={
  val mathColMapQuartile = new java.util.HashMap[String, ArrayBuffer[java.util.HashMap[String,Long]]] // quartiles
  val mathColMapHistogram = new java.util.HashMap[String, ArrayBuffer[java.util.HashMap[String,Long]]] // numeric histogram
  val strColMapHistogram = new java.util.HashMap[String, ArrayBuffer[java.util.HashMap[String,Long]]] // string histogram

  // Field 1 as text. Read untyped because, after the union, the column does not
  // necessarily come back as a String; a typed getAs would then throw.
  def label(row: Row): String = Option(row.get(1)).map(_.toString).orNull

  // Field 2 as Long. Accepts any numeric runtime type (fractions are truncated);
  // null maps to 0, matching what unboxing a null Long yields.
  def amount(row: Row): Long = row.get(2) match {
    case n: Number => n.longValue()
    case null      => 0L
    case other     => other.toString.trim.toDouble.toLong
  }

  // Appends the row's single-entry map to the list kept for its column.
  def record(target: java.util.HashMap[String, ArrayBuffer[java.util.HashMap[String, Long]]],
             colName: String, row: Row): Unit = {
    val col = colName.substring(colName.indexOf('_') + 1)
    val entry = new java.util.HashMap[String, Long]()
    entry.put(label(row), amount(row))
    val existing = target.get(col)
    if (existing == null) {
      target.put(col, ArrayBuffer(entry))
    } else {
      existing.append(entry)
    }
  }

  rows.foreach { row =>
    val colName = row.getAs[String]("colName")
    if (colName.startsWith("StrHistogram")) {
      record(strColMapHistogram, colName, row)
    } else if (colName.startsWith("Quartile")) {
      record(mathColMapQuartile, colName, row)
    } else if (colName.startsWith("MathHistogram")) {
      record(mathColMapHistogram, colName, row)
    }
  }
  (mathColMapQuartile, mathColMapHistogram, strColMapHistogram)
}
/**
 * Builds the bar-chart (equal-width histogram) distribution of a numeric column.
 *
 * The range [min, max] is split into `partNum` buckets numbered 1..partNum;
 * bucket 1 is closed on both sides, the others are (left, right]. Values that
 * match no bucket (including NULL) are counted under bucket 0.
 *
 * @param col         column to bucket
 * @param hiveContext context used to run the query
 * @param tableName   table holding the column
 * @param min         lower bound of the first bucket
 * @param max         upper bound of the last bucket
 * @param partNum     number of buckets, must be positive
 * @return DataFrame with columns colName ('MathHistogram_<col>'), partNum, num
 */
def getHistogramMathDF(col : String, hiveContext: HiveContext, tableName: String, min: Double, max: Double, partNum: Int) : DataFrame = {
  require(partNum > 0, s"partNum must be positive, got $partNum")
  val len = (max - min) / partNum
  log.info(s"len is : $len")

  // Upper bound of bucket i. The last bucket is pinned to max itself, because
  // min + len * partNum can round to slightly below max and would then push the
  // maximum value into the catch-all bucket 0.
  def upperBound(i: Int): Double = if (i == partNum) max else min + len * i

  val firstBucket = s" when ($col >= $min and $col <= ${upperBound(1)}) then 1 "
  val otherBuckets = (2 to partNum).map { i =>
    val left = min + len * (i - 1)
    s" when ($col > $left and $col <= ${upperBound(i)}) then $i "
  }.mkString

  val sql =
    s"select 'MathHistogram_$col' as colName, partNum, count(1) as num from ( " +
    s"select $col, (case $firstBucket$otherBuckets else 0 end ) as partNum from $tableName" +
    ") temptableScala group by partNum"
  log.info("getHistogram is: " + sql)
  hiveContext.sql(sql)
}
/**
 * Reads the result of DataFrame.describe() into one map per statistic, keyed
 * by column name.
 *
 * Rows are dispatched on their "summary" field (case-insensitive): "mean",
 * "stddev", "min" and "max" go to the matching map; every other row goes to
 * the last ("median") map.
 *
 * NOTE(review): describe() does not appear to produce a median; the row that
 * falls through is most likely "count", so the median map probably holds row
 * counts. Behaviour is kept as-is - confirm and replace with a real median.
 *
 * Cells are converted to real Doubles whatever their runtime type (number or
 * numeric text). Null cells are skipped; non-numeric cells are skipped with a
 * warning.
 *
 * @param dfdesc     result of describe()
 * @param mathColArr columns of `dfdesc` to read (everything except "summary")
 * @return (colMin, colMax, colMean, colStddev, colMedian)
 */
def getDesfromDF(dfdesc : DataFrame, mathColArr: Array[String]):
(java.util.HashMap[String, Double], java.util.HashMap[String, Double], java.util.HashMap[String, Double], java.util.HashMap[String, Double], java.util.HashMap[String, Double])= {
  val colMin = new java.util.HashMap[String, Double] // minimum
  val colMax = new java.util.HashMap[String, Double] // maximum
  val colMean = new java.util.HashMap[String, Double] // mean
  val colStddev = new java.util.HashMap[String, Double] // standard deviation
  val colMedian = new java.util.HashMap[String, Double] // "median" (see note above)

  // Passing row.getAs[Double](col) straight into put() never unboxes, so a
  // non-Double cell would be stored unchanged in a map typed as Double.
  // Convert explicitly instead.
  def asDouble(cell: Any): Option[Double] = cell match {
    case n: Number => Some(n.doubleValue())
    case null      => None
    case other     => scala.util.Try(other.toString.trim.toDouble).toOption
  }

  dfdesc.collect().foreach { row =>
    val mapKey = row.getAs[String]("summary")
    val isMin = "min".equalsIgnoreCase(mapKey)
    val isMax = "max".equalsIgnoreCase(mapKey)
    val target =
      if ("mean".equalsIgnoreCase(mapKey)) colMean
      else if ("stddev".equalsIgnoreCase(mapKey)) colStddev
      else if (isMin) colMin
      else if (isMax) colMax
      else colMedian

    for (col <- mathColArr) {
      val cell = row.getAs[Any](col)
      if (isMin) {
        log.info("col is " + col +", min is : "+ cell)
      } else if (isMax) {
        log.info("col is " + col +", max is : "+ cell)
      }
      asDouble(cell) match {
        case Some(value) => target.put(col, value)
        case None =>
          if (cell != null) {
            log.warn("col is " + col + ", summary is " + mapKey + ", ignoring non-numeric value : " + cell)
          }
      }
    }
  }
  (colMin, colMax, colMean, colStddev, colMedian)
}
}
scala-spark练手--dataframe数据可视化初稿的更多相关文章
- Spark GraphX 的数据可视化
概述 Spark GraphX 本身并不提供可视化的支持, 我们通过第三方库 GraphStream 和 Breeze 来实现这一目标 详细 代码下载:http://www.demodashi.com ...
- 大数据技术之_27_电商平台数据分析项目_02_预备知识 + Scala + Spark Core + Spark SQL + Spark Streaming + Java 对象池
第0章 预备知识0.1 Scala0.1.1 Scala 操作符0.1.2 拉链操作0.2 Spark Core0.2.1 Spark RDD 持久化0.2.2 Spark 共享变量0.3 Spark ...
- 练手mysqlbinlog日志恢复数据(centos6.5 64,mysql5.1)
练手mysql bin log日志相关 系统是centos 6.5 64 阿里云的服务器 mysql版本5.1 1 如何开启bin-log日志? vi /etc/my.cnf [mysqld] log ...
- spark 将dataframe数据写入Hive分区表
从spark1.2 到spark1.3,spark SQL中的SchemaRDD变为了DataFrame,DataFrame相对于SchemaRDD有了较大改变,同时提供了更多好用且方便的API.Da ...
- python实现列表页数据的批量抓取练手练手的
python实现列表页数据的批量抓取,练手的,下回带分页的 #!/usr/bin/env python # coding=utf-8 import requests from bs4 import B ...
- Python--matplotlib 绘图可视化练手--折线图/条形图
最近学习matplotlib绘图可视化,感觉知识点比较多,边学习边记录. 对于数据可视化,个人建议Jupyter Notebook. 1.首先导包,设置环境 import pandas as pd i ...
- Spark入门之DataFrame/DataSet
目录 Part I. Gentle Overview of Big Data and Spark Overview 1.基本架构 2.基本概念 3.例子(可跳过) Spark工具箱 1.Dataset ...
- 使用bokeh-scala进行数据可视化
目录 前言 bokeh简介及胡扯 bokeh-scala基本代码 我的封装 总结 一.前言 最近在使用spark集群以及geotrellis框架(相关文章见http://www.cnbl ...
- 大数据基础知识问答----spark篇,大数据生态圈
Spark相关知识点 1.Spark基础知识 1.Spark是什么? UCBerkeley AMPlab所开源的类HadoopMapReduce的通用的并行计算框架 dfsSpark基于mapredu ...
随机推荐
- 遍历 DataSet
DataSet ds=new DataSet ; //获取dataset的第一张table,取其他table只须改下标 DataTable dt=ds.tables[]; //遍历行 foreach( ...
- 关于H5 storage 的一些注意事项以及用法
在我们使用H5 storage之前,先了解一下storage的介绍吧: Storage模块管理应用本地数据存储,用于应用数据的保存和读取,应用本地数据localStorage,sessionStora ...
- apache设置映射文件夹的配置方法
在apache的配置文件中加入以下配置 Alias /uploadImage F:/upload <Directory F:/upload/UploadFiles> Option ...
- 整理grep实战文本搜索过滤技巧
一:grep的简介: 文本搜索工具,根据用户指定的文本模式对目标文件进行逐行搜索,显示能够被模式所匹配到的行.配合正则表达式的使用可以实现强大的文本处理.下面一一说明正则的例子. 二:文本处理工具分类 ...
- Ubuntu Vim YouCompleteMe 安装
0. 必要工具安装 sudo apt-get install build-essential cmake 1. 安装 vundle mkdir ~/.vim/bundle git clone http ...
- Oracle分析函数之LEAD和LAG
LAG 访问结果集中当前行之前的行 LAG (value_expr [,offset] [,default])OVER ( [ partition_by_clause ] order_by_claus ...
- 微软Hololens学院教程-Hologram 212-Voice(语音)【微软教程已经更新,本文是老版本】
这是老版本的教程,为了不耽误大家的时间,请直接看原文,本文仅供参考哦!原文链接:https://developer.microsoft.com/EN-US/WINDOWS/HOLOGRAPHIC/ho ...
- Android Learning:微信第三方登录
这两天,解决了微信第三方授权登录的问题,作为一个新手,想想也是一把辛酸泪.我想着,就把我的遇到的坑给大家分享一下,避免新手遇到我这样的问题能够顺利避开. 步骤一 微信开发者平台 我开始的解决思路是,去 ...
- python 操作mongodb数据库模糊查询
# -*- coding: utf-8 -*-import pymongoimport refrom pymongo import MongoClient #创建连接#10.20.66.106clie ...
- Java 编程:如何提高性能?(简单总结篇)
开发者在编程中除了要有编程规范,还要注意性能,在 Java 编程中有什么提高性能的好办法呢? 本文转自国内 ITOM 行业领军企业 OneAPM Cloud Insight(一款能够优雅监控多种操作系 ...