spark_learn
package chapter03 import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext} /**
* Created by chenzechao on 2017/12/21.
*/ /**
spark-shell \
--master yarn-client \
--driver-memory 1G \
--driver-cores 1 \
--queue root.queue_0101_04 \
--executor-memory 2G \
--num-executors 2 \
--conf spark.executor.cores=1 \
--name 'tmp_abc_test' \
--conf spark.yarn.executor.memoryOverhead=4096 \
--conf spark.driver.maxResultSize=8G \
--conf spark.sql.hive.metastore.version=1.2.1 \
--conf spark.sql.shuffle.partitions=150
*/
/**
 * Demo script exercising the SQLContext / HiveContext DataFrame and Dataset APIs.
 *
 * All statements live directly in the object body, so they run once, in order,
 * when `document` is first initialised. The script reads a Hive table, writes
 * JSON / Parquet output and builds DataFrames from text files, so it needs the
 * hard-coded tables and paths below to exist.
 */
object document {
  // 0. Get the parameter flag (nothing is read here) / set up the environment.
  val conf = new SparkConf().setAppName("tianchi").setMaster("local[*]")
  val sc = new SparkContext(conf)
  val sqlContext = new org.apache.spark.sql.SQLContext(sc)
  val hiveContext = new HiveContext(sc)

  val jsonFile = "file:///tmp/upload/data/json_file"
  val jsonFile_hdfs = "/tmp/ccc/tmpc/json_file"

  // Execute SQL.
  // The source table is a Hive table, so it has to be queried through the
  // HiveContext: a plain SQLContext only knows about temporary tables and
  // cannot persist a DataFrame with saveAsTable (see below).
  val df1 = hiveContext.sql("select * from sx_360_safe.sub_ladm_exc_app_s16_all_for_double").limit(200).cache()
  df1.count()

  // Print the schema in a tree format
  df1.printSchema()

  // Select only the "gu_flag" column
  df1.select("gu_flag").show()

  // Select everybody, but increment the age by 1
  df1.select(df1("empno"), df1("age"), df1("age") + 1).show()

  // Select emp age older than 21
  df1.filter(df1("age") > 21).select(df1("empno"), df1("age")).show()

  // Count emp by age
  df1.groupBy(df1("age")).count().sort(df1("age")).show()
  val gb = df1.groupBy(df1("age")).count()
  gb.sort(gb("count")).show()

  // Save the DataFrame as JSON files: first replace the target, then append
  // the same rows again (demonstrates both save modes).
  df1.write.mode("Overwrite").format("json").save(jsonFile_hdfs)
  df1.write.mode("Append").format("json").save(jsonFile_hdfs)
  // Persist two columns as a Parquet-backed table (works because df1 belongs
  // to the HiveContext).
  df1.select(df1("empno"), df1("gu_flag")).write.mode("Overwrite").format("parquet").saveAsTable("sx_360_safe.tmp_czc_20180323_04")

  // This is used to implicitly convert an RDD to a DataFrame.
  import sqlContext.implicits._

  val df2 = sqlContext.read.json(jsonFile)

  // Encoders for most common types are automatically provided by importing sqlContext.implicits._
  val ds1 = Seq(1, 2, 3).toDS()
  ds1.map(_ + 1).collect()

  // Encoders are also created for case classes
  case class Person(name: String, age: Long)
  val ds = Seq(Person("Andy", 35)).toDS()
  ds.show()

  /**
   * Inferring the Schema Using Reflection
   */
  case class Person2(name: String, age: Int)
  // Lines of length <= 1 are dropped before splitting on ",".
  val people = sc.textFile("/tmp/ccc/data/tmpa")
    .filter(_.length > 1)
    .map(_.split(","))
    .map(p => Person2(p(0), p(1).trim.toInt))
    .toDF()
  people.registerTempTable("people")
  sqlContext.sql("select * from people limit 10").show()

  val teenagers = sqlContext.sql("select name,age from people where age >= 23 and age<= 26")
  // Access a column by position ...
  teenagers.map(t => "Name: " + t(0)).collect().foreach(println)
  // ... or by field name
  teenagers.map(t => "Name: " + t.getAs[String]("name")).collect().foreach(println)
  // row.getValuesMap[T] retrieves multiple columns at once into a Map[String,T]
  teenagers.map(_.getValuesMap[Any](List("name", "age"))).collect().foreach(println)

  /**
   * Programmatically Specifying the Schema
   */
  val schemaString = "name age"
  import org.apache.spark.sql.Row
  import org.apache.spark.sql.types.{StructType,StructField,StringType}

  // Every field named in schemaString becomes a nullable string column.
  val schema =
    StructType(
      schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true))
    )

  // Convert records of the RDD (people) to Rows.
  val people2 = sc.textFile("/tmp/ccc/data/tmpa")
  // Same short-line filter as for `people` above: without it a blank line
  // splits into a single element and p(1) throws ArrayIndexOutOfBoundsException.
  val rowRDD = people2
    .filter(_.length > 1)
    .map(_.split(","))
    .map(p => Row(p(0), p(1).trim))

  // Apply the schema to the RDD
  val peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema)

  // Register the DataFrame as a table (replaces the "people" temp table registered above)
  peopleDataFrame.registerTempTable("people")

  // Generic load: no format given, so the reader's default data source is used
  val df = sqlContext.read.load("/tmp/examples/src/main/resources/users.parquet")
  // Load with an explicit format
  val df3 = sqlContext.read.format("json").load("/tmp/examples/src/main/resources/people.json")

  // Run SQL on files directly
  val df4 = sqlContext.sql("select * from parquet.`/tmp/examples/src/main/resources/users.parquet`")

  // Save modes
  /**
   * ErrorIfExists (default)
   * Append
   * Overwrite
   * Ignore
   */

  // NOTE(review): the original passed an empty path (""), which cannot be
  // resolved to a file; the users.parquet file used above is substituted here —
  // confirm this is the intended input.
  val parquetFile = sqlContext.read.parquet("/tmp/examples/src/main/resources/users.parquet")
}
spark_learn的更多相关文章
随机推荐
- Cg与RenderMonkey 之旅
http://news.mydrivers.com/1/15/15020_all.htm [前言] 您可能还没有意识到---您手头的这块显卡(或者说这块GPU)---它不仅仅是一个应用工具(游戏.平面 ...
- 标准模板库(STL)学习指南之priority_queue优先队列
转载自CSDN博客:http://blog.csdn.net/suwei19870312/article/details/5294016 priority_queue 调用 STL里面的 make_h ...
- 洛谷【P1601】A+B Problem(高精)
题目传送门:https://www.luogu.org/problemnew/show/P1601 高精度加法板子.我们灵性地回忆一波小学学加法列竖式的场景(从\(6\)岁开始口算从未打过草稿的大佬请 ...
- java代码throws异常
总结:抛出异常 package com.ds; //异常捕获 public class fdsg { private static void throwException() { try { Stri ...
- JVM体系结构之三:方法区之1
一.简介 方法区在JVM中也是一个非常重要的区域,它与堆一样,是被线程共享的区域.在方法区中,存储了每个类的信息(包括类的名称.方法信息.字段信息).静态变量.常量以及编译器编译后的代码等. 方法区( ...
- shell入门-sort排序
命令:sort 选项:-t:-kn 指定根据某段来排序 这里n代表数字,范围指定n,N.从n到N范围 -n 按数字顺序排列 -r 反序排列 -u 去重复排序 -un 数字顺序排列并去重复,系 ...
- js中Function方法
function.apply(thisArg,argArray) apply方法调用function,传递一个会绑定到this上的对象和一个可选的数组作为参数. apply方法被用在apply调用模式 ...
- 关于startservice的几个启动返回值的意义
START_NOT_STICKY 如果服务进程在它启动后(从onStartCommand()返回后)被kill掉, 并且没有新启动的intent传给他, 那么将服务移出启动状态并且不重新生成, 直到再 ...
- [51nod1094]和为k的连续区间
法一:暴力$O({n^2})$看脸过 #include<bits/stdc++.h> using namespace std; typedef long long ll; ],sum[]; ...
- 17、GATK使用简介 Part2/2
转载:http://blog.sina.com.cn/s/blog_6721167201018jik.html Change Logs: 13/01/12: 增加了一篇文献,外加一些无聊的修改.12/ ...