package chapter03

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Created by chenzechao on 2017/12/21.
 */

/**
Example spark-shell launch for this chapter:

spark-shell \
--master yarn-client \
--driver-memory 1G \
--driver-cores 1 \
--queue root.queue_0101_04 \
--executor-memory 2G \
--num-executors 2 \
--conf spark.executor.cores=1 \
--name 'tmp_abc_test' \
--conf spark.yarn.executor.memoryOverhead=4096 \
--conf spark.driver.maxResultSize=8G \
--conf spark.sql.hive.metastore.version=1.2.1 \
--conf spark.sql.shuffle.partitions=150
*/
object document {
// 0. Get the flag parameter / set up the environment
val conf = new SparkConf().setAppName("tianchi").setMaster("local[*]")
val sc = new SparkContext(conf)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
val hiveContext = new HiveContext(sc)

val jsonFile = "file:///tmp/upload/data/json_file"
val jsonFile_hdfs = "/tmp/ccc/tmpc/json_file"
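
// (Aside, not from the original: on Spark 2.x these separate entry points are
// unified in SparkSession, e.g.
//   val spark = org.apache.spark.sql.SparkSession.builder().enableHiveSupport().getOrCreate()
// This listing keeps the Spark 1.x SQLContext/HiveContext API throughout.)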

// Run SQL against a Hive table; this goes through the HiveContext, since a
// plain SQLContext cannot resolve Hive databases such as sx_360_safe
val df1 = hiveContext.sql("select * from sx_360_safe.sub_ladm_exc_app_s16_all_for_double").limit(200).cache()
df1.count()

// Print the schema in a tree format
df1.printSchema()

// Select only the "gu_flag" column
df1.select("gu_flag").show()

// Select everybody, but increment the age by 1
df1.select(df1("empno"), df1("age"), df1("age") + 1).show()

// Select employees older than 21
df1.filter(df1("age") > 21).select(df1("empno"), df1("age")).show()

// Count employees by age
df1.groupBy(df1("age")).count().sort(df1("age")).show()
val gb = df1.groupBy(df1("age")).count()
gb.sort(gb("count")).show()

// Save the DataFrame as a JSON file
df1.write.mode("Overwrite").format("json").save(jsonFile_hdfs)
df1.write.mode("Append").format("json").save(jsonFile_hdfs)
df1.select(df1("empno"), df1("gu_flag")).write.mode("Overwrite").format("parquet").saveAsTable("sx_360_safe.tmp_czc_20180323_04") // this is used to implicitly convert an RDD to a DataFrame.
import sqlContext.implicits._ val df2 = sqlContext.read.json(jsonFile) // Encoders for most common types are automatically provided by importing sqlContext.implicits._
val ds1 = Seq(1, 2, 3).toDS()
ds1.map(_ + 1).collect()

// Encoders are also created for case classes
case class Person(name: String, age: Long)
val ds = Seq(Person("Andy",35)).toDS()
ds.show()
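
// A DataFrame can be turned into a typed Dataset with as[T]; an added sketch,
// assuming the JSON records behind df2 expose matching "name" and "age" fields:
val peopleDs = df2.as[Person]
peopleDs.show()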

/**
 * Inferring the Schema Using Reflection
 */
import sqlContext.implicits._
case class Person2(name: String, age: Int)
val people = sc.textFile("/tmp/ccc/data/tmpa").filter(_.length > 1).map(_.split(",")).map(p => Person2(p(0), p(1).trim.toInt)).toDF()
people.registerTempTable("people")
sqlContext.sql("select * from people limit 10").show val teenagers = sqlContext.sql("select name,age from people where age >= 23 and age<= 26")
// Columns of a row can be accessed by index...
teenagers.map(t => "Name: " + t(0)).collect().foreach(println)

// ...or by field name
teenagers.map(t => "Name: " + t.getAs[String]("name")).collect().foreach(println)

// row.getValuesMap[T] retrieves multiple columns at once into a Map[String, T]
teenagers.map(_.getValuesMap[Any](List("name", "age"))).collect().foreach(println)
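
// Rows can also be taken apart with the Row extractor and pattern matching
// (a small added aside; Row is imported again below, which Scala permits):
import org.apache.spark.sql.Row
teenagers.map { case Row(name: String, age: Int) => s"Name: $name, age: $age" }.collect().foreach(println)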

/**
 * Programmatically Specifying the Schema
 */
val schemaString = "name age"
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructType, StructField, StringType}

val schema = StructType(
  schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true))
)

// Convert records of the RDD (people2) to Rows
val people2 = sc.textFile("/tmp/ccc/data/tmpa")
val rowRDD = people2.map(_.split(",")).map(p => Row(p(0), p(1).trim))

// Apply the schema to the RDD
val peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema)

// Register the DataFrame as a table
peopleDataFrame.registerTempTable("people")

// Generic load functions (Parquet is the default data source format)
val df = sqlContext.read.load("/tmp/examples/src/main/resources/users.parquet")
val df3 = sqlContext.read.format("json").load("/tmp/examples/src/main/resources/people.json")
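
// The freshly registered table can be queried like any other (a small added
// check; "people" now refers to the programmatically built DataFrame):
sqlContext.sql("select name from people").show()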

// Run SQL on files directly
val df4 = sqlContext.sql("select * from parquet.`/tmp/examples/src/main/resources/users.parquet`")

// Save modes:
/**
* ErrorIfExists (default)
* Append
* Overwrite
* Ignore
*/
val parquetFile = sqlContext.read.parquet("")
}
