package chapter03

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Created by chenzechao on 2017/12/21.
 */

/**
spark-shell \
--master yarn-client \
--driver-memory 1G \
--driver-cores 1 \
--queue root.queue_0101_04 \
--executor-memory 2G \
--num-executors 2 \
--conf spark.executor.cores=1 \
--name 'tmp_abc_test' \
--conf spark.yarn.executor.memoryOverhead=4096 \
--conf spark.driver.maxResultSize=8G \
--conf spark.sql.hive.metastore.version=1.2.1 \
--conf spark.sql.shuffle.partitions=150
*/
object document {

// 0. Get the flag parameter and set up the environment
val conf = new SparkConf().setAppName("tianchi").setMaster("local[*]")
val sc = new SparkContext(conf)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
val hiveContext = new HiveContext(sc)

// Input and output paths
val jsonFile = "file:///tmp/upload/data/json_file"
val jsonFile_hdfs = "/tmp/ccc/tmpc/json_file"
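
// A minimal sketch (not in the original): HiveContext accepts HiveQL and can
// list the tables visible through the Hive metastore.
hiveContext.tables().show()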
// Run a SQL query and cache a 200-row sample
val df1 = sqlContext.sql("select * from sx_360_safe.sub_ladm_exc_app_s16_all_for_double").limit(200).cache()
df1.count()

// Print the schema in a tree format
df1.printSchema()

// Select only the "gu_flag" column
df1.select("gu_flag").show()

// Select every row, but increment the age by 1
df1.select(df1("empno"), df1("age"), df1("age") + 1).show()

// Select employees older than 21
df1.filter(df1("age") > 21).select(df1("empno"), df1("age")).show()

// Count employees by age
df1.groupBy(df1("age")).count().sort(df1("age")).show()
val gb = df1.groupBy(df1("age")).count()
gb.sort(gb("count")).show() // save dataFrame as json file
df1.write.mode("Overwrite").format("json").save(jsonFile_hdfs)
df1.write.mode("Append").format("json").save(jsonFile_hdfs)
df1.select(df1("empno"), df1("gu_flag")).write.mode("Overwrite").format("parquet").saveAsTable("sx_360_safe.tmp_czc_20180323_04") // this is used to implicitly convert an RDD to a DataFrame.
import sqlContext.implicits._ val df2 = sqlContext.read.json(jsonFile) // Encoders for most common types are automatically provided by importing sqlContext.implicits._
val ds1 = Seq(1, 2, 3).toDS()
ds1.map(_ + 1).collect()

// Encoders are also created for case classes
case class Person(name: String, age: Long)
val ds = Seq(Person("Andy", 35)).toDS()
ds.show()
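
// A minimal sketch (not in the original): a DataFrame whose columns match the
// case class fields can be converted back to a typed Dataset with as[T].
val personDS = ds.toDF().as[Person]
personDS.filter(_.age > 30).show()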
/**
 * Inferring the Schema Using Reflection
 */
import sqlContext.implicits._
case class Person2(name: String, age: Int)
// Each input line is expected to look like "name,age"
val people = sc.textFile("/tmp/ccc/data/tmpa")
  .filter(_.length > 1)
  .map(_.split(","))
  .map(p => Person2(p(0), p(1).trim.toInt))
  .toDF()
people.registerTempTable("people")
sqlContext.sql("select * from people limit 10").show val teenagers = sqlContext.sql("select name,age from people where age >= 23 and age<= 26")
teenagers.map(t => "Name: " + t(0)).collect().foreach(println) // or by field name
teenagers.map(t => "Name: " + t.getAs[String]("name")).collect().foreach(println) // row.getValuesMap[T] retrieves multiple columns at once into a Map[String,T]
teenagers.map(_.getValuesMap[Any](List("name", "age"))).collect().foreach(println)
/**
 * Programmatically Specifying the Schema
 */
// The schema is encoded in a string
val schemaString = "name age"
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructType, StructField, StringType}

// Generate the schema from the schema string
val schema =
  StructType(
    schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true))
  )

// Convert records of the RDD (people) to Rows
val people2 = sc.textFile("/tmp/ccc/data/tmpa")
val rowRDD = people2.map(_.split(",")).map(p => Row(p(0), p(1).trim))

// Apply the schema to the RDD
val peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema)

// Register the DataFrame as a table
peopleDataFrame.registerTempTable("people") // SQL val df = sqlContext.read.load("/tmp/examples/src/main/resources/users.parquet") val df3 = sqlContext.read.format("json").load("/tmp/examples/src/main/resources/people.json") // Run SQL on files directly
// Generic load functions
val df = sqlContext.read.load("/tmp/examples/src/main/resources/users.parquet")
val df3 = sqlContext.read.format("json").load("/tmp/examples/src/main/resources/people.json")

// Run SQL on files directly
val df4 = sqlContext.sql("select * from parquet.`/tmp/examples/src/main/resources/users.parquet`")

// Save modes (see the SaveMode sketch below):
/**
* ErrorIfExists (default)
* Append
* Overwrite
* Ignore
 */
val parquetFile = sqlContext.read.parquet("") // source path left empty in the original
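
// A minimal sketch (not in the original): the mode can also be passed as the
// SaveMode enum rather than a string.
import org.apache.spark.sql.SaveMode
df1.write.mode(SaveMode.Ignore).format("parquet").save(jsonFile_hdfs)
}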
