spark_learn

package chapter03

import org.apache.spark.sql.DataFrame

import org.apache.spark.sql.hive.HiveContext

import org.apache.spark.{SparkConf, SparkContext}

/**

  * Created by chenzechao on 2017/12/21.

  */

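/**
  * Example spark-shell launch command (YARN client mode) for trying the snippets below interactively:

  spark-shell                                      \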

  --master yarn-client                             \

  --driver-memory 1G                               \

  --driver-cores 1                                 \

  --queue root.queue_0101_04                       \

  --executor-memory 2G                            \

  --num-executors 2                                \

  --conf spark.executor.cores=1                    \

  --name 'tmp_abc_test'                           \

  --conf spark.yarn.executor.memoryOverhead=4096   \

  --conf spark.driver.maxResultSize=8G             \

  --conf spark.sql.hive.metastore.version=1.2.1    \

  --conf spark.sql.shuffle.partitions=150

  */

// Scratch collection of Spark SQL snippets: DataFrame queries, JSON / Parquet I/O,
// Datasets, and the two ways of turning an RDD into a DataFrame.
//
// Every statement lives directly in the object body, so all of it runs once, top to
// bottom, the first time `document` is initialised.
object document {

  // 0. Get the flag parameter

  // 0. Set up the environment
  val conf = new SparkConf().setAppName("tianchi").setMaster("local[*]")
  val sc = new SparkContext(conf)
  val sqlContext = new org.apache.spark.sql.SQLContext(sc)
  val hiveContext = new HiveContext(sc)

  val jsonFile = "file:///tmp/upload/data/json_file"
  val jsonFile_hdfs = "/tmp/ccc/tmpc/json_file"

  // Run SQL.
  // The source is a Hive table, so it has to be resolved through hiveContext: a plain
  // SQLContext has no access to the Hive metastore, and the saveAsTable call further
  // down would also be rejected for a DataFrame that belongs to a plain SQLContext.
  val df1 = hiveContext
    .sql("select * from sx_360_safe.sub_ladm_exc_app_s16_all_for_double")
    .limit(200)
    .cache()

  // cache() is lazy; count() is an action and forces the 200-row sample to be computed.
  df1.count()

  // Print the schema in a tree format
  df1.printSchema()

  // Select only the "gu_flag" column
  df1.select("gu_flag").show()

  // Select everybody, but increment the age by 1
  df1.select(df1("empno"), df1("age"), df1("age") + 1).show()

  // Select emp age older than 21
  df1.filter(df1("age") > 21).select(df1("empno"), df1("age")).show()

  // Count emp by age, ordered by age
  df1.groupBy(df1("age")).count().sort(df1("age")).show()

  // Same aggregation, ordered by the number of rows per age instead
  val gb = df1.groupBy(df1("age")).count()
  gb.sort(gb("count")).show()

  // Save the DataFrame as JSON files.
  // The first write replaces whatever is at the path; the second appends the same rows
  // again, so the directory ends up holding the sample twice (save-mode demonstration).
  df1.write.mode("Overwrite").format("json").save(jsonFile_hdfs)
  df1.write.mode("Append").format("json").save(jsonFile_hdfs)

  // Persist two columns as a Parquet-backed table (works because df1 comes from hiveContext).
  df1.select(df1("empno"), df1("gu_flag"))
    .write
    .mode("Overwrite")
    .format("parquet")
    .saveAsTable("sx_360_safe.tmp_czc_20180323_04")

  // This is used to implicitly convert an RDD to a DataFrame (and local Seqs to Datasets).
  // A single import covers the rest of the object body.
  import sqlContext.implicits._

  val df2 = sqlContext.read.json(jsonFile)

  // Encoders for most common types are automatically provided by importing sqlContext.implicits._
  val ds1 = Seq(1, 2, 3).toDS()
  ds1.map(_ + 1).collect()

  // Encoders are also created for case classes
  case class Person(name: String, age: Long)

  val ds = Seq(Person("Andy", 35)).toDS()
  ds.show()

  /**
    * Inferring the Schema Using Reflection
    */
  case class Person2(name: String, age: Int)

  // Each input line is expected to look like "name,age"; blank lines are dropped before splitting.
  val people = sc.textFile("/tmp/ccc/data/tmpa")
    .filter(_.length > 1)
    .map(_.split(","))
    .map(p => Person2(p(0), p(1).trim.toInt))
    .toDF()

  people.registerTempTable("people")

  sqlContext.sql("select * from people limit 10").show()

  val teenagers = sqlContext.sql("select name,age from people where age >= 23 and age<= 26")

  // Columns of a Row can be read by position ...
  teenagers.map(t => s"Name: ${t(0)}").collect().foreach(println)

  // ... or by field name
  teenagers.map(t => s"Name: ${t.getAs[String]("name")}").collect().foreach(println)

  // row.getValuesMap[T] retrieves multiple columns at once into a Map[String,T]
  teenagers.map(_.getValuesMap[Any](List("name", "age"))).collect().foreach(println)

  /**
    * Programmatically Specifying the Schema
    */
  val schemaString = "name age"

  import org.apache.spark.sql.Row
  import org.apache.spark.sql.types.{StructType, StructField, StringType}

  // One nullable string column per whitespace-separated name in schemaString.
  val schema =
    StructType(
      schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, nullable = true))
    )

  // Convert records of the RDD (people) to Rows
  val people2 = sc.textFile("/tmp/ccc/data/tmpa")

  // Blank lines are skipped here as well, matching the reflection-based pipeline above;
  // without the filter an empty line makes p(1) throw ArrayIndexOutOfBoundsException.
  val rowRDD = people2
    .filter(_.length > 1)
    .map(_.split(","))
    .map(p => Row(p(0), p(1).trim))

  // Apply the schema to the RDD
  val peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema)

  // Register the DataFrame as a table.
  // Note: this reuses the name "people", replacing the temp table registered earlier.
  peopleDataFrame.registerTempTable("people")

  // Generic load: with no explicit format the default data source is used
  val df = sqlContext.read.load("/tmp/examples/src/main/resources/users.parquet")

  // Generic load with the format given explicitly
  val df3 = sqlContext.read.format("json").load("/tmp/examples/src/main/resources/people.json")

  // Run SQL on files directly
  val df4 = sqlContext.sql("select * from parquet.`/tmp/examples/src/main/resources/users.parquet`")

  // Save modes
  /**
    * ErrorIfExists (default)
    * Append
    * Overwrite
    * Ignore
    */

  // The original passed an empty string here, which cannot be turned into a path and
  // aborts the whole object initialisation. Read the sample Parquet file used above.
  // NOTE(review): confirm this is the file the snippet was meant to load.
  val parquetFile = sqlContext.read.parquet("/tmp/examples/src/main/resources/users.parquet")

}

spark_learn的更多相关文章

随机推荐

Unity中的ShaderToys——将大神们写的shader搬到unity中来吧
http://lib.csdn.net/article/unity3d/38699 这篇文章翻译自国外的一篇文章(这里是原文链接),正在使用unity的你是否在shader toy上发现很多牛逼哄哄的 ...
Javascript：必须知道的Javascript知识点之“单线程事件驱动”
heiboard: Javascript:必须知道的Javascript知识点之“单线程事件驱动”
MFC中如何不使用Unicode字符集
命令窗口:调试->属性-> 把字符集设置为:未设置
图解Stm32使用jlink下载程序时jtag接口(SW和JTAG模式)的简化方法
转自: http://www.it165.net/embed/html/201308/2332.html 用过stm32的人都知道stm32有两种常用下载程序的方法,用串口和jlink.串口下载方法和 ...
ng2 样式控制之style绑定和class绑定
记一次SQL xml字段关联查询
需求: 一张表是APP表,结构如下: app_category为该游戏所属的类别ID,xml字段类型另一张表是类别表,就ID对应名称,这就不上图了. 还有一张表是每个游戏的下载记录,结构如下: Do ...
Web项目的导出和部署
-----------------siwuxie095 Web 项目的导出工程结构目录如下: ...
微信小程序报错.wxss无法找到
小程序原来一直运行正常,编译都没有问题,但今天更新了一下工具,就一直编译不过,报.wxss无法找到,搜索半天,才解决. 解决方案如下: 在控制台输入openVendor(), 在打开的目录中清除wcs ...
hadoop2.6.0完全分布式部署
这里是hadoop最小的配置,也就是修改最少量的东西让hadoop跑起来. 系统是 Centos6.7 64位, hadoop是2.6.0,虚拟机是VMWare WorkStation 假设虚拟机启动 ...
应用程序无法正常启动提示错误0xc000007b 问题的原因和解决方法
应用程序无法正常启动提示错误0xc000007b 问题的原因和解决方法前提条件: 你使用的是VS201x软件编写程序,你使用的电脑是X64位的,并且你在使用OpenCV库.你编写的程序可以正常编译, ...

spark_learn

spark_learn的更多相关文章

随机推荐

热门专题