SparkSQLDemo.scala


import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType} object SparkSQLDemo { // $example on:create_ds$
case class Person(name: String, age: Long)
// $example on:create_ds$ def main(args: Array[String]): Unit = {
//开启SparkSession
// $example on: init_session$
val spark = SparkSession
.builder()
.appName("SparkSQLDemo")
.master("local")
.config("spark.some.config.option", "some-value")
.getOrCreate()
// $example off: init_session$ // runBasicDataFrameDemo(spark)
// runDatasetCreationDemo(spark)
// runInferSchemaDemo(spark)
runProgrammaticSchemaDemo(spark) //关闭SparkSeesion
spark.stop() } private def runBasicDataFrameDemo(spark: SparkSession) = { val df = spark.read.json("/Users/hadoop/app/spark/examples/src/main/resources/people.json") //Displays the content of the DataFrame to stdout
df.show() //Print the schema in a tree format
df.printSchema() //Select only the "name" column
df.select("name").show() //This import is needed to use the $-notation
import spark.implicits._
df.select($"name", $"age" + 1).show() //Select people older than 21
df.select($"age" > 21).show() //Count people by age
df.groupBy("age").count().show() //$example on: global_temp_view$
//Register the DataFrame as a SQL temporary view
df.createOrReplaceTempView("people")
val sqlDF = spark.sql("select * from people")
sqlDF.show() //Register the DataFrame as a global temporary view
df.createGlobalTempView("people") //Global temporary view is tied to a system preserved database `global_temp`
spark.sql("select * from global_temp.people").show //Global temporary view is cross-session
spark.newSession().sql("select * from global_temp.people").show() } private def runDatasetCreationDemo(spark: SparkSession) = { // A container for a [[Dataset]], used for implicit conversions in Scala.
// To use this, import implicit conversions in SQL:
import spark.implicits._ // .toDS() -> 这是用括号声明的,以防止Scala编译器将`rdd.toDS(“1”)`视为调用此toDS然后应用于返回的数据集。 //Encoder are created for case classes (为case class 创建编码器)
val caseClassDS = Seq(Person("Andy", 32)).toDS()
caseClassDS.show() //Encoders for most common types are automatically provided by importing spark.implicits._
val primitiveDS = Seq(1, 2, 3).toDS()
primitiveDS.map(_ + 1).foreach(println(_))//.collect() //DataFrames can be converted to a Dataset by providing a class. Mapping will bedone by name
val path = "/Users/hadoop/app/spark/examples/src/main/resources/people.json"
val peopleDS = spark.read.json(path).as[Person]
peopleDS.show() } private def runInferSchemaDemo(spark: SparkSession) = { // $example on: schema_inferring$
//For implicit conversions from RDDs to DataFrames
import spark.implicits._ //Create an RDD of Person objects from a text file, convert it to a DataFrame
val peopleDF = spark.sparkContext
.textFile("/Users/hadoop/app/spark/examples/src/main/resources/people.txt")
.map(_.split(","))
.map(x => Person(x(0), x(1).trim.toInt))
.toDF() //Register the DataFrame as a temporary view
peopleDF.createOrReplaceTempView("people") //SQL statements can be run by using the sql methods provided by Spark
val teenagersDF = spark.sql("select name, age from people where age between 13 and 19") //The columns of a row in the result can be accessed by field index
//(结果中的行的列可以通过字段索引访问)
teenagersDF.map(teenager => s"Name: ${teenager(0)}").show() //or by field name
teenagersDF.map(teenager => s"Name: ${teenager.getAs[String]("name")}").show() //No pre-defined encoders for Dataset[Map[K,V]], define explicitly
//(Dataset[Map[K,V]] 没有预定义的编码器, 显式定义)
implicit val mapEncoder = org.apache.spark.sql.Encoders.kryo[Map[String, Any]] //Primitive types and case classes can be also defined as
//(原始类型和case类也可以定义为隐式val )
//implicit val stringIntMapEncoder: Encoder[Map[String, Any]] = ExpressionEncoder() //row.getValuesMap[T] retrieves multiple columns at once into a Map[String, T]
teenagersDF.map(teenager =>
teenager.getValuesMap[Any](List("name", "age"))
).foreach(println(_))//.collect() // $example off: schema_inferring$ } private def runProgrammaticSchemaDemo(spark: SparkSession) = { import spark.implicits._
// $example on: programmatic_schema$ //Create an RDD
val peopleRDD = spark.sparkContext.textFile("/Users/hadoop/app/spark/examples/src/main/resources/people.txt") //The schema is encoded in a string
val schemaString = "name age" //Generate the schema based on the string of schema
val fields = schemaString.split(" ")
.map(fieldName => StructField(fieldName, StringType, nullable = true))
val schema = StructType(fields) //Convert records of the RDD (people) to Rows
val rowRDD = peopleRDD
.map(_.split(","))
.map(attributes => Row(attributes(0), attributes(1).trim)) //Apply the schema to the RDD
val peopleDF = spark.createDataFrame(rowRDD, schema) //Creates a temporary view using the DataFrame
peopleDF.createOrReplaceTempView("people") //SQL can be run over a temporary view created using DataFrames
val results = spark.sql("select name from people") //The results of SQL queries are DataFrames and support all the normal RDD operations
//The columns of a row in the result can be accessed by field index or by field name
results.map(attributes => s"Name: ${attributes(0)}").show() // $exmaple off: programmatic_schema$
}
}


sparkSQL中的example学习(1)的更多相关文章

  1. sparkSQL中的example学习(3)

    UserDefinedTypedAggregation.scala(用户可自定义类型) import org.apache.spark.sql.expressions.Aggregator impor ...

  2. sparkSQL中的example学习(2)

    UserDefinedUntypedAggregate.scala(默认返回类型为空,不能更改) import org.apache.spark.sql.{Row, SparkSession} imp ...

  3. PHP中的Libevent学习

    wangbin@2012,1,3 目录 Libevent在php中的应用学习 1.      Libevent介绍 2.      为什么要学习libevent 3.      Php libeven ...

  4. JS中childNodes深入学习

    原文:JS中childNodes深入学习 <html xmlns="http://www.w3.org/1999/xhtml"> <head> <ti ...

  5. CNCC2017中的深度学习与跨媒体智能

    CNCC2017中的深度学习与跨媒体智能 转载请注明作者:梦里茶 目录 机器学习与跨媒体智能 传统方法与深度学习 图像分割 小数据集下的深度学习 语音前沿技术 生成模型 基于贝叶斯的视觉信息编解码 珠 ...

  6. 【Spark篇】---SparkSQL中自定义UDF和UDAF,开窗函数的应用

    一.前述 SparkSQL中的UDF相当于是1进1出,UDAF相当于是多进一出,类似于聚合函数. 开窗函数一般分组取topn时常用. 二.UDF和UDAF函数 1.UDF函数 java代码: Spar ...

  7. 图解BERT(NLP中的迁移学习)

    目录 一.例子:句子分类 二.模型架构 模型的输入 模型的输出 三.与卷积网络并行 四.嵌入表示的新时代 回顾一下词嵌入 ELMo: 语境的重要性 五.ULM-FiT:搞懂NLP中的迁移学习 六.Tr ...

  8. python中confIgparser模块学习

    python中configparser模块学习 ConfigParser模块在python中用来读取配置文件,配置文件的格式跟windows下的ini配置文件相似,可以包含一个或多个节(section ...

  9. Scala中的类学习

    Scala中的类学习 从java了解类的情况下,了解Scala的类并不难.Scala类中的字段自动带getter和setter方法,用@BeanProperty注解生成javaBean对象的getXX ...

随机推荐

  1. 高并发高可、O2O、微服务架构用学习网站

    高并发高可.O2O.微服务架构用学习网站 https://www.itkc8.com 非常感谢http://www.cnblogs.com/skyblog/p/5044486.html 关于架构,笔者 ...

  2. mssql的text字段中文乱码

    问题: 1.在页面存入中文后乱码,从前端从后台发现数据未发生异常,发现是存入数据库后乱码: 经查询该字段为text字段,存入中文会乱码 如图 解决办法: 1.将text转为varchar或nvarch ...

  3. Service__cmd安装MySQL并连接SQLyog

    整理记录关于使用cmd安装mysql的过程   1.配置环境变量 1) 计算机->属性->高级系统设置->环境变量 2)先添加变量 变量名:MYSQL_HOME 变量值:D:\mys ...

  4. 传入一个Map<String,Long> 返回它按value排序后的结果

    //传入一个Map<String,Long> 返回它按value排序后的结果 sort为正序还是倒序(-1倒序),size为要几条数据 private static Map<Stri ...

  5. Django框架(二十四)-- Django rest_framework-路由控制与响应器

    一.路由控制 # 1.基本路由: url(r'^publish/$', views.PublishView.as_view()), # 2.半自动路径:views.PublishView.as_vie ...

  6. mysql使用命令

    1.创建用户 create user 'name'@'host' identified by 'psssword'; 2.授权 grant select, updata,insert (all) on ...

  7. linux 安装PostgreSQL12

    一.安装步骤 1.设置保存安装包的目录 # cd /usr/local/src 2.开始下载源包 # wget https://ftp.postgresql.org/pub/source/v12.1/ ...

  8. 题解:T103180 しろは的军训列队

    题目链接 solution: 按题目随便假设找到了一个x,它的位置的ap,属性bp 看下图 $$$$$$$$$$$$$$$$|||||P &&&&&&& ...

  9. git报错_you are not allowed to push code to protected branches on this project

    问题描述 今天在提交代码的时候,由于使用的是新库,写完代码后,进行push,发现报错 you are not allowed to push code to protected branches on ...

  10. 快读&快写模板【附O2优化】

    快读&快写模板 快读快写,顾名思义,就是提升输入和输出的速度.在这里简单介绍一下几种输入输出的优劣. C++ cin/cout 输入输出:优点是读入的时候不用管数据类型,也就是说不用背scan ...