spark算子
1.map
一条一条读取
def map(): Unit ={
val list = List("张无忌", "赵敏", "周芷若")
val listRDD = sc.parallelize(list)
val nameRDD = listRDD.map(name => "Hello " + name)
nameRDD.foreach(name => println(name))
}
2.flatMap
扁平化
def flatMap(): Unit ={
val list = List("张无忌 赵敏","宋青书 周芷若")
val listRDD = sc.parallelize(list)
val nameRDD = listRDD.flatMap(line => line.split(" ")).map(name => "Hello " + name)
nameRDD.foreach(name => println(name))
}
3.mapPartitions
一次读取一个分区数据
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List(1, 2, 3, 4, 5, 6)
val rdd = spark.parallelize(list, 2)
rdd.foreach(println)
val rdd2 = rdd.mapPartitions(iterator => {
val newList = new ListBuffer[String]
while (iterator.hasNext) {
newList.append("hello" + iterator.next())
}
newList.toIterator
})
rdd2.foreach(name => println(name))
}
}
4.mapPartitionsWithIndex
一次读取一个分区数据,并且知道是哪个分区的
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List(1, 2, 3, 4, 5, 6)
val rdd = spark.parallelize(list, 2)
val rdd2 = rdd.mapPartitionsWithIndex((index, iterator) => {
val newList = new ListBuffer[String]
while (iterator.hasNext) {
newList.append(index + "_" + iterator.next())
}
newList.toIterator
})
rdd2.foreach(name => println(name))
}
}
5.reduce
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List(1, 2, 3, 4, 5, 6)
val rdd = spark.parallelize(list)
val result = rdd.reduce((x, y) => x + y)
println(result)
}
}
6.reduceBykey
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List(("武当", 99), ("少林", 97), ("武当", 89), ("少林", 77))
val rdd = spark.parallelize(list)
val rdd2 = rdd.reduceByKey(_ + _)
rdd2.foreach(tuple => println(tuple._1 + ":" + tuple._2))
}
}
7.union
合并,但不去重
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list1 = List(1,2,3,4)
val list2 = List(3,4,5,6)
val rdd1 = spark.parallelize(list1)
val rdd2 = spark.parallelize(list2)
rdd1.union(rdd2).foreach(println)
}
}
8.join
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list1 = List((1, "东方不败"), (2, "令狐冲"), (3, "林平之"))
val list2 = List((1, 99), (2, 98), (3, 97))
val rdd1 = spark.parallelize(list1)
val rdd2 = spark.parallelize(list2)
val rdd3 = rdd1.join(rdd2)
rdd3.foreach(tuple => {
val id = tuple._1
val new_tuple = tuple._2
val name = new_tuple._1
val score = new_tuple._2
println("学号:" + id + " 姓名:" + name + " 成绩:" + score)
})
}
}
9.groupbyKey
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List(("武当", "张三丰"), ("峨眉", "灭绝师太"), ("武当", "宋青书"), ("峨眉", "周芷若"))
val rdd1 = spark.parallelize(list)
val rdd2 = rdd1.groupByKey()
rdd2.foreach(t => {
val menpai = t._1
val iterator = t._2.iterator
var people = ""
while (iterator.hasNext) people = people + iterator.next + " "
println("门派:" + menpai + "人员:" + people)
})
}
}
10.cartesian
笛卡尔积
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list1 = List("A", "B")
val list2 = List(1, 2, 3)
val list1RDD = spark.parallelize(list1)
val list2RDD = spark.parallelize(list2)
list1RDD.cartesian(list2RDD).foreach(t => println(t._1 + "->" + t._2))
}
}
11.filter
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List(1,2,3,4,5,6,7,8,9,10)
val listRDD = spark.parallelize(list)
listRDD.filter(num => num % 2 ==0).foreach(print(_))
}
}
12.distinct
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List(1,1,2,2,3,3,4,5)
val rdd = spark.parallelize(list)
rdd.distinct().foreach(println)
}
}
13.intersection
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list1 = List(1,2,3,4)
val list2 = List(3,4,5,6)
val list1RDD = spark.parallelize(list1)
val list2RDD = spark.parallelize(list2)
list1RDD.intersection(list2RDD).foreach(println(_))
}
}
14.coalesce
分区有多-->少
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List(1,2,3,4,5)
spark.parallelize(list,3).coalesce(1).foreach(println(_))
}
}
15.repartition
进行重分区
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List(1,2,3,4)
val listRDD = spark.parallelize(list,1)
listRDD.repartition(2).foreach(println(_))
}
}
16.repartitionAndSortWithinPartitions
在给定的partitioner内部进行排序,性能比repartition要高。
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List(1, 4, 55, 66, 33, 48, 23)
val listRDD = spark.parallelize(list, 1)
listRDD.map(num => (num, num))
.repartitionAndSortWithinPartitions(new HashPartitioner(2))
.mapPartitionsWithIndex((index, iterator) => {
val listBuffer: ListBuffer[String] = new ListBuffer
while (iterator.hasNext) {
listBuffer.append(index + "_" + iterator.next())
}
listBuffer.iterator
}, false)
.foreach(println(_))
}
}
17.cogroup
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list1 = List((1, "www"), (2, "bbs"))
val list2 = List((1, "cnblog"), (2, "cnblog"), (3, "very"))
val list3 = List((1, "com"), (2, "com"), (3, "good"))
val list1RDD = spark.parallelize(list1)
val list2RDD = spark.parallelize(list2)
val list3RDD = spark.parallelize(list3)
list1RDD.cogroup(list2RDD,list3RDD).foreach(tuple =>
println(tuple._1 + " " + tuple._2._1 + " " + tuple._2._2 + " " + tuple._2._3))
}
}
18.sortByKey
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List((99, "张三丰"), (96, "东方不败"), (66, "林平之"), (98, "聂风"))
spark.parallelize(list).sortByKey(false).foreach(tuple => println(tuple._2 + "->" + tuple._1))
}
}
19.aggregateByKey
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List("you,jump", "i,jump")
spark.parallelize(list)
.flatMap(_.split(","))
.map((_, 1))
.aggregateByKey(0)(_ + _, _ + _)
.foreach(tuple => println(tuple._1 + "->" + tuple._2))
}
}
apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List(("武当", "张三丰"), ("峨眉", "灭绝师太"), ("武当", "宋青书"), ("峨眉", "周芷若"))
val rdd1 = spark.parallelize(list)
val rdd2 = rdd1.groupByKey()
rdd2.foreach(t => {
val menpai = t._1
val iterator = t._2.iterator
var people = ""
while (iterator.hasNext) people = people + iterator.next + " "
println("门派:" + menpai + "人员:" + people)
})
}
}
spark算子的更多相关文章
- (转)Spark 算子系列文章
http://lxw1234.com/archives/2015/07/363.htm Spark算子:RDD基本转换操作(1)–map.flagMap.distinct Spark算子:RDD创建操 ...
- Spark算子总结及案例
spark算子大致上可分三大类算子: 1.Value数据类型的Transformation算子,这种变换不触发提交作业,针对处理的数据项是Value型的数据. 2.Key-Value数据类型的Tran ...
- UserView--第二种方式(避免第一种方式Set饱和),基于Spark算子的java代码实现
UserView--第二种方式(避免第一种方式Set饱和),基于Spark算子的java代码实现 测试数据 java代码 package com.hzf.spark.study; import ...
- UserView--第一种方式set去重,基于Spark算子的java代码实现
UserView--第一种方式set去重,基于Spark算子的java代码实现 测试数据 java代码 package com.hzf.spark.study; import java.util.Ha ...
- spark算子之DataFrame和DataSet
前言 传统的RDD相对于mapreduce和storm提供了丰富强大的算子.在spark慢慢步入DataFrame到DataSet的今天,在算子的类型基本不变的情况下,这两个数据集提供了更为强大的的功 ...
- Spark算子总结(带案例)
Spark算子总结(带案例) spark算子大致上可分三大类算子: 1.Value数据类型的Transformation算子,这种变换不触发提交作业,针对处理的数据项是Value型的数据. 2.Key ...
- Spark算子---实战应用
Spark算子实战应用 数据集 :http://grouplens.org/datasets/movielens/ MovieLens 1M Datase 相关数据文件 : users.dat --- ...
- spark算子集锦
Spark 是大数据领域的一大利器,花时间总结了一下 Spark 常用算子,正所谓温故而知新. Spark 算子按照功能分,可以分成两大类:transform 和 action.Transform 不 ...
- Spark算子使用
一.spark的算子分类 转换算子和行动算子 转换算子:在使用的时候,spark是不会真正执行,直到需要行动算子之后才会执行.在spark中每一个算子在计算之后就会产生一个新的RDD. 二.在编写sp ...
- Spark:常用transformation及action,spark算子详解
常用transformation及action介绍,spark算子详解 一.常用transformation介绍 1.1 transformation操作实例 二.常用action介绍 2.1 act ...
随机推荐
- Pthread 用法笔记
什么是线程? 从技术上讲,一个线程被定义为一个独立的指令流. 一个进程可以包含一个或多个线程. 线程操作包括线程创建,终止,同步(连接,阻塞),调度,数据管理和进程交互. 进程内的所有线程共享: 相同 ...
- 清北学堂Day2
算数基本定理: 1.整数及其相关 2.唯一分解定理 对于任意的大于1的正整数N,N一定能够分解成有限个质数的乘积,即 其中P1<P2<...<Pk,a1,a2,...,ak>= ...
- Loj #528. 「LibreOJ β Round #4」求和 (莫比乌斯反演)
题目链接:https://loj.ac/problem/528 题目:给定两个正整数N,M,你需要计算ΣΣu(gcd(i,j))^2 mod 998244353 ,其中i属于[1,N],j属于[1,M ...
- Stanford Local 2016 G "Ground Defense"(线段树)
传送门 题意: 有 n 个城市,编号 1~n: 有两种操作:Update,Query Update: E i s a d 更新区间[ i,i+d-1 ], i 节点降落 s 人, i+1 节点降落 s ...
- 礼物(中国剩余定理+拓展gcd求逆元+分治=拓展Lucus)
礼物 题意: 求\[C(n,m)\ \%\ p\] \(n,m,p\le 10^9\),且若\(p=\prod_{i=1}^{k}{p_i}^{c_i}\),则\(\forall i\in [1..k ...
- (二分查找 拓展) leetcode 69. Sqrt(x)
Implement int sqrt(int x). Compute and return the square root of x, where x is guaranteed to be a no ...
- EXCEL(1)级联下拉框
EXCEL级联下拉框 http://jingyan.baidu.com/article/3c343ff756e0cf0d377963f9.html 在输入一些多级项目时,如果输入前一级内容后,能够自动 ...
- 深入剖析Kubernetes学习笔记:深入理解镜像(08)
一.Python 应用案例环境 [root@k8s-node1 Flask]# pwd /opt/Dockerfile/Flask [root@k8s-node1 Flask]# ll total 1 ...
- 2018-2019-2 《Java程序设计》第1周学习总结
# 20175319 2018-2019-2 <Java程序设计>第1周学习总结 ## 教材学习内容总结 第一周我根据老师提供的博客,下载和设置了各种需要的软件,并对这些软件进行初步的了解 ...
- 18、cookies与session学习笔记
本文记录学习 cookies 和 session 的一些小练习和知识点 知识点1 cookies 和 session 的由来 HTTP协议是无状态的协议,因为一旦浏览器和服务器之间的请求 ...