spark算子
1.map
一条一条读取
def map(): Unit ={
val list = List("张无忌", "赵敏", "周芷若")
val listRDD = sc.parallelize(list)
val nameRDD = listRDD.map(name => "Hello " + name)
nameRDD.foreach(name => println(name))
}
2.flatMap
扁平化
def flatMap(): Unit ={
val list = List("张无忌 赵敏","宋青书 周芷若")
val listRDD = sc.parallelize(list)
val nameRDD = listRDD.flatMap(line => line.split(" ")).map(name => "Hello " + name)
nameRDD.foreach(name => println(name))
}
3.mapPartitions
一次读取一个分区数据
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List(1, 2, 3, 4, 5, 6)
val rdd = spark.parallelize(list, 2)
rdd.foreach(println)
val rdd2 = rdd.mapPartitions(iterator => {
val newList = new ListBuffer[String]
while (iterator.hasNext) {
newList.append("hello" + iterator.next())
}
newList.toIterator
})
rdd2.foreach(name => println(name))
}
}
4.mapPartitionsWithIndex
一次读取一个分区数据,并且知道是哪个分区的
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List(1, 2, 3, 4, 5, 6)
val rdd = spark.parallelize(list, 2)
val rdd2 = rdd.mapPartitionsWithIndex((index, iterator) => {
val newList = new ListBuffer[String]
while (iterator.hasNext) {
newList.append(index + "_" + iterator.next())
}
newList.toIterator
})
rdd2.foreach(name => println(name))
}
}
5.reduce
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List(1, 2, 3, 4, 5, 6)
val rdd = spark.parallelize(list)
val result = rdd.reduce((x, y) => x + y)
println(result)
}
}
6.reduceBykey
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List(("武当", 99), ("少林", 97), ("武当", 89), ("少林", 77))
val rdd = spark.parallelize(list)
val rdd2 = rdd.reduceByKey(_ + _)
rdd2.foreach(tuple => println(tuple._1 + ":" + tuple._2))
}
}
7.union
合并,但不去重
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list1 = List(1,2,3,4)
val list2 = List(3,4,5,6)
val rdd1 = spark.parallelize(list1)
val rdd2 = spark.parallelize(list2)
rdd1.union(rdd2).foreach(println)
}
}
8.join
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list1 = List((1, "东方不败"), (2, "令狐冲"), (3, "林平之"))
val list2 = List((1, 99), (2, 98), (3, 97))
val rdd1 = spark.parallelize(list1)
val rdd2 = spark.parallelize(list2)
val rdd3 = rdd1.join(rdd2)
rdd3.foreach(tuple => {
val id = tuple._1
val new_tuple = tuple._2
val name = new_tuple._1
val score = new_tuple._2
println("学号:" + id + " 姓名:" + name + " 成绩:" + score)
})
}
}
9.groupbyKey
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List(("武当", "张三丰"), ("峨眉", "灭绝师太"), ("武当", "宋青书"), ("峨眉", "周芷若"))
val rdd1 = spark.parallelize(list)
val rdd2 = rdd1.groupByKey()
rdd2.foreach(t => {
val menpai = t._1
val iterator = t._2.iterator
var people = ""
while (iterator.hasNext) people = people + iterator.next + " "
println("门派:" + menpai + "人员:" + people)
})
}
}
10.cartesian
笛卡尔积
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list1 = List("A", "B")
val list2 = List(1, 2, 3)
val list1RDD = spark.parallelize(list1)
val list2RDD = spark.parallelize(list2)
list1RDD.cartesian(list2RDD).foreach(t => println(t._1 + "->" + t._2))
}
}
11.filter
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List(1,2,3,4,5,6,7,8,9,10)
val listRDD = spark.parallelize(list)
listRDD.filter(num => num % 2 ==0).foreach(print(_))
}
}
12.distinct
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List(1,1,2,2,3,3,4,5)
val rdd = spark.parallelize(list)
rdd.distinct().foreach(println)
}
}
13.intersection
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list1 = List(1,2,3,4)
val list2 = List(3,4,5,6)
val list1RDD = spark.parallelize(list1)
val list2RDD = spark.parallelize(list2)
list1RDD.intersection(list2RDD).foreach(println(_))
}
}
14.coalesce
分区有多-->少
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List(1,2,3,4,5)
spark.parallelize(list,3).coalesce(1).foreach(println(_))
}
}
15.repartition
进行重分区
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List(1,2,3,4)
val listRDD = spark.parallelize(list,1)
listRDD.repartition(2).foreach(println(_))
}
}
16.repartitionAndSortWithinPartitions
在给定的partitioner内部进行排序,性能比repartition要高。
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List(1, 4, 55, 66, 33, 48, 23)
val listRDD = spark.parallelize(list, 1)
listRDD.map(num => (num, num))
.repartitionAndSortWithinPartitions(new HashPartitioner(2))
.mapPartitionsWithIndex((index, iterator) => {
val listBuffer: ListBuffer[String] = new ListBuffer
while (iterator.hasNext) {
listBuffer.append(index + "_" + iterator.next())
}
listBuffer.iterator
}, false)
.foreach(println(_))
}
}
17.cogroup
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list1 = List((1, "www"), (2, "bbs"))
val list2 = List((1, "cnblog"), (2, "cnblog"), (3, "very"))
val list3 = List((1, "com"), (2, "com"), (3, "good"))
val list1RDD = spark.parallelize(list1)
val list2RDD = spark.parallelize(list2)
val list3RDD = spark.parallelize(list3)
list1RDD.cogroup(list2RDD,list3RDD).foreach(tuple =>
println(tuple._1 + " " + tuple._2._1 + " " + tuple._2._2 + " " + tuple._2._3))
}
}
18.sortByKey
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List((99, "张三丰"), (96, "东方不败"), (66, "林平之"), (98, "聂风"))
spark.parallelize(list).sortByKey(false).foreach(tuple => println(tuple._2 + "->" + tuple._1))
}
}
19.aggregateByKey
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List("you,jump", "i,jump")
spark.parallelize(list)
.flatMap(_.split(","))
.map((_, 1))
.aggregateByKey(0)(_ + _, _ + _)
.foreach(tuple => println(tuple._1 + "->" + tuple._2))
}
}
apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
object Demo {
val conf = new SparkConf().setAppName("Demo").setMaster("local");
// val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val spark = new SparkContext(conf)
def main(args: Array[String]): Unit = {
val list = List(("武当", "张三丰"), ("峨眉", "灭绝师太"), ("武当", "宋青书"), ("峨眉", "周芷若"))
val rdd1 = spark.parallelize(list)
val rdd2 = rdd1.groupByKey()
rdd2.foreach(t => {
val menpai = t._1
val iterator = t._2.iterator
var people = ""
while (iterator.hasNext) people = people + iterator.next + " "
println("门派:" + menpai + "人员:" + people)
})
}
}
spark算子的更多相关文章
- (转)Spark 算子系列文章
http://lxw1234.com/archives/2015/07/363.htm Spark算子:RDD基本转换操作(1)–map.flagMap.distinct Spark算子:RDD创建操 ...
- Spark算子总结及案例
spark算子大致上可分三大类算子: 1.Value数据类型的Transformation算子,这种变换不触发提交作业,针对处理的数据项是Value型的数据. 2.Key-Value数据类型的Tran ...
- UserView--第二种方式(避免第一种方式Set饱和),基于Spark算子的java代码实现
UserView--第二种方式(避免第一种方式Set饱和),基于Spark算子的java代码实现 测试数据 java代码 package com.hzf.spark.study; import ...
- UserView--第一种方式set去重,基于Spark算子的java代码实现
UserView--第一种方式set去重,基于Spark算子的java代码实现 测试数据 java代码 package com.hzf.spark.study; import java.util.Ha ...
- spark算子之DataFrame和DataSet
前言 传统的RDD相对于mapreduce和storm提供了丰富强大的算子.在spark慢慢步入DataFrame到DataSet的今天,在算子的类型基本不变的情况下,这两个数据集提供了更为强大的的功 ...
- Spark算子总结(带案例)
Spark算子总结(带案例) spark算子大致上可分三大类算子: 1.Value数据类型的Transformation算子,这种变换不触发提交作业,针对处理的数据项是Value型的数据. 2.Key ...
- Spark算子---实战应用
Spark算子实战应用 数据集 :http://grouplens.org/datasets/movielens/ MovieLens 1M Datase 相关数据文件 : users.dat --- ...
- spark算子集锦
Spark 是大数据领域的一大利器,花时间总结了一下 Spark 常用算子,正所谓温故而知新. Spark 算子按照功能分,可以分成两大类:transform 和 action.Transform 不 ...
- Spark算子使用
一.spark的算子分类 转换算子和行动算子 转换算子:在使用的时候,spark是不会真正执行,直到需要行动算子之后才会执行.在spark中每一个算子在计算之后就会产生一个新的RDD. 二.在编写sp ...
- Spark:常用transformation及action,spark算子详解
常用transformation及action介绍,spark算子详解 一.常用transformation介绍 1.1 transformation操作实例 二.常用action介绍 2.1 act ...
随机推荐
- Assignment HDU - 2853(求最小改变边数)
Assignment Time Limit: 2000/1000 MS (Java/Others) Memory Limit: 32768/32768 K (Java/Others)Total ...
- 如何取消Paypal自动付款功能
在国外在线服务消费肯定会常遇到PayPal的支付方式,有些人可能PayPal有些余额可能会用这个工具来支付,但付款后,可能服务因为不满意而退掉,但第二年却自动续约了?但明明服务已退掉,这该怎么处理呢? ...
- maven 使用 log4j
Log4j是Apache的一个开源项目,通过使用Log4j,我们可以控制日志信息输送的目的地是控制台.文件.GUI组件,甚至是套接口服务器.NT的事件记录器.UNIX Syslog守护进程等:我们也可 ...
- Python判断自定义的参数格式是否正确
import argparse def args_validation(valid_list, valid_value): assert valid_value in valid_list, 'inv ...
- mtd-utils 安装
title: mkdosfs 安装 tags: linux date: 2018/12/26/ 17:08:54 --- mtd-utils安装 for 主机 在制作根文件系统中需要使用它制作jffs ...
- Pandas系列(五)-分类数据处理
内容目录 1. 创建对象 2. 常用操作 3. 内存使用量的陷阱 一.创建对象 1.基本概念:分类数据直白来说就是取值为有限的,或者说是固定数量的可能值.例如:性别.血型. 2.创建分类数据:这里以血 ...
- win10添加右键打开命令窗口
新建文件cmd.reg,将下面代码贴入 Windows Registry Editor Version 5.00 [HKEY_CLASSES_ROOT\Directory\Background\she ...
- Word中页眉、页码设置
本篇博文简单介绍一下文档中页眉.页码设置的问题 一个项目中,封面一般不需要页眉,要关闭首页的页眉,可以在"页眉和页脚工具->选项->首页不同"可以如下设置: 图 1关闭 ...
- .NET面试题系列(十七)前端面试
JavaScript js如何实现继承 CSS 行内元素和块状元素的区别 CSS让2个DIV在同一行显示的解决方法 在CSS中,div属于块级元素,每个块级元素默认占一行高度,一行内添加一个块级 ...
- end to end testing
概念 https://www.softwaretestinghelp.com/what-is-end-to-end-testing/ What is “End to End Testing”? Ter ...