Spark中的键值对操作
1.PairRDD介绍
// Sample sentences; each one becomes a single element of the RDD.
List<String> list = new ArrayList<String>();
list.add("this is a test");
list.add("how are you?");
list.add("do you love me?");
list.add("can you tell me?");
JavaRDD<String> lines = sc.parallelize(list);

// Keys every sentence by its first space-separated token; the whole sentence is the value.
PairFunction<String, String, String> keyByFirstWord =
        new PairFunction<String, String, String>() {
            public Tuple2<String, String> call(String s) throws Exception {
                String firstWord = s.split(" ")[0];
                return new Tuple2<String, String>(firstWord, s);
            }
        };
JavaPairRDD<String, String> map = lines.mapToPair(keyByFirstWord);

| 函数名 | 目的 | 示例 | 结果 |
| subtractByKey | 删掉RDD中键与other RDD 中的键相同的元素 | rdd.subtractByKey(other) | {(1,2)} |
| join | 对两个RDD进行内连接 | rdd.join(other) | {(3,(4,9)),(3,(6,9))} |
| rightOuterJoin | 对两个RDD进行连接操作,右外连接 | rdd.rightOuterJoin(other) | {(3,(Some(4),9)),(3,(Some(6),9))} |
| leftOuterJoin | 对两个RDD进行连接操作,左外连接 | rdd.leftOuterJoin(other) | {(1,(2,None)),(3,(4,Some(9))),(3,(6,Some(9)))} |
| cogroup | 将两个RDD中拥有相同键的数据分组 | rdd.cogroup(other) | {(1,([2],[])),(3,([4,6],[9]))} |
// Keep only the pairs of the pair RDD `map` whose value is shorter than 20 characters.
JavaPairRDD<String, String> result = map.filter(
        new Function<Tuple2<String, String>, Boolean>() {
            public Boolean call(Tuple2<String, String> value) throws Exception {
                return value._2().length() < 20;
            }
        }
);
// Print every remaining pair as "key: value".
// The loop variable is fully typed (no raw Tuple2) and the loop body is properly closed.
for (Tuple2<String, String> tuple : result.collect()) {
    System.out.println(tuple._1() + ": " + tuple._2());
}
// Input sentences for the word count. The list is declared here so the snippet is self-contained.
List<String> strLine = new ArrayList<String>();
strLine.add("how are you");
strLine.add("I am ok");
strLine.add("do you love me");
JavaRDD<String> input = sc.parallelize(strLine);

// Split every sentence on single spaces and flatten the pieces into one RDD of words.
JavaRDD<String> words = input.flatMap(
        new FlatMapFunction<String, String>() {
            public Iterable<String> call(String s) throws Exception {
                return Arrays.asList(s.split(" "));
            }
        }
);

// Pair each word with the count 1, then add the counts of identical words together.
JavaPairRDD<String, Integer> result = words.mapToPair(
        new PairFunction<String, String, Integer>() {
            public Tuple2<String, Integer> call(String s) throws Exception {
                // Parameterized constructor instead of the raw Tuple2 type.
                return new Tuple2<String, Integer>(s, 1);
            }
        }
).reduceByKey(
        new org.apache.spark.api.java.function.Function2<Integer, Integer, Integer>() {
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        }
);
// Four (key, value) pairs spread over two keys.
List<Tuple2<Integer, Integer>> list = new ArrayList<Tuple2<Integer, Integer>>();
list.add(new Tuple2<Integer, Integer>(1, 1));
list.add(new Tuple2<Integer, Integer>(1, 3));
list.add(new Tuple2<Integer, Integer>(2, 2));
list.add(new Tuple2<Integer, Integer>(2, 8));
JavaPairRDD<Integer, Integer> map = sc.parallelizePairs(list);

// Addition function handed to foldByKey together with the initial value 0.
Function2<Integer, Integer, Integer> add = new Function2<Integer, Integer, Integer>() {
    public Integer call(Integer left, Integer right) throws Exception {
        return left + right;
    }
};
JavaPairRDD<Integer, Integer> results = map.foldByKey(0, add);

// Print each folded pair as "key->value".
for (Tuple2<Integer, Integer> tuple : results.collect()) {
    System.out.println(tuple._1() + "->" + tuple._2());
}
/**
 * Accumulator holding a running total and a running element count, together with the
 * three functions passed to {@code combineByKey} to compute a per-key average.
 */
public class AvgCount implements Serializable {
    private int total_;
    private int num_;

    /**
     * @param total initial sum of the values
     * @param num   initial number of values
     */
    public AvgCount(int total, int num) {
        total_ = total;
        num_ = num;
    }

    /**
     * Returns the total divided by the count using float division.
     * There is no guard for a zero count: float division by zero yields NaN or an infinity.
     */
    public float avg() {
        return total_ / (float) num_;
    }

    // createCombiner(): starts an accumulator from a single value.
    static Function<Integer, AvgCount> createAcc = new Function<Integer, AvgCount>() {
        public AvgCount call(Integer x) {
            return new AvgCount(x, 1);
        }
    };

    // mergeValue(): adds one more value to an existing accumulator (mutates and returns it).
    static Function2<AvgCount, Integer, AvgCount> addAndCount = new Function2<AvgCount, Integer, AvgCount>() {
        public AvgCount call(AvgCount a, Integer x) throws Exception {
            a.total_ += x;
            a.num_ += 1;
            return a;
        }
    };

    // mergeCombiners(): merges accumulator b into accumulator a (mutates and returns a).
    static Function2<AvgCount, AvgCount, AvgCount> combine = new Function2<AvgCount, AvgCount, AvgCount>() {
        public AvgCount call(AvgCount a, AvgCount b) throws Exception {
            a.total_ += b.total_;
            a.num_ += b.num_;
            return a;
        }
    };

    /** Builds a small pair RDD, averages the values per key and prints "key: average". */
    public static void main(String args[]) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("my app");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            List<Tuple2<Integer, Integer>> list = new ArrayList<Tuple2<Integer, Integer>>();
            list.add(new Tuple2<Integer, Integer>(1, 1));
            list.add(new Tuple2<Integer, Integer>(1, 3));
            list.add(new Tuple2<Integer, Integer>(2, 2));
            list.add(new Tuple2<Integer, Integer>(2, 8));
            JavaPairRDD<Integer, Integer> nums = sc.parallelizePairs(list);
            JavaPairRDD<Integer, AvgCount> avgCounts = nums.combineByKey(createAcc, addAndCount, combine);
            Map<Integer, AvgCount> countMap = avgCounts.collectAsMap();
            for (Map.Entry<Integer, AvgCount> entry : countMap.entrySet()) {
                System.out.println(entry.getKey() + ": " + entry.getValue().avg());
            }
        } finally {
            // Release the Spark context even when the job fails.
            sc.stop();
        }
    }
}
// Same combineByKey call with an extra fourth argument (10);
// presumably the number of partitions of the result — confirm against the JavaPairRDD API.
JavaPairRDD<Integer,AvgCount> avgCounts=nums.combineByKey(createAcc,addAndCount,combine,10);
// Four (key, value) pairs with the keys 1 and 2 interleaved.
List<Tuple2<Integer, Integer>> list1 = new ArrayList<Tuple2<Integer, Integer>>();
list1.add(new Tuple2<Integer, Integer>(1, 1));
list1.add(new Tuple2<Integer, Integer>(2, 2));
list1.add(new Tuple2<Integer, Integer>(1, 3));
list1.add(new Tuple2<Integer, Integer>(2, 4));
JavaPairRDD<Integer, Integer> nums1 = sc.parallelizePairs(list1);

// Gather all values of a key into one Iterable.
JavaPairRDD<Integer, Iterable<Integer>> results = nums1.groupByKey();

// Print one line per key: "key: v1 v2 ...". The grouped values are an Iterable,
// so they are walked with an enhanced for loop.
for (Tuple2<Integer, Iterable<Integer>> tuple : results.collect()) {
    System.out.print(tuple._1() + ": ");
    for (Integer value : tuple._2()) {
        System.out.print(value + " ");
    }
    System.out.println();
}
// Two pair data sets that share the keys 1 and 2; key 3 exists only in the first one.
List<Tuple2<Integer, Integer>> list1 = new ArrayList<Tuple2<Integer, Integer>>();
List<Tuple2<Integer, Integer>> list2 = new ArrayList<Tuple2<Integer, Integer>>();
list1.add(new Tuple2<Integer, Integer>(1, 1));
list1.add(new Tuple2<Integer, Integer>(2, 2));
list1.add(new Tuple2<Integer, Integer>(1, 3));
list1.add(new Tuple2<Integer, Integer>(2, 4));
list1.add(new Tuple2<Integer, Integer>(3, 4));
list2.add(new Tuple2<Integer, Integer>(1, 1));
list2.add(new Tuple2<Integer, Integer>(1, 3));
list2.add(new Tuple2<Integer, Integer>(2, 3));
JavaPairRDD<Integer, Integer> nums1 = sc.parallelizePairs(list1);
JavaPairRDD<Integer, Integer> nums2 = sc.parallelizePairs(list2);

// For every key, group the values from nums1 and the values from nums2 side by side.
JavaPairRDD<Integer, Tuple2<Iterable<Integer>, Iterable<Integer>>> results = nums1.cogroup(nums2);

// Print "key [ values from nums1 ] [ values from nums2 ] " per key.
// Typed for-each loops replace the raw Iterator; the unmatched trailing brace is dropped.
for (Tuple2<Integer, Tuple2<Iterable<Integer>, Iterable<Integer>>> tuple : results.collect()) {
    System.out.print(tuple._1() + " [ ");
    for (Integer fromFirst : tuple._2()._1()) {
        System.out.print(fromFirst + " ");
    }
    System.out.print("] [ ");
    for (Integer fromSecond : tuple._2()._2()) {
        System.out.print(fromSecond + " ");
    }
    System.out.print("] \n");
}
JavaRDD<Integer> nums = sc.parallelize(Arrays.asList(1, 5, 3, 2, 6, 3));

// Sort key extractor: every element is its own sort key.
Function<Integer, Object> self = new Function<Integer, Object>() {
    public Object call(Integer value) throws Exception {
        return value;
    }
};
// sortBy(keyFunction, ascending, numPartitions): false requests descending order, 1 partition.
JavaRDD<Integer> results = nums.sortBy(self, false, 1);

// Print the sorted elements, one per line.
for (Integer a : results.collect()) {
    System.out.println(a);
}
// Five (key, value) pairs to be sorted by key.
// The declaration type was truncated to "ist"; it is restored to List here.
List<Tuple2<Integer, Integer>> list1 = new ArrayList<Tuple2<Integer, Integer>>();
list1.add(new Tuple2<Integer, Integer>(1, 1));
list1.add(new Tuple2<Integer, Integer>(2, 2));
list1.add(new Tuple2<Integer, Integer>(1, 3));
list1.add(new Tuple2<Integer, Integer>(2, 4));
list1.add(new Tuple2<Integer, Integer>(3, 4));
JavaPairRDD<Integer, Integer> nums1 = sc.parallelizePairs(list1);

// Key comparator using the natural Integer ordering. It implements Serializable
// in addition to Comparator, as the object is handed to Spark.
class comp implements Comparator<Integer>, Serializable {
    public int compare(Integer a, Integer b) {
        return a.compareTo(b);
    }
}

JavaPairRDD<Integer, Integer> results = nums1.sortByKey(new comp());

// Print the sorted pairs as "key: value".
for (Tuple2<Integer, Integer> tuple : results.collect()) {
    System.out.println(tuple._1() + ": " + tuple._2());
}
// User table: user name -> topics the user is subscribed to.
List<Tuple2<String, Iterable<String>>> list1 = new ArrayList<Tuple2<String, Iterable<String>>>();
list1.add(new Tuple2<String, Iterable<String>>("zhou", Arrays.asList("it", "math")));
list1.add(new Tuple2<String, Iterable<String>>("gan", Arrays.asList("money", "book")));
JavaPairRDD<String, Iterable<String>> userData = sc.parallelizePairs(list1);

// Event log: user name -> topic of a link the user visited.
List<Tuple2<String, String>> list2 = new ArrayList<Tuple2<String, String>>();
list2.add(new Tuple2<String, String>("zhou", "it"));
list2.add(new Tuple2<String, String>("zhou", "stock"));
list2.add(new Tuple2<String, String>("gan", "money"));
list2.add(new Tuple2<String, String>("gan", "book"));
JavaPairRDD<String, String> events = sc.parallelizePairs(list2);

// Attach each event to the subscription list of the same user.
JavaPairRDD<String, Tuple2<Iterable<String>, String>> joined = userData.join(events);

// True when the visited topic is NOT among the user's subscribed topics.
Function<Tuple2<String, Tuple2<Iterable<String>, String>>, Boolean> notSubscribed =
        new Function<Tuple2<String, Tuple2<Iterable<String>, String>>, Boolean>() {
            public Boolean call(Tuple2<String, Tuple2<Iterable<String>, String>> tuple) throws Exception {
                Iterable<String> subscribed = tuple._2()._1();
                String link = tuple._2()._2();
                for (String topic : subscribed) {
                    if (topic.compareTo(link) == 0) {
                        // Found in the subscription list, so this record is filtered out.
                        return false;
                    }
                }
                return true;
            }
        };

// Count the visits to topics outside the user's subscriptions and print the number.
long a = joined.filter(notSubscribed).count();
System.out.println(a);
// User table: user name -> subscribed topics.
List<Tuple2<String, Iterable<String>>> list1 = new ArrayList<Tuple2<String, Iterable<String>>>();
list1.add(new Tuple2<String, Iterable<String>>("zhou", Arrays.asList("it", "math")));
list1.add(new Tuple2<String, Iterable<String>>("gan", Arrays.asList("money", "book")));
JavaPairRDD<String, Iterable<String>> userData = sc.parallelizePairs(list1);

// Note: partitionBy is a transformation, so its result has to be assigned back to userData.
JavaPairRDD<String, Iterable<String>> hashed = userData.partitionBy(new HashPartitioner(100));
userData = hashed.persist(StorageLevel.MEMORY_AND_DISK());

// Print the partitioner held by the RDD, then whether one is present.
Optional<Partitioner> partitioner = userData.partitioner();
System.out.println(partitioner.get());
System.out.println(partitioner.isPresent());
/**
 * PageRank example: reads "source target" URL pairs from a text file, runs ten
 * rank-update iterations and prints the final rank of every URL that received contributions.
 */
public class main {
    /** Adds two rank contributions; used as the reduce function. */
    private static class Sum implements Function2<Double, Double, Double> {
        public Double call(Double a, Double b) {
            return a + b;
        }
    }

    public static void main(String args[]) {
        SparkConf conf = new SparkConf();
        conf.setAppName("my spark app");
        conf.setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            JavaRDD<String> inputs = sc.textFile("C:\\url.txt");
            /*
             * Content of the url file, one "source target" pair per line:
             * www.baidu.com www.hao123.com
             * www.baidu.com www.2345.com
             * www.baidu.com www.zhouyang.com
             * www.hao123.com www.baidu.com
             * www.hao123.com www.zhouyang.com
             * www.zhouyang.com www.baidu.com
             */
            // Build source -> distinct targets. Every line must contain two tokens;
            // splitting on runs of whitespace tolerates repeated or leading spaces.
            JavaPairRDD<String, Iterable<String>> links = inputs.mapToPair(
                    new PairFunction<String, String, String>() {
                        public Tuple2<String, String> call(String s) throws Exception {
                            String[] parts = s.trim().split("\\s+");
                            return new Tuple2<String, String>(parts[0], parts[1]);
                        }
                    }
            ).distinct().groupByKey().cache();

            // Every source URL starts with rank 1.0.
            JavaPairRDD<String, Double> ranks = links.mapValues(
                    new Function<Iterable<String>, Double>() {
                        public Double call(Iterable<String> v1) throws Exception {
                            return 1.0;
                        }
                    }
            );

            for (int current = 0; current < 10; current++) {
                // Calculates URL contributions to the rank of other URLs:
                // each URL splits its current rank evenly among the URLs it links to.
                JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                        .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
                            public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                                int urlCount = Iterables.size(s._1());
                                List<Tuple2<String, Double>> results = new ArrayList<Tuple2<String, Double>>();
                                for (String n : s._1()) {
                                    results.add(new Tuple2<String, Double>(n, s._2() / urlCount));
                                }
                                return results;
                            }
                        });
                // Re-calculates URL ranks based on neighbor contributions.
                ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
                    public Double call(Double sum) {
                        return 0.15 + sum * 0.85;
                    }
                });
            }

            // Collects all URL ranks and dumps them to the console.
            List<Tuple2<String, Double>> output = ranks.collect();
            for (Tuple2<String, Double> tuple : output) {
                System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
            }
        } finally {
            // Release the Spark context even when the job fails.
            sc.stop();
        }
    }
}
/**
 * Partitioner that assigns a URL key to a partition based on the hash of its host name,
 * so that keys sharing a host land in the same partition.
 */
class myPartitioner extends Partitioner {
    int num; // number of partitions

    /**
     * @param num number of partitions; used as the modulus in getPartition
     */
    public myPartitioner(int num) {
        this.num = num;
    }

    /** Returns the number of partitions. */
    @Override
    public int numPartitions() {
        return num;
    }

    /**
     * Maps a key to a partition index in [0, num).
     *
     * @param key the URL, cast to String
     * @return the non-negative partition index derived from the URL's host
     */
    @Override
    public int getPartition(Object key) {
        String url = (String) key;
        String domain = "";
        try {
            domain = new URL(url).getHost(); // extract the host name
        } catch (MalformedURLException e) {
            // Best effort: a malformed URL keeps the empty domain, whose hash is 0 (partition 0).
            e.printStackTrace();
        }
        int code = domain.hashCode() % num; // hash of the host, reduced to the partition range
        if (code < 0) { // % can be negative in Java, but a partition index must not be
            code += num;
        }
        return code;
    }

    /** Two myPartitioner instances are equal when they have the same number of partitions. */
    @Override
    public boolean equals(Object obj) {
        if (obj instanceof myPartitioner) {
            return ((myPartitioner) obj).num == num;
        } else {
            return false;
        }
    }

    /** Kept consistent with equals: equal partitioners produce equal hash codes. */
    @Override
    public int hashCode() {
        return num;
    }
}
Spark中的键值对操作的更多相关文章
- Spark中的键值对操作-scala
1.PairRDD介绍 Spark为包含键值对类型的RDD提供了一些专有的操作.这些RDD被称为PairRDD.PairRDD提供了并行操作各个键或跨节点重新进行数据分组的操作接口.例如,Pa ...
- Spark学习之键值对操作总结
键值对 RDD 是 Spark 中许多操作所需要的常见数据类型.键值对 RDD 通常用来进行聚合计算.我们一般要先通过一些初始 ETL(抽取.转化.装载)操作来将数据转化为键值对形式.键值对 RDD ...
- Spark学习笔记——键值对操作
键值对 RDD是 Spark 中许多操作所需要的常见数据类型 键值对 RDD 通常用来进行聚合计算.我们一般要先通过一些初始 ETL(抽取.转化.装载)操作来将数据转化为键值对形式. Spark 为包 ...
- Redis中的键值过期操作
1.过期设置 Redis 中设置过期时间主要通过以下四种方式: expire key seconds:设置 key 在 n 秒后过期: pexpire key milliseconds:设置 key ...
- Redis源码解析:09redis数据库实现(键值对操作、键超时功能、键空间通知)
本章对Redis服务器的数据库实现进行介绍,说明Redis数据库相关操作的实现,包括数据库中键值对的添加.删除.查看.更新等操作的实现:客户端切换数据库的实现:键超时相关功能的实现.键空间事件通知等. ...
- Spark学习笔记3:键值对操作
键值对RDD通常用来进行聚合计算,Spark为包含键值对类型的RDD提供了一些专有的操作.这些RDD被称为pair RDD.pair RDD提供了并行操作各个键或跨节点重新进行数据分组的操作接口. S ...
- Spark学习之键值对(pair RDD)操作(3)
Spark学习之键值对(pair RDD)操作(3) 1. 我们通常从一个RDD中提取某些字段(如代表事件时间.用户ID或者其他标识符的字段),并使用这些字段为pair RDD操作中的键. 2. 创建 ...
- spark入门(三)键值对操作
1 简述 Spark为包含键值对类型的RDD提供了一些专有的操作.这些RDD被称为PairRDD. 2 创建PairRDD 2.1 在sprk中,很多存储键值对的数据在读取时直接返回由其键值对数据组成 ...
- Spark基础:(三)Spark 键值对操作
1.pair RDD的简介 Spark为包含键值对类型的RDD提供了一些专有的操作,这些RDD就被称为pair RDD 那么如何创建pair RDD呢? 在不同的语言中有着不同的创建方式 在pytho ...
随机推荐
- C#调用C++动态库时类型转换
因为本人主要从事c#开发,但是在工作中经常需要用到c++编写的DLL,因此需要知道c++中的类型与c#中的类型是如何转换的.搜集整理如下. //C++中的DLL函数原型为 //extern &qu ...
- Entity Framework 学习初级篇2--ObjectContext、ObjectQuery、ObjectStateEntry、ObjectStateManager类的介绍
本节,简单的介绍EF中的ObjectContext.ObjectQuery.ObjectStateEntry.ObjectStateManager这个几个比较重要的类,它们都位于System.Data ...
- tps,qps
http://blog.itpub.net/22664653/viewspace-767265/
- 一种快速查询多点DS18B20温度的方法(转)
源:http://hi.baidu.com/james_xiao/item/79b961c90623093e45941623 一种快速查询多点DS18B20温度的方法 引言 为了满足实时性要 ...
- cakephp 的事件系统(Getting to grips with CakePHP’s events system), 基于观察者模式
This article was written about CakePHP 2.x and has been untested with CakePHP 3.x CakePHP seems to g ...
- Python3基础 函数 收集参数+普通参数 的示例
镇场诗: 诚听如来语,顿舍世间名与利.愿做地藏徒,广演是经阎浮提. 愿尽吾所学,成就一良心博客.愿诸后来人,重现智慧清净体.-------------------------------------- ...
- 数据结构录 之 单调队列&单调栈。
队列和栈是很常见的应用,大部分算法中都能见到他们的影子. 而单纯的队列和栈经常不能满足需求,所以需要一些很神奇的队列和栈的扩展. 其中最出名的应该是优先队列吧我觉得,然后还有两种比较小众的扩展就是单调 ...
- Django之路:模型(数据库)和自定义Field以及数据表的更改
一.Django 模型(数据库) Django模型是与数据库相关的,与数据库相关的代码一般写在models.py中,Django支持sqlite3,MySQL,PostgreSQL等数据库,只需要在s ...
- Spring MVC 与ExtJS完美集成
http://blog.csdn.net/q262800095/article/details/12021191 http://www.jb51.net/article/25267.htm
- google-c-style
http://zhanxw.com/blog/2011/03/learning-and-applying-coding-style-from-google-in-emacs/ http://stack ...