Define IntPair(first, second) together with its compareTo: it compares the first values first, and only when they are equal does it compare the second values.
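
As a quick illustration of that ordering, here is a minimal sketch of my own (not part of the original example) that sorts a few pairs using the IntPair class defined in the source below:

    IntPair[] pairs = { new IntPair(3, 1), new IntPair(1, 5), new IntPair(1, 2) };
    java.util.Arrays.sort(pairs);  // WritableComparable extends Comparable, so compareTo drives the sort
    // resulting order: (1,2), (1,5), (3,1) -- first ascending, ties broken by second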

Define FirstPartitioner so that, at partition time, only the first field of the IntPair decides which reducer a record is sent to; every pair sharing the same first therefore lands on the same reducer.
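
A worked example with my own numbers: assuming numPartitions = 3, a key whose first field is 10 always lands in partition Math.abs(10 * 127) % 3 = 1270 % 3 = 1, whatever its second field is, so every (10, *) pair meets at the same reducer. (One caveat worth knowing: Math.abs(Integer.MIN_VALUE) is still negative in Java, so this hash is a simple sketch that works for typical inputs rather than a fully defensive partitioner.)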

Define FirstGroupingComparator to control grouping. The book Pro Hadoop has a section that explains this in detail; after reading it the behavior finally became clear to me, so I will share it here. Each reduce call nominally reads one record and gets its key, but while iterating over the value collection, once the current record's values have been consumed the framework checks whether the next record belongs to the same group as the current key. If it does, the framework keeps reading those records' values as well, and those records count as already processed; only when a record falls outside the current group does the reduce call end. In this way a single reduce call consumes every record in a group, not just one.
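
A walk-through with made-up data: suppose the sorted map output reaching one reducer is

    (1,2) -> 2
    (1,5) -> 5
    (3,1) -> 1

With FirstGroupingComparator installed, the framework makes exactly two reduce calls: reduce((1,2), [2, 5]) for the group first = 1, and reduce((3,1), [1]) for first = 3. Without the grouping comparator there would be three calls, one per distinct (first, second) key, and the values belonging to first = 1 would be split across two of them.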

Below is the source code, taken from Hadoop, which is worth reading with the above in mind:

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.examples;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * This is an example Hadoop Map/Reduce application.
 * It reads text input files that must contain two integers per line.
 * The output is sorted by the first and second number and grouped on the
 * first number.
 *
 * To run: bin/hadoop jar build/hadoop-examples.jar secondarysort
 *            <i>in-dir</i> <i>out-dir</i>
 */
public class SecondarySort {

  /**
   * Define a pair of integers that are writable.
   * They are serialized in a byte comparable format.
   */
  public static class IntPair
      implements WritableComparable<IntPair> {
    private int first = 0;
    private int second = 0;

    /**
     * Set the left and right values.
     */
    public void set(int left, int right) {
      first = left;
      second = right;
    }
    public IntPair() {}
    public IntPair(int left, int right) {
      set(left, right);
    }
    public int getFirst() {
      return first;
    }
    public int getSecond() {
      return second;
    }
    /**
     * Read the two integers.
     * Encoded as: MIN_VALUE -> 0, 0 -> -MIN_VALUE, MAX_VALUE -> -1
     */
    @Override
    public void readFields(DataInput in) throws IOException {
      first = in.readInt() + Integer.MIN_VALUE;
      second = in.readInt() + Integer.MIN_VALUE;
    }
    @Override
    public void write(DataOutput out) throws IOException {
      out.writeInt(first - Integer.MIN_VALUE);
      out.writeInt(second - Integer.MIN_VALUE);
    }
    @Override
    public int hashCode() {
      return first * 157 + second;
    }
    @Override
    public boolean equals(Object right) {
      if (right instanceof IntPair) {
        IntPair r = (IntPair) right;
        return r.first == first && r.second == second;
      } else {
        return false;
      }
    }
    /** A Comparator that compares serialized IntPair. */
    public static class Comparator extends WritableComparator {
      public Comparator() {
        super(IntPair.class);
      }

      public int compare(byte[] b1, int s1, int l1,
                         byte[] b2, int s2, int l2) {
        return compareBytes(b1, s1, l1, b2, s2, l2);
      }
    }

    static {  // register this comparator
      WritableComparator.define(IntPair.class, new Comparator());
    }

    @Override
    public int compareTo(IntPair o) {
      if (first != o.first) {
        return first < o.first ? -1 : 1;
      } else if (second != o.second) {
        return second < o.second ? -1 : 1;
      } else {
        return 0;
      }
    }
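
    // Why the MIN_VALUE shift in write()/readFields(): writing
    // (first - Integer.MIN_VALUE) maps the signed int range onto unsigned
    // byte order (MIN_VALUE -> 0x00000000, 0 -> 0x80000000,
    // MAX_VALUE -> 0xFFFFFFFF), so the raw compareBytes() in Comparator
    // sorts serialized pairs in exactly the same order as compareTo().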
  }

  /**
   * Partition based on the first part of the pair.
   */
  public static class FirstPartitioner extends Partitioner<IntPair, IntWritable> {
    @Override
    public int getPartition(IntPair key, IntWritable value,
                            int numPartitions) {
      return Math.abs(key.getFirst() * 127) % numPartitions;
    }
  }

  /**
   * Compare only the first part of the pair, so that reduce is called once
   * for each value of the first part.
   */
  public static class FirstGroupingComparator
      implements RawComparator<IntPair> {
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
      return WritableComparator.compareBytes(b1, s1, Integer.SIZE/8,
                                             b2, s2, Integer.SIZE/8);
    }

    @Override
    public int compare(IntPair o1, IntPair o2) {
      int l = o1.getFirst();
      int r = o2.getFirst();
      return l == r ? 0 : (l < r ? -1 : 1);
    }
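
    // Integer.SIZE/8 == 4: only the first four serialized bytes -- the
    // encoded first field -- take part in the byte comparison, so keys
    // that share first compare as equal and fall into one reduce group.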
  }

  /**
   * Read two integers from each line and generate a key, value pair
   * as ((left, right), right).
   */
  public static class MapClass
      extends Mapper<LongWritable, Text, IntPair, IntWritable> {

    private final IntPair key = new IntPair();
    private final IntWritable value = new IntWritable();

    @Override
    public void map(LongWritable inKey, Text inValue,
                    Context context) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(inValue.toString());
      int left = 0;
      int right = 0;
      if (itr.hasMoreTokens()) {
        left = Integer.parseInt(itr.nextToken());
        if (itr.hasMoreTokens()) {
          right = Integer.parseInt(itr.nextToken());
        }
        key.set(left, right);
        value.set(right);
        context.write(key, value);
      }
    }
  }

  /**
   * A reducer class that emits, for each group, a separator line followed
   * by the first field of the key paired with every value in the group.
   */
  public static class Reduce
      extends Reducer<IntPair, IntWritable, Text, IntWritable> {
    private static final Text SEPARATOR =
      new Text("------------------------------------------------");
    private final Text first = new Text();

    @Override
    public void reduce(IntPair key, Iterable<IntWritable> values,
                       Context context
                       ) throws IOException, InterruptedException {
      context.write(SEPARATOR, null);
      first.set(Integer.toString(key.getFirst()));
      for (IntWritable value : values) {
        context.write(first, value);
      }
    }
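
    // Note: within one reduce call the framework reuses the key object and
    // refreshes it as the values iterator advances, so key.getSecond() would
    // change per value; only getFirst(), which is constant within the group,
    // is used here, and it is captured before the loop.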
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: secondarysort <in> <out>");
      System.exit(2);
    }
    Job job = new Job(conf, "secondary sort");
    job.setJarByClass(SecondarySort.class);
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);

    // group and partition by the first int in the pair
    job.setPartitionerClass(FirstPartitioner.class);
    job.setGroupingComparatorClass(FirstGroupingComparator.class);

    // the map output is IntPair, IntWritable
    job.setMapOutputKeyClass(IntPair.class);
    job.setMapOutputValueClass(IntWritable.class);

    // the reduce output is Text, IntWritable
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
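
To see the end-to-end behavior, here is a hand-traced run on sample data of my own (assuming a single reducer). Given an input file containing

    1 5
    3 1
    1 2

the mapper emits ((1,5), 5), ((3,1), 1) and ((1,2), 2); after partitioning, sorting and grouping, the reducer receives the group first = 1 with values [2, 5] and then first = 3 with [1], so the job output is

    ------------------------------------------------
    1	2
    1	5
    ------------------------------------------------
    3	1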
