SecondarySort 原理

定义IntPair，以及IntPair（first, second）的compareTo方法：先比较first的大小，再比较second的大小。

定义FirstPartitioner，是为了在分区（partition）时依据IntPair的first来选择对应的reduce。

定义FirstGroupingComparator的原因：《Pro Hadoop》里面有一部分内容详细讲解了这个问题，看后终于明白了，和大家分享一下。reduce方法每次读取一条记录，读到对应的key；但是在处理value集合时，处理完当前记录的values后，还会判断下一条记录和当前的key是不是属于同一个组。如果是的话，会继续读取这些记录的值，而这些记录也会被认为已经处理过了；直到遇到不属于当前组的记录，这次reduce调用才结束。这样一次reduce调用就会处理掉一个组中的全部记录，而不仅仅是一条。

以下是从Hadoop里取出的源码，可以对照着再理解一下：

/**

 * Licensed to the Apache Software Foundation (ASF) under one

 * or more contributor license agreements.  See the NOTICE file

 * distributed with this work for additional information

 * regarding copyright ownership.  The ASF licenses this file

 * to you under the Apache License, Version 2.0 (the

 * "License"); you may not use this file except in compliance

 * with the License.  You may obtain a copy of the License at

 *

 *     http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */

package org.apache.hadoop.examples;

import java.io.DataInput;

import java.io.DataOutput;

import java.io.IOException;

import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.RawComparator;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.io.WritableComparable;

import org.apache.hadoop.io.WritableComparator;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Partitioner;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.util.GenericOptionsParser;

/**

 * This is an example Hadoop Map/Reduce application.

 * It reads text input files that must contain two integers per line.

 * The output is sorted by the first and second number and grouped on the

 * first number.

 *

 * To run: bin/hadoop jar build/hadoop-examples.jar secondarysort

 *            <i>in-dir</i> <i>out-dir</i>

 */

public class SecondarySort {

  /**
   * A writable pair of integers that orders by the first value and then by
   * the second value.
   *
   * <p>The pair is serialized in a byte-comparable format: each integer is
   * written with {@code Integer.MIN_VALUE} subtracted from it, and the raw
   * {@link Comparator} registered for this class compares the serialized
   * bytes directly.
   */
  public static class IntPair implements WritableComparable<IntPair> {

    private int first = 0;
    private int second = 0;

    // Register the raw-bytes comparator for IntPair when the class is loaded.
    static {
      WritableComparator.define(IntPair.class, new Comparator());
    }

    /** Creates a pair whose two values are both zero. */
    public IntPair() {
    }

    /**
     * Creates a pair holding the given values.
     *
     * @param left the first value
     * @param right the second value
     */
    public IntPair(int left, int right) {
      set(left, right);
    }

    /**
     * Replaces both values of this pair.
     *
     * @param left the new first value
     * @param right the new second value
     */
    public void set(int left, int right) {
      first = left;
      second = right;
    }

    /** Returns the first value of the pair. */
    public int getFirst() {
      return first;
    }

    /** Returns the second value of the pair. */
    public int getSecond() {
      return second;
    }

    /**
     * Reads the two integers, undoing the offset applied by
     * {@link #write(DataOutput)}.
     * Encoded as: MIN_VALUE -> 0, 0 -> -MIN_VALUE, MAX_VALUE -> -1
     */
    @Override
    public void readFields(DataInput in) throws IOException {
      first = in.readInt() + Integer.MIN_VALUE;
      second = in.readInt() + Integer.MIN_VALUE;
    }

    /**
     * Writes the two integers, each with {@code Integer.MIN_VALUE}
     * subtracted from it.
     */
    @Override
    public void write(DataOutput out) throws IOException {
      out.writeInt(first - Integer.MIN_VALUE);
      out.writeInt(second - Integer.MIN_VALUE);
    }

    @Override
    public int hashCode() {
      return 157 * first + second;
    }

    /** Two pairs are equal when both of their values match. */
    @Override
    public boolean equals(Object right) {
      if (!(right instanceof IntPair)) {
        return false;
      }
      IntPair other = (IntPair) right;
      return first == other.first && second == other.second;
    }

    /** Orders by the first value; ties are broken by the second value. */
    @Override
    public int compareTo(IntPair o) {
      if (first < o.first) {
        return -1;
      }
      if (first > o.first) {
        return 1;
      }
      if (second < o.second) {
        return -1;
      }
      if (second > o.second) {
        return 1;
      }
      return 0;
    }

    /** A Comparator that compares serialized IntPair. */
    public static class Comparator extends WritableComparator {

      public Comparator() {
        super(IntPair.class);
      }

      /** Compares two serialized pairs over their full serialized lengths. */
      @Override
      public int compare(byte[] b1, int s1, int l1,
                         byte[] b2, int s2, int l2) {
        return compareBytes(b1, s1, l1, b2, s2, l2);
      }
    }
  }

  /**
   * Partition based on the first part of the pair, so that every pair with
   * the same first value is sent to the same partition.
   */
  public static class FirstPartitioner extends Partitioner<IntPair,IntWritable>{

    /**
     * Chooses a partition from the first value of the key only.
     *
     * @param key the pair being partitioned; only its first value is used
     * @param value the value paired with the key (ignored)
     * @param numPartitions the total number of partitions
     * @return a partition index in the range {@code [0, numPartitions)}
     */
    @Override
    public int getPartition(IntPair key, IntWritable value,
                            int numPartitions) {
      // Take the remainder BEFORE the absolute value. The product overflows
      // to Integer.MIN_VALUE when first == Integer.MIN_VALUE, and
      // Math.abs(Integer.MIN_VALUE) is still negative, which would produce a
      // negative partition index. For every other input this expression
      // yields the same partition as Math.abs(product) % numPartitions.
      return Math.abs((key.getFirst() * 127) % numPartitions);
    }
  }

  /**
   * Compare only the first part of the pair, so that reduce is called once
   * for each value of the first part.
   */
  public static class FirstGroupingComparator
                implements RawComparator<IntPair> {

    /**
     * Compares two serialized pairs by looking only at the leading
     * {@code Integer.SIZE / 8} bytes of each, i.e. the serialized first value.
     * The supplied lengths {@code l1} and {@code l2} are not used.
     */
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
      final int firstFieldBytes = Integer.SIZE / 8;
      return WritableComparator.compareBytes(b1, s1, firstFieldBytes,
                                             b2, s2, firstFieldBytes);
    }

    /** Compares two pairs by their first values only. */
    @Override
    public int compare(IntPair o1, IntPair o2) {
      int left = o1.getFirst();
      int right = o2.getFirst();
      if (left < right) {
        return -1;
      }
      if (left > right) {
        return 1;
      }
      return 0;
    }
  }

  /**
   * Read two integers from each line and generate a key, value pair
   * as ((left, right), right).
   *
   * <p>A line without any token produces no output. A line with a single
   * token uses 0 as the right value. Tokens after the second are ignored.
   */
  public static class MapClass
         extends Mapper<LongWritable, Text, IntPair, IntWritable> {

    // Output objects are reused across map() calls.
    private final IntPair key = new IntPair();
    private final IntWritable value = new IntWritable();

    @Override
    public void map(LongWritable inKey, Text inValue,
                    Context context) throws IOException, InterruptedException {
      StringTokenizer tokens = new StringTokenizer(inValue.toString());
      if (!tokens.hasMoreTokens()) {
        // Nothing to parse on this line, so nothing is emitted.
        return;
      }

      int left = Integer.parseInt(tokens.nextToken());
      int right = tokens.hasMoreTokens()
          ? Integer.parseInt(tokens.nextToken())
          : 0;

      key.set(left, right);
      value.set(right);
      context.write(key, value);
    }
  }

  /**
   * A reducer class that, for each reduce call, writes a separator record
   * followed by one (first, value) record for every input value. The values
   * are passed through unchanged; nothing is summed.
   */
  public static class Reduce
         extends Reducer<IntPair, IntWritable, Text, IntWritable> {

    private static final Text SEPARATOR =
      new Text("------------------------------------------------");

    // Reused holder for the textual form of the key's first value.
    private final Text first = new Text();

    @Override
    public void reduce(IntPair key, Iterable<IntWritable> values,
                       Context context
                       ) throws IOException, InterruptedException {
      // The separator record is written with a null value.
      context.write(SEPARATOR, null);

      first.set(String.valueOf(key.getFirst()));
      for (IntWritable groupedValue : values) {
        context.write(first, groupedValue);
      }
    }
  }

  /**
   * Configures and runs the secondary sort job.
   *
   * <p>After generic Hadoop options are parsed, exactly two arguments must
   * remain: the input path and the output path. The process exits with
   * status 2 on a usage error, 0 when the job completes successfully and 1
   * otherwise.
   *
   * @param args command-line arguments: generic options, then in and out paths
   * @throws Exception if job setup or execution fails
   */
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      // Program name corrected from the misspelled "secondarysrot".
      System.err.println("Usage: secondarysort <in> <out>");
      System.exit(2);
    }
    Job job = new Job(conf, "secondary sort");
    job.setJarByClass(SecondarySort.class);
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);

    // group and partition by the first int in the pair
    job.setPartitionerClass(FirstPartitioner.class);
    job.setGroupingComparatorClass(FirstGroupingComparator.class);

    // the map output is IntPair, IntWritable
    job.setMapOutputKeyClass(IntPair.class);
    job.setMapOutputValueClass(IntWritable.class);

    // the reduce output is Text, IntWritable
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }

}

SecondarySort 原理的更多相关文章

hadoop自带例子SecondarySort源码分析MapReduce原理
这里分析MapReduce原理并没用WordCount,目前没用过hadoop也没接触过大数据,感觉,只是感觉,在项目中,如果真的用到了MapReduce那待排序的肯定会更加实用. 先贴上源码 pac ...
Hadoop MapReduce 二次排序原理及其应用
关于二次排序主要涉及到这么几个东西: 在0.20.0 以前使用的是 setPartitionerClass setOutputkeyComparatorClass setOutputValueGrou ...
奇异值分解(SVD)原理与在降维中的应用
奇异值分解(Singular Value Decomposition,以下简称SVD)是在机器学习领域广泛应用的算法,它不光可以用于降维算法中的特征分解,还可以用于推荐系统,以及自然语言处理等领域.是 ...
node.js学习（三）简单的node程序&&模块简单使用&&commonJS规范&&深入理解模块原理
一.一个简单的node程序 1.新建一个txt文件 2.修改后缀修改之后会弹出这个,点击"是" 3.运行test.js 源文件使用node.js运行之后的. 如果该路径下没有该 ...
线性判别分析LDA原理总结
在主成分分析(PCA)原理总结中,我们对降维算法PCA做了总结.这里我们就对另外一种经典的降维方法线性判别分析(Linear Discriminant Analysis, 以下简称LDA)做一个总结. ...
[原] KVM 虚拟化原理探究（1）— overview
KVM 虚拟化原理探究- overview 标签(空格分隔): KVM 写在前面的话本文不介绍kvm和qemu的基本安装操作,希望读者具有一定的KVM实践经验.同时希望借此系列博客,能够对KVM底层 ...
H5单页面手势滑屏切换原理
H5单页面手势滑屏切换是采用HTML5 触摸事件(Touch) 和 CSS3动画(Transform,Transition)来实现的,效果图如下所示,本文简单说一下其实现原理和主要思路. 1.实现原理 ...
.NET Core中间件的注册和管道的构建（1）---- 注册和构建原理
.NET Core中间件的注册和管道的构建(1)---- 注册和构建原理 0x00 问题的产生管道是.NET Core中非常关键的一个概念,很多重要的组件都以中间件的形式存在,包括权限管理.会话管理 ...
python自动化测试（2）-自动化基本技术原理
python自动化测试(2) 自动化基本技术原理 1 概述在之前的文章里面提到过:做自动化的首要本领就是要会透过现象看本质 ,落实到实际的IT工作中就是透过界面看数据. 掌握上面的这样的本领 ...

随机推荐

关于A*寻路算法的认识
最近要参加学校的APP比赛,我们组做的是一个3D迷宫的小APP,我负责的是迷宫的生成与寻路. 寻路算法选择的是A*寻路算法,具体参考的是下面的这篇博客. 本文主要是谈谈自己对A*算法的理解,具体细节, ...
Jquery 实现瀑布流功能
实现展示地址:http://sandbox.runjs.cn/show/mbojrgag 源码地址:http://runjs.cn/code/qps1jebl 效果截图:
angularJs工作日记-自定义指令Directive01
新项目组使用完善的angularMVVM设计思路架构,很庆幸能够来到这个项目组,在这里的每一天都能够学习到新的知识,为了防止以后忘记,记录一下个人的理解首先接触最多的是directive,direc ...
EntityFramework 和 linq 判断是否在指定时间段内的方法
EntityFramework: System.Data.Objects.EntityFunctions.DiffDays(DateTime.Now, inputTime)判断当前时间与指定时间相差多 ...
StartSSL免费SSL证书成功申请-HTTPS让访问网站更安全
StartSSL免费SSL证书成功申请-HTTPS让访问网站更安全一.StartSSL个人证书登录申请 1.StartSSL官网: 1.官方首页:http://www.startssl.com/ 2 ...
VC 项目支撑文件解释
1.解决方案文件: a.sln 解决方案.把项目中的所有元素或者多个项目整合到一个解决方案中去. b.suo 解决方案定制项.存储用户级别对解决方案的定制,比如打开状态,断点信息. 这两个文件 ...
iOS图案锁,支持动画、图片、绘图
最近忙着搭建一个聊天用的框架,过几天应该会整理写出来吧,原理不难,但是实现后会省很多事.好久没写博客,周末心血来潮写了个图案锁,这东西没什么技术含量,网上一堆,这次这个图案锁顺便联系了怎么打包使用.a ...
python作用域 scope
可以先看:http://www.cnblogs.com/youxin/p/3645734.html 几个概念:python能够改变变量作用域的代码段是def.class.lamda.if/elif/e ...
Android学习之 UI效果
探究Android的多分辨率支持以及各种类型图标尺寸大小 - CSDN 各种数字提醒控件-Android 代码仓库-eoe Android ViewBadger - 开源中国社区 Android 微信 ...
C/C++中程序在使用堆内存时的内存复用问题
在一个C/C++程序中,如果使用了堆内存的管理机制,那么内存究竟是怎么分配与回收的呢? 先看一个程序: #include <iostream> using namespace std; i ...

SecondarySort 原理

SecondarySort 原理的更多相关文章

随机推荐

热门专题