Hadoop学习笔记： MapReduce Java编程简介

概述

本文以Hadoop 1.0.0之后推出的新Java API为例，介绍MapReduce的Java编程模型。新旧API的主要区别在于：新API(org.apache.hadoop.mapreduce)将旧API(org.apache.hadoop.mapred)中的接口改为了抽象类。

MapReduce编程主要将程序运行过程分为两个阶段：Map阶段和Reduce阶段。其中Map阶段由若干Map task组成，主要由InputFormat, Mapper, Partitioner等类完成工作。Reduce阶段由若干Reduce task组成，主要由Reducer, OutputFormat等类完成工作。

Java API

InputFormat:

InputFormat是一个抽象类，所有输入格式都继承于这个类，例如读取数据库的DBInputFormat，读取普通文件的FileInputFormat等。MapReduce程序依靠InputFormat类/子类完成如下工作：

1. 确定所有输入文件，将其划分为逻辑上的InputSplits分片。每个分片分别交给一个Mapper处理。

2. 提供对象RecordReader，提取分片中的内容。

/**
 * Base class of all input formats.
 *
 * <p>An input format has two jobs: determine the input and divide it into logical
 * {@code InputSplit}s, and supply the {@code RecordReader} that extracts the contents of a split.
 *
 * @param <K> type of the keys produced by the record reader
 * @param <V> type of the values produced by the record reader
 */
public abstract class InputFormat<K, V> {

  /**
   * Divides the job's input into logical splits.
   *
   * @param context the job context
   * @return the list of splits for the job
   * @throws IOException declared by the signature; the triggering conditions depend on the subclass
   * @throws InterruptedException declared by the signature
   */
  public abstract List<InputSplit> getSplits(JobContext context

                               ) throws IOException, InterruptedException;

  /**
   * Creates the record reader used to read the given split.
   *
   * @param split the split to be read
   * @param context the task attempt context
   * @return a record reader producing {@code <K, V>} pairs
   * @throws IOException declared by the signature; the triggering conditions depend on the subclass
   * @throws InterruptedException declared by the signature
   */
  public abstract

    RecordReader<K,V> createRecordReader(InputSplit split,

                                         TaskAttemptContext context

                                        ) throws IOException,

                                                 InterruptedException;

}

除非有特殊指明，TextInputFormat为默认类，该类是FileInputFormat的一个子类。使用TextInputFormat为输入格式时，Key为每一行内容在文件中的偏移量，Value为每一行的实际内容。InputFormat类的层次如下：

InputSplit:

InputSplit是一个抽象类，定义了如下三个方法。getLength()用来获取分片大小，以支持对分片进行排序。getLocations()用来获取分片的位置列表。getLocationInfo()在基类中的默认实现返回null。从代码也可以看出，实际上InputSplit并不存放数据，只是存放了实际文件的位置信息。

/**
 * Logical slice of the job input.
 *
 * <p>The class declares only size and location accessors; it has no method that returns data.
 */
public abstract class InputSplit {

  /**
   * Returns the size of the split, which allows splits to be sorted by size.
   *
   * @return the split size (presumably in bytes; confirm against the concrete subclass)
   */
  public abstract long getLength() throws IOException, InterruptedException;

  /**
   * Returns the list of locations of this split.
   *
   * @return the locations as strings (presumably host names; confirm against the subclass)
   */
  public abstract String[] getLocations() throws IOException, InterruptedException;

  /**
   * Returns additional location information for this split.
   *
   * @return {@code null} in this base implementation; callers must be prepared for that
   */
  public SplitLocationInfo[] getLocationInfo() throws IOException {

    return null;

  }

}

RecordReader

RecordReader将InputSplit解析成<K,V>对，作为Mapper的输入。

/**
 * Turns an {@code InputSplit} into {@code <KEYIN, VALUEIN>} pairs that serve as Mapper input.
 *
 * @param <KEYIN> type of the keys produced
 * @param <VALUEIN> type of the values produced
 */
public abstract class RecordReader<KEYIN, VALUEIN> implements Closeable {

  /**
   * Prepares the reader for the given split.
   *
   * @param split the split this reader will read
   * @param context the task attempt context
   */
  public abstract void initialize(InputSplit split,

                                  TaskAttemptContext context

                                  ) throws IOException, InterruptedException;

  /**
   * Moves to the next key/value pair.
   *
   * @return presumably {@code true} while another pair is available and {@code false} at the end
   *     of the split; confirm against a concrete implementation
   */
  public abstract

  boolean nextKeyValue() throws IOException, InterruptedException;

  /** Returns the current key. */
  public abstract

  KEYIN getCurrentKey() throws IOException, InterruptedException;

  /** Returns the current value. */
  public abstract

  VALUEIN getCurrentValue() throws IOException, InterruptedException;

  /**
   * Reports the reader's progress.
   *
   * @return the progress as a float (presumably in the range 0.0 to 1.0; confirm)
   */
  public abstract float getProgress() throws IOException, InterruptedException;

  /** Closes the reader. */
  public abstract void close() throws IOException;

}

自定义InputFormat代码实例：

package InputFormat;

import java.io.DataInput;

import java.io.DataOutput;

import java.io.IOException;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.io.Writable;

/**
 * {@link Writable} value type that carries a single URL stored as a Hadoop {@link Text}.
 *
 * <p>The URL is written with {@link DataOutput#writeUTF(String)} and read back with
 * {@link DataInput#readUTF()}.
 */
public class URLWritable implements Writable {

    /** The wrapped URL. Instances passed in by callers are stored as-is, not copied. */
    private Text url;

    /** Creates an instance backed by a new, empty {@link Text}. */
    public URLWritable() {
        this(new Text());
    }

    /**
     * Creates an instance wrapping the given text.
     *
     * @param url the URL text to wrap; the reference is kept, not copied
     */
    public URLWritable(Text url) {
        this.url = url;
    }

    /**
     * Replaces the wrapped URL.
     *
     * @param url the new URL text; the reference is kept, not copied
     */
    public void setUrl(Text url) {
        this.url = url;
    }

    /** Returns the wrapped URL text (the stored reference itself). */
    public Text getUrl() {
        return url;
    }

    /**
     * Serializes the URL as a single UTF string.
     *
     * @param out the sink to write to
     * @throws IOException if writing fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        final String serialized = url.toString();
        out.writeUTF(serialized);
    }

    /**
     * Reads one UTF string from the input and stores it in the wrapped {@link Text}.
     *
     * @param in the source to read from
     * @throws IOException if reading fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        final String decoded = in.readUTF();
        url.set(decoded);
    }

}

package InputFormat;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.InputSplit;

import org.apache.hadoop.mapreduce.RecordReader;

import org.apache.hadoop.mapreduce.TaskAttemptContext;

import org.apache.hadoop.mapreduce.lib.input.*;

/**
 * Record reader that hands all reading work to a {@link KeyValueLineRecordReader} and exposes
 * each record as a {@link Text} key paired with a {@link URLWritable} value.
 */
public class URLRecordReader extends RecordReader<Text, URLWritable> {

    /** Underlying reader that actually parses the split. */
    private KeyValueLineRecordReader lineReader;

    /** Most recent key obtained from the underlying reader. */
    private Text key;

    /** Reused value holder; its URL is replaced on every {@link #getCurrentValue()} call. */
    private URLWritable value;

    /**
     * Builds the reader and its underlying line reader.
     *
     * @param conf configuration passed to the underlying {@link KeyValueLineRecordReader}
     * @throws IOException if the underlying reader cannot be created
     */
    public URLRecordReader(Configuration conf) throws IOException {
        super();
        lineReader = new KeyValueLineRecordReader(conf);
        key = new Text();
        value = new URLWritable();
    }

    /** Initializes the underlying reader with the given split and context. */
    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context)
            throws IOException, InterruptedException {
        lineReader.initialize(genericSplit, context);
    }

    /** Advances the underlying reader and returns its result. */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        final boolean advanced = lineReader.nextKeyValue();
        return advanced;
    }

    /** Returns the underlying reader's current key and remembers it in {@code key}. */
    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        final Text current = lineReader.getCurrentKey();
        key = current;
        return current;
    }

    /** Wraps the underlying reader's current value in the reused {@link URLWritable}. */
    @Override
    public URLWritable getCurrentValue() throws IOException, InterruptedException {
        final Text rawUrl = lineReader.getCurrentValue();
        value.setUrl(rawUrl);
        return value;
    }

    /** Returns the progress reported by the underlying reader. */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        final float progress = lineReader.getProgress();
        return progress;
    }

    /** Closes the underlying reader. */
    @Override
    public void close() throws IOException {
        lineReader.close();
    }

}

package InputFormat;

import java.io.IOException;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.*;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

/**
 * File-based input format whose records are {@link Text} keys with {@link URLWritable} values,
 * read through a {@link URLRecordReader}.
 */
public class URLInputFormat extends FileInputFormat<Text, URLWritable> {

    /**
     * Publishes the split's string form as the task status, then creates the reader.
     *
     * @param genericSplit the split the reader will be used for
     * @param context the task attempt context; its configuration is passed to the reader
     * @return a new {@link URLRecordReader}
     * @throws IOException if the reader cannot be created
     * @throws InterruptedException declared by the overridden signature
     */
    @Override
    public RecordReader<Text, URLWritable> createRecordReader(
            InputSplit genericSplit, TaskAttemptContext context)
            throws IOException, InterruptedException {
        final String splitDescription = genericSplit.toString();
        context.setStatus(splitDescription);

        final RecordReader<Text, URLWritable> reader =
                new URLRecordReader(context.getConfiguration());
        return reader;
    }

}

package InputFormat;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.conf.Configured;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import org.apache.hadoop.util.Tool;

import org.apache.hadoop.util.ToolRunner;

/**
 * Example driver that runs a MapReduce job whose input is read through {@link URLInputFormat}.
 *
 * <p>The mapper forwards each key with the URL text of its value; the reducer joins all URLs of
 * a key into one string in which every URL is followed by {@code |}. Input and output paths and
 * the file system address are fixed values chosen for this example.
 */
public class InputFormatTest extends Configured implements Tool {

    /** Emits each input key together with the URL text carried by its value. */
    public static class MapperTest extends Mapper<Text, URLWritable, Text, Text> {

        @Override
        public void map(Text key, URLWritable value, Context context)
                throws IOException, InterruptedException {
            context.write(key, value.getUrl());
        }

    }

    /** Joins all values of a key into a single string, appending {@code |} after each value. */
    public static class ReducerTest extends Reducer<Text, Text, Text, Text> {

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // StringBuilder avoids re-copying the accumulated string on every iteration.
            StringBuilder joined = new StringBuilder();
            for (Text val : values) {
                joined.append(val.toString()).append('|');
            }
            context.write(key, new Text(joined.toString()));
        }

    }

    /**
     * Runs the job through {@link ToolRunner} and exits with its return code.
     *
     * <p>If the run throws, the stack trace is printed and the process exits with status 1 so
     * that callers (shell scripts, schedulers) can detect the failure.
     *
     * @param args command-line arguments, forwarded to {@link ToolRunner}
     */
    public static void main(String[] args) {
        int returnCode;
        try {
            returnCode = ToolRunner.run(new InputFormatTest(), args);
        } catch (Exception e) {
            e.printStackTrace();
            returnCode = 1;
        }
        System.exit(returnCode);
    }

    /**
     * Configures and runs the job, blocking until it finishes.
     *
     * @param arg0 remaining command-line arguments; not used by this example
     * @return 0 if the job succeeded, 1 otherwise
     * @throws Exception if job setup or execution fails
     */
    @Override
    public int run(String[] arg0) throws Exception {
        // Start from the configuration supplied through Configured so that settings applied by
        // the launcher are kept; fall back to a fresh one when none was set.
        Configuration conf = getConf();
        if (conf == null) {
            conf = new Configuration();
        }
        conf.set("fs.default.name", "hdfs://localhost:9001");

        Job job = Job.getInstance(conf, "InputFormatTest");
        job.setJarByClass(InputFormatTest.class);
        job.setMapperClass(MapperTest.class);
        // No combiner on purpose: ReducerTest appends '|' after every value, so running it as a
        // combiner as well would append a second '|' to values that were already joined.
        job.setReducerClass(ReducerTest.class);

        job.setInputFormatClass(URLInputFormat.class);    // custom input format
        job.setOutputFormatClass(TextOutputFormat.class); // plain text output

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path("/home/test_input/"));
        FileOutputFormat.setOutputPath(job, new Path("/home/test_output"));

        return job.waitForCompletion(true) ? 0 : 1;
    }

}

Mapper

MapReduce框架对于每一个从InputFormat产生的InputSplit，都生成一个map task来进行处理，因此分片数也就等于map task的数量。Mapper类包含如下四个方法，setup方法会在map方法执行前被调用一次，而cleanup在map结束时执行一次，默认不做任何事。run方法也就是每个map task调用的方法。map task具体的业务逻辑我们一般通过重写Mapper子类的map方法来实现。

/**
 * Base mapper. {@link #run(Context)} drives the lifecycle: {@link #setup(Context)} once, then
 * {@link #map(Object, Object, Context)} for every key/value pair the context yields, and finally
 * {@link #cleanup(Context)}. Subclasses normally override {@code map} only.
 */
public class Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {

  /** Context type passed to every lifecycle method of this mapper. */
  public abstract class Context
      implements MapContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {
  }

  /** Hook invoked once before any call to {@code map}; does nothing by default. */
  protected void setup(Context context) throws IOException, InterruptedException {
    // NOTHING
  }

  /**
   * Handles one input pair. The default implementation writes the pair through unchanged,
   * casting it to the output types.
   */
  @SuppressWarnings("unchecked")
  protected void map(KEYIN key, VALUEIN value, Context context)
      throws IOException, InterruptedException {
    KEYOUT outKey = (KEYOUT) key;
    VALUEOUT outValue = (VALUEOUT) value;
    context.write(outKey, outValue);
  }

  /** Hook invoked once after the last call to {@code map}; does nothing by default. */
  protected void cleanup(Context context) throws IOException, InterruptedException {
    // NOTHING
  }

  /**
   * Runs the whole map task: setup, one {@code map} call per available pair, then cleanup.
   * Cleanup is in a {@code finally} block, so it also runs when the loop throws.
   */
  public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
      while (context.nextKeyValue()) {
        KEYIN currentKey = context.getCurrentKey();
        VALUEIN currentValue = context.getCurrentValue();
        map(currentKey, currentValue, context);
      }
    } finally {
      cleanup(context);
    }
  }

}

Partitioner

由Mapper生成的中间键值对将由Partitioner来决定交由哪一个Reducer进行处理，partition的数量也就等于reducer的数量。Hadoop目前提供了四种不同的Partitioner: BinaryPartitioner, HashPartitioner, KeyFieldBasedPartitioner, TotalOrderPartitioner，默认为HashPartitioner。我们也可以通过继承Partitioner抽象类来实现自己特殊的Partitioner逻辑。

/**
 * Decides which partition an intermediate key/value pair produced by a Mapper belongs to.
 *
 * @param <KEY> type of the intermediate keys
 * @param <VALUE> type of the intermediate values
 */
public abstract class Partitioner<KEY, VALUE> {

  /**
   * Chooses the partition for one intermediate pair.
   *
   * @param key the intermediate key
   * @param value the intermediate value
   * @param numPartitions the total number of partitions
   * @return the partition number (presumably in the range 0 to numPartitions - 1; confirm)
   */
  public abstract int getPartition(KEY key, VALUE value, int numPartitions);

}

HashPartitioner代码:

/** Partitioner that assigns a pair to a partition based on the hash code of its key. */
public class HashPartitioner<K, V> extends Partitioner<K, V> {

  /**
   * Maps the key's hash code to a partition number; the value is not consulted.
   *
   * @param key the key whose {@code hashCode()} selects the partition
   * @param value ignored
   * @param numReduceTasks the divisor applied to the hash
   * @return the non-negative hash modulo {@code numReduceTasks}
   */
  public int getPartition(K key, V value, int numReduceTasks) {
    // Clearing the sign bit keeps the remainder non-negative even for negative hash codes.
    final int nonNegativeHash = key.hashCode() & Integer.MAX_VALUE;
    return nonNegativeHash % numReduceTasks;
  }

}

Reducer

Reducer中的方法类似于Mapper，一般我们通过重写Reducer子类的reduce方法来实现具体业务逻辑。

/**
 * Base reducer. {@link #run(Context)} drives the lifecycle: {@link #setup(Context)} once, then
 * {@link #reduce(Object, Iterable, Context)} for every key the context yields, and finally
 * {@link #cleanup(Context)}. Subclasses normally override {@code reduce} only.
 */
public class Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {

  /** Context type passed to every lifecycle method of this reducer. */
  public abstract class Context
      implements ReduceContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {
  }

  /** Hook invoked once before any call to {@code reduce}; does nothing by default. */
  protected void setup(Context context) throws IOException, InterruptedException {
    // NOTHING
  }

  /**
   * Handles one key and all of its values. The default implementation writes every value
   * through unchanged under the same key, casting both to the output types.
   */
  @SuppressWarnings("unchecked")
  protected void reduce(KEYIN key, Iterable<VALUEIN> values, Context context)
      throws IOException, InterruptedException {
    Iterator<VALUEIN> remaining = values.iterator();
    while (remaining.hasNext()) {
      VALUEIN value = remaining.next();
      context.write((KEYOUT) key, (VALUEOUT) value);
    }
  }

  /** Hook invoked once after the last call to {@code reduce}; does nothing by default. */
  protected void cleanup(Context context) throws IOException, InterruptedException {
    // NOTHING
  }

  /**
   * Runs the whole reduce task: setup, one {@code reduce} call per available key, then cleanup.
   * Cleanup is in a {@code finally} block, so it also runs when the loop throws.
   */
  public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
      while (context.nextKey()) {
        KEYIN currentKey = context.getCurrentKey();
        Iterable<VALUEIN> currentValues = context.getValues();
        reduce(currentKey, currentValues, context);

        // If a back up store is used, reset it
        Iterator<VALUEIN> valueIterator = context.getValues().iterator();
        if (valueIterator instanceof ReduceContext.ValueIterator) {
          ReduceContext.ValueIterator<VALUEIN> backed =
              (ReduceContext.ValueIterator<VALUEIN>) valueIterator;
          backed.resetBackupStore();
        }
      }
    } finally {
      cleanup(context);
    }
  }

}

OutputFormat

MapReduce程序依靠OutputFormat类/子类完成如下工作：

1. 验证任务的输出，比如输出目录是否存在等。

2. 提供对象RecordWriter，将内容写到文件系统上。

Hadoop默认的是TextOutputFormat，我们也可以自定义自己特殊的OutputFormat，代码可以参考之前给出的自定义InputFormat。

/**
 * Base class of all output formats.
 *
 * <p>An output format has two jobs: validate the job's output (for example, whether the output
 * directory exists), and supply the {@code RecordWriter} that writes the results to the file
 * system.
 *
 * @param <K> type of the keys written
 * @param <V> type of the values written
 */
public abstract class OutputFormat<K, V> {

  /**
   * Supplies the writer used to emit output records.
   *
   * @param context the task attempt context
   * @return a record writer for {@code <K, V>} pairs
   */
  public abstract RecordWriter<K, V>

    getRecordWriter(TaskAttemptContext context

                    ) throws IOException, InterruptedException;

  /**
   * Validates the output of the job, e.g. whether the output directory exists.
   *
   * @param context the job context
   * @throws IOException presumably when the output specification is invalid; confirm against
   *     the concrete subclass
   */
  public abstract void checkOutputSpecs(JobContext context

                                        ) throws IOException,

                                                 InterruptedException;

  /**
   * Supplies the output committer for this format.
   *
   * @param context the task attempt context
   * @return the output committer (its responsibilities are not shown in this excerpt)
   */
  public abstract

  OutputCommitter getOutputCommitter(TaskAttemptContext context

                                     ) throws IOException, InterruptedException;

}

MapReduce Java程序框架

public class Test extends Configured implements Tool{

    public static class MapperTest extends Mapper<Text, Text, Text, Text>{

        public void map(Text key, Text value, Context context)

                throws IOException, InterruptedException {

                        //map logic

            context.write(key, value);

        }

    }

    public static class ReducerTest extends Reducer<Text, Text, Text, Text>{

        public void reduce(Text key, Iterable<Text> values, Context context)

                throws IOException, InterruptedException {

            String out="";

            //reduce logic

            context.write(key, value);

        }

    }

    public static void main(String[] args) {

        try {

            int returnCode = ToolRunner.run(new InputFormatTest(), args);

            System.exit(returnCode);

        } catch (Exception e) {

            e.printStackTrace();

        }

    }

    @Override

    public int run(String[] arg0) throws Exception {

        Configuration conf = new Configuration();

        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

        if (otherArgs.length < 2) {

            System.err.println("Usage: Test_1 <in> [<in>...] <out>");

            System.exit(2);

        }

        Job job = Job.getInstance(conf, "Test");

        job.setJarByClass(InputFormatTest.class);

        job.setMapperClass(MapperTest.class);

        job.setCombinerClass(ReducerTest.class);

        job.setReducerClass(ReducerTest.class);

        job.setInputFormatClass(TextInputFormat.class); // 设置文件输入格式

        job.setOutputFormatClass(TextOutputFormat.class);// 使用默认的output格格式

        job.setMapOutputKeyClass(Text.class);

        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);

        job.setOutputValueClass(Text.class);

        for (int i = 0; i < otherArgs.length - 1; ++i) {

            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));

        }

        FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));

        job.waitForCompletion(true);

        return job.isSuccessful() ? 0 : 1;

    }

}

MapReduce Java doc：http://hadoop.apache.org/docs/current/api/

MapReduce Shuffle详解：http://langyu.iteye.com/blog/992916