Hadoop MapReduce输入输出类型

一、输入格式

　　1、输入分片split

　　　　　　一个分片对应一个map任务；

　　　　　　一个分片包含一个表（整个文件）上的若干行，而一条记录（单行）对应一行；

　　　　　　分片包含一个以字节为单位的长度和一组存储位置，分片不包含实际的数据；

　　　　　　map处理时会用分片的大小来排序，优先处理最大的分片；

　　hadoop中Java定义的分片为InputSplit抽象类：主要两个方法，涉及分片长度，分片起始位置

    public abstract class InputSplit{

         public abstract long getLength() throws IOException, InterruptedException;

         public abstract String[] getLocations() throws IOException, InterruptedException;

    }

　　InputSplit不需要手动去处理它，它是由InputFormat生成；InputFormat负责产生输入分片并将它们分割成记录：

    public abstract class InputFormat<K, V> {

        public abstract List<InputSplit> getSplits( JobContext context) throws IOException, InterruptedException;

        public abstract RecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException;

    }

　　InputFormat抽象类定义的两个方法：getSplits() 和 createRecordReader()

　　运行作业的客户端会调用getSplits()来计算分片，然后将它们发送到jobtracker，jobtracker会使用其存储位置来调度map任务从而在tasktracker上来处理这个分片数据。在tasktracker上，map任务把输入分片传给InputFormat的getRecordReader()方法来获得这个分片的RecordReader。RecordReader就是一个集合迭代器，map任务用一个RecordReader来生成记录的键/值对，然后再传递给map函数。

　　2、FileInputFormat类

　　　 FileInputFormat类是所有指定数据源实现类的基类，它本身主要有两个功能：a. 指定输入文件位置；b. 输入文件生成分片的实现代码段，具体实现由子类完成；

　　　 继承图：

　　　　设置输入文件位置：

　　　　　　FileInputFormat.addInputPath(job, new Path("hdfs://fileClusters:9000/wordcount.txt"));

　　　　　　或 FileInputFormat.setInputPaths(job, new Path("hdfs://fileClusters:9000/wordcount.txt"));　　　　　　

　　　　　　可添加文件过滤器, FileInputFormat 中静态方法：

　　　　　　　　public static void setInputPathFilter(Job job, Class<? extends PathFilter> filter)

　　　　　　　　文件添加时，默认就会有一个过滤器，过滤掉"." 和 "_"开头的文件，会过滤掉隐藏文件；自定义的过滤器也是在默认过滤的基础上过滤；

　　　　切分的分片大小：

　　　　　　　　一个split的大小计算：max( minimumSize, min( maximumSize, blockSize ))；

　　　　　　　　minimumSize默认为1，maximumSize默认为Long.MAX_VALUE；

　　　　　　　　所以通常 blockSize 在 minimumSize和maximumSize之间，所以一般分片大小就是块大小。

　　　　设置不切分文件：

　　　　　　　　两种方法：

　　　　　　　　　　a. 设置minimumSize的大小为Long.MAX_VALUE;

　　　　　　　　　　b. 在实现FileInputFormat的子类时，重写isSplitable()方法返回为false;

　　　　在mapper中获取文件分片信息：

　　　　　　　　在mapper中可以获取当前处理的分片的信息，可通过context.getInputSplit()方法来获取一个split；当输入的格式源于FileInputFormat时，该方法返回的InputSplit可以被强制转换化一个FileSplit(继承自InputSplit)，可调用如下信息：

　　　　　　　　　　　a. getPath() Path/String 文件的路径

　　　　　　　　　　　b. getStart()　long

　　　　　　　　　　 c. getLength()　long

　　　　　自定义一个输入格式，把整个文件作为一条记录：　

// Example 7-2. An InputFormat for reading a whole file as a record

class WholeFileInputFormat extends FileInputFormat<NullWritable, BytesWritable> {

    @Override

    protected boolean isSplitable(JobContext context, Path file) {

        return false;

    }

    @Override

    public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context)

            throws IOException, InterruptedException {

        WholeFileRecordReader reader = new WholeFileRecordReader();

        reader.initialize(split, context);

        return reader;

    }

}

//主要是实现RecordReader类

class WholeFileRecordReader extends RecordReader<NullWritable, BytesWritable> {

    private FileSplit fileSplit;

    private Configuration conf;

    private BytesWritable value = new BytesWritable();

    private boolean processed = false;

    @Override

    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {

        this.fileSplit = (FileSplit) split;

        this.conf = context.getConfiguration();

    }

    @Override

    public boolean nextKeyValue() throws IOException, InterruptedException {

        if (!processed) {

            byte[] contents = new byte[(int) fileSplit.getLength()];

            Path file = fileSplit.getPath();

            FileSystem fs = file.getFileSystem(conf);

            FSDataInputStream in = null;

            try {

                in = fs.open(file);

                IOUtils.readFully(in, contents, 0, contents.length);

                value.set(contents, 0, contents.length);

            } finally {

                IOUtils.closeStream(in);

            }

            processed = true;

            return true;

        }

        return false;

    }

    @Override

    public NullWritable getCurrentKey() throws IOException, InterruptedException {

        return NullWritable.get();

    }

    @Override

    public BytesWritable getCurrentValue() throws IOException, InterruptedException {

        return value;

    }

    @Override

    public float getProgress() throws IOException {

        return processed ? 1.0f : 0.0f;

    }

    @Override

    public void close() throws IOException {

        // do nothing }

    }

}

　　　　　　　整个文件作为一条记录的应用，把多个小文件合并为一个大文件：

public class SmallFilesToSequenceFileConverter extends Configured implements Tool {

    static class SequenceFileMapper extends Mapper<NullWritable, BytesWritable, Text, BytesWritable> {

        private Text filenameKey;

        @Override

        protected void setup(Context context) throws IOException, InterruptedException {

            InputSplit split = context.getInputSplit();

            Path path = ((FileSplit) split).getPath();

            filenameKey = new Text(path.toString());

        }

        @Override

        protected void map(NullWritable key, BytesWritable value, Context context)

                throws IOException, InterruptedException {

            context.write(filenameKey, value);

        }

    }

    @Override

    public int run(String[] args) throws Exception {

        Job job = JobBuilder.parseInputAndOutput(this, getConf(), args);

        if (job == null) {

            return -1;

        }

        job.setInputFormatClass(WholeFileInputFormat.class);

        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        job.setOutputKeyClass(Text.class);

        job.setOutputValueClass(BytesWritable.class);

        job.setMapperClass(SequenceFileMapper.class);

        return job.waitForCompletion(true) ? 0 : 1;

    }

    public static void main(String[] args) throws Exception {

        int exitCode = ToolRunner.run(new SmallFilesToSequenceFileConverter(), args);

        System.exit(exitCode);

    }

}

　　　　　文本输入：

　　　　　　　　a. TextInputFormat 行首偏移量：行内容

　　　　　　　　　　扩展：如何处理跨行Block和InputSplit

　　　　　　　　b. KeyValueTextInputFormat　　以tab划分一行的key value

　　　　　　　　c. NLineInputFormat　　让每个map收到定义的相同行数，每个分片只包含N行

　　　　　二进制输入：

　　　　　　　　Hadoop的MapReduce不只是可以处理文本信息，还可以处理二进制格式，通过会用以下几个类：

　　　　　　　　　　SequenceFileInputFormat，处理SequenceFile 和 MapFile的文件类型；

　　　　　　　　　　SequenceFileAsTextInputFormat 是 SequenceFileInputFormat的扩展，它将SequenceFile的键值转换为Text对象，这个转化是通过键和值上调用toString()方法实现。

　　　　　　　　　　SequenceFileAsBinaryInputFormat 也是SequenceFileInputFormat的扩展，它将SequenceFile的键值作为二进制对象。它们被封装为BytesWritable对象，因而可以任意解释这些字节数组。

　　　　　　多输入MultipleInputs：

　　　　　　　　它可为每条输入路径指定InputForamt 和 Mapper：　　　　　　　

MutipleInputs.addInputPath(job , ncdcInputPath, TextInputFormat.class, MaxTemperatureMapper.class);

MutipleInputs.addInputPath(job ,metofficeInputPath, TextInputFormat.class, MetofficeMaxTemperatureMapper.class);

//MutipleInputs还有一个重载，当只用一个Mapper时

public static void addInputPath(Job job, Path path, class<? extends InputFormat> inputFormatClass);

　　　　　　　　它取代了FileInputFormat.addInputPath() 和 job.setMapperClass()的调用。

　　　　　　合成输入：CompositeInputFormat ( join )、CombineFileInputFormat ( 多个小文件合并成一个split输入，是一个抽象类 )

二、输出格式

继承图：　　

　　文体输出TextOutputFormat：

　　默认的输出是文本输出TextOutputFormat，它把每条记录写为文本行，它调用toString()方法把key value转化为字符串。

　　与之对应的输入为KeyValueTextInputFormat；

　　二进制输出：与输入对应。

多输出：

　　默认一个reducer生成一个输出文件，命名为part-r-00000；

　　有时需要对输出的文件名进行控制或让每个redeucer输出多个文件，可利用 MultipleOutputFormat 类；

　　范例：按气象站来区分气象数据，各个气象站输出到不同的文件中：

　　　　方法一：可利用每个reducer创建一个输出文件的特点，通过设置多个分区，来输出到各个文件，这样做有两点不好：

　　　　　　　　　　a. 分区个数必须预先就知道；可能有空reducer，可能有的获取不到气象站信息导致值丢失；

　　　　　　　　　　b. 每个reducer处理一个气象站，可能需要过多的reducer，也会有严重的数据倾斜问题；

　　　　方法二：使用 MutipleOutputs 类：

public class PartitionByStationUsingMultipleOutputs extends Configured implements Tool {

    static class StationMapper extends Mapper<LongWritable, Text, Text, Text> {

        private NcdcRecordParser parser = new NcdcRecordParser();

        @Override

        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            parser.parse(value);

            context.write(new Text(parser.getStationId()), value);

        }

    }

    static class MultipleOutputsReducer extends Reducer<Text, Text, NullWritable, Text> {

        private MultipleOutputs<NullWritable, Text> multipleOutputs;

        @Override

        protected void setup(Context context) throws IOException, InterruptedException {

            multipleOutputs = new MultipleOutputs<NullWritable, Text>(context);

        }

        @Override

        protected void reduce(Text key, Iterable<Text> values, Context context)

                throws IOException, InterruptedException {

            for (Text value : values) {

                multipleOutputs.write(NullWritable.get(), value, key.toString());

            }

        }

        @Override

        protected void cleanup(Context context) throws IOException, InterruptedException {

            multipleOutputs.close();

        }

    }

    @Override

    public int run(String[] args) throws Exception {

        Job job = JobBuilder.parseInputAndOutput(this, getConf(), args);

        if (job == null) {

            return -1;

        }

        job.setMapperClass(StationMapper.class);

        job.setMapOutputKeyClass(Text.class);

        job.setReducerClass(MultipleOutputsReducer.class);

        job.setOutputKeyClass(NullWritable.class);

        return job.waitForCompletion(true) ? 0 : 1;

    }

    public static void main(String[] args) throws Exception {

        int exitCode = ToolRunner.run(new PartitionByStationUsingMultipleOutputs(), args);

        System.exit(exitCode);

    }

}

　　　　　　　　输出文件结果如下：　　　　　　　　　　

　　　　　　　　　　output/010010-99999-r-00027

　　　　　　　　　　output/010050-99999-r-00013

　　　　　　　　　　output/010100-99999-r-00015

　　　　　　　　　　output/010280-99999-r-00014

　　　　方法三：MultipleOutputFormat（旧API已移弃）的使用　　　　

public static class WordCountOutputFormat extends MultipleOutputFormat<Text, IntWritable> {

    private TextOutputFormat<Text, IntWritable> output = null;

    @Override

    protected RecordWriter<Text, IntWritable> getBaseRecordWriter(FileSystem fs, JobConf job, String name,

            Progressable arg3) throws IOException {

        if (output == null) {

            output = new TextOutputFormat<Text, IntWritable>();

        }

        return output.getRecordWriter(fs, job, name, arg3);

    }

    @Override

    protected String generateFileNameForKeyValue(Text key, IntWritable value, String name) {

        char c = key.toString().toLowerCase().charAt(0);

        if (c >= 'a' && c <= 'z') {

            return c + ".txt";

        }

        return "result.txt";

    }

}