Hadoop 使用 map 将 SequenceFile 里的小文件解压出来

上例中将HDFS里小文件通过mapper压缩到一个文件中，本例将这些小文件解压出来。

mapreduce可以按SequenceFile的key进行分片。

1、mapper

/**
 * Map-only task that unpacks the small files stored inside a SequenceFile.
 *
 * <p>Each input record is treated as one packed file: the key is read as the
 * file's original path and the value as its raw bytes. Every record is written
 * through {@link MultipleOutputs} to an output named
 * {@code 2019/<last component of the key path>}.
 */
public class MultiOutputMapper extends Mapper<Text,BytesWritable,NullWritable,Text> {

    /** Directory prefix used for every output name passed to MultipleOutputs. */
    private static final String OUTPUT_SUBDIR = "2019";

    private MultipleOutputs<NullWritable,Text> multipleOutputs;

    /** Length of the input split, captured in setup(); only used for the diagnostic print. */
    private long splitLength;

    /** Reused for every record so no per-record byte[] / Text allocation is needed. */
    private final Text contents = new Text();

    /**
     * Called once at the beginning of the task. Creates the MultipleOutputs
     * helper and records the length of the input split.
     *
     * @param context the task context
     * @throws IOException          propagated from the split length lookup
     * @throws InterruptedException propagated from the split length lookup
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        multipleOutputs = new MultipleOutputs<NullWritable, Text>(context);
        InputSplit split = context.getInputSplit();
        splitLength = split.getLength();
    }

    /**
     * Called once for each key/value pair in the input split. Writes the bytes
     * of one packed file to its own output.
     *
     * @param key     original path of the packed file
     * @param value   content of the packed file
     * @param context the task context (not used directly; output goes through MultipleOutputs)
     * @throws IOException          if writing the output fails
     * @throws InterruptedException if writing the output is interrupted
     */
    @Override
    protected void map(Text key, BytesWritable value, Context context) throws IOException, InterruptedException {
        int length = value.getLength();

        // Diagnostic output. A sample run printed
        //   split length:88505,value length:4364,bytes length:6546
        // i.e. getLength() is the real payload size while getBytes() returns the
        // backing buffer, whose length may be larger.
        System.out.printf("split length:%s,value length:%s,bytes length:%s%n",
                splitLength, length, value.getBytes().length);

        // Only the first `length` bytes of the buffer are valid. Wrapping the whole
        // buffer, e.g. new Text(value.getBytes()), would append stale bytes to the file.
        contents.set(value.getBytes(), 0, length);

        // Name the output after the file name only. Passing key.toString() as the
        // output name instead would use the original full path stored in the key.
        // NOTE(review): two keys with the same file name in different directories
        // map to the same output name - confirm this cannot happen in the input.
        Path originalPath = new Path(key.toString());
        String outputFileName = String.format("%s/%s", OUTPUT_SUBDIR, originalPath.getName());

        multipleOutputs.write(NullWritable.get(), contents, outputFileName);
    }

    /**
     * Called once at the end of the task. Closes the MultipleOutputs helper
     * before delegating to the superclass.
     *
     * @param context the task context
     * @throws IOException          if closing the outputs fails
     * @throws InterruptedException if closing the outputs is interrupted
     */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        multipleOutputs.close();
        super.cleanup(context);
    }

}

2、job

/**
 * Driver for the map-only job that unpacks a SequenceFile of small files back
 * into individual files.
 *
 * <p>Usage: {@code SequenceFileToSmallFileConverter <input path> <output path>}.
 * An existing output path is deleted recursively before the job is submitted.
 */
public class SequenceFileToSmallFileConverter {

    /**
     * Configures and runs the job, prints the elapsed wall-clock time and exits
     * with status 0 on success, 1 on job failure, or 2 when arguments are missing.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws Exception if file system access or job submission fails
     */
    public static void main(String[] args) throws Exception{
        // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: SequenceFileToSmallFileConverter <input path> <output path>");
            System.exit(2);
        }

        long startTime = System.currentTimeMillis();

        Configuration conf = new Configuration();
        Path inPath = new Path(args[0]);
        Path outPath = new Path(args[1]);

        // Remove a previous run's output so the job can write to the same path again.
        FileSystem fileSystem = outPath.getFileSystem(conf);
        if (fileSystem.exists(outPath)) {
            fileSystem.delete(outPath, true);
        }

        Job job = Job.getInstance(conf,"SequenceFileToSmallFileConverter");
        job.setJarByClass(SequenceFileToSmallFileConverter.class);
        job.setMapperClass(MultiOutputMapper.class);
        // Map-only job: mapper output is the final output.
        job.setNumReduceTasks(0);
        job.setInputFormatClass(SequenceFileInputFormat.class);

        // TextOutputFormat appends a newline after every record, which would make
        // each restored file one byte longer than the original. WholeTextOutputFormat
        // differs only in that it does not write that newline.
        job.setOutputFormatClass(WholeTextOutputFormat.class);

        job.setOutputKeyClass(NullWritable.class);
        // NOTE(review): MultiOutputMapper declares Text as its output value type, so
        // this declaration looks inconsistent with the mapper - confirm and align.
        job.setOutputValueClass(BytesWritable.class);

        FileInputFormat.addInputPath(job, inPath);
        FileOutputFormat.setOutputPath(job, outPath);

        int exitCode = job.waitForCompletion(true) ? 0:1;

        long timeSpan = System.currentTimeMillis() - startTime;
        System.out.println("运行耗时："+timeSpan+"毫秒。");

        System.exit(exitCode);
    }

}

public class WholeTextOutputFormat<K, V> extends FileOutputFormat<K, V> {

    public static String SEPARATOR = "mapreduce.output.textoutputformat.separator";

    /**

     * @deprecated Use {@link #SEPARATOR}

     */

    @Deprecated

    public static String SEPERATOR = SEPARATOR;

    protected static class LineRecordWriter<K, V>

            extends RecordWriter<K, V> {

        private static final byte[] NEWLINE =

                "\n".getBytes(StandardCharsets.UTF_8);

        protected DataOutputStream out;

        private final byte[] keyValueSeparator;

        public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {

            this.out = out;

            this.keyValueSeparator =

                    keyValueSeparator.getBytes(StandardCharsets.UTF_8);

        }

        public LineRecordWriter(DataOutputStream out) {

            this(out, "\t");

        }

        /**

         * Write the object to the byte stream, handling Text as a special

         * case.

         * @param o the object to print

         * @throws IOException if the write throws, we pass it on

         */

        private void writeObject(Object o) throws IOException {

            if (o instanceof Text) {

                Text to = (Text) o;

                out.write(to.getBytes(), 0, to.getLength());

            } else {

                out.write(o.toString().getBytes(StandardCharsets.UTF_8));

            }

        }

        public synchronized void write(K key, V value)

                throws IOException {

            boolean nullKey = key == null || key instanceof NullWritable;

            boolean nullValue = value == null || value instanceof NullWritable;

            if (nullKey && nullValue) {

                return;

            }

            if (!nullKey) {

                writeObject(key);

            }

            if (!(nullKey || nullValue)) {

                out.write(keyValueSeparator);

            }

            if (!nullValue) {

                writeObject(value);

            }

//            out.write(NEWLINE);将文本当做整体，各key之间不主动加入换行符号

        }

        public synchronized

        void close(TaskAttemptContext context) throws IOException {

            out.close();

        }

    }

    public RecordWriter<K, V>

    getRecordWriter(TaskAttemptContext job

    ) throws IOException, InterruptedException {

        Configuration conf = job.getConfiguration();

        boolean isCompressed = getCompressOutput(job);

        String keyValueSeparator= conf.get(SEPARATOR, "\t");

        CompressionCodec codec = null;

        String extension = "";

        if (isCompressed) {

            Class<? extends CompressionCodec> codecClass =

                    getOutputCompressorClass(job, GzipCodec.class);

            codec = ReflectionUtils.newInstance(codecClass, conf);

            extension = codec.getDefaultExtension();

        }

        Path file = getDefaultWorkFile(job, extension);

        FileSystem fs = file.getFileSystem(conf);

        FSDataOutputStream fileOut = fs.create(file, false);

        if (isCompressed) {

            return new WholeTextOutputFormat.LineRecordWriter<>(

                    new DataOutputStream(codec.createOutputStream(fileOut)),

                    keyValueSeparator);

        } else {

            return new WholeTextOutputFormat.LineRecordWriter<>(fileOut, keyValueSeparator);

        }

    }

}

3、验证，压入SequenceFile和解压后的文件完全相同。

[hadoop@bigdata-senior01 ~]$ hadoop fs -checksum /demo/1.txt-m-00000 /demo3/1.txt

/demo/1.txt-m-00000    MD5-of-0MD5-of-512CRC32C    0000020000000000000000007b6bd9c9f517a6ea12ede79fd43700ca

/demo3/1.txt    MD5-of-0MD5-of-512CRC32C    0000020000000000000000007b6bd9c9f517a6ea12ede79fd43700ca

Hadoop 使用 map 将 SequenceFile 里的小文件解压出来的更多相关文章

Hadoop MapReduce编程 API入门系列之小文件合并（二十九）
不多说,直接上代码. Hadoop 自身提供了几种机制来解决相关的问题,包括HAR,SequeueFile和CombineFileInputFormat. Hadoop 自身提供的几种小文件合并机制 ...
Hadoop经典案例（排序&Join&topk&小文件合并）
①自定义按某列排序,二次排序 writablecomparable中的compareto方法 ②topk a利用treemap,缺点:map中的key不允许重复:https://blog.csdn.n ...
《OD学hadoop》在LINUX下如何将tar压缩文件解压到指定的目录下
linux下tar命令解压到指定的目录 :#tar zxvf /bbs.tar.zip -C /zzz/bbs //把根目录下的bbs.tar.zip解压到/zzz/bbs下,前提要保证存在/zzz/ ...
Hadoop小文件存储方案
原文地址:https://www.cnblogs.com/ballwql/p/8944025.html HDFS总体架构在介绍文件存储方案之前,我觉得有必要先介绍下关于HDFS存储架构方面的一些知识 ...
hive 处理小文件，减少map数
1.hive.merge.mapfiles,True时会合并map输出.2.hive.merge.mapredfiles,True时会合并reduce输出.3.hive.merge.size.per. ...
hive小文件合并设置参数
Hive的后端存储是HDFS,它对大文件的处理是非常高效的,如果合理配置文件系统的块大小,NameNode可以支持很大的数据量.但是在数据仓库中,越是上层的表其汇总程度就越高,数据量也就越小.而且这些 ...
合并hive/hdfs小文件
磁盘: heads/sectors/cylinders,分别就是磁头/扇区/柱面,每个扇区512byte(现在新的硬盘每个扇区有4K) 文件系统: 文件系统不是一个扇区一个扇区的来读数据,太慢了,所以 ...
spark sql/hive小文件问题
针对hive on mapreduce 1:我们可以通过一些配置项来使Hive在执行结束后对结果文件进行合并: 参数详细内容可参考官网:https://cwiki.apache.org/conflue ...
彻底解决Hive小文件问题
最近发现离线任务对一个增量Hive表的查询越来越慢,这引起了我的注意,我在cmd窗口手动执行count操作查询发现,速度确实很慢,才不到五千万的数据,居然需要300s,这显然是有问题的,我推测可能是有 ...

随机推荐

python字符串的方法介绍
博文取自鱼C论坛文章: http://bbs.fishc.com/forum.php?mod=viewthread&tid=38992&extra=page%3D1%26filter% ...
P1803 凌乱的yyy
P1803 凌乱的yyy 题目背景快noip了,yyy很紧张! 题目描述现在各大oj上有n个比赛,每个比赛的开始.结束的时间点是知道的. yyy认为,参加越多的比赛,noip就能考的越好(假的) ...
阿里otter使用问题汇总
最近在使用otter做为和表从库.(100个分表太难查询了) user_00,user_01...user_99 => user_all 1.问题DDL语句不能执行(exception:setl ...
nodeJs 安装 npm nodeModules package.json
Nodejs 1.安装nodejs 从nodejs官网下载最新版本的node,设置环境变量这样就可以在cmd下直接用命令行操作npm 环境变量:path d:/nodejs 查看本机node及n ...
180713-Spring之借助Redis设计访问计数器之扩展篇
之前写了一篇博文,简单的介绍了下如何利用Redis配合Spring搭建一个web的访问计数器,之前的内容比较初级,现在考虑对其进行扩展,新增访问者记录记录当前站点的总访问人数(根据Ip或则设备号) ...
第五模块：WEB开发基础第2章·JavaScript基础
01-JavaScript的历史发展过程 02-js的引入方式和输出 03-命名规范和变量的声明定义 04-五种基本数据类型 05-运算符 06-字符串处理 07-数据类型转换 08-流程控制语句if ...
【Extremely Basic Words for Listening】word list
[Extremely Basic Words for Listening]word list updated continuously recite count: 0 careless exercis ...
git push origin master 错误解决办法
一.错误代码如下: error: failed to push some refs to 'https://github.com/wbingithub/drag.git' 二.在网上搜了一下,如下写就 ...
MySQL数据库怎么截取字符串？
函数: 1.从左开始截取字符串 left(str, length) 说明:left(被截取字段,截取长度) 例:select left(content,200) as abstract from my ...
ubuntu samba配置注意事项
1. 下载samba前, ubuntu镜像源需要更新为国内源,否则samba的安装会非常慢亲测,清华的镜像源速度满足要求. A.登录 https://mirrors.tuna.tsinghua.ed ...

Hadoop 使用 map 将 SequenceFile 里的小文件解压出来

Hadoop 使用 map 将 SequenceFile 里的小文件解压出来的更多相关文章

随机推荐

热门专题