倒排索引

/**

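 * Two-phase MapReduce job that builds an inverted index: phase 1 counts how often
 * each word occurs in each file, phase 2 groups those per-file counts by word.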

 *

 * <pre>

 *file1.txt:

 *hello ketty

 *hello tomcat

 *

 *file2.txt:

 *hello hadoop

 *

 *map1:

 *hello:file1.txt 1

 *hello:file1.txt 1

 *ketty:file1.txt 1

 *tomcat:file1.txt 1

 *hello:file2.txt 1

 *hadoop:file2.txt 1

 *

 *reduce1:

 *hello:file1.txt 2

 *ketty:file1.txt 1

 *tomcat:file1.txt 1

 *hello:file2.txt 1

 *hadoop:file2.txt 1

 *

 *reduce2:

 *hello file1.txt 2,file2.txt 1

 *ketty file1.txt 1

 *tomcat file1.txt 1

 *hadoop file2.txt 1

 *</pre>

 * @author huqiao

 */

public class InvertedIndex {

    /** NameNode address used when clearing a stale output directory. */
    private static final String HDFS_URI = "hdfs://vcentos1:9000";

    /** User the output-directory cleanup is performed as. */
    private static final String HDFS_USER = "root";

    /**
     * Phase 1 mapper: tokenizes each input line and emits one record per word occurrence.
     * <p>
     * input: the text files to be indexed<br/>
     * output: {@code word:fileName -> 1}
     */
    static class WordInFileCountMapper extends Mapper<LongWritable,Text,Text,LongWritable>{

        /** Count emitted for every single occurrence. */
        private static final LongWritable ONE = new LongWritable(1);

        /** Reused output key, refilled for every emitted record. */
        private final Text outKey = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context ctx)
                throws IOException, InterruptedException {
            // The input split tells us which file the current line belongs to.
            FileSplit fileSplit = (FileSplit) ctx.getInputSplit();
            String fileName = fileSplit.getPath().getName();

            // Split on any run of whitespace. Tabs must not end up inside a word,
            // because phase 2 parses the phase 1 output using tab as the separator.
            for (String word : value.toString().split("\\s+")) {
                if (word.isEmpty()) {
                    // Blank lines and leading whitespace yield an empty token; skip it.
                    continue;
                }
                outKey.set(word + ":" + fileName);
                ctx.write(outKey, ONE);
            }
        }
    }

    /**
     * Phase 1 reducer: sums the occurrences of each {@code word:fileName} key.
     * <p>
     * output:
     * <pre>
     *hello:file1.txt 2
     *ketty:file1.txt 1
     *tomcat:file1.txt 1
     *hello:file2.txt 1
     *hadoop:file2.txt 1
     *</pre>
     */
    static class WordInFileCountReducer extends Reducer<Text,LongWritable,Text,LongWritable>{

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context ctx)
                throws IOException, InterruptedException {
            // Accumulate in a long: the values are longs, and an int accumulator
            // would silently narrow (and possibly overflow) on every addition.
            long total = 0L;
            for (LongWritable value : values) {
                total += value.get();
            }
            ctx.write(key, new LongWritable(total));
        }
    }

    /**
     * Phase 2 mapper: parses one line of phase 1 output ({@code word:fileName<TAB>count})
     * and re-keys it by word.
     * <p>
     * output:
     * <pre>
     * hello-->WordCountRecord{fileName:file1.txt,count:2}
     * ...
     * </pre>
     * Lines that do not have the expected shape are skipped and counted in the
     * {@code InvertedIndex / MALFORMED_LINES} counter instead of failing the task.
     */
    static class InvertedIndexMapper extends Mapper<LongWritable,Text,Text,WordCountRecord>{

        @Override
        protected void map(LongWritable key, Text value, Context ctx)
                throws IOException, InterruptedException {
            String line = value.toString();

            // The count is the last tab-separated field of the line.
            int tabAt = line.lastIndexOf('\t');
            if (tabAt < 0) {
                ctx.getCounter("InvertedIndex", "MALFORMED_LINES").increment(1);
                return;
            }
            String wordAndFileName = line.substring(0, tabAt);

            // Split at the LAST colon so that words which themselves contain ':'
            // (for example URLs) keep their full text.
            int colonAt = wordAndFileName.lastIndexOf(':');
            if (colonAt <= 0 || colonAt == wordAndFileName.length() - 1) {
                ctx.getCounter("InvertedIndex", "MALFORMED_LINES").increment(1);
                return;
            }

            String word = wordAndFileName.substring(0, colonAt);
            String fileName = wordAndFileName.substring(colonAt + 1);
            long count = Long.parseLong(line.substring(tabAt + 1).trim());
            ctx.write(new Text(word), new WordCountRecord(fileName, count));
        }
    }

    /**
     * Phase 2 reducer: joins all per-file counts of a word into one comma-separated line.
     * <p>
     * output:
     * <pre>
     * hello-->file1.txt 2,file2.txt 1
     * ...
     * </pre>
     */
    static class InvertedIndexReducer extends Reducer<Text,WordCountRecord,Text,Text>{

        @Override
        protected void reduce(Text key, Iterable<WordCountRecord> values, Context ctx)
                throws IOException, InterruptedException {
            StringBuilder output = new StringBuilder();
            for (WordCountRecord value : values) {
                // Put the separator between entries only, so there is no trailing comma.
                if (output.length() > 0) {
                    output.append(',');
                }
                output.append(value.getFileName()).append(' ').append(value.getCount());
            }
            ctx.write(key, new Text(output.toString()));
        }
    }

    /**
     * Entry point.
     *
     * @param args {@code args[0]} input path, {@code args[1]} output path,
     *             {@code args[2]} phase selector: {@code "phase1"} runs the per-file word
     *             count, any other value runs the inverted-index phase
     * @throws Exception if the file system cannot be reached or the job fails to run
     */
    public static void main(String[] args) throws Exception{
        if (args.length < 3) {
            System.err.println("Usage: InvertedIndex <inputPath> <outputPath> <phase1|phase2>");
            System.exit(2);
        }
        String inputPath = args[0];
        String outputPath = args[1];
        String phase = args[2];

        FileSystem fs = FileSystem.get(new URI(HDFS_URI), new Configuration(), HDFS_USER);

        // Delete the output path when it already exists.
        Path output = new Path(outputPath);
        if (fs.exists(output)) {
            fs.delete(output, true);
        }

        if ("phase1".equals(phase)) {
            doPhase1(inputPath, outputPath);
        } else {
            doPhase2(inputPath, outputPath);
        }
    }

    /**
     * Configures and runs the per-file word count job, then exits the JVM with
     * status 0 on success and 1 on failure.
     *
     * @param inputPath  directory or file holding the raw text input
     * @param outputPath directory the {@code word:fileName<TAB>count} lines are written to
     * @throws Exception if the job cannot be configured or submitted
     */
    private static void doPhase1(String inputPath, String outputPath) throws Exception {
        Job job = Job.getInstance();
        job.setJarByClass(InvertedIndex.class);

        job.setMapperClass(WordInFileCountMapper.class);
        job.setReducerClass(WordInFileCountReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        boolean success = job.waitForCompletion(true);
        System.exit(success ? 0 : 1);
    }

    /**
     * Configures and runs the inverted-index job over the phase 1 output, then exits
     * the JVM with status 0 on success and 1 on failure.
     *
     * @param inputPath  directory holding the output of phase 1
     * @param outputPath directory the final index lines are written to
     * @throws Exception if the job cannot be configured or submitted
     */
    private static void doPhase2(String inputPath, String outputPath) throws Exception {
        Job job = Job.getInstance();
        job.setJarByClass(InvertedIndex.class);

        job.setMapperClass(InvertedIndexMapper.class);
        job.setReducerClass(InvertedIndexReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(WordCountRecord.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        boolean success = job.waitForCompletion(true);
        System.exit(success ? 0 : 1);
    }
}

执行时分两个阶段：

 hadoop jar mr.jar me.huqiao.hadoop.demo_code.invertedsort.InvertedIndex /invertedindex/input /invertedindex/phase-a-output/ phase1

然后以第一个阶段的输出作为第二个阶段的输入：

hadoop jar mr.jar me.huqiao.hadoop.demo_code.invertedsort.InvertedIndex /invertedindex/phase-a-output /invertedindex/phase-b-output/ phase2

最终效果类似于：

about   logs.txt ,

are     text.txt ,

hadoop  file1.txt ,

hdfs    file1.txt ,

hello   text.txt ,logs.txt ,file1.txt ,

how     logs.txt ,text.txt ,

kitty   logs.txt ,

today   logs.txt ,

tom     text.txt ,

you     text.txt ,

找出价格最贵的商品

共同QQ好友

大数据学习（6）MapReduce应用的更多相关文章

大数据学习系列之四 ----- Hadoop+Hive环境搭建图文详解(单机)
引言在大数据学习系列之一 ----- Hadoop环境搭建(单机) 成功的搭建了Hadoop的环境,在大数据学习系列之二 ----- HBase环境搭建(单机)成功搭建了HBase的环境以及相关使用 ...
大数据学习系列之五 ----- Hive整合HBase图文详解
引言在上一篇大数据学习系列之四 ----- Hadoop+Hive环境搭建图文详解(单机) 和之前的大数据学习系列之二 ----- HBase环境搭建(单机) 中成功搭建了Hive和HBase的环 ...
大数据学习系列之六 ----- Hadoop+Spark环境搭建
引言在上一篇中大数据学习系列之五 ----- Hive整合HBase图文详解 : http://www.panchengming.com/2017/12/18/pancm62/ 中使用Hive整合 ...
大数据学习系列之七 ----- Hadoop+Spark+Zookeeper+HBase+Hive集群搭建图文详解
引言在之前的大数据学习系列中,搭建了Hadoop+Spark+HBase+Hive 环境以及一些测试.其实要说的话,我开始学习大数据的时候,搭建的就是集群,并不是单机模式和伪分布式.至于为什么先写单 ...
大数据学习系列之九---- Hive整合Spark和HBase以及相关测试
前言在之前的大数据学习系列之七 ----- Hadoop+Spark+Zookeeper+HBase+Hive集群搭建中介绍了集群的环境搭建,但是在使用hive进行数据查询的时候会非常的慢,因为h ...
大数据学习系列之—HBASE
hadoop生态系统 zookeeper负责协调 hbase必须依赖zookeeper flume 日志工具 sqoop 负责 hdfs dbms 数据转换数据到关系型数据库转换大数据学习群119 ...
大数据学习之Hadoop快速入门
1.Hadoop生态概况 Hadoop是一个由Apache基金会所开发的分布式系统集成架构,用户可以在不了解分布式底层细节情况下,开发分布式程序,充分利用集群的威力来进行高速运算与存储,具有可靠.高效 ...
大数据学习（一） | 初识 Hadoop
作者: seriouszyx 首发地址:https://seriouszyx.top/ 代码均可在 Github 上找到(求Star) 最近想要了解一些前沿技术,不能一门心思眼中只有 web,因为我目 ...
大数据学习路线，来qun里分享干货，
一.Linux lucene: 全文检索引擎的架构 solr: 基于lucene的全文搜索服务器,实现了可配置.可扩展并对查询性能进行了优化,并且提供了一个完善的功能管理界面. 推荐一个大数据学习群 ...
大数据篇：MapReduce
MapReduce MapReduce是什么? MapReduce源自于Google发表于2004年12月的MapReduce论文,是面向大数据并行处理的计算模型.框架和平台,而Hadoop MapR ...

随机推荐

C#Session丢失问题的解决办法
关于c# SESSION丢失问题解决办法我们在用C#开发程序的时候经常会遇到Session很不稳定,老是数据丢失.下面就是Session数据丢失的解决办法希望对您有好处.1.在WEB.CONFI ...
web前端-----JAVA Script（一）
JavaScript概述 JavaScript的历史 1992年Nombas开发出C-minus-minus(C--)的嵌入式脚本语言(最初绑定在CEnvi软件中).后将其改名ScriptEase ...
[转]开源框架完美组合之Spring.NET + NHibernate + ASP.NET MVC + jQuery + easyUI 中英文双语言小型企业网站Demo
热衷于开源框架探索的我发现ASP.NET MVC与jQuery easyUI的组合很给力.由于原先一直受Ext JS框架的licence所苦恼,于是痛下决心寻找一个完全免费的js框架——easyUI. ...
[转]Oracle执行计划详解
Oracle执行计划详解 --- 作者:TTT BLOG 本文地址:http://blog.chinaunix.net/u3/107265/showart_2192657.html --- 简介: ...
两种数据传输的方式——get和post。
Form提供了两种数据传输的方式——get和post.虽然它们都是数据的提交方式,但是在实际传输时确有很大的不同,并且可能会对数据产生严重的影响.虽然为了方便的得到变量值,Web容器已经屏蔽了二者的一 ...
TFBOY 养成记一些比较好多文章。
API解释中文版(简书文章,没事看看): http://www.jianshu.com/p/e3a79eac554f Tensorlfow op辨异:tf.add()与tf.nn.bias_add() ...
postman 时间戳和加密
在使用postman进行接口测试的时候,对于有些接口字段需要时间戳加密,这个时候我们就遇到2个问题,其一是接口中的时间戳如何得到?其二就是对于现在常用的md5加密操作如何在postman中使用代码实现 ...
Mybatis分页插件PageHelper使用
一． Mybatis分页插件PageHelper使用 1.不使用插件如何分页: 使用mybatis实现: 1)接口: List<Student> selectStudent(Map< ...
Idea Live Templates代码模板
一. 概念创建代码模板进行快速代码编写,如sout-->System.out.println();. 如我们经常要写logger的定义:private static final Logger ...
调用支付宝第三方接口(沙箱环境) SpringMVC+Maven
一.蚂蚁金服开放平台的操作网址:https://open.alipay.com/platform/home.htm 支付宝扫码登陆

大数据学习（6）MapReduce应用

倒排索引

找出价格最贵的商品

共同QQ好友

大数据学习（6）MapReduce应用的更多相关文章

随机推荐

热门专题