[Hadoop] How to Implement an Inverted Index in Hadoop MR?

1. Concept and Approach

An inverted index maps each word to the documents that contain it, along with the word's frequency in each document; it is the core data structure behind full-text search. Because a word's postings are scattered across many input files, the index is built here with two chained MapReduce jobs:

- Job 1 (InverseIndexOne): the mapper pulls the source file name out of the input split and emits "word-filename" -> 1 for every word on a line; the reducer sums the 1s, yielding each word's frequency per file.
- Job 2 (InverseIndexTwo): the mapper splits the "word-filename" key back apart and re-emits word -> "filename-count"; the reducer concatenates all of a word's postings into one comma-separated line, the final index entry.

2. Code Example
InverseIndexOne
package com.ares.hadoop.mr.inverseindex;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

public class InverseIndexOne extends Configured implements Tool {

    private static final Logger LOGGER = Logger.getLogger(InverseIndexOne.class);

    enum Counter {
        LINESKIP
    }

    public static class InverseIndexOneMapper
            extends Mapper<LongWritable, Text, Text, LongWritable> {

        private String line;
        private final static char separatorA = ' ';
        private final static char separatorB = '-';
        private String fileName;

        private Text text = new Text();
        private final static LongWritable ONE = new LongWritable(1L);

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            try {
                line = value.toString();
                String[] fields = StringUtils.split(line, separatorA);

                // The input split tells us which file the current line came from.
                FileSplit fileSplit = (FileSplit) context.getInputSplit();
                fileName = fileSplit.getPath().getName();

                // Emit "word-filename" -> 1 for every word on the line.
                for (int i = 0; i < fields.length; i++) {
                    text.set(fields[i] + separatorB + fileName);
                    context.write(text, ONE);
                }
            } catch (Exception e) {
                LOGGER.error(e);
                System.out.println(e);
                context.getCounter(Counter.LINESKIP).increment(1);
                return;
            }
        }
    }

    public static class InverseIndexOneReducer
            extends Reducer<Text, LongWritable, Text, LongWritable> {

        private LongWritable result = new LongWritable();

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values,
                Reducer<Text, LongWritable, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            // Sum the 1s for each "word-filename" key: the per-file word frequency.
            long count = 0;
            for (LongWritable value : values) {
                count += value.get();
            }
            result.set(count);
            context.write(key, result);
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        String errMsg = "InverseIndexOne: TEST STARTED...";
        LOGGER.debug(errMsg);
        System.out.println(errMsg);

        Configuration conf = new Configuration();
        //FOR Eclipse JVM Debug
        //conf.set("mapreduce.job.jar", "flowsum.jar");
        Job job = Job.getInstance(conf);

        // JOB NAME
        job.setJobName("InverseIndexOne");

        // JOB MAPPER & REDUCER
        job.setJarByClass(InverseIndexOne.class);
        job.setMapperClass(InverseIndexOneMapper.class);
        job.setReducerClass(InverseIndexOneReducer.class);

        // REDUCE OUTPUT TYPES
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // MAP OUTPUT TYPES
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // JOB INPUT & OUTPUT PATH
        FileInputFormat.setInputPaths(job, args[0]);
        Path output = new Path(args[1]);
        // FileSystem fs = FileSystem.get(conf);
        // if (fs.exists(output)) {
        //     fs.delete(output, true);
        // }
        FileOutputFormat.setOutputPath(job, output);

        // VERBOSE OUTPUT
        if (job.waitForCompletion(true)) {
            errMsg = "InverseIndexOne: TEST SUCCEEDED...";
            LOGGER.debug(errMsg);
            System.out.println(errMsg);
            return 0;
        } else {
            errMsg = "InverseIndexOne: TEST FAILED...";
            LOGGER.debug(errMsg);
            System.out.println(errMsg);
            return -1;
        }
    }

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            String errMsg = "InverseIndexOne: ARGUMENTS ERROR";
            LOGGER.error(errMsg);
            System.out.println(errMsg);
            System.exit(-1);
        }

        int result = ToolRunner.run(new Configuration(), new InverseIndexOne(), args);
        System.exit(result);
    }
}
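To make Job 1's behavior concrete, here is a small hypothetical example (the file names a.txt/b.txt and their contents are invented for illustration). Suppose the input directory contains two files:

    a.txt: hello tom hello
    b.txt: hello jerry

The mapper emits ("hello-a.txt", 1) twice, ("tom-a.txt", 1), and so on; after the reducer sums the counts, Job 1 writes one "word-filename<TAB>count" line per key:

    hello-a.txt    2
    hello-b.txt    1
    jerry-b.txt    1
    tom-a.txt      1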
InverseIndexTwo
package com.ares.hadoop.mr.inverseindex;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

public class InverseIndexTwo extends Configured implements Tool {

    private static final Logger LOGGER = Logger.getLogger(InverseIndexTwo.class);

    enum Counter {
        LINESKIP
    }

    public static class InverseIndexTwoMapper extends
            Mapper<LongWritable, Text, Text, Text> {

        private String line;
        private final static char separatorA = '\t';
        private final static char separatorB = '-';

        private Text textKey = new Text();
        private Text textValue = new Text();

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            try {
                // Each input line is Job 1 output: "word-filename<TAB>count".
                line = value.toString();
                String[] fields = StringUtils.split(line, separatorA);
                String[] wordAndFileName = StringUtils.split(fields[0], separatorB);
                long count = Long.parseLong(fields[1]);
                String word = wordAndFileName[0];
                String fileName = wordAndFileName[1];

                // Re-key by word so all postings for a word meet in one reduce call.
                textKey.set(word);
                textValue.set(fileName + separatorB + count);
                context.write(textKey, textValue);
            } catch (Exception e) {
                LOGGER.error(e);
                System.out.println(e);
                context.getCounter(Counter.LINESKIP).increment(1);
                return;
            }
        }
    }

    public static class InverseIndexTwoReducer extends
            Reducer<Text, Text, Text, Text> {

        private Text textValue = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values,
                Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // Join the "filename-count" postings with commas. Starting with an
            // empty separator avoids having to detect the iterator's last
            // element (see the reference at the end of this post).
            StringBuilder index = new StringBuilder();
            String separator = "";
            for (Text text : values) {
                index.append(separator).append(text.toString());
                separator = ",";
            }
            textValue.set(index.toString());
            context.write(key, textValue);
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        String errMsg = "InverseIndexTwo: TEST STARTED...";
        LOGGER.debug(errMsg);
        System.out.println(errMsg);

        Configuration conf = new Configuration();
        //FOR Eclipse JVM Debug
        //conf.set("mapreduce.job.jar", "flowsum.jar");
        Job job = Job.getInstance(conf);

        // JOB NAME
        job.setJobName("InverseIndexTwo");

        // JOB MAPPER & REDUCER
        job.setJarByClass(InverseIndexTwo.class);
        job.setMapperClass(InverseIndexTwoMapper.class);
        job.setReducerClass(InverseIndexTwoReducer.class);

        // REDUCE OUTPUT TYPES
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // MAP OUTPUT TYPES
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // JOB INPUT & OUTPUT PATH
        FileInputFormat.setInputPaths(job, args[0]);
        Path output = new Path(args[1]);
        // FileSystem fs = FileSystem.get(conf);
        // if (fs.exists(output)) {
        //     fs.delete(output, true);
        // }
        FileOutputFormat.setOutputPath(job, output);

        // VERBOSE OUTPUT
        if (job.waitForCompletion(true)) {
            errMsg = "InverseIndexTwo: TEST SUCCEEDED...";
            LOGGER.debug(errMsg);
            System.out.println(errMsg);
            return 0;
        } else {
            errMsg = "InverseIndexTwo: TEST FAILED...";
            LOGGER.debug(errMsg);
            System.out.println(errMsg);
            return -1;
        }
    }

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            String errMsg = "InverseIndexTwo: ARGUMENTS ERROR";
            LOGGER.error(errMsg);
            System.out.println(errMsg);
            System.exit(-1);
        }

        int result = ToolRunner.run(new Configuration(), new InverseIndexTwo(), args);
        System.exit(result);
    }
}
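The two jobs run back to back: Job 1's output directory becomes Job 2's input. The classes above are each launched through their own main method; for convenience, here is a minimal driver sketch that chains them (the class name InverseIndexDriver and its three-argument convention are assumptions of mine, not part of the original code):

package com.ares.hadoop.mr.inverseindex;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

// Hypothetical driver: runs Job 1, then feeds its output directory to Job 2.
public class InverseIndexDriver {
    public static void main(String[] args) throws Exception {
        // args[0]: raw text input; args[1]: intermediate dir; args[2]: final index dir
        int ret = ToolRunner.run(new Configuration(), new InverseIndexOne(),
                new String[] { args[0], args[1] });
        if (ret == 0) {
            ret = ToolRunner.run(new Configuration(), new InverseIndexTwo(),
                    new String[] { args[1], args[2] });
        }
        System.exit(ret);
    }
}

Continuing the hypothetical example from Job 1, Job 2 turns those intermediate lines into the final inverted index (the order of postings within a line is not guaranteed, since reduce-side value order is unspecified):

    hello    a.txt-2,b.txt-1
    jerry    b.txt-1
    tom      a.txt-1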
References:
How to check if processing the last item in an Iterator? — http://stackoverflow.com/questions/9633991/how-to-check-if-processing-the-last-item-in-an-iterator