1. Inverted index implementation

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class InvertedIndex {

    public static class InvertedIndexMap extends Mapper<Object, Text, Text, Text> {
        private Text valueInfo = new Text();
        private Text keyInfo = new Text();
        private FileSplit split;

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Get the FileSplit that this <key, value> pair belongs to
            split = (FileSplit) context.getInputSplit();
            StringTokenizer stk = new StringTokenizer(value.toString());
            while (stk.hasMoreElements()) {
                // The key is made up of (word:URI)
                keyInfo.set(stk.nextToken() + ":" + split.getPath().toString());
                // The value is the term frequency
                valueInfo.set("1");
                context.write(keyInfo, valueInfo);
            }
        }
    }

    public static class InvertedIndexCombiner extends Reducer<Text, Text, Text, Text> {
        private Text info = new Text();

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (Text value : values) {
                sum += Integer.parseInt(value.toString());
            }
            int splitIndex = key.toString().indexOf(":");
            // Reset the value to (URI:frequency)
            info.set(key.toString().substring(splitIndex + 1) + ":" + sum);
            // Reset the key to the word alone
            key.set(key.toString().substring(0, splitIndex));
            context.write(key, info);
        }
    }

    public static class InvertedIndexReduce extends Reducer<Text, Text, Text, Text> {
        private Text result = new Text();

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Build the document list for this word
            StringBuilder fileList = new StringBuilder();
            for (Text value : values) {
                fileList.append(value.toString()).append(";");
            }
            result.set(fileList.toString());
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "InvertedIndex");
        job.setJarByClass(InvertedIndex.class);
        job.setMapperClass(InvertedIndexMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setCombinerClass(InvertedIndexCombiner.class);
        job.setReducerClass(InvertedIndexReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path("./in/invertedindex/"));
        FileOutputFormat.setOutputPath(job, new Path("./out/"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
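To make the data flow concrete, here is a hand-traced example (the file names and contents are hypothetical, used only for illustration). Assuming file1.txt contains "MapReduce is simple" and file2.txt contains "MapReduce is powerful", the records for the word MapReduce move through the job roughly like this:

map output:      (MapReduce:hdfs://…/file1.txt, 1)  (MapReduce:hdfs://…/file2.txt, 1)
combiner output: (MapReduce, hdfs://…/file1.txt:1)  (MapReduce, hdfs://…/file2.txt:1)
reduce output:   (MapReduce, hdfs://…/file1.txt:1;hdfs://…/file2.txt:1;)

One subtlety worth noting: the combiner rewrites the key from word:URI to just the word, but map output is partitioned on the original word:URI key, so with more than one reduce task the postings for a single word could be split across output files. With the default single reduce task this does not matter.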

2. Word count

 import java.io.IOException;

 import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {
public static class WordMapper extends Mapper<Object, Text, Text, IntWritable> {
private static final IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
// Split the line on spaces and emit (word, 1) for every token
String[] words = value.toString().split(" ");
for (String w : words) {
word.set(w);
context.write(word, one);
}
}
}
public static class WordReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
private static IntWritable ans = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> value, Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable count : value) {
sum += count.get();
}
ans.set(sum);
context.write(key, ans);
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(WordMapper.class);
job.setCombinerClass(WordReducer.class);
job.setReducerClass(WordReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true)?0:1);
}
}
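A minimal sketch of how the job might be packaged and launched (the jar name and HDFS paths below are placeholders, not taken from the original post):

hadoop jar wordcount.jar WordCount /user/hadoop/input /user/hadoop/output
hdfs dfs -cat /user/hadoop/output/part-r-00000

With the default TextOutputFormat, each line of part-r-00000 is a word, a tab, and its total count.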

3. Finding the most frequently accessed table

import java.io.IOException;
import java.util.TreeMap;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Use Hadoop to analyze massive log files. Each log line records the following fields:
 * TableName, Time, User, TimeSpan (time cost).
 * Write a MapReduce program that finds which table is accessed most frequently during the
 * peak period (e.g. 9-10 o'clock), which user accessed that table most often during that
 * period, and that user's total time cost on the table.
 * @author drguo
 * Sample data:
 * t003 6:00 u002 180
 * t003 7:00 u002 180
 * t003 7:08 u002 180
 * t003 7:25 u002 180
 * t002 8:00 u002 180
 * t001 8:00 u001 240
 * t001 9:00 u002 300
 * t001 9:11 u001 240
 * t003 9:26 u001 180
 * t001 9:39 u001 300
 *
 * Step 1: find the table with the most accesses between 9 and 10 o'clock.
 */
//club.drguo.xx.mapreduce.tablecount.TableCount
public class TableCount {
public static class TableCountMapper extends Mapper<LongWritable, Text, Text, LongWritable>{
private Text k = new Text();
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
String[] strings = StringUtils.split(line, " ");
String tabName = strings[0];
String time = strings[1];
String[] times = time.split(":");
int hour = Integer.parseInt(times[0]);
k.set(tabName);
if(hour==9){
context.write(k, new LongWritable(1));
System.out.println("-----------------------------------------------"+k);
}
}
}
public static class TableCountReducer extends Reducer<Text, LongWritable, Text, LongWritable>{
private TreeMap<Text, Long> map = new TreeMap<Text, Long>();
@Override
protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
Text tabName = new Text(key.toString()); // Do not write "Text tabName = key;" directly - Hadoop reuses the key object
long count = 0;
for(LongWritable value : values){
count += value.get();
}
System.out.println(tabName+"--------------------------"+count);
map.put(tabName, count);
}
@Override
protected void cleanup(Reducer<Text, LongWritable, Text, LongWritable>.Context context)
throws IOException, InterruptedException {
Text tableName = null;
Long maxCount = 0L;
for(Text key : map.keySet()){
System.out.println("key="+key+"-----------------value="+map.get(key));
if(map.get(key)>maxCount){
maxCount = map.get(key);
tableName = key;
}
}
context.write(tableName, new LongWritable(maxCount));
}
}
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
Job job = Job.getInstance(configuration,"tablejob");
job.setJarByClass(TableCount.class);
job.setMapperClass(TableCountMapper.class);
job.setReducerClass(TableCountReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LongWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
FileInputFormat.setInputPaths(job, "hdfs://localhost:9000/log");
FileOutputFormat.setOutputPath(job, new Path("hdfs://localhost:9000/tablecount"));
System.exit(job.waitForCompletion(true)?0:1);
}
}
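Tracing the sample log from the class comment: only records whose hour equals 9 are emitted by the mapper, i.e. t001 at 9:00, 9:11 and 9:39 and t003 at 9:26, so cleanup() should write a single line consisting of "t001", a tab, and "3" to hdfs://localhost:9000/tablecount. Finding the heaviest user of that table and that user's total TimeSpan (the remaining parts of the stated problem) would require a follow-up job, which this listing does not include.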
