On Custom Grouping Classes in MapReduce (Part 3)
The chain starts in the Job class (new API), whose setGroupingComparatorClass() simply records the comparator in the underlying JobConf:

/**
 * Define the comparator that controls which keys are grouped together
 * for a single call to
 * {@link Reducer#reduce(Object, Iterable,
 *                       org.apache.hadoop.mapreduce.Reducer.Context)}
 * @param cls the raw comparator to use
 * @throws IllegalStateException if the job is submitted
 * @see #setCombinerKeyGroupingComparatorClass(Class)
 */
public void setGroupingComparatorClass(Class<? extends RawComparator> cls)
    throws IllegalStateException {
  ensureState(JobState.DEFINE);
  conf.setOutputValueGroupingComparator(cls);
}
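As a usage sketch (the class name and key layout here are illustrative, not from the Hadoop source): a grouping comparator is typically a WritableComparator subclass that compares only part of the key, registered on the job before submission.

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Hypothetical comparator: map output keys are Text of the form "userId\tscore";
// two keys count as equal for grouping when their userId parts match, so all
// scores of one user reach a single reduce() call.
public class UserIdGroupComparator extends WritableComparator {

  public UserIdGroupComparator() {
    super(Text.class, true);  // true => create key instances for object-level compare
  }

  @Override
  @SuppressWarnings("rawtypes")
  public int compare(WritableComparable a, WritableComparable b) {
    String userA = a.toString().split("\t", 2)[0];
    String userB = b.toString().split("\t", 2)[0];
    return userA.compareTo(userB);
  }
}

The driver would then call job.setGroupingComparatorClass(UserIdGroupComparator.class). Note that grouping only ever compares adjacent keys of the sorted stream (see nextKeyValue() below), so a grouping comparator must be compatible with the sort order.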
Internally it delegates to the old-API JobConf, whose setOutputValueGroupingComparator() stores the class under JobContext.GROUP_COMPARATOR_CLASS:

/**
 * Set the user defined {@link RawComparator} comparator for
 * grouping keys in the input to the reduce.
 *
 * <p>This comparator should be provided if the equivalence rules for keys
 * for sorting the intermediates are different from those for grouping keys
 * before each call to
 * {@link Reducer#reduce(Object, java.util.Iterator, OutputCollector, Reporter)}.</p>
 *
 * <p>For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed
 * in a single call to the reduce function if K1 and K2 compare as equal.</p>
 *
 * <p>Since {@link #setOutputKeyComparatorClass(Class)} can be used to control
 * how keys are sorted, this can be used in conjunction to simulate
 * <i>secondary sort on values</i>.</p>
 *
 * <p><i>Note</i>: This is not a guarantee of the reduce sort being
 * <i>stable</i> in any sense. (In any case, with the order of available
 * map-outputs to the reduce being non-deterministic, it wouldn't make
 * that much sense.)</p>
 *
 * @param theClass the comparator class to be used for grouping keys.
 *                 It should implement <code>RawComparator</code>.
 * @see #setOutputKeyComparatorClass(Class)
 * @see #setCombinerKeyGroupingComparator(Class)
 */
public void setOutputValueGroupingComparator(Class<? extends RawComparator> theClass) {
  setClass(JobContext.GROUP_COMPARATOR_CLASS, theClass, RawComparator.class);
}
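The secondary sort this javadoc mentions comes from pairing the grouping comparator with the sort comparator. A hedged driver sketch (Job.setSortComparatorClass is the new-API counterpart of setOutputKeyComparatorClass; UserIdThenScoreComparator is a hypothetical full-key comparator, not shown here):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

public class SecondarySortDriver {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "secondary-sort-sketch");
    job.setMapOutputKeyClass(Text.class);
    // Sort the shuffle by the full composite key "userId\tscore"
    // (UserIdThenScoreComparator is hypothetical: orders by userId, then score).
    job.setSortComparatorClass(UserIdThenScoreComparator.class);
    // Group by userId only, using the comparator sketched above, so one
    // reduce() call receives all of a user's scores in score order.
    job.setGroupingComparatorClass(UserIdGroupComparator.class);
    // ... mapper/reducer/input/output setup omitted ...
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}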
The matching getter, also in JobConf, is where the fallback behaviour lives:

/**
 * Get the user defined {@link WritableComparable} comparator for
 * grouping keys of inputs to the reduce.
 *
 * @return comparator set by the user for grouping values.
 * @see #setOutputValueGroupingComparator(Class) for details.
 */
public RawComparator getOutputValueGroupingComparator() {
  Class<? extends RawComparator> theClass =
      getClass(JobContext.GROUP_COMPARATOR_CLASS, null, RawComparator.class);
  if (theClass == null) {
    return getOutputKeyComparator();
  }
  return ReflectionUtils.newInstance(theClass, this);
}
On the reduce side, ReduceTask.run() fetches that comparator and hands it to whichever reducer path (new or old API) applies:

RawComparator comparator = job.getOutputValueGroupingComparator();

if (useNewApi) {
  runNewReducer(job, umbilical, reporter, rIter, comparator,
                keyClass, valueClass);
} else {
  runOldReducer(job, umbilical, reporter, rIter, comparator,
                keyClass, valueClass);
}
private <INKEY,INVALUE,OUTKEY,OUTVALUE>
void runNewReducer(JobConf job,
                   final TaskUmbilicalProtocol umbilical,
                   final TaskReporter reporter,
                   RawKeyValueIterator rIter,
                   RawComparator<INKEY> comparator,
                   Class<INKEY> keyClass,
                   Class<INVALUE> valueClass
                   ) throws IOException, InterruptedException, ClassNotFoundException {
  // wrap value iterator to report progress.
  final RawKeyValueIterator rawIter = rIter;
  rIter = new RawKeyValueIterator() {
    public void close() throws IOException {
      rawIter.close();
    }
    public DataInputBuffer getKey() throws IOException {
      return rawIter.getKey();
    }
    public Progress getProgress() {
      return rawIter.getProgress();
    }
    public DataInputBuffer getValue() throws IOException {
      return rawIter.getValue();
    }
    public boolean next() throws IOException {
      boolean ret = rawIter.next();
      reporter.setProgress(rawIter.getProgress().getProgress());
      return ret;
    }
  };
  // make a task context so we can get the classes
  org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
      new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job,
          getTaskID(), reporter);
  // make a reducer
  org.apache.hadoop.mapreduce.Reducer<INKEY,INVALUE,OUTKEY,OUTVALUE> reducer =
      (org.apache.hadoop.mapreduce.Reducer<INKEY,INVALUE,OUTKEY,OUTVALUE>)
        ReflectionUtils.newInstance(taskContext.getReducerClass(), job);
  org.apache.hadoop.mapreduce.RecordWriter<OUTKEY,OUTVALUE> trackedRW =
      new NewTrackingRecordWriter<OUTKEY, OUTVALUE>(this, taskContext);
  job.setBoolean("mapred.skip.on", isSkipping());
  job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
  org.apache.hadoop.mapreduce.Reducer.Context reducerContext =
      createReduceContext(reducer, job, getTaskID(),
                          rIter, reduceInputKeyCounter,
                          reduceInputValueCounter,
                          trackedRW,
                          committer,
                          reporter, comparator, keyClass,
                          valueClass);
  try {
    reducer.run(reducerContext);
  } finally {
    trackedRW.close(reducerContext);
  }
}
@SuppressWarnings("unchecked")
protected static <INKEY,INVALUE,OUTKEY,OUTVALUE>
org.apache.hadoop.mapreduce.Reducer<INKEY,INVALUE,OUTKEY,OUTVALUE>.Context
createReduceContext(org.apache.hadoop.mapreduce.Reducer
                      <INKEY,INVALUE,OUTKEY,OUTVALUE> reducer,
                    Configuration job,
                    org.apache.hadoop.mapreduce.TaskAttemptID taskId,
                    RawKeyValueIterator rIter,
                    org.apache.hadoop.mapreduce.Counter inputKeyCounter,
                    org.apache.hadoop.mapreduce.Counter inputValueCounter,
                    org.apache.hadoop.mapreduce.RecordWriter<OUTKEY,OUTVALUE> output,
                    org.apache.hadoop.mapreduce.OutputCommitter committer,
                    org.apache.hadoop.mapreduce.StatusReporter reporter,
                    RawComparator<INKEY> comparator,
                    Class<INKEY> keyClass,
                    Class<INVALUE> valueClass
                    ) throws IOException, InterruptedException {
  org.apache.hadoop.mapreduce.ReduceContext<INKEY, INVALUE, OUTKEY, OUTVALUE>
    reduceContext =
      new ReduceContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, taskId,
                                                              rIter,
                                                              inputKeyCounter,
                                                              inputValueCounter,
                                                              output,
                                                              committer,
                                                              reporter,
                                                              comparator,
                                                              keyClass,
                                                              valueClass);
  // ... (the original excerpt ends here)
Both the comparator and the raw input iterator land in ReduceContextImpl's constructor, which also primes the first record:

public ReduceContextImpl(Configuration conf, TaskAttemptID taskid,
                         RawKeyValueIterator input,
                         Counter inputKeyCounter,
                         Counter inputValueCounter,
                         RecordWriter<KEYOUT,VALUEOUT> output,
                         OutputCommitter committer,
                         StatusReporter reporter,
                         RawComparator<KEYIN> comparator,
                         Class<KEYIN> keyClass,
                         Class<VALUEIN> valueClass
                        ) throws InterruptedException, IOException {
  super(conf, taskid, output, committer, reporter);
  this.input = input;
  this.inputKeyCounter = inputKeyCounter;
  this.inputValueCounter = inputValueCounter;
  this.comparator = comparator;        // the grouping comparator chosen above
  this.serializationFactory = new SerializationFactory(conf);
  this.keyDeserializer = serializationFactory.getDeserializer(keyClass);
  this.keyDeserializer.open(buffer);
  this.valueDeserializer = serializationFactory.getDeserializer(valueClass);
  this.valueDeserializer.open(buffer);
  hasMore = input.next();              // prime the first key/value pair
  this.keyClass = keyClass;
  this.valueClass = valueClass;
  this.conf = conf;
  this.taskid = taskid;
}
nextKeyValue() is where the grouping comparator actually does its work:

/**
 * Advance to the next key/value pair.
 */
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
  if (!hasMore) {
    key = null;
    value = null;
    return false;
  }
  firstValue = !nextKeyIsSame;
  DataInputBuffer nextKey = input.getKey();
  currentRawKey.set(nextKey.getData(), nextKey.getPosition(),
                    nextKey.getLength() - nextKey.getPosition());
  buffer.reset(currentRawKey.getBytes(), 0, currentRawKey.getLength());
  // key and value objects are reused: deserialization happens in place
  key = keyDeserializer.deserialize(key);
  DataInputBuffer nextVal = input.getValue();
  buffer.reset(nextVal.getData(), nextVal.getPosition(),
               nextVal.getLength() - nextVal.getPosition());
  value = valueDeserializer.deserialize(value);

  currentKeyLength = nextKey.getLength() - nextKey.getPosition();
  currentValueLength = nextVal.getLength() - nextVal.getPosition();

  if (isMarked) {
    backupStore.write(nextKey, nextVal);
  }

  hasMore = input.next();
  if (hasMore) {
    nextKey = input.getKey();
    // the grouping comparator compares raw bytes to decide whether the
    // upcoming key still belongs to the current reduce group
    nextKeyIsSame = comparator.compare(currentRawKey.getBytes(), 0,
                                       currentRawKey.getLength(),
                                       nextKey.getData(),
                                       nextKey.getPosition(),
                                       nextKey.getLength() - nextKey.getPosition()
                                       ) == 0;
  } else {
    nextKeyIsSame = false;
  }
  inputValueCounter.increment(1);
  return true;
}
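A practical consequence, as a small hedged sketch (the reducer and key layout are illustrative): because key = keyDeserializer.deserialize(key) reuses a single key object, advancing the value iterator inside reduce() rewrites that object in place, so under a coarse grouping comparator the key parameter can show a different full key for each value of the group.

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Illustrative reducer: with UserIdGroupComparator in effect, the printed key
// can change from "alice\t1" to "alice\t3" while iterating one group's values.
public class GroupInspectReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
  @Override
  protected void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
    for (IntWritable v : values) {
      System.out.println("key object now holds: " + key + ", value: " + v);
      context.write(key, v);  // writes the full composite key current at this value
    }
  }
}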
Back in getOutputValueGroupingComparator(), note the fallback: when no grouping comparator was configured, grouping reuses the sort comparator,

if (theClass == null) {
  return getOutputKeyComparator();
}

which JobConf resolves as follows:

/**
 * Get the {@link RawComparator} comparator used to compare keys.
 *
 * @return the {@link RawComparator} comparator used to compare keys.
 */
public RawComparator getOutputKeyComparator() {
  Class<? extends RawComparator> theClass =
      getClass(JobContext.KEY_COMPARATOR, null, RawComparator.class);
  if (theClass != null)
    return ReflectionUtils.newInstance(theClass, this);
  return WritableComparator.get(
      getMapOutputKeyClass().asSubclass(WritableComparable.class), this);
}

A user-supplied comparator class, in either method, is instantiated via reflection:

return ReflectionUtils.newInstance(theClass, this);
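To tie the chain together, a worked trace on made-up data showing how nextKeyIsSame partitions a sorted shuffle stream under the userId-only grouping comparator sketched earlier:

// Sorted map output (key = "userId\tscore"):
//   ("alice\t1", v1), ("alice\t3", v2), ("bob\t2", v3)
//
// nextKeyValue() comparisons with UserIdGroupComparator:
//   compare("alice\t1", "alice\t3") == 0  -> nextKeyIsSame = true
//   compare("alice\t3", "bob\t2")   != 0  -> nextKeyIsSame = false
//
// Resulting reduce() calls:
//   reduce(key = "alice\t...", values = [v1, v2])  // one group for alice
//   reduce(key = "bob\t2",     values = [v3])      // one group for bob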