MapReduce (Part 3)

MapReduce (Part 3):

1. Top 10 by descending count

1) TreeMap: sorted by key

2) TreeSet: pass in an object; the ordering follows the compareTo method defined in its class

2. A MapReduce job template

3. MapReduce partitioning

1) Manual partitioning

2) Automatic partitioning

4. Custom grouping


----------------------------------------------------------------------------------------------------------------------------------

I. Top 10 by descending count

Sort the words of a document by how many times they occur, in descending order, and keep only the top ten.
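Before the MapReduce versions, the core trick can be shown in plain Java: keep a TreeMap ordered by descending count and trim it whenever it grows past ten entries. A minimal stand-alone sketch (made-up words and counts, no Hadoop involved):

import java.util.Collections;
import java.util.Map;
import java.util.TreeMap;

public class Top10Sketch {
    public static void main(String[] args) {
        // count -> word, with counts kept in descending order
        TreeMap<Long, String> top = new TreeMap<>(Collections.reverseOrder());
        String[] words = { "a", "b", "r", "c", "d", "e", "f", "g", "h", "i", "j", "k" };
        long count = 70;                       // made-up counts, just for illustration
        for (String w : words) {
            top.put(count++, w);
            if (top.size() > 10) {
                // under reverse order, lastKey() is the smallest count
                top.remove(top.lastKey());
            }
        }
        for (Map.Entry<Long, String> e : top.entrySet()) {
            System.out.println(e.getValue() + "\t" + e.getKey());
        }
    }
}

The same put-then-trim pattern appears in the reducer below; note that because the count is used as the map key, words with identical counts overwrite each other.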

1)TreeMap

package com.huhu.day03;

import java.io.IOException;
import java.util.Collections;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * a 80 b 78 r 70 .. sorted by value (the count)
*
* @author huhu_k
*
*/
public class Top10_1 extends ToolRunner implements Tool {

private Configuration conf;

public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer st = new StringTokenizer(value.toString());
while (st.hasMoreTokens()) {
context.write(new Text(st.nextToken()), new IntWritable(1));
}
}
}

public static class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
// TreeMap with keys (counts) kept in descending order
private TreeMap<Long, String> map = new TreeMap<>(Collections.reverseOrder());

@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable v : values) {
sum += v.get();
}
// note: words with the same count share a key and overwrite each other
map.put(Long.valueOf(sum), key.toString());
if (map.size() > 10) {
// keys (counts) are sorted descending; drop the smallest to keep only the top 10
map.remove(map.lastKey());
}
}

@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
for (Map.Entry<Long, String> m : map.entrySet()) {
context.write(new Text(m.getValue()), new IntWritable(m.getKey().intValue()));
}
}
}

@Override
public Configuration getConf() {
if (conf != null) {
return conf;
}
return new Configuration();
}

@Override
public void setConf(Configuration arg0) {
}

@Override
public int run(String[] other) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(Top10_1.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);

job.setReducerClass(MyReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);

FileInputFormat.addInputPath(job, new Path(other[0]));
FileOutputFormat.setOutputPath(job, new Path(other[1]));

return job.waitForCompletion(true) ? 0 : 1;
}

public static void main(String[] args) throws Exception {
Top10_1 t = new Top10_1();
String[] other = new GenericOptionsParser(t.getConf(), args).getRemainingArgs();
if (other.length != 2) {
System.out.println("your input args number is fail,you need input <in> and <out>");
System.exit(0);
}
ToolRunner.run(t.getConf(), t, other);
}
}

As before, run the job on the cluster; if it fails, check the job logs (for example through the ResourceManager web UI or the yarn logs command) to see the error.

2)TreeSet

package com.huhu.day03;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class WCWritable implements WritableComparable<WCWritable> {

private String word;
private int count;

public WCWritable() {
super();
}

public WCWritable(String word, int count) {
super();
this.word = word;
this.count = count;
}

public String getWord() {
return word;
}

public void setWord(String word) {
this.word = word;
}

public int getCount() {
return count;
}

public void setCount(int count) {
this.count = count;
}

@Override
public String toString() {
return "WCWritable [word=" + word + ", count=" + count + "]";
}

@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + count;
result = prime * result + ((word == null) ? 0 : word.hashCode());
return result;
}

@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
WCWritable other = (WCWritable) obj;
if (count != other.count)
return false;
if (word == null) {
if (other.word != null)
return false;
} else if (!word.equals(other.word))
return false;
return true;
}

@Override
public void readFields(DataInput in) throws IOException {
this.word = in.readUTF();
this.count = in.readInt();
}

@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(word);
out.writeInt(count);
}

@Override
public int compareTo(WCWritable o) {
if (this.count == o.count) {
// tie-break in lexicographic (dictionary) order
return this.word.compareTo(o.word);
// return this.word.length() - o.word.length();
}
// larger counts sort first (descending)
return o.count - this.count;
}
}
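Because compareTo orders WCWritable by count descending (ties broken by word), a plain TreeSet already keeps its elements in the desired order, and trimming it to N entries gives a running top-N. A small stand-alone sketch of that behavior (WCWritableDemo is a hypothetical helper class with made-up data, not part of the original job):

package com.huhu.day03;

import java.util.TreeSet;

// hypothetical demo class (same package as WCWritable), not part of the original job
public class WCWritableDemo {
    public static void main(String[] args) {
        TreeSet<WCWritable> set = new TreeSet<WCWritable>();
        String[] words = { "a", "b", "r", "x" };
        int[] counts = { 80, 78, 70, 95 };   // made-up counts
        int topN = 3;
        for (int i = 0; i < words.length; i++) {
            set.add(new WCWritable(words[i], counts[i]));
            // once the set grows past topN, drop the smallest element (the last one)
            if (set.size() > topN) {
                set.remove(set.last());
            }
        }
        // iteration order is highest count first: x (95), a (80), b (78)
        for (WCWritable w : set) {
            System.out.println(w);
        }
    }
}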

package com.huhu.day03;

import java.io.IOException;
import java.util.Iterator;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * a 80 b 78 r 70 .. sorted by value (the count), using a TreeSet
 * 
 * @author huhu_k
 *
 */
public class Top10_2 extends ToolRunner implements Tool {
private Configuration conf;

static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] line = value.toString().split(" ");
for (String s : line) {
context.write(new Text(s), new IntWritable(1));
}
}
}

static class MyReducer extends Reducer<Text, IntWritable, WCWritable, NullWritable> {
// TreeSet keeps WCWritable entries in descending-count order (see compareTo)
private TreeSet<WCWritable> set;
// number of entries to keep
private final int KEY = 10;

@Override
protected void setup(Context context) throws IOException, InterruptedException {
set = new TreeSet<WCWritable>();
}

@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
WCWritable w = new WCWritable();
int sum = 0;
for (IntWritable v : values) {
sum += v.get();
}
w.setWord(key.toString());
w.setCount(sum);

set.add(w);
// trim the set so that only the KEY (10) largest entries remain
if (KEY < set.size()) {
set.remove(set.last());
}
}

@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
Iterator<WCWritable> iterator = set.iterator();
while (iterator.hasNext()) {
context.write(iterator.next(), NullWritable.get());
}
}
}

public static void main(String[] args) throws Exception {
Top10_2 t = new Top10_2();
Configuration con = t.getConf();
String[] other = new GenericOptionsParser(con, args).getRemainingArgs();
if (other.length != 2) {
System.err.println("number is fail");
}
int run = ToolRunner.run(con, t, args);
System.exit(run);
}

@Override
public Configuration getConf() {
if (conf != null) {
return conf;
}
return new Configuration();
}

@Override
public void setConf(Configuration arg0) {
}

@Override
public int run(String[] other) throws Exception {
Configuration con = getConf();
Job job = Job.getInstance(con);
job.setJarByClass(Top10_2.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);

// default partitioner
job.setPartitionerClass(HashPartitioner.class);

job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(WCWritable.class);
job.setOutputValueClass(NullWritable.class);

FileInputFormat.addInputPath(job, new Path(other[0]));
FileOutputFormat.setOutputPath(job, new Path(other[1]));

return job.waitForCompletion(true) ? 0 : 1;
}
}


II. A MapReduce job template

package util;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class Frame extends ToolRunner implements Tool {

private Configuration conf;

public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] line = value.toString().split(" ");
}
}

public static class MyReduce extends Reducer<Text, Text, Text, Text> {

@Override
protected void setup(Context context) throws IOException, InterruptedException {
}

@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
}

@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
}
}

public static void main(String[] args) throws Exception {
Frame t = new Frame();
Configuration conf = t.getConf();
String[] other = new GenericOptionsParser(conf, args).getRemainingArgs();
if (other.length != 2) {
System.err.println("number is fail");
}
int run = ToolRunner.run(conf, t, args);
System.exit(run);
}

@Override
public Configuration getConf() {
if (conf != null) {
return conf;
}
return new Configuration();
}

@Override
public void setConf(Configuration arg0) {
}

@Override
public int run(String[] other) throws Exception {
Configuration con = getConf();
Job job = Job.getInstance(con);
job.setJarByClass(Frame.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);

// default partitioner
job.setPartitionerClass(HashPartitioner.class);

job.setReducerClass(MyReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);

FileInputFormat.addInputPath(job, new Path(other[0]));
FileOutputFormat.setOutputPath(job, new Path(other[1]));

return job.waitForCompletion(true) ? 0 : 1;
}
}

III. MapReduce partitioning

1) Manual partitioning

package com.huhu.day03;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Manual partitioning
 * 
 * @author huhu_k
 *
 */
public class ManualPartition extends ToolRunner implements Tool {

private Configuration conf;

public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] line = value.toString().split(" ");
for (String s : line) {
context.write(new Text(s), new IntWritable(1));
}
}
}

public static class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

private MultipleOutputs<Text, IntWritable> mos;

@Override
protected void setup(Context context) throws IOException, InterruptedException {
mos = new MultipleOutputs<>(context);
}

@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable v : values) {
sum += v.get();
}

if (key.toString().substring(0, 1).matches("[a-z]")) {
mos.write("az", key.toString(), new IntWritable(sum));
} else if (key.toString().substring(0, 1).matches("[A-Z]")) {
mos.write("AZ", key.toString(), new IntWritable(sum));
} else if (key.toString().substring(0, 1).matches("[0-9]")) {
mos.write("09", key.toString(), new IntWritable(sum));
} else {
mos.write("default", key.toString(), new IntWritable(sum));
}
}

@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
// important: mos buffers its writes, so it must be closed here to flush everything to HDFS
mos.close();
}
}

public static void main(String[] args) throws Exception {
ManualPartition t = new ManualPartition();
Configuration conf = t.getConf();
String[] other = new GenericOptionsParser(conf, args).getRemainingArgs();
if (other.length != 2) {
System.err.println("number is fail");
}
int run = ToolRunner.run(conf, t, args);
System.exit(run);
}

@Override
public Configuration getConf() {
if (conf != null) {
return conf;
}
return new Configuration();
}

@Override
public void setConf(Configuration arg0) {
}

@Override
public int run(String[] other) throws Exception {
Configuration con = getConf();
Job job = Job.getInstance(con);
job.setJarByClass(ManualPartition.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);

// default partitioner
job.setPartitionerClass(HashPartitioner.class);

job.setReducerClass(MyReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);

FileInputFormat.addInputPath(job, new Path(other[0]));
FileOutputFormat.setOutputPath(job, new Path(other[1]));

// manual "partitioning": register each named output used by mos.write(...)
MultipleOutputs.addNamedOutput(job, "az", TextOutputFormat.class, Text.class, IntWritable.class);
MultipleOutputs.addNamedOutput(job, "AZ", TextOutputFormat.class, Text.class, IntWritable.class);
MultipleOutputs.addNamedOutput(job, "09", TextOutputFormat.class, Text.class, IntWritable.class);
MultipleOutputs.addNamedOutput(job, "default", TextOutputFormat.class, Text.class, IntWritable.class);
return job.waitForCompletion(true) ? 0 : 1;
}
}

2) Automatic partitioning (custom Partitioner)

package com.huhu.day03.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class WordCountAUTOPartitioner extends Partitioner<Text, IntWritable> {

@Override
public int getPartition(Text key, IntWritable value, int numPartitioner) {
String firstChar = key.toString().substring(0, 1);
if (firstChar.matches("[a-g]")) {
// return the partition (reducer) index for this key
return 0 % numPartitioner;
} else if (firstChar.matches("[h-z]")) {
return 1 % numPartitioner;
} else if (firstChar.matches("[0-5]")) {
return 2 % numPartitioner;
} else if (firstChar.matches("[6-9]")) {
return 3 % numPartitioner;
} else if (firstChar.matches("[A-G]")) {
return 0 % numPartitioner;
} else if (firstChar.matches("[H-Z]")) {
return 5 % numPartitioner;
} else {
return 6 % numPartitioner;
}
}
}

package com.huhu.day03;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.huhu.day03.partitioner.WordCountAUTOPartitioner;

public class AutomaticPartition extends ToolRunner implements Tool {

private Configuration conf;

public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] line = value.toString().split(" ");
for (String s : line) {
context.write(new Text(s), new IntWritable(1));
}
}
}

public static class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

@Override
protected void setup(Context context) throws IOException, InterruptedException {
}

@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable v : values) {
sum += v.get();
}
context.write(key, new IntWritable(sum));
}

@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
}
}

public static void main(String[] args) throws Exception {
AutomaticPartition t = new AutomaticPartition();
Configuration conf = t.getConf();
String[] other = new GenericOptionsParser(conf, args).getRemainingArgs();
if (other.length != 2) {
System.err.println("number is fail");
}
int run = ToolRunner.run(conf, t, args);
System.exit(run);
}

@Override
public Configuration getConf() {
if (conf != null) {
return conf;
}
return new Configuration();
}

@Override
public void setConf(Configuration arg0) {
}

@Override
public int run(String[] other) throws Exception {
Configuration con = getConf();
Job job = Job.getInstance(con);
job.setJarByClass(AutomaticPartition.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);

// default partitioner
// job.setPartitionerClass(HashPartitioner.class);
// custom partitioner
job.setPartitionerClass(WordCountAUTOPartitioner.class);
// use 40 reduce tasks, i.e. 40 partitions / output files
job.setNumReduceTasks(40);

job.setReducerClass(MyReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);

FileInputFormat.addInputPath(job, new Path(other[0]));
FileOutputFormat.setOutputPath(job, new Path(other[1]));

return job.waitForCompletion(true) ? 0 : 1;
}
}


Only the partitions that my partitioner class actually returns contain data; the others are empty, because I never route any keys to them. With 40 reduce tasks, WordCountAUTOPartitioner only ever returns 0, 1, 2, 3, 5 or 6, so just those six part-r-* files have content and the remaining 34 come out empty.
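To check where individual words will land, the partitioner can simply be called directly. The sketch below (PartitionPreview is a hypothetical helper class, not part of the original post) feeds a few sample keys through WordCountAUTOPartitioner with 40 reduce tasks:

package com.huhu.day03.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

// hypothetical helper class, not part of the original post
public class PartitionPreview {
    public static void main(String[] args) {
        WordCountAUTOPartitioner partitioner = new WordCountAUTOPartitioner();
        int numReduceTasks = 40;
        String[] samples = { "apple", "hadoop", "3com", "7zip", "Flink", "Zookeeper", "_misc" };
        for (String s : samples) {
            int p = partitioner.getPartition(new Text(s), new IntWritable(1), numReduceTasks);
            // e.g. apple -> 0, hadoop -> 1, 3com -> 2, 7zip -> 3, Flink -> 0, Zookeeper -> 5, _misc -> 6
            System.out.println(s + " -> part-r-" + String.format("%05d", p));
        }
    }
}

Running it shows that only partitions 0, 1, 2, 3, 5 and 6 ever receive keys, which matches the empty output files observed above.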


IV. Custom grouping

package com.huhu.day03;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.huhu.day03.group.ClassGroupSort;
import com.huhu.day03.pojo.Student;

public class StudentAutoGroup extends ToolRunner implements Tool {

private Configuration conf;

public static class MyMapper extends Mapper<LongWritable, Text, Student, Student> {

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] line = value.toString().split(" ");
Student s = new Student(line[0], line[1], Integer.parseInt(line[2]));
context.write(s, s);
}
}

public static class MyReduce extends Reducer<Student, Student, Text, IntWritable> {

@Override
protected void reduce(Student key, Iterable<Student> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (Student s : values) {
sum += s.getSccore();
}
context.write(new Text(key.getGroup()), new IntWritable(sum));
}
}

public static void main(String[] args) throws Exception {
StudentAutoGroup t = new StudentAutoGroup();
Configuration conf = t.getConf();
String[] other = new GenericOptionsParser(conf, args).getRemainingArgs();
if (other.length != 2) {
System.err.println("number is fail");
}
int run = ToolRunner.run(conf, t, args);
System.exit(run);
}

@Override
public Configuration getConf() {
if (conf != null) {
return conf;
}
return new Configuration();
}

@Override
public void setConf(Configuration arg0) {
}

@Override
public int run(String[] other) throws Exception {
Configuration con = getConf();
Job job = Job.getInstance(con);
job.setJarByClass(StudentAutoGroup.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Student.class);
job.setMapOutputValueClass(Student.class);

// grouping: compare keys by the group field only
job.setCombinerKeyGroupingComparatorClass(ClassGroupSort.class);
// job.setGroupingComparatorClass(ClassGroupSort.class);
// default partitioner
job.setPartitionerClass(HashPartitioner.class);
job.setNumReduceTasks(1);

job.setReducerClass(MyReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);

FileInputFormat.addInputPath(job, new Path(other[0]));
FileOutputFormat.setOutputPath(job, new Path(other[1]));

return job.waitForCompletion(true) ? 0 : 1;
}
}

package com.huhu.day03.pojo;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class Student implements WritableComparable<Student> {

private String name;
private String group;
private int sccore;

public Student() {
super();
}

public Student(String name, String group, int sccore) {
super();
this.name = name;
this.group = group;
this.sccore = sccore;
}

public String getName() {
return name;
}

public void setName(String name) {
this.name = name;
}

public String getGroup() {
return group;
}

public void setGroup(String group) {
this.group = group;
}

public int getSccore() {
return sccore;
}

public void setSccore(int sccore) {
this.sccore = sccore;
}

@Override
public String toString() {
return "Student [name=" + name + ", group=" + group + ", sccore=" + sccore + "]";
}

@Override
public void readFields(DataInput in) throws IOException {
this.name = in.readUTF();
this.group = in.readUTF();
this.sccore = in.readInt();
}

@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(name);
out.writeUTF(group);
out.writeInt(sccore);
}

@Override
public int compareTo(Student o) {
// order (and group) Students by the group field only
return this.group.compareTo(o.group);
}
}

package com.huhu.day03.group;

import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableComparator;

import com.huhu.day03.pojo.Student;

public class ClassGroupSort implements RawComparator<Student> {

@Override
public int compare(Student o1, Student o2) {
return (int) (o1.getGroup().equals(o2.getGroup()) ? 0 : 1);
}

@Override
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
// raw byte-level comparison: only the first 8 bytes of each serialized key are compared,
// which assumes the group can be told apart within that fixed-length prefix
return WritableComparator.compareBytes(b1, s1, 8, b2, s2, 8);
}
}



To sum up:

MapReduce partitioning splits on the key: for example into [a-z], [A-Z], [0-9], and so on. It determines which partition (reducer) each key handed to the reduce phase is sent to.

MapReduce grouping, in contrast, does not use the whole key: it compares on the object's attributes. For example, when custom objects are used as keys, they are compared by a chosen attribute (here the group field), so even though the objects themselves differ, records whose attribute values are equal are handled together in the same reduce() call.

They are used together, grouping as well as partitioning, partly to clean the data and partly to sort it.
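For comparison, a grouping comparator is more often written by extending WritableComparator and comparing the deserialized keys, which avoids the fixed 8-byte assumption made in ClassGroupSort above. A minimal sketch of that alternative (StudentGroupComparator is my own illustrative class, not from the original post):

package com.huhu.day03.group;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

import com.huhu.day03.pojo.Student;

// hypothetical alternative to ClassGroupSort: group Student keys by their group field
public class StudentGroupComparator extends WritableComparator {

    public StudentGroupComparator() {
        // true -> create Student instances so compare() receives deserialized keys
        super(Student.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        Student s1 = (Student) a;
        Student s2 = (Student) b;
        // keys with the same group value reach the same reduce() call
        return s1.getGroup().compareTo(s2.getGroup());
    }
}

It would be registered with job.setGroupingComparatorClass(StudentGroupComparator.class) in place of the RawComparator used earlier.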
