MapReduce (Part 3)

MapReduce (Part 3):

1. Top 10 in descending order

1) TreeMap: sorted by key

2) TreeSet: sorts the objects you insert according to the compareTo method of their class

2. A MapReduce job template

3. MapReduce partitioning

1) Manual partitioning

2) Automatic partitioning

4. Custom grouping


----------------------------------------------------------------------------------------------------------------------------------

一. Top 10 in descending order

Take the number of times each word appears in a document, sort those counts in descending order, and keep only the top ten.
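
Before the full jobs, here is a minimal stand-alone sketch of the pattern both variants rely on: keep a sorted container capped at ten entries and evict the smallest entry whenever it grows past that. It is plain Java with made-up counts, and the class name Top10Sketch is only for illustration.

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;

public class Top10Sketch {

    public static void main(String[] args) {
        // made-up word counts standing in for the real MapReduce counts
        Map<String, Integer> counts = new HashMap<>();
        counts.put("a", 80);
        counts.put("b", 78);
        counts.put("r", 70);
        counts.put("x", 5);

        // counts become keys, kept in descending order; note that two words with the
        // same count overwrite each other, which is the main weakness of the TreeMap variant
        TreeMap<Integer, String> top = new TreeMap<>(Collections.reverseOrder());
        for (Map.Entry<String, Integer> e : counts.entrySet()) {
            top.put(e.getValue(), e.getKey());
            if (top.size() > 10) {
                top.remove(top.lastKey()); // lastKey() is the smallest count
            }
        }

        for (Map.Entry<Integer, String> e : top.entrySet()) {
            System.out.println(e.getValue() + "\t" + e.getKey());
        }
    }
}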

1)TreeMap

package com.huhu.day03;

import java.io.IOException;
import java.util.Collections;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Sort the words by their counts (the value), e.g. a 80, b 78, r 70, ...
 *
 * @author huhu_k
 *
 */
public class Top10_1 extends ToolRunner implements Tool {

private Configuration conf;

public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer st = new StringTokenizer(value.toString());
while (st.hasMoreTokens()) {
context.write(new Text(st.nextToken()), new IntWritable(1));
}
}
}

public static class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
// TreeMap with keys in reverse (descending) order
private TreeMap<Long, String> map = new TreeMap<>(Collections.reverseOrder());

@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable v : values) {
sum += v.get();
}
// keyed by count, so two words with the same count overwrite each other
map.put(Long.valueOf(sum), key.toString());
if (map.size() > 10) {
// keys are sorted descending, so lastKey() is the smallest count: keep only the top 10
map.remove(map.lastKey());
}
}

@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
for (Map.Entry<Long, String> m : map.entrySet()) {
context.write(new Text(m.getValue()), new IntWritable(m.getKey().intValue()));
}
}
}

@Override
public Configuration getConf() {
if (conf != null) {
return conf;
}
return new Configuration();
}

@Override
public void setConf(Configuration arg0) {
}

@Override
public int run(String[] other) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(Top10_1.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);

job.setReducerClass(MyReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);

FileInputFormat.addInputPath(job, new Path(other[0]));
FileOutputFormat.setOutputPath(job, new Path(other[1]));

return job.waitForCompletion(true) ? 0 : 1;
}

public static void main(String[] args) throws Exception {
Top10_1 t = new Top10_1();
String[] other = new GenericOptionsParser(t.getConf(), args).getRemainingArgs();
if (other.length != 2) {
System.out.println("your input args number is fail,you need input <in> and <out>");
System.exit(0);
}
ToolRunner.run(t.getConf(), t, other);
}
}
package com.huhu.day03;

import java.io.IOException;
import java.util.Iterator;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Sort the words by their counts (the value) using a TreeSet, e.g. a 80, b 78, r 70, ...
 *
 * @author huhu_k
 *
 */
public class Top10_2 extends ToolRunner implements Tool {

private Configuration conf;

public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] line = value.toString().split(" ");
for (String s : line) {
context.write(new Text(s), new IntWritable(1));
}
}
}

public static class MyReduce extends Reducer<Text, IntWritable, WCWritable, NullWritable> {
// TreeSet ordered descending by WCWritable.compareTo
private TreeSet<WCWritable> set;
// keep the 10 largest entries
private final int KEY = 10;

@Override
protected void setup(Context context) throws IOException, InterruptedException {
set = new TreeSet<WCWritable>();
}

@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
WCWritable w = new WCWritable();
int sum = 0;
for (IntWritable i : values) {
sum += i.get();
}
w.setWord(key.toString());
w.setCount(sum);
set.add(w);
if (KEY < set.size()) {
// the set is sorted descending, so last() is the smallest entry
set.remove(set.last());
}
}

@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
Iterator<WCWritable> iterator = set.iterator();
while (iterator.hasNext()) {
context.write(iterator.next(), NullWritable.get());
}
}
}

@Override
public Configuration getConf() {
if (conf != null) {
return conf;
}
return new Configuration();
}

@Override
public void setConf(Configuration arg0) {
}

@Override
public int run(String[] other) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(Top10_2.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);

job.setReducerClass(MyReduce.class);
job.setOutputKeyClass(WCWritable.class);
job.setOutputValueClass(NullWritable.class);

FileInputFormat.addInputPath(job, new Path(other[0]));
FileOutputFormat.setOutputPath(job, new Path(other[1]));

return job.waitForCompletion(true) ? 0 : 1;
}

public static void main(String[] args) throws Exception {
Top10_2 t = new Top10_2();
String[] other = new GenericOptionsParser(t.getConf(), args).getRemainingArgs();
if (other.length != 2) {
System.out.println("your input args number is fail,you need input <in> and <out>");
System.exit(0);
}
ToolRunner.run(t.getConf(), t, other);
}
}

As before, the job is run on the cluster; if it fails, check the logs for the error.

2)TreeSet

package com.huhu.day03;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class WCWritable implements WritableComparable<WCWritable> {

private String word;
private int count;

public WCWritable() {
super();
}

public WCWritable(String word, int count) {
super();
this.word = word;
this.count = count;
}

public String getWord() {
return word;
}

public void setWord(String word) {
this.word = word;
}

public int getCount() {
return count;
}

public void setCount(int count) {
this.count = count;
}

@Override
public String toString() {
return "WCWritable [word=" + word + ", count=" + count + "]";
}

@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + count;
result = prime * result + ((word == null) ? 0 : word.hashCode());
return result;
}

@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
WCWritable other = (WCWritable) obj;
if (count != other.count)
return false;
if (word == null) {
if (other.word != null)
return false;
} else if (!word.equals(other.word))
return false;
return true;
}

@Override
public void readFields(DataInput in) throws IOException {
this.word = in.readUTF();
this.count = in.readInt();
}

@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(word);
out.writeInt(count);
}

@Override
public int compareTo(WCWritable o) {
if (this.count == o.count) {
// tie on count: fall back to dictionary order of the word
return this.word.compareTo(o.word);
// return this.word.length() - o.word.length();
}
return o.count - this.count;
}
}
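
A quick way to check the compareTo logic is to drop a few WCWritable objects into a TreeSet outside of Hadoop: the set orders them by descending count and breaks ties by the word's dictionary order. The class name and sample values below are made up for illustration, and the file is assumed to sit in the same com.huhu.day03 package so it can see WCWritable.

package com.huhu.day03;

import java.util.TreeSet;

public class WCWritableOrderingDemo {

    public static void main(String[] args) {
        TreeSet<WCWritable> set = new TreeSet<WCWritable>();
        set.add(new WCWritable("a", 80));
        set.add(new WCWritable("r", 70));
        set.add(new WCWritable("b", 70)); // same count as "r": the tie is broken by the word
        set.add(new WCWritable("z", 95));

        // iteration order: z (95), a (80), b (70), r (70)
        for (WCWritable w : set) {
            System.out.println(w);
        }
    }
}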

package com.huhu.day03;

import java.io.IOException;
import java.util.Iterator;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Sort the words by their counts (the value) using a TreeSet, e.g. a 80, b 78, r 70, ...
 * 
 * @author huhu_k
 *
 */
public class Top10_2 extends ToolRunner implements Tool {

private Configuration conf;

static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] line = value.toString().split(" ");
for (String s : line) {
context.write(new Text(s), new IntWritable(1));
}
}
}

static class MyReducer extends Reducer<Text, IntWritable, WCWritable, NullWritable> {
private TreeSet<WCWritable> set;
private final int KEY = 10;

@Override
protected void setup(Context context) throws IOException, InterruptedException {
set = new TreeSet<WCWritable>();
}

@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
WCWritable w = new WCWritable();
int sum = 0;
for (IntWritable v : values) {
sum += v.get();
}
w.setWord(key.toString());
w.setCount(sum);
set.add(w);
if (KEY < set.size()) {
set.remove(set.last());
}
}

@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
Iterator<WCWritable> iterator = set.iterator();
while (iterator.hasNext()) {
context.write(iterator.next(), NullWritable.get());
}
}
}

public static void main(String[] args) throws Exception {
Top10_2 t = new Top10_2();
Configuration con = t.getConf();
String[] other = new GenericOptionsParser(con, args).getRemainingArgs();
if (other.length != 2) {
System.err.println("number is fail");
}
int run = ToolRunner.run(con, t, args);
System.exit(run);
}

@Override
public Configuration getConf() {
if (conf != null) {
return conf;
}
return new Configuration();
}

@Override
public void setConf(Configuration arg0) {
}

@Override
public int run(String[] other) throws Exception {
Configuration con = getConf();
Job job = Job.getInstance(con);
job.setJarByClass(Top10_2.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);

// default partitioner
job.setPartitionerClass(HashPartitioner.class);

job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(WCWritable.class);
job.setOutputValueClass(NullWritable.class);

FileInputFormat.addInputPath(job, new Path(other[0]));
FileOutputFormat.setOutputPath(job, new Path(other[1]));

return job.waitForCompletion(true) ? 0 : 1;
}
}


二. A MapReduce job template

package util;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class Frame extends ToolRunner implements Tool {

private Configuration conf;

public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] line = value.toString().split(" ");
}
}

public static class MyReduce extends Reducer<Text, Text, Text, Text> {

@Override
protected void setup(Context context) throws IOException, InterruptedException {
}

@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
}

@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
}
}

public static void main(String[] args) throws Exception {
Frame t = new Frame();
Configuration conf = t.getConf();
String[] other = new GenericOptionsParser(conf, args).getRemainingArgs();
if (other.length != 2) {
System.err.println("number is fail");
}
int run = ToolRunner.run(conf, t, args);
System.exit(run);
}

@Override
public Configuration getConf() {
if (conf != null) {
return conf;
}
return new Configuration();
}

@Override
public void setConf(Configuration arg0) {
}

@Override
public int run(String[] other) throws Exception {
Configuration con = getConf();
Job job = Job.getInstance(con);
job.setJarByClass(Frame.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);

// default partitioner
job.setPartitionerClass(HashPartitioner.class);

job.setReducerClass(MyReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);

FileInputFormat.addInputPath(job, new Path(other[0]));
FileOutputFormat.setOutputPath(job, new Path(other[1]));

return job.waitForCompletion(true) ? 0 : 1;
}
}

三. MapReduce partitioning

1) Manual partitioning
package com.huhu.day03;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Manual partitioning of the output with MultipleOutputs
 * 
 * @author huhu_k
 *
 */
public class ManualPartition extends ToolRunner implements Tool {

private Configuration conf;

public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] line = value.toString().split(" ");
for (String s : line) {
context.write(new Text(s), new IntWritable(1));
}
}
}

public static class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

private MultipleOutputs<Text, IntWritable> mos;

@Override
protected void setup(Reducer<Text, IntWritable, Text, IntWritable>.Context context)
throws IOException, InterruptedException {
mos = new MultipleOutputs<>(context);
}

@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable v : values) {
sum += v.get();
}

if (key.toString().substring(0, 1).matches("[a-z]")) {
mos.write("az", key.toString(), new IntWritable(sum));
} else if (key.toString().substring(0, 1).matches("[A-Z]")) {
mos.write("AZ", key.toString(), new IntWritable(sum));
} else if (key.toString().substring(0, 1).matches("[0-9]")) {
mos.write("09", key.toString(), new IntWritable(sum));
} else {
mos.write("default", key.toString(), new IntWritable(sum));
}
}

@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
// Important: MultipleOutputs works like a buffered writer, so it must be closed here
// for everything to be flushed out to HDFS.
mos.close();
}
}

public static void main(String[] args) throws Exception {
ManualPartition t = new ManualPartition();
Configuration conf = t.getConf();
String[] other = new GenericOptionsParser(conf, args).getRemainingArgs();
if (other.length != 2) {
System.err.println("number is fail");
}
int run = ToolRunner.run(conf, t, args);
System.exit(run);
}

@Override
public Configuration getConf() {
if (conf != null) {
return conf;
}
return new Configuration();
}

@Override
public void setConf(Configuration arg0) {
}

@Override
public int run(String[] other) throws Exception {
Configuration con = getConf();
Job job = Job.getInstance(con);
job.setJarByClass(ManualPartition.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);

// default partitioner
job.setPartitionerClass(HashPartitioner.class);

job.setReducerClass(MyReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);

FileInputFormat.addInputPath(job, new Path(other[0]));
FileOutputFormat.setOutputPath(job, new Path(other[1]));
// manual partitioning: register one named output per character class
MultipleOutputs.addNamedOutput(job, "az", TextOutputFormat.class, Text.class, IntWritable.class);
MultipleOutputs.addNamedOutput(job, "AZ", TextOutputFormat.class, Text.class, IntWritable.class);
MultipleOutputs.addNamedOutput(job, "09", TextOutputFormat.class, Text.class, IntWritable.class);
MultipleOutputs.addNamedOutput(job, "default", TextOutputFormat.class, Text.class, IntWritable.class);
return job.waitForCompletion(true) ? 0 : 1;
}
}
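
As a side note, MultipleOutputs also accepts a base output path as a fourth argument to write(), which routes each named output into its own sub-directory under the job output directory. The sketch below is an alternative version of the reduce() method above with an assumed directory layout (az/, AZ/, 09/, default/); it is not something the original job does.

@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
    int sum = 0;
    for (IntWritable v : values) {
        sum += v.get();
    }
    String first = key.toString().substring(0, 1);
    // the extra String argument is a path relative to the job output directory,
    // so the results land in az/part-r-00000, AZ/part-r-00000, and so on
    if (first.matches("[a-z]")) {
        mos.write("az", key.toString(), new IntWritable(sum), "az/part");
    } else if (first.matches("[A-Z]")) {
        mos.write("AZ", key.toString(), new IntWritable(sum), "AZ/part");
    } else if (first.matches("[0-9]")) {
        mos.write("09", key.toString(), new IntWritable(sum), "09/part");
    } else {
        mos.write("default", key.toString(), new IntWritable(sum), "default/part");
    }
}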

2) Automatic partitioning (a custom Partitioner)
package com.huhu.day03.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class WordCountAUTOPartitioner extends Partitioner<Text, IntWritable> {

@Override
public int getPartition(Text key, IntWritable value, int numPartitioner) {
String firstChar = key.toString().substring(0, 1);
if (firstChar.matches("[a-g]")) {
// return the partition number, modulo the number of reduce tasks
return 0 % numPartitioner;
} else if (firstChar.matches("[h-z]")) {
return 1 % numPartitioner;
} else if (firstChar.matches("[0-5]")) {
return 2 % numPartitioner;
} else if (firstChar.matches("[6-9]")) {
return 3 % numPartitioner;
} else if (firstChar.matches("[A-G]")) {
return 0 % numPartitioner; // [A-G] shares partition 0 with [a-g]; partition 4 is never used
} else if (firstChar.matches("[H-Z]")) {
return 5 % numPartitioner;
} else {
return 6 % numPartitioner;
}
}
}

package com.huhu.day03;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.huhu.day03.partitioner.WordCountAUTOPartitioner;

public class AutomaticPartition extends ToolRunner implements Tool {

private Configuration conf;

public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] line = value.toString().split(" ");
for (String s : line) {
context.write(new Text(s), new IntWritable(1));
}
}
}

public static class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

@Override
protected void setup(Context context) throws IOException, InterruptedException {
}

@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable v : values) {
sum += v.get();
}
context.write(key, new IntWritable(sum));
}

@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
}
}

public static void main(String[] args) throws Exception {
AutomaticPartition t = new AutomaticPartition();
Configuration conf = t.getConf();
String[] other = new GenericOptionsParser(conf, args).getRemainingArgs();
if (other.length != 2) {
System.err.println("number is fail");
}
int run = ToolRunner.run(conf, t, args);
System.exit(run);
}

@Override
public Configuration getConf() {
if (conf != null) {
return conf;
}
return new Configuration();
}

@Override
public void setConf(Configuration arg0) {
}

@Override
public int run(String[] other) throws Exception {
Configuration con = getConf();
Job job = Job.getInstance(con);
job.setJarByClass(AutomaticPartition.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);

// default partitioner:
// job.setPartitionerClass(HashPartitioner.class);
// custom partitioner:
job.setPartitionerClass(WordCountAUTOPartitioner.class);
// 40 reduce tasks, i.e. 40 partitions
job.setNumReduceTasks(40);

job.setReducerClass(MyReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);

FileInputFormat.addInputPath(job, new Path(other[0]));
FileOutputFormat.setOutputPath(job, new Path(other[1]));

return job.waitForCompletion(true) ? 0 : 1;
}
}


Only the partition numbers that the partitioner class actually returns end up with content; the other reduce outputs stay empty, because no keys are ever assigned to them.


四. Custom grouping
package com.huhu.day03;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.huhu.day03.group.ClassGroupSort;
import com.huhu.day03.pojo.Student;

public class StudentAutoGroup extends ToolRunner implements Tool {

private Configuration conf;

public static class MyMapper extends Mapper<LongWritable, Text, Student, Student> {

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] line = value.toString().split(" ");
Student s = new Student(line[0], line[1], Integer.parseInt(line[2]));
context.write(s, s);
}
}

public static class MyReduce extends Reducer<Student, Student, Text, IntWritable> {

@Override
protected void reduce(Student key, Iterable<Student> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (Student s : values) {
sum += s.getSccore();
}
context.write(new Text(key.getGroup()), new IntWritable(sum));
}
}

public static void main(String[] args) throws Exception {
StudentAutoGroup t = new StudentAutoGroup();
Configuration conf = t.getConf();
String[] other = new GenericOptionsParser(conf, args).getRemainingArgs();
if (other.length != 2) {
System.err.println("number is fail");
}
int run = ToolRunner.run(conf, t, args);
System.exit(run);
}

@Override
public Configuration getConf() {
if (conf != null) {
return conf;
}
return new Configuration();
}

@Override
public void setConf(Configuration arg0) {
}

@Override
public int run(String[] other) throws Exception {
Configuration con = getConf();
Job job = Job.getInstance(con);
job.setJarByClass(StudentAutoGroup.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Student.class);
job.setMapOutputValueClass(Student.class);

// grouping: compare keys with ClassGroupSort so that Students with the same group
// field are handed to a single reduce() call
job.setGroupingComparatorClass(ClassGroupSort.class);
// the combiner variant below only takes effect when a combiner is configured (none is here):
// job.setCombinerKeyGroupingComparatorClass(ClassGroupSort.class);

// default partitioner
job.setPartitionerClass(HashPartitioner.class);
job.setNumReduceTasks(1);

job.setReducerClass(MyReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);

FileInputFormat.addInputPath(job, new Path(other[0]));
FileOutputFormat.setOutputPath(job, new Path(other[1]));

return job.waitForCompletion(true) ? 0 : 1;
}
}

package com.huhu.day03.pojo;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class Student implements WritableComparable<Student> {

private String name;
private String group;
private int sccore;

public Student() {
super();
}

public Student(String name, String group, int sccore) {
super();
this.name = name;
this.group = group;
this.sccore = sccore;
}

public String getName() {
return name;
}

public void setName(String name) {
this.name = name;
}

public String getGroup() {
return group;
}

public void setGroup(String group) {
this.group = group;
}

public int getSccore() {
return sccore;
}

public void setSccore(int sccore) {
this.sccore = sccore;
}

@Override
public String toString() {
return "Student [name=" + name + ", group=" + group + ", sccore=" + sccore + "]";
}

@Override
public void readFields(DataInput in) throws IOException {
this.name = in.readUTF();
this.group = in.readUTF();
this.sccore = in.readInt();
}

@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(name);
out.writeUTF(group);
out.writeInt(sccore);
}

@Override
public int compareTo(Student o) {
return this.group.compareTo(o.group);
}
}

package com.huhu.day03.group;

import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableComparator;

import com.huhu.day03.pojo.Student;

public class ClassGroupSort implements RawComparator<Student> {

@Override
public int compare(Student o1, Student o2) {
return (int) (o1.getGroup().equals(o2.getGroup()) ? 0 : 1);
}

@Override
public int compare(byte[] arg0, int arg1, int arg2, byte[] arg3, int arg4, int arg5) {
// raw comparison on the serialized keys: only the first 8 bytes of each Student are
// compared, which covers the UTF length prefix and the start of the name field
return WritableComparator.compareBytes(arg0, arg1, 8, arg3, arg4, 8);
}
}
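
Because the raw compare() above looks at a fixed 8 bytes of the serialized key rather than the group field, a safer variant (my own sketch, not part of the original post; the class name ClassGroupComparator is made up) is to extend WritableComparator, which deserializes the keys so the comparison can be written against real Student objects:

package com.huhu.day03.group;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

import com.huhu.day03.pojo.Student;

public class ClassGroupComparator extends WritableComparator {

    public ClassGroupComparator() {
        // true: let WritableComparator create Student instances to deserialize into
        super(Student.class, true);
    }

    @SuppressWarnings("rawtypes")
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        Student s1 = (Student) a;
        Student s2 = (Student) b;
        // Students with the same group value end up in the same reduce() call
        return s1.getGroup().compareTo(s2.getGroup());
    }
}

It would be wired in with job.setGroupingComparatorClass(ClassGroupComparator.class) in run().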



To sum up:

Partitioning in MapReduce splits the data by key (for example by whether the key starts with [a-z], [A-Z] or [0-9]) and decides which reduce task, and therefore which output partition, each key is sent to.

Grouping relaxes the key comparison instead: when the key is a custom object, the grouping comparator can compare only some of its fields (here the student's group), so keys that are different objects but share those fields are treated as one group and their values are processed together in a single reduce() call.

Partitioning and grouping are used together partly to organize and clean the data and partly to control how it is sorted and aggregated.
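
To make the distinction concrete, the two hooks are configured on the Job like this (the lines are pulled from the two different jobs above and shown side by side only for contrast):

// Partitioning: runs on the map side and decides which reduce task, and therefore which
// part-r-NNNNN output file, each key/value pair is sent to (section 三).
job.setPartitionerClass(WordCountAUTOPartitioner.class);
job.setNumReduceTasks(40);

// Grouping: runs on the reduce side and decides which of the already-sorted keys are fed
// to a single reduce() call as one group of values (section 四).
job.setGroupingComparatorClass(ClassGroupSort.class);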
