MapReduce (3)
1. Top-10 descending sort
1) TreeMap: ordered by key
2) TreeSet: ordered by each element's compareTo method
2. A reusable MapReduce job template
3. MapReduce partitioning
1) Manual partitioning
2) Automatic partitioning
4. Custom partitioners
----------------------------------------------------------------------------------------------------------------------------------
I. Top-10 descending sort
Count how often each word appears in a piece of text, sort the counts in descending order, and keep only the top ten.
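Before the full jobs, here is a minimal sketch, outside Hadoop, of the bounded top-N idea both versions below rely on: keep a TreeMap in reverse key order (or a TreeSet ordered by compareTo) and evict the smallest entry whenever the size exceeds ten. The class name and sample counts here are made up for illustration.

import java.util.Collections;
import java.util.Map;
import java.util.TreeMap;

public class Top10Sketch {
    public static void main(String[] args) {
        // reverseOrder(): firstKey() is the largest count, lastKey() the smallest
        TreeMap<Long, String> top = new TreeMap<>(Collections.reverseOrder());
        long[] counts = { 80, 78, 70, 95, 60, 88, 73, 81, 67, 90, 55, 99 };
        for (int i = 0; i < counts.length; i++) {
            top.put(counts[i], "word" + i);
            if (top.size() > 10) {
                top.remove(top.lastKey()); // evict the current smallest count
            }
        }
        for (Map.Entry<Long, String> e : top.entrySet()) {
            System.out.println(e.getValue() + "\t" + e.getKey());
        }
    }
}

The two MapReduce versions below do exactly this inside the reducer, across all the keys it receives.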
1) TreeMap
package com.huhu.day03;

import java.io.IOException;
import java.util.Collections;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * a 80 b 78 r 70 ... sorted by value (the count)
 *
 * @author huhu_k
 */
public class Top10_1 extends ToolRunner implements Tool {

    private Configuration conf;

    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            StringTokenizer st = new StringTokenizer(value.toString());
            while (st.hasMoreTokens()) {
                context.write(new Text(st.nextToken()), new IntWritable(1));
            }
        }
    }

    public static class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        // TreeMap in descending key order (the key here is the count)
        private TreeMap<Long, String> map = new TreeMap<>(Collections.reverseOrder());

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            // note: words with the same count share a key and overwrite each other here
            map.put(Long.valueOf(sum), key.toString());
            if (map.size() > 10) {
                // keep only the ten largest counts
                map.remove(map.lastKey());
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            for (Map.Entry<Long, String> m : map.entrySet()) {
                context.write(new Text(m.getValue()), new IntWritable(m.getKey().intValue()));
            }
        }
    }

    @Override
    public Configuration getConf() {
        if (conf != null) {
            return conf;
        }
        return new Configuration();
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    @Override
    public int run(String[] other) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf);
        job.setJarByClass(Top10_1.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(other[0]));
        FileOutputFormat.setOutputPath(job, new Path(other[1]));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Top10_1 t = new Top10_1();
        String[] other = new GenericOptionsParser(t.getConf(), args).getRemainingArgs();
        if (other.length != 2) {
            System.err.println("Usage: Top10_1 <in> <out>");
            System.exit(1);
        }
        ToolRunner.run(t.getConf(), t, other);
    }
}
package com.huhu.day03;

import java.io.IOException;
import java.util.Iterator;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * a 80 b 78 r 70 ... sorted by value, using a TreeSet
 *
 * @author huhu_k
 */
public class Top10_2 extends ToolRunner implements Tool {

    private Configuration conf;

    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] line = value.toString().split(" ");
            for (String s : line) {
                context.write(new Text(s), new IntWritable(1));
            }
        }
    }

    public static class MyReduce extends Reducer<Text, IntWritable, WCWritable, NullWritable> {
        // TreeSet in descending order (ordering comes from WCWritable.compareTo)
        private TreeSet<WCWritable> set;
        private final int KEY = 10;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            set = new TreeSet<WCWritable>();
        }

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            WCWritable w = new WCWritable();
            int sum = 0;
            for (IntWritable i : values) {
                sum += i.get();
            }
            w.setWord(key.toString());
            w.setCount(sum);
            set.add(w);
            if (KEY < set.size()) {
                set.remove(set.last());
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            Iterator<WCWritable> iterator = set.iterator();
            while (iterator.hasNext()) {
                context.write(iterator.next(), NullWritable.get());
            }
        }
    }

    @Override
    public Configuration getConf() {
        if (conf != null) {
            return conf;
        }
        return new Configuration();
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    @Override
    public int run(String[] other) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf);
        job.setJarByClass(Top10_2.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(WCWritable.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.addInputPath(job, new Path(other[0]));
        FileOutputFormat.setOutputPath(job, new Path(other[1]));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Top10_2 t = new Top10_2();
        String[] other = new GenericOptionsParser(t.getConf(), args).getRemainingArgs();
        if (other.length != 2) {
            System.err.println("Usage: Top10_2 <in> <out>");
            System.exit(1);
        }
        ToolRunner.run(t.getConf(), t, other);
    }
}
These jobs are still run on the cluster; if one fails, check the job logs.
2) TreeSet
package com.huhu.day03;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class WCWritable implements WritableComparable<WCWritable> {

    private String word;
    private int count;

    public WCWritable() {
        super();
    }

    public WCWritable(String word, int count) {
        super();
        this.word = word;
        this.count = count;
    }

    public String getWord() {
        return word;
    }

    public void setWord(String word) {
        this.word = word;
    }

    public int getCount() {
        return count;
    }

    public void setCount(int count) {
        this.count = count;
    }

    @Override
    public String toString() {
        return "WCWritable [word=" + word + ", count=" + count + "]";
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + count;
        result = prime * result + ((word == null) ? 0 : word.hashCode());
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        WCWritable other = (WCWritable) obj;
        if (count != other.count)
            return false;
        if (word == null) {
            if (other.word != null)
                return false;
        } else if (!word.equals(other.word))
            return false;
        return true;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.word = in.readUTF();
        this.count = in.readInt();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(word);
        out.writeInt(count);
    }

    @Override
    public int compareTo(WCWritable o) {
        if (this.count == o.count) {
            // same count: fall back to lexicographic order of the word
            return this.word.compareTo(o.word);
            // return this.word.length() - o.word.length();
        }
        // larger counts sort first (descending)
        return o.count - this.count;
    }
}
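A quick standalone check (not part of the original post) of how a TreeSet orders WCWritable purely through compareTo: larger counts come first, and ties fall back to alphabetical order of the word.

package com.huhu.day03;

import java.util.TreeSet;

public class WCWritableOrderCheck {
    public static void main(String[] args) {
        TreeSet<WCWritable> set = new TreeSet<>();
        set.add(new WCWritable("spark", 70));
        set.add(new WCWritable("hadoop", 80));
        set.add(new WCWritable("hive", 80));
        // prints hadoop(80), hive(80), spark(70)
        for (WCWritable w : set) {
            System.out.println(w);
        }
    }
}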
package com.huhu.day03;

import java.io.IOException;
import java.util.Iterator;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * a 80 b 78 r 70 ... sorted by value, using a TreeSet
 *
 * @author huhu_k
 */
public class Top10_2 extends ToolRunner implements Tool {

    private Configuration conf;

    static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] line = value.toString().split(" ");
            for (String s : line) {
                context.write(new Text(s), new IntWritable(1));
            }
        }
    }

    static class MyReducer extends Reducer<Text, IntWritable, WCWritable, NullWritable> {
        private TreeSet<WCWritable> set;
        private final int KEY = 10;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            set = new TreeSet<WCWritable>();
        }

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            WCWritable w = new WCWritable();
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            w.setWord(key.toString());
            w.setCount(sum);
            set.add(w);
            if (KEY < set.size()) {
                set.remove(set.last());
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            Iterator<WCWritable> iterator = set.iterator();
            while (iterator.hasNext()) {
                context.write(iterator.next(), NullWritable.get());
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Top10_2 t = new Top10_2();
        Configuration con = t.getConf();
        String[] other = new GenericOptionsParser(con, args).getRemainingArgs();
        if (other.length != 2) {
            System.err.println("Usage: Top10_2 <in> <out>");
            System.exit(1);
        }
        int run = ToolRunner.run(con, t, args);
        System.exit(run);
    }

    @Override
    public Configuration getConf() {
        if (conf != null) {
            return conf;
        }
        return new Configuration();
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    @Override
    public int run(String[] other) throws Exception {
        Configuration con = getConf();
        Job job = Job.getInstance(con);
        job.setJarByClass(Top10_2.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // default partitioner
        job.setPartitionerClass(HashPartitioner.class);

        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(WCWritable.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.addInputPath(job, new Path(other[0]));
        FileOutputFormat.setOutputPath(job, new Path(other[1]));

        return job.waitForCompletion(true) ? 0 : 1;
    }
}
II. A MapReduce job template
package util;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class Frame extends ToolRunner implements Tool {

    private Configuration conf;

    public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] line = value.toString().split(" ");
            // map logic goes here
        }
    }

    public static class MyReduce extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // one-time setup goes here
        }

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // reduce logic goes here
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // one-time cleanup goes here
        }
    }

    public static void main(String[] args) throws Exception {
        Frame t = new Frame();
        Configuration conf = t.getConf();
        String[] other = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (other.length != 2) {
            System.err.println("Usage: Frame <in> <out>");
            System.exit(1);
        }
        int run = ToolRunner.run(conf, t, args);
        System.exit(run);
    }

    @Override
    public Configuration getConf() {
        if (conf != null) {
            return conf;
        }
        return new Configuration();
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    @Override
    public int run(String[] other) throws Exception {
        Configuration con = getConf();
        Job job = Job.getInstance(con);
        job.setJarByClass(Frame.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // default partitioner
        job.setPartitionerClass(HashPartitioner.class);

        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(other[0]));
        FileOutputFormat.setOutputPath(job, new Path(other[1]));

        return job.waitForCompletion(true) ? 0 : 1;
    }
}
III. MapReduce partitioning
package com.huhu.day03;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * "Manual partitioning": the reducer itself routes each key to a named output
 * via MultipleOutputs instead of relying on the partitioner.
 *
 * @author huhu_k
 */
public class ManualPartition extends ToolRunner implements Tool {

    private Configuration conf;

    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] line = value.toString().split(" ");
            for (String s : line) {
                context.write(new Text(s), new IntWritable(1));
            }
        }
    }

    public static class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

        private MultipleOutputs<Text, IntWritable> mos;

        @Override
        protected void setup(Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            mos = new MultipleOutputs<>(context);
        }

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            if (key.toString().substring(0, 1).matches("[a-z]")) {
                mos.write("az", key.toString(), new IntWritable(sum));
            } else if (key.toString().substring(0, 1).matches("[A-Z]")) {
                mos.write("AZ", key.toString(), new IntWritable(sum));
            } else if (key.toString().substring(0, 1).matches("[0-9]")) {
                mos.write("09", key.toString(), new IntWritable(sum));
            } else {
                mos.write("default", key.toString(), new IntWritable(sum));
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // Important: MultipleOutputs buffers its writers, so it must be closed
            // here to flush everything out to HDFS.
            mos.close();
        }
    }

    public static void main(String[] args) throws Exception {
        ManualPartition t = new ManualPartition();
        Configuration conf = t.getConf();
        String[] other = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (other.length != 2) {
            System.err.println("Usage: ManualPartition <in> <out>");
            System.exit(1);
        }
        int run = ToolRunner.run(conf, t, args);
        System.exit(run);
    }

    @Override
    public Configuration getConf() {
        if (conf != null) {
            return conf;
        }
        return new Configuration();
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    @Override
    public int run(String[] other) throws Exception {
        Configuration con = getConf();
        Job job = Job.getInstance(con);
        job.setJarByClass(ManualPartition.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // default partitioner
        job.setPartitionerClass(HashPartitioner.class);

        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(other[0]));
        FileOutputFormat.setOutputPath(job, new Path(other[1]));

        // "manual partitioning": register one named output per bucket
        MultipleOutputs.addNamedOutput(job, "az", TextOutputFormat.class, Text.class, IntWritable.class);
        MultipleOutputs.addNamedOutput(job, "AZ", TextOutputFormat.class, Text.class, IntWritable.class);
        MultipleOutputs.addNamedOutput(job, "09", TextOutputFormat.class, Text.class, IntWritable.class);
        MultipleOutputs.addNamedOutput(job, "default", TextOutputFormat.class, Text.class, IntWritable.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }
}
package com.huhu.day03.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class WordCountAUTOPartitioner extends Partitioner<Text, IntWritable> {

    @Override
    public int getPartition(Text key, IntWritable value, int numPartitioner) {
        String firstChar = key.toString().substring(0, 1);
        if (firstChar.matches("[a-g]")) {
            // the partition number is taken modulo the number of reduce tasks
            return 0 % numPartitioner;
        } else if (firstChar.matches("[h-z]")) {
            return 1 % numPartitioner;
        } else if (firstChar.matches("[0-5]")) {
            return 2 % numPartitioner;
        } else if (firstChar.matches("[6-9]")) {
            return 3 % numPartitioner;
        } else if (firstChar.matches("[A-G]")) {
            // note: [A-G] shares partition 0 with [a-g]; partition 4 is never used
            return 0 % numPartitioner;
        } else if (firstChar.matches("[H-Z]")) {
            return 5 % numPartitioner;
        } else {
            return 6 % numPartitioner;
        }
    }
}
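As a rough local sanity check (the class name and sample words are made up for illustration), calling getPartition() directly shows which reduce task each key would land in when 40 reduce tasks are configured, as in AutomaticPartition below:

package com.huhu.day03.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

public class PartitionerCheck {
    public static void main(String[] args) {
        WordCountAUTOPartitioner p = new WordCountAUTOPartitioner();
        IntWritable one = new IntWritable(1);
        for (String word : new String[] { "apple", "zoo", "3cats", "7up", "Grape", "Zebra", "#tag" }) {
            System.out.println(word + " -> partition " + p.getPartition(new Text(word), one, 40));
        }
    }
}

With the rules above this prints partitions 0, 1, 2, 3, 0, 5 and 6, which is why only those part-r-* files end up non-empty.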
package com.huhu.day03;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.huhu.day03.partitioner.WordCountAUTOPartitioner;

public class AutomaticPartition extends ToolRunner implements Tool {

    private Configuration conf;

    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] line = value.toString().split(" ");
            for (String s : line) {
                context.write(new Text(s), new IntWritable(1));
            }
        }
    }

    public static class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
        }

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new IntWritable(sum));
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
        }
    }

    public static void main(String[] args) throws Exception {
        AutomaticPartition t = new AutomaticPartition();
        Configuration conf = t.getConf();
        String[] other = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (other.length != 2) {
            System.err.println("Usage: AutomaticPartition <in> <out>");
            System.exit(1);
        }
        int run = ToolRunner.run(conf, t, args);
        System.exit(run);
    }

    @Override
    public Configuration getConf() {
        if (conf != null) {
            return conf;
        }
        return new Configuration();
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    @Override
    public int run(String[] other) throws Exception {
        Configuration con = getConf();
        Job job = Job.getInstance(con);
        job.setJarByClass(AutomaticPartition.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // default partitioner
        // job.setPartitionerClass(HashPartitioner.class);
        // custom partitioner
        job.setPartitionerClass(WordCountAUTOPartitioner.class);
        // 40 reduce tasks, i.e. 40 partitions
        job.setNumReduceTasks(40);

        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(other[0]));
        FileOutputFormat.setOutputPath(job, new Path(other[1]));

        return job.waitForCompletion(true) ? 0 : 1;
    }
}
Only the partitions that my partitioner class actually assigns receive any content; the rest stay empty because nothing is ever routed to them. With 40 reduce tasks, that means only part-r-00000 through part-r-00003, part-r-00005 and part-r-00006 contain data, while partition 4 and partitions 7 through 39 produce empty files.
package com.huhu.day03;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.huhu.day03.group.ClassGroupSort;
import com.huhu.day03.pojo.Student;

public class StudentAutoGroup extends ToolRunner implements Tool {

    private Configuration conf;

    public static class MyMapper extends Mapper<LongWritable, Text, Student, Student> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] line = value.toString().split(" ");
            Student s = new Student(line[0], line[1], Integer.parseInt(line[2]));
            context.write(s, s);
        }
    }

    public static class MyReduce extends Reducer<Student, Student, Text, IntWritable> {
        @Override
        protected void reduce(Student key, Iterable<Student> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (Student s : values) {
                sum += s.getSccore();
            }
            context.write(new Text(key.getGroup()), new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        StudentAutoGroup t = new StudentAutoGroup();
        Configuration conf = t.getConf();
        String[] other = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (other.length != 2) {
            System.err.println("Usage: StudentAutoGroup <in> <out>");
            System.exit(1);
        }
        int run = ToolRunner.run(conf, t, args);
        System.exit(run);
    }

    @Override
    public Configuration getConf() {
        if (conf != null) {
            return conf;
        }
        return new Configuration();
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    @Override
    public int run(String[] other) throws Exception {
        Configuration con = getConf();
        Job job = Job.getInstance(con);
        job.setJarByClass(StudentAutoGroup.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Student.class);
        job.setMapOutputValueClass(Student.class);

        // grouping: Student.compareTo already orders keys by the group field,
        // so keys with the same group end up in the same reduce() call either way
        job.setCombinerKeyGroupingComparatorClass(ClassGroupSort.class);
        // job.setGroupingComparatorClass(ClassGroupSort.class);

        // default partitioner
        job.setPartitionerClass(HashPartitioner.class);
        job.setNumReduceTasks(1);

        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(other[0]));
        FileOutputFormat.setOutputPath(job, new Path(other[1]));

        return job.waitForCompletion(true) ? 0 : 1;
    }
}
package com.huhu.day03.pojo;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class Student implements WritableComparable<Student> {

    private String name;
    private String group;
    private int sccore;

    public Student() {
        super();
    }

    public Student(String name, String group, int sccore) {
        super();
        this.name = name;
        this.group = group;
        this.sccore = sccore;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getGroup() {
        return group;
    }

    public void setGroup(String group) {
        this.group = group;
    }

    public int getSccore() {
        return sccore;
    }

    public void setSccore(int sccore) {
        this.sccore = sccore;
    }

    @Override
    public String toString() {
        return "Student [name=" + name + ", group=" + group + ", sccore=" + sccore + "]";
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.name = in.readUTF();
        this.group = in.readUTF();
        this.sccore = in.readInt();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(name);
        out.writeUTF(group);
        out.writeInt(sccore);
    }

    @Override
    public int compareTo(Student o) {
        // order (and, by default, group) Student keys by their group field
        return this.group.compareTo(o.group);
    }
}
package com.huhu.day03.group;

import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableComparator;
import com.huhu.day03.pojo.Student;

public class ClassGroupSort implements RawComparator<Student> {
    @Override
    public int compare(Student o1, Student o2) {
        return o1.getGroup().equals(o2.getGroup()) ? 0 : 1;
    }

    @Override
    public int compare(byte[] arg0, int arg1, int arg2, byte[] arg3, int arg4, int arg5) {
        // compares only the first 8 serialized bytes (a prefix of the name field); fragile
        return WritableComparator.compareBytes(arg0, arg1, 8, arg3, arg4, 8);
    }
}
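As an alternative sketch (not from the original post), a grouping comparator can also extend WritableComparator and let it deserialize both Student keys, so the comparison really happens on the group field rather than on a fixed byte prefix:

package com.huhu.day03.group;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

import com.huhu.day03.pojo.Student;

public class ClassGroupComparator extends WritableComparator {

    public ClassGroupComparator() {
        // true: create Student instances so compare() receives deserialized keys
        super(Student.class, true);
    }

    @SuppressWarnings("rawtypes")
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        return ((Student) a).getGroup().compareTo(((Student) b).getGroup());
    }
}

It would be registered with job.setGroupingComparatorClass(ClassGroupComparator.class).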
Partitioning in MapReduce splits the data by key: for example by whether the key starts with [a-z], [A-Z], [0-9] and so on. It decides which reduce task each key is sent to.
Grouping, by contrast, does not require whole keys to be equal: keys are compared on a chosen property. Even though the reducer receives distinct objects, as long as that property matches they are handled together in one reduce() call; in the example above, Student keys are grouped by their group field.
Partitioning and grouping are used together partly to organize the data and partly to get it sorted the way we want.
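To make the distinction concrete, here is a small sketch (reusing the Student job's configuration from above) of where each knob lives on the Job object:

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

import com.huhu.day03.group.ClassGroupSort;

public class PartitionVsGrouping {
    // Not a runnable job by itself; it only shows where each hook is configured.
    static void configure(Job job) {
        // partitioning (map side): decides which reduce task receives a given key
        job.setPartitionerClass(HashPartitioner.class);
        job.setNumReduceTasks(1);

        // grouping (reduce side): decides which of the already sorted keys
        // a single reduce() call treats as one group (here: same Student group)
        job.setGroupingComparatorClass(ClassGroupSort.class);
    }
}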