MapReduce (Part 3)

MapReduce (Part 3):

1. Top 10 in descending order

1) TreeMap: sorted by key

2) TreeSet: sorts the objects you insert according to the compareTo method of their class

2. A MapReduce job template

3. MapReduce partitioning

1) Manual partitioning

2) Automatic partitioning

4. Custom grouping


----------------------------------------------------------------------------------------------------------------------------------

一. Top 10 in descending order

Take the number of times each word appears in a document, sort those counts in descending order, and keep only the top ten.
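
Before the full jobs, here is a minimal stand-alone sketch of the pattern both variants rely on: keep a sorted container capped at ten entries and evict the smallest entry whenever it grows past that. It is plain Java with made-up counts, and the class name Top10Sketch is only for illustration.

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;

public class Top10Sketch {

    public static void main(String[] args) {
        // made-up word counts standing in for the real MapReduce counts
        Map<String, Integer> counts = new HashMap<>();
        counts.put("a", 80);
        counts.put("b", 78);
        counts.put("r", 70);
        counts.put("x", 5);

        // counts become keys, kept in descending order; note that two words with the
        // same count overwrite each other, which is the main weakness of the TreeMap variant
        TreeMap<Integer, String> top = new TreeMap<>(Collections.reverseOrder());
        for (Map.Entry<String, Integer> e : counts.entrySet()) {
            top.put(e.getValue(), e.getKey());
            if (top.size() > 10) {
                top.remove(top.lastKey()); // lastKey() is the smallest count
            }
        }

        for (Map.Entry<Integer, String> e : top.entrySet()) {
            System.out.println(e.getValue() + "\t" + e.getKey());
        }
    }
}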

1)TreeMap

package com.huhu.day03;

import java.io.IOException;
import java.util.Collections;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Sort the words by their counts (the value), e.g. a 80, b 78, r 70, ...
 *
 * @author huhu_k
 *
 */
public class Top10_1 extends ToolRunner implements Tool {

private Configuration conf;

public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer st = new StringTokenizer(value.toString());
while (st.hasMoreTokens()) {
context.write(new Text(st.nextToken()), new IntWritable(1));
}
}
}

public static class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
// TreeMap with keys in reverse (descending) order
private TreeMap<Long, String> map = new TreeMap<>(Collections.reverseOrder());

@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable v : values) {
sum += v.get();
}
// keyed by count, so two words with the same count overwrite each other
map.put(Long.valueOf(sum), key.toString());
if (map.size() > 10) {
// keys are sorted descending, so lastKey() is the smallest count: keep only the top 10
map.remove(map.lastKey());
}
}

@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
for (Map.Entry<Long, String> m : map.entrySet()) {
context.write(new Text(m.getValue()), new IntWritable(m.getKey().intValue()));
}
}
}

@Override
public Configuration getConf() {
if (conf != null) {
return conf;
}
return new Configuration();
}

@Override
public void setConf(Configuration arg0) {
}

@Override
public int run(String[] other) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(Top10_1.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);

job.setReducerClass(MyReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);

FileInputFormat.addInputPath(job, new Path(other[0]));
FileOutputFormat.setOutputPath(job, new Path(other[1]));

return job.waitForCompletion(true) ? 0 : 1;
}

public static void main(String[] args) throws Exception {
Top10_1 t = new Top10_1();
String[] other = new GenericOptionsParser(t.getConf(), args).getRemainingArgs();
if (other.length != 2) {
System.out.println("your input args number is fail,you need input <in> and <out>");
System.exit(0);
}
ToolRunner.run(t.getConf(), t, other);
}
}
package com.huhu.day03;

import java.io.IOException;
import java.util.Iterator;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Sort the words by their counts (the value) using a TreeSet, e.g. a 80, b 78, r 70, ...
 *
 * @author huhu_k
 *
 */
public class Top10_2 extends ToolRunner implements Tool {

private Configuration conf;

public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] line = value.toString().split(" ");
for (String s : line) {
context.write(new Text(s), new IntWritable(1));
}
}
}

public static class MyReduce extends Reducer<Text, IntWritable, WCWritable, NullWritable> {
// TreeSet ordered descending by WCWritable.compareTo
private TreeSet<WCWritable> set;
// keep the 10 largest entries
private final int KEY = 10;

@Override
protected void setup(Context context) throws IOException, InterruptedException {
set = new TreeSet<WCWritable>();
}

@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
WCWritable w = new WCWritable();
int sum = 0;
for (IntWritable i : values) {
sum += i.get();
}
w.setWord(key.toString());
w.setCount(sum);
set.add(w);
if (KEY < set.size()) {
// the set is sorted descending, so last() is the smallest entry
set.remove(set.last());
}
}

@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
Iterator<WCWritable> iterator = set.iterator();
while (iterator.hasNext()) {
context.write(iterator.next(), NullWritable.get());
}
}
}

@Override
public Configuration getConf() {
if (conf != null) {
return conf;
}
return new Configuration();
}

@Override
public void setConf(Configuration arg0) {
}

@Override
public int run(String[] other) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(Top10_2.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);

job.setReducerClass(MyReduce.class);
job.setOutputKeyClass(WCWritable.class);
job.setOutputValueClass(NullWritable.class);

FileInputFormat.addInputPath(job, new Path(other[0]));
FileOutputFormat.setOutputPath(job, new Path(other[1]));

return job.waitForCompletion(true) ? 0 : 1;
}

public static void main(String[] args) throws Exception {
Top10_2 t = new Top10_2();
String[] other = new GenericOptionsParser(t.getConf(), args).getRemainingArgs();
if (other.length != 2) {
System.out.println("your input args number is fail,you need input <in> and <out>");
System.exit(0);
}
ToolRunner.run(t.getConf(), t, other);
}
}

As before, the job is run on the cluster; if it fails, check the logs for the error.

2)TreeSet

package com.huhu.day03;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class WCWritable implements WritableComparable<WCWritable> {

private String word;
private int count;

public WCWritable() {
super();
}

public WCWritable(String word, int count) {
super();
this.word = word;
this.count = count;
}

public String getWord() {
return word;
}

public void setWord(String word) {
this.word = word;
}

public int getCount() {
return count;
}

public void setCount(int count) {
this.count = count;
}

@Override
public String toString() {
return "WCWritable [word=" + word + ", count=" + count + "]";
}

@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + count;
result = prime * result + ((word == null) ? 0 : word.hashCode());
return result;
}

@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
WCWritable other = (WCWritable) obj;
if (count != other.count)
return false;
if (word == null) {
if (other.word != null)
return false;
} else if (!word.equals(other.word))
return false;
return true;
}

@Override
public void readFields(DataInput in) throws IOException {
this.word = in.readUTF();
this.count = in.readInt();
}

@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(word);
out.writeInt(count);
}

@Override
public int compareTo(WCWritable o) {
if (this.count == o.count) {
// tie on count: fall back to dictionary order of the word
return this.word.compareTo(o.word);
// return this.word.length() - o.word.length();
}
return o.count - this.count;
}
}
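
A quick way to check the compareTo logic is to drop a few WCWritable objects into a TreeSet outside of Hadoop: the set orders them by descending count and breaks ties by the word's dictionary order. The class name and sample values below are made up for illustration, and the file is assumed to sit in the same com.huhu.day03 package so it can see WCWritable.

package com.huhu.day03;

import java.util.TreeSet;

public class WCWritableOrderingDemo {

    public static void main(String[] args) {
        TreeSet<WCWritable> set = new TreeSet<WCWritable>();
        set.add(new WCWritable("a", 80));
        set.add(new WCWritable("r", 70));
        set.add(new WCWritable("b", 70)); // same count as "r": the tie is broken by the word
        set.add(new WCWritable("z", 95));

        // iteration order: z (95), a (80), b (70), r (70)
        for (WCWritable w : set) {
            System.out.println(w);
        }
    }
}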

package com.huhu.day03;

import java.io.IOException;
import java.util.Iterator;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Sort the words by their counts (the value) using a TreeSet, e.g. a 80, b 78, r 70, ...
 * 
 * @author huhu_k
 *
 */
public class Top10_2 extends ToolRunner implements Tool {

private Configuration conf;

static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] line = value.toString().split(" ");
for (String s : line) {
context.write(new Text(s), new IntWritable(1));
}
}
}

static class MyReducer extends Reducer<Text, IntWritable, WCWritable, NullWritable> {
private TreeSet<WCWritable> set;
private final int KEY = 10;

@Override
protected void setup(Context context) throws IOException, InterruptedException {
set = new TreeSet<WCWritable>();
}

@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
WCWritable w = new WCWritable();
int sum = 0;
for (IntWritable v : values) {
sum += v.get();
}
w.setWord(key.toString());
w.setCount(sum);
set.add(w);
if (KEY < set.size()) {
set.remove(set.last());
}
}

@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
Iterator<WCWritable> iterator = set.iterator();
while (iterator.hasNext()) {
context.write(iterator.next(), NullWritable.get());
}
}
}

public static void main(String[] args) throws Exception {
Top10_2 t = new Top10_2();
Configuration con = t.getConf();
String[] other = new GenericOptionsParser(con, args).getRemainingArgs();
if (other.length != 2) {
System.err.println("number is fail");
}
int run = ToolRunner.run(con, t, args);
System.exit(run);
}

@Override
public Configuration getConf() {
if (conf != null) {
return conf;
}
return new Configuration();
}

@Override
public void setConf(Configuration arg0) {
}

@Override
public int run(String[] other) throws Exception {
Configuration con = getConf();
Job job = Job.getInstance(con);
job.setJarByClass(Top10_2.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);

// default partitioner
job.setPartitionerClass(HashPartitioner.class);

job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(WCWritable.class);
job.setOutputValueClass(NullWritable.class);

FileInputFormat.addInputPath(job, new Path(other[0]));
FileOutputFormat.setOutputPath(job, new Path(other[1]));

return job.waitForCompletion(true) ? 0 : 1;
}
}


二. A MapReduce job template

package util;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class Frame extends ToolRunner implements Tool {

private Configuration conf;

public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] line = value.toString().split(" ");
}
}

public static class MyReduce extends Reducer<Text, Text, Text, Text> {

@Override
protected void setup(Context context) throws IOException, InterruptedException {
}

@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
}

@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
}
}

public static void main(String[] args) throws Exception {
Frame t = new Frame();
Configuration conf = t.getConf();
String[] other = new GenericOptionsParser(conf, args).getRemainingArgs();
if (other.length != 2) {
System.err.println("number is fail");
}
int run = ToolRunner.run(conf, t, args);
System.exit(run);
}

@Override
public Configuration getConf() {
if (conf != null) {
return conf;
}
return new Configuration();
}

@Override
public void setConf(Configuration arg0) {
}

@Override
public int run(String[] other) throws Exception {
Configuration con = getConf();
Job job = Job.getInstance(con);
job.setJarByClass(Frame.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);

// default partitioner
job.setPartitionerClass(HashPartitioner.class);

job.setReducerClass(MyReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);

FileInputFormat.addInputPath(job, new Path(other[0]));
FileOutputFormat.setOutputPath(job, new Path(other[1]));

return job.waitForCompletion(true) ? 0 : 1;
}
}

三. MapReduce partitioning

1) Manual partitioning
package com.huhu.day03;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Manual partitioning of the output with MultipleOutputs
 * 
 * @author huhu_k
 *
 */
public class ManualPartition extends ToolRunner implements Tool {

private Configuration conf;

public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] line = value.toString().split(" ");
for (String s : line) {
context.write(new Text(s), new IntWritable(1));
}
}
}

public static class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

private MultipleOutputs<Text, IntWritable> mos;

@Override
protected void setup(Reducer<Text, IntWritable, Text, IntWritable>.Context context)
throws IOException, InterruptedException {
mos = new MultipleOutputs<>(context);
}

@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable v : values) {
sum += v.get();
}

if (key.toString().substring(0, 1).matches("[a-z]")) {
mos.write("az", key.toString(), new IntWritable(sum));
} else if (key.toString().substring(0, 1).matches("[A-Z]")) {
mos.write("AZ", key.toString(), new IntWritable(sum));
} else if (key.toString().substring(0, 1).matches("[0-9]")) {
mos.write("09", key.toString(), new IntWritable(sum));
} else {
mos.write("default", key.toString(), new IntWritable(sum));
}
}

@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
// Important: MultipleOutputs works like a buffered writer, so it must be closed here
// for everything to be flushed out to HDFS.
mos.close();
}
}

public static void main(String[] args) throws Exception {
ManualPartition t = new ManualPartition();
Configuration conf = t.getConf();
String[] other = new GenericOptionsParser(conf, args).getRemainingArgs();
if (other.length != 2) {
System.err.println("number is fail");
}
int run = ToolRunner.run(conf, t, args);
System.exit(run);
}

@Override
public Configuration getConf() {
if (conf != null) {
return conf;
}
return new Configuration();
}

@Override
public void setConf(Configuration arg0) {
}

@Override
public int run(String[] other) throws Exception {
Configuration con = getConf();
Job job = Job.getInstance(con);
job.setJarByClass(ManualPartition.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);

// default partitioner
job.setPartitionerClass(HashPartitioner.class);

job.setReducerClass(MyReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);

FileInputFormat.addInputPath(job, new Path(other[0]));
FileOutputFormat.setOutputPath(job, new Path(other[1]));
// manual partitioning: register one named output per character class
MultipleOutputs.addNamedOutput(job, "az", TextOutputFormat.class, Text.class, IntWritable.class);
MultipleOutputs.addNamedOutput(job, "AZ", TextOutputFormat.class, Text.class, IntWritable.class);
MultipleOutputs.addNamedOutput(job, "09", TextOutputFormat.class, Text.class, IntWritable.class);
MultipleOutputs.addNamedOutput(job, "default", TextOutputFormat.class, Text.class, IntWritable.class);
return job.waitForCompletion(true) ? 0 : 1;
}
}
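
As a side note, MultipleOutputs also accepts a base output path as a fourth argument to write(), which routes each named output into its own sub-directory under the job output directory. The sketch below is an alternative version of the reduce() method above with an assumed directory layout (az/, AZ/, 09/, default/); it is not something the original job does.

@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
    int sum = 0;
    for (IntWritable v : values) {
        sum += v.get();
    }
    String first = key.toString().substring(0, 1);
    // the extra String argument is a path relative to the job output directory,
    // so the results land in az/part-r-00000, AZ/part-r-00000, and so on
    if (first.matches("[a-z]")) {
        mos.write("az", key.toString(), new IntWritable(sum), "az/part");
    } else if (first.matches("[A-Z]")) {
        mos.write("AZ", key.toString(), new IntWritable(sum), "AZ/part");
    } else if (first.matches("[0-9]")) {
        mos.write("09", key.toString(), new IntWritable(sum), "09/part");
    } else {
        mos.write("default", key.toString(), new IntWritable(sum), "default/part");
    }
}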

2) Automatic partitioning (a custom Partitioner)
package com.huhu.day03.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class WordCountAUTOPartitioner extends Partitioner<Text, IntWritable> {

@Override
public int getPartition(Text key, IntWritable value, int numPartitioner) {
String firstChar = key.toString().substring(0, 1);
if (firstChar.matches("[a-g]")) {
// return the partition number, modulo the number of reduce tasks
return 0 % numPartitioner;
} else if (firstChar.matches("[h-z]")) {
return 1 % numPartitioner;
} else if (firstChar.matches("[0-5]")) {
return 2 % numPartitioner;
} else if (firstChar.matches("[6-9]")) {
return 3 % numPartitioner;
} else if (firstChar.matches("[A-G]")) {
return 0 % numPartitioner; // [A-G] shares partition 0 with [a-g]; partition 4 is never used
} else if (firstChar.matches("[H-Z]")) {
return 5 % numPartitioner;
} else {
return 6 % numPartitioner;
}
}
}

package com.huhu.day03;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.huhu.day03.partitioner.WordCountAUTOPartitioner;

public class AutomaticPartition extends ToolRunner implements Tool {

private Configuration conf;

public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] line = value.toString().split(" ");
for (String s : line) {
context.write(new Text(s), new IntWritable(1));
}
}
}

public static class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

@Override
protected void setup(Context context) throws IOException, InterruptedException {
}

@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable v : values) {
sum += v.get();
}
context.write(key, new IntWritable(sum));
}

@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
}
}

public static void main(String[] args) throws Exception {
AutomaticPartition t = new AutomaticPartition();
Configuration conf = t.getConf();
String[] other = new GenericOptionsParser(conf, args).getRemainingArgs();
if (other.length != 2) {
System.err.println("number is fail");
}
int run = ToolRunner.run(conf, t, args);
System.exit(run);
}

@Override
public Configuration getConf() {
if (conf != null) {
return conf;
}
return new Configuration();
}

@Override
public void setConf(Configuration arg0) {
}

@Override
public int run(String[] other) throws Exception {
Configuration con = getConf();
Job job = Job.getInstance(con);
job.setJarByClass(AutomaticPartition.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);

// default partitioner:
// job.setPartitionerClass(HashPartitioner.class);
// custom partitioner:
job.setPartitionerClass(WordCountAUTOPartitioner.class);
// 40 reduce tasks, i.e. 40 partitions
job.setNumReduceTasks(40);

job.setReducerClass(MyReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);

FileInputFormat.addInputPath(job, new Path(other[0]));
FileOutputFormat.setOutputPath(job, new Path(other[1]));

return job.waitForCompletion(true) ? 0 : 1;
}
}


Only the partition numbers that the partitioner class actually returns end up with content; the other reduce outputs stay empty, because no keys are ever assigned to them.


四. Custom grouping
package com.huhu.day03;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.huhu.day03.group.ClassGroupSort;
import com.huhu.day03.pojo.Student;

public class StudentAutoGroup extends ToolRunner implements Tool {

private Configuration conf;

public static class MyMapper extends Mapper<LongWritable, Text, Student, Student> {

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] line = value.toString().split(" ");
Student s = new Student(line[0], line[1], Integer.parseInt(line[2]));
context.write(s, s);
}
}

public static class MyReduce extends Reducer<Student, Student, Text, IntWritable> {

@Override
protected void reduce(Student key, Iterable<Student> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (Student s : values) {
sum += s.getSccore();
}
context.write(new Text(key.getGroup()), new IntWritable(sum));
}
}

public static void main(String[] args) throws Exception {
StudentAutoGroup t = new StudentAutoGroup();
Configuration conf = t.getConf();
String[] other = new GenericOptionsParser(conf, args).getRemainingArgs();
if (other.length != 2) {
System.err.println("number is fail");
}
int run = ToolRunner.run(conf, t, args);
System.exit(run);
}

@Override
public Configuration getConf() {
if (conf != null) {
return conf;
}
return new Configuration();
}

@Override
public void setConf(Configuration arg0) {
}

@Override
public int run(String[] other) throws Exception {
Configuration con = getConf();
Job job = Job.getInstance(con);
job.setJarByClass(StudentAutoGroup.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Student.class);
job.setMapOutputValueClass(Student.class);

// grouping: compare keys with ClassGroupSort so that Students with the same group
// field are handed to a single reduce() call
job.setGroupingComparatorClass(ClassGroupSort.class);
// the combiner variant below only takes effect when a combiner is configured (none is here):
// job.setCombinerKeyGroupingComparatorClass(ClassGroupSort.class);

// default partitioner
job.setPartitionerClass(HashPartitioner.class);
job.setNumReduceTasks(1);

job.setReducerClass(MyReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);

FileInputFormat.addInputPath(job, new Path(other[0]));
FileOutputFormat.setOutputPath(job, new Path(other[1]));

return job.waitForCompletion(true) ? 0 : 1;
}
}

package com.huhu.day03.pojo;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class Student implements WritableComparable<Student> {

private String name;
private String group;
private int sccore;

public Student() {
super();
}

public Student(String name, String group, int sccore) {
super();
this.name = name;
this.group = group;
this.sccore = sccore;
}

public String getName() {
return name;
}

public void setName(String name) {
this.name = name;
}

public String getGroup() {
return group;
}

public void setGroup(String group) {
this.group = group;
}

public int getSccore() {
return sccore;
}

public void setSccore(int sccore) {
this.sccore = sccore;
}

@Override
public String toString() {
return "Student [name=" + name + ", group=" + group + ", sccore=" + sccore + "]";
}

@Override
public void readFields(DataInput in) throws IOException {
this.name = in.readUTF();
this.group = in.readUTF();
this.sccore = in.readInt();
}

@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(name);
out.writeUTF(group);
out.writeInt(sccore);
}

@Override
public int compareTo(Student o) {
return this.group.compareTo(o.group);
}
}

package com.huhu.day03.group;

import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableComparator;

import com.huhu.day03.pojo.Student;

public class ClassGroupSort implements RawComparator<Student> {

@Override
public int compare(Student o1, Student o2) {
return (int) (o1.getGroup().equals(o2.getGroup()) ? 0 : 1);
}

@Override
public int compare(byte[] arg0, int arg1, int arg2, byte[] arg3, int arg4, int arg5) {
// raw comparison on the serialized keys: only the first 8 bytes of each Student are
// compared, which covers the UTF length prefix and the start of the name field
return WritableComparator.compareBytes(arg0, arg1, 8, arg3, arg4, 8);
}
}
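
Because the raw compare() above looks at a fixed 8 bytes of the serialized key rather than the group field, a safer variant (my own sketch, not part of the original post; the class name ClassGroupComparator is made up) is to extend WritableComparator, which deserializes the keys so the comparison can be written against real Student objects:

package com.huhu.day03.group;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

import com.huhu.day03.pojo.Student;

public class ClassGroupComparator extends WritableComparator {

    public ClassGroupComparator() {
        // true: let WritableComparator create Student instances to deserialize into
        super(Student.class, true);
    }

    @SuppressWarnings("rawtypes")
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        Student s1 = (Student) a;
        Student s2 = (Student) b;
        // Students with the same group value end up in the same reduce() call
        return s1.getGroup().compareTo(s2.getGroup());
    }
}

It would be wired in with job.setGroupingComparatorClass(ClassGroupComparator.class) in run().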



To sum up:

Partitioning in MapReduce splits the data by key (for example by whether the key starts with [a-z], [A-Z] or [0-9]) and decides which reduce task, and therefore which output partition, each key is sent to.

Grouping relaxes the key comparison instead: when the key is a custom object, the grouping comparator can compare only some of its fields (here the student's group), so keys that are different objects but share those fields are treated as one group and their values are processed together in a single reduce() call.

Partitioning and grouping are used together partly to organize and clean the data and partly to control how it is sorted and aggregated.
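
To make the distinction concrete, the two hooks are configured on the Job like this (the lines are pulled from the two different jobs above and shown side by side only for contrast):

// Partitioning: runs on the map side and decides which reduce task, and therefore which
// part-r-NNNNN output file, each key/value pair is sent to (section 三).
job.setPartitionerClass(WordCountAUTOPartitioner.class);
job.setNumReduceTasks(40);

// Grouping: runs on the reduce side and decides which of the already-sorted keys are fed
// to a single reduce() call as one group of values (section 四).
job.setGroupingComparatorClass(ClassGroupSort.class);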
