eclipse通过maven进行打包并且对hdfs上的文件进行wordcount

在eclipse中配置自己的maven仓库

1.安装maven（用于管理仓库，jar包的管理）

-1.解压maven安装包

-2.把maven添加到环境变量/etc/profile

-3.把maven目录下的conf/settings.xml文件复制到~/.m2文件夹下

2.安装eclipse

-1.解压eclipse安装文件

-2.执行eclipse-inst文件（eclipse安装程序）

-3.按步骤操作

3.在eclipse中配置自己的maven仓库

1.Window>>Preferences>>Maven>>Installations(添加使用的maven目录，步骤1.1)

add>>选择1.1中的路径

2.Window>>Preferences>>Maven>>User Settings(选择本地仓库的配置文件，步骤1.3)

User Settings>>选择1.3中的文件

4.新建maven的项目

-new>>maven project>>创建一个简单的项目>>next>>next>>Group Id:域名倒置>>Artifact Id:项目名>>finish

-修改pom.xml文件

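<dependencies>
  <dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>3.8.1</version>
    <scope>test</scope>
  </dependency>
  <dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>2.5.0</version>
  </dependency>
  <dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.5.1</version>
  </dependency>
  <dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.5.0</version>
  </dependency>
</dependencies>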

编写一个小程序进行Test

在src/main/java下新建hadoop_test包，并在该包中新建ConfTest类

package hadoop_test;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.conf.Configured;

import org.apache.hadoop.util.Tool;

import org.apache.hadoop.util.ToolRunner;

/**
 * Minimal Hadoop {@code Tool} used to check that the packaged jar can be launched:
 * it prints a greeting, runs itself through {@code ToolRunner}, and exits with the
 * status that {@link #run(String[])} returns.
 */
public class ConfTest extends Configured implements Tool {

    /**
     * Program entry point: prints the greeting, then runs a new {@code ConfTest}
     * via {@code ToolRunner.run} and uses the result as the process exit code.
     *
     * @param args command-line arguments, passed on to {@code ToolRunner.run}
     */
    public static void main(String[] args) throws Exception {
        System.out.println("hello world！！！");
        System.exit(ToolRunner.run(new ConfTest(), args));
    }

    /**
     * Looks up the configuration through {@code getConf()} and reports success.
     * The configuration is fetched only for demonstration; nothing is read from it.
     *
     * @param arg0 command-line arguments; not used by this method
     * @return always 0
     */
    public int run(String[] arg0) throws Exception {
        final Configuration configuration = getConf();
        return 0;
    }
}

打包：在终端进入该Java Project的pom.xml所在文件夹，执行mvn clean install（注意clean要放在install前面，否则刚生成的target文件夹会被clean删掉），在target文件夹中可以找到一个jar包（hadoop_test-0.0.1-SNAPSHOT.jar）。若用hadoop jar hadoop_test-0.0.1-SNAPSHOT.jar hadoop_test/ConfTest 指令执行后输出hello world，则基本上成功了。同时也可测试下系统自带的wordcount类，具体方法是 $ $HADOOP_PREFIX/bin/hadoop jar $HADOOP_PREFIX/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.2.0.jar wordcount input output（jar包名中的版本号以实际安装的hadoop版本为准）

最后写程序读取hdfs上的文件进行mapreduce并将结果传回hdfs

WordCount类：

package hadoop_test;

import java.io.IOException;

import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.conf.Configured;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import org.apache.hadoop.util.Tool;

import org.apache.hadoop.util.ToolRunner;

/**
 * MapReduce word-count job. Each input line is split into whitespace-separated
 * tokens, and the job writes one {@code (token, occurrences)} pair per distinct token.
 *
 * <p>The input and output locations are read from the configuration keys
 * {@code input} and {@code output}, e.g. {@code -Dinput=<path> -Doutput=<path>}
 * on the command line.
 */
public class WordCount extends Configured implements Tool {

    /** Configuration key that holds the input path. */
    private static final String INPUT_KEY = "input";

    /** Configuration key that holds the output path. */
    private static final String OUTPUT_KEY = "output";

    /** Exit status returned when a required path option is missing. */
    private static final int EXIT_USAGE = 2;

    /** Mapper that emits {@code (token, 1)} for every token of every line. */
    static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        /** The constant count of 1 written for each token. */
        private static final IntWritable ONE = new IntWritable(1);

        /** Output key object, reused and overwritten for each token. */
        private final Text word = new Text();

        /**
         * Splits the line into tokens and writes {@code (token, 1)} for each one.
         *
         * @param key     offset of the line currently being read
         * @param value   the line currently being read
         * @param context context the output pairs are written to
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Use StringTokenizer's default delimiters (space, tab, newline, CR,
            // form feed) so that tab-separated words are not merged into one token.
            StringTokenizer tokens = new StringTokenizer(value.toString());
            while (tokens.hasMoreTokens()) {
                word.set(tokens.nextToken());
                context.write(word, ONE);
            }
        }
    }

    /** Reducer that sums all counts received for a word. */
    static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        /** Output value object, reused and overwritten for each key. */
        private final IntWritable counter = new IntWritable();

        /**
         * Adds up every value for {@code key} and writes {@code (key, sum)}.
         *
         * @param key     the word being counted
         * @param values  all counts emitted for that word
         * @param context context the result is written to
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable partial : values) {
                count += partial.get();
            }
            counter.set(count);
            context.write(key, counter);
        }
    }

    /**
     * Configures the word-count job from the {@code input} and {@code output}
     * configuration values, submits it, and waits for it to finish.
     *
     * @param args remaining command-line arguments; not used by this method
     * @return 0 if the job succeeded, 1 if it failed, 2 if {@code input} or
     *         {@code output} was not supplied
     */
    // @Override
    public int run(String[] args) throws Exception {
        // Runtime configuration of this tool.
        Configuration conf = getConf();
        String inputPath = conf.get(INPUT_KEY);
        String outputPath = conf.get(OUTPUT_KEY);

        // Fail with a readable message instead of letting new Path(null) throw.
        if (isBlank(inputPath) || isBlank(outputPath)) {
            System.err.println(
                    "Usage: hadoop jar <jar> <main class> -Dinput=<input path> -Doutput=<output path>");
            return EXIT_USAGE;
        }

        // Build the job.
        Job job = Job.getInstance(conf, "Word Frequence Count");
        job.setJarByClass(WordCount.class);

        // Mapper class and the key/value types the map method outputs.
        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Summation is associative and commutative and the reducer's input and
        // output types match, so it can also pre-aggregate map output as a combiner.
        job.setCombinerClass(WordCountReducer.class);

        // Reducer class and the key/value types the reduce method outputs.
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Read the input as text files and write the result as text files.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Input and output directories.
        TextInputFormat.addInputPath(job, new Path(inputPath));
        TextOutputFormat.setOutputPath(job, new Path(outputPath));

        // Submit the job and block until it completes.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /** Returns true when {@code s} is null or contains only whitespace. */
    private static boolean isBlank(String s) {
        return s == null || s.trim().length() == 0;
    }

    /**
     * Program entry point: runs a new {@code WordCount} through
     * {@code ToolRunner.run} and exits with the status it returns.
     *
     * @param args command-line arguments, passed on to {@code ToolRunner.run}
     */
    public static void main(String[] args) throws Exception {
        int status = ToolRunner.run(new WordCount(), args);
        System.exit(status);
    }
}

执行hadoop jar hadoop_test-0.0.1-SNAPSHOT.jar hadoop_test/WordCount -Dinput=hdfs:/usr/hadoop/maven* -Doutput=hdfs:/usr/hadoop/maven1指令（注意这里的路径是hdfs上的路径，要和本地文件系统的/usr/local区分开；output指定的目录在运行前必须不存在，否则作业会报错）

好了，到这里我们的环境基本上就搭建成功了，还有一些细节这几天会慢慢补充。

参考地址：maven配置部分：https://www.cnblogs.com/cenzhongman/p/7093672.html 侵删