Program source code

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class WordCount {

    public static class WordCountMap extends
            Mapper<LongWritable, Text, Text, IntWritable> {

        private final IntWritable one = new IntWritable(1); // the output value: always 1
        private Text word = new Text();                     // the output key: a word

        // Consumes the <k1,v1> pairs produced by TextInputFormat and emits <k2,v2>.
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();                    // the current line of text
            StringTokenizer token = new StringTokenizer(line); // split the line on whitespace
            while (token.hasMoreTokens()) {
                word.set(token.nextToken()); // each word becomes the key
                context.write(word, one);    // emit the intermediate <word, 1> pair for the reducer
            }
        }
    }

    public static class WordCountReduce extends
            Reducer<Text, IntWritable, Text, IntWritable> {

        public void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf);
        job.setJarByClass(WordCount.class);
        job.setJobName("wordcount");

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setMapperClass(WordCountMap.class);
        job.setReducerClass(WordCountReduce.class);

        job.setInputFormatClass(TextInputFormat.class); // turns the raw input into key/value pairs the map can consume
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.waitForCompletion(true);
    }
}
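All of the real work happens in map()'s tokenizing loop, so here is a minimal standalone sketch (hypothetical class name, plain JDK, no Hadoop required) of the <word, 1> pairs the map emits for one line:

import java.util.StringTokenizer;

// Hypothetical demo: prints the intermediate pairs map() above would emit
// for a single input line.
public class TokenizeDemo {
    public static void main(String[] args) {
        String line = "Hello Hadoop Hello";
        StringTokenizer token = new StringTokenizer(line); // splits on whitespace
        while (token.hasMoreTokens()) {
            // this is where map() calls context.write(word, one)
            System.out.println(token.nextToken() + "\t1");
        }
        // prints: Hello 1, Hadoop 1, Hello 1; the reducer later sums Hello to 2
    }
}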

1 Compile the source code

javac -classpath /opt/hadoop-1.2.1/hadoop-core-1.2.1.jar:/opt/hadoop-1.2.1/lib/commons-cli-1.2.jar -d ./word_count_class/ WordCount.java
This compiles the source into .class files under the word_count_class directory in the current folder; note that you must create that directory first.
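If the compile succeeds, word_count_class should contain one .class file per class; the two nested classes land in WordCount$-prefixed files:

WordCount.class
WordCount$WordCountMap.class
WordCount$WordCountReduce.class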

2 Package the classes into a jar

Change into the directory holding the compiled classes (word_count_class):

jar -cvf wordcount.jar  *
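Optionally, list the jar's contents to confirm all three class files made it in:

jar -tvf wordcount.jar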

3 Upload the input files

First, create a directory in HDFS to hold this job's input files:

hadoop fs -mkdir input_wordcount

Upload every text file under the local input directory to the input_wordcount directory in HDFS:

hadoop fs -put input/* input_wordcount/

Note: do not create the output directory before the run; Hadoop refuses to start a job whose output path already exists.
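If you prefer the job to clean up after itself, a small helper (hypothetical, not part of the original program) can remove a leftover output path through the FileSystem API before submission:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

// Hypothetical helper: call OutputCleaner.clean(conf, args[1]) in main()
// before waitForCompletion(), so a directory left behind by an earlier
// run does not abort the job.
public class OutputCleaner {
    public static void clean(Configuration conf, String dir) throws IOException {
        Path out = new Path(dir);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(out)) {
            fs.delete(out, true); // true = delete recursively
        }
    }
}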

4 Submit the jar and run

hadoop jar word_count_class/wordcount.jar input_wordcount output_wordcount

5 View the results

List the job's output directory:

hadoop fs -ls output_wordcount

Print the output:

hadoop fs -cat output_wordcount/part-r-00000



Version 2: the program from my own hands-on run

The Map program

package com.zln.chapter03;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

import java.io.IOException;
import java.util.StringTokenizer;

/**
 * Created by sherry on 15-7-12.
 */
public class WordCountMap extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1); // each occurrence of a word counts +1
    private Text word = new Text();

    @Override
    public void map(LongWritable longWritable, Text text, OutputCollector<Text, IntWritable> outputCollector, Reporter reporter) throws IOException {
        String line = text.toString();
        StringTokenizer tokenizer = new StringTokenizer(line); // split out the words
        while (tokenizer.hasMoreTokens()) {
            word.set(tokenizer.nextToken());
            outputCollector.collect(word, one);
        }
    }
}

The Reduce program

package com.zln.chapter03;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

import java.io.IOException;
import java.util.Iterator;

/**
 * Created by sherry on 15-7-12.
 */
public class WordCountReduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    public void reduce(Text text, Iterator<IntWritable> iterator, OutputCollector<Text, IntWritable> outputCollector, Reporter reporter) throws IOException {
        int sum = 0;
        while (iterator.hasNext()) {
            sum += iterator.next().get();
        }
        outputCollector.collect(text, new IntWritable(sum));
    }
}

The main function

package com.zln.chapter03;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

import java.io.IOException;

/**
 * Created by sherry on 15-7-12.
 */
public class WordCount {
    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf(WordCount.class);
        conf.setJobName("wordCount");

        // output key/value types
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        // Map and Reduce classes
        conf.setMapperClass(WordCountMap.class);
        conf.setReducerClass(WordCountReduce.class);

        // input format
        conf.setInputFormat(TextInputFormat.class);
        // output format
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}

Prepare the input files

file1

Hello Word By Word
Hello Word By zln

file2

Hello Hadoop
Hello GoodBye

Both files are placed in the same directory: /home/sherry/IdeaProjects/Hadoop/WordCount/输入文件准备

Compile the classes and package them into a jar

I build with IDEA. Be careful not to forget to specify the main class for the jar artifact.
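If you build at the command line instead, jar's e flag writes the Main-Class manifest entry for you (the classes directory below is illustrative; use wherever IDEA put the compiled output):

jar -cvfe WordCount.jar com.zln.chapter03.WordCount -C out/production/Hadoop .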

Upload the input files

root@sherry:/opt/hadoop-1.2.# hadoop fs -mkdir /user/root/zln/WordCount/InputFiles
root@sherry:/opt/hadoop-1.2.# hadoop fs -put /home/sherry/IdeaProjects/Hadoop/WordCount/输入文件准备/* /user/root/zln/WordCount/InputFiles

Submit the jar and run

root@sherry:/opt/hadoop-1.2.# hadoop jar /home/sherry/IdeaProjects/Hadoop/out/artifacts/WordCount_jar/WordCount.jar /user/root/zln/WordCount/InputFiles /user/root/zln/WordCount/OutputFiles

Check the results (with the old mapred API the reduce output files are named part-00000, part-00001, and so on):

root@sherry:/opt/hadoop-1.2.# hadoop fs -ls /user/root/zln/WordCount/OutputFiles
root@sherry:/opt/hadoop-1.2.# hadoop fs -text /user/root/zln/WordCount/OutputFiles/part-
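For the two input files prepared above, the counts work out as follows, so the part file (tab-separated by TextOutputFormat) should read, with keys in Text's byte order, where uppercase sorts before lowercase:

By	2
GoodBye	1
Hadoop	1
Hello	4
Word	3
zln	1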


Version 3: Map, Reduce, and main rewritten with the new API

Map

package com.zln.chapter03;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.StringTokenizer;

/**
 * Created by sherry on 15-7-12.
 */
public class WordCountMap extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1); // each occurrence of a word counts +1
    private Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        StringTokenizer tokenizer = new StringTokenizer(line); // split out the words
        while (tokenizer.hasMoreTokens()) {
            word.set(tokenizer.nextToken());
            context.write(word, one);
        }
    }
}

Reduce

package com.zln.chapter03;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Created by sherry on 15-7-12.
 */
public class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable intWritable : values) {
            sum += intWritable.get();
        }
        context.write(key, new IntWritable(sum));
    }
}

Main

package com.zln.chapter03;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Created by sherry on 15-7-12.
 */
public class WordCount extends Configured implements Tool {

    public int run(String[] args) throws Exception {
        Job job = new Job(getConf());
        job.setJarByClass(WordCount.class);
        job.setJobName("WordCount");

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setMapperClass(WordCountMap.class);
        job.setReducerClass(WordCountReduce.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int ret = ToolRunner.run(new WordCount(), args);
        System.exit(ret);
    }
}
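One optional tweak the new API makes easy (not part of the original runs): since this reduce is a plain sum, the same reducer class can also serve as a combiner, shrinking the map output before it crosses the network in the shuffle. It is a single extra line in run():

        job.setReducerClass(WordCountReduce.class);
        job.setCombinerClass(WordCountReduce.class); // safe here because addition is associative and commutative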
