使用hadoop统计多个文本中每个单词数目

程序源码

 import java.io.IOException;

 import java.util.StringTokenizer;

 import org.apache.hadoop.conf.Configuration;

 import org.apache.hadoop.fs.Path;

 import org.apache.hadoop.io.IntWritable;

 import org.apache.hadoop.io.LongWritable;

 import org.apache.hadoop.io.Text;

 import org.apache.hadoop.mapreduce.Job;

 import org.apache.hadoop.mapreduce.Mapper;

 import org.apache.hadoop.mapreduce.Reducer;

 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

 import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

 import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

 public class WordCount {

     public static class WordCountMap extends

             Mapper<LongWritable, Text, Text, IntWritable> {

         private final IntWritable one = new IntWritable(1);//输出的值  1

         private Text word = new Text();//输出的键 单词

         public void map(LongWritable key, Text value, Context context)

                 throws IOException, InterruptedException {//处理经过  TextInputFormat  产生的  <k1,v1>，然后产生 <k2,v2>

             String line = value.toString();//读取文本中

             StringTokenizer token = new StringTokenizer(line);//按照空格对单词进行切割

             while (token.hasMoreTokens()) {

                 word.set(token.nextToken());//读取到的单词作为键值

                 context.write(word, one);//以  单词,1的中间形式交给reduce处理

             }

         }

     }

     public static class WordCountReduce extends

             Reducer<Text, IntWritable, Text, IntWritable> {

         public void reduce(Text key, Iterable<IntWritable> values,

                 Context context) throws IOException, InterruptedException {

             int sum = 0;

             for (IntWritable val : values) {

                 sum += val.get();

             }

             context.write(key, new IntWritable(sum));

         }

     }

     public static void main(String[] args) throws Exception {

         Configuration conf = new Configuration();

         Job job = new Job(conf);

         job.setJarByClass(WordCount.class);

         job.setJobName("wordcount");

         job.setOutputKeyClass(Text.class);

         job.setOutputValueClass(IntWritable.class);

         job.setMapperClass(WordCountMap.class);

         job.setReducerClass(WordCountReduce.class);

         job.setInputFormatClass(TextInputFormat.class);//生成可供Map处理的键值对

         job.setOutputFormatClass(TextOutputFormat.class);

         FileInputFormat.addInputPath(job, new Path(args[0]));

         FileOutputFormat.setOutputPath(job, new Path(args[1]));

         job.waitForCompletion(true);

     }

 }

1 编译源码

javac -classpath /opt/hadoop-1.2.1/hadoop-core-1.2.1.jar:/opt/hadoop-1.2.1/lib/commons-cli-1.2.jar -d ./word_count_class/ WordCount.java
将源码编译成class文件并放在当前文件夹下的word_count_class目录，当然，首先需要创建该目录

2 将源码打成jar包

进入存放class文件的word_count_class目录（而不是源码目录），在该目录下打包

jar -cvf wordcount.jar *

3 上传输入文件

先在hadoop中为本次任务创建一个输入文件存放目录

hadoop fs -mkdir input_wordcount

将input目录下的所有文本文件上传到hadoop中的input_wordcount目录下

hadoop fs -put input/* input_wordcount/

注意：不能在运行前创建输出文件夹，否则作业会因为输出目录已存在而失败

4 上传jar并执行

hadoop jar word_count_class/wordcount.jar WordCount input_wordcount output_wordcount

（用jar -cvf打出的包没有在清单文件中指定Main-Class，因此必须在jar包路径之后写明主类名WordCount）

5 查看计算结果

程序输出目录

hadoop fs -ls output_wordcount

程序输出内容

hadoop fs -cat output_wordcount/part-r-00000

版本二：自己实际操作中的程序

Map程序

 package com.zln.chapter03;

 import org.apache.hadoop.io.IntWritable;

 import org.apache.hadoop.io.LongWritable;

 import org.apache.hadoop.io.Text;

 import org.apache.hadoop.mapred.MapReduceBase;

 import org.apache.hadoop.mapred.Mapper;

 import org.apache.hadoop.mapred.OutputCollector;

 import org.apache.hadoop.mapred.Reporter;

 import java.io.IOException;

 import java.util.StringTokenizer;

 /**
  * Created by sherry on 15-7-12.
  *
  * <p>Mapper of the word-count job (old {@code org.apache.hadoop.mapred} API):
  * emits {@code (token, 1)} for every token found in an input line.
  */
 public class WordCountMap extends MapReduceBase implements Mapper<LongWritable,Text,Text,IntWritable> {

     // Count emitted with every token: each occurrence is worth 1.
     private static final IntWritable ONE = new IntWritable(1);

     // Output key, refilled for each token before it is collected.
     private Text currentWord = new Text();

     /**
      * Tokenizes the line held in {@code text} and collects one {@code (token, 1)}
      * pair per token.
      */
     @Override
     public void map(LongWritable longWritable, Text text, OutputCollector<Text, IntWritable> outputCollector, Reporter reporter) throws IOException {
         // Default StringTokenizer delimiters split the line on whitespace.
         for (StringTokenizer words = new StringTokenizer(text.toString()); words.hasMoreTokens(); ) {
             currentWord.set(words.nextToken());
             outputCollector.collect(currentWord, ONE);
         }
     }
 }

Reduce程序

 package com.zln.chapter03;

 import org.apache.hadoop.io.IntWritable;

 import org.apache.hadoop.io.Text;

 import org.apache.hadoop.mapred.MapReduceBase;

 import org.apache.hadoop.mapred.OutputCollector;

 import org.apache.hadoop.mapred.Reducer;

 import org.apache.hadoop.mapred.Reporter;

 import java.io.IOException;

 import java.util.Iterator;

 /**
  * Created by sherry on 15-7-12.
  *
  * <p>Reducer of the word-count job (old {@code org.apache.hadoop.mapred} API):
  * adds up the counts received for a word and emits {@code (word, total)}.
  */
 public class WordCountReduce extends MapReduceBase implements Reducer<Text,IntWritable,Text,IntWritable> {

     /**
      * Drains {@code iterator}, summing every count, then collects the total
      * under the key {@code text}.
      */
     @Override
     public void reduce(Text text, Iterator<IntWritable> iterator, OutputCollector<Text, IntWritable> outputCollector, Reporter reporter) throws IOException {
         int total = 0;
         for (Iterator<IntWritable> counts = iterator; counts.hasNext(); ) {
             total += counts.next().get();
         }
         IntWritable result = new IntWritable(total);
         outputCollector.collect(text, result);
     }
 }

主函数

 package com.zln.chapter03;

 import org.apache.hadoop.fs.Path;

 import org.apache.hadoop.io.IntWritable;

 import org.apache.hadoop.io.Text;

 import org.apache.hadoop.mapred.*;

 import java.io.IOException;

 /**
  * Created by sherry on 15-7-12.
  *
  * <p>Driver of the word-count job (old {@code org.apache.hadoop.mapred} API).
  * Usage: {@code WordCount <input path> <output path>}.
  */
 public class WordCount {

     /**
      * Configures the job and submits it through {@code JobClient.runJob}.
      *
      * @param args {@code args[0]} is the input path, {@code args[1]} the output path
      * @throws IOException propagated from {@code JobClient.runJob}
      */
     public static void main(String[] args) throws IOException {
         // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
         if (args.length < 2) {
             System.err.println("Usage: WordCount <input path> <output path>");
             System.exit(2);
         }

         JobConf conf = new JobConf(WordCount.class);
         conf.setJobName("wordCount");

         // Output key/value types
         conf.setOutputKeyClass(Text.class);
         conf.setOutputValueClass(IntWritable.class);

         // Mapper and reducer classes
         conf.setMapperClass(WordCountMap.class);
         conf.setReducerClass(WordCountReduce.class);

         // Class that handles the input
         conf.setInputFormat(TextInputFormat.class);

         // Class that handles the output
         conf.setOutputFormat(TextOutputFormat.class);

         FileInputFormat.setInputPaths(conf, new Path(args[0]));
         FileOutputFormat.setOutputPath(conf, new Path(args[1]));

         JobClient.runJob(conf);
     }
 }

准备输入文件

file1

Hello Word By Word

Hello Word By zln

file2

Hello Hadoop

Hello GoodBye

放在同一个目录下：/home/sherry/IdeaProjects/Hadoop/WordCount/输入文件准备

编译class打成一个jar包

我使用IDEA进行编译。注意不要忘记指定main函数

上传输入文件

root@sherry:/opt/hadoop-1.2.# hadoop fs -mkdir /user/root/zln/WordCount/InputFiles

root@sherry:/opt/hadoop-1.2.# hadoop fs -put /home/sherry/IdeaProjects/Hadoop/WordCount/输入文件准备/* /user/root/zln/WordCount/InputFiles

上传jar并执行

root@sherry:/opt/hadoop-1.2.# hadoop jar /home/sherry/IdeaProjects/Hadoop/out/artifacts/WordCount_jar/WordCount.jar /user/root/zln/WordCount/InputFiles /user/root/zln/WordCount/OutputFiles

查看执行结果

root@sherry:/opt/hadoop-1.2.# hadoop fs -ls /user/root/zln/WordCount/OutputFiles

root@sherry:/opt/hadoop-1.2.# hadoop fs -text /user/root/zln/WordCount/OutputFiles/part-00000

版本三：使用新版本的API对Map Reduce main函数进行重写

Map

 package com.zln.chapter03;

 import org.apache.hadoop.io.IntWritable;

 import org.apache.hadoop.io.LongWritable;

 import org.apache.hadoop.io.Text;

 import org.apache.hadoop.mapreduce.Mapper;

 import java.io.IOException;

 import java.util.StringTokenizer;

 /**
  * Created by sherry on 15-7-12.
  *
  * <p>Mapper of the word-count job (new {@code org.apache.hadoop.mapreduce} API):
  * writes {@code (token, 1)} for every token found in an input line.
  */
 public class WordCountMap extends Mapper<LongWritable,Text,Text,IntWritable> {

     // Count written with every token: each occurrence is worth 1.
     private static final IntWritable ONE = new IntWritable(1);

     // Output key, refilled for each token before it is written.
     private Text currentWord = new Text();

     /**
      * Tokenizes the line held in {@code value} and writes one {@code (token, 1)}
      * pair per token to {@code context}.
      */
     @Override
     protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
         // Default StringTokenizer delimiters split the line on whitespace.
         for (StringTokenizer words = new StringTokenizer(value.toString()); words.hasMoreTokens(); ) {
             currentWord.set(words.nextToken());
             context.write(currentWord, ONE);
         }
     }
 }

Reduce

 package com.zln.chapter03;

 import org.apache.hadoop.io.IntWritable;

 import org.apache.hadoop.io.Text;

 import org.apache.hadoop.mapreduce.Reducer;

 import java.io.IOException;

 /**
  * Created by sherry on 15-7-12.
  *
  * <p>Reducer of the word-count job (new {@code org.apache.hadoop.mapreduce} API):
  * adds up the counts received for a word and writes {@code (word, total)}.
  */
 public class WordCountReduce extends Reducer<Text,IntWritable,Text,IntWritable> {

     /**
      * Sums every count in {@code values} and writes the total under {@code key}.
      */
     @Override
     protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
         int total = 0;
         for (IntWritable count : values) {
             total = total + count.get();
         }
         IntWritable result = new IntWritable(total);
         context.write(key, result);
     }
 }

Main

 package com.zln.chapter03;

 import org.apache.hadoop.conf.Configured;

 import org.apache.hadoop.fs.Path;

 import org.apache.hadoop.io.IntWritable;

 import org.apache.hadoop.io.Text;

 import org.apache.hadoop.mapreduce.Job;

 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

 import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

 import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

 import org.apache.hadoop.util.Tool;

 import org.apache.hadoop.util.ToolRunner;

 /**

  * Created by sherry on 15-7-12.

  */

 public class WordCount extends Configured implements Tool{

     public int run(String[] args) throws Exception {

         Job job = new Job(getConf());

         job.setJarByClass(WordCount.class);

         job.setJobName("WordCount");

         job.setOutputKeyClass(Text.class);

         job.setOutputValueClass(IntWritable.class);

         job.setMapperClass(WordCountMap.class);

         job.setReducerClass(WordCountReduce.class);

         job.setInputFormatClass(TextInputFormat.class);

         job.setOutputFormatClass(TextOutputFormat.class);

         FileInputFormat.setInputPaths(job,new Path(args[0]));

         FileOutputFormat.setOutputPath(job,new Path(args[1]));

         boolean success = job.waitForCompletion(true);

         return success?0:1;

     }

     public static void main(String[] args) throws Exception {

         int ret = ToolRunner.run(new WordCount(),args);

         System.exit(ret);

     }

 }

使用hadoop统计多个文本中每个单词数目的更多相关文章

C#统计给定的文本中字符出现的次数，使用循环和递归两种方法
前几天看了一个.net程序员面试题目,题目是”统计给定的文本中字符出现的次数,使用循环和递归两种方法“. 下面是我对这个题目的解法: 1.使用循环: /// <summary> /// 使 ...
python统计文本中每个单词出现的次数
.python统计文本中每个单词出现的次数: #coding=utf-8 __author__ = 'zcg' import collections import os with open('abc. ...
Perl-统计文本中各个单词出现的次数（NVDIA2019笔试）
1.原题 2.perl脚本 print "================ Method 1=====================\n"; open IN,'<','an ...
Python的 counter内置函数，统计文本中的单词数量
counter是 colletions内的一个类可以理解为一个简单的计数 import collections str1=['a','a','b','d'] m=collections.Counte ...
C#统计英文文本中的单词数并排序
思路如下:1.使用的Hashtable(高效)集合,记录每个单词出现的次数2.采用ArrayList对Hashtable中的Keys按字母序排列3.排序使用插入排序(稳定) public void S ...
C++统计一段文字中各单词出现的频率
#include <iostream> using namespace std; /* run this program using the console pauser or add y ...
一个简单的程序，统计文本文档中的单词和汉字数，逆序排列（出现频率高的排在最前面）。python实现。
仅简单统计英文. from collections import Counter f = open('1') c = Counter() for line in f: g = (x for x in ...
ruby的hash学习笔记例：将字符串文本中的单词存放在map中
text = 'The rain in Spain falls mainly in the plain.'first = Hash.new []second = Hash.new {|hash,key ...
python统计英文文本中的回文单词数
1. 要求: 给定一篇纯英文的文本,统计其中回文单词的比列,并输出其中的回文单词,文本数据如下: This is Everyday Grammar. I am Madam Lucija And I a ...

随机推荐

Java面试不得不知的问题（一）
程序员面试 1. 面向对象的特征有哪些方面 · 抽象:抽象就是忽略一个主题中与当前目标无关的那些方面,以便更充分地注意与当前目标有关的方面.抽象并不打算了解全部问题,而只是选择其中的一部分, ...
java Web 常见错误集锦及解决方法
只能删除pid为整数的商品,32位的pid商品不能删除? 原因onclick="agree('${s.pid}')" 括号中需要加 ' ' 删除多余的工作空间? 使用prefer ...
this以及执行上下文概念的重新认识
在理解this的绑定过程之前,必须要先明白调用位置,调用位置指的是函数在代码中被调用的位置,而不是声明所在的位置. (ES6的箭头函数不在该范围内,它的this在声明时已经绑定了,而不是取决于调用时. ...
Delphi7程序调用C#写的DLL解决办法(转)
近来,因工作需要,必须解决Delphi7写的主程序调用C#写的dll的问题.在网上一番搜索,又经过种种试验,最终证明有以下两种方法可行: 编写C#dll的方法都一样,首先在vs2005中创建一个 ...
Django 入门案例开发
Django是一个重量级的web开发框架,它提供了很多内部已开发好的插件供我们使用:这里不去描述 Django直接进入开发过程. Django入门案例分两部分:一.开发环境的配置:二.业务需求分析. ...
spring-IOC底层机制
JDK与CGLIB的动态代理 JDK动态代理创建代理的方法将需要代理的类传入代理类中(通过构造方法) 在代理方法中创建代理实例(需要一个接口和一个实现接口的类): Proxy.newProxyIn ...
PCA 实例演示二维数据降成1维
import numpy as np # 将二维数据降成1维 num = [(2.5, 2.4), (0.5, 0.7), (2.2, 2.9), (1.9, 2.2), (3.1, 3.0), (2 ...
【Codebase】JQuery获取表单部分数据提交方法
JQuery使用ajax提交整个表单最简便的方法就是$('#form').serialize();但如果仅想保存表单中的部分数据,比如仅更新选中的条目,那么获取数据就比较麻烦了. 解决方法:新建一个表 ...
Python__for循环和列表生成式的区别
话不多,上例子 >>> L = [,,] >>> for i in range(len(L)): L[i] = L[i] + L[i-] print(L) #结果 ...
Python学习第一弹
开发语言: 高级:Python.java.PHP C# GO ruby C++ ——>字节码低级:C.汇编 ...

使用hadoop统计多个文本中每个单词数目

使用hadoop统计多个文本中每个单词数目的更多相关文章

随机推荐

热门专题