共同出现的单词(Word co-occurrence)是指在一个句子中相邻的两个单词。每一个相邻的单词就是一个Co-Occurrence对。

Sample Input:

a b cc, c d d c
I Love U.
dd ee f g s sa dew ad da
So shaken as we are, so wan with care.
Find we a time for frighted peace to pant.
And breathe short-winded accents of new broil.
To be commenced in strands afar remote.
I Love U U love i.
i i i i

Sample Output:

a:b 1
a:time 1
a:we 1
accents:of 1
accents:short-winded 1
ad:da 1
ad:dew 1
afar:remote 1
afar:strands 1
and:breathe 1
are:so 1
are:we 1
as:shaken 1
as:we 1
b:cc 1
be:commenced 1
be:to 1
breathe:short-winded 1
broil:new 1
c:cc 1
c:d 2
care:with 1
commenced:in 1
d:d 1
dd:ee 1
dew:sa 1
ee:f 1
f:g 1
find:we 1
for:frighted 1
for:time 1
frighted:peace 1
g:s 1
i:i 3
i:love 3
in:strands 1
love:u 3
new:of 1
pant:to 1
peace:to 1
s:sa 1
shaken:so 1
so:wan 1
u:u 1
wan:with 1

Code:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.StringTokenizer; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.GenericOptionsParser; public class CoOccurrence { public static class TextPair implements WritableComparable<TextPair> {
/** Member of the pair that sorts first (see {@link #set(Text, Text)}). */
private Text first;

/** Member of the pair that sorts second (see {@link #set(Text, Text)}). */
private Text second;

/**
 * Creates a pair holding two empty {@link Text} values.
 *
 * <p>Both fields are non-null afterwards, which {@code readFields} relies on
 * because it deserializes into the existing field objects.
 */
public TextPair() {
    this(new Text(), new Text());
}
/**
 * Creates a pair from two plain strings, wrapping each in a new {@link Text}.
 *
 * @param left  one word of the pair
 * @param right the other word of the pair
 */
public TextPair(String left, String right) {
    this(new Text(left), new Text(right));
}
/**
 * Creates a pair from two existing {@link Text} objects.
 *
 * @param left  one word of the pair
 * @param right the other word of the pair
 */
public TextPair(Text left, Text right) {
    set(left, right);
}

/**
 * Stores the two words in canonical order so that (x, y) and (y, x) become
 * the same pair: the word whose string form compares lower (or equal) under
 * {@link String#compareTo(String)} becomes {@code first}.
 *
 * <p>The given objects are stored by reference, not copied.
 *
 * @param left  one word of the pair
 * @param right the other word of the pair
 */
public void set(Text left, Text right) {
    boolean alreadyOrdered = left.toString().compareTo(right.toString()) <= 0;
    this.first = alreadyOrdered ? left : right;
    this.second = alreadyOrdered ? right : left;
}

/**
 * @return the word stored in the first position of the pair
 */
public Text getFirst() {
    return this.first;
}
/**
 * @return the word stored in the second position of the pair
 */
public Text getSecond() {
    return this.second;
}

/**
 * Reads the pair from {@code in}: the first word, then the second word,
 * each deserialized into the already-allocated field object.
 *
 * @param in source of the serialized pair
 * @throws IOException if either word cannot be read
 */
@Override
public void readFields(DataInput in) throws IOException {
    this.first.readFields(in);
    this.second.readFields(in);
}
/**
 * Writes the pair to {@code out}: the first word followed by the second
 * word, mirroring the order used by {@code readFields}.
 *
 * @param out destination for the serialized pair
 * @throws IOException if either word cannot be written
 */
@Override
public void write(DataOutput out) throws IOException {
    this.first.write(out);
    this.second.write(out);
}
/**
 * Combines the hashes of both words. The first word's hash is scaled by 163
 * before the second is added, so the two positions contribute differently.
 * NOTE(review): nothing in this file depends on the exact multiplier.
 */
@Override
public int hashCode() {
    int firstHash = first.hashCode();
    int secondHash = second.hashCode();
    return 163 * firstHash + secondHash;
}
/**
 * Two pairs are equal when both their first words and their second words
 * are equal; any non-{@code TextPair} argument (including {@code null})
 * is unequal.
 */
@Override
public boolean equals(Object o) {
    if (!(o instanceof TextPair)) {
        return false;
    }
    TextPair other = (TextPair) o;
    return first.equals(other.first) && second.equals(other.second);
}
/**
 * Renders the pair as {@code first:second}, the form used for the key
 * column of the job output.
 */
@Override
public String toString() {
    StringBuilder text = new StringBuilder();
    text.append(first).append(":").append(second);
    return text.toString();
}
/**
 * Orders pairs by their first word and breaks ties on the second word.
 *
 * @param tp the pair to compare against
 * @return negative, zero or positive as this pair sorts before, equal to
 *         or after {@code tp}
 */
@Override
public int compareTo(TextPair tp) {
    int byFirst = first.compareTo(tp.first);
    return byFirst != 0 ? byFirst : second.compareTo(tp.second);
}
// A comparator that compares serialized TextPair keys.
/**
 * Raw comparator for serialized {@code TextPair} keys: compares the two
 * records directly on their bytes, first word first, then second word.
 */
public static class Comparator extends WritableComparator {
    private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();

    public Comparator() {
        super(TextPair.class);
    }

    /**
     * Compares two serialized pairs without deserializing them.
     *
     * @param b1 buffer holding the first record
     * @param s1 offset of the first record in {@code b1}
     * @param l1 length in bytes of the first record
     * @param b2 buffer holding the second record
     * @param s2 offset of the second record in {@code b2}
     * @param l2 length in bytes of the second record
     * @return negative, zero or positive following the first-then-second ordering
     * @throws IllegalArgumentException if a length prefix cannot be decoded
     */
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        try {
            // Byte length of each record's first field: the size of its
            // variable-length integer prefix plus the value that prefix encodes.
            int firstl1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1);
            int firstl2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2);
            int cmp = TEXT_COMPARATOR.compare(b1, s1, firstl1, b2, s2, firstl2);
            if (cmp != 0) {
                return cmp;
            }
            // The second field occupies whatever remains of each record. The
            // remainder of record 2 must be derived from l2 (not l1), otherwise
            // records of different total length are compared over the wrong span.
            return TEXT_COMPARATOR.compare(b1, s1 + firstl1, l1 - firstl1,
                                           b2, s2 + firstl2, l2 - firstl2);
        } catch (IOException e) {
            throw new IllegalArgumentException(e);
        }
    }
}//End of Comparator
// Static initializer of TextPair: hands an instance of the raw Comparator
// above to WritableComparator.define for the TextPair key class.
static { // register this comparator
WritableComparator.define(TextPair.class, new Comparator());
} // Compare only the first part of the pair, so that reduce is called once for each value of the first part.
/**
 * Raw comparator that looks only at the first word of each serialized
 * {@code TextPair}; the second word is ignored entirely.
 *
 * <p>Only the byte-level {@code compare} is overridden here. An object-level
 * override was drafted in an earlier revision but left disabled.
 */
public static class FirstComparator extends WritableComparator {
    private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();

    public FirstComparator() {
        super(TextPair.class);
    }

    /**
     * Compares the first fields of two serialized pairs.
     *
     * @throws IllegalArgumentException if a length prefix cannot be decoded
     */
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        final int leadingLenA;
        final int leadingLenB;
        try {
            // Length prefix size plus the decoded length = bytes of the first field.
            leadingLenA = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1);
            leadingLenB = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2);
        } catch (IOException e) {
            throw new IllegalArgumentException(e);
        }
        return TEXT_COMPARATOR.compare(b1, s1, leadingLenA, b2, s2, leadingLenB);
    }
}//End of FirstComparator
}//End of TextPair //Partition based on the first part of the pair.
public static class FirstPartitioner extends Partitioner<TextPair,IntWritable>{
@Override
public int getPartition(TextPair key, IntWritable value, int numPartitions) {
return Math.abs(key.getFirst().toString().indexOf(0) * 127) % numPartitions;//May be some trouble here.
}
}//End of FirstPartitioner public static class MyMapper extends Mapper<LongWritable, Text, TextPair, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private static Text word0 = new Text();
private static Text word1 = new Text();
private String pattern = "[^a-zA-Z0-9-']"; @Override
public void map(LongWritable inKey, Text inValue, Context context)throws IOException, InterruptedException {
String line = inValue.toString();
line = line.replaceAll(pattern, " ");
line = line.toLowerCase();
String[] str = line.split(" +");
for(int i=0; i< str.length-1; i++)
{
word0.set(str[i]);
word1.set(str[i+1]);
TextPair pair = new TextPair(word0, word1);
context.write(pair, one);
}
}
}//End of MapClass
public static class MyReducer extends Reducer<TextPair, IntWritable, TextPair, IntWritable> {
private IntWritable result = new IntWritable(); @Override
public void reduce(TextPair inKey, Iterable<IntWritable> inValues, Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : inValues) {
sum += val.get();
}
result.set(sum);
context.write(inKey, result);
}
}//End of MyReducer public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
//conf.set("Hadoop.job.ugi", "sunguoli,cs402");
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
//if (otherArgs.length != 2) {
// System.err.println("Usage: CoOccurrence <in> <out>");
// System.exit(2);
//}
Job job = new Job(conf, "Co-Occurrence");
job.setJarByClass(CoOccurrence.class); job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(TextPair.class);
job.setMapOutputValueClass(IntWritable.class); job.setCombinerClass(MyReducer.class); // group and partition by the first int in the pair
//job.setPartitionerClass(FirstPartitioner.class);
//job.setGroupingComparatorClass(FirstGroupingComparator.class); // the reduce output is Text, IntWritable
job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(TextPair.class);
job.setOutputValueClass(IntWritable.class); //FileInputFormat.addInputPath(job, new Path("../shakespeareinput"));
//FileOutputFormat.setOutputPath(job, new Path("output"));
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}//End of main
}//End of CoOccurrence

hadoop实现共同出现的单词(Word co-occurrence)的更多相关文章

  1. Hadoop 统计文件中某个单词出现的次数

    如文件word.txt内容如下: what is you name? my name is zhang san. 要求统计word.txt中出现“is”的次数? 代码如下: PerWordMapper ...

  2. Hadoop入门实例——WordCount统计单词

    首先要说明的是运行Hadoop需要jdk1.6或以上版本,如果你还没有搭建好Hadoop集群,请参考我的另一篇文章: Linux环境搭建Hadoop伪分布模式 马上进入正题. 1.启动Hadoop集群 ...

  3. linux makefile字符串操作函数 替换subst、模式替换patsubst、去首尾空格strip、查找字符串findstring、过滤filter、反过滤filter-out、排序函数sort、取单词word、取单词串wordlist、个数统计words

    1.1       字符操作函数使用 在Makefile中可以使用函数来处理变量,从而让我们的命令或是规则更为的灵活和具有智能.make所支持的函数也不算很多,不过已经足够我们的操作了.函数调用后,函 ...

  4. [LeetCode] Shortest Completing Word 最短完整的单词

    Find the minimum length word from a given dictionary words, which has all the letters from the strin ...

  5. Hadoop:使用原生python编写MapReduce

    功能实现 功能:统计文本文件中所有单词出现的频率功能. 下面是要统计的文本文件 [/root/hadooptest/input.txt] foo foo quux labs foo bar quux ...

  6. Hadoop上路-03_Hadoop JavaAPI

    一.Eclipse安装 1.下载解压 下载:http://www.eclipse.org/downloads/ 解压:SHELL$ sudo tar -zxvf eclipse.tar.gz 2.快捷 ...

  7. 大数据【四】MapReduce(单词计数;二次排序;计数器;join;分布式缓存)

       前言: 根据前面的几篇博客学习,现在可以进行MapReduce学习了.本篇博客首先阐述了MapReduce的概念及使用原理,其次直接从五个实验中实践学习(单词计数,二次排序,计数器,join,分 ...

  8. Hadoop世界中的HelloWorld之WordCount具体分析

    MapReduce 应用举例:单词计数 WorldCount可以说是MapReduce中的helloworld了,下面来看看hadoop中的例子worldcount对其进行的处理过程,也能对mapre ...

  9. 在Hadoop上用Python实现WordCount

    一.简单说明 本例中我们用Python写一个简单的运行在Hadoop上的MapReduce程序,即WordCount(读取文本文件并统计单词的词频).这里我们将要输入的单词文本input.txt和Py ...

随机推荐

  1. ExtJS 4 MVC架构讲解

    大规模客户端应用通常不好实现不好组织也不好维护,因为功能和人力的不断增加,这些应用的规模很快就会超出掌控能力,ExtJS 4 带来了一个新的应用架构,不但可以组织代码,还可以减少实现的内容新的应用架构 ...

  2. nutch Fetcer阶段详解

    job.setSpeculativeExecution(false); 抓网页阶段,不允许同一个任务运行多次,否则,网页就抓重了 为了充分利用闲置资源,加快map 和 reduce 的执行,于是有Sp ...

  3. 使用JQUERY实现局部页面定时刷新

    没办法,运维会一点点前端,还是有好处的.. 说不定,BOOTSTRAP也得会一点点.. 本想用流式输出的搞定的,但没搞定,就取巧了... 代理简单: <script src="//cd ...

  4. C51 延时程序

    一.相关换算 1.1s=10^3ms(毫秒)=10^6μs(微秒)=10^9ns(纳秒)=10^12ps(皮秒)=10^15fs(飞秒)=10^18as(阿秒)=10^21zm(仄秒)=10^24ym ...

  5. gcc编译动态和静态链接库

    我们通常把一些公用函数制作成函数库,供其它程序使用.函数库分为静态库和动态库两种.静态库在程序编译时会被连接到目标代码中,程序运行时将不再需要该静态库.动态库在程序编译时并不会被连接到目标代码中,而是 ...

  6. php header()跳转

    test1.php <?PHP $g_user = "Jack"; echo $g_user; ?> test3.php <?PHP header('Locati ...

  7. mysql oracle静默 一键安装脚本

    pre-read; 为了达到一键搞定的目的!现Ruiy简单做如下几小条规定   如果你想这么一键来搞定请君莫要违背约束! 1. 下载 `二进制` mysql软件介质版本不限,二进制包务必,源码及rpm ...

  8. 「Githug」Git 游戏通关流程

    Githug 他喵的这是个啥!?难道不是 GitHub 拼错了么,和 Git 什么关系? 和游戏又有什么关系? 其实,他的元身在这里:https://github.com/Gazler/githug  ...

  9. 数据库 —— mySQL 的安装

    [转载Link]MySQL-5.6.24免安装版配置方法,有需要的朋友可以参考下. 1. 下载MySQL Community Server 5.6.24 2. 解压MySQL压缩包 将以下载的MySQ ...

  10. Android布局属性详解剖析

    View的布局显示方式有下面几种: 线性布局(LinearLayout) 相对布局(RelativeLayout) 表格布局(TableLayout) 网格视图(GridView) 标签布局(TabL ...