共同出现的单词(Word co-occurrence)是指在一个句子中相邻的两个单词。每一个相邻的单词就是一个Co-Occurrence对。

Sample Input:

a b cc, c d d c
I Love U.
dd ee f g s sa dew ad da
So shaken as we are, so wan with care.
Find we a time for frighted peace to pant.
And breathe short-winded accents of new broil.
To be commenced in strands afar remote.
I Love U U love i.
i i i i

Sample Output:

a:b 1
a:time1
a:we1
accents:of1
accents:short-winded1
ad:da1
ad:dew1
afar:remote1
afar:strands1
and:breathe1
are:so1
are:we1
as:shaken1
as:we1
b:cc1
be:commenced1
be:to1
breathe:short-winded1
broil:new1
c:cc1
c:d2
care:with1
commenced:in1
d:d1
dd:ee1
dew:sa1
ee:f1
f:g1
find:we1
for:frighted1
for:time1
frighted:peace1
g:s1
i:i3
i:love3
in:strands1
love:u3
new:of1
pant:to1
peace:to1
s:sa1
shaken:so1
so:wan1
u:u1
wan:with1

Code:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.StringTokenizer; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.GenericOptionsParser; public class CoOccurrence { public static class TextPair implements WritableComparable<TextPair> {
private Text first;
private Text second; public TextPair(){
set(new Text(), new Text());
}
public TextPair(String left, String right) {
set(new Text(left), new Text(right));
}
public TextPair(Text left, Text right) {
set(left, right);
} public void set(Text left, Text right){
String l = left.toString();
String r = right.toString();
int cmp = l.compareTo(r);
if(cmp <= 0){
this.first = left;
this.second = right;
}else{
this.first = right;
this.second = left;
}
} public Text getFirst() {
return first;
}
public Text getSecond() {
return second;
} @Override
public void readFields(DataInput in) throws IOException {
first.readFields(in);
second.readFields(in);
}
@Override
public void write(DataOutput out) throws IOException {
first.write(out);
second.write(out);
}
@Override
public int hashCode() {
return first.hashCode() * 163 + second.hashCode();//May be some trouble here. why 163? sometimes 157
}
@Override
public boolean equals(Object o) {
if (o instanceof TextPair) {
TextPair tp = (TextPair) o;
return first.equals(tp.first) && second.equals(tp.second);
}
return false;
}
@Override
public String toString(){
return first + ":" + second;
}
@Override
public int compareTo(TextPair tp) {
int cmp = first.compareTo(tp.first);
if(cmp != 0)
return cmp;
return second.compareTo(tp.second);
} // A Comparator that com.pares serialized StringPair.
public static class Comparator extends WritableComparator {
private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();
public Comparator() {
super(TextPair.class);
}
@Override
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2){
try {
int firstl1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1);
int firstl2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2);
int cmp = TEXT_COMPARATOR.compare(b1, s1, firstl1, b2, s2, firstl2);
if(cmp != 0)
return cmp;
return TEXT_COMPARATOR.compare(b1, s1 + firstl1, l1 - firstl1,
b2, s2 + firstl2, l1 - firstl2);
}catch (IOException e) {
throw new IllegalArgumentException(e);
}
}
}//End of Comparator
static { // register this comparator
WritableComparator.define(TextPair.class, new Comparator());
} // Compare only the first part of the pair, so that reduce is called once for each value of the first part.
public static class FirstComparator extends WritableComparator {
private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();
public FirstComparator() {
super(TextPair.class);
}
@Override
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2){
try {
int firstl1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1);
int firstl2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2);
return TEXT_COMPARATOR.compare(b1, s1, firstl1, b2, s2, firstl2);
}catch (IOException e) {
throw new IllegalArgumentException(e);
}
}
/*
@Override
public int compare(WritableComparator a, WritableComparator b) {
if(a instanceof TextPair && b instanceof TextPair)
return ((TextPair)a).first.compareTo(((TextPair)b).first);
return super.compare(a, b);
}*/
}//End of FirstComparator
}//End of TextPair //Partition based on the first part of the pair.
public static class FirstPartitioner extends Partitioner<TextPair,IntWritable>{
@Override
public int getPartition(TextPair key, IntWritable value, int numPartitions) {
return Math.abs(key.getFirst().toString().indexOf(0) * 127) % numPartitions;//May be some trouble here.
}
}//End of FirstPartitioner public static class MyMapper extends Mapper<LongWritable, Text, TextPair, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private static Text word0 = new Text();
private static Text word1 = new Text();
private String pattern = "[^a-zA-Z0-9-']"; @Override
public void map(LongWritable inKey, Text inValue, Context context)throws IOException, InterruptedException {
String line = inValue.toString();
line = line.replaceAll(pattern, " ");
line = line.toLowerCase();
String[] str = line.split(" +");
for(int i=0; i< str.length-1; i++)
{
word0.set(str[i]);
word1.set(str[i+1]);
TextPair pair = new TextPair(word0, word1);
context.write(pair, one);
}
}
}//End of MapClass
public static class MyReducer extends Reducer<TextPair, IntWritable, TextPair, IntWritable> {
private IntWritable result = new IntWritable(); @Override
public void reduce(TextPair inKey, Iterable<IntWritable> inValues, Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : inValues) {
sum += val.get();
}
result.set(sum);
context.write(inKey, result);
}
}//End of MyReducer public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
//conf.set("Hadoop.job.ugi", "sunguoli,cs402");
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
//if (otherArgs.length != 2) {
// System.err.println("Usage: CoOccurrence <in> <out>");
// System.exit(2);
//}
Job job = new Job(conf, "Co-Occurrence");
job.setJarByClass(CoOccurrence.class); job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(TextPair.class);
job.setMapOutputValueClass(IntWritable.class); job.setCombinerClass(MyReducer.class); // group and partition by the first int in the pair
//job.setPartitionerClass(FirstPartitioner.class);
//job.setGroupingComparatorClass(FirstGroupingComparator.class); // the reduce output is Text, IntWritable
job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(TextPair.class);
job.setOutputValueClass(IntWritable.class); //FileInputFormat.addInputPath(job, new Path("../shakespeareinput"));
//FileOutputFormat.setOutputPath(job, new Path("output"));
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}//End of main
}//End of CoOccurrence

hadoop实现共同出现的单词(Word co-occurrence)的更多相关文章

  1. Hadoop 统计文件中某个单词出现的次数

    如文件word.txt内容如下: what is you name? my name is zhang san. 要求统计word.txt中出现“is”的次数? 代码如下: PerWordMapper ...

  2. Hadoop入门实例——WordCount统计单词

    首先要说明的是运行Hadoop需要jdk1.6或以上版本,如果你还没有搭建好Hadoop集群,请参考我的另一篇文章: Linux环境搭建Hadoop伪分布模式 马上进入正题. 1.启动Hadoop集群 ...

  3. linux makefile字符串操作函数 替换subst、模式替换patsubst、去首尾空格strip、查找字符串findstring、过滤filter、反过滤filter-out、排序函数sort、取单词word、取单词串wordlist、个数统计words

    1.1       字符操作函数使用 在Makefile中可以使用函数来处理变量,从而让我们的命令或是规则更为的灵活和具有智能.make所支持的函数也不算很多,不过已经足够我们的操作了.函数调用后,函 ...

  4. [LeetCode] Shortest Completing Word 最短完整的单词

    Find the minimum length word from a given dictionary words, which has all the letters from the strin ...

  5. Hadoop:使用原生python编写MapReduce

    功能实现 功能:统计文本文件中所有单词出现的频率功能. 下面是要统计的文本文件 [/root/hadooptest/input.txt] foo foo quux labs foo bar quux ...

  6. Hadoop上路-03_Hadoop JavaAPI

    一.Eclipse安装 1.下载解压 下载:http://www.eclipse.org/downloads/ 解压:SHELL$ sudo tar -zxvf eclipse.tar.gz 2.快捷 ...

  7. 大数据【四】MapReduce(单词计数;二次排序;计数器;join;分布式缓存)

       前言: 根据前面的几篇博客学习,现在可以进行MapReduce学习了.本篇博客首先阐述了MapReduce的概念及使用原理,其次直接从五个实验中实践学习(单词计数,二次排序,计数器,join,分 ...

  8. Hadoop世界中的HelloWorld之WordCount具体分析

    MapReduce 应用举例:单词计数 WorldCount可以说是MapReduce中的helloworld了,下面来看看hadoop中的例子worldcount对其进行的处理过程,也能对mapre ...

  9. 在Hadoop上用Python实现WordCount

    一.简单说明 本例中我们用Python写一个简单的运行在Hadoop上的MapReduce程序,即WordCount(读取文本文件并统计单词的词频).这里我们将要输入的单词文本input.txt和Py ...

随机推荐

  1. 运行Capture.exe找不到cdn_sfl401as.dll

    今天运行capture Orcad16.6显示缺少cdn_sfl401as.dll,昨天运行时并没有发现这种情况,回想今天安装了modelsim之后才发生这种情况,于是将modelsim卸载掉,再次启 ...

  2. mschedule 简单linux进程管理(树莓派)

    树莓派是神奇的机器,CPU和内存都少的可怜,但体积小功耗低,在上面搞些动搞些西其实也挺有意思,挺好玩的.装的是pidara,基本服务没有精简多少,先cat一下CPU和RAM. [able@raspi ...

  3. Solr4.8.0源码分析(10)之Lucene的索引文件(3)

    Solr4.8.0源码分析(10)之Lucene的索引文件(3) 1. .si文件 .si文件存储了段的元数据,主要涉及SegmentInfoFormat.java和Segmentinfo.java这 ...

  4. KEIL C51中的_at_关键字

    绝对位置变量 变量可以在你的C程序中的绝对内存地址位于源模块使用_at_关键字.此功能的用法是: 类型 _ memory_space _ 变量名 _at _  常数 ; 其中:memory_space ...

  5. android 状态栏(StatusBar)

    一.SystemUI 概述 自 android2.2 开始 , 原本存在与 framework-res.apk 中的状态栏和下拉通知栏界面控制被分割出一个单独的 apk 文件 , 命名为 System ...

  6. android开发--翻转闹铃(从制作到打包)

    (转载请声明,文章原作地址http://blog.csdn.net/buptgshengod) 最近在家放假,一直想做一个手机应用,于是就自己动手做起来了.想到一个注意就是当闹铃响的时候翻转闹铃,声音 ...

  7. SQL SELECT INTO 语句

    SQL SELECT INTO 语句可用于创建表的备份复件. SELECT INTO 语句 SELECT INTO 语句从一个表中选取数据,然后把数据插入另一个表中. SELECT INTO 语句常用 ...

  8. LINQ to SQLite完美解决方案

    1.下载安装LinqConnectExpress(就是LinqConnect免费版) 2.安装好后就和LINQ TO  SQL 一样了! 3.查询(增删改查和LINQ TO SQL 完全一样,你可以不 ...

  9. HttpContext之公交车

    小编刚在路边晃攸了许久,好不容易才等到个公交车,想想真是不容易呀,这年代路边打的也打不到,坐个公交车也难等呀. 顺势一想,HttpContext不也是一输运行在web上的一辆公交车吗?公交车让我们成为 ...

  10. 9. memcpy() memccpy() memmove() strcpy() memset()

    部分参考: http://www.cppblog.com/kang/archive/2009/04/05/78984.html 表头文件: #include <string.h>定义函数: ...