Problem description: given a large file in which each line has the form: document name, document content.

input

docName1,word1 word2 ...

docName2,word1 word2 ...

output

word  docName  tfidf
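
For reference, the weighting that the jobs below compute boils down to two formulas: tf is a word's count divided by the document's total word count, and idf is the log of (total documents / (documents containing the word + 1)). A minimal standalone sketch follows; the class and variable names here are illustrative only and are not part of the job code.

public class TfIdfSketch {
    // tf: occurrences of the word in one document / total words in that document
    static float tf(int wordCount, int wordTotal) {
        return (float) wordCount / (float) wordTotal;
    }

    // idf: natural log of (total documents / (documents containing the word + 1)),
    // with the same +1 smoothing used in IDFReduce below
    static float idf(long docTotal, long docsWithWord) {
        return (float) Math.log((float) docTotal / (float) (docsWithWord + 1));
    }

    public static void main(String[] args) {
        // e.g. a word appearing 3 times in a 100-word document,
        // present in 9 of 1000 documents
        System.out.println(tf(3, 100) * idf(1000, 9)); // ~0.138
    }
}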

package com.elex.mapreduce;

import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.elex.utils.DataClean;

public class TFIDF_5 {
    public static String hdfsURL = "hdfs://namenode:8020";
    public static String fileURL = "/tmp/usercount";

    // TF job mapper: each input line is "docName,word1 word2 ...";
    // emits <docName, "word1 tf1,word2 tf2,...">.
    public static class TFMap extends Mapper<Object, Text, Text, Text> {
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String userWordstmp = value.toString();
            StringTokenizer userWords = new StringTokenizer(userWordstmp, "\n");
            while (userWords.hasMoreTokens()) {
                String userWordFragtmp = userWords.nextToken();
                StringTokenizer userWordFrag = new StringTokenizer(
                        userWordFragtmp, ",");
                String user = userWordFrag.nextToken();
                Text outputKey = new Text();
                Text outputValue = new Text();
                while (userWordFrag.hasMoreTokens()) {
                    String words = userWordFrag.nextToken();
                    // DataClean.clean tallies each word and stores the document's
                    // word total under the reserved key "!total".
                    HashMap<String, Integer> wordMap = DataClean.clean(words,
                            "!total");
                    int wordTotal = wordMap.get("!total");
                    wordMap.remove("!total");
                    for (Map.Entry<String, Integer> wordEntry : wordMap
                            .entrySet()) {
                        String word = wordEntry.getKey();
                        int wordCount = wordEntry.getValue();
                        // tf = occurrences of the word / total words in the document
                        float tf = (float) wordCount / (float) wordTotal;
                        String outputStr = word + " " + Float.toString(tf) + ",";
                        byte[] bytes = outputStr.getBytes();
                        outputValue.append(bytes, 0, bytes.length);
                    }
                }
                outputKey.set(user);
                context.write(outputKey, outputValue);
            }
        }
    }

    // TF job reducer: passes the per-document "word tf" lists straight through.
    public static class TFReduce extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // StringBuffer sb = new StringBuffer();
            Iterator<Text> iter = values.iterator();
            while (iter.hasNext()) {
                // sb.append(iter.next().toString() + "\t");
                context.write(key, iter.next());
            }
            // Text outputValue = new Text();
            // outputValue.set(sb.toString());
            // context.write(key, outputValue);
        }
    }

    // IDF job mapper: reads the TF output "docName \t word1 tf1,word2 tf2,..."
    // and emits <word, "docName \t tf">.
    public static class IDFMap extends Mapper<Object, Text, Text, Text> {
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String valuesTmp = value.toString();
            StringTokenizer userWordFrag = new StringTokenizer(valuesTmp, "\n");
            while (userWordFrag.hasMoreTokens()) {
                StringTokenizer userWords = new StringTokenizer(
                        userWordFrag.nextToken(), "\t");
                String user = userWords.nextToken();
                while (userWords.hasMoreTokens()) {
                    StringTokenizer wordTFs = new StringTokenizer(
                            userWords.nextToken(), ",");
                    while (wordTFs.hasMoreTokens()) {
                        StringTokenizer wordTF = new StringTokenizer(
                                wordTFs.nextToken());
                        String word = wordTF.nextToken();
                        String tf = wordTF.nextToken();
                        Text outputKey = new Text();
                        Text outputValue = new Text();
                        outputKey.set(word);
                        outputValue.set(user + "\t" + tf);
                        context.write(outputKey, outputValue);
                    }
                }
            }
        }
    }

    // IDF job reducer: reads the document count from HDFS, then computes
    // tfidf = tf * log(docCount / (docsContainingWord + 1)) for every document
    // the word appears in.
    public static class IDFReduce extends Reducer<Text, Text, Text, Text> {
        long userCount = 0;

        public void setup(Context context) throws IOException {
            Configuration conf = context.getConfiguration();
            Path path = new Path(fileURL);
            FileSystem fs = FileSystem.get(URI.create(hdfsURL), conf);
            if (!fs.isFile(path)) {
                FSDataOutputStream output = fs.create(path, true);
                output.close();
            }
            FSDataInputStream input = fs.open(path);
            StringBuffer sb = new StringBuffer();
            byte[] bytes = new byte[1024];
            int status = input.read(bytes);
            while (status != -1) {
                // Only append the bytes actually read, not the whole buffer.
                sb.append(new String(bytes, 0, status));
                status = input.read(bytes);
            }
            if (!"".equals(sb.toString())) {
                userCount = Long.parseLong(sb.toString().trim());
            }
            input.close();
        }

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            LinkedList<String> userList = new LinkedList<String>();
            Iterator<Text> iter = values.iterator();
            long wordCount = 0;
            while (iter.hasNext()) {
                wordCount++;
                userList.add(iter.next().toString());
            }
            float idf = (float) Math.log((float) userCount
                    / (float) (wordCount + 1));
            Iterator<String> userIter = userList.iterator();
            Text outputValue = new Text();
            while (userIter.hasNext()) {
                String usertftmp = userIter.next();
                StringTokenizer usertf = new StringTokenizer(usertftmp, "\t");
                String user = usertf.nextToken();
                String tfStr = usertf.nextToken();
                float tf = Float.parseFloat(tfStr.trim());
                float tfidf = tf * idf;
                String outputTmp = user + "\t" + tfidf;
                outputValue.set(outputTmp);
                context.write(key, outputValue);
            }
        }
    }

    // The three classes below form the original standalone document-count job.
    // They are kept for reference but are no longer wired into main(), because
    // the document count is now taken from the TF job's counters.
    public static class UserCountMap extends Mapper<Object, Text, Text, Text> {
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String userWordtmp = value.toString();
            StringTokenizer userWord = new StringTokenizer(userWordtmp, "\n");
            while (userWord.hasMoreTokens()) {
                userWord.nextToken();
                Text outputKey = new Text();
                outputKey.set("usercount");
                Text one = new Text();
                one.set("1");
                context.write(outputKey, one);
            }
        }
    }

    public static class UserCountCombine extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            long user = 0;
            for (Text value : values) {
                String valueTmp = value.toString();
                user += Long.parseLong(valueTmp);
            }
            Text outputValue = new Text();
            outputValue.set(Long.toString(user));
            context.write(key, outputValue);
        }
    }

    public static class UserCountReduce extends Reducer<Text, Text, Text, Text> {
        long userCount = 0;

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            for (Text value : values) {
                String valueTmp = value.toString();
                userCount += Long.parseLong(valueTmp);
            }
        }

        public void cleanup(Context context) throws IOException {
            Configuration conf = context.getConfiguration();
            FileSystem fs = FileSystem.get(URI.create(hdfsURL), conf);
            Path path = new Path(fileURL);
            FSDataOutputStream output = fs.create(path, true);
            String content = Long.toString(userCount);
            output.write(content.getBytes());
            output.flush();
            output.close();
        }
    }

    public static void main(String[] args) throws IOException,
            ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // conf.set("mapred.child.java.opts", "-Xmx4096m");

        // Job 1: compute per-document term frequencies.
        Job tfJob = Job.getInstance(conf, "tfjob");
        tfJob.setJarByClass(TFIDF_5.class);
        tfJob.setMapperClass(TFMap.class);
        // tfJob.setCombinerClass(TFCombine.class);
        tfJob.setReducerClass(TFReduce.class);
        tfJob.setOutputKeyClass(Text.class);
        tfJob.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(tfJob, new Path(args[0]));
        FileOutputFormat.setOutputPath(tfJob, new Path(args[1]));
        tfJob.waitForCompletion(true);

        // The standalone document-count job is no longer needed:
        // Job userCountJob = Job.getInstance(conf, "usercountjob");
        // userCountJob.setJarByClass(TFIDF_5.class);
        // userCountJob.setMapperClass(UserCountMap.class);
        // userCountJob.setCombinerClass(UserCountCombine.class);
        // userCountJob.setReducerClass(UserCountReduce.class);
        // userCountJob.setOutputKeyClass(Text.class);
        // userCountJob.setOutputValueClass(Text.class);
        // FileInputFormat.setInputPaths(userCountJob, new Path(args[1]));
        // FileOutputFormat.setOutputPath(userCountJob, new Path(args[2]));
        // userCountJob.waitForCompletion(true);

        // Take the document count from the TF job's map-input-record counter
        // and stash it on HDFS for the IDF job's reducers to read in setup().
        Counter ct = tfJob.getCounters().findCounter(
                "org.apache.hadoop.mapreduce.TaskCounter", "MAP_INPUT_RECORDS");
        System.out.println(ct.getValue());
        Iterable<String> groupNames = tfJob.getCounters().getGroupNames();
        for (String groupName : groupNames) {
            System.out.println(groupName);
        }
        FileSystem fs = FileSystem.get(URI.create(hdfsURL), conf);
        Path path = new Path(fileURL);
        FSDataOutputStream output = fs.create(path, true);
        String content = Long.toString(ct.getValue());
        output.write(content.getBytes());
        output.flush();
        output.close();

        // Job 2: compute IDF and the final tf-idf weights.
        Job idfJob = Job.getInstance(conf, "idfjob");
        idfJob.setJarByClass(TFIDF_5.class);
        idfJob.setMapperClass(IDFMap.class);
        idfJob.setReducerClass(IDFReduce.class);
        idfJob.setOutputKeyClass(Text.class);
        idfJob.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(idfJob, new Path(args[1]));
        FileOutputFormat.setOutputPath(idfJob, new Path(args[3]));
        System.exit(idfJob.waitForCompletion(true) ? 0 : 1);
    }
}

Originally a separate job was used to count the number of documents. After a tip from a more experienced colleague at the company, the document count can instead be derived neatly from the number of input records processed while computing TF.
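
For comparison, the same counter can also be read through the TaskCounter enum instead of the string-based group/name lookup used in main above. This is only a sketch, assuming Hadoop 2.x; the helper class name DocCountFromCounters is made up for illustration.

import java.io.IOException;

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.TaskCounter;

public class DocCountFromCounters {
    // tfJob must already have finished (tfJob.waitForCompletion(true)).
    // Each input record of the TF job is one "docName,content" line,
    // so MAP_INPUT_RECORDS equals the number of documents.
    public static long docCount(Job tfJob) throws IOException, InterruptedException {
        return tfJob.getCounters()
                .findCounter(TaskCounter.MAP_INPUT_RECORDS)
                .getValue();
    }
}

Because each input line is exactly one document, this gives the document count for free and saves the extra UserCount MapReduce pass.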

