第一次迭代
1 package com.laoxiao.mr.weibo; import java.io.StringReader; import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme; /**
* 第一个MR,计算TF和计算N(微博总数)
* @author root
*
*/
public class firstMapper extends Mapper<LongWritable, Text, Text, IntWritable>{ protected void map(LongWritable key, Text value, Context context)
throws java.io.IOException ,InterruptedException {
String [] temp=StringUtils.split(value.toString(),"\t");
if(temp.length>=2){
String id=temp[0].trim();
String str=temp[1].trim();
StringReader sr =new StringReader(str);
IKSegmenter ikSegmenter =new IKSegmenter(sr, true);
Lexeme word=null;
while( (word=ikSegmenter.next()) !=null ){
String w= word.getLexemeText();
context.write(new Text(w+"_"+id), new IntWritable(1));
}
context.write(new Text("count"), new IntWritable(1));
}else{
System.out.println("value is error:"+value.toString());
}
};
}
package com.laoxiao.mr.weibo; import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer; import sun.management.resources.agent; public class firstReducer extends Reducer<Text, IntWritable, Text, IntWritable>{ protected void reduce(Text arg0, java.lang.Iterable<IntWritable> arg1, Context arg2)
throws java.io.IOException ,InterruptedException {
int sum=0;
for (IntWritable i : arg1) {
sum+=i.get();
}
arg2.write(arg0, new IntWritable(sum));
};
} package com.laoxiao.mr.weibo; import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner; public class firstRepartition extends HashPartitioner<Text, IntWritable>{ @Override
public int getPartition(Text key, IntWritable value, int numReduceTasks) {
if(key.toString().equals("count")){
return 3;
}else{
return super.getPartition(key, value, numReduceTasks-1);
} }
} package com.laoxiao.mr.weibo; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; public class firstJob { public static void main(String[] args) {
Configuration config=new Configuration();
config.set("fs.defaultFS", "hdfs://node1:8020");
config.set("yarn.resourcemanager.hostname", "node1");
try {
FileSystem fs =FileSystem.get(config);
Job job=Job.getInstance(config);
job.setJarByClass(firstJob.class);
job.setJobName("weibo1"); job.setMapperClass(firstMapper.class);
job.setReducerClass(firstReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setPartitionerClass(firstRepartition.class);
//job.setCombinerClass(firstReducer.class);
job.setNumReduceTasks(4); FileInputFormat.addInputPath(job, new Path("/root/input/data/weibo.txt")); Path path =new Path("/usr/output/weibo1");
if(fs.exists(path)){
fs.delete(path, true);
}
FileOutputFormat.setOutputPath(job,path); boolean f= job.waitForCompletion(true);
if(f){
System.out.println("first job run finished!!");
} } catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}

第二次迭代

 package com.laoxiao.mr.weibo;

 import java.io.IOException;

 import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
//统计df:词在多少个微博中出现过。
public class secondMapper extends Mapper<LongWritable, Text, Text, IntWritable>{ protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException { //获取当前 mapper task的数据片段(split)
FileSplit fs = (FileSplit) context.getInputSplit(); if (!fs.getPath().getName().contains("part-r-00003")) { String[] v = value.toString().trim().split("\t");
if (v.length >= 2) {
String[] ss = v[0].split("_");
if (ss.length >= 2) {
String w = ss[0];
context.write(new Text(w), new IntWritable(1));
}
} else {
System.out.println(value.toString() + "-------------");
}
} }
}
package com.laoxiao.mr.weibo; import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer; public class secondReducer extends Reducer<Text, IntWritable, Text, IntWritable>{ protected void reduce(Text arg0, java.lang.Iterable<IntWritable> arg1, Context context)
throws java.io.IOException ,InterruptedException {
int sum=0;
for (IntWritable i : arg1) {
sum+=1;
}
context.write(arg0, new IntWritable(sum));
};
}
package com.laoxiao.mr.weibo; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; public class secondJob { public static void main(String[] args) {
Configuration config=new Configuration();
config.set("fs.defaultFS", "hdfs://node1:8020");
config.set("yarn.resourcemanager.hostname", "node1");
try {
FileSystem fs =FileSystem.get(config);
Job job=Job.getInstance(config);
job.setJarByClass(secondJob.class);
job.setJobName("weibo2"); job.setMapperClass(secondMapper.class);
job.setReducerClass(secondReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
//job.setPartitionerClass(firstRepartition.class);
//job.setCombinerClass(firstReducer.class);
//job.setNumReduceTasks(4); FileInputFormat.addInputPath(job, new Path("/usr/output/weibo1")); Path path =new Path("/usr/output/weibo2");
if(fs.exists(path)){
fs.delete(path, true);
}
FileOutputFormat.setOutputPath(job,path); boolean f= job.waitForCompletion(true);
if(f){
System.out.println("second job run finished!!");
} } catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}

第三次迭代

package com.laoxiao.mr.weibo;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.URI;
import java.text.NumberFormat;
import java.util.HashMap;
import java.util.Map; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme; /**
* 最后计算
* @author root
*
*/
public class LastMapper extends Mapper<LongWritable, Text, Text, Text> {
//存放微博总数
public static Map<String, Integer> cmap = null;
//存放df
public static Map<String, Integer> df = null; // 在map方法执行之前
protected void setup(Context context) throws IOException,
InterruptedException {
System.out.println("******************");
if (cmap == null || cmap.size() == 0 || df == null || df.size() == 0) { URI[] ss = context.getCacheFiles();
if (ss != null) {
for (int i = 0; i < ss.length; i++) {
URI uri = ss[i];
if (uri.getPath().endsWith("part-r-00003")) {//微博总数
Path path =new Path(uri.getPath());
// FileSystem fs =FileSystem.get(context.getConfiguration());
// fs.open(path);
BufferedReader br = new BufferedReader(new FileReader(path.getName()));
String line = br.readLine();
if (line.startsWith("count")) {
String[] ls = line.split("\t");
cmap = new HashMap<String, Integer>();
cmap.put(ls[0], Integer.parseInt(ls[1].trim()));
}
br.close();
} else if (uri.getPath().endsWith("part-r-00000")) {//词条的DF
df = new HashMap<String, Integer>();
Path path =new Path(uri.getPath());
BufferedReader br = new BufferedReader(new FileReader(path.getName()));
String line;
while ((line = br.readLine()) != null) {
String[] ls = line.split("\t");
df.put(ls[0], Integer.parseInt(ls[1].trim()));
}
br.close();
}
}
}
}
} protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
FileSplit fs = (FileSplit) context.getInputSplit();
// System.out.println("--------------------");
if (!fs.getPath().getName().contains("part-r-00003")) { String[] v = value.toString().trim().split("\t");
if (v.length >= 2) {
int tf =Integer.parseInt(v[1].trim());//tf值
String[] ss = v[0].split("_");
if (ss.length >= 2) {
String w = ss[0];
String id=ss[1]; double s=tf * Math.log(cmap.get("count")/df.get(w));
NumberFormat nf =NumberFormat.getInstance();
nf.setMaximumFractionDigits(5);
context.write(new Text(id), new Text(w+":"+nf.format(s)));
}
} else {
System.out.println(value.toString() + "-------------");
}
}
}
}
package com.laoxiao.mr.weibo; import java.io.IOException; import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer; public class LastReduce extends Reducer<Text, Text, Text, Text>{ protected void reduce(Text key, Iterable<Text> arg1,
Context context)
throws IOException, InterruptedException { StringBuffer sb =new StringBuffer(); for( Text i :arg1 ){
sb.append(i.toString()+"\t");
} context.write(key, new Text(sb.toString()));
} }
package com.laoxiao.mr.weibo; import java.io.IOException; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; public class LastJob { public static void main(String[] args) {
Configuration config =new Configuration();
config.set("fs.defaultFS", "hdfs://node1:8020");
config.set("yarn.resourcemanager.hostname", "node1");
//config.set("mapred.jar", "C:\\Users\\Administrator\\Desktop\\weibo3.jar");
try {
FileSystem fs =FileSystem.get(config);
//JobConf job =new JobConf(config);
Job job =Job.getInstance(config);
job.setJarByClass(LastJob.class);
job.setJobName("weibo3"); // DistributedCache.addCacheFile(uri, conf);
//2.5
//把微博总数加载到内存
job.addCacheFile(new Path("/usr/output/weibo1/part-r-00003").toUri());
//把df加载到内存
job.addCacheFile(new Path("/usr/output/weibo2/part-r-00000").toUri()); //设置map任务的输出key类型、value类型
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// job.setMapperClass();
job.setMapperClass(LastMapper.class);
job.setReducerClass(LastReduce.class); //mr运行时的输入数据从hdfs的哪个目录中获取
FileInputFormat.addInputPath(job, new Path("/usr/output/weibo1"));
Path outpath =new Path("/usr/output/weibo3");
if(fs.exists(outpath)){
fs.delete(outpath, true);
}
FileOutputFormat.setOutputPath(job,outpath ); boolean f= job.waitForCompletion(true);
if(f){
System.out.println("执行job成功");
}
} catch (Exception e) {
e.printStackTrace();
}
}
}

mr微博内容推荐的更多相关文章

  1. 微博广告推荐中有关Hadoop的那些事

    一.背景 微博,一个DAU上亿.每日发博量几千万的社交性产品,拥有庞大的数据集.如何高效得从如此规模的数据集中挖掘出有价值的信息,以增强用户粘性,提高信息传播速度,就成了重中之重.因此,我们引入了ha ...

  2. 基于scrapy的分布式爬虫抓取新浪微博个人信息和微博内容存入MySQL

    为了学习机器学习深度学习和文本挖掘方面的知识,需要获取一定的数据,新浪微博的大量数据可以作为此次研究历程的对象 一.环境准备   python 2.7  scrapy框架的部署(可以查看上一篇博客的简 ...

  3. 基于KNN的相关内容推荐

    如果做网站的内容运营,相关内容推荐可以帮助用户更快地寻找和发现感兴趣的信息,从而提升网站内容浏览的流畅性,进而提升网站的价值转化.相关内容 推荐最常见的两块就是“关联推荐”和“相关内容推荐”,关联推荐 ...

  4. HBase在搜狐内容推荐引擎系统中的应用

    转自:http://www.aboutyun.com/thread-7297-1-1.html Facebook放弃Cassandra之后,对HBase 0.89版本进行了大量稳定性优化,使它真正成为 ...

  5. 基于Tags的简单内容推荐的实现

    原来为了简单方便,自己小网站上的文章页的相关内容推荐就是从数据库里随机抽取数据来填充一个列表,所以一点相关性都没有,更本没有办法引导用户去访问推荐内容. 算法选择 如何能做到相似内容的推荐呢,碍于小网 ...

  6. django集成微博内容

    登录微博 我的工具 OK. 分享sns网站的网址分享道.去上面获取代码就可. 改版后叫微博秀

  7. Python 爬虫 ajax爬取马云爸爸微博内容

    ajax爬取情况 有时候我们在用 Requests 抓取页面的时候,得到的结果可能和在浏览器中看到的是不一样的,在浏览器中可以看到正常显示的页面数据,但是使用 Requests 得到的结果并没有,这其 ...

  8. 微博推荐算法学习(Weibo Recommend Algolrithm)

    原文:http://hijiangtao.github.io/2014/10/06/WeiboRecommendAlgorithm/ 基础及关联算法 作用:为微博推荐挖掘必要的基础资源.解决推荐时的通 ...

  9. python爬虫实战(六)--------新浪微博(爬取微博帐号所发内容,不爬取历史内容)

    相关代码已经修改调试成功----2017-4-13 详情代码请移步我的github:https://github.com/pujinxiao/sina_spider 一.说明 1.目标网址:新浪微博 ...

随机推荐

  1. 用ASP.NET_Regsql.exe创建Session数据库

    CMD: C:\Users\ZhangSC>C:\Windows\Microsoft.NET\Framework64\v4.0.30319\aspnet_regsql.exe -S ZhangS ...

  2. 【C语言基础】循环体系

    1.For循环结构: For循环的一般形式为: for (表达式1 初始化:判断条件:自增自减) { 语句块 } 2.while循环结构: while循环的一般的形式为: 表达式1 初始化 while ...

  3. 关于Hibernate和Strtus2的xml提示问题

    话不多说,上图 1.Windom 2.preferences 3.搜索框搜索xml catalog 点击Add 4.导入约束(具体操作图上1.2.3)

  4. 一文读懂PRBS定义、生成办法、作用

    对于眼图测试.误码率和抖动容限测试,最常用的测试码是PRBS,主要有PRBS7.PRBS15.PRBS23和PRBS31.本文主要解释了PRBS的定义,生成方法以及简单应用. PRBS定义 二进制序列 ...

  5. 安装FrameWork后重新注册IIS

    IIS和.netfw4.0安装顺序是从前到后,如果不小心颠倒了,无所谓. 打开程序-运行-cmd:输入一下命令重新注册IIS C:\WINDOWS\Microsoft.NET\Framework\v4 ...

  6. 关于c# Debug和Release的区别 (转)

    关于Debug和Release的区别之讨论本文主要包含如下内容: 1. Debug 和 Release 编译方式的本质区别2. 哪些情况下 Release 版会出错2. 怎样“调试” Release ...

  7. Gradle 打多渠道包

    使用gradle 打多渠道包记录经验如下图可见,每个渠道是包含debug 和realse版本的.通过打印BASE_URL 发现在渠道和版本中都可以修改BuildConfig的常量,这样一次可以打出多个 ...

  8. JAVA并发-基于AQS实现自己的显示锁

    一.了解什么是AQS 原文链接:http://www.studyshare.cn/blog-front/blog/details/1131 AQS是AbstractQueuedSynchronizer ...

  9. 逃逸分析(Escape Analysis)

    一.什么是逃逸 逃逸是指在某个方法之内创建的对象,除了在方法体之内被引用之外,还在方法体之外被其它变量引用到:这样带来的后果是在该方法执行完毕之后,该方法中创建的对象将无法被GC回收,由于其被其它变量 ...

  10. anu小程序快速入门

    众所周知,微信推出小程序以来,可谓火遍大江南北,就像当前互联网兴起时,大家忙着抢域名与开私人博客一样.小程序之所以这么火,是因为微信拥有庞大的用户量,并且腾讯帮你搞定后台问题及众多功能问题(如分享,支 ...