官方 http://docs.mongodb.org/ecosystem/tutorial/getting-started-with-hadoop/

mongo-hadoop项目地址 https://github.com/mongodb/mongo-hadoop

该代码托管 https://github.com/cclient/mongo_hadoop_map-reduce

原分析 由nodejs+async编写

用游标迭代查询mongo数据库,分析数据

因数据量较大,目前执行分析任务耗时4个小时,这只是极限数据量的1%

为优化,采用hadoop-mongo 方案

优点:mongo只能单机单线程(不作shard的情况),hadoop-mongo可以集群处理。

完成代码

近期一直写的脚本语言,再回头写点JAVA,好悲催,感觉很受限制。

初步代码 很粗糙

MAIN 入口

 package group.artifactid;

 //cc MaxTemperature Application to find the maximum temperature in the weather dataset
//vv MaxTemperature
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import com.mongodb.hadoop.MongoConfig;
import com.mongodb.hadoop.io.BSONWritable;
import com.mongodb.hadoop.util.MongoTool; import com.mongodb.hadoop.MongoConfig;
import com.mongodb.hadoop.MongoInputFormat;
import com.mongodb.hadoop.MongoOutputFormat;
import com.mongodb.hadoop.util.MapredMongoConfigUtil;
import com.mongodb.hadoop.util.MongoConfigUtil;
import com.mongodb.hadoop.util.MongoTool;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.util.ToolRunner;

/**
 * Job driver: reads documents from db1.collection1, aggregates per-AP
 * client/url hit counts through the mapper + combiner + reducer pipeline,
 * and writes the resulting BSON documents to db2.collection2.
 */
public class MongoMaxTemperature extends MongoTool {
    public MongoMaxTemperature() {
        // Build the Hadoop configuration and wrap it with the mongo-hadoop helper.
        Configuration hadoopConf = new Configuration();
        MongoConfig mongoConfig = new MongoConfig(hadoopConf);
        setConf(hadoopConf);

        // Input/output formats come from the mongo-hadoop connector.
        MongoConfigUtil.setInputFormat(getConf(), MongoInputFormat.class);
        MongoConfigUtil.setOutputFormat(getConf(), MongoOutputFormat.class);

        // Source and destination collections.
        mongoConfig.setInputURI("mongodb://localhost:27017/db1.collection1");
        mongoConfig.setOutputURI("mongodb://localhost:27017/db2.collection2");

        // Map / combine / reduce pipeline classes.
        mongoConfig.setMapper(MongoMaxTemperatureMapper.class);
        mongoConfig.setCombiner(MongoMaxTemperatureCombine.class);
        mongoConfig.setReducer(MongoMaxTemperatureReducerCombine.class);

        // Intermediate and final key/value types.
        mongoConfig.setMapperOutputKey(Text.class);
        mongoConfig.setMapperOutputValue(Text.class);
        mongoConfig.setOutputKey(Text.class);
        mongoConfig.setOutputValue(BSONWritable.class);
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new MongoMaxTemperature(), args));
    }
}

MAPPER代码

package group.artifactid;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.bson.BSONObject; import com.mongodb.hadoop.io.BSONWritable;

/**
 * Mapper: for every http request document, emits
 * (apMac, clientMac + host), where host is the url stripped down to its
 * host part with '.' replaced by '}' (MongoDB field names may not contain '.').
 */
public class MongoMaxTemperatureMapper extends
        Mapper<Object, BSONObject, Text, Text> {
    @Override
    public void map(final Object key, BSONObject val, Context context)
            throws IOException, InterruptedException {
        String apmac = (String) val.get("apMac");
        String clientmac = (String) val.get("clientMac");
        String url = (String) val.get("url");
        String proto = (String) val.get("proto");
        // Fixed: documents missing any of these fields previously caused a
        // NullPointerException (e.g. proto.equals(...) on a null proto) and
        // killed the task. Skip incomplete records instead.
        if (apmac == null || clientmac == null || url == null || proto == null) {
            return;
        }
        if (proto.equals("http") && !url.isEmpty()) {
            // Strip the scheme prefix, if present.
            if (url.startsWith("http://")) {
                url = url.substring(7);
            }
            // Keep only the host part (drop path and query string).
            int firstslash = url.indexOf('/');
            if (firstslash > -1) {
                url = url.substring(0, firstslash);
            }
            // '.' is not allowed in MongoDB keys; temporarily map it to '}'.
            url = url.replace('.', '}');
            context.write(new Text(apmac), new Text(clientmac + url));
        }
    }
}

COMBINE代码

package group.artifactid;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import com.mongodb.hadoop.io.BSONWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.bson.BasicBSONObject;

/**
 * Reducer (combine-aware): aggregates per-AP maps of
 * clientMac -> { url -> hit count } and keeps only the top-100 urls per client.
 *
 * Input values are either "&lt;17-char clientMac&gt;&lt;url&gt;" (straight from the
 * mapper) or "&lt;17-char clientMac&gt;&lt;url&gt;|&lt;count&gt;" (pre-aggregated by the
 * combiner).
 */
public class MongoMaxTemperatureReducerCombine extends
        Reducer<Text, Text, Text, BSONWritable> {

    /** Simple (url, count) pair used while sorting. */
    public class UrlCount {
        public UrlCount(String url, int count) {
            this.Url = url;
            this.Count = count;
        }
        String Url;
        int Count;
    }

    /**
     * Sorts the url->count entries by count, most-visited first, and returns
     * at most {@code topnum} of them.
     */
    public List<UrlCount> compresstopobj(BasicBSONObject topobj, int topnum) {
        List<UrlCount> sorted = new ArrayList<UrlCount>();
        for (Map.Entry<String, Object> entry : topobj.entrySet()) {
            sorted.add(new UrlCount(entry.getKey(),
                    Integer.parseInt(entry.getValue().toString())));
        }
        // Descending by count; this sort only selects the top entries — the
        // order is NOT preserved once they are stored back into a BSON object.
        Collections.sort(sorted, new Comparator<UrlCount>() {
            @Override
            public int compare(UrlCount o1, UrlCount o2) {
                // Integer.compare avoids the subtraction-overflow pitfall.
                return Integer.compare(o2.Count, o1.Count);
            }
        });
        if (sorted.size() > topnum) {
            sorted = sorted.subList(0, topnum);
        }
        return sorted;
    }

    @Override
    public void reduce(Text apmac, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        BasicBSONObject clientmacmap = new BasicBSONObject();
        int count = 0;
        for (Text value : values) {
            String subline = value.toString();
            // The first 17 characters are the MAC address ("aa:bb:cc:dd:ee:ff").
            String clientmac = subline.substring(0, 17);
            int sepindex = subline.indexOf("|");
            int maplastcount = 1;
            String url;
            if (sepindex > -1) {
                sepindex++;
                // NOTE(review): the '|' separator is deliberately kept at the
                // end of the url key — the published sample output shows keys
                // such as "get}sogou}com|". Confirm before changing the format.
                url = subline.substring(17, sepindex);
                maplastcount = Integer.parseInt(subline.substring(sepindex));
            } else {
                url = subline.substring(17);
            }
            BasicBSONObject urlmap = (BasicBSONObject) clientmacmap
                    .get(clientmac);
            if (urlmap == null) {
                urlmap = new BasicBSONObject();
                clientmacmap.put(clientmac, urlmap);
            }
            Object eveurl = urlmap.get(url);
            // Fixed: the old test (eveurl == null && !url.equals(" ")) sent a
            // null eveurl with url " " into the else branch -> NPE on
            // eveurl.toString().
            if (eveurl == null) {
                urlmap.put(url, maplastcount);
            } else {
                urlmap.put(url, Integer.parseInt(eveurl.toString())
                        + maplastcount);
            }
            count++;
            // Periodically shrink the current client's map so memory stays bounded.
            if (count == 10000) {
                count = 0; // fixed: reset so compression repeats every 10000 values
                List<UrlCount> top = compresstopobj(urlmap, 100);
                BasicBSONObject compressed = new BasicBSONObject();
                for (int i = 0; i < top.size(); i++) {
                    UrlCount cuc = top.get(i);
                    compressed.put(cuc.Url, cuc.Count);
                }
                // Fixed: store the compressed map back — the old code only
                // reassigned the local variable, so compression had no effect.
                clientmacmap.put(clientmac, compressed);
            }
        }
        // Final pass: keep only the top-100 urls for every client.
        for (Map.Entry<String, Object> entry : clientmacmap.entrySet()) {
            BasicBSONObject urlmap = (BasicBSONObject) entry.getValue();
            List<UrlCount> top = compresstopobj(urlmap, 100);
            BasicBSONObject compressed = new BasicBSONObject();
            for (int i = 0; i < top.size(); i++) {
                UrlCount cuc = top.get(i);
                compressed.put(cuc.Url, cuc.Count);
            }
            // Fixed: setValue stores the result; the old code dropped it.
            entry.setValue(compressed);
        }
        context.write(apmac, new BSONWritable(clientmacmap));
    }
}

REDUCER代码

package group.artifactid;

import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet; import com.mongodb.hadoop.io.BSONWritable; import org.apache.commons.io.output.ByteArrayOutputStream;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.StringUtils;
import org.apache.zookeeper.server.util.SerializeUtils;
import org.bson.BasicBSONObject;

/**
 * Final reducer (no combiner variant): merges mapper values
 * "&lt;17-char clientMac&gt;&lt;url&gt;" into a per-AP document
 * { clientMac : { url : count } }, keeping the top-100 urls per client.
 */
public class MongoMaxTemperatureReducer extends
        Reducer<Text, Text, Text, BSONWritable> {

    /** Simple (url, count) pair used while sorting. */
    public class UrlCount {
        public UrlCount(String url, int count) {
            this.Url = url;
            this.Count = count;
        }
        String Url;
        int Count;
    }

    /** Orders urls by hit count, most-visited first. */
    class SortByCount implements Comparator<UrlCount> {
        public int compare(UrlCount s1, UrlCount s2) {
            // Fixed: the old comparator returned only 1 or 0, which violates
            // the Comparator contract (TimSort can throw "Comparison method
            // violates its general contract!") and sorted ascending, so
            // subList(0, topnum) kept the LEAST-visited urls. Descending order
            // matches MongoMaxTemperatureReducerCombine.
            return Integer.compare(s2.Count, s1.Count);
        }
    }

    /**
     * Sorts the url->count entries by count, most-visited first, and returns
     * at most {@code topnum} of them.
     */
    public List<UrlCount> compresstopobj(BasicBSONObject topobj, int topnum) {
        List<UrlCount> sorted = new ArrayList<UrlCount>();
        for (Map.Entry<String, Object> entry : topobj.entrySet()) {
            // Fixed: removed the per-entry System.out.print debug left in this
            // hot path.
            sorted.add(new UrlCount(entry.getKey(),
                    Integer.parseInt(entry.getValue().toString())));
        }
        Collections.sort(sorted, new SortByCount());
        if (sorted.size() > topnum) {
            sorted = sorted.subList(0, topnum);
        }
        return sorted;
    }

    @Override
    public void reduce(Text apmac, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        BasicBSONObject clientmacmap = new BasicBSONObject();
        int count = 0;
        for (Text value : values) {
            String subline = value.toString();
            // The first 17 characters are the MAC address ("aa:bb:cc:dd:ee:ff").
            String clientmac = subline.substring(0, 17);
            String url = subline.substring(17);
            BasicBSONObject urlmap = (BasicBSONObject) clientmacmap
                    .get(clientmac);
            if (urlmap == null) {
                urlmap = new BasicBSONObject();
                clientmacmap.put(clientmac, urlmap);
            }
            Object eveurl = urlmap.get(url);
            // Fixed: the old test (eveurl == null && !url.equals(" ")) sent a
            // null eveurl with url " " into the else branch -> NPE on
            // eveurl.toString().
            if (eveurl == null) {
                urlmap.put(url, 1);
            } else {
                urlmap.put(url, Integer.parseInt(eveurl.toString()) + 1);
            }
            count++;
            // Periodically shrink the current client's map so memory stays bounded.
            if (count == 1000) {
                count = 0; // fixed: reset so compression repeats every 1000 values
                List<UrlCount> top = compresstopobj(urlmap, 100);
                BasicBSONObject compressed = new BasicBSONObject();
                for (int i = 0; i < top.size(); i++) {
                    UrlCount cuc = top.get(i);
                    compressed.put(cuc.Url, cuc.Count);
                }
                // Fixed: store the compressed map back — the old code only
                // reassigned the local variable, so compression had no effect.
                clientmacmap.put(clientmac, compressed);
            }
        }
        // Final pass (added for consistency with the Combine variant): clients
        // that never hit the 1000-value threshold are also trimmed to top-100.
        for (Map.Entry<String, Object> entry : clientmacmap.entrySet()) {
            BasicBSONObject urlmap = (BasicBSONObject) entry.getValue();
            List<UrlCount> top = compresstopobj(urlmap, 100);
            BasicBSONObject compressed = new BasicBSONObject();
            for (int i = 0; i < top.size(); i++) {
                UrlCount cuc = top.get(i);
                compressed.put(cuc.Url, cuc.Count);
            }
            entry.setValue(compressed);
        }
        context.write(apmac, new BSONWritable(clientmacmap));
    }
}

Mongo collection 数据格式

{
"_id" : ObjectId("54d83f3548c9bc218e056ce6"),"apMac" : "aa:bb:cc:dd:ee:ff","proto" : "http",
"url" : "extshort.weixin.qq.comhttp",
"clientMac" : "ff:ee:dd:cc:bb:aa"
}

clientMac和url 先拼在一起,再按mac长度分割

数据流程

orgin->map

map:[{"aa:bb:cc:dd:ee:ff":[ff:ee:dd:cc:bb:aaextshort.weixin.qq.comhttp]}]

假如是多条数据则

map:[{"aa:bb:cc:dd:ee:ff":["ff:ee:dd:cc:bb:aaextshort.weixin.qq.comhttp","ff:ee:dd:cc:bb:aaextshort.weixin.qq.comhttp1","ff:ee:dd:cc:bb:aaextshort.weixin.qq.comhttp2"]}]

map->combine

如果有相同的client+url 则统计个数,以|分隔

combine:[{"aa:bb:cc:dd:ee:ff":[ff:ee:dd:cc:bb:aaextshort.weixin.qq.comhttp|100]}]

combine->reducer

reducer中 按mac长度分割出 clientMac url 再按“|”分割出 个数

统计前每个clientMac的前100条

reduce:

{
"_id": "00:21:26:00:0A:FF",
"aa:bb:cc:1c:b9:8f": {
"c}tieba}baidu}com|": 1,
"short}weixin}qq}comhttp:|": 1,
"get}sogou}com|": 1,
"md}openapi}360}cn|": 1,
"74}125}235}224|": 1,
"mmbiz}qpic}cn|": 1,
"tb}himg}baidu}com|": 1
},
"cc:bb:aa:d5:30:8a": {
"captive}apple}com|": 2,
"www}airport}us|": 1,
"www}itools}info|": 2,
"www}thinkdifferent}us|": 1,
"www}ibook}info|": 1
},
"ee:ee:bb:78:31:74": {
"www}itools}info|": 1,
"www}ibook}info|": 1
} }

hadoop-mongo map/reduce java的更多相关文章

  1. 大文本 通过 hadoop spark map reduce 获取 特征列 的 属性值 计算速度

    大文本 通过 hadoop spark map reduce   获取 特征列  的 属性值  计算速度

  2. Hadoop 少量map/reduce任务执行慢问题

    最近在做报表统计,跑hadoop任务. 之前也跑过map/reduce但是数据量不大,遇到某些map/reduce执行时间特别长的问题. 执行时间长有几种可能性: 1. 单个map/reduce任务处 ...

  3. hadoop编译map/reduce时的问题

    参考链接 http://hadoop.apache.org/common/docs/stable/mapred_tutorial.html http://blog.endlesscode.com/20 ...

  4. hadoop入门级总结二:Map/Reduce

    在上一篇博客:hadoop入门级总结一:HDFS中,简单的介绍了hadoop分布式文件系统HDFS的整体框架及文件写入读出机制.接下来,简要的总结一下hadoop的另外一大关键技术之一分布式计算框架: ...

  5. MapReduce启动的Map/Reduce子任务简要分析

      对于Hadoop来说,是通过在DataNode中启动Map/Reduce java进程的方式来实现分布式计算处理的,那么就从源码层简要分析一下hadoop中启动Map/Reduce任务的过程.   ...

  6. Map Reduce和流处理

    欢迎大家前往腾讯云+社区,获取更多腾讯海量技术实践干货哦~ 本文由@从流域到海域翻译,发表于腾讯云+社区 map()和reduce()是在集群式设备上用来做大规模数据处理的方法,用户定义一个特定的映射 ...

  7. 马士兵hadoop第五课:java开发Map/Reduce

    马士兵hadoop第一课:虚拟机搭建和安装hadoop及启动 马士兵hadoop第二课:hdfs集群集中管理和hadoop文件操作 马士兵hadoop第三课:java开发hdfs 马士兵hadoop第 ...

  8. 马士兵hadoop第五课:java开发Map/Reduce(转)

    马士兵hadoop第一课:虚拟机搭建和安装hadoop及启动 马士兵hadoop第二课:hdfs集群集中管理和hadoop文件操作 马士兵hadoop第三课:java开发hdfs 马士兵hadoop第 ...

  9. Java操作Hadoop、Map、Reduce合成

    原始数据: Map阶段 1.每次读一行数据, 2.拆分每行数据, 3.每个单词碰到一次写个1 <0, "hello tom"> <10, "hello ...

随机推荐

  1. PL/SQL Developer 和 instantclient客户端安装配置(图文)

    一: PL/SQL Developer 安装 下载安装文件安装,我这里的版本号是PLSQL7.1.4.1391,安装目录是:D:\soft\PLSQLDeveloper 二:instantclient ...

  2. 在sql server使用链接服务器中访问mysql

    ----创建ODBC链接EXEC sp_addlinkedserver @server = 'MySQL', @srvproduct='MySql' , @provider = 'MSDASQL', ...

  3. ORACLE常见数据类型详解

    1.字符类型 • CHAR:一个定长字符串,当位数不足自动用空格填充来达到其最大长度.如非NULL的CHAR(12)总是包含12字节信息.CHAR字段最多可以存储2,000字节的 信息. • VARC ...

  4. WebView 调试

      if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.KITKAT) {            WebView.setWebContentsDeb ...

  5. Kolmogorov 的数学观与业绩

    https://www.douban.com/group/topic/11395706/ 作者:伊藤清 当我得知苏联伟大的数学家,84岁的 Andreyii Nikolaevich Kolmogoro ...

  6. JVM配置

    1.堆设置 JVM中最大堆大小有三方面限制:操作系统位数(32-bt还是64-bit)限制:可用虚拟内存限制:系统的可用物理内存限制. java -Xmx3550m -Xms3550m -Xmn2g  ...

  7. DHCP服务器原理

    DHCP服务器   port:67 DHCP 这个服务可以自动的分配 IP 与相关的网络参数给客户端, 来提供客户端自动以服务器提供的参数来设定他们的网络   12.1 DHCP 运作的原理      ...

  8. Replace Pioneer 注册

    批量文本替换工具,Replace Pioneer 注册:http://www.mind-pioneer.com

  9. 使用用Generic.xaml加载默认的主题资源

    把Resource嵌入到Generic.xaml文件中,并把该文件放到应用程序的Themes主题文件夹下面,这们Generic.xaml文件中的资源就可以被系统识别为默认主题一部分,从而进行使用. 为 ...

  10. 机器学习PR:k近邻法分类

    k近邻法是一种基本分类与回归方法.本章只讨论k近邻分类,回归方法将在随后专题中进行. 它可以进行多类分类,分类时根据在样本集合中其k个最近邻点的类别,通过多数表决等方式进行预测,因此不具有显式的学习过 ...