官方 http://docs.mongodb.org/ecosystem/tutorial/getting-started-with-hadoop/

mongo-hadoop项目地址 https://github.com/mongodb/mongo-hadoop

该代码托管 https://github.com/cclient/mongo_hadoop_map-reduce

原分析 由nodejs+async编写

用游标迭代查询mongo数据库,分析数据

因数据量较大,目前执行分析任务耗时4个小时,这只是极限数据量的1%

为优化,采用hadoop-mongo 方案

优点:mongo只能单机单线程(不作shard的情况),hadoop-mongo可以集群处理。

完成代码

近期一直写的脚本语言,再回头写点JAVA,好悲催,感觉很受限制。

初步代码 很粗糙

MAIN 入口

 package group.artifactid;

 //cc MaxTemperature Application to find the maximum temperature in the weather dataset
//vv MaxTemperature
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import com.mongodb.hadoop.MongoConfig;
import com.mongodb.hadoop.io.BSONWritable;
import com.mongodb.hadoop.util.MongoTool; import com.mongodb.hadoop.MongoConfig;
import com.mongodb.hadoop.MongoInputFormat;
import com.mongodb.hadoop.MongoOutputFormat;
import com.mongodb.hadoop.util.MapredMongoConfigUtil;
import com.mongodb.hadoop.util.MongoConfigUtil;
import com.mongodb.hadoop.util.MongoTool;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.util.ToolRunner;

/**
 * Job driver: reads documents from db1.collection1, aggregates per-AP
 * client/url hit counts through the mapper + combiner + reducer pipeline,
 * and writes the resulting BSON documents to db2.collection2.
 */
public class MongoMaxTemperature extends MongoTool {
    public MongoMaxTemperature() {
        // Build the Hadoop configuration and wrap it with the mongo-hadoop helper.
        Configuration hadoopConf = new Configuration();
        MongoConfig mongoConfig = new MongoConfig(hadoopConf);
        setConf(hadoopConf);

        // Input/output formats come from the mongo-hadoop connector.
        MongoConfigUtil.setInputFormat(getConf(), MongoInputFormat.class);
        MongoConfigUtil.setOutputFormat(getConf(), MongoOutputFormat.class);

        // Source and destination collections.
        mongoConfig.setInputURI("mongodb://localhost:27017/db1.collection1");
        mongoConfig.setOutputURI("mongodb://localhost:27017/db2.collection2");

        // Map / combine / reduce pipeline classes.
        mongoConfig.setMapper(MongoMaxTemperatureMapper.class);
        mongoConfig.setCombiner(MongoMaxTemperatureCombine.class);
        mongoConfig.setReducer(MongoMaxTemperatureReducerCombine.class);

        // Intermediate and final key/value types.
        mongoConfig.setMapperOutputKey(Text.class);
        mongoConfig.setMapperOutputValue(Text.class);
        mongoConfig.setOutputKey(Text.class);
        mongoConfig.setOutputValue(BSONWritable.class);
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new MongoMaxTemperature(), args));
    }
}

MAPPER代码

package group.artifactid;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.bson.BSONObject; import com.mongodb.hadoop.io.BSONWritable;

/**
 * Mapper: for every http request document, emits
 * (apMac, clientMac + host), where host is the url stripped down to its
 * host part with '.' replaced by '}' (MongoDB field names may not contain '.').
 */
public class MongoMaxTemperatureMapper extends
        Mapper<Object, BSONObject, Text, Text> {
    @Override
    public void map(final Object key, BSONObject val, Context context)
            throws IOException, InterruptedException {
        String apmac = (String) val.get("apMac");
        String clientmac = (String) val.get("clientMac");
        String url = (String) val.get("url");
        String proto = (String) val.get("proto");
        // Fixed: documents missing any of these fields previously caused a
        // NullPointerException (e.g. proto.equals(...) on a null proto) and
        // killed the task. Skip incomplete records instead.
        if (apmac == null || clientmac == null || url == null || proto == null) {
            return;
        }
        if (proto.equals("http") && !url.isEmpty()) {
            // Strip the scheme prefix, if present.
            if (url.startsWith("http://")) {
                url = url.substring(7);
            }
            // Keep only the host part (drop path and query string).
            int firstslash = url.indexOf('/');
            if (firstslash > -1) {
                url = url.substring(0, firstslash);
            }
            // '.' is not allowed in MongoDB keys; temporarily map it to '}'.
            url = url.replace('.', '}');
            context.write(new Text(apmac), new Text(clientmac + url));
        }
    }
}

COMBINE代码

package group.artifactid;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import com.mongodb.hadoop.io.BSONWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.bson.BasicBSONObject;

/**
 * Reducer (combine-aware): aggregates per-AP maps of
 * clientMac -> { url -> hit count } and keeps only the top-100 urls per client.
 *
 * Input values are either "&lt;17-char clientMac&gt;&lt;url&gt;" (straight from the
 * mapper) or "&lt;17-char clientMac&gt;&lt;url&gt;|&lt;count&gt;" (pre-aggregated by the
 * combiner).
 */
public class MongoMaxTemperatureReducerCombine extends
        Reducer<Text, Text, Text, BSONWritable> {

    /** Simple (url, count) pair used while sorting. */
    public class UrlCount {
        public UrlCount(String url, int count) {
            this.Url = url;
            this.Count = count;
        }
        String Url;
        int Count;
    }

    /**
     * Sorts the url->count entries by count, most-visited first, and returns
     * at most {@code topnum} of them.
     */
    public List<UrlCount> compresstopobj(BasicBSONObject topobj, int topnum) {
        List<UrlCount> sorted = new ArrayList<UrlCount>();
        for (Map.Entry<String, Object> entry : topobj.entrySet()) {
            sorted.add(new UrlCount(entry.getKey(),
                    Integer.parseInt(entry.getValue().toString())));
        }
        // Descending by count; this sort only selects the top entries — the
        // order is NOT preserved once they are stored back into a BSON object.
        Collections.sort(sorted, new Comparator<UrlCount>() {
            @Override
            public int compare(UrlCount o1, UrlCount o2) {
                // Integer.compare avoids the subtraction-overflow pitfall.
                return Integer.compare(o2.Count, o1.Count);
            }
        });
        if (sorted.size() > topnum) {
            sorted = sorted.subList(0, topnum);
        }
        return sorted;
    }

    @Override
    public void reduce(Text apmac, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        BasicBSONObject clientmacmap = new BasicBSONObject();
        int count = 0;
        for (Text value : values) {
            String subline = value.toString();
            // The first 17 characters are the MAC address ("aa:bb:cc:dd:ee:ff").
            String clientmac = subline.substring(0, 17);
            int sepindex = subline.indexOf("|");
            int maplastcount = 1;
            String url;
            if (sepindex > -1) {
                sepindex++;
                // NOTE(review): the '|' separator is deliberately kept at the
                // end of the url key — the published sample output shows keys
                // such as "get}sogou}com|". Confirm before changing the format.
                url = subline.substring(17, sepindex);
                maplastcount = Integer.parseInt(subline.substring(sepindex));
            } else {
                url = subline.substring(17);
            }
            BasicBSONObject urlmap = (BasicBSONObject) clientmacmap
                    .get(clientmac);
            if (urlmap == null) {
                urlmap = new BasicBSONObject();
                clientmacmap.put(clientmac, urlmap);
            }
            Object eveurl = urlmap.get(url);
            // Fixed: the old test (eveurl == null && !url.equals(" ")) sent a
            // null eveurl with url " " into the else branch -> NPE on
            // eveurl.toString().
            if (eveurl == null) {
                urlmap.put(url, maplastcount);
            } else {
                urlmap.put(url, Integer.parseInt(eveurl.toString())
                        + maplastcount);
            }
            count++;
            // Periodically shrink the current client's map so memory stays bounded.
            if (count == 10000) {
                count = 0; // fixed: reset so compression repeats every 10000 values
                List<UrlCount> top = compresstopobj(urlmap, 100);
                BasicBSONObject compressed = new BasicBSONObject();
                for (int i = 0; i < top.size(); i++) {
                    UrlCount cuc = top.get(i);
                    compressed.put(cuc.Url, cuc.Count);
                }
                // Fixed: store the compressed map back — the old code only
                // reassigned the local variable, so compression had no effect.
                clientmacmap.put(clientmac, compressed);
            }
        }
        // Final pass: keep only the top-100 urls for every client.
        for (Map.Entry<String, Object> entry : clientmacmap.entrySet()) {
            BasicBSONObject urlmap = (BasicBSONObject) entry.getValue();
            List<UrlCount> top = compresstopobj(urlmap, 100);
            BasicBSONObject compressed = new BasicBSONObject();
            for (int i = 0; i < top.size(); i++) {
                UrlCount cuc = top.get(i);
                compressed.put(cuc.Url, cuc.Count);
            }
            // Fixed: setValue stores the result; the old code dropped it.
            entry.setValue(compressed);
        }
        context.write(apmac, new BSONWritable(clientmacmap));
    }
}

REDUCER代码

package group.artifactid;

import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet; import com.mongodb.hadoop.io.BSONWritable; import org.apache.commons.io.output.ByteArrayOutputStream;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.StringUtils;
import org.apache.zookeeper.server.util.SerializeUtils;
import org.bson.BasicBSONObject;

/**
 * Final reducer (no combiner variant): merges mapper values
 * "&lt;17-char clientMac&gt;&lt;url&gt;" into a per-AP document
 * { clientMac : { url : count } }, keeping the top-100 urls per client.
 */
public class MongoMaxTemperatureReducer extends
        Reducer<Text, Text, Text, BSONWritable> {

    /** Simple (url, count) pair used while sorting. */
    public class UrlCount {
        public UrlCount(String url, int count) {
            this.Url = url;
            this.Count = count;
        }
        String Url;
        int Count;
    }

    /** Orders urls by hit count, most-visited first. */
    class SortByCount implements Comparator<UrlCount> {
        public int compare(UrlCount s1, UrlCount s2) {
            // Fixed: the old comparator returned only 1 or 0, which violates
            // the Comparator contract (TimSort can throw "Comparison method
            // violates its general contract!") and sorted ascending, so
            // subList(0, topnum) kept the LEAST-visited urls. Descending order
            // matches MongoMaxTemperatureReducerCombine.
            return Integer.compare(s2.Count, s1.Count);
        }
    }

    /**
     * Sorts the url->count entries by count, most-visited first, and returns
     * at most {@code topnum} of them.
     */
    public List<UrlCount> compresstopobj(BasicBSONObject topobj, int topnum) {
        List<UrlCount> sorted = new ArrayList<UrlCount>();
        for (Map.Entry<String, Object> entry : topobj.entrySet()) {
            // Fixed: removed the per-entry System.out.print debug left in this
            // hot path.
            sorted.add(new UrlCount(entry.getKey(),
                    Integer.parseInt(entry.getValue().toString())));
        }
        Collections.sort(sorted, new SortByCount());
        if (sorted.size() > topnum) {
            sorted = sorted.subList(0, topnum);
        }
        return sorted;
    }

    @Override
    public void reduce(Text apmac, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        BasicBSONObject clientmacmap = new BasicBSONObject();
        int count = 0;
        for (Text value : values) {
            String subline = value.toString();
            // The first 17 characters are the MAC address ("aa:bb:cc:dd:ee:ff").
            String clientmac = subline.substring(0, 17);
            String url = subline.substring(17);
            BasicBSONObject urlmap = (BasicBSONObject) clientmacmap
                    .get(clientmac);
            if (urlmap == null) {
                urlmap = new BasicBSONObject();
                clientmacmap.put(clientmac, urlmap);
            }
            Object eveurl = urlmap.get(url);
            // Fixed: the old test (eveurl == null && !url.equals(" ")) sent a
            // null eveurl with url " " into the else branch -> NPE on
            // eveurl.toString().
            if (eveurl == null) {
                urlmap.put(url, 1);
            } else {
                urlmap.put(url, Integer.parseInt(eveurl.toString()) + 1);
            }
            count++;
            // Periodically shrink the current client's map so memory stays bounded.
            if (count == 1000) {
                count = 0; // fixed: reset so compression repeats every 1000 values
                List<UrlCount> top = compresstopobj(urlmap, 100);
                BasicBSONObject compressed = new BasicBSONObject();
                for (int i = 0; i < top.size(); i++) {
                    UrlCount cuc = top.get(i);
                    compressed.put(cuc.Url, cuc.Count);
                }
                // Fixed: store the compressed map back — the old code only
                // reassigned the local variable, so compression had no effect.
                clientmacmap.put(clientmac, compressed);
            }
        }
        // Final pass (added for consistency with the Combine variant): clients
        // that never hit the 1000-value threshold are also trimmed to top-100.
        for (Map.Entry<String, Object> entry : clientmacmap.entrySet()) {
            BasicBSONObject urlmap = (BasicBSONObject) entry.getValue();
            List<UrlCount> top = compresstopobj(urlmap, 100);
            BasicBSONObject compressed = new BasicBSONObject();
            for (int i = 0; i < top.size(); i++) {
                UrlCount cuc = top.get(i);
                compressed.put(cuc.Url, cuc.Count);
            }
            entry.setValue(compressed);
        }
        context.write(apmac, new BSONWritable(clientmacmap));
    }
}

Mongo collection 数据格式

{
"_id" : ObjectId("54d83f3548c9bc218e056ce6"),"apMac" : "aa:bb:cc:dd:ee:ff","proto" : "http",
"url" : "extshort.weixin.qq.comhttp",
"clientMac" : "ff:ee:dd:cc:bb:aa"
}

clientMac和url 先拼在一起,再按mac长度分割

数据流程

orgin->map

map:[{"aa:bb:cc:dd:ee:ff":[ff:ee:dd:cc:bb:aaextshort.weixin.qq.comhttp]}]

假如是多条数据则

map:[{"aa:bb:cc:dd:ee:ff":["ff:ee:dd:cc:bb:aaextshort.weixin.qq.comhttp","ff:ee:dd:cc:bb:aaextshort.weixin.qq.comhttp1","ff:ee:dd:cc:bb:aaextshort.weixin.qq.comhttp2"]}]

map->combine

如果有相同的client+url 则统计个数,以|分隔

combine:[{"aa:bb:cc:dd:ee:ff":[ff:ee:dd:cc:bb:aaextshort.weixin.qq.comhttp|100]}]

combine->reducer

reducer中 按mac长度分割出 clientMac url 再按“|”分割出 个数

统计前每个clientMac的前100条

reduce:

{
"_id": "00:21:26:00:0A:FF",
"aa:bb:cc:1c:b9:8f": {
"c}tieba}baidu}com|": 1,
"short}weixin}qq}comhttp:|": 1,
"get}sogou}com|": 1,
"md}openapi}360}cn|": 1,
"74}125}235}224|": 1,
"mmbiz}qpic}cn|": 1,
"tb}himg}baidu}com|": 1
},
"cc:bb:aa:d5:30:8a": {
"captive}apple}com|": 2,
"www}airport}us|": 1,
"www}itools}info|": 2,
"www}thinkdifferent}us|": 1,
"www}ibook}info|": 1
},
"ee:ee:bb:78:31:74": {
"www}itools}info|": 1,
"www}ibook}info|": 1
} }

hadoop-mongo map/reduce java的更多相关文章

  1. 大文本 通过 hadoop spark map reduce 获取 特征列 的 属性值 计算速度

    大文本 通过 hadoop spark map reduce   获取 特征列  的 属性值  计算速度

  2. Hadoop 少量map/reduce任务执行慢问题

    最近在做报表统计,跑hadoop任务. 之前也跑过map/reduce但是数据量不大,遇到某些map/reduce执行时间特别长的问题. 执行时间长有几种可能性: 1. 单个map/reduce任务处 ...

  3. hadoop编译map/reduce时的问题

    参考链接 http://hadoop.apache.org/common/docs/stable/mapred_tutorial.html http://blog.endlesscode.com/20 ...

  4. hadoop入门级总结二:Map/Reduce

    在上一篇博客:hadoop入门级总结一:HDFS中,简单的介绍了hadoop分布式文件系统HDFS的整体框架及文件写入读出机制.接下来,简要的总结一下hadoop的另外一大关键技术之一分布式计算框架: ...

  5. MapReduce启动的Map/Reduce子任务简要分析

      对于Hadoop来说,是通过在DataNode中启动Map/Reduce java进程的方式来实现分布式计算处理的,那么就从源码层简要分析一下hadoop中启动Map/Reduce任务的过程.   ...

  6. Map Reduce和流处理

    欢迎大家前往腾讯云+社区,获取更多腾讯海量技术实践干货哦~ 本文由@从流域到海域翻译,发表于腾讯云+社区 map()和reduce()是在集群式设备上用来做大规模数据处理的方法,用户定义一个特定的映射 ...

  7. 马士兵hadoop第五课:java开发Map/Reduce

    马士兵hadoop第一课:虚拟机搭建和安装hadoop及启动 马士兵hadoop第二课:hdfs集群集中管理和hadoop文件操作 马士兵hadoop第三课:java开发hdfs 马士兵hadoop第 ...

  8. 马士兵hadoop第五课:java开发Map/Reduce(转)

    马士兵hadoop第一课:虚拟机搭建和安装hadoop及启动 马士兵hadoop第二课:hdfs集群集中管理和hadoop文件操作 马士兵hadoop第三课:java开发hdfs 马士兵hadoop第 ...

  9. Java操作Hadoop、Map、Reduce合成

    原始数据: Map阶段 1.每次读一行数据, 2.拆分每行数据, 3.每个单词碰到一次写个1 <0, "hello tom"> <10, "hello ...

随机推荐

  1. PL/SQL Developer 和 instantclient客户端安装配置(图文)

    一: PL/SQL Developer 安装 下载安装文件安装,我这里的版本号是PLSQL7.1.4.1391,安装目录是:D:\soft\PLSQLDeveloper 二:instantclient ...

  2. 在sql server使用链接服务器中访问mysql

    ----创建ODBC链接EXEC sp_addlinkedserver @server = 'MySQL', @srvproduct='MySql' , @provider = 'MSDASQL', ...

  3. ORACLE常见数据类型详解

    1.字符类型 • CHAR:一个定长字符串,当位数不足自动用空格填充来达到其最大长度.如非NULL的CHAR(12)总是包含12字节信息.CHAR字段最多可以存储2,000字节的 信息. • VARC ...

  4. WebView 调试

      if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.KITKAT) {            WebView.setWebContentsDeb ...

  5. Kolmogorov 的数学观与业绩

    https://www.douban.com/group/topic/11395706/ 作者:伊藤清 当我得知苏联伟大的数学家,84岁的 Andreyii Nikolaevich Kolmogoro ...

  6. JVM配置

    1.堆设置 JVM中最大堆大小有三方面限制:操作系统位数(32-bt还是64-bit)限制:可用虚拟内存限制:系统的可用物理内存限制. java -Xmx3550m -Xms3550m -Xmn2g  ...

  7. DHCP服务器原理

    DHCP服务器   port:67 DHCP 这个服务可以自动的分配 IP 与相关的网络参数给客户端, 来提供客户端自动以服务器提供的参数来设定他们的网络   12.1 DHCP 运作的原理      ...

  8. Replace Pioneer 注册

    批量文本替换工具,Replace Pioneer 注册:http://www.mind-pioneer.com

  9. 使用用Generic.xaml加载默认的主题资源

    把Resource嵌入到Generic.xaml文件中,并把该文件放到应用程序的Themes主题文件夹下面,这们Generic.xaml文件中的资源就可以被系统识别为默认主题一部分,从而进行使用. 为 ...

  10. 机器学习PR:k近邻法分类

    k近邻法是一种基本分类与回归方法.本章只讨论k近邻分类,回归方法将在随后专题中进行. 它可以进行多类分类,分类时根据在样本集合中其k个最近邻点的类别,通过多数表决等方式进行预测,因此不具有显式的学习过 ...