Apriori on MapReduce

Apriori算法在Hadoop MapReduce上的实现

输入格式：

一行为一个Bucket（即一条事务），行内各item之间以空白字符分隔

1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 12 13 15 17 19 21 23 25 27 29 31 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 12 13 16 17 19 21 23 25 27 29 31 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 11 13 15 17 20 21 23 25 27 29 31 34 36 38 40 42 44 47 48 50 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 34 36 38 40 42 44 46 48 51 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 34 36 38 40 42 44 46 48 51 52 54 56 58 60 63 64 66 68 70 72 74

1 3 5 7 9 11 13 15 17 20 21 23 25 27 29 31 34 36 38 40 42 44 47 48 51 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 12 13 15 17 19 21 24 25 27 29 31 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 11 13 15 17 19 21 24 25 27 29 31 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 65 66 68 70 72 74

1 3 5 7 9 11 13 16 17 19 21 24 25 27 29 31 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 12 13 16 17 19 21 24 25 27 29 31 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 11 13 15 17 20 21 24 25 27 29 31 34 36 38 40 42 44 47 48 50 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 11 13 15 17 20 21 24 25 27 29 31 34 36 38 40 42 44 47 48 50 52 54 56 58 60 62 65 66 68 70 72 74

1 3 5 7 9 11 13 15 17 20 21 24 25 27 29 31 34 36 38 40 43 44 47 48 50 52 54 56 58 60 62 65 66 68 70 72 74

输出格式：

<item1,item2,...,itemK, frequency>，即每行输出一个频繁项集：各item以逗号连接，其后以空白字符（Hadoop文本输出默认为制表符）分隔并给出该项集出现的次数

代码：

 package apriori;

 import java.io.IOException;

 import java.util.Iterator;

 import java.util.StringTokenizer;

 import java.util.List;

 import java.util.ArrayList;

 import java.util.Collections;

 import java.util.Map;

 import java.util.HashMap;

 import java.io.*;

 import org.apache.hadoop.conf.Configuration;

 import org.apache.hadoop.conf.Configured;

 import org.apache.hadoop.fs.Path;

 import org.apache.hadoop.fs.FileSystem;

 import org.apache.hadoop.io.Text;

 import org.apache.hadoop.io.IntWritable;

 import org.apache.hadoop.mapreduce.Job;

 import org.apache.hadoop.mapreduce.Mapper;

 import org.apache.hadoop.mapreduce.Mapper.Context;

 import org.apache.hadoop.mapreduce.Reducer;

 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

 import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;

 import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;

 import org.apache.hadoop.util.Tool;

 import org.apache.hadoop.util.ToolRunner;

 class AprioriPass1Mapper extends Mapper<Object,Text,Text,IntWritable>{

     private final static IntWritable one = new IntWritable(1);

     private Text number = new Text();

     //第一次pass的Mapper只要把每个item映射为1

     public void map(Object key,Text value,Context context) throws IOException,InterruptedException{

         String[] ids = value.toString().split("[\\s\\t]+");

         for(int i = 0;i < ids.length;i++){

             context.write(new Text(ids[i]),one);

         }

     }

 }

 /**
  * Reducer shared by the jobs of every pass: totals the counts emitted for one itemset and
  * writes the itemset only when that total is strictly greater than the "minSup" setting
  * (5 when the setting is absent).
  */
 class AprioriReducer extends Reducer<Text,IntWritable,Text,IntWritable>{

     private IntWritable result = new IntWritable();

     /**
      * @param key     the itemset being counted
      * @param values  partial counts emitted for that itemset
      * @param context sink for the {@code (itemset, total)} pair
      */
     public void reduce(Text key,Iterable<IntWritable> values,Context context) throws IOException,InterruptedException{
         final int threshold = context.getConfiguration().getInt("minSup",5);

         int total = 0;
         Iterator<IntWritable> counts = values.iterator();
         while(counts.hasNext()){
             total += counts.next().get();
         }
         result.set(total);

         // Itemsets whose total does not exceed the threshold are dropped.
         if(total <= threshold){
             return;
         }
         context.write(key,result);
     }
 }

 /**
  * Mapper used by every pass after the first.
  *
  * <p>{@code setup} loads the itemsets written by the previous pass and derives this pass's
  * candidate itemsets from them (the Apriori join and prune steps). {@code map} then emits
  * {@code ("i1,i2,...,ik", 1)} for every candidate that is contained in an input line.
  */
 class AprioriPassKMapper extends Mapper<Object,Text,Text,IntWritable>{

     private final static IntWritable one = new IntWritable(1);

     // Itemsets read from the previous pass's output.
     private List< List<Integer> > prevItemsets = new ArrayList< List<Integer> >();

     // Candidate itemsets for this pass; every list is sorted ascending.
     private List< List<Integer> > candidateItemsets = new ArrayList< List<Integer> >();

     // candidateKeys.get(i) is the comma-joined output key of candidateItemsets.get(i). It is
     // built once in setup() rather than once per (input line, matching candidate).
     private List<Text> candidateKeys = new ArrayList<Text>();

     /**
      * Reads the previous pass's output and builds the candidate itemsets for this pass.
      *
      * <p>The file is located as {@code <hdfsOutputDirPrefix><passNum - 1>/part-r-00000}, built
      * with the same {@code new Path(prefix + pass)} form the driver uses for the output
      * directory, so relative and absolute prefixes resolve the same way on both sides.
      * NOTE(review): only {@code part-r-00000} is read, which presumes the previous job ran
      * with a single reducer -- confirm before raising the reducer count.
      *
      * @throws IOException if the previous output cannot be opened or read, or contains a
      *         line whose itemset is not a comma-separated list of integers; failing here
      *         avoids silently running the pass with an empty candidate list
      */
     @Override
     public void setup(Context context) throws IOException, InterruptedException{
         int passNum = context.getConfiguration().getInt("passNum",2);
         String prefix = context.getConfiguration().get("hdfsOutputDirPrefix","");
         Path lastPass = new Path(prefix + (passNum - 1),"part-r-00000");
         FileSystem fs = lastPass.getFileSystem(context.getConfiguration());

         BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(lastPass),"UTF-8"));
         try{
             String line;
             while((line = reader.readLine()) != null){
                 line = line.trim();
                 if(line.isEmpty()){
                     continue;
                 }
                 // Each line is "<i1,i2,...><whitespace><count>"; only the itemset is needed.
                 String itemsStr = line.split("[\\s\\t]+")[0];
                 List<Integer> itemset = new ArrayList<Integer>();
                 try{
                     for(String itemStr : itemsStr.split(",")){
                         itemset.add(Integer.parseInt(itemStr));
                     }
                 }catch(NumberFormatException e){
                     throw new IOException("Malformed itemset line in " + lastPass + ": " + line,e);
                 }
                 prevItemsets.add(itemset);
             }
         }finally{
             reader.close();
         }

         //get candidate itemsets from the prev itemsets
         candidateItemsets = getCandidateItemsets(prevItemsets,passNum - 1);

         candidateKeys = new ArrayList<Text>(candidateItemsets.size());
         for(List<Integer> candidate : candidateItemsets){
             candidateKeys.add(new Text(joinItems(candidate)));
         }
     }

     /**
      * Emits {@code (candidateKey, 1)} for every candidate itemset contained in the input line.
      *
      * @param key     input key supplied by the framework; not used
      * @param value   one input line holding whitespace-separated integer items
      * @param context sink for the {@code (candidateKey, 1)} pairs
      */
     @Override
     public void map(Object key,Text value,Context context) throws IOException,InterruptedException{
         List<Integer> itemset = new ArrayList<Integer>();
         for(String id : value.toString().split("[\\s\\t]+")){
             // Blank lines and leading whitespace produce an empty token; skip it.
             if(id.isEmpty()){
                 continue;
             }
             itemset.add(Integer.parseInt(id));
         }
         if(itemset.isEmpty()){
             return;
         }
         // contains() is a merge-style scan and needs both lists in ascending order.
         Collections.sort(itemset);

         // Go through all candidates; this subset test is where most of the time is spent.
         for(int c = 0;c < candidateItemsets.size();c++){
             if(contains(candidateItemsets.get(c),itemset)){
                 context.write(candidateKeys.get(c),one);
             }
         }
     }

     /**
      * Returns whether {@code items} is a subset of {@code allItems}.
      * Both lists must be sorted in ascending order.
      */
     private boolean contains(List<Integer> items,List<Integer> allItems){
         int i = 0;
         int j = 0;
         while(i < items.size() && j < allItems.size()){
             // Unbox before comparing: == on Integer objects compares references and is only
             // reliable for values inside the small Integer cache.
             int wanted = items.get(i);
             int current = allItems.get(j);
             if(current > wanted){
                 // allItems has already moved past the wanted value, so it is absent.
                 return false;
             }else if(current == wanted){
                 j++;
                 i++;
             }else{
                 j++;
             }
         }
         return i == items.size();
     }

     /**
      * Builds the candidate itemsets of size {@code passNum + 1} from the itemsets of size
      * {@code passNum} produced by the previous pass.
      *
      * <p>Two previous itemsets are joined when they differ in exactly one item. The sorted
      * union is kept only if all of its subsets obtained by removing one item appear in
      * {@code prevItemsets}. Each candidate is returned once.
      *
      * @param prevItemsets itemsets produced by the previous pass
      * @param passNum      size of the itemsets in {@code prevItemsets}
      * @return candidate itemsets, each sorted ascending, in first-generated order
      */
     private List< List<Integer> > getCandidateItemsets(List< List<Integer> > prevItemsets, int passNum){
         List< List<Integer> > candidateItemsets = new ArrayList<List<Integer> >();

         // Hash lookups (maps used as sets) replace repeated linear List.contains scans.
         Map<List<Integer>,Boolean> frequent = new HashMap<List<Integer>,Boolean>();
         for(List<Integer> prev : prevItemsets){
             frequent.put(prev,Boolean.TRUE);
         }
         Map<List<Integer>,Boolean> seen = new HashMap<List<Integer>,Boolean>();

         for(int i = 0;i < prevItemsets.size();i++){
             List<Integer> outerItems = prevItemsets.get(i);
             if(outerItems.size() != passNum){
                 // Not an itemset of the expected size; it cannot take part in the join.
                 continue;
             }
             for(int j = i + 1;j < prevItemsets.size();j++){
                 List<Integer> innerItems = prevItemsets.get(j);
                 if(innerItems.size() != passNum){
                     continue;
                 }

                 List<Integer> newItems = null;
                 if(passNum == 1){
                     // Any two single items form a candidate pair.
                     newItems = new ArrayList<Integer>();
                     newItems.add(outerItems.get(0));
                     newItems.add(innerItems.get(0));
                 }
                 else{
                     // Count the outer items missing from inner; stop once there are two.
                     int nDifferent = 0;
                     int index = -1;
                     for(int k = 0; k < passNum && nDifferent < 2;k++){
                         if(!innerItems.contains(outerItems.get(k))){
                             nDifferent++;
                             index = k;
                         }
                     }
                     if(nDifferent == 1){
                         newItems = new ArrayList<Integer>();
                         newItems.addAll(innerItems);
                         newItems.add(outerItems.get(index));
                     }
                 }
                 if(newItems == null){continue;}

                 Collections.sort(newItems);
                 if(seen.containsKey(newItems)){
                     continue;
                 }
                 // Remember rejected unions too, so each union is pruned at most once.
                 seen.put(newItems,Boolean.TRUE);

                 // A candidate is valid only if all its one-item-smaller subsets are in the
                 // previous pass's output.
                 if(isCandidate(newItems,frequent)){
                     candidateItemsets.add(newItems);
                 }
             }
         }
         return candidateItemsets;
     }

     /**
      * Returns whether every subset of {@code newItems} obtained by removing one item is a
      * key of {@code frequent}.
      */
     private boolean isCandidate(List<Integer> newItems,Map<List<Integer>,Boolean> frequent){
         for(List<Integer> subset : getSubsets(newItems)){
             if(!frequent.containsKey(subset)){
                 return false;
             }
         }
         return true;
     }

     /** Returns the subsets of {@code items} obtained by removing exactly one element. */
     private List<List<Integer>> getSubsets(List<Integer> items){
         List<List<Integer>> subsets = new ArrayList<List<Integer>>();
         for(int i = 0;i < items.size();i++){
             List<Integer> subset = new ArrayList<Integer>(items);
             // The int argument selects remove(int index), i.e. it drops the i-th element.
             subset.remove(i);
             subsets.add(subset);
         }
         return subsets;
     }

     /** Joins the items with commas, e.g. {@code [1, 3, 5]} becomes {@code "1,3,5"}. */
     private static String joinItems(List<Integer> items){
         StringBuilder joined = new StringBuilder();
         for(int i = 0;i < items.size();i++){
             if(i > 0){
                 joined.append(',');
             }
             joined.append(items.get(i));
         }
         return joined.toString();
     }
 }

 public class Apriori extends Configured implements Tool{

     public static int s;

     public static int k;

     public int run(String[] args)throws IOException,InterruptedException,ClassNotFoundException{

         long startTime = System.currentTimeMillis();

         String hdfsInputDir = args[0];        //从参数1中读取输入数据

         String hdfsOutputDirPrefix = args[1];    //参数2为输出数据前缀，和第pass次组成输出目录

         s = Integer.parseInt(args[2]);        //阈值

         k = Integer.parseInt(args[3]);        //k次pass

         //循环执行K次pass

         for(int pass = 1; pass <= k;pass++){

             long passStartTime = System.currentTimeMillis();

             //配置执行该job

             if(!runPassKMRJob(hdfsInputDir,hdfsOutputDirPrefix,pass)){

                 return -1;

             }

             long passEndTime = System.currentTimeMillis();

             System.out.println("pass " + pass + " time : " + (passEndTime - passStartTime));

         }

         long endTime = System.currentTimeMillis();

         System.out.println("total time : " + (endTime - startTime));

         return 0;

     }

     private static boolean runPassKMRJob(String hdfsInputDir,String hdfsOutputDirPrefix,int passNum)

             throws IOException,InterruptedException,ClassNotFoundException{

             Configuration passNumMRConf = new Configuration();

             passNumMRConf.setInt("passNum",passNum);

             passNumMRConf.set("hdfsOutputDirPrefix",hdfsOutputDirPrefix);

             passNumMRConf.setInt("minSup",s);

             Job passNumMRJob = new Job(passNumMRConf,"" + passNum);

             passNumMRJob.setJarByClass(Apriori.class);

             if(passNum == 1){

                 //第一次pass的Mapper类特殊对待，不许要构造候选itemsets

                 passNumMRJob.setMapperClass(AprioriPass1Mapper.class);

             }

             else{

                 //第一次之后的pass的Mapper类特殊对待，不许要构造候选itemsets

                 passNumMRJob.setMapperClass(AprioriPassKMapper.class);

             }

             passNumMRJob.setReducerClass(AprioriReducer.class);

             passNumMRJob.setOutputKeyClass(Text.class);

             passNumMRJob.setOutputValueClass(IntWritable.class);

             FileInputFormat.addInputPath(passNumMRJob,new Path(hdfsInputDir));

             FileOutputFormat.setOutputPath(passNumMRJob,new Path(hdfsOutputDirPrefix + passNum));

             return passNumMRJob.waitForCompletion(true);

     }

     public static void main(String[] args) throws Exception{

         int exitCode = ToolRunner.run(new Apriori(),args);

         System.exit(exitCode);

     }

 }

Apriori on MapReduce的更多相关文章

记录近期小改Apriori至MapReduce上的心得
·背景前一阵,一直在研究一些ML的东东,后来工作关系暂停了一阵.现在继续把剩下一些热门的算法再吃吃透,"无聊+逗比"地把他们搞到MapReduce上.这次选择的入手对象为Apri ...
常见数据挖掘算法的Map-Reduce策略(2)
接着上一篇文章常见算法的mapreduce案例(1)继续挖坑,本文涉及到算法的基本原理,文中会大概讲讲,但具体有关公式的推导还请大家去查阅相关的文献文章.下面涉及到的数据挖掘算法会有:L ...
MapReduce实现Apriori算法
Apriori算法在Hadoop MapReduce上的实现输入格式: 一行为一个Bucket 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 34 36 38 ...
#研发解决方案#基于Apriori算法的Nginx+Lua+ELK异常流量拦截方案
郑昀基于杨海波的设计文档创建于2015/8/13 最后更新于2015/8/25 关键词:异常流量.rate limiting.Nginx.Apriori.频繁项集.先验算法.Lua.ELK 本文档 ...
使用Apriori算法和FP-growth算法进行关联分析
系列文章:<机器学习实战>学习笔记最近看了<机器学习实战>中的第11章(使用Apriori算法进行关联分析)和第12章(使用FP-growth算法来高效发现频繁项集).正如章 ...
利用Apriori算法对交通路况的研究
首先简单描述一下Apriori算法:Apriori算法分为频繁项集的产生和规则的产生. Apriori算法频繁项集的产生: 令ck为候选k-项集的集合,而Fk为频繁k-项集的集合. 1.首先通过单遍扫 ...
基于Apriori算法的Nginx+Lua+ELK异常流量拦截方案郑昀基于杨海波的设计文档（转）
郑昀基于杨海波的设计文档创建于2015/8/13 最后更新于2015/8/25 关键词:异常流量.rate limiting.Nginx.Apriori.频繁项集.先验算法.Lua.ELK 本文档 ...
基于Hadoop的改进Apriori算法
一.Apriori算法性质性质一: 候选的k元组集合Ck中,任意k-1个项组成的集合都来自于Lk. 性质二: 若k维数据项目集X={i1,i2,-,ik}中至少存在一个j∈X,使得|L(k-1)(j ...
海量数据挖掘MMDS week2: 频繁项集挖掘 Apriori算法的改进：非hash方法
http://blog.csdn.net/pipisorry/article/details/48914067 海量数据挖掘Mining Massive Datasets(MMDs) -Jure Le ...

随机推荐

UVA 11210 中国麻将
https://uva.onlinejudge.org/index.php?option=com_onlinejudge&Itemid=8&page=show_problem& ...
SQL-Server使用点滴(一)
前言 SQL的语法比较简单,学起来相比界面UI控制要简单得多,但是SQL在企业级应用中又是如此的重要,以至于很多开发人员都把重点放在SQL上. SQL并没有面向对象的概念,最复杂的设计也不过是表值函数 ...
nginx 支持laravel 5.3配置
server { listen ; server_name www.baidu.com.cn; root /data/cehuiren/public; #charset koi8-r; #access ...
SSH邮箱验证与激活
下面是我写的email验证和激活: 自己瞎写的,能用,不喜欢勿喷 action中regist方法中代码 /** * * 发送邮件的方法 */ StringBuffer sb=new StringBuf ...
AWS EC2首次使用VPS
看到AWS有免费一年的试用期,就申请了一个账号.想搭建一个自己的网站.申请之前,你还需要有一张信用卡. 申请AWS账号,登陆控制台进入AWS官网即可申请账号,进入控制台后,就可以新建一个AWS EC ...
用excel做分组散点图
散点图主要观察两组变量间的趋势和分布,如果变量多于两组,仍旧使用散点图的话,那所有点都会集中在同一显示区域内,使人无法准确判断,此时一般使用散点图矩阵进行两两比较.除此之外,如果并不关心组与组之间的关 ...
移动端弹性布局--flex
目前,Flex布局,可以简便.完整.响应式地实现各种页面布局.而且,它已得到了所有浏览器的支持,这意味着,我们能很安全地使用这项功能. 如果你对弹性网页布局感兴趣,那可别错过flex这么好用的属性哦. ...
不刷新页面获取HTML进行显示
$.ajax({ url: "请求地址", dataType: "text", type: "GET" ...
javascript进阶系列专题：闭包(Closure)
在javascript中,函数可看作是一种数据,可以赋值给变量,可以嵌套在另一个函数中. var fun = function(){ console.log("平底斜"); } f ...
教你9个提升 Wordpress 网站安全性的方法
大约一个月前,这个部落格被黑客入侵(编按:Amit Agarwal 的网站).而其他托管于相同主机商的网站像是 ctrlq.org 和2hundredzeros.com 也深受其害,黑客成功从网路上拿 ...

Apriori on MapReduce

输入格式：

输出格式：

代码：

Apriori on MapReduce的更多相关文章

随机推荐

热门专题