MapReduce实现Apriori算法

Apriori算法在Hadoop MapReduce上的实现

输入格式：

每一行为一个 basket（一条事务记录），行内各 item 为整数，以空白字符分隔，并按升序排列

1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 12 13 15 17 19 21 23 25 27 29 31 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 12 13 16 17 19 21 23 25 27 29 31 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 11 13 15 17 20 21 23 25 27 29 31 34 36 38 40 42 44 47 48 50 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 34 36 38 40 42 44 46 48 51 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 34 36 38 40 42 44 46 48 51 52 54 56 58 60 63 64 66 68 70 72 74

1 3 5 7 9 11 13 15 17 20 21 23 25 27 29 31 34 36 38 40 42 44 47 48 51 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 12 13 15 17 19 21 24 25 27 29 31 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 11 13 15 17 19 21 24 25 27 29 31 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 65 66 68 70 72 74

1 3 5 7 9 11 13 16 17 19 21 24 25 27 29 31 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 12 13 16 17 19 21 24 25 27 29 31 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 11 13 15 17 20 21 24 25 27 29 31 34 36 38 40 42 44 47 48 50 52 54 56 58 60 62 64 66 68 70 72 74

1 3 5 7 9 11 13 15 17 20 21 24 25 27 29 31 34 36 38 40 42 44 47 48 50 52 54 56 58 60 62 65 66 68 70 72 74

1 3 5 7 9 11 13 15 17 20 21 24 25 27 29 31 34 36 38 40 43 44 47 48 50 52 54 56 58 60 62 65 66 68 70 72 74

输出格式：

<item1,item2,...itemK, frequency>

代码：

 package apriori;

 import java.io.*;
 import java.io.IOException;

 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.StringTokenizer;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.Mapper.Context;
 import org.apache.hadoop.mapreduce.Reducer;
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
 import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;

 class AprioriPass1Mapper extends Mapper<Object,Text,Text,IntWritable>{

     private final static IntWritable one = new IntWritable(1);

     private Text number = new Text();

     //第一次pass的Mapper只要把每个item映射为1

     public void map(Object key,Text value,Context context) throws IOException,InterruptedException{

         String[] ids = value.toString().split("[\\s\\t]+");

         for(int i = 0;i < ids.length;i++){

             context.write(new Text(ids[i]),one);

         }

     }

 }

 /**
  * Reducer shared by the jobs of every pass: sums the counts emitted for one
  * itemset key and keeps the itemset only when the total exceeds the threshold.
  */
 class AprioriReducer extends Reducer<Text,IntWritable,Text,IntWritable>{

     private IntWritable result = new IntWritable();

     /**
      * Adds up all values of {@code key} and writes {@code (key, total)} when the
      * total is strictly greater than the "minSup" configuration value
      * (default 5). A total equal to "minSup" is dropped.
      */
     public void reduce(Text key,Iterable<IntWritable> values,Context context) throws IOException,InterruptedException{
         final int threshold = context.getConfiguration().getInt("minSup",5);
         int total = 0;
         Iterator<IntWritable> counts = values.iterator();
         while(counts.hasNext()){
             total += counts.next().get();
         }
         result.set(total);
         if(total <= threshold){
             return;
         }
         context.write(key,result);
     }
 }

 /**
  * Mapper used for every pass after the first. Before any record is mapped,
  * {@link #setup} reads the itemsets written by pass k-1 and derives the
  * candidate itemsets of size k from them (the Apriori candidate-generation
  * step). {@link #map} then emits {@code (candidate, 1)} for every candidate
  * contained in the input line.
  */
 class AprioriPassKMapper extends Mapper<Object,Text,Text,IntWritable>{

     private final static IntWritable one = new IntWritable(1);

     // Itemsets read from the previous pass's output file.
     private List< List<Integer> > prevItemsets = new ArrayList< List<Integer> >();

     // Candidate itemsets for this pass; each list is sorted ascending.
     private List< List<Integer> > candidateItemsets = new ArrayList< List<Integer> >();

     // Pre-built output keys ("i1,i2,...,ik"), index-aligned with candidateItemsets.
     private List<Text> candidateKeys = new ArrayList<Text>();

     /**
      * Loads the output of pass {@code passNum - 1} and builds the candidates.
      *
      * Reads "passNum" (default 2) and "hdfsOutputDirPrefix" (default "") from
      * the job configuration; the file read is
      * {@code <prefix><passNum-1>/part-r-00000}. The path is resolved by the
      * file system it belongs to, the same way the driver resolves the output
      * directory, so relative and fully-qualified prefixes both work.
      *
      * NOTE(review): only part-r-00000 is read, so this presumably relies on the
      * previous pass running with a single reducer - confirm.
      *
      * @throws IOException if the previous output cannot be read or contains an
      *                     item that is not an integer
      */
     @Override
     public void setup(Context context) throws IOException, InterruptedException{
         Configuration conf = context.getConfiguration();
         int passNum = conf.getInt("passNum",2);
         String prefix = conf.get("hdfsOutputDirPrefix","");
         Path lastPass = new Path(prefix + (passNum - 1),"part-r-00000");

         // Do not close the FileSystem itself: the instance may be shared.
         FileSystem fs = lastPass.getFileSystem(conf);
         BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(lastPass),"UTF-8"));
         try{
             String line = null;
             while((line = reader.readLine()) != null){
                 String trimmed = line.trim();
                 if(trimmed.length() == 0){
                     continue;
                 }
                 // Each line is "<item1,item2,...><whitespace><count>"; only the itemset is needed.
                 String itemsStr = trimmed.split("[\\s\\t]+")[0];
                 List<Integer> itemset = new ArrayList<Integer>();
                 try{
                     for(String itemStr : itemsStr.split(",")){
                         itemset.add(Integer.valueOf(itemStr));
                     }
                 }catch(NumberFormatException e){
                     // Fail the task instead of silently continuing with partial candidates.
                     throw new IOException("Malformed itemset line in " + lastPass + ": " + line,e);
                 }
                 prevItemsets.add(itemset);
             }
         }finally{
             reader.close();
         }

         // Build the candidate itemsets from the previous pass's itemsets.
         candidateItemsets = getCandidateItemsets(prevItemsets,passNum - 1);

         // Build each output key once here instead of once per matching record.
         candidateKeys = new ArrayList<Text>(candidateItemsets.size());
         for(List<Integer> candidate : candidateItemsets){
             candidateKeys.add(new Text(joinItems(candidate)));
         }
     }

     /**
      * Emits {@code (candidate, 1)} for every candidate itemset that is a subset
      * of the items on this input line.
      *
      * @param key     input key supplied by the input format (unused)
      * @param value   one input line of whitespace-separated integer items
      * @param context receives one pair per contained candidate
      * @throws NumberFormatException if a non-empty token is not an integer
      */
     @Override
     public void map(Object key,Text value,Context context) throws IOException,InterruptedException{
         String[] ids = value.toString().split("[\\s\\t]+");
         List<Integer> itemset = new ArrayList<Integer>(ids.length);
         for(String id : ids){
             // Leading whitespace or a blank line yields an empty token; skip it.
             if(id.length() == 0){
                 continue;
             }
             itemset.add(Integer.valueOf(id));
         }
         if(itemset.isEmpty()){
             return;
         }
         // contains() walks both lists in ascending order, so make sure the
         // transaction really is sorted rather than trusting the input file.
         Collections.sort(itemset);

         // The subset test against every candidate dominates the running time.
         for(int c = 0;c < candidateItemsets.size();c++){
             if(contains(candidateItemsets.get(c),itemset)){
                 context.write(candidateKeys.get(c),one);
             }
         }
     }

     /** Joins the items with "," and no trailing separator, e.g. [1, 3, 5] gives "1,3,5". */
     private static String joinItems(List<Integer> items){
         StringBuilder joined = new StringBuilder();
         for(int i = 0;i < items.size();i++){
             if(i > 0){
                 joined.append(',');
             }
             joined.append(items.get(i));
         }
         return joined.toString();
     }

     /**
      * Returns whether {@code items} is a subset of {@code allItems}.
      * Both lists must be sorted in ascending order.
      */
     private boolean contains(List<Integer> items,List<Integer> allItems){
         int i = 0;
         int j = 0;
         while(i < items.size() && j < allItems.size()){
             // Compare primitive values: == on boxed Integers compares references
             // and is only reliable for small cached values.
             int have = allItems.get(j).intValue();
             int want = items.get(i).intValue();
             if(have > want){
                 // allItems has already passed the wanted item, so it is absent.
                 return false;
             }else if(have == want){
                 j++;
                 i++;
             }else{
                 j++;
             }
         }
         return i == items.size();
     }

     /**
      * Builds the candidate itemsets from the itemsets of the previous pass.
      *
      * @param prevItemsets itemsets kept by the previous pass
      * @param passNum      number of the previous pass; also used as the size of
      *                     each itemset in {@code prevItemsets}
      * @return distinct candidates, each sorted ascending, in first-found order
      */
     private List< List<Integer> > getCandidateItemsets(List< List<Integer> > prevItemsets, int passNum){
         List< List<Integer> > candidateItemsets = new ArrayList<List<Integer> >();
         // Hash-based lookups replace repeated linear List.contains() scans.
         Set< List<Integer> > frequent = new HashSet< List<Integer> >(prevItemsets);
         Set< List<Integer> > seen = new HashSet< List<Integer> >();

         // Combine every pair of previous itemsets into an itemset one item larger.
         for(int i = 0;i < prevItemsets.size();i++){
             for(int j = i + 1;j < prevItemsets.size();j++){
                 List<Integer> outerItems = prevItemsets.get(i);
                 List<Integer> innerItems = prevItemsets.get(j);
                 List<Integer> newItems = null;

                 if(passNum == 1){
                     // Single items: every pair forms a 2-item candidate.
                     newItems = new ArrayList<Integer>();
                     newItems.add(outerItems.get(0));
                     newItems.add(innerItems.get(0));
                 }
                 else{
                     // Count the items of outer that are missing from inner,
                     // stopping as soon as a second one is found.
                     int nDifferent = 0;
                     int index = -1;
                     for(int k = 0; k < passNum && nDifferent < 2;k++){
                         if(!innerItems.contains(outerItems.get(k))){
                             nDifferent++;
                             index = k;
                         }
                     }
                     // Exactly one differing item: inner plus that item is a candidate.
                     if(nDifferent == 1){
                         newItems = new ArrayList<Integer>();
                         newItems.addAll(innerItems);
                         newItems.add(outerItems.get(index));
                     }
                 }

                 if(newItems == null){
                     continue;
                 }
                 Collections.sort(newItems);

                 // Keep a candidate only once, and only if every subset obtained by
                 // removing one item was itself kept by the previous pass.
                 if(!seen.contains(newItems) && isCandidate(newItems,frequent)){
                     seen.add(newItems);
                     candidateItemsets.add(newItems);
                 }
             }
         }
         return candidateItemsets;
     }

     /** Returns true when every one-item-smaller subset of {@code newItems} is in {@code prevItemsets}. */
     private boolean isCandidate(List<Integer> newItems,Set< List<Integer> > prevItemsets){
         for(List<Integer> subset : getSubsets(newItems)){
             if(!prevItemsets.contains(subset)){
                 return false;
             }
         }
         return true;
     }

     /** Returns all subsets of {@code items} obtained by removing exactly one element. */
     private List<List<Integer>> getSubsets(List<Integer> items){
         List<List<Integer>> subsets = new ArrayList<List<Integer>>();
         for(int i = 0;i < items.size();i++){
             List<Integer> subset = new ArrayList<Integer>(items);
             subset.remove(i);    // int argument: removes by index, not by value
             subsets.add(subset);
         }
         return subsets;
     }
 }

 /**
  * Driver that runs the Apriori passes as a sequence of MapReduce jobs, one job
  * per pass. Pass n writes its result to {@code <outputDirPrefix><n>}, and the
  * mapper of pass n+1 reads that directory to build its candidates.
  *
  * Arguments: {@code <hdfsInputDir> <hdfsOutputDirPrefix> <minSup> <passes>}.
  */
 public class Apriori extends Configured implements Tool{

     /** Support threshold parsed from the third argument. */
     public static int s;

     /** Number of passes parsed from the fourth argument. */
     public static int k;

     /**
      * Runs passes 1..k in order and prints the elapsed milliseconds of each
      * pass and of the whole run.
      *
      * @param args input dir, output dir prefix, threshold, number of passes
      * @return 0 on success, -1 if arguments are missing or a job fails
      * @throws NumberFormatException if the threshold or pass count is not an integer
      */
     public int run(String[] args)throws IOException,InterruptedException,ClassNotFoundException{
         if(args == null || args.length < 4){
             System.err.println("Usage: Apriori <hdfsInputDir> <hdfsOutputDirPrefix> <minSup> <passes>");
             return -1;
         }

         long startTime = System.currentTimeMillis();

         String hdfsInputDir = args[0];           // input data directory
         String hdfsOutputDirPrefix = args[1];    // output prefix; the pass number is appended to it
         s = Integer.parseInt(args[2]);           // support threshold
         k = Integer.parseInt(args[3]);           // number of passes

         // Start from the configuration injected by ToolRunner so that generic
         // options (-D, -fs, -conf, ...) are honoured by every job.
         Configuration baseConf = getConf();
         if(baseConf == null){
             baseConf = new Configuration();
         }

         // Run the k passes one after another.
         for(int pass = 1; pass <= k;pass++){
             long passStartTime = System.currentTimeMillis();

             // Configure and run the job of this pass; stop at the first failure.
             if(!runPassKMRJob(baseConf,hdfsInputDir,hdfsOutputDirPrefix,pass)){
                 return -1;
             }

             long passEndTime = System.currentTimeMillis();
             System.out.println("pass " + pass + " time : " + (passEndTime - passStartTime));
         }

         long endTime = System.currentTimeMillis();
         System.out.println("total time : " + (endTime - startTime));

         return 0;
     }

     /**
      * Configures and runs the job of one pass, blocking until it completes.
      *
      * @param baseConf            configuration to copy for this job
      * @param hdfsInputDir        input data directory
      * @param hdfsOutputDirPrefix output prefix; output goes to prefix + passNum
      * @param passNum             1-based pass number
      * @return true if the job completed successfully
      */
     private static boolean runPassKMRJob(Configuration baseConf,String hdfsInputDir,String hdfsOutputDirPrefix,int passNum)
             throws IOException,InterruptedException,ClassNotFoundException{

         // Copy so the per-pass settings do not leak into the shared configuration.
         Configuration passNumMRConf = new Configuration(baseConf);
         passNumMRConf.setInt("passNum",passNum);
         passNumMRConf.set("hdfsOutputDirPrefix",hdfsOutputDirPrefix);
         passNumMRConf.setInt("minSup",s);

         Job passNumMRJob = new Job(passNumMRConf,"" + passNum);
         passNumMRJob.setJarByClass(Apriori.class);

         if(passNum == 1){
             // The first pass only counts single items; no candidates are needed.
             passNumMRJob.setMapperClass(AprioriPass1Mapper.class);
         }
         else{
             // Later passes must first build candidates from the previous pass's output.
             passNumMRJob.setMapperClass(AprioriPassKMapper.class);
         }

         passNumMRJob.setReducerClass(AprioriReducer.class);
         passNumMRJob.setOutputKeyClass(Text.class);
         passNumMRJob.setOutputValueClass(IntWritable.class);

         FileInputFormat.addInputPath(passNumMRJob,new Path(hdfsInputDir));
         FileOutputFormat.setOutputPath(passNumMRJob,new Path(hdfsOutputDirPrefix + passNum));

         return passNumMRJob.waitForCompletion(true);
     }

     /** Entry point: runs the tool through ToolRunner and exits with its return code. */
     public static void main(String[] args) throws Exception{
         int exitCode = ToolRunner.run(new Apriori(),args);
         System.exit(exitCode);
     }
 }

MapReduce实现Apriori算法的更多相关文章

利用Apriori算法对交通路况的研究
首先简单描述一下Apriori算法:Apriori算法分为频繁项集的产生和规则的产生. Apriori算法频繁项集的产生: 令ck为候选k-项集的集合,而Fk为频繁k-项集的集合. 1.首先通过单遍扫 ...
#研发解决方案#基于Apriori算法的Nginx+Lua+ELK异常流量拦截方案
郑昀基于杨海波的设计文档创建于2015/8/13 最后更新于2015/8/25 关键词:异常流量.rate limiting.Nginx.Apriori.频繁项集.先验算法.Lua.ELK 本文档 ...
基于Apriori算法的Nginx+Lua+ELK异常流量拦截方案郑昀基于杨海波的设计文档（转）
郑昀基于杨海波的设计文档创建于2015/8/13 最后更新于2015/8/25 关键词:异常流量.rate limiting.Nginx.Apriori.频繁项集.先验算法.Lua.ELK 本文档 ...
基于Hadoop的改进Apriori算法
一.Apriori算法性质性质一: 候选的k元组集合Ck中,任意k-1个项组成的集合都来自于Lk. 性质二: 若k维数据项目集X={i1,i2,-,ik}中至少存在一个j∈X,使得|L(k-1)(j ...
海量数据挖掘MMDS week2: 频繁项集挖掘 Apriori算法的改进：非hash方法
http://blog.csdn.net/pipisorry/article/details/48914067 海量数据挖掘Mining Massive Datasets(MMDs) -Jure Le ...
Apriori算法的原理与python 实现。
前言:这是一个老故事, 但每次看总是能从中想到点什么.在一家超市里,有一个有趣的现象:尿布和啤酒赫然摆在一起出售.但是这个奇怪的举措却使尿布和啤酒的销量双双增加了.这不是一个笑话,而是发生在美国沃尔玛 ...
数据挖掘算法（四）Apriori算法
参考文献: 关联分析之Apriori算法
机器学习实战 - 读书笔记(11) - 使用Apriori算法进行关联分析
前言最近在看Peter Harrington写的"机器学习实战",这是我的学习心得,这次是第11章 - 使用Apriori算法进行关联分析. 基本概念关联分析(associat ...
关联规则挖掘之apriori算法
前言: 众所周知,关联规则挖掘是数据挖掘中重要的一部分,如著名的啤酒和尿布的问题.今天要学习的是经典的关联规则挖掘算法--Apriori算法一.算法的基本原理由k项频繁集去导出k+1项频繁集. 二 ...

随机推荐

国内环境安装k8s
环境准备 1. 配置/etc/hosts文件,将所有机器配置成通过主机名可以访问. 2. 如果环境中有代理,请一定要在环境变量中将no_proxy配置正确. 3. master还需要执行下面的命令 ...
推荐一个SAM文件或者bam文件中flag含义解释工具
SAM是Sequence Alignment/Map 的缩写.像bwa等软件序列比对结果都会输出这样的文件.samtools网站上有专门的文档介绍SAM文件.具体地址:http://samtools. ...
css浮动与清除浮动
css浮动首先,我们要知道,css中块级元素在页面中是独占一行的,自上而下排列,也就是我们所说的流,通常称为标准流. 以div为例,div是块级元素,如下: 可以清楚地看到,div是独占一行的,di ...
krpano 常用标签
<krpano></krpano>根标签相当于 <body> <scene></scene>一个全景图场景 <image> 图 ...
git1使用步骤初始化拉取修改提交推送
Git 使用 git init 命令来初始化一个 Git 仓库,Git 的很多命令都需要在 Git 的仓库中运行,所以 git init 是使用 Git 的第一个命令. 在执行完成 git init ...
搭建日志收集系统时使用客户端连接etcd遇到的问题
问题: 在做日志收集系统时使用到etcd,其中server端在linux上,首先安装第三方包(windows)(安装过程可能会有问题,我遇到的是连接谷歌官网请求超时,如果已经出现下面的两个文件夹并且文 ...
多重线性回归 (multiple linear regression) | 变量选择 | 最佳模型 | 基本假设的诊断方法
P133,这是第二次作业,考察多重线性回归.这个youtube频道真是精品,用R做统计.这里是R代码的总结. 连续变量和类别型变量总要分开讨论: 多重线性回归可以写成矩阵形式的一元一次回归:相当于把多 ...
SpringBoot之配置文件加载位置
1.SpringBoot启动会扫描application.properties或者application.yml文件作为springboot的配置文件.默认创建项目生成application.prop ...
Android测试（二）——adb常用命令
连接设备: 安装应用包apk文件: adb install apk文件卸载应用: adb uninstall 包名将设备中的文件放到本地: adb pull 设备文件目录本地文件目录将本地文件 ...
STSdb数据库的实现使用类
STSdb 3.5是一个开源的key-value存储形式的数据库,它是用微软.net框架C#语言编写的.STSdb 3.5尤其使用于紧急任务或实时系统,如:股市交易,电子通信,实验室数据等,它的主要功 ...

MapReduce实现Apriori算法

输入格式：

输出格式：

代码：

MapReduce实现Apriori算法的更多相关文章

随机推荐

热门专题