MAPREDUCE的实战案例

reduce端join算法实现

1、需求：

订单数据表t_order：

id	date	pid	amount
1001	20150710	P0001	2
1002	20150710	P0001	3
1002	20150710	P0002	3

商品信息表t_product

id	pname	category_id	price
P0001	小米5	1000	2
P0002	锤子T1	1000	3

假如数据量巨大，两表的数据是以文件的形式存储在HDFS中，需要用mapreduce程序来实现一下SQL查询运算：

select a.id,a.date,b.name,b.category_id,b.price from t_order a join t_product b on a.pid = b.id

2、实现机制：

通过将关联的条件作为map输出的key，将两表满足join条件的数据并携带数据所来源的文件信息，发往同一个reduce task，在reduce中进行数据的串联

public class OrderJoin {

    static class OrderJoinMapper extends Mapper<LongWritable, Text, Text, OrderJoinBean> {

        @Override

        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            // 拿到一行数据，并且要分辨出这行数据所属的文件

            String line = value.toString();

            String[] fields = line.split("\t");

            // 拿到itemid

            String itemid = fields[0];

            // 获取到这一行所在的文件名（通过inpusplit）

            String name = "你拿到的文件名";

            // 根据文件名，切分出各字段（如果是a，切分出两个字段，如果是b，切分出3个字段）

            OrderJoinBean bean = new OrderJoinBean();

            bean.set(null, null, null, null, null);

            context.write(new Text(itemid), bean);

        }

    }

    static class OrderJoinReducer extends Reducer<Text, OrderJoinBean, OrderJoinBean, NullWritable> {

        @Override

        protected void reduce(Text key, Iterable<OrderJoinBean> beans, Context context) throws IOException, InterruptedException {

             //拿到的key是某一个itemid,比如1000

            //拿到的beans是来自于两类文件的bean

            //  {1000,amount} {1000,amount} {1000,amount}   ---   {1000,price,name}

            //将来自于b文件的bean里面的字段，跟来自于a的所有bean进行字段拼接并输出

        }

    }

}

缺点：这种方式中，join的操作是在reduce阶段完成，reduce端的处理压力太大，map节点的运算负载则很低，资源利用率不高，且在reduce阶段极易产生数据倾斜

解决方案： map端join实现方式

1、原理阐述

适用于关联表中有小表的情形；

可以将小表分发到所有的map节点，这样，map节点就可以在本地对自己所读到的大表数据进行join并输出最终结果，可以大大提高join操作的并发度，加快处理速度

2、实现示例

--先在mapper类中预先定义好小表，进行join

--引入实际场景中的解决方案：一次加载数据库或者用distributedcache

public class TestDistributedCache {

    static class TestDistributedCacheMapper extends Mapper<LongWritable, Text, Text, Text>{

        FileReader in = null;

        BufferedReader reader = null;

        HashMap<String,String> b_tab = new HashMap<String, String>();

        String localpath =null;

        String uirpath = null;

        //是在map任务初始化的时候调用一次

        @Override

        protected void setup(Context context) throws IOException, InterruptedException {

            //通过这几句代码可以获取到cache file的本地绝对路径，测试验证用

            Path[] files = context.getLocalCacheFiles();

            localpath = files[0].toString();

            URI[] cacheFiles = context.getCacheFiles();

            //缓存文件的用法——直接用本地IO来读取

            //这里读的数据是map task所在机器本地工作目录中的一个小文件

            in = new FileReader("b.txt");

            reader =new BufferedReader(in);

            String line =null;

            while(null!=(line=reader.readLine())){

                String[] fields = line.split(",");

                b_tab.put(fields[0],fields[1]);

            }

            IOUtils.closeStream(reader);

            IOUtils.closeStream(in);

        }

        @Override

        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            //这里读的是这个map task所负责的那一个切片数据（在hdfs上）

             String[] fields = value.toString().split("\t");

             String a_itemid = fields[0];

             String a_amount = fields[1];

             String b_name = b_tab.get(a_itemid);

             // 输出结果  1001    98.9    banan

             context.write(new Text(a_itemid), new Text(a_amount + "\t" + ":" + localpath + "\t" +b_name ));

        }

    }

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);

        job.setJarByClass(TestDistributedCache.class);

        job.setMapperClass(TestDistributedCacheMapper.class);

        job.setOutputKeyClass(Text.class);

        job.setOutputValueClass(LongWritable.class);

        //这里是我们正常的需要处理的数据所在路径

        FileInputFormat.setInputPaths(job, new Path(args[0]));

        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        //不需要reducer

        job.setNumReduceTasks(0);

        //分发一个文件到task进程的工作目录

        job.addCacheFile(new URI("hdfs://hadoop-server01:9000/cachefile/b.txt"));

        //分发一个归档文件到task进程的工作目录

//        job.addArchiveToClassPath(archive);

        //分发jar包到task节点的classpath下

//        job.addFileToClassPath(jarfile);

        job.waitForCompletion(true);

    }

}

web日志预处理

1、需求：

对web访问日志中的各字段识别切分

去除日志中不合法的记录

根据KPI统计需求，生成各类访问请求过滤数据

2、实现代码：

a) 定义一个bean，用来记录日志数据中的各数据字段

public class WebLogBean {

    private String remote_addr;// 记录客户端的ip地址

    private String remote_user;// 记录客户端用户名称,忽略属性"-"

    private String time_local;// 记录访问时间与时区

    private String request;// 记录请求的url与http协议

    private String status;// 记录请求状态；成功是200

    private String body_bytes_sent;// 记录发送给客户端文件主体内容大小

    private String http_referer;// 用来记录从那个页面链接访问过来的

    private String http_user_agent;// 记录客户浏览器的相关信息

    private boolean valid = true;// 判断数据是否合法

    public String getRemote_addr() {

        return remote_addr;

    }

    public void setRemote_addr(String remote_addr) {

        this.remote_addr = remote_addr;

    }

    public String getRemote_user() {

        return remote_user;

    }

    public void setRemote_user(String remote_user) {

        this.remote_user = remote_user;

    }

    public String getTime_local() {

        return time_local;

    }

    public void setTime_local(String time_local) {

        this.time_local = time_local;

    }

    public String getRequest() {

        return request;

    }

    public void setRequest(String request) {

        this.request = request;

    }

    public String getStatus() {

        return status;

    }

    public void setStatus(String status) {

        this.status = status;

    }

    public String getBody_bytes_sent() {

        return body_bytes_sent;

    }

    public void setBody_bytes_sent(String body_bytes_sent) {

        this.body_bytes_sent = body_bytes_sent;

    }

    public String getHttp_referer() {

        return http_referer;

    }

    public void setHttp_referer(String http_referer) {

        this.http_referer = http_referer;

    }

    public String getHttp_user_agent() {

        return http_user_agent;

    }

    public void setHttp_user_agent(String http_user_agent) {

        this.http_user_agent = http_user_agent;

    }

    public boolean isValid() {

        return valid;

    }

    public void setValid(boolean valid) {

        this.valid = valid;

    }

    @Override

    public String toString() {

        StringBuilder sb = new StringBuilder();

        sb.append(this.valid);

        sb.append("\001").append(this.remote_addr);

        sb.append("\001").append(this.remote_user);

        sb.append("\001").append(this.time_local);

        sb.append("\001").append(this.request);

        sb.append("\001").append(this.status);

        sb.append("\001").append(this.body_bytes_sent);

        sb.append("\001").append(this.http_referer);

        sb.append("\001").append(this.http_user_agent);

        return sb.toString();

}

}

b)定义一个parser用来解析过滤web访问日志原始记录

public class WebLogParser {

    public static WebLogBean parser(String line) {

        WebLogBean webLogBean = new WebLogBean();

        String[] arr = line.split(" ");

        if (arr.length > 11) {

            webLogBean.setRemote_addr(arr[0]);

            webLogBean.setRemote_user(arr[1]);

            webLogBean.setTime_local(arr[3].substring(1));

            webLogBean.setRequest(arr[6]);

            webLogBean.setStatus(arr[8]);

            webLogBean.setBody_bytes_sent(arr[9]);

            webLogBean.setHttp_referer(arr[10]);

            if (arr.length > 12) {

                webLogBean.setHttp_user_agent(arr[11] + " " + arr[12]);

            } else {

                webLogBean.setHttp_user_agent(arr[11]);

            }

            if (Integer.parseInt(webLogBean.getStatus()) >= 400) {// 大于400，HTTP错误

                webLogBean.setValid(false);

            }

        } else {

            webLogBean.setValid(false);

        }

        return webLogBean;

    }

    public static String parserTime(String time) {

        time.replace("/", "-");

        return time;

    }

}

c) mapreduce程序

public class WeblogPreProcess {

    static class WeblogPreProcessMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

        Text k = new Text();

        NullWritable v = NullWritable.get();

        @Override

        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            String line = value.toString();

            WebLogBean webLogBean = WebLogParser.parser(line);

            if (!webLogBean.isValid())

                return;

            k.set(webLogBean.toString());

            context.write(k, v);

        }

    }

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);

        job.setJarByClass(WeblogPreProcess.class);

        job.setMapperClass(WeblogPreProcessMapper.class);

        job.setOutputKeyClass(Text.class);

        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));

        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.waitForCompletion(true);

    }

}

流量统计相关需求

1、对流量日志中的用户统计总上、下行流量技术点：自定义javaBean用来在mapreduce中充当value

注意： javaBean要实现Writable接口，实现两个方法

    //序列化，将对象的字段信息写入输出流

    @Override

    public void write(DataOutput out) throws IOException {

        out.writeLong(upflow);

        out.writeLong(downflow);

        out.writeLong(sumflow);

    }

    //反序列化，从输入流中读取各个字段信息

    @Override

    public void readFields(DataInput in) throws IOException {

        upflow = in.readLong();

        downflow = in.readLong();

        sumflow = in.readLong();

    }

1、统计流量且按照流量大小倒序排序

技术点：这种需求，用一个mapreduce -job 不好实现，需要两个mapreduce -job

第一个job负责流量统计，跟上题相同

第二个job读入第一个job的输出，然后做排序

要将flowBean作为map的key输出，这样mapreduce就会自动排序此时，flowBean要实现接口WritableComparable 要实现其中的compareTo()方法，方法中，我们可以定义倒序比较的逻辑

1、统计流量且按照手机号的归属地，将结果数据输出到不同的省份文件中技术点：自定义Partitioner

    @Override

    public int getPartition(Text key, FlowBean value, int numPartitions) {

        String prefix = key.toString().substring(0,3);

        Integer partNum = pmap.get(prefix);

        return (partNum==null?4:partNum);

    }

自定义partition后，要根据自定义partitioner的逻辑设置相应数量的reduce task

job.setNumReduceTasks(5);

注意：如果reduceTask的数量>= getPartition的结果数，则会多产生几个空的输出文件part-r-000xx

如果 1<reduceTask的数量<getPartition的结果数，则有一部分分区数据无处安放，会Exception！！！

如果 reduceTask的数量=1，则不管mapTask端输出多少个分区文件，最终结果都交给这一个reduceTask，最终也就只会产生一个结果文件 part-r-00000

社交粉丝数据分析

以下是qq的好友列表数据，冒号前是一个用，冒号后是该用户的所有好友（数据中的好友关系是单向的）

A:B,C,D,F,E,O

B:A,C,E,K

C:F,A,D,I

D:A,E,F,L

E:B,C,D,M,L

F:A,B,C,D,E,O,M

G:A,C,D,E,F

H:A,C,D,E,O

I:A,O

J:B,O

K:A,C,D

L:D,E,F

M:E,F,G

O:A,H,I,J

求出哪些人两两之间有共同好友，及他俩的共同好友都有谁？

解题思路：

求出哪些人两两之间有共同好友，及他俩的共同好友都有谁？

解题思路：

第一步

map

读一行 A:B,C,D,F,E,O

输出 <B,A><C,A><D,A><F,A><E,A><O,A>

在读一行 B:A,C,E,K

输出 <A,B><C,B><E,B><K,B>

REDUCE

拿到的数据比如<C,A><C,B><C,E><C,F><C,G>......

输出：

<A-B,C>

<A-E,C>

<A-F,C>

<A-G,C>

<B-E,C>

<B-F,C>.....

第二步

map

读入一行<A-B,C>

直接输出<A-B,C>

reduce

读入数据 <A-B,C><A-B,F><A-B,G>.......

输出： A-B C,F,G,.....

package cn.itcast.bigdata.mr.fensi;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SharedFriendsStepOne {

    static class SharedFriendsStepOneMapper extends Mapper<LongWritable, Text, Text, Text> {

        @Override

        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            // A:B,C,D,F,E,O

            String line = value.toString();

            String[] person_friends = line.split(":");

            String person = person_friends[0];

            String friends = person_friends[1];

            for (String friend : friends.split(",")) {

                // 输出<好友，人>

                context.write(new Text(friend), new Text(person));

            }

        }

    }

    static class SharedFriendsStepOneReducer extends Reducer<Text, Text, Text, Text> {

        @Override

        protected void reduce(Text friend, Iterable<Text> persons, Context context) throws IOException, InterruptedException {

            StringBuffer sb = new StringBuffer();

            for (Text person : persons) {

                sb.append(person).append(",");

            }

            context.write(friend, new Text(sb.toString()));

        }

    }

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);

        job.setJarByClass(SharedFriendsStepOne.class);

        job.setOutputKeyClass(Text.class);

        job.setOutputValueClass(Text.class);

        job.setMapperClass(SharedFriendsStepOneMapper.class);

        job.setReducerClass(SharedFriendsStepOneReducer.class);

        FileInputFormat.setInputPaths(job, new Path("D:/srcdata/friends"));

        FileOutputFormat.setOutputPath(job, new Path("D:/temp/out"));

        job.waitForCompletion(true);

    }

}

package cn.itcast.bigdata.mr.fensi;

import java.io.IOException;

import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SharedFriendsStepTwo {

    static class SharedFriendsStepTwoMapper extends Mapper<LongWritable, Text, Text, Text> {

        // 拿到的数据是上一个步骤的输出结果

        // A I,K,C,B,G,F,H,O,D,

        // 友 人，人，人

        @Override

        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            String line = value.toString();

            String[] friend_persons = line.split("\t");

            String friend = friend_persons[0];

            String[] persons = friend_persons[1].split(",");

            Arrays.sort(persons);

            for (int i = 0; i < persons.length - 1; i++) {

                for (int j = i + 1; j < persons.length; j++) {

                    // 发出 <人-人，好友> ，这样，相同的“人-人”对的所有好友就会到同1个reduce中去

                    context.write(new Text(persons[i] + "-" + persons[j]), new Text(friend));

                }

            }

        }

    }

    static class SharedFriendsStepTwoReducer extends Reducer<Text, Text, Text, Text> {

        @Override

        protected void reduce(Text person_person, Iterable<Text> friends, Context context) throws IOException, InterruptedException {

            StringBuffer sb = new StringBuffer();

            for (Text friend : friends) {

                sb.append(friend).append(" ");

            }

            context.write(person_person, new Text(sb.toString()));

        }

    }

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);

        job.setJarByClass(SharedFriendsStepTwo.class);

        job.setOutputKeyClass(Text.class);

        job.setOutputValueClass(Text.class);

        job.setMapperClass(SharedFriendsStepTwoMapper.class);

        job.setReducerClass(SharedFriendsStepTwoReducer.class);

        FileInputFormat.setInputPaths(job, new Path("D:/temp/out/part-r-00000"));

        FileOutputFormat.setOutputPath(job, new Path("D:/temp/out2"));

        job.waitForCompletion(true);

    }

}