要求:基于hasgj表,统计每天新增的mac数量。

因为spark直接扫描hbase表,对hbase集群访问量太大,给集群造成压力,这里考虑用spark读取HFile进行数据分析。

1、建立hasgj表的快照表:hasgjSnapshot

语句为:snapshot 'hasgj','hasgjSnapshot'

2、计算每天mac增量的代码如下:

package com.ba.sparkReadHbase.operatorHfile.hfileinputformat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.NavigableMap;
import java.util.Set;
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.mapreduce.TableSnapshotInputFormat;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
import org.apache.hadoop.hbase.util.Base64;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import scala.Tuple2;

/**
 * Counts the MAC addresses that first appeared on a given date.
 *
 * <p>Reads the {@code hasgjSnapshot} table snapshot via
 * {@link TableSnapshotInputFormat} (scanning HFiles directly instead of going
 * through the region servers, to avoid load on the HBase cluster). Row key is
 * the MAC; column qualifiers in family {@code ba} are {@code yyyyMMdd} dates.
 * A MAC is "new" on date D when its earliest qualifier equals D.
 *
 * <p>Usage: {@code SparkReadHFile <yyyyMMdd>}, e.g. {@code SparkReadHFile 20170806}.
 */
public class SparkReadHFile {

    /**
     * Serializes a {@link Scan} into the Base64 string form that
     * {@link TableInputFormat#SCAN} expects in the job configuration.
     */
    private static String convertScanToString(Scan scan) throws IOException {
        ClientProtos.Scan proto = ProtobufUtil.toScan(scan);
        return Base64.encodeBytes(proto.toByteArray());
    }

    public static void main(String[] args) throws IOException {
        if (args.length < 1) {
            System.err.println("Usage: SparkReadHFile <date, e.g. 20170806>");
            System.exit(1);
        }
        final String date = args[0];
        final int maxVersions = 3;

        SparkConf sparkConf = new SparkConf().setAppName("sparkReadHfile");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);

        Configuration hconf = HBaseConfiguration.create();
        hconf.set("hbase.rootdir", "/hbase");
        hconf.set("hbase.zookeeper.quorum", "master,slave1,slave2");

        Scan scan = new Scan();
        scan.addFamily(Bytes.toBytes("ba"));
        scan.setMaxVersions(maxVersions);
        hconf.set(TableInputFormat.SCAN, convertScanToString(scan));

        Job job = Job.getInstance(hconf);
        // Restore directory where the snapshot's HFile links are materialized.
        Path restoreDir = new Path("/snapshot");
        String snapName = "hasgjSnapshot";
        TableSnapshotInputFormat.setInput(job, snapName, restoreDir);

        JavaPairRDD<ImmutableBytesWritable, Result> hfileRDD =
                sc.newAPIHadoopRDD(job.getConfiguration(), TableSnapshotInputFormat.class,
                        ImmutableBytesWritable.class, Result.class);

        // Map each row to its MAC when the earliest seen date equals the target
        // date (null otherwise), then count on the executors. The original code
        // collect()ed every row to the driver and filtered there, which risks
        // driver OOM on a full-table scan.
        long newMacCount = hfileRDD
                .map(new Function<Tuple2<ImmutableBytesWritable, Result>, String>() {
                    private static final long serialVersionUID = 1L;

                    public String call(Tuple2<ImmutableBytesWritable, Result> v1) throws Exception {
                        Result result = v1._2();
                        if (result.isEmpty()) {
                            return null;
                        }
                        NavigableMap<byte[], byte[]> familyMap =
                                result.getFamilyMap(Bytes.toBytes("ba"));
                        if (familyMap == null || familyMap.isEmpty()) {
                            return null;
                        }
                        // Qualifiers are yyyyMMdd strings and the NavigableMap is
                        // sorted in byte order, so firstKey() is the earliest date —
                        // no need to iterate with a sentinel "max" value.
                        String minDate = Bytes.toString(familyMap.firstKey());
                        return date.equals(minDate) ? Bytes.toString(result.getRow()) : null;
                    }
                })
                .filter(new Function<String, Boolean>() {
                    private static final long serialVersionUID = 1L;

                    public Boolean call(String mac) {
                        return mac != null;
                    }
                })
                .count();

        System.out.println("新增mac数" + newMacCount);
        sc.stop();
    }
}

3、特别说明:

hasgj表的表结构:

0000F470ABF3A587                          column=ba:20170802, timestamp=1517558687930, value=                                                                         
 0000F470ABF3A587                          column=ba:20170804, timestamp=1517593923254, value=                                                                         
 0000F470ABF3A587                          column=ba:20170806, timestamp=1517620990589, value=                                                                         
 0000F470ABF3A587                          column=ba:20170809, timestamp=1517706294758, value=                                                                         
 0000F470ABF3A587                          column=ba:20170810, timestamp=1517722369020, value=                                                                         
 0000F470ABF3A587                          column=ba:20170811, timestamp=1517796060984, value=                                                                         
 0000F470ABF3A587                          column=ba:20170816, timestamp=1517882948856, value=                                                                         
 0000F470ABF3A587                          column=ba:20170818, timestamp=1517912603602, value=                                                                         
 0000F470ABF3A587                          column=ba:20170819, timestamp=1517938488763, value=                                                                         
 0000F470ABF3A587                          column=ba:20170821, timestamp=1517989742180, value=                                                                         
 0000F470ABF3A587                          column=ba:20170827, timestamp=1518383470292, value=                                                                         
 0000F470ABF3A587                          column=ba:20170828, timestamp=1520305841272, value=                                                                         
 0000F470ABF3A587                          column=ba:20170831, timestamp=1522115116459, value=                                                                         
 0000F4730088A5D3                          column=ba:20170805, timestamp=1517598564121, value=                                                                         
 0000F47679E83F7D                          column=ba:20170817, timestamp=1517890046587, value=                                                                         
 0000F47FBA753FC7                          column=ba:20170827, timestamp=1518365792130, value=                                                                         
 0000F48C02F8EB83                          column=ba:20170810, timestamp=1517729864592, value=                                                                         
 0000F49578E63F55                          column=ba:20170828, timestamp=1520302223714, value=                                                                         
 0000F4AC4A93F7A5                          column=ba:20170810, timestamp=1517724545955, value=                                                                         
 0000F4B4807679AA                          column=ba:20170801, timestamp=1517543775374, value=                                                                         
 0000F4B7E374C0FF                          column=ba:20170804, timestamp=1517578239073, value=                                                                         
 0000F4BDBF6EBF37                          column=ba:20170829, timestamp=1520558747936, value=                                                                         
 0000F4CB52FDDA58                          column=ba:20170806, timestamp=1517638015583, value=                                                                         
 0000F4CB52FDDA58                          column=ba:20170807, timestamp=1517677405900, value=

4、提交作业命令:

./spark-submit --master yarn-client  --num-executors 7 --executor-cores 2 --driver-memory 2g  --executor-memory 30g --class com.ba.sparkReadHbase.operatorHfile.hfileinputformat.SparkReadHFile  /home/xxx0108/ftttttttt/testJar/sparkTest9.jar 20170806

spark读HFile对hbase表数据进行分析的更多相关文章

  1. 数据分页处理系列之二:HBase表数据分页处理

      HBase是Hadoop大数据生态技术圈中的一项关键技术,是一种用于分布式存储大数据的列式数据库,关于HBase更加详细的介绍和技术细节,朋友们可以在网络上进行搜寻,笔者本人在接下来的日子里也会写 ...

  2. HBase(三): Azure HDInsigt HBase表数据导入本地HBase

    目录: hdfs 命令操作本地 hbase Azure HDInsight HBase表数据导入本地 hbase hdfs命令操作本地hbase: 参见  HDP2.4安装(五):集群及组件安装 , ...

  3. 一种HBase表数据迁移方法的优化

    1.背景调研: 目前存在的hbase数据迁移主要分如下几类: 根据上图,可以看出: 其实主要分为两种方式:(1)hadoop层:因为hbase底层是基于hdfs存储的,所以可以通过把hdfs上的数据拷 ...

  4. HBase表数据分页处理

    HBase表数据分页处理 HBase是Hadoop大数据生态技术圈中的一项关键技术,是一种用于分布式存储大数据的列式数据库,关于HBase更加详细的介绍和技术细节,朋友们可以在网络上进行搜寻,笔者本人 ...

  5. HBase表数据的转移之使用自定义MapReduce

    目标:将fruit表中的一部分数据,通过MR迁入到fruit_mr表中 Step1.构建ReadFruitMapper类,用于读取fruit表中的数据 package com.z.hbase_mr; ...

  6. Spark读HBase写MySQL

    1 Spark读HBase Spark读HBase黑名单数据,过滤出当日新增userid,并与mysql黑名单表内userid去重后,写入mysql. def main(args: Array[Str ...

  7. hbase操作(shell 命令,如建表,清空表,增删改查)以及 hbase表存储结构和原理

    两篇讲的不错文章 http://www.cnblogs.com/nexiyi/p/hbase_shell.html http://blog.csdn.net/u010967382/article/de ...

  8. Hive如何加载和导入HBase的数据

    当我们用HBase 存储实时数据的时候, 如果要做一些数据分析方面的操作, 就比较困难了, 要写MapReduce Job. Hive 主要是用来做数据分析的数据仓库,支持标准SQL 查询, 做数据分 ...

  9. 用Spark查询HBase中的表数据

    java代码如下: package db.query; import org.apache.commons.logging.Log; import org.apache.commons.logging ...

随机推荐

  1. 开始使用 Ubuntu(字体渲染去模糊+软件安装+优化配置+常见错误)(29)

    1. 中文字体渲染美化 + 去模糊 步骤: 1. 解压安装 lulinux_fontsConf_181226.tar.gz,按里面的安装说明操作: 2. 开启字体渲染: 打开 unity-tweak- ...

  2. Python--递归函数实现:多维嵌套字典数据无限遍历

    原创:多层嵌套字典无限遍历,实现当value值以特殊字符$开头,并且等于某项值时,用随机函数替换该参数 """处理前的字典{'patient': {'avatarPic' ...

  3. PAT(B) 1069 微博转发抽奖(Java)

    题目链接:1069 微博转发抽奖 (20 point(s)) 题目描述 小明 PAT 考了满分,高兴之余决定发起微博转发抽奖活动,从转发的网友中按顺序每隔 N 个人就发出一个红包.请你编写程序帮助他确 ...

  4. 解决找不到mkfs.ubifs命令

    解决找不到mkfs.ubifs命令 ubuntu 版本:14.04 sudo apt-get update sudo apt-get install mtd-utils sudo apt-get in ...

  5. Python 命令行模块使用技巧

    命令行参数传递 python main.py -H 192.168.1.1 -p 22,23,24 #coding:utf-8 import optparse def PortScan(host,po ...

  6. Singer House CodeForces - 830D (组合计数,dp)

    大意: 一个$k$层完全二叉树, 每个节点向它祖先连边, 就得到一个$k$房子, 求$k$房子的所有简单路径数. $DP$好题. 首先设$dp_{i,j}$表示$i$房子, 分出$j$条简单路径的方案 ...

  7. JavaDoc工具和Ideade javadoc工具

    命令参考: javadoc -locale zh_CN -protected -notree -nonavbar -noindex -use -author -version -encoding UT ...

  8. GRPC代替webapi Demo。

    gRPC 是一种与语言无关的高性能远程过程调用 (RPC) 框架. gRPC 的主要优点是: 现代高性能轻量级 RPC 框架. 协定优先 API 开发,默认使用协议缓冲区,允许与语言无关的实现. 可用 ...

  9. 测试人员必须掌握的linu常用命令

    有些公司需要测试人员部署程序包,通过工具xshell. 现在我将总结下工作需要用到的最多的命令 ls                                显示文件或目录 pwd       ...

  10. php的文件上传及下载,附带显示文件及目录

    主页面wenjianceshi.php <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" &quo ...