hbase快速入门
hbase 是什么?
Apache HBase is an open-source, distributed, versioned, non-relational database modeled after Google's Bigtable: A Distributed Storage System for Structured Data by Chang et al. Just as Bigtable leverages the distributed data storage provided by the Google File System, Apache HBase provides Bigtable-like capabilities on top of Hadoop and HDFS.
hbase的应用场景
Use Apache HBase™ when you need random, realtime read/write access to your Big Data. This project's goal is the hosting of very large tables -- billions of rows X millions of columns -- atop clusters of commodity hardware.
hbase的特性
>>>Linear and modular scalability.
>>>Strictly consistent reads and writes.
>>>Automatic and configurable sharding of tables
>>>Automatic failover support between RegionServers.
>>>Convenient base classes for backing Hadoop MapReduce jobs with Apache HBase tables.
>>>Easy to use Java API for client access.
>>>Block cache and Bloom Filters for real-time queries.
>>>Query predicate push down via server side Filters
>>>Thrift gateway and a REST-ful Web service that supports XML, Protobuf, and binary data encoding options
>>>Extensible jruby-based (JIRB) shell
>>>Support for exporting metrics via the Hadoop metrics subsystem to files or Ganglia; or via JMX
hbase 有哪些操作?
先看一下hbase的基本操作:创建一个表,删除一个表,增加一条记录,删除一条记录,遍历一条记录。
import java.io.IOException;
import java.util.ArrayList;
import java.util.List; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes; public class HBaseTest { private static Configuration conf = null;
/**
* Initialization
*/
static {
conf = HBaseConfiguration.create();
} /**
* Create a table
*/
public static void creatTable(String tableName, String[] familys)
throws Exception {
HBaseAdmin admin = new HBaseAdmin(conf);
if (admin.tableExists(tableName)) {
System.out.println("table already exists!");
} else {
HTableDescriptor tableDesc = new HTableDescriptor(tableName);
for (int i = 0; i < familys.length; i++) {
tableDesc.addFamily(new HColumnDescriptor(familys[i]));
}
admin.createTable(tableDesc);
System.out.println("create table " + tableName + " ok.");
}
} /**
* Delete a table
*/
public static void deleteTable(String tableName) throws Exception {
try {
HBaseAdmin admin = new HBaseAdmin(conf);
admin.disableTable(tableName);
admin.deleteTable(tableName);
System.out.println("delete table " + tableName + " ok.");
} catch (MasterNotRunningException e) {
e.printStackTrace();
} catch (ZooKeeperConnectionException e) {
e.printStackTrace();
}
} /**
* Put (or insert) a row
*/
public static void addRecord(String tableName, String rowKey,
String family, String qualifier, String value) throws Exception {
try {
HTable table = new HTable(conf, tableName);
Put put = new Put(Bytes.toBytes(rowKey));
put.add(Bytes.toBytes(family), Bytes.toBytes(qualifier), Bytes
.toBytes(value));
table.put(put);
System.out.println("insert recored " + rowKey + " to table "
+ tableName + " ok.");
} catch (IOException e) {
e.printStackTrace();
}
} /**
* Delete a row
*/
public static void delRecord(String tableName, String rowKey)
throws IOException {
HTable table = new HTable(conf, tableName);
List<Delete> list = new ArrayList<Delete>();
Delete del = new Delete(rowKey.getBytes());
list.add(del);
table.delete(list);
System.out.println("del recored " + rowKey + " ok.");
} /**
* Get a row
*/
public static void getOneRecord (String tableName, String rowKey) throws IOException{
HTable table = new HTable(conf, tableName);
Get get = new Get(rowKey.getBytes());
Result rs = table.get(get);
for(KeyValue kv : rs.raw()){
System.out.print(new String(kv.getRow()) + " " );
System.out.print(new String(kv.getFamily()) + ":" );
System.out.print(new String(kv.getQualifier()) + " " );
System.out.print(kv.getTimestamp() + " " );
System.out.println(new String(kv.getValue()));
}
}
/**
* Scan (or list) a table
*/
public static void getAllRecord (String tableName) {
try{
HTable table = new HTable(conf, tableName);
Scan s = new Scan();
ResultScanner ss = table.getScanner(s);
for(Result r:ss){
for(KeyValue kv : r.raw()){
System.out.print(new String(kv.getRow()) + " ");
System.out.print(new String(kv.getFamily()) + ":");
System.out.print(new String(kv.getQualifier()) + " ");
System.out.print(kv.getTimestamp() + " ");
System.out.println(new String(kv.getValue()));
}
}
} catch (IOException e){
e.printStackTrace();
}
} public static void main(String[] agrs) {
try {
String tablename = "scores";
String[] familys = { "grade", "course" };
HBaseTest.creatTable(tablename, familys); // add record zkb
HBaseTest.addRecord(tablename, "zkb", "grade", "", "5");
HBaseTest.addRecord(tablename, "zkb", "course", "", "90");
HBaseTest.addRecord(tablename, "zkb", "course", "math", "97");
HBaseTest.addRecord(tablename, "zkb", "course", "art", "87");
// add record baoniu
HBaseTest.addRecord(tablename, "baoniu", "grade", "", "4");
HBaseTest.addRecord(tablename, "baoniu", "course", "math", "89"); System.out.println("===========get one record========");
HBaseTest.getOneRecord(tablename, "zkb"); System.out.println("===========show all record========");
HBaseTest.getAllRecord(tablename); System.out.println("===========del one record========");
HBaseTest.delRecord(tablename, "baoniu");
HBaseTest.getAllRecord(tablename); System.out.println("===========show all record========");
HBaseTest.getAllRecord(tablename);
} catch (Exception e) {
e.printStackTrace();
}
}
}
基本概念
HTable
核心概念,实现了Table,用来和hbase的一个单表进行通信。轻量级的,提供获取和关闭方法。
这个类不能通过构造函数直接构建出来。可以通过Connection获取该类的一个实例。参考ConnectionFactory类生成实例:
Connection connection = ConnectionFactory.createConnection(config);
Table table = connection.getTable(TableName.valueOf("table1"));
try {
// Use the table as needed, for a single operation and a single thread
} finally {
table.close();
connection.close();
}
Htable的字段有哪些呢?
public class HTable implements HTableInterface, RegionLocator {
private static final Log LOG = LogFactory.getLog(HTable.class);
protected ClusterConnection connection;
private final TableName tableName;
private volatile Configuration configuration;
private TableConfiguration tableConfiguration;
protected BufferedMutatorImpl mutator;
private boolean autoFlush = true;
private boolean closed = false;
protected int scannerCaching;
private ExecutorService pool; // For Multi & Scan
private int operationTimeout;
private final boolean cleanupPoolOnClose; // shutdown the pool in close()
private final boolean cleanupConnectionOnClose; // close the connection in close()
private Consistency defaultConsistency = Consistency.STRONG;
/** The Async process for batch */
protected AsyncProcess multiAp;
private RpcRetryingCallerFactory rpcCallerFactory;
private RpcControllerFactory rpcControllerFactory;
}
HTableDescriptor包含了HBase表的详细信息,例如所有列家族的描述,该表是否一个分类表,ROOT或者hbase:meta,该表是否只读,当region分片时memstore的最大值,关联的coprocessor等等。
Htable继承并实现了Table,Table用来和一个hbase单表进行通信,从表中获取,插入,删除或者扫描数据。使用Connection来获取Table实例,使用完毕后调用close()方法。
Htable也继承实现了RegionLocator,RegionLocator用来定位一张Hbase单表的区域位置信息,可以通过Connection获取该类的实例,RegionLocator的getRegionLocation方法返回HRegionLocation。
HRegionLocation
记录HRegionInfo和HRegionServer的主机地址的数据结构。
构造函数:
public HRegionLocation(HRegionInfo regionInfo, ServerName serverName) {
this(regionInfo, serverName, HConstants.NO_SEQNUM);
}
public HRegionLocation(HRegionInfo regionInfo, ServerName serverName, long seqNum) {
this.regionInfo = regionInfo;
this.serverName = serverName;
this.seqNum = seqNum;
}
HRegionInfo
一个区域的信息。区域是在一张表的整个键空间中一系列的键,一个标识(时间戳)区分不同子序列(在区间分隔之后),一个复制ID区分同一序列和同一区域状态信息的不同实例。
一个区域有一个位于的名称,名称由下列的字段组成:
表名(tableName):表的名称。
开始键(startKey):一个区间的开始键。
区域ID(regionId):创建区域的时间戳。
复制ID(replicaId):一个区分同一区域序列的Id,从0开始,保存到不同的服务器,同一个区域的序列可以保存在多个位置中。
加密后的名称(encodedName):md5加密后的区域名称。
除了区域名称外,区域信息还包含:
结束键(endkey):区域的结束键(独有的)
分片(split):区域是否分片
离线(offline):区域是否离线
在0.98版本或者之前,一组表的区域会完全包含所有的键空间,在任何时间点,一个行键通常属于一个单独的区域,该单独区域又属于一个单独的服务器。在0.99+版本,一个区域可以有多个实例(叫做备份),因此一行可以对应多个HRegionInfo。这些HRI除了备份Id字段外都可以共用字段。若备份Id未设置,默认为0。
/**
* The new format for a region name contains its encodedName at the end.
* The encoded name also serves as the directory name for the region
* in the filesystem.
*
* New region name format:
* <tablename>,,<startkey>,<regionIdTimestamp>.<encodedName>.
* where,
* <encodedName> is a hex version of the MD5 hash of
* <tablename>,<startkey>,<regionIdTimestamp>
*
* The old region name format:
* <tablename>,<startkey>,<regionIdTimestamp>
* For region names in the old format, the encoded name is a 32-bit
* JenkinsHash integer value (in its decimal notation, string form).
*<p>
* **NOTE**
*
* The first hbase:meta region, and regions created by an older
* version of HBase (0.20 or prior) will continue to use the
* old region name format.
*/ /** Separator used to demarcate the encodedName in a region name
* in the new format. See description on new format above.
*/
private static final int ENC_SEPARATOR = '.';
public static final int MD5_HEX_LENGTH = 32; /** A non-capture group so that this can be embedded. */
public static final String ENCODED_REGION_NAME_REGEX = "(?:[a-f0-9]+)"; // to keep appended int's sorted in string format. Only allows 2 bytes to be
// sorted for replicaId
public static final String REPLICA_ID_FORMAT = "%04X"; public static final byte REPLICA_ID_DELIMITER = (byte)'_'; private static final int MAX_REPLICA_ID = 0xFFFF;
static final int DEFAULT_REPLICA_ID = 0; private byte [] endKey = HConstants.EMPTY_BYTE_ARRAY;
// This flag is in the parent of a split while the parent is still referenced
// by daughter regions. We USED to set this flag when we disabled a table
// but now table state is kept up in zookeeper as of 0.90.0 HBase.
private boolean offLine = false;
private long regionId = -1;
private transient byte [] regionName = HConstants.EMPTY_BYTE_ARRAY;
private boolean split = false;
private byte [] startKey = HConstants.EMPTY_BYTE_ARRAY;
private int hashCode = -1;
//TODO: Move NO_HASH to HStoreFile which is really the only place it is used.
public static final String NO_HASH = null;
private String encodedName = null;
private byte [] encodedNameAsBytes = null;
private int replicaId = DEFAULT_REPLICA_ID; // Current TableName
private TableName tableName = null; /** HRegionInfo for first meta region */
public static final HRegionInfo FIRST_META_REGIONINFO =
new HRegionInfo(1L, TableName.META_TABLE_NAME);
先了解一下HBase的数据结构:
组成部件说明:
Row Key: Table主键 行键 Table中记录按照Row Key排序
Timestamp: 每次对数据操作对应的时间戳,也即数据的version number
Column Family: 列簇,一个table在水平方向有一个或者多个列簇,列簇可由任意多个Column组成,列簇支持动态扩展,无须预定义数量及类型,二进制存储,用户需自行进行类型转换。
行操作
Get用来对一个单独的行进行Get操作:
获取一个row的所有信息前,需要实例化一个Get对象。
获取特定家族的所有列,使用addFamily(byte[])。
获取特定列,使用addColumn(byte[], byte[])
获取在特定的时间戳内的一系列列,使用setTimeRange(long, long)
获取在特定时间戳的列,使用setTimeStamp(long)
限制返回的列数,使用setMaxVersions(int)
增加过滤器,使用setFilter(Filter)。 Put用来对一个单独的列进行Put操作:
使用Put前需初始化Put对象,来插入一行使用add(byte[], byte[], byte[]),若设定时间戳则使用add(byte[], byte[], long, byte[])。 Append 操作:
对一行增加多个列,使用add(byte[], byte[], byte[]);
参考文献:
【1】http://hbase.apache.org/
【2】https://autofei.wordpress.com/2012/04/02/java-example-code-using-hbase-data-model-operations/
【3】http://www.cnblogs.com/shitouer/archive/2012/06/04/2533518.html
hbase快速入门的更多相关文章
- Nutch 快速入门(Nutch 2.2.1+Hbase+Solr)
http://www.tuicool.com/articles/VfEFjm Nutch 2.x 与 Nutch 1.x 相比,剥离出了存储层,放到了gora中,可以使用多种数据库,例如HBase, ...
- Gora快速入门
概述 Gora是apache的一个开源项目. The Apache Gora open source framework provides an in-memory data model and pe ...
- Hadoop生态圈-大数据生态体系快速入门篇
Hadoop生态圈-大数据生态体系快速入门篇 作者:尹正杰 版权声明:原创作品,谢绝转载!否则将追究法律责任. 一.大数据概念 1>.什么是大数据 大数据(big data):是指无法在一定时间 ...
- 大数据技术之_09_Flume学习_Flume概述+Flume快速入门+Flume企业开发案例+Flume监控之Ganglia+Flume高级之自定义MySQLSource+Flume企业真实面试题(重点)
第1章 Flume概述1.1 Flume定义1.2 Flume组成架构1.2.1 Agent1.2.2 Source1.2.3 Channel1.2.4 Sink1.2.5 Event1.3 Flum ...
- Hadoop生态圈-Hive快速入门篇之Hive环境搭建
Hadoop生态圈-Hive快速入门篇之Hive环境搭建 作者:尹正杰 版权声明:原创作品,谢绝转载!否则将追究法律责任. 一.数据仓库(理论性知识大多摘自百度百科) 1>.什么是数据仓库 数据 ...
- Gora快速入门 分类: C_OHTERS 2015-01-30 09:55 465人阅读 评论(0) 收藏
概述 Gora是apache的一个开源项目. The Apache Gora open source framework provides an in-memory data model and pe ...
- Scala快速入门 - 基础语法篇
本篇文章首发于头条号Scala快速入门 - 基础语法篇,欢迎关注我的头条号和微信公众号"大数据技术和人工智能"(微信搜索bigdata_ai_tech)获取更多干货,也欢迎关注我的 ...
- 1.2 Hadoop快速入门
1.2 Hadoop快速入门 1.Hadoop简介 Hadoop是一个开源的分布式计算平台. 提供功能:利用服务器集群,根据用户定义的业务逻辑,对海量数据的存储(HDFS)和分析计算(MapReduc ...
- Web Api 入门实战 (快速入门+工具使用+不依赖IIS)
平台之大势何人能挡? 带着你的Net飞奔吧!:http://www.cnblogs.com/dunitian/p/4822808.html 屁话我也就不多说了,什么简介的也省了,直接简单概括+demo ...
随机推荐
- 2016-2017-2 20155339《 java面向对象程序设计》实验四Android程序设计
2016-2017-2 20155339< java面向对象程序设计>实验四Android程序设计 实验内容 1.Android Stuidio的安装测试: 参考<Java和Andr ...
- [2016北京集训测试赛7]isn-[树状数组+dp+容斥]
Description Solution 定义dp[i][j]为在1到i个数中选了j个数,并且保证选了i的选法总数. dp[i][j]为所有满足A[k]>A[i]的k(k<i)的dp[k] ...
- 从原理到代码:大牛教你如何用 TensorFlow 亲手搭建一套图像识别模块 | AI 研习社
从原理到代码:大牛教你如何用 TensorFlow 亲手搭建一套图像识别模块 | AI 研习社 PPT链接: https://pan.baidu.com/s/1i5Jrr1N 视频链接: https: ...
- day 3 局部变量 全局变量
1.局部变量 2.全局变量(死歌的大招)函数前面声明的都是全局变量 3.全局变量和局部变量的区别 1)老方法 def get_temper(): temper = 33 return temper d ...
- 1111: [POI2007]四进制的天平Wag
1111: [POI2007]四进制的天平Wag 链接 题意: 用一些四进制数,相减得到给定的数,四进制数的数量应该尽量少,满足最少的条件下,求方案数. 分析: 这道题拖了好久啊. 参考Claris的 ...
- Git操作指令
1.创建版本库 git init 2.把工作区修改过的文件添加到版本库暂存区,点号表示当前目录下所有文件; git add .#查看仓库状态git status 3.把版本库暂存区的文件提交到当前分支 ...
- php缩放处理png和jpg图片
本例子介绍使用php自带的GD库对png和jpg图片进行放大和缩小处理 <?php$target_width = 120; //目标图片宽度 $target_height = 150; //目标 ...
- 【BUG】12小时制和24小时制获取当天零点问题
[BUG]12小时制和24小时制获取当天零点问题 最近在写定时服务的时候,要获取当天的零点这个时间,但是是这样获取的 DateTime dt = DateTime.Parse(DateTime.Now ...
- Just a Hook:线段树+区间修改
E - Just a Hook In the game of DotA, Pudge’s meat hook is actually the most horrible thing for most ...
- Qt应用程序重启
重启应用程序是一种常见的操作,在Qt中实现非常简单,需要用到QProcess类一个静态方法: // program, 要启动的程序名称 // arguments, 启动参数 bool startDet ...

