给HBase添加二级索引:HBase协处理器结合Solr

代码如下

package com.hbase.coprocessor;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.UUID; import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.CoprocessorEnvironment;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.coprocessor.BaseRegionObserver;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
import org.apache.hadoop.hbase.util.Bytes;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory; /**
* @author:FengZhen
* @create:2018年7月9日
*/
public class HbaseDataSyncSolrObserver extends BaseRegionObserver {
    public static Logger log = LoggerFactory.getLogger(HbaseDataSyncSolrObserver.class);

    /**
     * Coprocessor start hook; this observer performs no work here.
     *
     * @param e coprocessor environment (unused)
     * @throws IOException declared by the overridden method; never thrown here
     */
    @Override
    public void start(CoprocessorEnvironment e) throws IOException {
        // intentionally empty
    }

    /**
     * Coprocessor stop hook; this observer performs no work here.
     *
     * @param e coprocessor environment (unused)
     * @throws IOException declared by the overridden method; never thrown here
     */
    @Override
    public void stop(CoprocessorEnvironment e) throws IOException {
        // intentionally empty
    }

    /**
     * Called after the client stores a value. For every column family in the
     * Put, the qualifiers containing {@code "tb_"} or {@code "tm_"} are
     * collected as tags; when at least one is found, a {@code VmMemory}
     * carrying a random UUID id, the row key and those tags is handed to
     * {@code SolrWriter.addDocToCache}.
     *
     * @param e          observer context (unused)
     * @param put        the Put that was applied
     * @param edit       WAL edit (unused)
     * @param durability durability of the Put (unused)
     * @throws IOException declared by the overridden method
     */
    @Override
    public void postPut(ObserverContext<RegionCoprocessorEnvironment> e, Put put, WALEdit edit, Durability durability) throws IOException {
        for (List<Cell> cells : put.getFamilyCellMap().values()) {
            String docId = UUID.randomUUID().toString();
            // Every cell of a Put shares the same row, so the first one is representative.
            String rowkey = Bytes.toString(CellUtil.cloneRow(cells.get(0)));

            List<String> tagQualifiers = new ArrayList<String>();
            for (Cell cell : cells) {
                String qualifier = Bytes.toString(CellUtil.cloneQualifier(cell));
                boolean looksLikeTag = qualifier.contains("tb_") || qualifier.contains("tm_");
                if (looksLikeTag) {
                    tagQualifiers.add(qualifier);
                }
            }

            if (!tagQualifiers.isEmpty()) {
                VmMemory record = new VmMemory();
                record.setId(docId);
                record.setRowkey(rowkey);
                record.setTags(tagQualifiers);
                SolrWriter.addDocToCache(record);
            }
        }
    }
}

 Solr代码处理如下

package com.hbase.coprocessor;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Timer;
import java.util.TimerTask;
import java.util.Vector;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory; /**
* @author:FengZhen
* @create:2018年7月9日
*/
public class SolrWriter {
public static Logger log = LoggerFactory.getLogger(SolrWriter.class); public static String urlSolr = "node244.qt:2181,node245.qt:2181,node246.qt:2181"; //solr地址 192.168.1.232:2181
public static String defaultCollection = "socialSecurity"; //默认collection tagCollectionHDFS socialSecurity
public static int zkClientTimeOut =20000; //zk客户端请求超时间
public static int zkConnectTimeOut =10000; //zk客户端连接超时间
public static CloudSolrClient cloudSolrClient = null; public static int maxCacheCount = 200; //缓存大小,当达到该上限时提交
public static Vector<VmMemory> cache = null; //缓存
public static Vector<String> cacheRowkey = null;
public static Lock commitLock =new ReentrantLock(); //在添加缓存或进行提交时加�? public static int maxCommitTime = 60*1; //�?大提交时�? static {
Configuration conf = HBaseConfiguration.create();
urlSolr = conf.get("hbase.solr.zklist", "node244.qt:2181,node245.qt:2181,node246.qt:2181"); // 192.168.1.231:2181,192.168.1.232:2181,192.168.1.233:2181
defaultCollection = conf.get("hbase.solr.collection","socialSecurity");
zkClientTimeOut = conf.getInt("hbase.solr.zkClientTimeOut", 10000);
zkConnectTimeOut = conf.getInt("hbase.solr.zkConnectTimeOut", 10000);
maxCacheCount = conf.getInt("hbase.solr.maxCacheCount", 200);
maxCommitTime = conf.getInt("hbase.solr.maxCommitTime", 60*1); log.info("solr init param"+urlSolr+" "+defaultCollection+" "+zkClientTimeOut+" "+zkConnectTimeOut+" "+maxCacheCount+" "+maxCommitTime);
try {
cache=new Vector<VmMemory>(maxCacheCount);
cacheRowkey = new Vector<String>(maxCacheCount);
cloudSolrClient = new CloudSolrClient(urlSolr);
cloudSolrClient.setDefaultCollection(defaultCollection);
cloudSolrClient.setZkClientTimeout(zkClientTimeOut);
cloudSolrClient.setZkConnectTimeout(zkConnectTimeOut);
//启动定时任务,第�?次延�?10执行,之后每隔指定时间执行�?�?
Timer timer=new Timer();
timer.schedule(new CommitTimer(),10*1000,maxCommitTime*1000);
} catch (Exception ex){
ex.printStackTrace();
}
}
/**
* 批量提交
*/
public void inputDoc(List<VmMemory> vmMoneyList) throws IOException, SolrServerException {
if (vmMoneyList == null || vmMoneyList.size() == 0) {
log.info("==========inputDoc:return========");
return;
}
List<SolrInputDocument> doclist= new ArrayList<SolrInputDocument>(vmMoneyList.size());
for (VmMemory vm : vmMoneyList) {
String id = vm.getId();
String rowkey = vm.getRowkey();
List<String> tags = vm.getTags();
log.info("===id={}===rowkey={}=======",id,rowkey);
Set<String> tagSet = new HashSet<String>();
SolrQuery solrQuery = new SolrQuery();
solrQuery.setQuery("rowkey:"+rowkey);
QueryResponse queryResponse = cloudSolrClient.query(solrQuery);
List<SolrDocument> rowkeys = queryResponse.getResults();
SolrInputDocument document = new SolrInputDocument(); if (null != rowkeys && rowkeys.size() > 0) {
for(SolrDocument solrDocument : rowkeys) {
id = (String)solrDocument.get("id");
rowkey = (String)solrDocument.get("rowkey");
List<String> solrTags = (List<String>)solrDocument.get("tags");
tagSet.addAll(solrTags);
}
}
tagSet.addAll(tags);
document.addField("id", id);
document.addField("rowkey", rowkey);
List<String> tagIds = new ArrayList<String>(tagSet);
for (String tagId : tagIds) {
document.addField("tags", tagId);
}
doclist.add(document);
}
cloudSolrClient.add(doclist);
cloudSolrClient.commit(true, true, true);
} /**
* 单条提交
*/
public void inputDoc(VmMemory vm) throws IOException, SolrServerException {
if (vm == null) {
return;
}
SolrInputDocument doc = new SolrInputDocument();
doc.addField("id", vm.getId());
doc.addField("rowkey", vm.getRowkey());
List<String> tags = vm.getTags();
for (String tag:tags) {
doc.addField("tags", tag);
}
cloudSolrClient.add(doc);
cloudSolrClient.commit(true, true, true);
} public void deleteDoc(List<String> rowkeys) throws IOException, SolrServerException {
if (rowkeys == null || rowkeys.size() == 0) {
return;
}
cloudSolrClient.deleteById(rowkeys);
cloudSolrClient.commit(true, true, true);
} public void deleteDoc(String rowkey) throws IOException, SolrServerException {
cloudSolrClient.deleteById(rowkey);
cloudSolrClient.commit(true, true, true);
} /**
* 添加记录到cache,如果cache达到maxCacheCount,则提交
* addDocToCache会在hbase每次插入数据时将记录插入缓存�?
* 并且判断是否达到上限,如果达到则将缓存内�?用数据提交到solr
*/
public static void addDocToCache(VmMemory vmMemory) {
commitLock.lock();
try {
//判断cache中是否有重复的rowkey,有则先提交
if (cacheRowkey.contains(vmMemory.getRowkey())) {
new SolrWriter().inputDoc(cache);
cache.clear();
cacheRowkey.clear();
}
cache.add(vmMemory);
cacheRowkey.add(vmMemory.getRowkey());
if (cache.size() >= maxCacheCount) {
new SolrWriter().inputDoc(cache);
cache.clear();
cacheRowkey.clear();
}
} catch (Exception ex) {
log.info(ex.getMessage());
} finally {
commitLock.unlock();
}
} /**
* 提交定时�?
* CommitTimer 则会每隔�?段时间提交一次,
* 以保证缓存内�?有数据最终写入solr
*/
static class CommitTimer extends TimerTask {
@Override
public void run() {
commitLock.lock();
try {
if (cache.size() > 0) { //大于0则提�?
log.info("timer commit count:"+cache.size());
new SolrWriter().inputDoc(cache);
cache.clear();
cacheRowkey.clear();
}
} catch (Exception ex) {
log.info(ex.getMessage());
} finally {
commitLock.unlock();
}
}
}
}

协处理器使用步骤如下

1.代码打jar包,并上传至HDFS

2.创建HBase表并添加协处理器,如下

hbase(main):002:0> create 'socialSecurityTest','tags','userInfo'
hbase(main):004:0> disable 'socialSecurityTest'
hbase(main):010:0> alter 'socialSecurityTest',METHOD=>'table_att','coprocessor'=>'hdfs://nameservice/user/solr/hbase/observer/HBaseCoprocessor.jar|com.hbase.coprocessor.HbaseDataSyncSolrObserver|1001|collection=tagCollection'
hbase(main):027:0> enable 'socialSecurityTest'

3.测试

hbase(main):016:0> put 'socialSecurityTest','rowkey-1','tags:0_1','1'

此时,可通过HBase日志查看协处理器的处理情况。

没错误的情况下,Solr中应该已经也有数据了

使用过程中出现的问题

2018-07-11 17:06:14,054 INFO  [LruBlockCacheStatsExecutor] hfile.LruBlockCache: totalSize=417.42 KB, freeSize=395.89 MB, max=396.30 MB, blockCount=0, accesses=0, hits=0, hitRatio=0, cachingAccesses=0, cachingHits=0, cachingHitsRatio=0,evictions=8069, evicted=0, evictedPerRun=0.0
2018-07-11 17:06:23,523 ERROR [RpcServer.FifoWFPBQ.priority.handler=19,queue=1,port=16000] master.MasterRpcServices: Region server node231.qt,16020,1531219308266 reported a fatal error:
ABORTING region server node231.qt,16020,1531219308266: The coprocessor com.hbase.coprocesser.HbaseDataSyncEsObserver threw java.lang.NoClassDefFoundError: org/apache/http/entity/mime/content/ContentBody
Cause:
java.lang.NoClassDefFoundError: org/apache/http/entity/mime/content/ContentBody
at com.hbase.coprocesser.SolrUtil.insert(SolrUtil.java:53)
at com.hbase.coprocesser.HbaseDataSyncEsObserver.postPut(HbaseDataSyncEsObserver.java:79)
at org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost$32.call(RegionCoprocessorHost.java:923)
at org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost$RegionOperation.call(RegionCoprocessorHost.java:1660)
at org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.execOperation(RegionCoprocessorHost.java:1734)
at org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.execOperation(RegionCoprocessorHost.java:1692)
at org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost.postPut(RegionCoprocessorHost.java:919)
at org.apache.hadoop.hbase.regionserver.HRegion.doMiniBatchMutation(HRegion.java:3413)
at org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:2986)
at org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:2928)
at org.apache.hadoop.hbase.regionserver.RSRpcServices.doBatchOp(RSRpcServices.java:748)
at org.apache.hadoop.hbase.regionserver.RSRpcServices.doNonAtomicRegionMutation(RSRpcServices.java:708)
at org.apache.hadoop.hbase.regionserver.RSRpcServices.multi(RSRpcServices.java:2124)
at org.apache.hadoop.hbase.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:32393)
at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2141)
at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:112)
at org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:187)
at org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:167)
Caused by: java.lang.ClassNotFoundException: org.apache.http.entity.mime.content.ContentBody
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:338)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 18 more

两种解决方式

一、将缺少的jar包放入HBase的lib下

二、添加依赖重新打包即可,依赖如下

<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpmime</artifactId>
<version>4.3.2</version>
</dependency> 

pom添加以下内容

<build>
<finalName>SolrTest</finalName> <plugins>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<archive>
<manifest>
<mainClass></mainClass>
</manifest>
</archive>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>

<dependency>

<groupId>org.apache.httpcomponents</groupId>

<artifactId>httpmime</artifactId>

<version>4.3.2</version>

</dependency>

HBase协处理器的使用(添加Solr二级索引)的更多相关文章

  1. Hbase 学习(九) 华为二级索引(原理)

    这个是华为的二级索引方案,已经开放源代码了,下面是网上的一篇讲解原理的帖子,发出来和大家共享一下. 经过本人认真阅读了一下代码,发现这个源码仅供参考,想要集成到原有的集群当中是有点儿难度的,它对hba ...

  2. HBase协处理器同步二级索引到Solr

    一. 背景二. 什么是HBase的协处理器三. HBase协处理器同步数据到Solr四. 添加协处理器五. 测试六. 协处理器动态加载 一. 背景 在实际生产中,HBase往往不能满足多维度分析,我们 ...

  3. 基于Solr实现HBase的二级索引

    文章来源:http://www.open-open.com/lib/view/open1421501717312.html 实现目的: 由于hbase基于行健有序存储,在查询时使用行健十分高效,然后想 ...

  4. CDH使用Solr实现HBase二级索引

      一.为什么要使用Solr做二级索引二.实时查询方案三.部署流程3.1 安装HBase.Solr3.2 增加HBase复制功能3.3创建相应的 SolrCloud 集合3.4 创建 Lily HBa ...

  5. CDH版本Hbase二级索引方案Solr key value index

    概述 在Hbase中,表的RowKey 按照字典排序, Region按照RowKey设置split point进行shard,通过这种方式实现的全局.分布式索引. 成为了其成功的最大的砝码. 然而单一 ...

  6. HBase协处理器同步二级索引到Solr(续)

    一. 已知的问题和不足二.解决思路三.代码3.1 读取config文件内容3.2 封装SolrServer的获取方式3.3 编写提交数据到Solr的代码3.4 拦截HBase的Put和Delete操作 ...

  7. HBase + Solr Cloud实现HBase二级索引

    1. 执行流程 2. Solr Cloud实现 http://blog.csdn.net/u011462328/article/details/53008344 3. HBase实现 1) 自定义Ob ...

  8. hbase基于solr配置二级索引

    一.概述 Hbase适用于大表的存储,通过单一的RowKey查询虽然能快速查询,但是对于复杂查询,尤其分页.查询总数等,实现方案浪费计算资源,所以可以针对hbase数据创建二级索引(Hbase Sec ...

  9. Hbase(三) hbase协处理器与二级索引

    一.协处理器—Coprocessor 1. 起源Hbase 作为列族数据库最经常被人诟病的特性包括:无法轻易建立“二级索引”,难以执 行求和.计数.排序等操作.比如,在旧版本的(<0.92)Hb ...

随机推荐

  1. selenium的元素定位-iframe

    # name = top-frame # 如果iframe有name属性 或 id属性则 可以直接在frame种填写 # 如果没有name和id属性 frame() 可以接受elementOBj el ...

  2. 学习boost::asio一些小例子

    # #include <boost/asio.hpp> #include <boost/thread.hpp> #include <iostream> void h ...

  3. windows下 兼容Python2和Python3

    windows下同时安装了python2和python3时,都可以配置环境变量,如果在命令行里输入python命令,windows会去环境变量里寻找Python的安装位置,如果先找到python2的, ...

  4. A - 士兵队列训练问题

    A - 士兵队列训练问题 Time Limit:1000MS     Memory Limit:32768KB     64bit IO Format:%I64d & %I64u Submit ...

  5. iOS 多线程之 GCD 的基本使用

    什么是GCD 全称Grand Central Dispatch 中枢调度器 纯C语言 提供了很多强大的函数 GCD 的优势 GCD是苹果公司为多核的并行运算提出的解决方案 GCD会自动利用更多的CPU ...

  6. ECMAScript6箭头函数ArrowFunction"=>"

    一.说明 ECMAScript6可以用箭头"=>"定义函数.x => x * x或(x) => {return x * x;}与匿名函数function(x){r ...

  7. <mvc:view-controller path=""/>标签的作用

    <mvc:view-controller path=""/>标签的作用 对应WEB-INF目录下面的JSP页面,我们知道是不能直接使用URL访问到.需要通过转发的方式, ...

  8. 洛谷 P2523 [HAOI2011]Problem c

    洛谷1或洛谷2,它们是一样的题目,手动滑稽- 这一题我是想不出来, 但是我想吐槽一下坐我左边的大佬. 大佬做题的时候,只是想了几分钟,拍了拍大腿,干脆的道:"这不是很显然吗!" 然 ...

  9. python多进程理论

    什么是进程 进程:正在进行的一个过程或者说一个任务.而负责执行任务则是cpu. 举例(单核+多道,实现多个进程的并发执行): 你在一个时间段内有很多任务要做:python学习的任务,赚钱的任务,交女朋 ...

  10. windows server2003/2008中权限账户

    在windows server 2003与windows server 2008 R2中,查看文件夹权限时,尤其是用cacls命令查看时,经常会见nt authority system这样的用户信息. ...