package com.grady.geomesa

import org.apache.hadoop.conf.Configuration
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.geotools.data.Query
import org.locationtech.geomesa.spark.{GeoMesaSpark, GeoMesaSparkKryoRegistrator, SpatialRDD}
import org.locationtech.geomesa.spark.jts._

import scala.collection.JavaConversions._

object SparkReadGeomesa {

  val GeomesaCatalog = "gradytest"
  val GeomesaCatalogFeature = "student"

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SparkReadGeomesa")
    // The Kryo serializer settings are essential here; without them Spark cannot deserialize the features
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    conf.set("spark.kryo.registrator", classOf[GeoMesaSparkKryoRegistrator].getName)
    val ss = SparkSession.builder().config(conf).getOrCreate().withJTS

    // Approach 1 fails with:
    // java.lang.ClassNotFoundException: org.locationtech.geomesa.hbase.rpc.filter.CqlTransformFilter
    // probably an environment/classpath issue
    // val dataFrame = readGeomesaData(ss)
    // showDataFrame(dataFrame, ss)

    // Approach 2: works
    val spatialRDD = readGeomesaDataToRDD(ss)
    showSpatialRDD(spatialRDD)

    ss.stop()
  }

  /**
   * Approach 1: read the data as a DataFrame
   * @param ss
   * @return
   */
  def readGeomesaData(ss: SparkSession): DataFrame = {
    val params = Map(
      "hbase.zookeepers" -> "10.82.xxx.xx:2181",
      "hbase.catalog" -> GeomesaCatalog)

    val dataFrame = ss.read
      .format("geomesa")
      .options(params)
      .option("geomesa.feature", GeomesaCatalogFeature)
      .load()
    dataFrame
  }

  def showDataFrame(dataFrame: DataFrame, ss: SparkSession): Unit = {
    dataFrame.show()
    println("-----------------------------------")
    dataFrame.createOrReplaceTempView("student")
    val sqlQuery = "select * from student"
    val resultDataFrame = ss.sql(sqlQuery)
    resultDataFrame.show()
  }

  /**
   * Approach 2: read the data as a SpatialRDD
   * @param ss
   * @return
   */
  def readGeomesaDataToRDD(ss: SparkSession): SpatialRDD = {
    val params = Map(
      "hbase.zookeepers" -> "10.82.xxx.xx:2181",
      "hbase.catalog" -> GeomesaCatalog)

    val spatialRDDProvider = GeoMesaSpark(params)
    val query = new Query(GeomesaCatalogFeature)
    val resultRDD = spatialRDDProvider.rdd(new Configuration, ss.sparkContext, params, query)
    resultRDD
  }

  def showSpatialRDD(spatialRDD: SpatialRDD): Unit = {
    spatialRDD.collect().foreach(row => {
      val geom = row.getAttribute("geom").toString
      val name = row.getAttribute("name").toString
      println("name:" + name + " geom: " + geom)
    })
    println("-----------------------------------")
    spatialRDD.collect().foreach(println)
  }
}
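A possible extension, not part of the original post: the GeoTools Query used in Approach 2 also accepts a filter, so the scan can be narrowed on the server side instead of returning every feature. Below is a minimal sketch that would sit inside the same object with the same imports; the method name readFilteredRDD and the predicate name = 'jack' are only illustrative, and ECQL is the GeoTools ECQL helper.

import org.geotools.filter.text.ecql.ECQL

// Sketch only: same connection params as above, but the Query carries an ECQL filter
// so only the matching features are returned.
def readFilteredRDD(ss: SparkSession): SpatialRDD = {
  val params = Map(
    "hbase.zookeepers" -> "10.82.xxx.xx:2181",
    "hbase.catalog" -> GeomesaCatalog)
  val provider = GeoMesaSpark(params)
  val query = new Query(GeomesaCatalogFeature, ECQL.toFilter("name = 'jack'"))
  provider.rdd(new Configuration, ss.sparkContext, params, query)
}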
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>spark-practise</artifactId>
        <groupId>org.example</groupId>
        <version>1.0-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

    <artifactId>geomesa</artifactId>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <geomesa.version>3.1.0</geomesa.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.locationtech.geomesa</groupId>
            <artifactId>geomesa-hbase-spark-runtime-hbase2_2.12</artifactId>
            <version>3.3.0</version>
            <exclusions>
                <exclusion>
                    <groupId>org.apache.hadoop</groupId>
                    <artifactId>hadoop-hdfs</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.locationtech.geomesa</groupId>
            <artifactId>geomesa-spark-core_2.12</artifactId>
            <version>3.3.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-yarn_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>

    <build>
        <resources>
            <resource>
                <directory>src/main/resources</directory>
                <filtering>true</filtering>
            </resource>
        </resources>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <scalaVersion>${scala.version}</scalaVersion>
                </configuration>
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
                        <phase>process-resources</phase>
                        <goals>
                            <goal>add-source</goal>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                    <execution>
                        <id>scala-test-compile</id>
                        <phase>process-test-resources</phase>
                        <goals>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.2.1</version>
                <configuration>
                    <artifactSet>
                        <excludes>
                            <exclude>org.slf4j:*</exclude>
                        </excludes>
                    </artifactSet>
                </configuration>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <createDependencyReducedPom>false</createDependencyReducedPom>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

Run: spark-submit --master yarn --driver-memory=2G --class com.grady.geomesa.SparkReadGeomesa /app/data/appdeploy/geomesa-1.0-SNAPSHOT.jar

It is best to run this on YARN: running it locally can hang when there is not enough memory, which tripped me up a few times.
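An aside that is not in the original code: showSpatialRDD calls collect() twice, so every feature is pulled onto the driver, which does not help when memory is already tight. Below is a minimal sketch of a more driver-friendly variant that would sit alongside showSpatialRDD in the object above; the method name showSpatialRDDSample is mine, and the attribute names match the student feature used here.

def showSpatialRDDSample(spatialRDD: SpatialRDD): Unit = {
  // Keep the work distributed; only bring a small sample back to the driver.
  val rows = spatialRDD.map(sf =>
    (sf.getID, sf.getAttribute("name").toString, sf.getAttribute("geom").toString))
  rows.take(10).foreach(println)
  println("total features: " + rows.count())
}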

Execution log (the insert job was run several times, so there are duplicate rows):
name:jack geom: POINT (11.1 12.1)
name:Lily geom: POINT (12.1 13.1)
name:jack geom: POINT (11.1 12.1)
name:Lily geom: POINT (12.1 13.1)
name:mike geom: POINT (14.1 15.1)
name:jack geom: POINT (11.1 12.1)
name:Lily geom: POINT (12.1 13.1)
name:mike geom: POINT (14.1 15.1)
name:mike geom: POINT (14.1 15.1)
-----------------------------------
ScalaSimpleFeature:000017ed-e5d1-41f8-ae71-84db58b9478f:POINT (11.1 12.1)|1|jack|15
ScalaSimpleFeature:000017ed-e5d1-41f8-a308-efcee8b70bf9:POINT (12.1 13.1)|2|Lily|16
ScalaSimpleFeature:000017ed-e35c-4d77-a841-b3bcf6faa8ac:POINT (11.1 12.1)|1|jack|15
ScalaSimpleFeature:000017ed-e37a-4e60-9d7f-66988be48234:POINT (12.1 13.1)|2|Lily|16
ScalaSimpleFeature:000017ed-e35c-4e9a-8600-97ed8d92c48b:POINT (14.1 15.1)|3|mike|16
ScalaSimpleFeature:000017ed-e37a-4e60-b90f-93fc81e0ab0e:POINT (11.1 12.1)|1|jack|15
ScalaSimpleFeature:000017ed-e35c-4d77-99e7-c6918a06c008:POINT (12.1 13.1)|2|Lily|16
ScalaSimpleFeature:000017ed-e37a-4ebd-b3a5-a9c7399a635b:POINT (14.1 15.1)|3|mike|16
ScalaSimpleFeature:000017ed-e5d1-4257-a75d-b0e23729542e:POINT (14.1 15.1)|3|mike|16
