在最新的master分支上官方提供了Spark JDBC外部数据源的实现,先尝为快。

通过spark-shell测试

import org.apache.spark.sql.SQLContext
val sqlContext = new SQLContext(sc)
import sqlContext._ val TBLS_JDBC_DDL = s"""
|CREATE TEMPORARY TABLE spark_tbls
|USING org.apache.spark.sql.jdbc
|OPTIONS (
| url 'jdbc:mysql://hadoop000:3306/hive?user=root&password=root',
| dbtable 'TBLS'
|)""".stripMargin sqlContext.sql(TBLS_JDBC_DDL)

指定列查询:

sql("SELECT * FROM spark_tbls").collect.foreach(println)
[1,1423100397,1,0,spark,0,1,page_views,MANAGED_TABLE,A,D]
[6,1423116106,1,0,spark,0,6,order_created,MANAGED_TABLE,B,E]
[7,1423116131,1,0,spark,0,7,test_load1,MANAGED_TABLE,C,F]
[8,1423116145,1,0,spark,0,8,order_picked,MANAGED_TABLE,null,null]
[9,1423116160,1,0,spark,0,9,order_shipped,MANAGED_TABLE,null,null]
[10,1423116168,1,0,spark,0,10,order_received,MANAGED_TABLE,null,null]
[11,1423116179,1,0,spark,0,11,order_cancelled,MANAGED_TABLE,null,null]
[12,1423116193,1,0,spark,0,12,order_tracking,MANAGED_TABLE,null,null]
[13,1423116248,1,0,spark,0,13,order_tracking_join,MANAGED_TABLE,null,null]
[14,1423116298,1,0,spark,0,14,click_log,MANAGED_TABLE,null,null]
[15,1423116316,1,0,spark,0,15,ad_list,MANAGED_TABLE,null,null][16,1423116324,1,0,spark,0,16,ad_list_string,MANAGED_TABLE,null,null]
[17,1423116338,1,0,spark,0,17,cookie_cats,MANAGED_TABLE,null,null]

查询表中指定列:

sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE FROM spark_tbls").collect.foreach(println)
[1,page_views,MANAGED_TABLE]
[6,order_created,MANAGED_TABLE]
[7,test_load1,MANAGED_TABLE]
[8,order_picked,MANAGED_TABLE]
[9,order_shipped,MANAGED_TABLE]
[10,order_received,MANAGED_TABLE]
[11,order_cancelled,MANAGED_TABLE]
[12,order_tracking,MANAGED_TABLE]
[13,order_tracking_join,MANAGED_TABLE]
[14,click_log,MANAGED_TABLE]
[15,ad_list,MANAGED_TABLE]
[16,ad_list_string,MANAGED_TABLE]
[17,cookie_cats,MANAGED_TABLE]

指定查询条件查询:

sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE FROM spark_tbls WHERE TBL_ID = 1").collect.foreach(println)
[1,page_views,MANAGED_TABLE] sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE FROM spark_tbls WHERE TBL_ID < 7").collect.foreach(println)
[1,page_views,MANAGED_TABLE]
[6,order_created,MANAGED_TABLE] sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE FROM spark_tbls WHERE TBL_ID <= 7").collect.foreach(println)
[1,page_views,MANAGED_TABLE]
[6,order_created,MANAGED_TABLE]
[7,test_load1,MANAGED_TABLE] sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE FROM spark_tbls WHERE TBL_ID > 7").collect.foreach(println)
[8,order_picked,MANAGED_TABLE]
[9,order_shipped,MANAGED_TABLE]
[10,order_received,MANAGED_TABLE]
[11,order_cancelled,MANAGED_TABLE]
[12,order_tracking,MANAGED_TABLE]
[13,order_tracking_join,MANAGED_TABLE]
[14,click_log,MANAGED_TABLE]
[15,ad_list,MANAGED_TABLE]
[16,ad_list_string,MANAGED_TABLE]
[17,cookie_cats,MANAGED_TABLE] sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE FROM spark_tbls WHERE TBL_ID >= 7").collect.foreach(println)
[7,test_load1,MANAGED_TABLE]
[8,order_picked,MANAGED_TABLE]
[9,order_shipped,MANAGED_TABLE]
[10,order_received,MANAGED_TABLE]
[11,order_cancelled,MANAGED_TABLE]
[12,order_tracking,MANAGED_TABLE]
[13,order_tracking_join,MANAGED_TABLE]
[14,click_log,MANAGED_TABLE]
[15,ad_list,MANAGED_TABLE]
[16,ad_list_string,MANAGED_TABLE]
[17,cookie_cats,MANAGED_TABLE] sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE,VIEW_EXPANDED_TEXT FROM spark_tbls WHERE VIEW_EXPANDED_TEXT IS NULL").collect.foreach(println)
[8,order_picked,MANAGED_TABLE,null]
[9,order_shipped,MANAGED_TABLE,null]
[10,order_received,MANAGED_TABLE,null]
[11,order_cancelled,MANAGED_TABLE,null]
[12,order_tracking,MANAGED_TABLE,null]
[13,order_tracking_join,MANAGED_TABLE,null]
[14,click_log,MANAGED_TABLE,null]
[15,ad_list,MANAGED_TABLE,null]
[16,ad_list_string,MANAGED_TABLE,null]
[17,cookie_cats,MANAGED_TABLE,null] sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE,VIEW_EXPANDED_TEXT FROM spark_tbls WHERE VIEW_EXPANDED_TEXT IS NOT NULL").collect.foreach(println)
[1,page_views,MANAGED_TABLE,A]
[6,order_created,MANAGED_TABLE,B]
[7,test_load1,MANAGED_TABLE,C] sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE,VIEW_EXPANDED_TEXT FROM spark_tbls WHERE TBL_ID>=7 AND TBL_ID <=10").collect.foreach(println)
[7,test_load1,MANAGED_TABLE,C]
[8,order_picked,MANAGED_TABLE,null]
[9,order_shipped,MANAGED_TABLE,null]
[10,order_received,MANAGED_TABLE,null]

多partition并行执行: 可以通过http://hadoop000:4040/jobs/的tasks数查看

val TBLS_PARTS_JDBC_DDL = s"""
|CREATE TEMPORARY TABLE spark_tbls_parts
|USING org.apache.spark.sql.jdbc
|OPTIONS (
| url 'jdbc:mysql://hadoop000:3306/hive?user=root&password=root',
| dbtable 'TBLS',
| partitionColumn 'TBL_ID',
| lowerBound '',
| upperBound '',
| numPartitions ''
|)""".stripMargin sqlContext.sql(TBLS_PARTS_JDBC_DDL)
sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE,VIEW_EXPANDED_TEXT FROM spark_tbls_parts WHERE VIEW_EXPANDED_TEXT IS NULL").collect.foreach(println)
[8,order_picked,MANAGED_TABLE,null]
[9,order_shipped,MANAGED_TABLE,null]
[10,order_received,MANAGED_TABLE,null]
[11,order_cancelled,MANAGED_TABLE,null]
[12,order_tracking,MANAGED_TABLE,null]
[13,order_tracking_join,MANAGED_TABLE,null]
[14,click_log,MANAGED_TABLE,null]
[15,ad_list,MANAGED_TABLE,null]
[16,ad_list_string,MANAGED_TABLE,null]
[17,cookie_cats,MANAGED_TABLE,null]
[21,emp,MANAGED_TABLE,null]
[22,dept,MANAGED_TABLE,null]

多表关联查询:

val COLUMNS_V2_JDBC_DDL = s"""
|CREATE TEMPORARY TABLE spark_column_v2
|USING org.apache.spark.sql.jdbc
|OPTIONS (
| url 'jdbc:mysql://hadoop000:3306/hive?user=root&password=root',
| dbtable 'COLUMNS_V2'
|)""".stripMargin sqlContext.sql(COLUMNS_V2_JDBC_DDL)
sql("SELECT CD_ID, COLUMN_NAME FROM spark_column_v2").collect.foreach(println)
[1,city_id]
[1,end_user_id]
[1,ip]
[1,referer]
[1,session_id]
[1,track_time]
[1,url]
[6,event_time]
[6,ordernumber]
[7,id]
[7,name]
[8,event_time]
[8,ordernumber]
[9,event_time]
[9,ordernumber]
[10,event_time]
[10,ordernumber]
[11,event_time]
[11,ordernumber]
[12,order_cancelled_ts]
[12,order_created_ts]
[12,order_picked_ts]
[12,order_received_ts]
[12,order_shipped_ts]
[12,ordernumber]
[13,order_cancelled_ts]
[13,order_created_ts]
[13,order_picked_ts]
[13,order_received_ts]
[13,order_shipped_ts]
[13,ordernumber]
[14,ad_id]
[14,cookie_id]
[14,ts]
[15,ad_id]
[15,catalogs]
[15,url]
[16,ad_id]
[16,catalogs]
[16,url]
[17,catalog]
[17,cookie_id]
[17,weight]
[21,comm]
[21,deptno]
[21,empno]
[21,ename]
[21,hiredate]
[21,job]
[21,mgr]
[21,sal]
[22,deptno]
[22,dname]
[22,loc] sql("SELECT a.TBL_ID, a.TBL_NAME, a.TBL_TYPE, b.CD_ID, b.COLUMN_NAME FROM spark_tbls a join spark_column_v2 b on a.TBL_ID = b.CD_ID WHERE a.TBL_ID = 1").collect.foreach(println)
[1,page_views,MANAGED_TABLE,1,city_id]
[1,page_views,MANAGED_TABLE,1,end_user_id]
[1,page_views,MANAGED_TABLE,1,ip]
[1,page_views,MANAGED_TABLE,1,referer]
[1,page_views,MANAGED_TABLE,1,session_id]
[1,page_views,MANAGED_TABLE,1,track_time]
[1,page_views,MANAGED_TABLE,1,url] sql("SELECT a.TBL_ID, COUNT(b.CD_ID) FROM spark_tbls a join spark_column_v2 b on a.TBL_ID = b.CD_ID GROUP BY a.TBL_ID").collect.foreach(println)
[1,7]
[6,2]
[7,2]
[8,2]
[9,2]
[10,2]
[11,2]
[12,6]
[13,6]
[14,3]
[15,3]
[16,3]
[17,3]
[21,8]
[22,3]

通过spark-sql测试

CREATE TEMPORARY TABLE spark_tbls
USING org.apache.spark.sql.jdbc
OPTIONS (
url 'jdbc:mysql://hadoop000:3306/hive?user=root&password=root',
dbtable 'TBLS'
);
SELECT * FROM spark_tbls;

CREATE TEMPORARY TABLE spark_tbls_parts
USING org.apache.spark.sql.jdbc
OPTIONS (
url 'jdbc:mysql://hadoop000:3306/hive?user=root&password=root',
dbtable 'TBLS',
partitionColumn 'TBL_ID',
lowerBound '',
upperBound '',
numPartitions ''
);
SELECT * FROM spark_tbls_parts;

CREATE TEMPORARY TABLE spark_column_v2
USING org.apache.spark.sql.jdbc
OPTIONS (
url 'jdbc:mysql://hadoop000:3306/hive?user=root&password=root',
dbtable 'COLUMNS_V2'
);
select * from spark_column_v2;
SELECT a.TBL_ID, a.TBL_NAME, a.TBL_TYPE, b.CD_ID, b.COLUMN_NAME FROM spark_tbls a join spark_column_v2 b on a.TBL_ID = b.CD_ID WHERE a.TBL_ID = 1

Spark SQL External Data Sources JDBC官方实现读测试的更多相关文章

  1. Spark SQL External Data Sources JDBC官方实现写测试

    通过Spark SQL External Data Sources JDBC实现将RDD的数据写入到MySQL数据库中. jdbc.scala重要API介绍: /** * Save this RDD ...

  2. Spark SQL External Data Sources JDBC简易实现

    在spark1.2版本中最令我期待的功能是External Data Sources,通过该API可以直接将External Data Sources注册成一个临时表,该表可以和已经存在的表等通过sq ...

  3. Spark SQL 之 Data Sources

    #Spark SQL 之 Data Sources 转载请注明出处:http://www.cnblogs.com/BYRans/ 数据源(Data Source) Spark SQL的DataFram ...

  4. Spark(3) - External Data Source

    Introduction Spark provides a unified runtime for big data. HDFS, which is Hadoop's filesystem, is t ...

  5. Spark SQL External DataSource简介

    随着Spark1.2的发布,Spark SQL开始正式支持外部数据源.这使得Spark SQL支持了更多的类型数据源,如json, parquet, avro, csv格式.只要我们愿意,我们可以开发 ...

  6. How to: Provide Credentials for the Dashboards Module when Using External Data Sources

    XAF中使用dashboard模块时,如果使用了sql数据源,可以使用此方法提供连接信息 https://www.devexpress.com/Support/Center/Question/Deta ...

  7. 【转载】Spark SQL之External DataSource外部数据源

    http://blog.csdn.net/oopsoom/article/details/42061077 一.Spark SQL External DataSource简介 随着Spark1.2的发 ...

  8. Apache Spark 2.2.0 中文文档 - Spark SQL, DataFrames and Datasets Guide | ApacheCN

    Spark SQL, DataFrames and Datasets Guide Overview SQL Datasets and DataFrames 开始入门 起始点: SparkSession ...

  9. What’s new for Spark SQL in Apache Spark 1.3(中英双语)

    文章标题 What’s new for Spark SQL in Apache Spark 1.3 作者介绍 Michael Armbrust 文章正文 The Apache Spark 1.3 re ...

随机推荐

  1. SpringMVC进阶

    1.springmvc(注解版本) 注解扫描 <?xml version="1.0" encoding="UTF-8"?> <beans xm ...

  2. git学习(这个我没有整理,是我不断在学习的过程中,自己总结的,对象是我,不过有问题的,我们可以相互交流)

    每次git提交,都会有一个parent指针,指向上一次的commit ,   如果合并,master就和hotfix河道一起,就直接删除hotfix就OK     此时,虽然操作一样,大底层实现不一样 ...

  3. Xcode5 运行程序 提示IOS 模拟器未能安装此应用程序

    更新了Xcode5,结果模拟器各种不配合,首先遇到的问题就是提示“IOS 模拟器未能安装此应用程序” 上网查了一下,网友给出的解决办法“删除~/Library/Application Support/ ...

  4. 浏览器兼容 copyToClipboard("拷贝内容")

    function copyToClipboard(txt) { if (window.clipboardData) { window.clipboardData.clearData(); clipbo ...

  5. HttpContext.Current.Cache 和HttpRuntime.Cache的区别

    先看MSDN上的解释:      HttpContext.Current.Cache:为当前 HTTP 请求获取Cache对象.      HttpRuntime.Cache:获取当前应用程序的Cac ...

  6. Xml Schema:C#访问在complextype中插入新元素

    最近用c#写Xml Schema代码,找了很久也找不到如何在已有的complextype中插入新的element,最后我充分发挥自己的聪明才智,哈哈,终于从...中找到了灵感. XmlSchemaSe ...

  7. Welcome to LED Control Wiki

    About this project This project was developed after I had to find out that controlling my RGB ambien ...

  8. LintCode Maximum Depth of Binary Tree

    1 /** * Definition of TreeNode: * public class TreeNode { * public int val; * public TreeNode left, ...

  9. xmimd的第十天笔记

  10. Java多线程之ConcurrentSkipListMap深入分析(转)

    Java多线程之ConcurrentSkipListMap深入分析   一.前言 concurrentHashMap与ConcurrentSkipListMap性能测试 在4线程1.6万数据的条件下, ...