Spark测试代码

测试代码：

 import org.apache.spark.{SparkConf, SparkContext}

 import org.apache.spark.sql.hive.HiveContext

 /**
   * Spark / Hive test driver.
   *
   * Creates a SparkContext and a HiveContext, switches to the `rc_hive_db` database, sets a
   * number of Hive/MapReduce options, creates two Hive tables, then runs a series of small RDD
   * demos (word count, map, flatMap, mapPartitions, mapPartitionsWithIndex, sample, union,
   * intersection, distinct, cartesian, coalesce) and finally loads two temp tables into Hive
   * and prints the result of an inner join between them.
   *
   * NOTE(review): many numeric literals appear to be missing from this listing (for example
   * `sc.parallelize( to )`, `(word, )`, `coalesce(, true)` and the empty `setConf` values).
   * They were presumably lost when the page was extracted; the code does not compile as shown
   * and the original values must be restored from the author's source.
   *
   * Created by Administrator on 2017/1/7.
   */

 object TestMain {

   /**
     * Program entry point; runs every demo in sequence and stops the SparkContext at the end.
     *
     * @param args command-line arguments (not read by this method)
     */
   def main(args: Array[String]): Unit = {

     val conf = new SparkConf().setAppName("Hangzhou_Test")

     //.setMaster("local[1]").setMaster("spark://172.21.7.10:7077").setJars(List("xxx.jar")).set("spark.executor.memory", "10g")

     val sc = new SparkContext(conf)

     val hiveContext = new HiveContext(sc)

     // use rc_hive_db;

     hiveContext.sql("use rc_hive_db")

     // Implicit conversions from the HiveContext; presumably what makes toDF() available on the RDDs below.
     import hiveContext.implicits._

     // NOTE(review): the values of the size-related options below are empty strings here;
     // they look like numeric values that were lost in extraction — confirm against the original.
     hiveContext.setConf("mapred.max.split.size", "")

     hiveContext.setConf("mapred.min.split.size.per.node", "")

     hiveContext.setConf("mapred.min.split.size.per.rack", "")

     hiveContext.setConf("hive.input.format", "org.apache.hadoop.hive.ql.io.CombineHiveInputFormat")

     hiveContext.setConf("hive.merge.mapfiles", "true")

     hiveContext.setConf("hive.merge.mapredfiles", "true")

     hiveContext.setConf("hive.merge.size.per.task", "")

     hiveContext.setConf("hive.merge.smallfiles.avgsize", "")

     hiveContext.setConf("hive.groupby.skewindata", "true")

     // Create the two (id, name) target tables if they do not exist yet.
     hiveContext.sql("create table if not exists tb_id_vs_name(id int,name string)")

     hiveContext.sql("create table if not exists tb_id_vs_name2(id int,name string)")

     println("-------------------------word count:------------------------------------")

     // http://blog.csdn.net/t1dmzks/article/details/70189509

     var words = "When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None."

     // NOTE(review): the second argument of parallelize (after the comma) is missing here.
     val textFile = sc.parallelize(words.split(" "), )

     // NOTE(review): the count paired with each word in the tuple below is missing.
     // NOTE(review): foreach on an RDD runs on the executors, so on a cluster this output
     // presumably lands in executor stdout rather than the driver console — confirm.
     textFile.flatMap(line => line.split(" "))

       .map(word => (word, ))

       .reduceByKey(_ + _)

       .foreach(println)

     println("-------------------------map(func):------------------------------------")

     //    1.map(func)

     // NOTE(review): both bounds of the range are missing in the next line.
     val rdd = sc.parallelize( to ) // create the RDD

     val map = rdd.map(_ * ) // multiply every element of the RDD by 2 (the literal is missing in the code)

     map.foreach(x => print(x + " "))

     println("-------------------------flatMap(func):------------------------------------")

     //    2.flatMap(func)

     // NOTE(review): the lower bound of the range is missing in the next line.
     val fm = rdd.flatMap(x => ( to x)).collect()

     fm.foreach(x => print(x + " "))

     println("-------------------------mapPartitions(func) 1:------------------------------------")

     //    3.mapPartitions(func)

     // Per partition, collect the names (first tuple element) of the entries whose second
     // element is "female". Names are prepended, so each partition yields them in reverse order.
     // NOTE(review): the partition-count argument of parallelize is missing.
     val mp = sc.parallelize(List(("kpop", "female"), ("zorro", "male"), ("mobin", "male"), ("lucy", "female")), ).mapPartitions(x => {

       var woman = List[String]()

       while (x.hasNext) {

         val next = x.next()

         next match {

           case (_, "female") => woman = next._1 :: woman

           case _ =>

         }

       }

       woman.iterator

     })

     /*val mp = rdd.mapPartitionsWithIndex(partitionsFun)*/

     mp.collect.foreach(x => (print(x + " "))) // collect the partition elements into an Array, then print

     println("-------------------------mapPartitions(func) 2:------------------------------------")

     // Same selection as above, expressed with filter + map instead of a manual loop.
     // NOTE(review): the partition-count argument of parallelize is missing.
     sc.parallelize(List(("kpop", "female"), ("zorro", "male"), ("mobin", "male"), ("lucy", "female")), )

       .mapPartitions(x => x.filter(_._2 == "female"))

       .map(x => x._1)

       .foreach(x => (print(x + " ")))

     println("-------------------------mapPartitionsWithIndex(func) :------------------------------------")

     //    4.mapPartitionsWithIndex(func)

     // Like demo 3, but each selected name is prefixed with "[<partition index>]".
     // NOTE(review): the partition-count argument of parallelize is missing.
     sc.parallelize(List(("kpop", "female"), ("zorro", "male"), ("mobin", "male"), ("lucy", "female")), )

       .mapPartitionsWithIndex((index: Int, iter: Iterator[(String, String)]) => {

         var woman = List[String]()

         while (iter.hasNext) {

           val next = iter.next()

           next match {

             case (_, "female") => woman = "[" + index + "]" + next._1 :: woman

             case _ =>

           }

         }

         woman.iterator

       })

       .collect.foreach(x => (print(x + " "))) // collect the partition elements into an Array, then print

     println("-------------------------simple(withReplacement,fraction,seed) :------------------------------------")

     //    5.sample(withReplacement,fraction,seed)
     // NOTE(review): the banner above prints "simple" although the method called is sample.
     // NOTE(review): the seed argument is missing in the call below.

     val sample1 = rdd.sample(true, 0.5, )

     sample1.collect.foreach(x => print(x + " "))

     println("-------------------------union(ortherDataset) :将两个RDD中的数据集进行合并，最终返回两个RDD的并集，若RDD中存在相同的元素也不会去重------------------------------------")

     //    6.union(otherDataset): merges the two RDDs and returns their union; duplicates are not removed
     // NOTE(review): the range bounds of rdd1 and rdd2 are missing.

     val rdd1 = sc.parallelize( to )

     val rdd2 = sc.parallelize( to )

     rdd1.union(rdd2).collect.foreach(x => print(x + " "))

     println("-------------------------union(ortherDataset) :返回两个RDD的交集------------------------------------")

     //    7.intersection(otherDataset): returns the intersection of the two RDDs
     // NOTE(review): the banner above is labelled "union" although this demo calls intersection.

     rdd1.intersection(rdd2).collect.foreach(x => print(x + " "))

     println("-------------------------distinct([numTasks]) :对RDD中的元素进行去重------------------------------------")

     //    8.distinct([numTasks]): removes duplicate elements from the RDD
     // NOTE(review): all list elements are missing in the call below.

     sc.parallelize(List(, , , , , , , )).distinct().collect.foreach(x => print(x + " "))

     println("-------------------------cartesian(otherDataset):对两个RDD中的所有元素进行笛卡尔积操作------------------------------------")

     //    9.cartesian(otherDataset): Cartesian product of all elements of the two RDDs
     // NOTE(review): the range bounds are missing in the call below.

     sc.parallelize( to ).cartesian(sc.parallelize( to )).foreach(x => println(x + " "))

     println("-------------------------coalesce(numPartitions，shuffle):对RDD的分区进行重新分区，shuffle默认值为false,当shuffle=false时，不能增加分区数------------------------------------")

     //    10.coalesce(numPartitions, shuffle): repartitions the RDD; shuffle defaults to false,
     //    and with shuffle=false the number of partitions cannot be increased
     // NOTE(review): range bounds, partition counts and coalesce targets are missing below.

     val coalesceRDD = sc.parallelize( to , ).coalesce() // when shuffle is false the partition count cannot be increased (i.e. it cannot go from 5 to 7)

     println("重新分区后的分区个数:" + coalesceRDD.partitions.size)

     val coalesceRDD2 = sc.parallelize( to , ).coalesce(, true)

     println("重新分区后的分区个数:" + coalesceRDD2.partitions.size)

     println("RDD依赖关系:" + coalesceRDD2.toDebugString)

     println("-------------------------repartition(numPartition):是函数coalesce(numPartition,true)的实现，效果和例9.1的coalesce(numPartition,true)的一样------------------------------------")

     //    11.repartition(numPartition): described by the banner above as equivalent to coalesce(numPartition, true)

     //    12.glom(): converts the elements of type T in each partition of the RDD into an Array[T]

     //    13.randomSplit(weight:Array[Double],seed): splits one RDD into several RDDs according to the weights; a higher weight gives a higher chance of receiving more elements

     println("-------------------------repartition(numPartition)-----------------------------")

     // Build (id, name) rows, expose them as temp table temp_tb_id_vs_name and insert them
     // into the Hive table tb_id_vs_name.
     // NOTE(review): the banner above says "repartition" but no repartition call follows.
     // NOTE(review): every id is missing and every name is an empty string in this listing;
     // the original values were presumably lost in extraction.
     sc.parallelize(List((, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""),

       (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""),

       (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""),

       (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""),

       (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""),

       (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""),

       (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""),

       (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, "")

     )).map(s => (s._1, s._2)).toDF().registerTempTable("temp_tb_id_vs_name")

     hiveContext.sql("insert into tb_id_vs_name select * from temp_tb_id_vs_name")

     // Same procedure for the second, smaller table tb_id_vs_name2.
     sc.parallelize(List((, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, ""), (, "")

     )).map(s => (s._1, s._2)).toDF().registerTempTable("temp_tb_id_vs_name2")

     hiveContext.sql("insert into tb_id_vs_name2 select * from temp_tb_id_vs_name2")

     // Inner-join the two Hive tables on id and print each matching row as "id:name".
     var result = hiveContext.sql("select t10.id as t10_id,t10.name as t10_name from tb_id_vs_name t10 inner join tb_id_vs_name2 t11 on t10.id=t11.id")

     result.map(s => (s.getAs[Int]("t10_id"), s.getAs[String]("t10_name"))).foreach(s => {

       println(s._1 + ":" + s._2)

     })

     sc.stop()

   }

 }

测试结果：

 -------------------------word count:------------------------------------

 -------------------------map(func):------------------------------------

 -------------------------flatMap(func):------------------------------------

                                                        
   -------------------------mapPartitions(func) 1:------------------------------------

 kpop lucy 
   -------------------------mapPartitions(func) 2:------------------------------------

 -------------------------mapPartitionsWithIndex(func) :------------------------------------

 []kpop []lucy -------------------------simple(withReplacement,fraction,seed) :------------------------------------

     
   -------------------------union(ortherDataset) :将两个RDD中的数据集进行合并，最终返回两个RDD的并集，若RDD中存在相同的元素也不会去重------------------------------------

       
   -------------------------union(ortherDataset) :返回两个RDD的交集------------------------------------

  
   -------------------------distinct([numTasks]) :对RDD中的元素进行去重------------------------------------

      
   -------------------------cartesian(otherDataset):对两个RDD中的所有元素进行笛卡尔积操作------------------------------------

 -------------------------coalesce(numPartitions，shuffle):对RDD的分区进行重新分区，shuffle默认值为false,当shuffle=false时，不能增加分区数------------------------------------

 重新分区后的分区个数:

 重新分区后的分区个数:

 RDD依赖关系:() MapPartitionsRDD[] at coalesce at TestMain.scala: []

  |  CoalescedRDD[] at coalesce at TestMain.scala: []

  |  ShuffledRDD[] at coalesce at TestMain.scala: []

  +-() MapPartitionsRDD[] at coalesce at TestMain.scala: []

     |  ParallelCollectionRDD[] at parallelize at TestMain.scala: []

 -------------------------repartition(numPartition):是函数coalesce(numPartition,true)的实现，效果和例9.1的coalesce(numPartition,true)的一样------------------------------------

 -------------------------repartition(numPartition)-----------------------------

Spark测试代码的更多相关文章

Hadoop基础-MapReduce入门篇之编写简单的Wordcount测试代码
Hadoop基础-MapReduce入门篇之编写简单的Wordcount测试代码作者:尹正杰版权声明:原创作品,谢绝转载!否则将追究法律责任. 本文主要是记录一些我在学习MapReduce时的一些 ...
Idea 编写 Spark 示例代码并打包成Jar
说明:本人是在Linux下搭建的单机Spark环境,也是在Linux下使用Idea14.02进行代码编辑 1. 打开IDEA,在欢迎界面从右下角的Configure -> Plugins进入,安 ...
整合Kafka到Spark Streaming——代码示例和挑战
作者Michael G. Noll是瑞士的一位工程师和研究员,效力于Verisign,是Verisign实验室的大规模数据分析基础设施(基础Hadoop)的技术主管.本文,Michael详细的演示了如 ...
.NET单元测试的艺术-3.测试代码
开篇:上一篇我们学习单元测试和核心技术:存根.模拟对象和隔离框架,它们是我们进行高质量单元测试的技术基础.本篇会集中在管理和组织单元测试的技术,以及如何确保在真实项目中进行高质量的单元测试. 系列目录 ...
mysql锁实战测试代码
存储引擎支持的锁定 MyISAM 表级锁 MEMORY 表级锁 InnoDB 行级锁 BDB 页面锁表级锁:开销小,加锁快:不会出现死锁:锁定粒度大,发生锁冲突的概率最高,并发度最低.行级锁:开销 ...
使用Microsoft Fakes隔离测试代码
在单元测试(Unit Test)中我们遇到的问题之一是:假如被测试组件(类或项目)为A,组件A依赖于组件B,那么在组件A的单元测试ATest中测试A时,也需要依赖于B,在B发生改动后,就可能影响到A的 ...
iOS开发：XCTest单元测试（附上一个单例的测试代码）
测试驱动开发并不是一个很新鲜的概念了.在我最开始学习程序编写时,最喜欢干的事情就是编写一段代码,然后运行观察结果是否正确.我所学习第一门语言是c语言,用的最多的是在算法设计上,那时候最常做的事情就是编 ...
在内核中异步请求设备固件firmware的测试代码
在内核中异步请求设备固件firmware的测试代码 static void ghost_load_firmware_callback(const struct firmware *fw, void * ...
x264测试代码
建立一个工程,将头文件,库文件加载到工程,测试代码如下:#include <iostream>#include <string>#include "stdint.h& ...

随机推荐

Linux find用法
Linux中find常见用法示例 ----摘抄哪里忘记了 ·find path -option [ -print ] [ -exec -ok command ] {} ...
剑指Offer-按之字形顺序打印二叉树
package Tree; import java.util.ArrayList; import java.util.LinkedList; import java.util.Queue; /** * ...
Java后台模拟发送http的get和post请求，并测试
个人学习使用:谨慎参考 1 Client类 import com.thoughtworks.gauge.Step; import com.thoughtworks.gauge.Table; impor ...
phpStorm安装方法
1)下载 http://big2.h5gamen.com/soft/jetbrainscrack-2.6.2.zip 放到phpstorm安装目录下的lib文件夹如放到f盘 F:\PhpStorm ...
单元测试er——为什么真的真的要写单元测试
优点为什么很多技术或者知识要说优点?因为有些道理看着很简单,大家表面上都觉得对,但是做的时候又不去做或者做不到.其中有一个很重要原因是骨子里或者潜意识并没有真实觉得这是对的,一旦想去做的时候同时会冒 ...
OpenStreetMap、googleMap等经纬度和行列号之间相互转化
# OpenStreetMap经纬度转行列号 def deg2num(lat_deg, lon_deg, zoom): lat_rad = math.radians(lat_deg) n = 2.0 ...
Ajax教程（转载）
第 1 页 Ajax 简介Ajax 由 HTML.JavaScript™ 技术.DHTML 和 DOM 组成,这一杰出的方法可以将笨拙的 Web 界面转化成交互性的 Ajax 应用程序.本文的作者是一 ...
scrapy---callback 传递自定义参数
在scrapy提交一个链接请求是用 Request(url,callback=func) 这种形式的,而parse只有一个response参数,如果自定义一个有多参数的parse可以考虑用下面的方法实 ...
设计模式之外观模式详解（Service第三者插足，让action与dao分手）
作者:zuoxiaolong8810(左潇龙),转载请注明出处,特别说明:本博文来自博主原博客,为保证新博客中博文的完整性,特复制到此留存,如需转载请注明新博客地址即可. 各位好,LZ今天给各位分享一 ...
<经验杂谈>介绍Js简单的递归排列组合
最近在开发SKU模块的时候,遇到这样一个需求,某种商品有N(用未知数N来表示是因为规格的数组由用户制定且随时可以编辑的,所以对程序来说,它是一个未知数)类规格,每一类规格又有M个规格值,各种规格值的组 ...

Spark测试代码

Spark测试代码的更多相关文章

随机推荐

热门专题