package com.gm.hive.SparkHive;

import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.Optional;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;

import scala.Tuple2;
import scala.reflect.ClassManifestFactory;

public class App {

    private static volatile Broadcast<Map<String, Boolean>> bcMap = null;

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("SparkStreaming");
        JavaSparkContext sc = new JavaSparkContext(conf);
        sc.setLogLevel("ERROR");
        sc.setCheckpointDir("./checkpoint");
        JavaStreamingContext ssc = new JavaStreamingContext(sc, Durations.seconds(10));

        // Seed the broadcast variable: map the current date to true ("new day, state not yet reset").
        Date date = new Date();
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
        Map<String, Boolean> map = new HashMap<String, Boolean>();
        map.put(sdf.format(date), true);
        bcMap = sc.broadcast(map);

        // Kafka parameters; all of these are required, the consumer fails to start without them.
        Map<String, Object> kafkaParams = new HashMap<>();
        kafkaParams.put("bootstrap.servers", "192.168.174.200:9092");
        kafkaParams.put("key.deserializer", StringDeserializer.class);
        kafkaParams.put("value.deserializer", StringDeserializer.class);
        kafkaParams.put("group.id", "newgroup2");
        kafkaParams.put("auto.offset.reset", "latest");
        kafkaParams.put("enable.auto.commit", false);

        Collection<String> topics = Arrays.asList("test");

        // Note: each element of the stream is a ConsumerRecord, not a plain string.
        JavaInputDStream<ConsumerRecord<String, String>> stream = KafkaUtils.createDirectStream(
                ssc, LocationStrategies.PreferConsistent(),
                ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams));

        // Per-batch word count: split each record value on spaces and reduce by key.
        JavaPairDStream<String, Integer> counts = stream
                .flatMap(x -> Arrays.asList(x.value().toString().split(" ")).iterator())
                .mapToPair(x -> new Tuple2<String, Integer>(x, 1))
                .reduceByKey((x, y) -> x + y);
        // counts.print();

        // Refresh the broadcast variable on every batch so the daily flag follows the current date.
        stream.foreachRDD(rdd -> {
            Map<String, Boolean> map1 = bcMap.value();
            Date newDate = new Date();
            SimpleDateFormat newsdf = new SimpleDateFormat("yyyy-MM-dd");
            String newDay = newsdf.format(newDate);
            if (map1.get(newDay) != null) {
                // The current day is already in the map.
                if (bcMap.value().get(newDay)) {
                    // Its flag is still true: flip it to false and publish a new broadcast.
                    map1.put(newDay, false);
                    bcMap = rdd.context().broadcast(map1, ClassManifestFactory.classType(Map.class));
                }
            } else {
                // The current day is not in the map yet: add it with flag true and publish a new broadcast.
                if (bcMap != null) {
                    bcMap.unpersist();
                }
                map1.put(newDay, true);
                bcMap = rdd.context().broadcast(map1, ClassManifestFactory.classType(Map.class));
            }
        });

        JavaPairDStream<String, Integer> result = counts
                .updateStateByKey(new Function2<List<Integer>, Optional<Integer>, Optional<Integer>>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public Optional<Integer> call(List<Integer> values, Optional<Integer> state)
                            throws Exception {
                        /*
                         * values: the values that arrived for this key in the current batch, e.g. [1, 1, 1, 1, 1]
                         * state: the accumulated state of this key before this batch
                         */
                        Integer updateValue = 0;
                        Date newDate = new Date();
                        SimpleDateFormat newsdf = new SimpleDateFormat("yyyy-MM-dd");
                        String newDay = newsdf.format(newDate);
                        Map<String, Boolean> map1 = bcMap.value();
                        if (map1.get(newDay) != null) {
                            if (map1.get(newDay)) {
                                // A new day has just started: discard the old state and count only this batch.
                                for (Integer value : values) {
                                    updateValue += value;
                                }
                            } else {
                                // Same day, earlier batches already counted: accumulate onto the existing state.
                                if (state.isPresent()) {
                                    updateValue = state.get();
                                }
                                for (Integer value : values) {
                                    updateValue += value;
                                }
                            }
                        }
                        return Optional.of(updateValue);
                    }
                });

        // JDBC connection settings for the target PostgreSQL database.
        String url = "jdbc:postgresql://192.168.174.200:5432/postgres?charSet=utf-8";
        Properties connectionProperties = new Properties();
        connectionProperties.put("user", "postgres");
        connectionProperties.put("password", "postgres");
        connectionProperties.put("driver", "org.postgresql.Driver");

        result.print();

        result.foreachRDD(new VoidFunction<JavaPairRDD<String, Integer>>() {

            public void call(JavaPairRDD<String, Integer> rdd) throws Exception {
                JavaRDD<ResultRow> rowRDD = rdd.map(new Function<Tuple2<String, Integer>, ResultRow>() {

                    public ResultRow call(Tuple2<String, Integer> arg0) throws Exception {
                        Date newDate = new Date();
                        SimpleDateFormat newsdf = new SimpleDateFormat("yyyy-MM-dd");
                        String newDay = newsdf.format(newDate);
                        ResultRow rr = new ResultRow();
                        rr.setTypeid(arg0._1 + "_" + newDay);
                        rr.setKczs(arg0._2);
                        return rr;
                    }
                });

                SparkSession spark = SparkSession.builder().config(rdd.context().getConf()).getOrCreate();
                Dataset<Row> data = spark.createDataFrame(rowRDD, ResultRow.class);
                // Append the per-day counts to the kcssqktj table.
                data.write().mode(SaveMode.Append).jdbc(url, "kcssqktj", connectionProperties);
            }
        });

        ssc.start();
        try {
            ssc.awaitTermination();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        ssc.close();
    }
}
package com.gm.hive.SparkHive;

import java.io.Serializable;

public class ResultRow implements Serializable {

    private static final long serialVersionUID = 6681372116317508248L;

    String typeid;
    int kczs;

    public String getTypeid() {
        return typeid;
    }

    public void setTypeid(String typeid) {
        this.typeid = typeid;
    }

    public int getKczs() {
        return kczs;
    }

    public void setKczs(int kczs) {
        this.kczs = kczs;
    }
}
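
The streaming job appends ResultRow records to a PostgreSQL table named kcssqktj, where typeid holds "<word>_<yyyy-MM-dd>" and kczs holds the running count for that day. Spark's JDBC writer will normally create the table on the first write if it is missing, but if you prefer to create it up front, here is a minimal one-off sketch using plain JDBC; the helper class name and the column types are assumptions, not part of the original job:

package com.gm.hive.SparkHive;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

// Hypothetical one-off helper: creates the target table before the streaming job runs.
// Column types are assumed; adjust them to match your schema conventions.
public class CreateResultTable {

    public static void main(String[] args) throws Exception {
        String url = "jdbc:postgresql://192.168.174.200:5432/postgres";
        try (Connection conn = DriverManager.getConnection(url, "postgres", "postgres");
             Statement stmt = conn.createStatement()) {
            // typeid stores "<word>_<yyyy-MM-dd>", kczs stores the count for that word and day.
            stmt.executeUpdate("CREATE TABLE IF NOT EXISTS kcssqktj (typeid VARCHAR(255), kczs INTEGER)");
        }
    }
}
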
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.test</groupId>
<artifactId>kcssqktj_spark</artifactId>
<version>0.0.1-SNAPSHOT</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.22</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.8.0</version>
<exclusions>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>2.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.0.0</version>
<exclusions>
<exclusion>
<artifactId>slf4j-log4j12</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.0.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hive/hive-jdbc -->
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>2.1.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hive/hive-exec -->
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>2.1.1</version>
</dependency>
<dependency>
<groupId>org.postgresql</groupId>
<artifactId>postgresql</artifactId>
<version>9.4-1201-jdbc4</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>2.0.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<shadedArtifactAttached>true</shadedArtifactAttached>
<shadedClassifierName>allinone</shadedClassifierName>
<artifactSet>
<includes>
<include>*:*</include>
</includes>
</artifactSet>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>reference.conf</resource>
</transformer>
<transformer
implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>META-INF/spring.handlers</resource>
</transformer>
<transformer
implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>META-INF/spring.schemas</resource>
</transformer>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<manifestEntries>
<Main-Class></Main-Class>
</manifestEntries>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
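
To smoke-test the whole pipeline locally, push a few space-separated lines into the test topic and watch result.print() on each 10-second batch. Below is a minimal, hypothetical producer sketch; it assumes the kafka-clients jar pulled in transitively by spark-streaming-kafka-0-10 is on the classpath and reuses the broker address and topic name hard-coded in App:

package com.gm.hive.SparkHive;

import java.util.Properties;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;

// Hypothetical helper for testing only: sends a few lines to the "test" topic
// so the streaming job has something to count.
public class TestProducer {

    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "192.168.174.200:9092");
        props.put("key.serializer", StringSerializer.class.getName());
        props.put("value.serializer", StringSerializer.class.getName());

        try (KafkaProducer<String, String> producer = new KafkaProducer<String, String>(props)) {
            producer.send(new ProducerRecord<String, String>("test", "hello spark hello kafka"));
            producer.send(new ProducerRecord<String, String>("test", "hello broadcast"));
            producer.flush();
        }
    }
}

Each distinct word should then show up in the console output as a (word, count) pair and, through the foreachRDD sink, as new rows in kcssqktj keyed by word_<current date>.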
