package com.gm.hive.SparkHive;

import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.Optional;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;

import scala.Tuple2;

public class App {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("streamingTest");
        JavaSparkContext sc = new JavaSparkContext(conf);
        sc.setLogLevel("ERROR");
        // updateStateByKey requires a checkpoint directory
        sc.setCheckpointDir("./checkpoint");
        JavaStreamingContext ssc = new JavaStreamingContext(sc, Durations.seconds(10));

        // Kafka parameters; all of these are required, otherwise createDirectStream fails
        Map<String, Object> kafkaParams = new HashMap<>();
        kafkaParams.put("bootstrap.servers", "192.168.174.200:9092");
        kafkaParams.put("key.deserializer", StringDeserializer.class);
        kafkaParams.put("value.deserializer", StringDeserializer.class);
        kafkaParams.put("group.id", "newgroup2");
        kafkaParams.put("auto.offset.reset", "latest");
        kafkaParams.put("enable.auto.commit", false);

        Collection<String> topics = Arrays.asList("test");

        JavaInputDStream<ConsumerRecord<String, String>> stream = KafkaUtils
                .createDirectStream(ssc, LocationStrategies.PreferConsistent(),
                        ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams));

        // Each element of the stream is a ConsumerRecord; split its value into words and count them
        JavaPairDStream<String, Integer> counts = stream
                .flatMap(record -> Arrays.asList(record.value().split(" ")).iterator())
                .mapToPair(word -> new Tuple2<String, Integer>(word, 1))
                .reduceByKey((x, y) -> x + y);
        // counts.print();

        // Maintain a running total per word across batches
        JavaPairDStream<String, Integer> result = counts
                .updateStateByKey(new Function2<List<Integer>, Optional<Integer>, Optional<Integer>>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Optional<Integer> call(List<Integer> values, Optional<Integer> state) throws Exception {
                        /**
                         * values: the counts for this key in the current batch, e.g. [1,1,1,1,1]
                         * state: the accumulated count of this key from previous batches
                         */
                        Integer updateValue = 0;
                        if (state.isPresent()) {
                            updateValue = state.get();
                        }
                        for (Integer value : values) {
                            updateValue += value;
                        }
                        return Optional.of(updateValue);
                    }
                });

        // Target database connection settings
        String url = "jdbc:postgresql://192.168.174.200:5432/postgres?charSet=utf-8";
        Properties connectionProperties = new Properties();
        connectionProperties.put("user", "postgres");
        connectionProperties.put("password", "postgres");
        connectionProperties.put("driver", "org.postgresql.Driver");

        result.print();
        result.foreachRDD(new VoidFunction<JavaPairRDD<String, Integer>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void call(JavaPairRDD<String, Integer> rdd) throws Exception {
                // Convert each (word, count) pair into a ResultRow bean
                JavaRDD<ResultRow> rowRDD = rdd.map(new Function<Tuple2<String, Integer>, ResultRow>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public ResultRow call(Tuple2<String, Integer> tuple) throws Exception {
                        ResultRow rr = new ResultRow();
                        rr.setTypeid(tuple._1);
                        rr.setKczs(tuple._2);
                        return rr;
                    }
                });
                SparkSession spark = SparkSession.builder().config(rdd.context().getConf()).getOrCreate();
                Dataset<Row> data = spark.createDataFrame(rowRDD, ResultRow.class);
                // Overwrite the result table with the latest accumulated counts
                data.write().mode(SaveMode.Overwrite).jdbc(url, "kcssqktj", connectionProperties);
            }
        });

        ssc.start();
        try {
            ssc.awaitTermination();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        ssc.close();
    }
}
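One thing to note: enable.auto.commit is set to false above, but the job never commits offsets back to Kafka, so a restart relies on auto.offset.reset. The spark-streaming-kafka-0-10 integration exposes the offset ranges of each batch, and a common at-least-once approach is to commit them after the batch has been processed. The sketch below is not part of the original post; it would go inside main() after the stream is created, and it uses the HasOffsetRanges / CanCommitOffsets classes from the same kafka010 package.

// Minimal sketch, assuming the stream variable from main() above. Needs:
// import org.apache.spark.streaming.kafka010.CanCommitOffsets;
// import org.apache.spark.streaming.kafka010.HasOffsetRanges;
// import org.apache.spark.streaming.kafka010.OffsetRange;
stream.foreachRDD(rdd -> {
    // RDDs produced by the direct stream carry the Kafka offset ranges they cover
    OffsetRange[] offsetRanges = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
    // ... process / save the batch here ...
    // Commit the consumed offsets back to Kafka once the batch work is done
    ((CanCommitOffsets) stream.inputDStream()).commitAsync(offsetRanges);
});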
package com.gm.hive.SparkHive;

import java.io.Serializable;

public class ResultRow implements Serializable {
    private static final long serialVersionUID = 6681372116317508248L;

    String typeid; // the word being counted
    int kczs;      // accumulated count for the word

    public String getTypeid() {
        return typeid;
    }

    public void setTypeid(String typeid) {
        this.typeid = typeid;
    }

    public int getKczs() {
        return kczs;
    }

    public void setKczs(int kczs) {
        this.kczs = kczs;
    }
}
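For a quick sanity check of what the streaming job writes, the small helper below (hypothetical, not part of the original post) reads the kcssqktj table back through the same JDBC settings using Spark SQL's standard read().jdbc() API.

package com.gm.hive.SparkHive;

import java.util.Properties;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

// Hypothetical helper: prints the word counts that the streaming job has saved.
public class ResultCheck {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[2]")
                .appName("resultCheck").getOrCreate();
        Properties props = new Properties();
        props.put("user", "postgres");
        props.put("password", "postgres");
        props.put("driver", "org.postgresql.Driver");
        // Same URL and table name as used by App above
        Dataset<Row> saved = spark.read().jdbc(
                "jdbc:postgresql://192.168.174.200:5432/postgres?charSet=utf-8",
                "kcssqktj", props);
        saved.show();
        spark.stop();
    }
}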
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.test</groupId>
<artifactId>kcssqktj_spark</artifactId>
<version>0.0.1-SNAPSHOT</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.22</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.8.0</version>
<exclusions>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>2.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.0.0</version>
<exclusions>
<exclusion>
<artifactId>slf4j-log4j12</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.0.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hive/hive-jdbc -->
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>2.1.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hive/hive-exec -->
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>2.1.1</version>
</dependency>
<dependency>
<groupId>org.postgresql</groupId>
<artifactId>postgresql</artifactId>
<version>9.4-1201-jdbc4</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>2.0.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<shadedArtifactAttached>true</shadedArtifactAttached>
<shadedClassifierName>allinone</shadedClassifierName>
<artifactSet>
<includes>
<include>*:*</include>
</includes>
</artifactSet>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>reference.conf</resource>
</transformer>
<transformer
implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>META-INF/spring.handlers</resource>
</transformer>
<transformer
implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>META-INF/spring.schemas</resource>
</transformer>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<manifestEntries>
<Main-Class></Main-Class>
</manifestEntries>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
