Python+SparkStreaming+kafka+写入本地文件案例(可执行)
从 Kafka 中读取指定的 topic,根据消息内容的不同,将记录写入不同的文件中。
文件按照日期区分。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/4/9 11:49
# @Author : baoshan
# @Site :
# @File : readTraceFromKafkaStreamingToJson.py
# @Software: PyCharm Community Edition from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
import datetime
import json
import time
from collections import defaultdict
import subprocess


class KafkaMessageParse:
    """Extract and flatten trace records coming off the Kafka stream.

    Each Kafka record is a ``(key, value)`` tuple whose value is a
    newline-separated batch of lines; each decoded line is a dict carrying a
    JSON document under its ``message`` key.  That JSON is flattened here
    into a compact JSON string, with different fields depending on record
    type ("speech" vs "tts" in the ``name`` field).
    """

    def extractFromKafka(self, kafkainfo):
        """Return the value part of a Kafka ``(key, value)`` tuple, else None."""
        if type(kafkainfo) is tuple and len(kafkainfo) == 2:
            return kafkainfo[1]
        return None

    def lineFromLines(self, lines):
        """Split a non-empty payload into individual lines (None otherwise)."""
        if lines is not None and len(lines) > 0:
            return lines.strip().split("\n")
        return None

    def messageFromLine(self, line):
        """Pull the 'message' field out of one decoded line dict."""
        # 'in line' instead of 'in line.keys()' — same test, no temp view
        if line is not None and "message" in line:
            return line.get("message")
        return None

    def _copy_span_fields(self, jline, result):
        """Copy the span bookkeeping fields shared by every record type."""
        for key in ("trace_id", "parent_id", "span_id", "name",
                    "sa", "sr", "ss", "ret"):
            result[key] = jline.get(key)

    def _parse_nlp_annotation(self, anno, result):
        """Unpack asr/nlp/action payloads from one 'nlp' annotation entry."""
        debug_log_anno = anno.get("debug_log")[0]
        asr = debug_log_anno.get("asr")  # asr
        nlp = debug_log_anno.get("nlp")
        # 'action' is itself a JSON string nested inside the annotation
        jaction = json.loads(debug_log_anno.get("action"))
        response = jaction.get("response")
        result['tts'] = (response.get("action").get("directives")[0]
                         .get("item").get("tts"))
        jnlp = json.loads(nlp)  # 'nlp' is a nested JSON string too
        result['app_id'] = jnlp.get('appId')
        result['intent'] = jnlp.get('intent')
        result['cloud'] = jnlp.get("cloud")
        result['asr'] = asr
        result['nlp'] = nlp
        result['slots'] = jnlp.get("slots")

    def _parse_speech(self, jline, result):
        """Enrich *result* with NLP annotation + session info for speech records.

        Best effort: on any failure the fields gathered so far are still
        serialized and returned.  (The original code intended this but hit a
        NameError — ``return strmessage`` in the except clause before
        ``strmessage`` was ever assigned — which the outer handler silently
        collapsed into ``''``.)
        """
        try:
            for anno in jline.get("annotation"):
                if anno.get("name") == "nlp":
                    self._parse_nlp_annotation(anno, result)
            debug_log0 = jline.get("debug_log")[0]
            session_id = debug_log0.get("session_id")
            codec = debug_log0.get("codec")
            if not session_id:
                session_id = ""  # 超级无敌重要 (critically important)
            result['session_id'] = session_id
            result['device_id'] = debug_log0.get("device_id")
            result['device_key'] = debug_log0.get("device_key")
            result['device_type'] = debug_log0.get("device_type")
            result['wavfile'] = session_id + ".wav"
            result['codecfile'] = session_id + "." + codec
            # session ids end in "-<epoch seconds>"; fall back to "now"
            asrtimestr = session_id.split("-")[-1]
            try:
                st = time.localtime(float(asrtimestr))
            except (TypeError, ValueError):
                st = time.localtime()
            result['thedate'] = time.strftime("%Y-%m-%d %H:%M:%S", st)
            result['asrthedate'] = time.strftime("%Y%m%d", st)
        except Exception:
            pass  # best effort: keep whatever was collected so far
        return json.dumps(result, ensure_ascii=False)

    def _parse_tts(self, jline, result):
        """Enrich *result* with TTS debug fields; None when parsing fails."""
        try:
            debug_log_tts = jline.get("debug_log")[0]
            for key in ("text", "codec", "declaimer", "logs", "params"):
                result[key] = debug_log_tts.get(key)
            return json.dumps(result, ensure_ascii=False)
        except Exception:
            return None

    def extractFromMessage(self, message):
        """Flatten one trace JSON document into a compact JSON string.

        Returns a JSON string for "speech" and "tts" records, None for a
        "tts" record that cannot be parsed (or an unrecognized name), and
        '' when the message itself is unparseable.
        """
        try:
            jline = json.loads(message)
            result = {}
            name = jline.get("name")
            # span bookkeeping fields are identical for both record types
            self._copy_span_fields(jline, result)
            if "speech" in name:
                return self._parse_speech(jline, result)
            elif "tts" in name:  # tts
                return self._parse_tts(jline, result)
        except Exception:
            return ''
def tpprint(val, num=10000, outdir="/mnt/data/trace"):
    """Print and persist the first *num* elements of each RDD in *val*.

    Records (JSON strings) are appended to per-day files under *outdir*,
    routed by the record's ``name`` field ("speech" / "tts" / other).

    @param val: the DStream whose generated RDDs are dumped.
    @param num: the number of elements from the first will be printed.
    @param outdir: directory receiving the trace.rt.* files
                   (defaults to the original hard-coded path).
    """
    def takeAndPrint(batch_time, rdd):
        taken = rdd.take(num + 1)
        print("########################")
        print("Time: %s" % batch_time)
        print("########################")
        today = datetime.datetime.now().strftime('%Y%m%d')
        # context managers guarantee the three files are closed even when a
        # record fails to decode (the original leaked handles on exceptions)
        with open(outdir + "/trace.rt.speech." + today, "a") as speechfile, \
             open(outdir + "/trace.rt.tts." + today, "a") as ttsfile, \
             open(outdir + "/trace.rt.other." + today, "a") as otherfile:
            for record in taken[:num]:
                if record is not None and len(record) > 2:  # skip None/empty
                    print(record)
                    name = json.loads(record).get("name")
                    if "speech" in name:
                        speechfile.write(str(record) + "\n")
                    elif "tts" in name:
                        ttsfile.write(str(record) + "\n")
                    else:
                        otherfile.write(str(record) + "\n")
        if len(taken) > num:
            print("...")

    val.foreachRDD(takeAndPrint)
if __name__ == '__main__':
    import ast  # local import: safe literal parsing replaces eval() below

    # --- connection / job parameters --------------------------------------
    zkQuorum = 'datacollect-1:2181,datacollect-2:2181,datacollect-3:2181'
    # topic name -> number of consumer threads per receiver
    topic = {'trace-open-gw-5': 1, 'trace-open-gw-6': 1, 'trace-open-gw-7': 1, 'trace-open-gw-8': 1, 'trace-open-gw-9': 1}
    groupid = "rokid-trace-rt"
    master = "local[*]"
    appName = "SparkStreamingRokidTrace"
    timecell = 5  # batch interval in seconds

    sc = SparkContext(master=master, appName=appName)
    ssc = StreamingContext(sc, timecell)

    # --- pipeline: kafka -> extract -> parse -> dump to local files -------
    kvs = KafkaUtils.createStream(ssc, zkQuorum, groupid, topic)
    kmp = KafkaMessageParse()
    lines = kvs.map(lambda x: kmp.extractFromKafka(x))
    lines1 = lines.flatMap(lambda x: kmp.lineFromLines(x))
    # each line is the repr of a dict; ast.literal_eval parses it without
    # the arbitrary-code-execution risk of eval() on untrusted Kafka data
    valuedict = lines1.map(lambda x: ast.literal_eval(x))
    message = valuedict.map(lambda x: kmp.messageFromLine(x))
    rdd2 = message.map(lambda x: kmp.extractFromMessage(x))  # result is a json str
    tpprint(rdd2)

    ssc.start()
    ssc.awaitTermination()
还请各位大仙不吝赐教!
Python+SparkStreaming+kafka+写入本地文件案例(可执行)的更多相关文章
- python 利用 ogr 写入shp文件,数据格式
python 利用 ogr 写入 shp 文件, 定义shp文件中的属性字段(field)的数据格式为: OFTInteger # 整型 OFTIntegerList # 整型list OFTReal ...
- python读取数据库并把数据写入本地文件
一,介绍 上周用jmeter做性能测试时,接口B传入的参数需要依赖接口A生成的借贷申请ID,接口A运行完需要把生成的借贷申请ID导出来到一个文件,作为参数传给接口B,刚开始的时候,手动去数据库倒, 倒 ...
- python开发_搜索本地文件信息写入文件
功能:#在指定的盘符,如D盘,搜索出与用户给定后缀名(如:jpg,png)相关的文件 #然后把搜索出来的信息(相关文件的绝对路径),存放到用户指定的 #文件(如果文件不存在,则建立相应的文件)中 之前 ...
- python读取和写入csv文件
读取csv文件: def readCsv(): rows=[] with file(r'E:\py\py01\Data\system.csv','rb') as f: reads=csv.reader ...
- OC 将NSString写入本地文件
最近在公司偶尔遇到一些不经常复现的bug,为了调试,只好把关键值记录到本地文件中,在遇到问题时,调出本地文件查看一下就可以很方便的知道是不是代码逻辑的错误或者问题考虑不够周全了. 废话不多说,流程在代 ...
- java获取网页源代码并写入本地文件中
import java.io.*; import java.net.*; public class URLDemo { public static void main(String args[]){ ...
- python读取并写入mat文件
用matlab生成一个示例mat文件: clear;clc matrix1 = magic(5); matrix2 = magic(6); save matData.mat 用python3读取并写入 ...
- python+selenium上传本地文件
迅雷号自媒体视频文件自动上传,贴标签发布 难点 本地文件上传,通过send_keys(‘文件路径’)的方式实现上传的目的 文件名通过正则匹配的方式进行处理,主要匹配出中文标题名称 处理过程中文件名称中 ...
- 如何将Python对象保存在本地文件中?
Python对象的永久存储 1.使用Python的pickle模块 import pickle class A: def __init__(self,name,a): self.name=name s ...
随机推荐
- 个人常用eclipse快捷键,不定期更新
ctrl+f11 ==> runctrl+h ==> 全文检索main+enter ==>public static void main(String[] args) { } alt ...
- 树莓派进阶之路 (024) - windows远程桌面连接树莓派通过xrdp服务(转)
本文转载:http://www.cnblogs.com/edgexie/p/6527992.html 在网上看到很多关于windows远程桌面连接树莓派的教程.我也按照教程试过了,遇到了几个坑.特意记 ...
- System.ComponentModel.DataAnnotations.Schema 冲突
System.ComponentModel.DataAnnotations.Schema 冲突 Entity Framework 与 .net4.5 的 System.ComponentModel.D ...
- 解决UEditor将div标签换成p标签的问题
原文链接 将设计排版好的页面html代码上传到数据库,再读取出来的时候发现所有的div都被替换成了p标签. 解决方法: 首先在ueditor.all.js文件内搜索allowDivTransToP,找 ...
- 《JAVA与模式》之原型模式(转载)
原型模式其实就是java的拷贝机制 原文出处:http://blog.csdn.net/zhengzhb/article/details/7393528 定义:用原型实例指定创建对象的种类,并通过 ...
- 分布式配置 tachyon 并执行Hadoop样例 MapReduce
----------此文章.笔者按着tachyon官网教程进行安装并记录. (本地安装tachyon具体解释:http://blog.csdn.net/u012587561/article/detai ...
- 关于PKCS的文档资料
关于PKCS的文档资料,在这里查找: http://www.emc.com/emc-plus/rsa-labs/standards-initiatives/public-key-cryptograph ...
- Python 文件 read() 方法
概述 Python 文件 read() 方法用于从文件中读取指定的字符数,如果未给定或为负则读取所有. 语法 read() 方法语法如下: fileObject.read([size]) 参数 siz ...
- [转]OkHttp使用完全教程
1. 历史上Http请求库优缺点 在讲述OkHttp之前, 我们看下没有OkHttp的时代, 我们是如何完成http请求的.在没有OkHttp的日子, 我们使用HttpURLConnection或者H ...
- php实现ZIP压缩文件解压缩(转)
测试使用了两个办法都可以实现: 第一个:需要开启配置php_aip.dll <?php //需开启配置 php_zip.dll //phpinfo(); header("Content ...