从 Kafka 中读取指定的 topic,根据消息内容的不同,写入不同的文件中。

文件按照日期区分。

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/4/9 11:49
# @Author : baoshan
# @Site :
# @File : readTraceFromKafkaStreamingToJson.py
# @Software: PyCharm Community Edition

"""Read trace records from Kafka via Spark Streaming and append them, one
JSON line each, to per-day local files split by record type
(speech / tts / other)."""

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

import ast
import datetime
import json
import subprocess  # kept from the original file; may be used elsewhere
import time
from collections import defaultdict


class KafkaMessageParse:
    """Stateless helpers turning raw Kafka (key, value) tuples into
    flattened JSON strings, one trace span per string."""

    def extractFromKafka(self, kafkainfo):
        """Return the value of a Kafka ``(key, value)`` tuple, else None."""
        if type(kafkainfo) is tuple and len(kafkainfo) == 2:
            return kafkainfo[1]

    def lineFromLines(self, lines):
        """Split a multi-line payload into individual lines.

        Returns an empty list (never None) so the result is always safe to
        feed to ``DStream.flatMap``.
        """
        if lines:
            return lines.strip().split("\n")
        # bugfix: the original returned None here, which makes flatMap raise.
        return []

    def messageFromLine(self, line):
        """Return the 'message' field of a parsed log-line dict, else None."""
        if line is not None and "message" in line:
            return line.get("message")

    def extractFromMessage(self, message):
        """Parse one trace message (a JSON string) into a flat JSON string.

        Returns:
            str: flattened JSON for 'speech' and 'tts' spans;
            None: for other span names, or when the tts branch fails;
            '': when *message* itself cannot be processed as JSON.
        """
        try:
            jline = json.loads(message)
            result = defaultdict()  # no default factory: behaves like a plain dict
            name = jline.get("name") or ""
            if "speech" in name:
                result['trace_id'] = jline.get("trace_id")
                result['parent_id'] = jline.get("parent_id")
                result['span_id'] = jline.get("span_id")
                result['name'] = name
                result['sa'] = jline.get("sa")
                result['sr'] = jline.get("sr")
                result['ss'] = jline.get("ss")
                result['ret'] = jline.get("ret")
                annotation = jline.get("annotation")
                try:
                    for anno in annotation:
                        if anno.get("name") == "nlp":
                            debug_log_anno = anno.get("debug_log")[0]
                            asr = debug_log_anno.get("asr")  # asr
                            nlp = debug_log_anno.get("nlp")
                            action = debug_log_anno.get("action")
                            # 'action' and 'nlp' are themselves JSON-encoded strings
                            jaction = json.loads(action)
                            response = jaction.get("response")
                            tts = response.get("action").get("directives")[0].get("item").get("tts")
                            result['tts'] = tts
                            jnlp = json.loads(nlp)
                            result['app_id'] = jnlp.get('appId')
                            result['intent'] = jnlp.get('intent')
                            result['cloud'] = jnlp.get("cloud")
                            result['asr'] = asr
                            result['nlp'] = nlp
                            result['slots'] = jnlp.get("slots")
                    debug_log0 = jline.get("debug_log")[0]
                    session_id = debug_log0.get("session_id")
                    codec = debug_log0.get("codec")
                    if not session_id:
                        session_id = ""  # critically important downstream
                    wavfile = session_id + ".wav"
                    codecfile = session_id + "." + codec
                    # session_id is assumed to end in "-<epoch seconds>";
                    # fall back to "now" when that suffix does not parse.
                    asrtimestr = session_id.split("-")[-1]
                    try:
                        st = time.localtime(float(asrtimestr))
                    except (ValueError, OverflowError, OSError):
                        st = time.localtime()
                    result['session_id'] = session_id
                    result['device_id'] = debug_log0.get("device_id")
                    result['device_key'] = debug_log0.get("device_key")
                    result['device_type'] = debug_log0.get("device_type")
                    result['thedate'] = time.strftime("%Y-%m-%d %H:%M:%S", st)
                    result['wavfile'] = wavfile
                    result['codecfile'] = codecfile
                    result['asrthedate'] = time.strftime("%Y%m%d", st)
                    return json.dumps(result, ensure_ascii=False)
                except Exception:
                    # bugfix: the original did `return strmessage` here, but
                    # `strmessage` is only bound on the success path, so this
                    # always raised NameError (silently swallowed by the outer
                    # except). Emit whatever fields were collected so far.
                    return json.dumps(result, ensure_ascii=False)
            elif "tts" in name:  # tts
                try:
                    result['trace_id'] = jline.get("trace_id")
                    result['parent_id'] = jline.get("parent_id")
                    result['span_id'] = jline.get("span_id")
                    result['name'] = jline.get("name")
                    result['sa'] = jline.get("sa")
                    result['sr'] = jline.get("sr")
                    result['ss'] = jline.get("ss")
                    result['ret'] = jline.get("ret")
                    debug_log_tts = jline.get("debug_log")[0]
                    result['text'] = debug_log_tts.get("text")
                    result['codec'] = debug_log_tts.get("codec")
                    result['declaimer'] = debug_log_tts.get("declaimer")
                    result['logs'] = debug_log_tts.get("logs")
                    result['params'] = debug_log_tts.get("params")
                    return json.dumps(result, ensure_ascii=False)
                except Exception:
                    return None
        except Exception:
            return ''


def tpprint(val, num=10000):
    """
    Print the first num elements of each RDD generated in this DStream and
    append each record to a per-day file chosen by the record's 'name' field.
    @param num: the number of elements from the first will be printed.
    """
    def takeAndPrint(time, rdd):
        taken = rdd.take(num + 1)
        print("########################")
        print("Time: %s" % time)
        print("########################")
        DATEFORMAT = '%Y%m%d'
        today = datetime.datetime.now().strftime(DATEFORMAT)
        # `with` guarantees the three handles are closed even if json.loads
        # raises mid-batch (the original leaked them in that case).
        with open("/mnt/data/trace/trace.rt.speech." + today, "a") as speechfile, \
                open("/mnt/data/trace/trace.rt.tts." + today, "a") as ttsfile, \
                open("/mnt/data/trace/trace.rt.other." + today, "a") as otherfile:
            for record in taken[:num]:
                if record is not None and len(record) > 2:  # skip None / '' markers
                    print(record)
                    jrecord = json.loads(record)
                    name = jrecord.get("name")
                    if "speech" in name:
                        speechfile.write(str(record) + "\n")
                    elif "tts" in name:
                        ttsfile.write(str(record) + "\n")
                    else:
                        otherfile.write(str(record) + "\n")
        if len(taken) > num:
            print("...")

    val.foreachRDD(takeAndPrint)


if __name__ == '__main__':
    zkQuorum = 'datacollect-1:2181,datacollect-2:2181,datacollect-3:2181'
    topic = {'trace-open-gw-5': 1, 'trace-open-gw-6': 1, 'trace-open-gw-7': 1, 'trace-open-gw-8': 1, 'trace-open-gw-9': 1}
    groupid = "rokid-trace-rt"
    master = "local[*]"
    appName = "SparkStreamingRokidTrace"
    timecell = 5  # batch interval in seconds

    sc = SparkContext(master=master, appName=appName)
    ssc = StreamingContext(sc, timecell)

    kvs = KafkaUtils.createStream(ssc, zkQuorum, groupid, topic)
    kmp = KafkaMessageParse()
    lines = kvs.map(lambda x: kmp.extractFromKafka(x))
    lines1 = lines.flatMap(lambda x: kmp.lineFromLines(x))
    # Each line is a Python-literal dict string; literal_eval parses it
    # without the arbitrary-code-execution risk of the original eval().
    valuedict = lines1.map(lambda x: ast.literal_eval(x))
    message = valuedict.map(lambda x: kmp.messageFromLine(x))
    rdd2 = message.map(lambda x: kmp.extractFromMessage(x))  # result is a json str
    tpprint(rdd2)

    ssc.start()
    ssc.awaitTermination()

还请各位大仙不吝赐教!

Python+SparkStreaming+kafka+写入本地文件案例(可执行)的更多相关文章

  1. python 利用 ogr 写入shp文件,数据格式

    python 利用 ogr 写入 shp 文件, 定义shp文件中的属性字段(field)的数据格式为: OFTInteger # 整型 OFTIntegerList # 整型list OFTReal ...

  2. python读取数据库并把数据写入本地文件

    一,介绍 上周用jmeter做性能测试时,接口B传入的参数需要依赖接口A生成的借贷申请ID,接口A运行完需要把生成的借贷申请ID导出来到一个文件,作为参数传给接口B,刚开始的时候,手动去数据库倒, 倒 ...

  3. python开发_搜索本地文件信息写入文件

    功能:#在指定的盘符,如D盘,搜索出与用户给定后缀名(如:jpg,png)相关的文件 #然后把搜索出来的信息(相关文件的绝对路径),存放到用户指定的 #文件(如果文件不存在,则建立相应的文件)中 之前 ...

  4. python读取和写入csv文件

    读取csv文件: def readCsv(): rows=[] with file(r'E:\py\py01\Data\system.csv','rb') as f: reads=csv.reader ...

  5. OC 将NSString写入本地文件

    最近在公司偶尔遇到一些不经常复现的bug,为了调试,只好把关键值记录到本地文件中,在遇到问题时,调出本地文件查看一下就可以很方便的知道是不是代码逻辑的错误或者问题考虑不够周全了. 废话不多说,流程在代 ...

  6. java获取网页源代码并写入本地文件中

    import java.io.*; import java.net.*; public class URLDemo { public static void main(String args[]){ ...

  7. python读取并写入mat文件

    用matlab生成一个示例mat文件: clear;clc matrix1 = magic(5); matrix2 = magic(6); save matData.mat 用python3读取并写入 ...

  8. python+selenium上传本地文件

    迅雷号自媒体视频文件自动上传,贴标签发布 难点 本地文件上传,通过send_keys(‘文件路径’)的方式实现上传的目的 文件名通过正则匹配的方式进行处理,主要匹配出中文标题名称 处理过程中文件名称中 ...

  9. 如何将Python对象保存在本地文件中?

    Python对象的永久存储 1.使用Python的pickle模块 import pickle class A: def __init__(self,name,a): self.name=name s ...

随机推荐

  1. Java获取函数参数名称

    原理 编译之后的class文件默认是不带有参数名称信息的,使用 IDE 时,反编译jar包得到的源代码函数参数名称是 arg0,arg1......这种形式,这是因为编译 jar 包的时候没有把符号表 ...

  2. maven-compiler-plugin升级到3.1出现问题(转)

    转自:http://my.oschina.net/zhuka/blog/124503 No compiler is provided in this environment. Perhaps you ...

  3. 关于去哪儿网的UI自动化测试脚本

    UI自动化测试Qunar机票搜索场景访问Qunar机票首页http://flight.qunar.com,选择“单程”,输入出发.到达城市,选择today+7日后的日期,点“搜索”,跳转到机票单程搜索 ...

  4. webpack window dev-server配置

    1.安装webpack dev-server npm install --save-dev webpack webpack-dev-server 著作权归作者所有.商业转载请联系作者获得授权,非商业转 ...

  5. Centos 安装ImageMagick 与 imagick for php步骤详解

    现在有很多朋友在使用安装ImageMagick imagick for php了,今天自己也想做但是不知道如何操作,下面我来给大家介绍Centos 安装ImageMagick imagick for ...

  6. k近邻算法-java实现

    最近在看<机器学习实战>这本书,因为自己本身很想深入的了解机器学习算法,加之想学python,就在朋友的推荐之下选择了这本书进行学习. 一 . K-近邻算法(KNN)概述 最简单最初级的分 ...

  7. mysql_install_db 运行结果

    # /usr/local/mysql/scripts/mysql_install_db \ > --defaults-file=/etc/my.cnf \ > --basedir=/usr ...

  8. SQL中特殊符号的使用

    1. & 在Oracle中,& 是从需要外部输入输入的变量,PS:MySqL中可以直接用"P&G"这样的字符串,故不存在本文所说的问题 如下SQL语句就不能 ...

  9. VS Code 中文注释显示乱码

    将设置中的"files.autoGuessEncoding"项的值改为true即可. 1.文件 2.首选项 3.设置 4.搜索 "files.autoGuessEncod ...

  10. SNF.Net 快速开发平台Spring.Net.Framework 诞生的由来与规划

    没有快速开发平台的时候只能感慨自己曾经浪费了那么多精力在拖拽控件上,总写重复的代码,却花了很多精力且不能体现自己的价值.SNF快速开发平台能把你解放出来,让你有更多的时间参与到核心业务逻辑中去,让你有 ...