HiBench Learning Notes (9): Analyzing the Source Code of monitor.py
monitor.py is the main monitoring program: it writes the collected monitoring data to a log file and then aggregates that data into an HTML report page.
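Before walking through the file, it helps to see how it is started. The benchmark driver (workload_functions.sh, see note 8 of this series) launches monitor.py with a workload title, the PID of the process to watch, the output paths, and the list of nodes to probe; the command below is purely illustrative (the workload name, PID, paths, and host names are made up):

    monitor.py wordcount 12345 \
        report/wordcount/spark/monitor.log \
        report/wordcount/spark/bench.log \
        report/wordcount/spark/monitor.html \
        slave1 slave2 slave3

These positional arguments match the usage string printed by show_usage() near the bottom of the file. monitor.py itself forks: the parent prints the child PID (so the caller can later send it SIGTERM) and the child does the actual probing and report generation. The full source is listed below: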
#!/usr/bin/env python2
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import threading, subprocess, re, os, sys, signal, socket
from time import sleep, time, mktime  # NOTE: mktime added for the reconstructed timestamp line in parse_bench_log()
from contextlib import closing
import traceback, thread
from datetime import datetime
from collections import namedtuple
from pprint import pprint
from itertools import groupby
# Probe intervals, in seconds.
# Warning: a value that is too short may give wrong results, because there may be too little data when system load is high.
# The value must be a float!
PROBE_INTERVAL=float(5)
#FIXME: use log helper later
#log_lock = threading.Lock()
def log(*s):
if len(s)==1: s=s[0]
else: s= " ".join([str(x) for x in s])
# with log_lock:
# with open("/home/zhihui/monitor_proc.log", 'a') as f:
log_str = str(thread.get_ident())+":"+str(s) +'\n'
# f.write( log_str )
sys.stderr.write(log_str)
entered=False
def sig_term_handler(signo, stack):
global entered
global log_path
global report_path
global workload_title
global bench_log_path
global na
if not entered:
entered=True # FIXME: Not atomic
else: return
na.stop()
generate_report(workload_title, log_path, bench_log_path, report_path)
sys.exit(0)
def samedir(fn):
"""
return abspath of fn in the same directory where this python file stores
"""
return os.path.abspath(os.path.join(os.path.dirname(__file__), fn))
class PatchedNameTuple(object):
def __sub__(self, other):
assert isinstance(other, self.__class__)
assert self[0] == other[0]
cls = self.__class__
return cls(self[0], *[a-b for a, b in zip(self[1:], other[1:])])
def __div__(self, other):
return self.__class__(self[0], *[a/other for a in self[1:]])
def _add(self, other, override_title=None):
if other == None: return self
assert isinstance(other, self.__class__)
cls = self.__class__
title = self[0] if not override_title else override_title
return cls(title, *[a+b for a, b in zip(self[1:], other[1:])])
def ident(size, s):
return "\n".join((" "*size + x for x in s.split("\n")))
class RemoteProc(threading.Thread):
SEP="----SEP----"
template_debug=r"""exec('
import time, os, sys, socket, traceback
socket.setdefaulttimeout(1)
def log(*x, **kw):
with open("/home/zhihui/probe.log", kw.get("mode","a")) as f:
f.write(repr(x)+chr(10))
try:
log("create socket", mode="w")
s=socket.socket(socket.AF_INET, socket.SOCK_STREAM)
log("bind socket")
s.bind(("0.0.0.0",0))
log("listen socket")
s.listen(5)
log("bind socket to:", s.getsockname())
while True:
log("accepting")
try:
print s.getsockname()[1]
s2,peer=s.accept()
break
except socket.timeout:
log("accept timeout, retry")
log("accepted, peer:",peer)
except Exception as e:
import traceback
log(traceback.format_exc())
{func_template}
while True:
s2.send(("{SEP}+%s" % time.time())+chr(10))
{call_template}
s2.send("{SEP}#end"+chr(10))
time.sleep({interval})
')"""
template=r"""exec('
import time, os, sys, socket, traceback
s=socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind(("0.0.0.0",0))
s.listen(5)
print s.getsockname()[1]
s2,peer=s.accept()
{func_template}
while True:
s2.send(("{SEP}+%s" % time.time())+chr(10))
{call_template}
s2.send("{SEP}#end"+chr(10))
time.sleep({interval})
')"""
def __init__(self, host, interval=1):
self.host = host
self.cmds = []
self.interval = interval
self.monitor_ins = {}
self.local_aggr_container={}
self._running=True
super(RemoteProc, self).__init__()
def register(self, monitor_ins, cmds):
assert isinstance(monitor_ins, BaseMonitor)
self.monitor_ins[len(self.cmds)] = monitor_ins # monitor command seq id => monitor instance
self.cmds.append(cmds)
def run(self):
func_template = "\n".join(["def func_{id}():\n{func}"\
.format(id=id,
func=ident(2,
func+'\ns2.send("{SEP}={id}"+chr(10))'\
.format(SEP=self.SEP, id=id))) \
for id, func in enumerate(self.cmds)])
call_template="\n".join([" func_{id}()"\
.format(id=id) for id in range(len(self.cmds))]
)
script = self.template.format(func_template=func_template,
call_template=call_template,
interval = self.interval,
SEP = self.SEP)
s = script.replace('"', r'\"').replace("\n", r"\n")
container=[]
# log("ssh client to:", self.host)
with self.ssh_client(self.host, "python -u -c \"{script}\"".format(script=s)) as f:
# log("ssh client %s connected" % self.host)
try:
port_line = f.readline()
# log("host:", self.host, "got port,", port_line)
port = int(port_line.rstrip())
s=socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.settimeout(0.5)
for i in range(30): # try to connect 30 times maximum
try:
# log("try to connect:", self.host, port)
s.connect((self.host, port))
# log("connectted to:", self.host, port)
break
except socket.timeout:
# log("connecting to:", self.host, port, "timedout")
pass
else: # not connected after 30 attempts
# log("cann't connectted to:", self.host, port)
s.shutdown(socket.SHUT_RDWR)
self.ssh_close()
return
s.settimeout(None)
except Exception as e:
log(traceback.format_exc())
with closing(s.makefile()) as f2:
while self._running:
try:
l = f2.readline()
except KeyboardInterrupt:
break
if not l: break
if l.startswith(self.SEP):
tail = l.lstrip(self.SEP)
if tail[0]=='+': # timestamp
remote_timestamp = float(tail[1:])
cur_timestamp = time()
elif tail.startswith('#end'): # end sign
# log("na push, timestamp:", cur_timestamp)
self.na_push(cur_timestamp)
else:
id = int(tail[1:])
if self.monitor_ins[id]:
self.monitor_ins[id].feed(container, cur_timestamp)
container = []
else:
container.append(l.rstrip())
s.shutdown(socket.SHUT_RDWR)
self.ssh_close()
def stop(self):
self._running=False
def aggregate(self, timestamp, data):
if not self.local_aggr_container:
self.local_aggr_container['timestamp']=timestamp
assert timestamp == self.local_aggr_container['timestamp']
assert type(data) is dict
self.local_aggr_container.update(data)
self.local_aggr_container['timestamp'] = timestamp
def na_register(self, na):
assert isinstance(na, NodeAggregator)
self.node_aggr_parent = na
def na_push(self, timestamp):
if self.local_aggr_container:
assert self.local_aggr_container.get('timestamp', -1) == timestamp
self.node_aggr_parent.commit_aggregate(self.host, self.local_aggr_container)
self.local_aggr_container={}
class BaseMonitor(object):
IGNORE_KEYS=[]
def __init__(self, rproc):
self.rproc = rproc
self._last = None
def feed(self, container, timestamp): # override to parse pulled data files
raise NotImplementedError()
def ssh_client(self, host, shell): # override for opening ssh client
raise NotImplementedError()
def ssh_close(self): # override for clear up ssh client
raise NotImplementedError()
def commit(self, timestamp, header, stat):
if self._last is None: self._last = stat
else:
stat_delta = dict([(header+'/'+k, stat[k] - self._last[k]) \
for k in set(self._last.keys()).union(set(stat.keys()))\
if k in stat and k in self._last and k not in self.IGNORE_KEYS
])
self._last = stat
# if header.startswith("net"):
# print stat_delta
stat_delta[header+'/total'] = reduce_patched(lambda a,b: a._add(b, 'total'), stat_delta.values())
self.rproc.aggregate(timestamp, stat_delta)
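# [analysis note, not in the original file] BaseMonitor.commit() is the generic path used by
# the network and disk monitors: it turns two consecutive absolute readings into per-interval
# deltas keyed as "<header>/<device>", and also synthesises a "<header>/total" entry by summing
# the per-device deltas with _add(). These are the "net/total" and "disk/total" keys that
# generate_report() consumes later.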
class BashSSHClientMixin(object):
ssh_lock = threading.Lock()
def ssh_client(self, host, shell):
with open(os.devnull, 'rb', 0) as DEVNULL:
with BashSSHClientMixin.ssh_lock:
self.proc = subprocess.Popen(["ssh", host, shell], bufsize=1,
stdin=DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
return self.proc.stdout
def ssh_close(self):
assert self.proc
self.proc.terminate()
self.proc.wait()
return self.proc.returncode
_CPU=namedtuple("CPU", ['label', 'user', 'nice', 'system', 'idle', 'iowait', 'irq', 'softirq'])
class CPU(_CPU, PatchedNameTuple):
def percentage(self):
total = sum(self[1:])
return CPU(self[0], *[x*100.0 / total for x in self[1:]]) if total>0 else self
class CPUMonitor(BaseMonitor):
def __init__(self, rproc):
super(CPUMonitor, self).__init__(rproc)
rproc.register(self, """with open("/proc/stat") as f:
s2.send("".join([x for x in f.readlines() if x.startswith("cpu")]))
""")
def feed(self, container, timestamp):
"parse /proc/stat"
self.commit(timestamp, dict([self._parse_stat(line) for line in container]))
def _parse_stat(self, line):
"parse one line of /proc/stat"
assert line.strip(), "BUG! empty line in /proc/stat"
fields = line.split()
if fields[0]=='cpu':
fields[0]='total'
return (fields[0], CPU(fields[0], *[int(x) for x in fields[1:8]]))
def commit(self, timestamp, cpu_stat):
if self._last is None:
self._last = cpu_stat
else:
cpu_usage = dict([("cpu/"+k, (cpu_stat[k] - self._last[k]).percentage()) for k in self._last])
self._last = cpu_stat
self.rproc.aggregate(timestamp, cpu_usage)
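# [analysis note, not in the original file] CPUMonitor overrides commit(): it subtracts the
# previous /proc/stat sample from the current one and calls percentage(), so every
# "cpu/<label>" value holds the per-core time shares (user/system/iowait/... summing to 100)
# over one probe interval. generate_report() later plots 100 - idle as the busy heatmap value.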
_Network=namedtuple("Network", ['label', "recv_bytes", "recv_packets", "recv_errs", "recv_drop",
"send_bytes", "send_packets", "send_errs", "send_drop"])
class Network(_Network, PatchedNameTuple): pass
class NetworkMonitor(BaseMonitor):
IGNORE_KEYS=["lo"]
def __init__(self, rproc):
rproc.register(self, """with open("/proc/net/dev") as f:
s2.send("".join([x for x in f.readlines()]))
""")
self._filter = re.compile('^\s*(.+):\s*(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+\d+\s+\d+\s+\d+\s+\d+\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+).*$')
super(NetworkMonitor, self).__init__(rproc)
def feed(self, container, timestamp):
"parse /proc/net/dev"
self.commit(timestamp, "net", dict(filter(lambda x:x, [self._parse_net_dev(line) for line in container])))
def _parse_net_dev(self, line):
matched = self._filter.match(line)
if matched:
obj = Network(matched.groups()[0], *[int(x) for x in matched.groups()[1:]])
if not (obj.recv_bytes==0 and obj.send_bytes==0):
return (obj[0], obj)
_Disk=namedtuple("Disk", ["label", "io_read", "bytes_read", "time_spent_read", "io_write", "bytes_write", "time_spent_write"])
class Disk(_Disk, PatchedNameTuple): pass
class DiskMonitor(BaseMonitor):
def __init__(self, rproc):
super(DiskMonitor, self).__init__(rproc)
rproc.register(self, """with open("/proc/diskstats") as f:
blocks = os.listdir("/sys/block")
s2.send("".join([x for x in f.readlines() if x.split()[2] in blocks and not x.split()[2].startswith("loop") and x.split()[3]!="0"]))
""")
def feed(self, container, timestamp):
"parse /proc/diskstats"
self.commit(timestamp, "disk", dict([self._parse_disk_stat(line) for line in container]))
def _parse_disk_stat(self, line):
fields = line.split()[2:]
obj = Disk(fields[0],
io_read=int(fields[1]), bytes_read=int(fields[3])*512, time_spent_read=int(fields[4])/1000.0,
io_write=int(fields[5]), bytes_write=int(fields[7])*512, time_spent_write=int(fields[8])/1000.0)
return (obj[0], obj)
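# [analysis note, not in the original file] _parse_disk_stat() reads the standard
# /proc/diskstats columns after the device name: field 1 = reads completed,
# field 3 = sectors read (x512 -> bytes), field 4 = ms spent reading, and fields 5/7/8 are the
# write-side equivalents. The probe registered above only keeps real block devices: names
# listed in /sys/block, not starting with "loop", and whose reads-completed column is non-zero.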
_Memory=namedtuple("Memory", ["label", "total", "used", "buffer_cache", "free", "map"])
class Memory(_Memory, PatchedNameTuple): pass
class MemoryMonitor(BaseMonitor):
def __init__(self, rproc):
super(MemoryMonitor, self).__init__(rproc)
rproc.register(self, """with open("/proc/meminfo") as f:
mem = dict([(a, b.split()[0].strip()) for a, b in [x.split(":") for x in f.readlines()]])
s2.send(":".join([mem[field] for field in ["MemTotal", "Buffers", "Cached", "MemFree", "Mapped"]])+chr(10))
""")
def feed(self, memory_status, timestamp):
"parse /proc/meminfo"
total, buffers, cached, free, mapped= [int(x) for x in memory_status[0].split(":")]
self.rproc.aggregate(timestamp, {"memory/total":Memory(label="total", total=total,
used=total - free - buffers-cached,
buffer_cache=buffers + cached,
free=free, map=mapped)})
_Proc=namedtuple("Proc", ["label", "load5", "load10", "load15", "running", "procs"])
class Proc(_Proc, PatchedNameTuple): pass
class ProcMonitor(BaseMonitor):
def __init__(self, rproc):
super(ProcMonitor, self).__init__(rproc)
rproc.register(self, """with open("/proc/loadavg") as f:
s2.send(f.read())
""")
def feed(self, load_status, timestamp):
"parse /proc/meminfo"
load5, load10, load15, running_procs= load_status[0].split()[:4]
running, procs = running_procs.split('/')
self.rproc.aggregate(timestamp, {"proc":Proc(label="total", load5=float(load5), load10=float(load10),
load15=float(load15), running=int(running), procs=int(procs))})
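# [analysis note, not in the original file] /proc/loadavg actually reports the 1-, 5- and
# 15-minute load averages plus the "running/total" process counts, so the fields named
# load5/load10/load15 here really hold the 1/5/15-minute values; the names are only labels
# carried through to the report's CSV header.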
class NodeAggregator(object):
def __init__(self, log_name):
self.node_pool = {}
self.log_name = log_name
self.log_lock = threading.Lock()
try:
os.unlink(self.log_name)
except OSError:
pass
def append(self, node):
assert isinstance(node, RemoteProc)
self.node_pool[node.host] = node
node.na_register(self)
def commit_aggregate(self, node, datas):
datas['hostname'] = node
with self.log_lock:
with file(self.log_name, "a") as f:
f.write(repr(datas) + "\n")
def run(self):
for v in self.node_pool.values():
v.start()
def stop(self):
for v in self.node_pool.values():
v.stop()
for v in self.node_pool.values():
v.join()
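# [analysis note, not in the original file] NodeAggregator serialises one Python dict per
# probe round per host into the monitor log, e.g. (values illustrative):
#   {'hostname': 'slave1', 'timestamp': 1428655201.5, 'cpu/total': CPU(label='total', ...),
#    'net/eth0': Network(label='eth0', ...), 'memory/total': Memory(label='total', ...), ...}
# generate_report() later reads the file back with eval(), which works because the namedtuple
# classes above are defined in the same module when the report is generated.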
def round_to_base(v, b):
"""
>>> round_to_base(0.1, 0.3)
0.0
>>> round_to_base(0.3, 0.3)
0.3
>>> round_to_base(0.0, 0.3)
0.0
>>> round_to_base(0.5, 0.3)
0.3
>>> round_to_base(0.51, 0.3)
0.3
"""
for i in range(10):
base = int(b * 10**i)
if abs(base - b * 10**i) < 0.001: break
assert base>0
return float(int(v * 10**i) / base * base) / (10**i)
def filter_dict_with_prefix(d, prefix, sort=True):
keys = sorted(d.keys()) if sort else d.keys()
if prefix[0]=='!':
return dict([(x, d[x]) for x in keys if not x.startswith(prefix[1:])])
else:
return dict([(x, d[x]) for x in keys if x.startswith(prefix)])
def reduce_patched(func, data):
if len(data)==1:
return data[0]
elif len(data)==0:
return data
else:
return reduce(func, data)
def filter_dict_with_prefixes(d, *prefixes):
if len(prefixes)==1:
return filter_dict_with_prefix(d, prefixes[0])
else:
return reduce_patched(lambda a,b: filter_dict_with_prefix(filter_dict_with_prefix(d, a),b),
prefixes)
def test():
p = BashSSHClientMixin()
script=r"""exec('
import time, os, sys
while 1:
with open("/proc/stat") as f: print f.read(),
print "---hello---"
time.sleep(1)
')"""
s = script.replace('"', r'\"').replace("\n", r"\n")
with p.ssh_client("localhost", "python -u -c \"{s}\"".format(s=s)) as f:
while 1:
l = f.readline()
print l.rstrip()
if not l: break
p.ssh_close()
def test2():
class P(RemoteProc, BashSSHClientMixin): pass
p = P("localhost", 0.3)
CPUMonitor(p)
NetworkMonitor(p)
DiskMonitor(p)
MemoryMonitor(p)
p.run()
def start_monitor(log_filename, nodes):
class P(RemoteProc, BashSSHClientMixin):
def __init__(self, *args):
RemoteProc.__init__(self, *args)
CPUMonitor(self)
NetworkMonitor(self)
DiskMonitor(self)
MemoryMonitor(self)
ProcMonitor(self)
global na
na = NodeAggregator(log_filename)
nodes = sorted(list(set(nodes)))
for node in nodes:
na.append(P(node, PROBE_INTERVAL))
na.run()
def parse_bench_log(benchlog_fn):
events=["x,event"]
_spark_stage_submit = re.compile("^(\d{2}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}) INFO [a-zA-Z0-9_\.]*DAGScheduler: Submitting (Stage \d+) \((.*)\).+$") # submit spark stage
_spark_stage_finish = re.compile("^(\d{2}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}) INFO [a-zA-Z0-9_\.]*DAGScheduler: (Stage \d+) \((.*)\) finished.+$") # spark stage finish
_hadoop_run_job = re.compile("^(\d{2}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}) INFO mapred.*\.Job.*: Running job: job_([\d_]+)$") # hadoop run job
_hadoop_map_reduce_progress = re.compile("^(\d{2}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}) INFO mapred.*\.Job.*:\s+map (\d{1,2})% reduce (\d{1,2})%$") # hadoop reduce progress
_hadoop_job_complete_mr1 = re.compile("^(\d{2}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}) INFO mapred.JobClient: Job complete: job_([\d_]+)$")
_hadoop_job_complete_mr2 = re.compile("^(\d{2}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}) INFO mapreduce.Job: Job job_([\d_]+) completed successfully$")
"""
# MR1 sample
14/06/24 11:18:39 INFO mapred.JobClient: Running job: job_201406241116_0001
14/06/24 11:18:40 INFO mapred.JobClient: map 0% reduce 0%
...
13/11/21 14:38:55 INFO mapred.JobClient: Job complete: job_201311150128_0050
# MR2 sample
15/04/10 17:20:01 INFO mapreduce.Job: Running job: job_1427781540447_0448
15/04/10 17:20:07 INFO mapreduce.Job: Job job_1427781540447_0448 running in uber mode : false
15/04/10 17:20:07 INFO mapreduce.Job: map 0% reduce 0%
...
15/04/10 17:20:25 INFO mapreduce.Job: Job job_1427781540447_0448 completed successfully
"""
flag={}
with open(benchlog_fn) as f:
while True:
line = f.readline().rstrip()
if not line: break
for rule in [_spark_stage_submit, _spark_stage_finish, _hadoop_run_job, _hadoop_map_reduce_progress, _hadoop_job_complete_mr1, _hadoop_job_complete_mr2]:
matched = rule.match(line)
if matched:
result = matched.groups()
# the original line is truncated in this listing; reconstructed so the timestamp becomes
# epoch milliseconds, matching the "%y/%m/%d %H:%M:%S" pattern of the regexes above
timestamp = int(mktime(datetime.strptime(result[0], r"%y/%m/%d %H:%M:%S").timetuple()) * 1000) # convert to millisec for js
if rule is _spark_stage_submit:
events.append("{t},Start {v1} ({v2})".format(t=timestamp, v1=result[1], v2=result[2]))
elif rule is _spark_stage_finish:
events.append("{t},Finish {v1} ({v2})".format(t=timestamp, v1=result[1], v2=result[2]))
elif rule is _hadoop_run_job:
events.append("{t},Start Job {v1}".format(t=timestamp, v1=result[1]))
flag={}
elif rule is _hadoop_map_reduce_progress:
map_progress,reduce_progress = int(result[1]), int(result[2])
op={'map':False, 'reduce':False}
if map_progress == 100:
if not "map" in flag:
op['map'] = True
flag['map'] = True
elif reduce_progress>0:
if not 'reduce' in flag:
op['reduce'] = True
flag['reduce'] = True
if op['map'] and op['reduce']:
events.append("{t},Map finish and Reduce start".format(t=timestamp))
elif op['map']:
events.append("{t},Map finish".format(t=timestamp))
elif op['reduce']:
events.append("{t},Reduce start".format(t=timestamp))
elif rule is _hadoop_job_complete_mr1 or rule is _hadoop_job_complete_mr2:
events.append("{t},Finsih Job {v1}".format(t=timestamp, v1=result[1]))
else:
assert 0, "should never reach here"
# limit maximum string length of events
for i in range(len(events)):
event_time, event_str = re.split(',', events[i], 1)
if len(event_str) > 45:
event_str = event_str[:21]+ '...' + event_str[-21:]
events[i]="%s,%s" % (event_time, event_str)
# merge events occurred at sametime:
i = 1
while i < len(events)-1:
cur = events[i].split(',')[0]
next = events[i+1].split(',')[0]
if abs(int(cur)/1000 - int(next)/1000) < 1:
events[i] = events[i] + "<br>" + re.split(',', events[i+1], 1)[1]
del events[i+1]
continue
i += 1
return events
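# [analysis note, not in the original file] parse_bench_log() boils the benchmark log down to
# a tiny CSV (header "x,event") of milestones keyed by epoch milliseconds: Spark stage
# submit/finish, Hadoop job start/finish, and map/reduce phase transitions. Long labels are
# truncated to 45 characters, and events that fall within the same second are merged into a
# single row joined with "<br>", so the chart annotations do not overlap.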
def generate_report(workload_title, log_fn, benchlog_fn, report_fn):
c = -1
with open(log_fn) as f:
datas=[eval(x) for x in f.readlines()]
all_hosts = sorted(list(set([x['hostname'] for x in datas])))
data_slices = groupby(datas, lambda x:round_to_base(x['timestamp'], PROBE_INTERVAL)) # round to time interval and groupby
# Generating CSVs
cpu_heatmap = ["x,y,value,hostname,coreid"]
cpu_overall = ["x,idle,user,system,iowait,others"]
network_heatmap = ["x,y,value,hostname,adapterid"]
network_overall = ["x,recv_bytes,send_bytes,|recv_packets,send_packets,errors"]
diskio_heatmap = ["x,y,value,hostname,diskid"]
diskio_overall = ["x,read_bytes,write_bytes,|read_io,write_io"]
memory_heatmap = ["x,y,value,hostname"]
memory_overall = ["x,free,buffer_cache,used"]
procload_heatmap = ["x,y,value,hostname"]
procload_overall = ["x,load5,load10,load15,|running,procs"]
events = parse_bench_log(benchlog_fn)
cpu_count={}
network_count={}
diskio_count={}
memory_count={}
proc_count={}
for t, sub_data in data_slices:
classed_by_host = dict([(x['hostname'], x) for x in sub_data])
# total cpus, plot user/sys/iowait/other
data_by_all_hosts = [classed_by_host.get(h, {}) for h in all_hosts]
# all cpu cores, total cluster
summed1 = [x['cpu/total'] for x in data_by_all_hosts if x.has_key('cpu/total')]
if summed1:
summed = reduce_patched(lambda a,b: a._add(b), summed1) / len(summed1)
for x in data_by_all_hosts:
cpu = x.get('cpu/total', None)
if not cpu: continue
# user, system, io, idle, others
# print t, x['hostname'], cpu.user, cpu.system, cpu.iowait, cpu.idle, cpu.nice+cpu.irq+cpu.softirq
# print t, summed
cpu_overall.append("{time},{idle},{user},{system},{iowait},{others}" \
.format(time = int(t*1000), user = summed.user, system = summed.system,
iowait = summed.iowait, idle = summed.idle,
others = summed.nice + summed.irq + summed.softirq))
# all cpu cores, plot heatmap according to cpus/time/usage(100%-idle)
for idx, x in enumerate(data_by_all_hosts):
for idy, y in enumerate(filter_dict_with_prefixes(x, "cpu", "!cpu/total").values()):
try:
pos = cpu_count[(idx, idy, x['hostname'])]
except:
pos = len(cpu_count)
cpu_count[(idx, idy, x['hostname'])] = pos
# print t, pos, 100-y.idle, x['hostname'], y.label
cpu_heatmap.append("{time},{pos},{value},{host},{cpuid}" \
.format(time = int(t*1000), pos = pos, value = 100-y.idle,
host = x['hostname'], cpuid = y.label))
# all disk of each node, total cluster
summed1=[x['disk/total'] for x in data_by_all_hosts if x.has_key('disk/total')]
if summed1:
summed = reduce_patched(lambda a,b: a._add(b), summed1)
for x in data_by_all_hosts:
disk = x.get('disk/total', None)
if not disk: continue
# io-read, io-write, bytes-read, bytes-write
# print t, x['hostname'], disk.io_read, disk.io_write, disk.bytes_read, disk.bytes_write
# print t, summed
diskio_overall.append("{time},{bytes_read},{bytes_write},{io_read},{io_write}" \
.format(time = int(t*1000),
bytes_read = summed.bytes_read / PROBE_INTERVAL,
bytes_write = summed.bytes_write / PROBE_INTERVAL,
io_read = summed.io_read / PROBE_INTERVAL,
io_write = summed.io_write / PROBE_INTERVAL))
# all disks, plot heatmap according to disks/bytes_read+bytes_write
for idx, x in enumerate(data_by_all_hosts):
for idy, y in enumerate(filter_dict_with_prefixes(x, "disk", "!disk/total").values()):
try:
pos = diskio_count[(idx, idy, x['hostname'])]
except:
pos = len(diskio_count)
diskio_count[(idx, idy, x['hostname'])] = pos
# print t, pos, 100-y.idle, x['hostname'], y.label
diskio_heatmap.append("{time},{pos},{value},{host},{diskid}" \
.format(time = int(t*1000),
pos = pos,
value = (y.bytes_read + y.bytes_write) / PROBE_INTERVAL,
host = x['hostname'],
diskid = y.label))
# memory of each node, total cluster
summed1 = [x['memory/total'] for x in data_by_all_hosts if x.has_key('memory/total')]
if summed1:
summed = reduce_patched(lambda a,b: a._add(b), summed1)
for x in data_by_all_hosts:
mem = x.get("memory/total", None)
if not mem: continue
# mem-total, mem-used, mem-buffer&cache, mem-free, KB
# print t, x['hostname'], mem.total, mem.used, mem.buffer_cache, mem.free
#print t, summed
memory_overall.append("{time},{free},{buffer_cache},{used}" \
.format(time = int(t*1000),
free = summed.free,
used = summed.used,
buffer_cache = summed.buffer_cache))
# all memory, plot heatmap according to memory/total - free
for idx, x in enumerate(data_by_all_hosts):
for idy, y in enumerate(filter_dict_with_prefixes(x, "memory/total").values()):
try:
pos = memory_count[(idx, idy, x['hostname'])]
except:
pos = len(memory_count)
memory_count[(idx, idy, x['hostname'])] = pos
# print t, pos, 100-y.idle, x['hostname'], y.label
memory_heatmap.append("{time},{pos},{value},{host}" \
.format(time = int(t*1000),
pos = pos,
value = (y.total - y.free)*1000,
host = x['hostname']))
# proc of each node, total cluster
summed1 = [x['proc'] for x in data_by_all_hosts if x.has_key('proc')]
if summed1:
summed = reduce_patched(lambda a,b: a._add(b), summed1)
for x in data_by_all_hosts:
procs = x.get("proc", None)
if not procs: continue
procload_overall.append("{time},{load5},{load10},{load15},{running},{procs}"\
.format(time = int(t*1000),
load5 = summed.load5,load10=summed.load10,
load15 = summed.load15,running=summed.running,
procs = summed.procs))
# all nodes' proc, plot heatmap according to proc/proc.procs
for idx, x in enumerate(data_by_all_hosts):
for idy, y in enumerate(filter_dict_with_prefixes(x, "proc").values()):
try:
pos = proc_count[(idx, idy, x['hostname'])]
except:
pos = len(proc_count)
proc_count[(idx, idy, x['hostname'])] = pos
# print t, pos, 100-y.idle, x['hostname'], y.label
procload_heatmap.append("{time},{pos},{value},{host}" \
.format(time = int(t*1000), pos = pos, value = y.procs,
host = x['hostname']))
# all network interface, total cluster
summed1 = [x['net/total'] for x in data_by_all_hosts if x.has_key('net/total')]
if summed1:
summed = reduce_patched(lambda a,b: a._add(b), summed1)
for x in data_by_all_hosts:
net = x.get("net/total", None)
if not net: continue
# recv-byte, send-byte, recv-packet, send-packet, errors
# print t, x['hostname'], net.recv_bytes, net.send_bytes, net.recv_packets, net.send_packets, net.recv_errs+net.send_errs+net.recv_drop+net.send_drop
# print t, summed
network_overall.append("{time},{recv_bytes},{send_bytes},{recv_packets},{send_packets},{errors}" \
.format(time = int(t*1000),
recv_bytes = summed.recv_bytes / PROBE_INTERVAL,
send_bytes = summed.send_bytes / PROBE_INTERVAL,
recv_packets = summed.recv_packets / PROBE_INTERVAL,
send_packets = summed.send_packets / PROBE_INTERVAL,
errors = (summed.recv_errs + summed.send_errs + \
summed.recv_drop + summed.send_drop) / PROBE_INTERVAL)
)
# all network adapters, plot heatmap according to net/recv_bytes + send_bytes
for idx, x in enumerate(data_by_all_hosts):
for idy, y in enumerate(filter_dict_with_prefixes(x, "net", "!net/total").values()):
try:
pos = network_count[(idx, idy, x['hostname'])]
except:
pos = len(network_count)
network_count[(idx, idy, x['hostname'])] = pos
network_heatmap.append("{time},{pos},{value},{host},{networkid}" \
.format(time = int(t*1000),
pos = pos*2,
value = y.recv_bytes / PROBE_INTERVAL,
host = x['hostname'],
networkid = y.label+".recv"))
network_heatmap.append("{time},{pos},{value},{host},{networkid}" \
.format(time = int(t*1000),
pos = pos*2+1,
value = y.send_bytes / PROBE_INTERVAL,
host = x['hostname'],
networkid = y.label+".send"))
with open(samedir("chart-template.html")) as f:
template = f.read()
variables = locals()
def my_replace(match):
match = match.group()[1:-1]
if match.endswith('heatmap') or match.endswith('overall'):
return "\n".join(variables[match])
elif match =='events':
return "\n".join(events)
elif match == 'probe_interval':
return str(PROBE_INTERVAL * 1000)
elif match == 'workload_name':
return workload_title
else:
return '{%s}' % match
with open(report_fn, 'w') as f:
f.write(re.sub(r'{\w+}', my_replace, template))
def show_usage():
log("""Usage:
monitor.py <workload_title> <parent_pid> <log_path.log> <benchlog_fn.log> <report_path.html> <monitor_node_name1> ... <monitor_node_nameN>
""")
if __name__=="__main__":
if len(sys.argv)<6:
log(sys.argv)
show_usage()
sys.exit(1)
# log(sys.argv)
global log_path
global report_path
global workload_title
global bench_log_path
global na
workload_title = sys.argv[1]
parent_pid = sys.argv[2]
log_path = sys.argv[3]
bench_log_path = sys.argv[4]
report_path = sys.argv[5]
nodes_to_monitor = sys.argv[6:]
pid=os.fork()
if pid: #parent
print pid
else: #child
os.close(0)
os.close(1)
os.close(2)
# log("child process start")
signal.signal(signal.SIGTERM, sig_term_handler)
start_monitor(log_path, nodes_to_monitor)
while os.path.exists("/proc/%s" % parent_pid):
sleep(1)
# parent lost, stop!
signal.signal(signal.SIGTERM, signal.SIG_IGN)
na.stop()
generate_report(workload_title, log_path, bench_log_path, report_path)
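That is the whole script. To make the report-generation step easier to follow, here is a minimal, self-contained sketch (not part of monitor.py; the sample records are made up and round_to_base() is simplified for whole-second bases) of how generate_report() groups the monitor log into PROBE_INTERVAL-wide time slices before aggregating each slice:

from itertools import groupby

PROBE_INTERVAL = 5.0

def round_to_base(v, b):
    # simplified version of monitor.py's round_to_base(), enough for whole-second bases
    return int(v / b) * b

# three fake log records; the real ones also carry cpu/disk/net/memory namedtuples
samples = [
    {"timestamp": 100.2, "hostname": "slave1"},
    {"timestamp": 101.9, "hostname": "slave2"},
    {"timestamp": 105.3, "hostname": "slave1"},
]

# groupby only merges *consecutive* records, which is fine because the log is written
# in (roughly) increasing time order
for t, bucket in groupby(samples, lambda x: round_to_base(x["timestamp"], PROBE_INTERVAL)):
    print t, [x["hostname"] for x in bucket]
# prints:
#   100.0 ['slave1', 'slave2']
#   105.0 ['slave1']

Within each slice, the real code then sums the "*/total" entries across hosts with _add() to build the *_overall CSV rows, expands the per-device entries into the *_heatmap rows, and finally lets my_replace() substitute those CSV strings (plus the events from parse_bench_log()) into chart-template.html.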