monitor.py 是主监控程序,将监控数据写入日志,并统计监控数据生成HTML统计展示页面:

#!/usr/bin/env python2
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import threading, subprocess, re, os, sys, signal, socket
from time import sleep, time
from contextlib import closing
import traceback, thread
from datetime import datetime
from collections import namedtuple
from pprint import pprint
from itertools import groupby

# Probe intervals, in seconds.
# Warning: a value too short may get wrong results due to lack of data when system load goes high.
#          and must be float!
PROBE_INTERVAL=float(5)

#FIXME: use log helper later
#log_lock = threading.Lock()
def log(*s):
    if len(s)==1: s=s[0]
    else: s= " ".join([str(x) for x in s])
#    with log_lock:
#        with open("/home/zhihui/monitor_proc.log", 'a') as f:
    log_str = str(thread.get_ident())+":"+str(s) +'\n'
    #        f.write( log_str )
    sys.stderr.write(log_str)

entered=False
def sig_term_handler(signo, stack):
    global entered
    global log_path
    global report_path
    global workload_title
    global bench_log_path
    global na

    if not entered:
        entered=True            # FIXME: Not atomic
    else: return

    na.stop()
    generate_report(workload_title, log_path, bench_log_path, report_path)
    sys.exit(0)

def samedir(fn):
    """
    return abspath of fn in the same directory where this python file stores
    """
    return os.path.abspath(os.path.join(os.path.dirname(__file__), fn))

class PatchedNameTuple(object):
    def __sub__(self, other):
        assert isinstance(other, self.__class__)
        assert self[0] == other[0]
        cls = self.__class__
        return cls(self[0], *[a-b for a, b in zip(self[1:], other[1:])])

    def __div__(self, other):
        return self.__class__(self[0], *[a/other for a in self[1:]])

    def _add(self, other, override_title=None):
        if other == None: return self
        assert isinstance(other, self.__class__)
        cls = self.__class__
        title = self[0] if not override_title else override_title
        return cls(title, *[a+b for a, b in zip(self[1:], other[1:])])

def ident(size, s):
    return "\n".join((" "*size + x for x in s.split("\n")))

class RemoteProc(threading.Thread):
    SEP="----SEP----"
    template_debug=r"""exec('
import time, os, sys, socket, traceback
socket.setdefaulttimeout(1)
def log(*x, **kw):
 with open("/home/zhihui/probe.log", kw.get("mode","a")) as f:
  f.write(repr(x)+chr(10))
try:
 log("create socket", mode="w")
 s=socket.socket(socket.AF_INET, socket.SOCK_STREAM)
 log("bind socket")
 s.bind(("0.0.0.0",0))
 log("listen socket")
 s.listen(5)
 log("bind socket to:", s.getsockname())
 while True:
  log("accepting")
  try:
   print s.getsockname()[1]
   s2,peer=s.accept()
   break
  except socket.timeout:
   log("accept timeout, retry")
 log("accepted, peer:",peer)
except Exception as e:
 import traceback
 log(traceback.format_exc())
{func_template}
while True:
  s2.send(("{SEP}+%s" % time.time())+chr(10))
{call_template}
  s2.send("{SEP}#end"+chr(10))
  time.sleep({interval})
')"""
    template=r"""exec('
import time, os, sys, socket, traceback
s=socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind(("0.0.0.0",0))
s.listen(5)
print s.getsockname()[1]
s2,peer=s.accept()
{func_template}
while True:
  s2.send(("{SEP}+%s" % time.time())+chr(10))
{call_template}
  s2.send("{SEP}#end"+chr(10))
  time.sleep({interval})
')"""

    def __init__(self, host, interval=1):
        self.host = host
        self.cmds = []
        self.interval = interval
        self.monitor_ins = {}
        self.local_aggr_container={}
        self._running=True

        super(RemoteProc, self).__init__()

    def register(self, monitor_ins, cmds):
        assert isinstance(monitor_ins, BaseMonitor)
        self.monitor_ins[len(self.cmds)] = monitor_ins # monitor command seq id => monitor instance
        self.cmds.append(cmds)

    def run(self):
        func_template = "\n".join(["def func_{id}():\n{func}"\
                                       .format(id=id,
                                               func=ident(2,
                                                          func+'\ns2.send("{SEP}={id}"+chr(10))'\
                                                              .format(SEP=self.SEP, id=id))) \
                                       for id, func in enumerate(self.cmds)])
        call_template="\n".join(["  func_{id}()"\
                                     .format(id=id) for id in range(len(self.cmds))]
                                )
        script = self.template.format(func_template=func_template,
                                      call_template=call_template,
                                      interval = self.interval,
                                      SEP = self.SEP)

        s = script.replace('"', r'\"').replace("\n", r"\n")
        container=[]
#        log("ssh client to:", self.host)
        with self.ssh_client(self.host, "python -u -c \"{script}\"".format(script=s)) as f:
#            log("ssh client %s connected" % self.host)
            try:
                port_line = f.readline()
#                log("host:", self.host, "got port,", port_line)
                port = int(port_line.rstrip())
                s=socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                s.settimeout(0.5)
                for i in range(30): # try to connect 30 times maximum
                    try:
#                        log("try to connect:", self.host, port)
                        s.connect((self.host, port))
#                        log("connectted to:", self.host, port)
                        break
                    except socket.timeout:
#                        log("connecting to:", self.host, port, "timedout")
                        pass
                else:           # not connectted after 30 times trying
#                    log("cann't connectted to:", self.host, port)
                    s.shutdown(socket.SHUT_RDWR)
                    self.ssh_close()
                    return
                s.settimeout(None)
            except Exception as e:
                log(traceback.format_exc())

            with closing(s.makefile()) as f2:
                while self._running:
                    try:
                        l = f2.readline()
                    except KeyboardInterrupt:
                        break
                    if not l: break
                    if l.startswith(self.SEP):
                        tail = l.lstrip(self.SEP)
                        if tail[0]=='+': # timestamp
                            remote_timestamp = float(tail[1:])
                            cur_timestamp = time()
                        elif tail.startswith('#end'): # end sign
#                            log("na push, timestamp:", cur_timestamp)
                            self.na_push(cur_timestamp)
                        else:
                            id = int(tail[1:])
                            if self.monitor_ins[id]:
                                self.monitor_ins[id].feed(container, cur_timestamp)
                        container = []
                    else:
                        container.append(l.rstrip())
            s.shutdown(socket.SHUT_RDWR)
        self.ssh_close()

    def stop(self):
        self._running=False

    def aggregate(self, timestamp, data):
        if not self.local_aggr_container:
            self.local_aggr_container['timestamp']=timestamp
        assert timestamp == self.local_aggr_container['timestamp']
        assert type(data) is dict
        self.local_aggr_container.update(data)
        self.local_aggr_container['timestamp'] = timestamp

    def na_register(self, na):
        assert isinstance(na, NodeAggregator)
        self.node_aggr_parent = na

    def na_push(self, timestamp):
        if self.local_aggr_container:
            assert self.local_aggr_container.get('timestamp', -1) == timestamp
            self.node_aggr_parent.commit_aggregate(self.host, self.local_aggr_container)
            self.local_aggr_container={}

class BaseMonitor(object):
    IGNORE_KEYS=[]
    def __init__(self, rproc):
        self.rproc = rproc
        self._last = None

    def feed(self, container, timestamp):            # override to parse pulled data files
        raise NotImplementedError()

    def ssh_client(self, host, shell):    # override for opening ssh client
        raise NotImplementedError()

    def ssh_close(self):                  # override for clear up ssh client
        raise NotImplementedError()

    def commit(self, timestamp, header, stat):
        if self._last is None: self._last = stat
        else:
            stat_delta = dict([(header+'/'+k, stat[k] - self._last[k]) \
                                for k in set(self._last.keys()).union(set(stat.keys()))\
                                if k in stat and k in self._last and k not in self.IGNORE_KEYS
                                ])
            self._last = stat
#            if header.startswith("net"):
#                print stat_delta
            stat_delta[header+'/total'] = reduce_patched(lambda a,b: a._add(b, 'total'), stat_delta.values())
            self.rproc.aggregate(timestamp, stat_delta)

class BashSSHClientMixin(object):
    ssh_lock = threading.Lock()
    def ssh_client(self, host, shell):
        with open(os.devnull, 'rb', 0) as DEVNULL:
            with BashSSHClientMixin.ssh_lock:
                self.proc = subprocess.Popen(["ssh", host, shell], bufsize=1,
                                             stdin=DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        return self.proc.stdout

    def ssh_close(self):
        assert self.proc
        self.proc.terminate()
        self.proc.wait()
        return self.proc.returncode

_CPU=namedtuple("CPU", ['label', 'user', 'nice', 'system', 'idle', 'iowait', 'irq', 'softirq'])
class CPU(_CPU, PatchedNameTuple):
    def percentage(self):
        total = sum(self[1:])
        return CPU(self[0], *[x*100.0 / total for x in self[1:]]) if total>0 else self

class CPUMonitor(BaseMonitor):
    def __init__(self, rproc):
        super(CPUMonitor, self).__init__(rproc)
        rproc.register(self, """with open("/proc/stat") as f:
  s2.send("".join([x for x in f.readlines() if x.startswith("cpu")]))
""")

    def feed(self, container, timestamp):
        "parse /proc/stat"
        self.commit(timestamp, dict([self._parse_stat(line) for line in container]))

    def _parse_stat(self, line):
        "parse one line of /proc/stat"
        assert line.strip(), "BUG! empty line in /proc/stat"
        fields = line.split()
        if fields[0]=='cpu':
            fields[0]='total'
        return (fields[0], CPU(fields[0], *[int(x) for x in fields[1:8]]))

    def commit(self, timestamp, cpu_stat):
        if self._last is None:
            self._last = cpu_stat
        else:
            cpu_usage = dict([("cpu/"+k, (cpu_stat[k] - self._last[k]).percentage()) for k in self._last])
            self._last = cpu_stat
            self.rproc.aggregate(timestamp, cpu_usage)

_Network=namedtuple("Network", ['label', "recv_bytes", "recv_packets", "recv_errs", "recv_drop",
                                "send_bytes", "send_packets", "send_errs", "send_drop"])
class Network(_Network, PatchedNameTuple): pass

class NetworkMonitor(BaseMonitor):
    IGNORE_KEYS=["lo"]
    def __init__(self, rproc):
        rproc.register(self, """with open("/proc/net/dev") as f:
  s2.send("".join([x for x in f.readlines()]))
""")
        self._filter = re.compile('^\s*(.+):\s*(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+\d+\s+\d+\s+\d+\s+\d+\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+).*$')
        super(NetworkMonitor, self).__init__(rproc)

    def feed(self, container, timestamp):
        "parse /proc/net/dev"
        self.commit(timestamp, "net", dict(filter(lambda x:x, [self._parse_net_dev(line) for line in container])))

    def _parse_net_dev(self, line):
        matched = self._filter.match(line)
        if matched:
            obj = Network(matched.groups()[0], *[int(x) for x in matched.groups()[1:]])
            if not (obj.recv_bytes==0 and obj.send_bytes==0):
                return (obj[0], obj)

_Disk=namedtuple("Disk", ["label", "io_read", "bytes_read", "time_spent_read", "io_write", "bytes_write", "time_spent_write"])

class Disk(_Disk, PatchedNameTuple): pass

class DiskMonitor(BaseMonitor):
    def __init__(self, rproc):
        super(DiskMonitor, self).__init__(rproc)
        rproc.register(self, """with open("/proc/diskstats") as f:
  blocks = os.listdir("/sys/block")
  s2.send("".join([x for x in f.readlines() if x.split()[2] in blocks and not x.split()[2].startswith("loop") and x.split()[3]!="0"]))
""")

    def feed(self, container, timestamp):
        "parse /proc/diskstats"
        self.commit(timestamp, "disk", dict([self._parse_disk_stat(line) for line in container]))

    def _parse_disk_stat(self, line):
        fields = line.split()[2:]
        obj = Disk(fields[0],
                    io_read=int(fields[1]), bytes_read=int(fields[3])*512, time_spent_read=int(fields[4])/1000.0,
                    io_write=int(fields[5]), bytes_write=int(fields[7])*512, time_spent_write=int(fields[8])/1000.0)
        return (obj[0], obj)

_Memory=namedtuple("Memory", ["label", "total", "used", "buffer_cache", "free", "map"])
class Memory(_Memory, PatchedNameTuple): pass

class MemoryMonitor(BaseMonitor):
    def __init__(self, rproc):
        super(MemoryMonitor, self).__init__(rproc)
        rproc.register(self, """with open("/proc/meminfo") as f:
  mem = dict([(a, b.split()[0].strip()) for a, b in [x.split(":") for x in f.readlines()]])
  s2.send(":".join([mem[field] for field in ["MemTotal", "Buffers", "Cached", "MemFree", "Mapped"]])+chr(10))
""")

    def feed(self, memory_status, timestamp):
        "parse /proc/meminfo"
        total, buffers, cached, free, mapped= [int(x) for x in memory_status[0].split(":")]

        self.rproc.aggregate(timestamp, {"memory/total":Memory(label="total", total=total,
                                                               used=total - free - buffers-cached,
                                                               buffer_cache=buffers + cached,
                                                               free=free, map=mapped)})
_Proc=namedtuple("Proc", ["label", "load5", "load10", "load15", "running", "procs"])
class Proc(_Proc, PatchedNameTuple): pass

class ProcMonitor(BaseMonitor):
    def __init__(self, rproc):
        super(ProcMonitor, self).__init__(rproc)
        rproc.register(self, """with open("/proc/loadavg") as f:
  s2.send(f.read())
""")

    def feed(self, load_status, timestamp):
        "parse /proc/meminfo"
        load5, load10, load15, running_procs= load_status[0].split()[:4]
        running, procs = running_procs.split('/')

        self.rproc.aggregate(timestamp, {"proc":Proc(label="total", load5=float(load5), load10=float(load10),
                                                     load15=float(load15), running=int(running), procs=int(procs))})

class NodeAggregator(object):
    def __init__(self, log_name):
        self.node_pool = {}
        self.log_name = log_name
        self.log_lock = threading.Lock()
        try:
            os.unlink(self.log_name)
        except OSError:
            pass

    def append(self, node):
        assert isinstance(node, RemoteProc)
        self.node_pool[node.host] = node
        node.na_register(self)

    def commit_aggregate(self, node, datas):
        datas['hostname'] = node
        with self.log_lock:
            with file(self.log_name, "a") as f:
                f.write(repr(datas) + "\n")

    def run(self):
        for v in self.node_pool.values():
            v.start()

    def stop(self):
        for v in self.node_pool.values():
            v.stop()
        for v in self.node_pool.values():
            v.join()

def round_to_base(v, b):
    """
    >>> round_to_base(0.1, 0.3)
    0.0
    >>> round_to_base(0.3, 0.3)
    0.3
    >>> round_to_base(0.0, 0.3)
    0.0
    >>> round_to_base(0.5, 0.3)
    0.3
    >>> round_to_base(0.51, 0.3)
    0.3
    """
    for i in range(10):
        base = int(b * 10**i)
        if abs(base - b * 10**i) < 0.001: break
    assert base>0
    return float(int(v * 10**i) / base * base) / (10**i)

def filter_dict_with_prefix(d, prefix, sort=True):
    keys = sorted(d.keys()) if sort else d.keys()
    if prefix[0]=='!':
        return  dict([(x, d[x]) for x in keys if not x.startswith(prefix[1:])])
    else:
        return  dict([(x, d[x]) for x in keys if x.startswith(prefix)])

def reduce_patched(func, data):
    if len(data)==1:
        return data[0]
    elif len(data)==0:
        return data
    else:
        return reduce(func, data)

def filter_dict_with_prefixes(d, *prefixes):
    if len(prefixes)==1:
        return filter_dict_with_prefix(d, prefixes[0])
    else:
        return reduce_patched(lambda a,b: filter_dict_with_prefix(filter_dict_with_prefix(d, a),b),
                      prefixes)

def test():
    p = BashSSHClientMixin()
    script=r"""exec('
import time, os, sys
while 1:
  with open("/proc/stat") as f: print f.read(),
  print "---hello---"
  time.sleep(1)
')"""
    s = script.replace('"', r'\"').replace("\n", r"\n")
    with p.ssh_client("localhost", "python -u -c \"{s}\"".format(s=s)) as f:
        while 1:
            l = f.readline()
            print l.rstrip()
            if not l: break
    p.ssh_close()

def test2():
    class P(RemoteProc, BashSSHClientMixin):  pass

    p = P("localhost", 0.3)
    CPUMonitor(p)
    NetworkMonitor(p)
    DiskMonitor(p)
    MemoryMonitor(p)

    p.run()

def start_monitor(log_filename, nodes):
    class P(RemoteProc, BashSSHClientMixin):
        def __init__(self, *args):
            RemoteProc.__init__(self, *args)
            CPUMonitor(self)
            NetworkMonitor(self)
            DiskMonitor(self)
            MemoryMonitor(self)
            ProcMonitor(self)
    global na
    na = NodeAggregator(log_filename)
    nodes = sorted(list(set(nodes)))
    for node in nodes:
        na.append(P(node, PROBE_INTERVAL))
    na.run()

def parse_bench_log(benchlog_fn):
    events=["x,event"]
    _spark_stage_submit = re.compile("^(\d{2}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}) INFO [a-zA-Z0-9_\.]*DAGScheduler: Submitting (Stage \d+) \((.*)\).+$") # submit spark stage
    _spark_stage_finish = re.compile("^(\d{2}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}) INFO [a-zA-Z0-9_\.]*DAGScheduler: (Stage \d+) \((.*)\) finished.+$")   # spark stage finish
    _hadoop_run_job = re.compile("^(\d{2}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}) INFO mapred.*\.Job.*: Running job: job_([\d_]+)$") # hadoop run job
    _hadoop_map_reduce_progress = re.compile("^(\d{2}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}) INFO mapred.*\.Job.*:\s+map (\d{1,2})% reduce (\d{1,2})%$") # hadoop reduce progress
    _hadoop_job_complete_mr1 = re.compile("^(\d{2}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}) INFO mapred.JobClient: Job complete: job_([\d_]+)$")
    _hadoop_job_complete_mr2 = re.compile("^(\d{2}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}) INFO mapreduce.Job: Job job_([\d_]+) completed successfully$")

    """
# MR1 sample
14/06/24 11:18:39 INFO mapred.JobClient: Running job: job_201406241116_0001
14/06/24 11:18:40 INFO mapred.JobClient:  map 0% reduce 0%
...
13/11/21 14:38:55 INFO mapred.JobClient: Job complete: job_201311150128_0050

# MR2 sample
15/04/10 17:20:01 INFO mapreduce.Job: Running job: job_1427781540447_0448
15/04/10 17:20:07 INFO mapreduce.Job: Job job_1427781540447_0448 running in uber mode : false
15/04/10 17:20:07 INFO mapreduce.Job:  map 0% reduce 0%
...
15/04/10 17:20:25 INFO mapreduce.Job: Job job_1427781540447_0448 completed successfully
    """
    flag={}
    with open(benchlog_fn) as f:
        while True:
            line = f.readline().rstrip()
            if not line: break
            for rule in [_spark_stage_submit, _spark_stage_finish, _hadoop_run_job, _hadoop_map_reduce_progress, _hadoop_job_complete_mr1, _hadoop_job_complete_mr2]:
                matched = rule.match(line)
                if matched:
                    result = matched.groups()
                    timestamp = datetime.strptime(result[0], r" # convert to millsec for js
                    if rule is _spark_stage_submit:
                        events.append("{t},Start {v1} ({v2})".format(t=timestamp, v1=result[1], v2=result[2]))
                    elif rule is _spark_stage_finish:
                        events.append("{t},Finish {v1} ({v2})".format(t=timestamp, v1=result[1], v2=result[2]))
                    elif rule is _hadoop_run_job:
                        events.append("{t},Start Job {v1}".format(t=timestamp, v1=result[1]))
                        flag={}
                    elif rule is _hadoop_map_reduce_progress:
                        map_progress,reduce_progress = int(result[1]), int(result[2])
                        op={'map':False, 'reduce':False}
                        if map_progress == 100:
                            if not "map" in flag:
                                op['map'] = True
                                flag['map'] = True
                        elif reduce_progress>0:
                            if not 'reduce' in flag:
                                op['reduce'] = True
                                flag['reduce'] = True
                        if op['map'] and op['reduce']:
                            events.append("{t},Map finish and Reduce start".format(t=timestamp))
                        elif op['map']:
                            events.append("{t},Map finish".format(t=timestamp))
                        elif op['reduce']:
                            events.append("{t},Reduce start".format(t=timestamp))
                    elif rule is _hadoop_job_complete_mr1 or rule is _hadoop_job_complete_mr2:
                        events.append("{t},Finsih Job {v1}".format(t=timestamp, v1=result[1]))
                    else:
                        assert 0, "should never reach here"

        # limit maximum string length of events
        for i in range(len(events)):
            event_time, event_str = re.split(',', events[i], 1)
            if len(event_str) > 45:
                event_str = event_str[:21]+ '...' + event_str[-21:]
                events[i]="%s,%s" % (event_time, event_str)

        # merge events occurred at sametime:
        i = 1
        while i < len(events)-1:
            cur = events[i].split(',')[0]
            next = events[i+1].split(',')[0]
            if abs(int(cur)/1000 - int(next)/1000) < 1:
                events[i] = events[i] + "<br>" + re.split(',', events[i+1], 1)[1]
                del events[i+1]
                continue
            i += 1
        return events

def generate_report(workload_title, log_fn, benchlog_fn, report_fn):
    c =- 1
    with open(log_fn) as f:
        datas=[eval(x) for x in f.readlines()]

    all_hosts = sorted(list(set([x['hostname'] for x in datas])))
    data_slices = groupby(datas, lambda x:round_to_base(x['timestamp'], PROBE_INTERVAL)) # round to time interval and groupby

    # Generating CSVs
    cpu_heatmap = ["x,y,value,hostname,coreid"]
    cpu_overall = ["x,idle,user,system,iowait,others"]
    network_heatmap = ["x,y,value,hostname,adapterid"]
    network_overall = ["x,recv_bytes,send_bytes,|recv_packets,send_packets,errors"]
    diskio_heatmap = ["x,y,value,hostname,diskid"]
    diskio_overall = ["x,read_bytes,write_bytes,|read_io,write_io"]
    memory_heatmap = ["x,y,value,hostname"]
    memory_overall = ["x,free,buffer_cache,used"]
    procload_heatmap = ["x,y,value,hostname"]
    procload_overall = ["x,load5,load10,load15,|running,procs"]
    events = parse_bench_log(benchlog_fn)

    cpu_count={}
    network_count={}
    diskio_count={}
    memory_count={}
    proc_count={}

    for t, sub_data in data_slices:
        classed_by_host = dict([(x['hostname'], x) for x in sub_data])
        # total cpus, plot user/sys/iowait/other
        data_by_all_hosts = [classed_by_host.get(h, {}) for h in all_hosts]

        # all cpu cores, total cluster
        summed1 = [x['cpu/total'] for x in data_by_all_hosts if x.has_key('cpu/total')]
        if summed1:
            summed = reduce_patched(lambda a,b: a._add(b), summed1) / len(summed1)
            for x in data_by_all_hosts:
                cpu = x.get('cpu/total', None)
                if not cpu: continue
            # user, system, io, idle, others
#            print t, x['hostname'], cpu.user, cpu.system, cpu.iowait, cpu.idle, cpu.nice+cpu.irq+cpu.softirq
#        print t, summed
            cpu_overall.append("{time},{idle},{user},{system},{iowait},{others}" \
                                   .format(time   = int(t*1000), user = summed.user, system = summed.system,
                                           iowait = summed.iowait, idle = summed.idle,
                                           others = summed.nice + summed.irq + summed.softirq))

        # all cpu cores, plot heatmap according to cpus/time/usage(100%-idle)
        for idx, x in enumerate(data_by_all_hosts):
            for idy, y in enumerate(filter_dict_with_prefixes(x, "cpu", "!cpu/total").values()):
                try:
                    pos = cpu_count[(idx, idy, x['hostname'])]
                except:
                    pos =  len(cpu_count)
                    cpu_count[(idx, idy, x['hostname'])] = pos
#                print t, pos, 100-y.idle, x['hostname'], y.label
                cpu_heatmap.append("{time},{pos},{value},{host},{cpuid}" \
                                       .format(time = int(t*1000), pos = pos, value = 100-y.idle,
                                               host = x['hostname'], cpuid = y.label))

        # all disk of each node, total cluster
        summed1=[x['disk/total'] for x in data_by_all_hosts if x.has_key('disk/total')]
        if summed1:
            summed = reduce_patched(lambda a,b: a._add(b), summed1)
            for x in data_by_all_hosts:
                disk = x.get('disk/total', None)
                if not disk: continue
            # io-read, io-write, bytes-read, bytes-write
#            print t, x['hostname'], disk.io_read, disk.io_write, disk.bytes_read, disk.bytes_write
 #       print t, summed
            diskio_overall.append("{time},{bytes_read},{bytes_write},{io_read},{io_write}" \
                                      .format(time        = int(t*1000),
                                              bytes_read  = summed.bytes_read / PROBE_INTERVAL,
                                              bytes_write = summed.bytes_write / PROBE_INTERVAL,
                                              io_read     = summed.io_read / PROBE_INTERVAL,
                                              io_write    = summed.io_write / PROBE_INTERVAL))

        # all disks, plot heatmap according to disks/bytes_read+bytes_write
        for idx, x in enumerate(data_by_all_hosts):
            for idy, y in enumerate(filter_dict_with_prefixes(x, "disk", "!disk/total").values()):
                try:
                    pos = diskio_count[(idx, idy, x['hostname'])]
                except:
                    pos =  len(diskio_count)
                    diskio_count[(idx, idy, x['hostname'])] = pos
#                print t, pos, 100-y.idle, x['hostname'], y.label
                diskio_heatmap.append("{time},{pos},{value},{host},{diskid}" \
                                       .format(time   = int(t*1000),
                                               pos    = pos,
                                               value  = (y.bytes_read + y.bytes_write) / PROBE_INTERVAL,
                                               host   = x['hostname'],
                                               diskid = y.label))

        # memory of each node, total cluster
        summed1 = [x['memory/total'] for x in data_by_all_hosts if x.has_key('memory/total')]
        if summed1:
            summed = reduce_patched(lambda a,b: a._add(b), summed1)
            for x in data_by_all_hosts:
                mem = x.get("memory/total", None)
                if not mem: continue
                # mem-total, mem-used, mem-buffer&cache, mem-free, KB
    #            print t, x['hostname'], mem.total, mem.used, mem.buffer_cache, mem.free
            #print t, summed
            memory_overall.append("{time},{free},{buffer_cache},{used}" \
                                      .format(time = int(t*1000),
                                              free = summed.free,
                                              used = summed.used,
                                              buffer_cache = summed.buffer_cache))

        # all memory, plot heatmap according to memory/total - free
        for idx, x in enumerate(data_by_all_hosts):
            for idy, y in enumerate(filter_dict_with_prefixes(x, "memory/total").values()):
                try:
                    pos = memory_count[(idx, idy, x['hostname'])]
                except:
                    pos =  len(memory_count)
                    memory_count[(idx, idy, x['hostname'])] = pos
#                print t, pos, 100-y.idle, x['hostname'], y.label
                memory_heatmap.append("{time},{pos},{value},{host}" \
                                       .format(time  = int(t*1000),
                                               pos   = pos,
                                               value = (y.total - y.free)*1000,
                                               host  = x['hostname']))

        # proc of each node, total cluster
        summed1 = [x['proc'] for x in data_by_all_hosts if x.has_key('proc')]
        if summed1:
            summed = reduce_patched(lambda a,b: a._add(b), summed1)
            for x in data_by_all_hosts:
                procs = x.get("proc", None)
                if not procs: continue
            procload_overall.append("{time},{load5},{load10},{load15},{running},{procs}"\
                                        .format(time   = int(t*1000),
                                                load5  = summed.load5,load10=summed.load10,
                                                load15 = summed.load15,running=summed.running,
                                                procs  = summed.procs))

        # all nodes' proc, plot heatmap according to proc/proc.procs
        for idx, x in enumerate(data_by_all_hosts):
            for idy, y in enumerate(filter_dict_with_prefixes(x, "proc").values()):
                try:
                    pos = proc_count[(idx, idy, x['hostname'])]
                except:
                    pos =  len(proc_count)
                    proc_count[(idx, idy, x['hostname'])] = pos
#                print t, pos, 100-y.idle, x['hostname'], y.label
                procload_heatmap.append("{time},{pos},{value},{host}" \
                                            .format(time = int(t*1000), pos = pos, value = y.procs,
                                                    host = x['hostname']))

        # all network interface, total cluster
        summed1 = [x['net/total'] for x in data_by_all_hosts if x.has_key('net/total')]

        if summed1:
            summed = reduce_patched(lambda a,b: a._add(b), summed1)
            for x in data_by_all_hosts:
                net = x.get("net/total", None)
                if not net: continue
                # recv-byte, send-byte, recv-packet, send-packet, errors
    #            print t, x['hostname'], net.recv_bytes, net.send_bytes, net.recv_packets, net.send_packets, net.recv_errs+net.send_errs+net.recv_drop+net.send_drop
    #        print t, summed
            network_overall.append("{time},{recv_bytes},{send_bytes},{recv_packets},{send_packets},{errors}" \
                                       .format(time         = int(t*1000),
                                               recv_bytes   = summed.recv_bytes / PROBE_INTERVAL,
                                               send_bytes   = summed.send_bytes / PROBE_INTERVAL,
                                               recv_packets = summed.recv_packets / PROBE_INTERVAL,
                                               send_packets = summed.send_packets / PROBE_INTERVAL,
                                               errors = (summed.recv_errs + summed.send_errs + \
                                                             summed.recv_drop + summed.send_drop) / PROBE_INTERVAL)
                                   )

        # all network adapters, plot heatmap according to net/recv_bytes + send_bytes
        for idx, x in enumerate(data_by_all_hosts):
            for idy, y in enumerate(filter_dict_with_prefixes(x, "net", "!net/total").values()):
                try:
                    pos = network_count[(idx, idy, x['hostname'])]
                except:
                    pos =  len(network_count)
                    network_count[(idx, idy, x['hostname'])] = pos
                network_heatmap.append("{time},{pos},{value},{host},{networkid}" \
                                       .format(time  = int(t*1000),
                                               pos   = pos*2,
                                               value = y.recv_bytes / PROBE_INTERVAL,
                                               host  = x['hostname'],
                                               networkid = y.label+".recv"))
                network_heatmap.append("{time},{pos},{value},{host},{networkid}" \
                                       .format(time  = int(t*1000),
                                               pos   = pos*2+1,
                                               value = y.send_bytes / PROBE_INTERVAL,
                                               host  = x['hostname'],
                                               networkid = y.label+".send"))

    with open(samedir("chart-template.html")) as f:
        template = f.read()

    variables = locals()
    def my_replace(match):
        match = match.group()[1:-1]
        if match.endswith('heatmap') or match.endswith('overall'):
            return "\n".join(variables[match])
        elif match =='events':
            return "\n".join(events)
        elif match == 'probe_interval':
            return str(PROBE_INTERVAL * 1000)
        elif match == 'workload_name':
            return workload_title
        else:
            return '{%s}' % match

    with open(report_fn, 'w') as f:
        f.write(re.sub(r'{\w+}', my_replace, template))

def show_usage():
    log("""Usage:
    monitor.py <workload_title> <parent_pid> <log_path.log> <benchlog_fn.log> <report_path.html> <monitor_node_name1> ... <monitor_node_nameN>
""")

if __name__=="__main__":
    if len(sys.argv)<6:
        log(sys.argv)
        show_usage()
        sys.exit(1)

#    log(sys.argv)
    global log_path
    global report_path
    global workload_title
    global bench_log_path
    global na

    workload_title = sys.argv[1]
    parent_pid = sys.argv[2]
    log_path = sys.argv[3]
    bench_log_path = sys.argv[4]
    report_path = sys.argv[5]
    nodes_to_monitor = sys.argv[6:]
    pid=os.fork()
    if pid:                               #parent
        print pid
    else:                                 #child
        os.close(0)
        os.close(1)
        os.close(2)
#        log("child process start")
        signal.signal(signal.SIGTERM, sig_term_handler)
        start_monitor(log_path, nodes_to_monitor)
        while  os.path.exists("/proc/%s" % parent_pid):
            sleep(1)
        # parent lost, stop!
        signal.signal(signal.SIGTERM, signal.SIG_IGN)
        na.stop()
        generate_report(workload_title, log_path, bench_log_path, report_path)

HiBench成长笔记——(9) 分析源码monitor.py的更多相关文章

  1. HiBench成长笔记——(10) 分析源码execute_with_log.py

    #!/usr/bin/env python2 # Licensed to the Apache Software Foundation (ASF) under one or more # contri ...

  2. HiBench成长笔记——(8) 分析源码workload_functions.sh

    workload_functions.sh 是测试程序的入口,粘连了监控程序 monitor.py 和 主运行程序: #!/bin/bash # Licensed to the Apache Soft ...

  3. HiBench成长笔记——(11) 分析源码run.sh

    #!/bin/bash # Licensed to the Apache Software Foundation (ASF) under one or more # contributor licen ...

  4. HiBench成长笔记——(5) HiBench-Spark-SQL-Scan源码分析

    run.sh #!/bin/bash # Licensed to the Apache Software Foundation (ASF) under one or more # contributo ...

  5. memcached学习笔记——存储命令源码分析下篇

    上一篇回顾:<memcached学习笔记——存储命令源码分析上篇>通过分析memcached的存储命令源码的过程,了解了memcached如何解析文本命令和mencached的内存管理机制 ...

  6. memcached学习笔记——存储命令源码分析上篇

    原创文章,转载请标明,谢谢. 上一篇分析过memcached的连接模型,了解memcached是如何高效处理客户端连接,这一篇分析memcached源码中的process_update_command ...

  7. [转]【安卓笔记】AsyncTask源码剖析

    [转][安卓笔记]AsyncTask源码剖析 http://blog.csdn.net/chdjj/article/details/39122547 前言: 初学AsyncTask时,就想研究下它的实 ...

  8. Hadoop学习笔记(10) ——搭建源码学习环境

    Hadoop学习笔记(10) ——搭建源码学习环境 上一章中,我们对整个hadoop的目录及源码目录有了一个初步的了解,接下来计划深入学习一下这头神象作品了.但是看代码用什么,难不成gedit?,单步 ...

  9. 【Azure 应用服务】Azure Function App使用SendGrid发送邮件遇见异常消息The operation was canceled,分析源码逐步最终源端

    问题描述 在使用Azure Function App的SendGrid Binging功能,调用SendGrid服务器发送邮件功能时,有时候遇见间歇性,偶发性异常.在重新触发SendGrid部分的Fu ...

随机推荐

  1. RestTemplate post请求使用map传参 Controller 接收不到值的解决方案 postForObject方法源码解析.md

    结论 post方法中如果使用map传参,需要使用MultiValueMap来传递 RestTemplate 的 postForObject 方法有四个参数 String url => 顾名思义 ...

  2. UICollectionViewCell点击高亮效果(附带效果GIF)

    首先效果如下: 背景是这样的:UI上使用的是UICollectionView,所以后面会使用它的协议方法完成. 实现思路是这样的:高亮状态+点击后短时间内保持颜色变化 实现的代码参考如下: // Ce ...

  3. win10+anaconda安装tensorflow和keras遇到的坑小结

    win10下利用anaconda安装tensorflow和keras的教程都大同小异(针对CPU版本,我的gpu是1050TI的MAX-Q,不知为啥一直没安装成功),下面简单说下步骤. 一 Anaco ...

  4. ES5-json对象和字符串互转

    JSON.stringify();和JSON.parse();是在ES5中提出并使用的:JSON.stringify();将一个对象转化为json字符串,JSON.parse();将一个对象转化为对象 ...

  5. c++刷算法的好处

    写再最前面:摘录于柳神的笔记 在已经学习过C语⾔的前提下,学习C++并使⽤它刷算法的学习成本⾮常低-只需要⼏个⼩时就可 以学会- C++向下兼容C,C语⾔⾥⾯的语法完全可以在C++⽂件中运⾏,所以学习 ...

  6. Python学习笔记002

    字符编码:把二进制字符翻译成字符 ASCII码表    256    一个字节,8个比特 支持中文: GB2312 GBK1.0 GB18030 BIG5(台湾) unicode UTF-8 开头定义 ...

  7. 物联网协议CoAP协议学习

    CoAP:Constrained Application Protocol协议是为物联网中资源受限的设备制定的应用层协议,即简化版的基于UDP的HTTP协议.其核心内容为资源抽象.REST式交互可扩展 ...

  8. css样式和定义的class都没问题,但样式却没生效

    今天开发遇到过这样的问题,主要原因是 css 文件格式有问题导致的.有问题的 css 样式的那一行下面的 css 样式不能生效

  9. jsp页面展示更加商品的分类,控制商品的显示

    我的大概思路是这样的,第一后果获取所有的商品分类 保存在list集合里面,第二从后台获取所有的商品 第三在JSP页面遍历商品分类集合放在页面的左边,然后jsp页面商品详细信息这块,也得先遍历商品分类, ...

  10. C++11常用特性介绍——auto类型修饰符

    1.C++11常用特性介绍 从本篇开始介绍C++11常用特性,大致分:关键字及新语法.STL容器.多线程.智能指针内存管理,最后讲一下std::bind和std::function 二.关键字和新语法 ...