perf_event 源码分析

前言

简单来说,perf是一种性能监测工具。它首先对通用处理器提供的performance counter(性能计数器)进行编程,设定计数器的阈值和要监测的事件;此后每当设定的事件发生,性能计数器就会递增,直至计数值达到阈值(即计数器溢出)。不同体系结构提取计数器数值的方式不同:例如MIPS上会注册一个硬件中断,计数器溢出时触发该中断,并在中断处理函数中记录数值;x86则利用通知链机制,将溢出处理函数注册到die_chain通知链上,借助任何一个硬件中断发生的时机检测性能计数器是否溢出,若溢出则记录这个数值。这种实现方式避免了单独为性能计数器溢出注册一个硬件中断。

perf源码分为用户层和内核层。用户层代码为用户提供命令行接口,用于指定事件与采样方式;perf的一大特点就体现在丰富的用户层工具上,可以说,内核部分代码只是为perf提供采样引擎,用户层才是perf的精华。用户层代码位于src/tools/perf目录下,C代码有13000行左右,此外还有大量的脚本程序。内核层代码分为结构无关代码(位于src/kernel/core/目录)和结构相关代码(位于src/arch/x86/cpu/**)。

这里先列个框架:首先从系统启动初始化开始,介绍perf-init的相关工作;之后介绍用户层如何指定事件,通过系统调用转入内核执行采样,采样数据通过内存映射返回给用户层,最后由用户层工具进行上层分析并显示。

perf_event源码分析(一)——cmd_record

perf's main entry

tools/perf/perf.c

static struct cmd_struct commands[] = {
{ "buildid-cache", cmd_buildid_cache, 0 },
{ "buildid-list", cmd_buildid_list, 0 },
{ "diff", cmd_diff, 0 },
{ "evlist", cmd_evlist, 0 },
{ "help", cmd_help, 0 },
{ "list", cmd_list, 0 },
{ "record", cmd_record, 0 },
{ "report", cmd_report, 0 },
{ "bench", cmd_bench, 0 },
{ "stat", cmd_stat, 0 },
{ "timechart", cmd_timechart, 0 },
{ "top", cmd_top, 0 },
{ "annotate", cmd_annotate, 0 },
{ "version", cmd_version, 0 },
{ "script", cmd_script, 0 },
{ "sched", cmd_sched, 0 },
#ifdef HAVE_LIBELF_SUPPORT
{ "probe", cmd_probe, 0 },
#endif
{ "kmem", cmd_kmem, 0 },
{ "lock", cmd_lock, 0 },
{ "kvm", cmd_kvm, 0 },
{ "test", cmd_test, 0 },
#ifdef HAVE_LIBAUDIT_SUPPORT
{ "trace", cmd_trace, 0 },
#endif
{ "inject", cmd_inject, 0 },
{ "mem", cmd_mem, 0 },
{ "data", cmd_data, 0 },
};

perf record's CALL CHAIN:

cmd_record
;; rec points to the existing "struct record" record; a new "struct perf_evlist" is allocated and stored in rec->evlist
perf_evlist__new
perf_config
__cmd_record(&record, argc, argv); // fill out "struct record"
perf_session__new(file, false, tool); // Create a new session for this rec (rec->session). Note: file is a "struct perf_data_file *", i.e. &rec->file
machines__init(&session->machines);
ordered_events__init(&session->ordered_events, ordered_events__deliver_event);
perf_data_file__open(file)
check_pipe(file)
file->path = "perf.data" // If no name was specified, file->path defaults to "perf.data"
open_file(file);
fd = perf_data_file__is_read(file) ? open_file_read(file) : open_file_write(file);
file->fd = fd;
perf_session__create_kernel_maps(session) //
fd = perf_data_file__fd(file); // Get rec's fd, rec->file->fd
record__init_features(rec);
perf_header__set_feat // Fill out session's header of this rec, rec->session->header
record__open(rec)
perf_evlist__config(evlist, opts); // perf_evlist
perf_evsel__config(evsel, opts); // perf_evsel
perf_header__clear_feat
perf_header__write_pipe / perf_session__write_header
perf_event__synthesize_kernel_mmap(tool, process_synthesized_event, machine);
perf_event__synthesize_modules(tool, process_synthesized_event, machine);
machines__process_guests(&session->machines,perf_event__synthesize_guest_os, tool);
__machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,process_synthesized_event, opts->sample_address);
tools/perf/builtin-record.c

int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
{
int err = -ENOMEM;
struct record *rec = &record;
char errbuf[BUFSIZ]; rec->evlist = perf_evlist__new();
if (rec->evlist == NULL)
return -ENOMEM; perf_config(perf_record_config, rec); // 解析, tools/perf/util/config.c argc = parse_options(argc, argv, record_options, record_usage,
PARSE_OPT_STOP_AT_NON_OPTION);
if (!argc && target__none(&rec->opts.target))
usage_with_options(record_usage, record_options); if (nr_cgroups && !rec->opts.target.system_wide) {
ui__error("cgroup monitoring only available in"
" system-wide mode\n");
usage_with_options(record_usage, record_options);
}
}
tools/perf/util/parse-events.c

setup_events // tools/perf/builtin-stat.c
parse_events // tools/perf/util/parse-events.c parse_events // tools/perf/util/parse-events.c int parse_events(struct perf_evlist *evlist, const char *str)
{
struct parse_events_evlist data = {
.list = LIST_HEAD_INIT(data.list),
.idx = evlist->nr_entries,
};
int ret; ret = parse_events__scanner(str, &data, PE_START_EVENTS);
perf_pmu__parse_cleanup();
if (!ret) {
int entries = data.idx - evlist->nr_entries;
perf_evlist__splice_list_tail(evlist, &data.list, entries);
evlist->nr_groups += data.nr_groups;
return 0;
} /*
* There are 2 users - builtin-record and builtin-test objects.
* Both call perf_evlist__delete in case of error, so we dont
* need to bother.
*/
return ret;
}

struct introduction

tools/perf/util/target.h

struct target {
const char *pid;
const char *tid;
const char *cpu_list;
const char *uid_str;
uid_t uid;
bool system_wide;
bool uses_mmap;
bool default_per_cpu;
bool per_thread;
};
=== tools/perf/util/data.h struct perf_data_file {
const char *path;
int fd;
bool is_pipe;
bool force;
unsigned long size;
enum perf_data_mode mode;
}; === tools/perf/util/session.h struct perf_session {
struct perf_header header;
struct machines machines;
struct perf_evlist *evlist;
struct trace_event tevent;
bool repipe;
bool one_mmap;
void *one_mmap_addr;
u64 one_mmap_offset;
struct ordered_events ordered_events;
struct perf_data_file *file;
struct perf_tool *tool;
}; === tools/perf/util/evlist.h struct perf_evlist {
struct list_head entries;
struct hlist_head heads[PERF_EVLIST__HLIST_SIZE];
int nr_entries;
int nr_groups;
int nr_mmaps;
size_t mmap_len;
int id_pos;
int is_pos;
u64 combined_sample_type;
struct {
int cork_fd;
pid_t pid;
} workload;
bool overwrite;
struct fdarray pollfd;
struct perf_mmap *mmap;
struct thread_map *threads; // threads
struct cpu_map *cpus; // cpus
struct perf_evsel *selected;
struct events_stats stats;
}; === /** struct perf_evsel - event selector **/ Each event passed in by the user maps to one perf_evsel struct. struct perf_evsel {
struct list_head node;
struct perf_event_attr attr;
char *filter;
struct xyarray *fd;
struct xyarray *sample_id;
u64 *id;
struct perf_counts *counts;
struct perf_counts *prev_raw_counts;
int idx;
u32 ids;
char *name;
double scale;
const char *unit;
bool snapshot;
struct event_format *tp_format;
...
...
struct perf_evsel *leader;
} === tools/perf/builtin-record.c struct record {
struct perf_tool tool;
struct record_opts opts;
u64 bytes_written;
struct perf_data_file file;
struct perf_evlist *evlist;
struct perf_session *session;
const char *progname;
int realtime_prio;
bool no_buildid;
bool no_buildid_cache;
long samples;
}; ===
Important: "struct perf_stat" contains an array of three "struct stats" (res_stats),
and each element is initialized as follows:
for (i = 0; i < 3; i++)
init_stats(&ps->res_stats[i]); struct perf_stat {
struct stats res_stats[3];
}; tools/perf/util/stat.h struct stats
{
double n, mean, M2;
u64 max, min;
}; ====
tools/perf/util/evsel.h struct perf_counts_values {
union {
struct {
u64 val;
u64 ena;
u64 run;
};
u64 values[3];
};
}; struct perf_counts {
s8 scaled;
struct perf_counts_values aggr;
struct perf_counts_values cpu[];
};

perf stat's CALL CHAIN

CALL CHAIN:
commands // tools/perf/perf.c
cmd_stat // tools/perf/builtin-stat.c
parse_events_option // When an event name is given via "perf stat -e xxx", that event name is parsed and checked here
parse_events
parse_events__scanner // check events
parse_events_lex_init_extra
parse_events__scan_string
parse_events_parse
parse_events__flush_buffer
parse_events__delete_buffer
parse_events_lex_destroy
perf_pmu__parse_cleanup:
perf_evlist__new();
perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus, struct thread_map *threads) // evlist->cpus, evlist->threads
perf_evlist__set_maps ///
parse_options
parse_options_usage
add_default_attributes()
target__validate(&target);
perf_evlist__create_maps(evsel_list, &target) // fill out evlist->threads(thread_map)
evlist->threads = thread_map__new_str(target->pid, target->tid,target->uid); // evlist->threads
evlist->threads(thread_map) = [tid,tid,tid,tid,...]
target__uses_dummy_map(target)
evlist->cpus = cpu_map__dummy_new() // evlist->cpus
evlist->cpus = cpu_map__new(target->cpu_list)
perf_evlist__alloc_stats(evsel_list, interval) // Traverse all evsel
evlist__for_each(evlist, evsel) {
perf_evsel__alloc_stat_priv(evsel) // Alloc memory for each evsel->priv = zalloc(sizeof(struct perf_stat));
perf_evsel__reset_stat_priv(evsel)
init_stats // Initialize "struct perf_stat", which contains 3 elements of "struct stats{}"
perf_evsel__alloc_counts(evsel, perf_evsel__nr_cpus(evsel)) // Alloc evsel->counts
alloc_raw && perf_evsel__alloc_prev_raw_counts(evsel) // Alloc evsel->prev_raw_counts = addr;
}
perf_stat_init_aggr_mode()
cpu_map__build_socket_map
cpu_map__build_map(cpus, sockp, cpu_map__get_socket);
cpu_map__get_socket
cpu_map__build_core_map
cpu_map__build_map(cpus, corep, cpu_map__get_core);
cpu_map__get_core
cpu_map__get_socket run_perf_stat(argc, argv);
__run_perf_stat(argc, argv);
perf_evlist__prepare_workload(evsel_list, &target, argv, false, workload_exec_failed_signal)
perf_evlist__set_leader(evsel_list); // evlist->nr_groups is set to 1 if evlist->nr_entries > 1, otherwise 0
__perf_evlist__set_leader(&evlist->entries);
evlist__for_each(evsel_list, evsel) { // Traverse all evsel
create_perf_stat_counter(evsel)
struct perf_event_attr *attr = &evsel->attr;
attr->xxx = xxx
perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel)
perf_evsel__is_group_leader(evsel)
perf_evsel__open_per_thread(evsel, evsel_list->threads)
// important: __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, struct thread_map *threads)
__perf_evsel__open(evsel, &empty_cpu_map.map, threads)
// perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads), if system_wide: nthreads = 1
perf_evsel__alloc_fd(evsel, cpus->nr, nthreads)
evsel->fd = xyarray__new(ncpus, nthreads, sizeof(int));
for (cpu = 0; cpu < cpus->nr; cpu++) {
for (thread = 0; thread < nthreads; thread++) {
group_fd = get_group_fd(evsel, cpu, thread);
sys_perf_event_open(&evsel->attr, pid, cpus->map[cpu], group_fd, flags);
}
}
}
perf_evlist__apply_filters(evsel_list, &counter)
evlist__for_each(evlist, evsel) {
perf_evsel__set_filter(evsel, ncpus, nthreads, evsel->filter);
}
t0 = rdclock();
clock_gettime(CLOCK_MONOTONIC, &ref_time);
if (forks) {
perf_evlist__start_workload(evsel_list);
handle_initial_delay();
if (interval) {
print_interval();
}
} else {
handle_initial_delay();
print_interval();
}
t1 = rdclock(); update_stats(&walltime_nsecs_stats, t1 - t0); // 开始为每个evsel读
if (aggr_mode == AGGR_GLOBAL) {
evlist__for_each(evsel_list, counter) {
// 读到struct: "struct perf_counts_values", 保存在evsel的 &counter->counts->aggr , (这里evsel 就是counter)
// 还有“struct perf_stat” , counter->priv
read_counter_aggr(counter);
aggr->val = aggr->ena = aggr->run = 0; // 这里, 把 perf_counts_values aggr 全部初始化为0
read_counter(counter) // 如何读此event?遍历每个thread和cpu
int nthreads = thread_map__nr(evsel_list->threads);
int ncpus = perf_evsel__nr_cpus(counter);
int cpu, thread;
for (thread = 0; thread < nthreads; thread++) {
for (cpu = 0; cpu < ncpus; cpu++) {
// 以 (cpu, thread) 二维数组的方式逐个读取, 结果读到 "struct perf_counts_values count"
process_per_cpu(struct perf_evsel *evsel, int cpu, int thread))
perf_evsel__read_cb(evsel, cpu, thread, &count)
memset(count, 0, sizeof(*count));
FD(evsel, cpu, thread)
readn(FD(evsel, cpu, thread), count, sizeof(*count))
ion(true, fd, buf, n);
read(fd, buf, left) read_cb(evsel, cpu, thread, tmp);
switch (aggr_mode) {
case AGGR_CORE:
case AGGR_SOCKET:
case AGGR_NONE:
perf_evsel__compute_deltas(evsel, cpu, count);
perf_counts_values__scale(count, scale, NULL);
update_shadow_stats(evsel, count->values, cpu); }
}
}
perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), thread_map__nr(evsel_list->threads));
}
} else {
evlist__for_each(evsel_list, counter) {
read_counter(counter);
perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), 1);
}
} print_stat
print_aggr // AGGR_CORE AGGR_SOCKET
print_counter_aggr(evsel, NULL); // AGGR_GLOBAL
print_counter(evsel, NULL) // AGGR_NONE
tools/perf/util/evsel.h

struct perf_evsel {

}

perf-perf stat用户层代码分析的更多相关文章

  1. Express实例代码分析1——简单的用户验证登录文件

    /** * Module dependencies. */ var express = require('../..');// ../..是上级目录的上级目录 var hash = require(' ...

  2. 完整全面的Java资源库(包括构建、操作、代码分析、编译器、数据库、社区等等)

    构建 这里搜集了用来构建应用程序的工具. Apache Maven:Maven使用声明进行构建并进行依赖管理,偏向于使用约定而不是配置进行构建.Maven优于Apache Ant.后者采用了一种过程化 ...

  3. wifi display代码 分析

    转自:http://blog.csdn.net/lilian0118/article/details/23168531 这一章中我们来看Wifi Display连接过程的建立,包含P2P的部分和RTS ...

  4. Linux从用户层到内核层系列 - GNU系列之glibc介绍

    题记:本系列文章的目的是抛开书本从源代码和使用的角度分析Linux内核和相关源代码,byhankswang和你一起玩转linux开发 轻松搞定TCP/IP协议栈,原创文章欢迎交流, byhankswa ...

  5. 虚拟机创建流程中neutron代码分析(三)

    前言: 当neutron-server创建了port信息,将port信息写入数据库中.流程返回到nova服务端,接着nova创建的流程继续走.在计算节点中neutron-agent同样要完成很多的工作 ...

  6. Android Hal层简要分析

    Android Hal层简要分析 Android Hal层(即 Hardware Abstraction Layer)是Google开发的Android系统里上层应用对底层硬件操作屏蔽的一个软件层次, ...

  7. 【转载】word2vec原理推导与代码分析

    本文的理论部分大量参考<word2vec中的数学原理详解>,按照我这种初学者方便理解的顺序重新编排.重新叙述.题图来自siegfang的博客.我提出的Java方案基于kojisekig,我 ...

  8. OVS 内核KEY值提取及匹配流表代码分析

    原文链接:http://ry0117.com/2016/12/24/OVS内核KEY值提取及匹配流表代码分析/ 当开启OVS后,创建datapath类型为system的网桥并他添加相关接口,OVS网桥 ...

  9. Https与Http,SSL,DevOps, 静态代码分析工具,RFID, SSH, 非对称加密算法(使用最广泛的一种是RSA), 数字签名, 数字证书

    在URL前加https://前缀表明是用SSL加密的. 你的电脑与服务器之间收发的信息传输将更加安全. Web服务器启用SSL需要获得一个服务器证书并将该证书与要使用SSL的服务器绑定. http和h ...

随机推荐

  1. 热烈庆祝国产编程语言R++1.8研发成功

    R++是专为懒人设计的国产编程语言.支持无操作系统裸奔.编译成机器码.android.cocos2dx绑定.中文编程.闭包.惰性求值.JSON.Lisp的S表达式.内联汇编.伪代码.模板.宏.多重继承 ...

  2. CF 445B(DZY Loves Chemistry-求连通块)

    B. DZY Loves Chemistry time limit per test 1 second memory limit per test 256 megabytes input standa ...

  3. BAT 前端开发面经 —— 吐血总结 前端相关片段整理——持续更新 前端基础精简总结 Web Storage You don't know js

    BAT 前端开发面经 —— 吐血总结   目录 1. Tencent 2. 阿里 3. 百度 更好阅读,请移步这里 聊之前 最近暑期实习招聘已经开始,个人目前参加了阿里的内推及腾讯和百度的实习生招聘, ...

  4. hdu 1248 寒冰王座(暴力)

    寒冰王座 Time Limit: 2000/1000 MS (Java/Others)    Memory Limit: 65536/32768 K (Java/Others) Total Submi ...

  5. git-svn for mac

    熟练使用 git ,新公司用的是 svn,这就尴尬了,为了这个习惯问题,我还是毅然坚持使用 git,但是又不与公司的 svn 冲突,所以就找到了 git 的 git-svn 插件. 在 mac 上使用 ...

  6. java文件和目录的增删复制

    在使用java进行开发时常常会用到文件和目录的增删复制等方法.我写了一个小工具类.和大家分享,希望大家指正: package com.wangpeng.utill; import java.io.Fi ...

  7. 第十四章 netlink机制--基于Linux3.10【转】

    本文转载自:http://blog.csdn.net/shichaog/article/details/44682613 Netlink基于网络的消息机制,能够让用户和内核空间进行通信,12.3节提到 ...

  8. go语言笔记——map map 默认是无序的,不管是按照 key 还是按照 value 默认都不排序

    示例 8.1 make_maps.go package main import "fmt" func main() { var mapLit map[string]int //va ...

  9. E20171015-hm

    quirk   n. 怪癖; 奇事,巧合; 突然的弯曲; propagation  n. 宣传; 传播,传输,蔓延,扩展,波及深度; [生]繁殖法,[地]传导; 培养; immediate  adj. ...

  10. Countries in War(强连通分量及其缩点)

    http://poj.org/problem?id=3114 题意:有n个城市,m条边,由a城市到b城市的通信时间为w,若a城市与b城市连通,b城市与a城市也连通,则a,b城市之间的通信时间为0,求出 ...