From 4e267db135c44d0b18e553899fe7df32b89211a5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Apr 2016 07:38:13 -0400 Subject: [PATCH 01/20] tracing: Make the pid filtering helper functions global Make the functions used for pid filtering global for tracing, such that the function tracer can use the pid code as well. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 9 +++++++++ kernel/trace/trace_events.c | 34 +++++++++++++++++----------------- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5167c366d6b7..172330891c6d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -628,6 +628,15 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long tracing_thresh; +/* PID filtering */ +bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, + pid_t search_pid); +bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, + struct task_struct *task); +void trace_filter_add_remove_task(struct trace_pid_list *pid_list, + struct task_struct *self, + struct task_struct *task); + #ifdef CONFIG_TRACER_MAX_TRACE void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3d4155892a1e..b5e514c4dada 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -503,8 +503,8 @@ static void ftrace_clear_events(struct trace_array *tr) extern int pid_max; /* Returns true if found in filter */ -static bool -find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) +bool +trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) { /* * If pid_max changed after filtered_pids was created, we @@ -516,8 +516,8 @@ find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) return test_bit(search_pid, filtered_pids->pids); } -static bool -ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) +bool +trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) { /* * Return false, because if filtered_pids does not exist, @@ -526,19 +526,19 @@ ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) if (!filtered_pids) return false; - return !find_filtered_pid(filtered_pids, task->pid); + return !trace_find_filtered_pid(filtered_pids, task->pid); } -static void filter_add_remove_task(struct trace_pid_list *pid_list, - struct task_struct *self, - struct task_struct *task) +void trace_filter_add_remove_task(struct trace_pid_list *pid_list, + struct task_struct *self, + struct task_struct *task) { if (!pid_list) return; /* For forks, we only add if the forking task is listed */ if (self) { - if (!find_filtered_pid(pid_list, self->pid)) + if (!trace_find_filtered_pid(pid_list, self->pid)) return; } @@ -560,7 +560,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task) struct trace_array *tr = data; pid_list = rcu_dereference_sched(tr->filtered_pids); - filter_add_remove_task(pid_list, NULL, task); + trace_filter_add_remove_task(pid_list, NULL, task); } static void @@ -572,7 +572,7 @@ event_filter_pid_sched_process_fork(void *data, struct trace_array *tr = data; pid_list = rcu_dereference_sched(tr->filtered_pids); - filter_add_remove_task(pid_list, self, task); + trace_filter_add_remove_task(pid_list, self, task); } void trace_event_follow_fork(struct trace_array *tr, bool enable) @@ 
-600,8 +600,8 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       ignore_this_task(pid_list, prev) &&
-		       ignore_this_task(pid_list, next));
+		       trace_ignore_this_task(pid_list, prev) &&
+		       trace_ignore_this_task(pid_list, next));
 }
 
 static void
@@ -614,7 +614,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       ignore_this_task(pid_list, next));
+		       trace_ignore_this_task(pid_list, next));
 }
 
 static void
@@ -630,7 +630,7 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       ignore_this_task(pid_list, task));
+		       trace_ignore_this_task(pid_list, task));
 }
 
 static void
@@ -647,7 +647,7 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)
 	/* Set tracing if current is enabled */
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       ignore_this_task(pid_list, current));
+		       trace_ignore_this_task(pid_list, current));
 }
 
 static void __ftrace_clear_event_pids(struct trace_array *tr)
@@ -1654,7 +1654,7 @@ static void ignore_task_cpu(void *data)
 				     mutex_is_locked(&event_mutex));
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       ignore_this_task(pid_list, current));
+		       trace_ignore_this_task(pid_list, current));
 }
 
 static ssize_t

From d8275c454dcdba296675221b4c12f19d1b6e0ee8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Thu, 14 Apr 2016 12:15:22 -0400
Subject: [PATCH 02/20] tracing: Move filtered_pid helper functions into trace.c

As the filtered_pid functions are going to be used by function tracer as
well as trace_events, move the code into the generic trace.c file.

The functions moved are:

 trace_find_filtered_pid()
 trace_ignore_this_task()
 trace_filter_add_remove_task()

Kernel Doc text was also added.

Signed-off-by: Steven Rostedt
---
 kernel/trace/trace.c        | 78 +++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_events.c | 51 ------------------------
 2 files changed, 78 insertions(+), 51 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8a4bd6b68a0b..0b87fe8e6d0b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -319,6 +319,84 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec,
 	return 0;
 }
 
+/**
+ * trace_find_filtered_pid - check if a pid exists in a filtered_pid list
+ * @filtered_pids: The list of pids to check
+ * @search_pid: The PID to find in @filtered_pids
+ *
+ * Returns true if @search_pid is found in @filtered_pids, and false otherwise.
+ */
+bool
+trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
+{
+	/*
+	 * If pid_max changed after filtered_pids was created, we
+	 * by default ignore all pids greater than the previous pid_max.
+	 */
+	if (search_pid >= filtered_pids->pid_max)
+		return false;
+
+	return test_bit(search_pid, filtered_pids->pids);
+}
+
+/**
+ * trace_ignore_this_task - should a task be ignored for tracing
+ * @filtered_pids: The list of pids to check
+ * @task: The task that should be ignored if not filtered
+ *
+ * Checks if @task should be traced or not from @filtered_pids.
+ * Returns true if @task should *NOT* be traced.
+ * Returns false if @task should be traced.
+ */ +bool +trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) +{ + /* + * Return false, because if filtered_pids does not exist, + * all pids are good to trace. + */ + if (!filtered_pids) + return false; + + return !trace_find_filtered_pid(filtered_pids, task->pid); +} + +/** + * trace_pid_filter_add_remove - Add or remove a task from a pid_list + * @pid_list: The list to modify + * @self: The current task for fork or NULL for exit + * @task: The task to add or remove + * + * If adding a task, if @self is defined, the task is only added if @self + * is also included in @pid_list. This happens on fork and tasks should + * only be added when the parent is listed. If @self is NULL, then the + * @task pid will be removed from the list, which would happen on exit + * of a task. + */ +void trace_filter_add_remove_task(struct trace_pid_list *pid_list, + struct task_struct *self, + struct task_struct *task) +{ + if (!pid_list) + return; + + /* For forks, we only add if the forking task is listed */ + if (self) { + if (!trace_find_filtered_pid(pid_list, self->pid)) + return; + } + + /* Sorry, but we don't support pid_max changing after setting */ + if (task->pid >= pid_list->pid_max) + return; + + /* "self" is set for forks, and NULL for exits */ + if (self) + set_bit(task->pid, pid_list->pids); + else + clear_bit(task->pid, pid_list->pids); +} + static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) { u64 ts; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index b5e514c4dada..a11e6d9a3841 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -502,57 +502,6 @@ static void ftrace_clear_events(struct trace_array *tr) /* Shouldn't this be in a header? */ extern int pid_max; -/* Returns true if found in filter */ -bool -trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) -{ - /* - * If pid_max changed after filtered_pids was created, we - * by default ignore all pids greater than the previous pid_max. - */ - if (search_pid >= filtered_pids->pid_max) - return false; - - return test_bit(search_pid, filtered_pids->pids); -} - -bool -trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) -{ - /* - * Return false, because if filtered_pids does not exist, - * all pids are good to trace. - */ - if (!filtered_pids) - return false; - - return !trace_find_filtered_pid(filtered_pids, task->pid); -} - -void trace_filter_add_remove_task(struct trace_pid_list *pid_list, - struct task_struct *self, - struct task_struct *task) -{ - if (!pid_list) - return; - - /* For forks, we only add if the forking task is listed */ - if (self) { - if (!trace_find_filtered_pid(pid_list, self->pid)) - return; - } - - /* Sorry, but we don't support pid_max changing after setting */ - if (task->pid >= pid_list->pid_max) - return; - - /* "self" is set for forks, and NULL for exits */ - if (self) - set_bit(task->pid, pid_list->pids); - else - clear_bit(task->pid, pid_list->pids); -} - static void event_filter_pid_sched_process_exit(void *data, struct task_struct *task) { From 5cc8976bd52153678ca37cc1e3000833b20276f3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 20 Apr 2016 15:19:54 -0400 Subject: [PATCH 03/20] tracing: Move the pid_list seq_file functions to be global To allow other aspects of ftrace to use the pid_list logic, we need to reuse the seq_file functions. 
Making the generic part into functions that can be called by other files
will help in this regard.

Signed-off-by: Steven Rostedt
---
 kernel/trace/trace.c        | 71 +++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h        |  3 ++
 kernel/trace/trace_events.c | 34 ++----------------
 3 files changed, 77 insertions(+), 31 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0b87fe8e6d0b..7943e306cc7f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -397,6 +397,77 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
 		clear_bit(task->pid, pid_list->pids);
 }
 
+/**
+ * trace_pid_next - Used for seq_file to get to the next pid of a pid_list
+ * @pid_list: The pid list to show
+ * @v: The last pid that was shown (+1 the actual pid to let zero be displayed)
+ * @pos: The position of the file
+ *
+ * This is used by the seq_file "next" operation to iterate the pids
+ * listed in a trace_pid_list structure.
+ *
+ * Returns the pid+1 as we want to display pid of zero, but NULL would
+ * stop the iteration.
+ */
+void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos)
+{
+	unsigned long pid = (unsigned long)v;
+
+	(*pos)++;
+
+	/* pid already is +1 of the actual previous bit */
+	pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
+
+	/* Return pid + 1 to allow zero to be represented */
+	if (pid < pid_list->pid_max)
+		return (void *)(pid + 1);
+
+	return NULL;
+}
+
+/**
+ * trace_pid_start - Used for seq_file to start reading pid lists
+ * @pid_list: The pid list to show
+ * @pos: The position of the file
+ *
+ * This is used by seq_file "start" operation to start the iteration
+ * of listing pids.
+ *
+ * Returns the pid+1 as we want to display pid of zero, but NULL would
+ * stop the iteration.
+ */
+void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos)
+{
+	unsigned long pid;
+	loff_t l = 0;
+
+	pid = find_first_bit(pid_list->pids, pid_list->pid_max);
+	if (pid >= pid_list->pid_max)
+		return NULL;
+
+	/* Return pid + 1 so that zero can be the exit value */
+	for (pid++; pid && l < *pos;
+	     pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l))
+		;
+	return (void *)pid;
+}
+
+/**
+ * trace_pid_show - show the current pid in seq_file processing
+ * @m: The seq_file structure to write into
+ * @v: A void pointer of the pid (+1) value to display
+ *
+ * Can be directly used by seq_file operations to display the current
+ * pid value.
+ */ +int trace_pid_show(struct seq_file *m, void *v) +{ + unsigned long pid = (unsigned long)v - 1; + + seq_printf(m, "%lu\n", pid); + return 0; +} + static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) { u64 ts; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 172330891c6d..45442d5842f2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -636,6 +636,9 @@ bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, void trace_filter_add_remove_task(struct trace_pid_list *pid_list, struct task_struct *self, struct task_struct *task); +void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos); +void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos); +int trace_pid_show(struct seq_file *m, void *v); #ifdef CONFIG_TRACER_MAX_TRACE void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a11e6d9a3841..fd831a972bae 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -983,18 +983,8 @@ p_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_array *tr = m->private; struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); - unsigned long pid = (unsigned long)v; - (*pos)++; - - /* pid already is +1 of the actual prevous bit */ - pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); - - /* Return pid + 1 to allow zero to be represented */ - if (pid < pid_list->pid_max) - return (void *)(pid + 1); - - return NULL; + return trace_pid_next(pid_list, v, pos); } static void *p_start(struct seq_file *m, loff_t *pos) @@ -1002,8 +992,6 @@ static void *p_start(struct seq_file *m, loff_t *pos) { struct trace_pid_list *pid_list; struct trace_array *tr = m->private; - unsigned long pid; - loff_t l = 0; /* * Grab the mutex, to keep calls to p_next() having the same @@ -1019,15 +1007,7 @@ static void *p_start(struct seq_file *m, loff_t *pos) if (!pid_list) return NULL; - pid = find_first_bit(pid_list->pids, pid_list->pid_max); - if (pid >= pid_list->pid_max) - return NULL; - - /* Return pid + 1 so that zero can be the exit value */ - for (pid++; pid && l < *pos; - pid = (unsigned long)p_next(m, (void *)pid, &l)) - ; - return (void *)pid; + return trace_pid_start(pid_list, pos); } static void p_stop(struct seq_file *m, void *p) @@ -1037,14 +1017,6 @@ static void p_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } -static int p_show(struct seq_file *m, void *v) -{ - unsigned long pid = (unsigned long)v - 1; - - seq_printf(m, "%lu\n", pid); - return 0; -} - static ssize_t event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -1795,7 +1767,7 @@ static const struct seq_operations show_set_event_seq_ops = { static const struct seq_operations show_set_pid_seq_ops = { .start = p_start, .next = p_next, - .show = p_show, + .show = trace_pid_show, .stop = p_stop, }; From 76c813e26606d35ea9d8d6f96e646b3944c730a9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 21 Apr 2016 11:35:30 -0400 Subject: [PATCH 04/20] tracing: Move pid_list write processing into its own function The addition of PIDs into a pid_list via the write operation of set_event_pid is a bit complex. The same operation will be needed for function tracing pids. Move the code into its own generic function in trace.c, so that we can avoid duplication of this code. 
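For reference, a user of the new helper ends up with roughly the following
pattern (an illustrative sketch modeled on the reworked
ftrace_event_pid_write() further down; the function name here is made up):

	static ssize_t example_pid_write(struct trace_array *tr,
					 const char __user *ubuf, size_t cnt)
	{
		struct trace_pid_list *filtered_pids;
		struct trace_pid_list *pid_list;
		ssize_t ret;

		mutex_lock(&event_mutex);
		filtered_pids = rcu_dereference_protected(tr->filtered_pids,
						lockdep_is_held(&event_mutex));

		/* Parse the user buffer into a brand new pid list */
		ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
		if (ret < 0)
			goto out;

		/* Publish the new list, then free the old one after a grace period */
		rcu_assign_pointer(tr->filtered_pids, pid_list);
		if (filtered_pids) {
			synchronize_sched();
			trace_free_pid_list(filtered_pids);
		}
	 out:
		mutex_unlock(&event_mutex);
		return ret;
	}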
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 109 ++++++++++++++++++++++++++++++++++- kernel/trace/trace.h | 7 +++ kernel/trace/trace_events.c | 110 +++--------------------------------- 3 files changed, 124 insertions(+), 102 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7943e306cc7f..a8bb7485fd1d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include @@ -319,6 +319,12 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec, return 0; } +void trace_free_pid_list(struct trace_pid_list *pid_list) +{ + vfree(pid_list->pids); + kfree(pid_list); +} + /** * trace_find_filtered_pid - check if a pid exists in a filtered_pid list * @filtered_pids: The list of pids to check @@ -468,6 +474,107 @@ int trace_pid_show(struct seq_file *m, void *v) return 0; } +/* 128 should be much more than enough */ +#define PID_BUF_SIZE 127 + +int trace_pid_write(struct trace_pid_list *filtered_pids, + struct trace_pid_list **new_pid_list, + const char __user *ubuf, size_t cnt) +{ + struct trace_pid_list *pid_list; + struct trace_parser parser; + unsigned long val; + int nr_pids = 0; + ssize_t read = 0; + ssize_t ret = 0; + loff_t pos; + pid_t pid; + + if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1)) + return -ENOMEM; + + /* + * Always recreate a new array. The write is an all or nothing + * operation. Always create a new array when adding new pids by + * the user. If the operation fails, then the current list is + * not modified. + */ + pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); + if (!pid_list) + return -ENOMEM; + + pid_list->pid_max = READ_ONCE(pid_max); + + /* Only truncating will shrink pid_max */ + if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) + pid_list->pid_max = filtered_pids->pid_max; + + pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); + if (!pid_list->pids) { + kfree(pid_list); + return -ENOMEM; + } + + if (filtered_pids) { + /* copy the current bits to the new max */ + pid = find_first_bit(filtered_pids->pids, + filtered_pids->pid_max); + while (pid < filtered_pids->pid_max) { + set_bit(pid, pid_list->pids); + pid = find_next_bit(filtered_pids->pids, + filtered_pids->pid_max, + pid + 1); + nr_pids++; + } + } + + while (cnt > 0) { + + pos = 0; + + ret = trace_get_user(&parser, ubuf, cnt, &pos); + if (ret < 0 || !trace_parser_loaded(&parser)) + break; + + read += ret; + ubuf += ret; + cnt -= ret; + + parser.buffer[parser.idx] = 0; + + ret = -EINVAL; + if (kstrtoul(parser.buffer, 0, &val)) + break; + if (val >= pid_list->pid_max) + break; + + pid = (pid_t)val; + + set_bit(pid, pid_list->pids); + nr_pids++; + + trace_parser_clear(&parser); + ret = 0; + } + trace_parser_put(&parser); + + if (ret < 0) { + trace_free_pid_list(pid_list); + return ret; + } + + if (!nr_pids) { + /* Cleared the list of pids */ + trace_free_pid_list(pid_list); + read = ret; + pid_list = NULL; + } + + *new_pid_list = pid_list; + + return read; +} + static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) { u64 ts; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 45442d5842f2..a4dce1ef9e03 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -629,6 +629,9 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long tracing_thresh; /* PID filtering */ + +extern int pid_max; + bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid); bool 
trace_ignore_this_task(struct trace_pid_list *filtered_pids, @@ -639,6 +642,10 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list, void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos); void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos); int trace_pid_show(struct seq_file *m, void *v); +void trace_free_pid_list(struct trace_pid_list *pid_list); +int trace_pid_write(struct trace_pid_list *filtered_pids, + struct trace_pid_list **new_pid_list, + const char __user *ubuf, size_t cnt); #ifdef CONFIG_TRACER_MAX_TRACE void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index fd831a972bae..fd449eb138cf 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -499,9 +498,6 @@ static void ftrace_clear_events(struct trace_array *tr) mutex_unlock(&event_mutex); } -/* Shouldn't this be in a header? */ -extern int pid_max; - static void event_filter_pid_sched_process_exit(void *data, struct task_struct *task) { @@ -634,8 +630,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr) /* Wait till all users are no longer using pid filtering */ synchronize_sched(); - vfree(pid_list->pids); - kfree(pid_list); + trace_free_pid_list(pid_list); } static void ftrace_clear_event_pids(struct trace_array *tr) @@ -1587,13 +1582,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, struct trace_pid_list *filtered_pids = NULL; struct trace_pid_list *pid_list; struct trace_event_file *file; - struct trace_parser parser; - unsigned long val; - loff_t this_pos; - ssize_t read = 0; - ssize_t ret = 0; - pid_t pid; - int nr_pids = 0; + ssize_t ret; if (!cnt) return 0; @@ -1602,93 +1591,15 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, if (ret < 0) return ret; - if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1)) - return -ENOMEM; - mutex_lock(&event_mutex); + filtered_pids = rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); - /* - * Always recreate a new array. The write is an all or nothing - * operation. Always create a new array when adding new pids by - * the user. If the operation fails, then the current list is - * not modified. 
- */ - pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); - if (!pid_list) { - read = -ENOMEM; + ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); + if (ret < 0) goto out; - } - pid_list->pid_max = READ_ONCE(pid_max); - /* Only truncating will shrink pid_max */ - if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) - pid_list->pid_max = filtered_pids->pid_max; - pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); - if (!pid_list->pids) { - kfree(pid_list); - read = -ENOMEM; - goto out; - } - if (filtered_pids) { - /* copy the current bits to the new max */ - pid = find_first_bit(filtered_pids->pids, - filtered_pids->pid_max); - while (pid < filtered_pids->pid_max) { - set_bit(pid, pid_list->pids); - pid = find_next_bit(filtered_pids->pids, - filtered_pids->pid_max, - pid + 1); - nr_pids++; - } - } - while (cnt > 0) { - - this_pos = 0; - - ret = trace_get_user(&parser, ubuf, cnt, &this_pos); - if (ret < 0 || !trace_parser_loaded(&parser)) - break; - - read += ret; - ubuf += ret; - cnt -= ret; - - parser.buffer[parser.idx] = 0; - - ret = -EINVAL; - if (kstrtoul(parser.buffer, 0, &val)) - break; - if (val >= pid_list->pid_max) - break; - - pid = (pid_t)val; - - set_bit(pid, pid_list->pids); - nr_pids++; - - trace_parser_clear(&parser); - ret = 0; - } - trace_parser_put(&parser); - - if (ret < 0) { - vfree(pid_list->pids); - kfree(pid_list); - read = ret; - goto out; - } - - if (!nr_pids) { - /* Cleared the list of pids */ - vfree(pid_list->pids); - kfree(pid_list); - read = ret; - if (!filtered_pids) - goto out; - pid_list = NULL; - } rcu_assign_pointer(tr->filtered_pids, pid_list); list_for_each_entry(file, &tr->events, list) { @@ -1697,10 +1608,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, if (filtered_pids) { synchronize_sched(); - - vfree(filtered_pids->pids); - kfree(filtered_pids); - } else { + trace_free_pid_list(filtered_pids); + } else if (pid_list) { /* * Register a probe that is called before all other probes * to set ignore_pid if next or prev do not match. @@ -1738,9 +1647,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, out: mutex_unlock(&event_mutex); - ret = read; - if (read > 0) - *ppos += read; + if (ret > 0) + *ppos += ret; return ret; } From 345ddcc882d8896dcbdcb3e0ee4a415fc23ec8b0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 22 Apr 2016 18:11:33 -0400 Subject: [PATCH 05/20] ftrace: Have set_ftrace_pid use the bitmap like events do Convert set_ftrace_pid to use the bitmap like set_event_pid does. This allows for instances to use the pid filtering as well, and will allow for function-fork option to set if the children of a traced function should be traced or not. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 333 ++++++++++++--------------- kernel/trace/trace.c | 1 + kernel/trace/trace.h | 15 +- kernel/trace/trace_functions.c | 2 +- kernel/trace/trace_functions_graph.c | 2 +- 5 files changed, 158 insertions(+), 195 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 900dbb1efff2..8b488f4dd8e8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -89,16 +89,16 @@ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; /* What to set function_trace_op to */ static struct ftrace_ops *set_function_trace_op; -/* List for set_ftrace_pid's pids. 
*/ -LIST_HEAD(ftrace_pids); -struct ftrace_pid { - struct list_head list; - struct pid *pid; -}; - -static bool ftrace_pids_enabled(void) +static bool ftrace_pids_enabled(struct ftrace_ops *ops) { - return !list_empty(&ftrace_pids); + struct trace_array *tr; + + if (!(ops->flags & FTRACE_OPS_FL_PID) || !ops->private) + return false; + + tr = ops->private; + + return tr->function_pids != NULL; } static void ftrace_update_trampoline(struct ftrace_ops *ops); @@ -179,7 +179,9 @@ int ftrace_nr_registered_ops(void) static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { - if (!test_tsk_trace_trace(current)) + struct trace_array *tr = op->private; + + if (tr && this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid)) return; op->saved_func(ip, parent_ip, op, regs); @@ -417,7 +419,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) /* Always save the function, and reset at unregistering */ ops->saved_func = ops->func; - if (ops->flags & FTRACE_OPS_FL_PID && ftrace_pids_enabled()) + if (ftrace_pids_enabled(ops)) ops->func = ftrace_pid_func; ftrace_update_trampoline(ops); @@ -450,7 +452,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) static void ftrace_update_pid_func(void) { - bool enabled = ftrace_pids_enabled(); struct ftrace_ops *op; /* Only do something if we are tracing something */ @@ -459,8 +460,8 @@ static void ftrace_update_pid_func(void) do_for_each_ftrace_op(op, ftrace_ops_list) { if (op->flags & FTRACE_OPS_FL_PID) { - op->func = enabled ? ftrace_pid_func : - op->saved_func; + op->func = ftrace_pids_enabled(op) ? + ftrace_pid_func : op->saved_func; ftrace_update_trampoline(op); } } while_for_each_ftrace_op(op); @@ -5324,134 +5325,46 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) return ops->func; } -static void clear_ftrace_swapper(void) +static void +ftrace_filter_pid_sched_switch_probe(void *data, bool preempt, + struct task_struct *prev, struct task_struct *next) { - struct task_struct *p; + struct trace_array *tr = data; + struct trace_pid_list *pid_list; + + pid_list = rcu_dereference_sched(tr->function_pids); + + this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid, + trace_ignore_this_task(pid_list, next)); +} + +static void clear_ftrace_pids(struct trace_array *tr) +{ + struct trace_pid_list *pid_list; int cpu; - get_online_cpus(); - for_each_online_cpu(cpu) { - p = idle_task(cpu); - clear_tsk_trace_trace(p); - } - put_online_cpus(); + pid_list = rcu_dereference_protected(tr->function_pids, + lockdep_is_held(&ftrace_lock)); + if (!pid_list) + return; + + unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); + + for_each_possible_cpu(cpu) + per_cpu_ptr(tr->trace_buffer.data, cpu)->ftrace_ignore_pid = false; + + rcu_assign_pointer(tr->function_pids, NULL); + + /* Wait till all users are no longer using pid filtering */ + synchronize_sched(); + + trace_free_pid_list(pid_list); } -static void set_ftrace_swapper(void) +static void ftrace_pid_reset(struct trace_array *tr) { - struct task_struct *p; - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) { - p = idle_task(cpu); - set_tsk_trace_trace(p); - } - put_online_cpus(); -} - -static void clear_ftrace_pid(struct pid *pid) -{ - struct task_struct *p; - - rcu_read_lock(); - do_each_pid_task(pid, PIDTYPE_PID, p) { - clear_tsk_trace_trace(p); - } while_each_pid_task(pid, PIDTYPE_PID, p); - rcu_read_unlock(); - - put_pid(pid); -} - -static void set_ftrace_pid(struct pid *pid) -{ - struct 
task_struct *p; - - rcu_read_lock(); - do_each_pid_task(pid, PIDTYPE_PID, p) { - set_tsk_trace_trace(p); - } while_each_pid_task(pid, PIDTYPE_PID, p); - rcu_read_unlock(); -} - -static void clear_ftrace_pid_task(struct pid *pid) -{ - if (pid == ftrace_swapper_pid) - clear_ftrace_swapper(); - else - clear_ftrace_pid(pid); -} - -static void set_ftrace_pid_task(struct pid *pid) -{ - if (pid == ftrace_swapper_pid) - set_ftrace_swapper(); - else - set_ftrace_pid(pid); -} - -static int ftrace_pid_add(int p) -{ - struct pid *pid; - struct ftrace_pid *fpid; - int ret = -EINVAL; - mutex_lock(&ftrace_lock); - - if (!p) - pid = ftrace_swapper_pid; - else - pid = find_get_pid(p); - - if (!pid) - goto out; - - ret = 0; - - list_for_each_entry(fpid, &ftrace_pids, list) - if (fpid->pid == pid) - goto out_put; - - ret = -ENOMEM; - - fpid = kmalloc(sizeof(*fpid), GFP_KERNEL); - if (!fpid) - goto out_put; - - list_add(&fpid->list, &ftrace_pids); - fpid->pid = pid; - - set_ftrace_pid_task(pid); - - ftrace_update_pid_func(); - - ftrace_startup_all(0); - - mutex_unlock(&ftrace_lock); - return 0; - -out_put: - if (pid != ftrace_swapper_pid) - put_pid(pid); - -out: - mutex_unlock(&ftrace_lock); - return ret; -} - -static void ftrace_pid_reset(void) -{ - struct ftrace_pid *fpid, *safe; - - mutex_lock(&ftrace_lock); - list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) { - struct pid *pid = fpid->pid; - - clear_ftrace_pid_task(pid); - - list_del(&fpid->list); - kfree(fpid); - } + clear_ftrace_pids(tr); ftrace_update_pid_func(); ftrace_startup_all(0); @@ -5459,44 +5372,52 @@ static void ftrace_pid_reset(void) mutex_unlock(&ftrace_lock); } +/* Greater than any max PID */ +#define FTRACE_NO_PIDS (void *)(PID_MAX_LIMIT + 1) + static void *fpid_start(struct seq_file *m, loff_t *pos) + __acquires(RCU) { + struct trace_pid_list *pid_list; + struct trace_array *tr = m->private; + mutex_lock(&ftrace_lock); + rcu_read_lock_sched(); - if (!ftrace_pids_enabled() && (!*pos)) - return (void *) 1; + pid_list = rcu_dereference_sched(tr->function_pids); - return seq_list_start(&ftrace_pids, *pos); + if (!pid_list) + return !(*pos) ? 
FTRACE_NO_PIDS : NULL; + + return trace_pid_start(pid_list, pos); } static void *fpid_next(struct seq_file *m, void *v, loff_t *pos) { - if (v == (void *)1) + struct trace_array *tr = m->private; + struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_pids); + + if (v == FTRACE_NO_PIDS) return NULL; - return seq_list_next(v, &ftrace_pids, pos); + return trace_pid_next(pid_list, v, pos); } static void fpid_stop(struct seq_file *m, void *p) + __releases(RCU) { + rcu_read_unlock_sched(); mutex_unlock(&ftrace_lock); } static int fpid_show(struct seq_file *m, void *v) { - const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); - - if (v == (void *)1) { + if (v == FTRACE_NO_PIDS) { seq_puts(m, "no pid\n"); return 0; } - if (fpid->pid == ftrace_swapper_pid) - seq_puts(m, "swapper tasks\n"); - else - seq_printf(m, "%u\n", pid_vnr(fpid->pid)); - - return 0; + return trace_pid_show(m, v); } static const struct seq_operations ftrace_pid_sops = { @@ -5509,58 +5430,103 @@ static const struct seq_operations ftrace_pid_sops = { static int ftrace_pid_open(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; + struct seq_file *m; int ret = 0; + if (trace_array_get(tr) < 0) + return -ENODEV; + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - ftrace_pid_reset(); + ftrace_pid_reset(tr); - if (file->f_mode & FMODE_READ) - ret = seq_open(file, &ftrace_pid_sops); + ret = seq_open(file, &ftrace_pid_sops); + if (ret < 0) { + trace_array_put(tr); + } else { + m = file->private_data; + /* copy tr over to seq ops */ + m->private = tr; + } return ret; } +static void ignore_task_cpu(void *data) +{ + struct trace_array *tr = data; + struct trace_pid_list *pid_list; + + /* + * This function is called by on_each_cpu() while the + * event_mutex is held. + */ + pid_list = rcu_dereference_protected(tr->function_pids, + mutex_is_locked(&ftrace_lock)); + + this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid, + trace_ignore_this_task(pid_list, current)); +} + static ssize_t ftrace_pid_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - char buf[64], *tmp; - long val; - int ret; + struct seq_file *m = filp->private_data; + struct trace_array *tr = m->private; + struct trace_pid_list *filtered_pids = NULL; + struct trace_pid_list *pid_list; + ssize_t ret; - if (cnt >= sizeof(buf)) - return -EINVAL; + if (!cnt) + return 0; - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; + mutex_lock(&ftrace_lock); - buf[cnt] = 0; + filtered_pids = rcu_dereference_protected(tr->function_pids, + lockdep_is_held(&ftrace_lock)); + + ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); + if (ret < 0) + goto out; + + rcu_assign_pointer(tr->function_pids, pid_list); + + if (filtered_pids) { + synchronize_sched(); + trace_free_pid_list(filtered_pids); + } else if (pid_list) { + /* Register a probe to set whether to ignore the tracing of a task */ + register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); + } /* - * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid" - * to clean the filter quietly. + * Ignoring of pids is done at task switch. But we have to + * check for those tasks that are currently running. + * Always do this in case a pid was appended or removed. 
*/ - tmp = strstrip(buf); - if (strlen(tmp) == 0) - return 1; + on_each_cpu(ignore_task_cpu, tr, 1); - ret = kstrtol(tmp, 10, &val); - if (ret < 0) - return ret; + ftrace_update_pid_func(); + ftrace_startup_all(0); + out: + mutex_unlock(&ftrace_lock); - ret = ftrace_pid_add(val); + if (ret > 0) + *ppos += ret; - return ret ? ret : cnt; + return ret; } static int ftrace_pid_release(struct inode *inode, struct file *file) { - if (file->f_mode & FMODE_READ) - seq_release(inode, file); + struct trace_array *tr = inode->i_private; - return 0; + trace_array_put(tr); + + return seq_release(inode, file); } static const struct file_operations ftrace_pid_fops = { @@ -5571,24 +5537,17 @@ static const struct file_operations ftrace_pid_fops = { .release = ftrace_pid_release, }; -static __init int ftrace_init_tracefs(void) +void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer) { - struct dentry *d_tracer; - - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) - return 0; - - ftrace_init_dyn_tracefs(d_tracer); + /* Only the top level directory has the dyn_tracefs and profile */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { + ftrace_init_dyn_tracefs(d_tracer); + ftrace_profile_tracefs(d_tracer); + } trace_create_file("set_ftrace_pid", 0644, d_tracer, - NULL, &ftrace_pid_fops); - - ftrace_profile_tracefs(d_tracer); - - return 0; + tr, &ftrace_pid_fops); } -fs_initcall(ftrace_init_tracefs); /** * ftrace_kill - kill ftrace diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a8bb7485fd1d..aa240551fc5d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7233,6 +7233,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) for_each_tracing_cpu(cpu) tracing_init_tracefs_percpu(tr, cpu); + ftrace_init_tracefs(tr, d_tracer); } static struct vfsmount *trace_automount(void *ingore) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a4dce1ef9e03..eaee458755a4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -156,6 +156,9 @@ struct trace_array_cpu { char comm[TASK_COMM_LEN]; bool ignore_pid; +#ifdef CONFIG_FUNCTION_TRACER + bool ftrace_ignore_pid; +#endif }; struct tracer; @@ -247,6 +250,7 @@ struct trace_array { int ref; #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops *ops; + struct trace_pid_list __rcu *function_pids; /* function tracing enabled */ int function_enabled; #endif @@ -840,12 +844,9 @@ extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER extern bool ftrace_filter_param __initdata; -static inline int ftrace_trace_task(struct task_struct *task) +static inline int ftrace_trace_task(struct trace_array *tr) { - if (list_empty(&ftrace_pids)) - return 1; - - return test_tsk_trace_trace(task); + return !this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid); } extern int ftrace_is_dead(void); int ftrace_create_function_files(struct trace_array *tr, @@ -855,8 +856,9 @@ void ftrace_init_global_array_ops(struct trace_array *tr); void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); void ftrace_reset_array_ops(struct trace_array *tr); int using_ftrace_ops_list_func(void); +void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer); #else -static inline int ftrace_trace_task(struct task_struct *task) +static inline int ftrace_trace_task(struct trace_array *tr) { return 1; } @@ -871,6 +873,7 @@ static inline void ftrace_destroy_function_files(struct trace_array *tr) { } static inline __init void ftrace_init_global_array_ops(struct trace_array *tr) { } static inline void 
ftrace_reset_array_ops(struct trace_array *tr) { } +static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { } /* ftace_func_t type is not defined, use macro instead of static inline */ #define ftrace_init_array_ops(tr, func) do { } while (0) #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 5a095c2e4b69..0efa00d80623 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -43,7 +43,7 @@ static int allocate_ftrace_ops(struct trace_array *tr) /* Currently only the non stack verision is supported */ ops->func = function_trace_call; - ops->flags = FTRACE_OPS_FL_RECURSION_SAFE; + ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID; tr->ops = ops; ops->private = tr; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 3a0244ff7ea8..67cce7896aeb 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -319,7 +319,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) int cpu; int pc; - if (!ftrace_trace_task(current)) + if (!ftrace_trace_task(tr)) return 0; /* trace it when it is-nested-in or is a function enabled. */ From 35abb67de744b5dbaec54381f2f9e0246089331d Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 8 Jun 2016 18:38:02 -0700 Subject: [PATCH 06/20] tracing: expose current->comm to [ku]probe events ftrace is very quick to give up on saving the task command line (see `trace_save_cmdline()`). The workaround for events which really care about the command line is to explicitly assign it as part of the entry. However, this doesn't work for kprobe events, as there's no straightforward way to get access to current->comm. Add a kprobe/uprobe event variable $comm which provides exactly that. Link: http://lkml.kernel.org/r/f59b472033b943a370f5f48d0af37698f409108f.1465435894.git.osandov@fb.com Acked-by: Masami Hiramatsu Signed-off-by: Omar Sandoval Signed-off-by: Steven Rostedt --- Documentation/trace/kprobetrace.txt | 3 +++ Documentation/trace/uprobetracer.txt | 3 +++ kernel/trace/trace_kprobe.c | 1 + kernel/trace/trace_probe.c | 33 ++++++++++++++++++++++++++++ kernel/trace/trace_probe.h | 10 +++++++++ 5 files changed, 50 insertions(+) diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index d68ea5fc812b..ea52ec1f8484 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -40,6 +40,7 @@ Synopsis of kprobe_events $stackN : Fetch Nth entry of stack (N >= 0) $stack : Fetch stack address. $retval : Fetch return value.(*) + $comm : Fetch current task comm. +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**) NAME=FETCHARG : Set NAME as the argument name of FETCHARG. FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types @@ -62,6 +63,8 @@ offset, and container-size (usually 32). The syntax is; b@/ +For $comm, the default type is "string"; any other type is invalid. + Per-Probe Event Filtering ------------------------- diff --git a/Documentation/trace/uprobetracer.txt b/Documentation/trace/uprobetracer.txt index f1cf9a34ad9d..72d1cd4f7bf3 100644 --- a/Documentation/trace/uprobetracer.txt +++ b/Documentation/trace/uprobetracer.txt @@ -36,6 +36,7 @@ Synopsis of uprobe_tracer $stackN : Fetch Nth entry of stack (N >= 0) $stack : Fetch stack address. $retval : Fetch return value.(*) + $comm : Fetch current task comm. 
+|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**) NAME=FETCHARG : Set NAME as the argument name of FETCHARG. FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types @@ -57,6 +58,8 @@ offset, and container-size (usually 32). The syntax is; b@/ +For $comm, the default type is "string"; any other type is invalid. + Event Profiling --------------- diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5546eec0505f..9aedb0b06683 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -587,6 +587,7 @@ static int create_trace_kprobe(int argc, char **argv) * $retval : fetch return value * $stack : fetch stack address * $stackN : fetch Nth of stack (N:0-) + * $comm : fetch current task comm * @ADDR : fetch memory at ADDR (ADDR should be in kernel) * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) * %REG : fetch register REG diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 1d372fa6fefb..74e80a582c28 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -218,6 +218,28 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data) kfree(data); } +void FETCH_FUNC_NAME(comm, string)(struct pt_regs *regs, + void *data, void *dest) +{ + int maxlen = get_rloc_len(*(u32 *)dest); + u8 *dst = get_rloc_data(dest); + long ret; + + if (!maxlen) + return; + + ret = strlcpy(dst, current->comm, maxlen); + *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); +} +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string)); + +void FETCH_FUNC_NAME(comm, string_size)(struct pt_regs *regs, + void *data, void *dest) +{ + *(u32 *)dest = strlen(current->comm) + 1; +} +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string_size)); + static const struct fetch_type *find_fetch_type(const char *type, const struct fetch_type *ftbl) { @@ -348,6 +370,11 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, } } else ret = -EINVAL; + } else if (strcmp(arg, "comm") == 0) { + if (strcmp(t->name, "string") != 0 && + strcmp(t->name, "string_size") != 0) + return -EINVAL; + f->fn = t->fetch[FETCH_MTD_comm]; } else ret = -EINVAL; @@ -522,6 +549,12 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, arg[t - parg->comm] = '\0'; t++; } + /* + * The default type of $comm should be "string", and it can't be + * dereferenced. + */ + if (!t && strcmp(arg, "$comm") == 0) + t = "string"; parg->type = find_fetch_type(t, ftbl); if (!parg->type) { pr_info("Unsupported type: %s\n", t); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index f6398db09114..45400ca5ded1 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -102,6 +102,7 @@ enum { FETCH_MTD_reg = 0, FETCH_MTD_stack, FETCH_MTD_retval, + FETCH_MTD_comm, FETCH_MTD_memory, FETCH_MTD_symbol, FETCH_MTD_deref, @@ -183,6 +184,14 @@ DECLARE_BASIC_FETCH_FUNCS(bitfield); #define fetch_bitfield_string NULL #define fetch_bitfield_string_size NULL +/* comm only makes sense as a string */ +#define fetch_comm_u8 NULL +#define fetch_comm_u16 NULL +#define fetch_comm_u32 NULL +#define fetch_comm_u64 NULL +DECLARE_FETCH_FUNC(comm, string); +DECLARE_FETCH_FUNC(comm, string_size); + /* * Define macro for basic types - we don't need to define s* types, because * we have to care only about bitwidth at recording time. 
@@ -213,6 +222,7 @@ DEFINE_FETCH_##method(u64) ASSIGN_FETCH_FUNC(reg, ftype), \ ASSIGN_FETCH_FUNC(stack, ftype), \ ASSIGN_FETCH_FUNC(retval, ftype), \ +ASSIGN_FETCH_FUNC(comm, ftype), \ ASSIGN_FETCH_FUNC(memory, ftype), \ ASSIGN_FETCH_FUNC(symbol, ftype), \ ASSIGN_FETCH_FUNC(deref, ftype), \ From e2ace001176dc9745a472fe8bda1f0b28a4d7351 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 26 May 2016 12:00:33 -0700 Subject: [PATCH 07/20] tracing: Choose static tp_printk buffer by explicit nesting count Currently, the trace_printk code chooses which static buffer to use based on what type of atomic context (NMI, IRQ, etc) it's in. Simplify the code and make it more robust: simply count the nesting depth and choose a buffer based on the current nesting depth. The new code will only drop an event if we nest more than 4 deep, and the old code was guaranteed to malfunction if that happened. Link: http://lkml.kernel.org/r/07ab03aecfba25fcce8f9a211b14c9c5e2865c58.1464289095.git.luto@kernel.org Acked-by: Namhyung Kim Signed-off-by: Andy Lutomirski Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 83 +++++++++++++------------------------------- 1 file changed, 24 insertions(+), 59 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index aa240551fc5d..45e6747589c6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2339,83 +2339,41 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) /* created for use with alloc_percpu */ struct trace_buffer_struct { - char buffer[TRACE_BUF_SIZE]; + int nesting; + char buffer[4][TRACE_BUF_SIZE]; }; static struct trace_buffer_struct *trace_percpu_buffer; -static struct trace_buffer_struct *trace_percpu_sirq_buffer; -static struct trace_buffer_struct *trace_percpu_irq_buffer; -static struct trace_buffer_struct *trace_percpu_nmi_buffer; /* - * The buffer used is dependent on the context. There is a per cpu - * buffer for normal context, softirq contex, hard irq context and - * for NMI context. Thise allows for lockless recording. - * - * Note, if the buffers failed to be allocated, then this returns NULL + * Thise allows for lockless recording. If we're nested too deeply, then + * this returns NULL. */ static char *get_trace_buf(void) { - struct trace_buffer_struct *percpu_buffer; + struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer); - /* - * If we have allocated per cpu buffers, then we do not - * need to do any locking. 
- */ - if (in_nmi()) - percpu_buffer = trace_percpu_nmi_buffer; - else if (in_irq()) - percpu_buffer = trace_percpu_irq_buffer; - else if (in_softirq()) - percpu_buffer = trace_percpu_sirq_buffer; - else - percpu_buffer = trace_percpu_buffer; - - if (!percpu_buffer) + if (!buffer || buffer->nesting >= 4) return NULL; - return this_cpu_ptr(&percpu_buffer->buffer[0]); + return &buffer->buffer[buffer->nesting++][0]; +} + +static void put_trace_buf(void) +{ + this_cpu_dec(trace_percpu_buffer->nesting); } static int alloc_percpu_trace_buffer(void) { struct trace_buffer_struct *buffers; - struct trace_buffer_struct *sirq_buffers; - struct trace_buffer_struct *irq_buffers; - struct trace_buffer_struct *nmi_buffers; buffers = alloc_percpu(struct trace_buffer_struct); - if (!buffers) - goto err_warn; - - sirq_buffers = alloc_percpu(struct trace_buffer_struct); - if (!sirq_buffers) - goto err_sirq; - - irq_buffers = alloc_percpu(struct trace_buffer_struct); - if (!irq_buffers) - goto err_irq; - - nmi_buffers = alloc_percpu(struct trace_buffer_struct); - if (!nmi_buffers) - goto err_nmi; + if (WARN(!buffers, "Could not allocate percpu trace_printk buffer")) + return -ENOMEM; trace_percpu_buffer = buffers; - trace_percpu_sirq_buffer = sirq_buffers; - trace_percpu_irq_buffer = irq_buffers; - trace_percpu_nmi_buffer = nmi_buffers; - return 0; - - err_nmi: - free_percpu(irq_buffers); - err_irq: - free_percpu(sirq_buffers); - err_sirq: - free_percpu(buffers); - err_warn: - WARN(1, "Could not allocate percpu trace_printk buffer"); - return -ENOMEM; } static int buffers_allocated; @@ -2506,7 +2464,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) tbuffer = get_trace_buf(); if (!tbuffer) { len = 0; - goto out; + goto out_nobuffer; } len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); @@ -2532,6 +2490,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) } out: + put_trace_buf(); + +out_nobuffer: preempt_enable_notrace(); unpause_graph_tracing(); @@ -2563,7 +2524,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, tbuffer = get_trace_buf(); if (!tbuffer) { len = 0; - goto out; + goto out_nobuffer; } len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); @@ -2582,7 +2543,11 @@ __trace_array_vprintk(struct ring_buffer *buffer, __buffer_unlock_commit(buffer, event); ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL); } - out: + +out: + put_trace_buf(); + +out_nobuffer: preempt_enable_notrace(); unpause_graph_tracing(); From aad108aa9d1aca5be178ef40325dcc99b448e866 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 17 Jun 2016 14:50:23 -0400 Subject: [PATCH 08/20] tracing: Add trace_printk sample code Add sample code to test trace_printk(). The trace_printk() functions should never be used in production code. This makes testing it a bit more difficult. Having a sample module that can test use cases of trace_printk() can help out. Currently it just tests trace_printk() where it will be converted into: trace_bputs() trace_puts() trace_bprintk() as well as staying as the normal _trace_printk(). It also tests its use in interrupt context as that will test the auxilery buffers. 
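Roughly, the call forms exercised map onto the conversions listed above as
follows (illustrative only; global_str and global_fmt stand in for the
module's non-constant strings):

	trace_printk("literal string\n");	/* constant fmt, no args   -> trace_bputs()   */
	trace_printk(global_str);		/* dynamic string, no args -> trace_puts()    */
	trace_printk("a %s here\n", "string");	/* constant fmt with args  -> trace_bprintk() */
	trace_printk(global_fmt, "", "string");	/* dynamic fmt with args   -> _trace_printk() */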
Signed-off-by: Steven Rostedt --- samples/Kconfig | 7 ++++ samples/Makefile | 2 +- samples/trace_printk/Makefile | 6 ++++ samples/trace_printk/trace-printk.c | 56 +++++++++++++++++++++++++++++ 4 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 samples/trace_printk/Makefile create mode 100644 samples/trace_printk/trace-printk.c diff --git a/samples/Kconfig b/samples/Kconfig index 559a58baff6e..27a24571e96c 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -11,6 +11,13 @@ config SAMPLE_TRACE_EVENTS help This build trace event example modules. +config SAMPLE_TRACE_PRINTK + tristate "Build trace_printk module - tests various trace_printk formats" + depends on EVENT_TRACING && m + help + This builds a module that calls trace_printk() and can be used to + test various trace_printk() calls from a module. + config SAMPLE_KOBJECT tristate "Build kobject examples -- loadable modules only" depends on m diff --git a/samples/Makefile b/samples/Makefile index 2e3b523d7097..1a20169d85ac 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -2,4 +2,4 @@ obj-$(CONFIG_SAMPLES) += kobject/ kprobes/ trace_events/ livepatch/ \ hw_breakpoint/ kfifo/ kdb/ hidraw/ rpmsg/ seccomp/ \ - configfs/ connector/ v4l/ + configfs/ connector/ v4l/ trace_printk/ diff --git a/samples/trace_printk/Makefile b/samples/trace_printk/Makefile new file mode 100644 index 000000000000..19900ab2b00d --- /dev/null +++ b/samples/trace_printk/Makefile @@ -0,0 +1,6 @@ +# builds a module that calls various trace_printk routines +# then to use one (as root): insmod + +# This module can also be used to test the trace_printk code. + +obj-$(CONFIG_SAMPLE_TRACE_PRINTK) += trace-printk.o diff --git a/samples/trace_printk/trace-printk.c b/samples/trace_printk/trace-printk.c new file mode 100644 index 000000000000..e9e0040ff7be --- /dev/null +++ b/samples/trace_printk/trace-printk.c @@ -0,0 +1,56 @@ +#include +#include +#include + +/* Must not be static to force gcc to consider these non constant */ +char *trace_printk_test_global_str = + "This is a dynamic string that will use trace_puts\n"; + +char *trace_printk_test_global_str_irq = + "(irq) This is a dynamic string that will use trace_puts\n"; + +char *trace_printk_test_global_str_fmt = + "%sThis is a %s that will use trace_printk\n"; + +static struct irq_work irqwork; + +static void trace_printk_irq_work(struct irq_work *work) +{ + trace_printk("(irq) This is a static string that will use trace_bputs\n"); + trace_printk(trace_printk_test_global_str_irq); + + trace_printk("(irq) This is a %s that will use trace_bprintk()\n", + "static string"); + + trace_printk(trace_printk_test_global_str_fmt, + "(irq) ", "dynamic string"); +} + +static int __init trace_printk_init(void) +{ + init_irq_work(&irqwork, trace_printk_irq_work); + + trace_printk("This is a static string that will use trace_bputs\n"); + trace_printk(trace_printk_test_global_str); + + /* Kick off printing in irq context */ + irq_work_queue(&irqwork); + + trace_printk("This is a %s that will use trace_bprintk()\n", + "static string"); + + trace_printk(trace_printk_test_global_str_fmt, "", "dynamic string"); + + return 0; +} + +static void __exit trace_printk_exit(void) +{ +} + +module_init(trace_printk_init); +module_exit(trace_printk_exit); + +MODULE_AUTHOR("Steven Rostedt"); +MODULE_DESCRIPTION("trace-printk"); +MODULE_LICENSE("GPL"); From e947841c0dce9db675a957182214ef8091ac3d61 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 17 Jun 2016 17:40:58 -0400 Subject: [PATCH 09/20] tracing: Show 
the preempt count of when the event was called Because tracepoint callbacks are done with preemption enabled, the trace events are always called with preempt disable due to the rcu_read_lock_sched_notrace() in __DO_TRACE(). This causes the preempt count shown in the recorded trace event to be inaccurate. It is always one more that what the preempt_count was when the tracepoint was called. If CONFIG_PREEMPT is enabled, subtract 1 from the preempt_count before recording it in the trace buffer. Link: http://lkml.kernel.org/r/20160525132537.GA10808@linutronix.de Reported-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index fd449eb138cf..03c0a48c3ac4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -261,6 +261,14 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, local_save_flags(fbuffer->flags); fbuffer->pc = preempt_count(); + /* + * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables + * preemption (adding one to the preempt_count). Since we are + * interested in the preempt_count at the time the tracepoint was + * hit, we need to subtract one to offset the increment. + */ + if (IS_ENABLED(CONFIG_PREEMPT)) + fbuffer->pc--; fbuffer->trace_file = trace_file; fbuffer->event = From 9a51933e360897d9b3867c9b09dd5ccf7493e97e Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 11 May 2016 14:06:57 -0500 Subject: [PATCH 10/20] tracing: Expose CPU physical addresses (resource values) for PCI devices Previously, mmio_print_pcidev() put "user" addresses in the trace buffer. On most architectures, these are the same as CPU physical addresses, but on microblaze, mips, powerpc, and sparc, they may be something else, typically a raw BAR value (a bus address as opposed to a CPU address). Always expose the CPU physical address to avoid this arch-dependent behavior. This change should have no user-visible effect because this file currently depends on CONFIG_HAVE_MMIOTRACE_SUPPORT, which is only defined for x86, and pci_resource_to_user() is a no-op on x86. Link: http://lkml.kernel.org/r/20160511190657.5898.4248.stgit@bhelgaas-glaptop2.roam.corp.google.com Signed-off-by: Bjorn Helgaas Signed-off-by: Steven Rostedt --- kernel/trace/trace_mmiotrace.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 68f376ca6d3f..cd7480d0a201 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -68,19 +68,15 @@ static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", dev->bus->number, dev->devfn, dev->vendor, dev->device, dev->irq); - /* - * XXX: is pci_resource_to_user() appropriate, since we are - * supposed to interpret the __ioremap() phys_addr argument based on - * these printed values? - */ for (i = 0; i < 7; i++) { - pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + start = dev->resource[i].start; trace_seq_printf(s, " %llx", (unsigned long long)(start | (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); } for (i = 0; i < 7; i++) { - pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + start = dev->resource[i].start; + end = dev->resource[i].end; trace_seq_printf(s, " %llx", dev->resource[i].start < dev->resource[i].end ? 
(unsigned long long)(end - start) + 1 : 0); From be54f69c26193de31053190761e521903b89d098 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 23 Jun 2016 14:03:47 -0400 Subject: [PATCH 11/20] tracing: Skip more functions when doing stack tracing of events # echo 1 > options/stacktrace # echo 1 > events/sched/sched_switch/enable # cat trace -0 [002] d..2 1982.525169: => save_stack_trace => __ftrace_trace_stack => trace_buffer_unlock_commit_regs => event_trigger_unlock_commit => trace_event_buffer_commit => trace_event_raw_event_sched_switch => __schedule => schedule => schedule_preempt_disabled => cpu_startup_entry => start_secondary The above shows that we are seeing 6 functions before ever making it to the caller of the sched_switch event. # echo stacktrace > events/sched/sched_switch/trigger # cat trace -0 [002] d..3 2146.335208: => trace_event_buffer_commit => trace_event_raw_event_sched_switch => __schedule => schedule => schedule_preempt_disabled => cpu_startup_entry => start_secondary The stacktrace trigger isn't as bad, because it adds its own skip to the stacktracing, but still has two events extra. One issue is that if the stacktrace passes its own "regs" then there should be no addition to the skip, as the regs will not include the functions being called. This was an issue that was fixed by commit 7717c6be6999 ("tracing: Fix stacktrace skip depth in trace_buffer_unlock_commit_regs()") as adding the skip number for kprobes made the probes not have any stack at all. But since this is only an issue when regs is being used, a skip should be added if regs is NULL. Now we have: # echo 1 > options/stacktrace # echo 1 > events/sched/sched_switch/enable # cat trace -0 [000] d..2 1297.676333: => __schedule => schedule => schedule_preempt_disabled => cpu_startup_entry => rest_init => start_kernel => x86_64_start_reservations => x86_64_start_kernel # echo stacktrace > events/sched/sched_switch/trigger # cat trace -0 [002] d..3 1370.759745: => __schedule => schedule => schedule_preempt_disabled => cpu_startup_entry => start_secondary And kprobes are not touched. Reported-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 45e6747589c6..3d9f31b576f3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2118,7 +2118,17 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, flags, 0, pc, regs); + /* + * If regs is not set, then skip the following callers: + * trace_buffer_unlock_commit_regs + * event_trigger_unlock_commit + * trace_event_buffer_commit + * trace_event_raw_event_sched_switch + * Note, we can still get here via blktrace, wakeup tracer + * and mmiotrace, but that's ok if they lose a function or + * two. They are not that meaningful. + */ + ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } @@ -2168,6 +2178,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, trace.nr_entries = 0; trace.skip = skip; + /* + * Add two, for this function and the call to save_stack_trace() + * If regs is set, then these functions will not be in the way. + */ + if (!regs) + trace.skip += 2; + /* * Since events can happen in NMIs there's no safe way to * use the per cpu ftrace_stacks.
We reserve it and if an interrupt From 7fa8b7171a638ad896acabd9a17183b75b70aeb4 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Fri, 17 Jun 2016 22:44:54 -0700 Subject: [PATCH 12/20] tracing/function_graph: Fix filters for function_graph threshold Function graph tracer currently ignores filters if tracing_thresh is set. For example, even if set_ftrace_pid is set, it is ignored if tracing_thresh is set, resulting in all processes being traced. To fix this, we reuse the same entry function as when tracing_thresh is not set and do everything as in the regular case except for writing the function entry to the ring buffer. Link: http://lkml.kernel.org/r/1466228694-2677-1-git-send-email-agnel.joel@gmail.com Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 67cce7896aeb..7363ccf79512 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -338,6 +338,13 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) if (ftrace_graph_notrace_addr(trace->func)) return 1; + /* + * Stop here if tracing_threshold is set. We only write function return + * events to the ring buffer. + */ + if (tracing_thresh) + return 1; + local_irq_save(flags); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->trace_buffer.data, cpu); @@ -355,14 +362,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) return ret; } -static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) -{ - if (tracing_thresh) - return 1; - else - return trace_graph_entry(trace); -} - static void __trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned long flags, int pc) @@ -457,7 +456,7 @@ static int graph_trace_init(struct trace_array *tr) set_graph_array(tr); if (tracing_thresh) ret = register_ftrace_graph(&trace_graph_thresh_return, - &trace_graph_thresh_entry); + &trace_graph_entry); else ret = register_ftrace_graph(&trace_graph_return, &trace_graph_entry); From 501c2375253c0795048f48368e0b3e8b2f6646dc Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 5 Jul 2016 10:04:34 -0400 Subject: [PATCH 13/20] ftrace: Move toplevel init out of ftrace_init_tracefs() Commit 345ddcc882d8 ("ftrace: Have set_ftrace_pid use the bitmap like events do") placed ftrace_init_tracefs into the instance creation, and encapsulated the top level updating with an if conditional, as the top level only gets updated at boot up. Unfortunately, this triggers section mismatch errors as the init functions are called from a function that can be called later, and the section mismatch logic is unaware of the if conditional that would prevent it from happening at run time. To make everyone happy, create a separate ftrace_init_tracefs_toplevel() routine that only gets called by init functions, and this will be what calls other init functions for the toplevel directory.
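To illustrate the shape of the fix, here is a minimal sketch of the pattern (the function names are illustrative, not the actual kernel symbols): per-instance setup stays callable at any time and must not reference __init code, while the boot-only setup lives in its own __init function that is reached only from other __init code, which keeps the section mismatch checker happy.

	/* Per-instance setup: may run whenever an instance is created,
	 * so it must not call or reference __init code or data. */
	void example_instance_init(struct dentry *d_tracer)
	{
		/* create the files that every instance gets */
	}

	/* Boot-only setup: marked __init and called only from other
	 * __init functions, so it may safely call further __init code. */
	void __init example_toplevel_init(struct dentry *d_tracer)
	{
		example_instance_init(d_tracer);
		/* plus the files that exist only in the top level */
	}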
Link: http://lkml.kernel.org/r/20160704102139.19cbc0d9@gandalf.local.home Reported-by: kbuild test robot Reported-by: Arnd Bergmann Fixes: 345ddcc882d8 ("ftrace: Have set_ftrace_pid use the bitmap like events do") Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 16 ++++++++++------ kernel/trace/trace.c | 1 + kernel/trace/trace.h | 3 +++ 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8b488f4dd8e8..84752c8e28b5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5539,16 +5539,20 @@ static const struct file_operations ftrace_pid_fops = { void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer) { - /* Only the top level directory has the dyn_tracefs and profile */ - if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { - ftrace_init_dyn_tracefs(d_tracer); - ftrace_profile_tracefs(d_tracer); - } - trace_create_file("set_ftrace_pid", 0644, d_tracer, tr, &ftrace_pid_fops); } +void __init ftrace_init_tracefs_toplevel(struct trace_array *tr, + struct dentry *d_tracer) +{ + /* Only the top level directory has the dyn_tracefs and profile */ + WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); + + ftrace_init_dyn_tracefs(d_tracer); + ftrace_profile_tracefs(d_tracer); +} + /** * ftrace_kill - kill ftrace * diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3d9f31b576f3..5fd53a7847bc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7369,6 +7369,7 @@ static __init int tracer_init_tracefs(void) return 0; init_tracer_tracefs(&global_trace, d_tracer); + ftrace_init_tracefs_toplevel(&global_trace, d_tracer); trace_create_file("tracing_thresh", 0644, d_tracer, &global_trace, &tracing_thresh_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index eaee458755a4..c1de3f493cd3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -857,6 +857,8 @@ void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); void ftrace_reset_array_ops(struct trace_array *tr); int using_ftrace_ops_list_func(void); void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer); +void ftrace_init_tracefs_toplevel(struct trace_array *tr, + struct dentry *d_tracer); #else static inline int ftrace_trace_task(struct trace_array *tr) { @@ -874,6 +876,7 @@ static inline __init void ftrace_init_global_array_ops(struct trace_array *tr) { } static inline void ftrace_reset_array_ops(struct trace_array *tr) { } static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { } +static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { } /* ftace_func_t type is not defined, use macro instead of static inline */ #define ftrace_init_array_ops(tr, func) do { } while (0) #endif /* CONFIG_FUNCTION_TRACER */ From 67f20b084574def586ecba68508acd5d054ccc88 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 4 Jul 2016 15:10:04 +0000 Subject: [PATCH 14/20] tracing: Using for_each_set_bit() to simplify trace_pid_write() Using for_each_set_bit() to simplify the code. 
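The conversion follows the standard bitmap idiom: an open-coded find_first_bit()/find_next_bit() loop collapses into a single for_each_set_bit() iteration. A simplified sketch of the two forms (src, dst and nr_pids stand in for the variables used in trace_pid_write()):

	/* open-coded scan over the set bits of src->pids */
	pid = find_first_bit(src->pids, src->pid_max);
	while (pid < src->pid_max) {
		set_bit(pid, dst->pids);
		nr_pids++;
		pid = find_next_bit(src->pids, src->pid_max, pid + 1);
	}

	/* equivalent loop using the helper from <linux/bitops.h> */
	for_each_set_bit(pid, src->pids, src->pid_max) {
		set_bit(pid, dst->pids);
		nr_pids++;
	}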
Link: http://lkml.kernel.org/r/1467645004-11169-1-git-send-email-weiyj_lk@163.com Signed-off-by: Wei Yongjun Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5fd53a7847bc..dade4c9559cc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -517,13 +517,9 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, if (filtered_pids) { /* copy the current bits to the new max */ - pid = find_first_bit(filtered_pids->pids, - filtered_pids->pid_max); - while (pid < filtered_pids->pid_max) { + for_each_set_bit(pid, filtered_pids->pids, + filtered_pids->pid_max) { set_bit(pid, pid_list->pids); - pid = find_next_bit(filtered_pids->pids, - filtered_pids->pid_max, - pid + 1); nr_pids++; } } From 7ad8fb61c4abf589596f0a4da34d987471481569 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 3 Jul 2016 08:51:34 -0500 Subject: [PATCH 15/20] tracing: Have HIST_TRIGGERS select TRACING The kbuild test robot reported a compile error if HIST_TRIGGERS was enabled but nothing else that selected TRACING was configured in. HIST_TRIGGERS should directly select it and not rely on anything else to do it. Link: http://lkml.kernel.org/r/57791866.8080505@linux.intel.com Reported-by: kbuild test robot Fixes: 7ef224d1d0e3a ("tracing: Add 'hist' event trigger command") Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index fafeaf803bd0..f4b86e8ca1e7 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -542,6 +542,7 @@ config HIST_TRIGGERS bool "Histogram triggers" depends on ARCH_HAVE_NMI_SAFE_CMPXCHG select TRACING_MAP + select TRACING default n help Hist triggers allow one or more arbitrary trace event fields From a4a551b8f1d4c4ebffd0f49dfef44df3128546f8 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 29 Jun 2016 19:56:48 +0900 Subject: [PATCH 16/20] ftrace: Reduce size of function graph entries Currently the ftrace_graph_ent{,_entry} and ftrace_graph_ret{,_entry} structs can have padding bytes at the end due to the alignment of 64-bit data types. As these entries are recorded very frequently, that padding wastes a non-negligible amount of space. As the ring buffer maintains alignment properly for each architecture, just remove the extra padding by using the 'packed' attribute. ftrace_graph_ent_entry: 24 -> 20 ftrace_graph_ret_entry: 48 -> 44 Also I moved the 'overrun' field in struct ftrace_graph_ret to minimize the padding in the middle. Tested on x86_64 only. Link: http://lkml.kernel.org/r/1467197808-13578-1-git-send-email-namhyung@kernel.org Cc: Ingo Molnar Cc: linux-arch@vger.kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 12 ++++++++---- kernel/trace/trace.h | 11 +++++++++++ kernel/trace/trace_entries.h | 4 ++-- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 66a36a815f0a..7d565afe35d2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -754,23 +754,27 @@ static inline void ftrace_init(void) { } /* * Structure that defines an entry function trace. + * It's already packed but the attribute "packed" is needed + * to remove extra padding at the end. */ struct ftrace_graph_ent { unsigned long func; /* Current function */ int depth; -}; +} __packed; /* * Structure that defines a return function trace.
+ * It's already packed but the attribute "packed" is needed + * to remove extra padding at the end. */ struct ftrace_graph_ret { unsigned long func; /* Current function */ - unsigned long long calltime; - unsigned long long rettime; /* Number of functions that overran the depth limit for current task */ unsigned long overrun; + unsigned long long calltime; + unsigned long long rettime; int depth; -}; +} __packed; /* Type of the callback handlers for tracing function graph*/ typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c1de3f493cd3..f783df416726 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -80,6 +80,12 @@ enum trace_type { FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ filter) +#undef FTRACE_ENTRY_PACKED +#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print, \ + filter) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) __packed + #include "trace_entries.h" /* @@ -1625,6 +1631,11 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ filter) +#undef FTRACE_ENTRY_PACKED +#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print, filter) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) + #include "trace_entries.h" #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER) diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index ee7b94a4810a..5c30efcda5e6 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -72,7 +72,7 @@ FTRACE_ENTRY_REG(function, ftrace_entry, ); /* Function call entry */ -FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, +FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, TRACE_GRAPH_ENT, @@ -88,7 +88,7 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, ); /* Function return entry */ -FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, +FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, TRACE_GRAPH_RET, From 934de5f2eac35f34954166cdb47af3593451658a Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 1 Jul 2016 20:44:34 -0300 Subject: [PATCH 17/20] tracing: Use outer () on __get_str() definition __get_str(str)'s definition includes a (char *) cast that is not protected by outer (). This patch adds () around __get_str()'s definition, enabling some code cleanup.
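The outer parentheses matter because of operator precedence: postfix operators such as [] bind tighter than a cast, so without the extra () the cast applies to the indexed element rather than to the pointer. A small illustration with hypothetical macro names:

	#define GET_BAD(p)	(char *)(p)	/* cast left unparenthesized */
	#define GET_GOOD(p)	((char *)(p))	/* cast wrapped in outer ()  */

	/*
	 * GET_BAD(buf)[0]  expands to  (char *)(buf)[0], which parses as
	 * (char *)((buf)[0]): index first, cast second.
	 * GET_GOOD(buf)[0] expands to  ((char *)(buf))[0], which casts the
	 * pointer first and then indexes it, as intended.
	 */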
Link: http://lkml.kernel.org/r/20ac1a10c2ec4ccd23e4a8ef34101fb6e4157d37.1467407618.git.bristot@redhat.com Cc: Trond Myklebust Cc: Anna Schumaker Cc: Ingo Molnar Suggested-by: Steven Rostedt Reviewed-by: Steven Rostedt Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt --- include/trace/perf.h | 2 +- include/trace/trace_events.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/trace/perf.h b/include/trace/perf.h index 88de5c205e86..04fe68bbe767 100644 --- a/include/trace/perf.h +++ b/include/trace/perf.h @@ -15,7 +15,7 @@ ((__entry->__data_loc_##field >> 16) & 0xffff) #undef __get_str -#define __get_str(field) (char *)__get_dynamic_array(field) +#define __get_str(field) ((char *)__get_dynamic_array(field)) #undef __get_bitmask #define __get_bitmask(field) (char *)__get_dynamic_array(field) diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 80679a9fae65..467e12f780d8 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -256,7 +256,7 @@ TRACE_MAKE_SYSTEM_STR(); ((__entry->__data_loc_##field >> 16) & 0xffff) #undef __get_str -#define __get_str(field) (char *)__get_dynamic_array(field) +#define __get_str(field) ((char *)__get_dynamic_array(field)) #undef __get_bitmask #define __get_bitmask(field) \ From a17167ce1110e576f7032b98deeb6ec800f39c7a Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 1 Jul 2016 20:44:35 -0300 Subject: [PATCH 18/20] tracing, RAS: Cleanup on __get_str() usage __get_str(msg) no longer needs a (char *) cast to access msg's elements. This patch replaces the ((char *)__get_str(msg))[0] usage with __get_str(msg)[0]. It is just a code cleanup, no changes on tracepoint ABI. Link: http://lkml.kernel.org/r/6f2db5be7705da2cb483923320c91283d7c712a7.1467407618.git.bristot@redhat.com Cc: Trond Myklebust Cc: Anna Schumaker Cc: Ingo Molnar Reviewed-by: Steven Rostedt Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt --- include/ras/ras_event.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index 1443d79e4fe6..1791a12cfa85 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -147,7 +147,7 @@ TRACE_EVENT(mc_event, __entry->error_count, mc_event_error_type(__entry->error_type), __entry->error_count > 1 ? "s" : "", - ((char *)__get_str(msg))[0] ? " " : "", + __get_str(msg)[0] ? " " : "", __get_str(msg), __get_str(label), __entry->mc_index, @@ -157,7 +157,7 @@ TRACE_EVENT(mc_event, __entry->address, 1 << __entry->grain_bits, __entry->syndrome, - ((char *)__get_str(driver_detail))[0] ? " " : "", + __get_str(driver_detail)[0] ? " " : "", __get_str(driver_detail)) ); From 752d596b394c62f081393bbf53dff2a2c244632e Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 1 Jul 2016 20:44:36 -0300 Subject: [PATCH 19/20] tracing: Use __get_str() when manipulating strings Use __get_str(str) rather than __get_dynamic_array(str) when dealing with strings. It is just a code cleanup, no changes on tracepoint ABI.
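For context, a simplified form of the accessors involved (the real definitions live in include/trace/perf.h and include/trace/trace_events.h and compute the string's offset from its __data_loc field): with the outer parentheses added by the earlier patch, __get_str() can be indexed directly, making the two spellings below equivalent.

	/* simplified sketch, not the verbatim kernel macros */
	#define __get_dynamic_array(field) \
		((void *)__entry + (__entry->__data_loc_##field & 0xffff))
	#define __get_str(field)	((char *)__get_dynamic_array(field))

	/* old spelling, explicit cast at every call site */
	((char *)__get_dynamic_array(name))[len] = 0;
	/* new spelling, the cast is already part of __get_str() */
	__get_str(name)[len] = 0;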
Link: http://lkml.kernel.org/r/ea260df91817411cca2a1f3db2abd88860094788.1467407618.git.bristot@redhat.com Cc: Trond Myklebust Cc: Anna Schumaker Cc: Ingo Molnar Cc: linux-nfs@vger.kernel.org Suggested-by: Steven Rostedt Reviewed-by: Steven Rostedt Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt --- fs/nfs/nfs4trace.h | 4 ++-- fs/nfs/nfstrace.h | 4 ++-- include/trace/events/printk.h | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 9c150b153782..cfb8f7ce5cf6 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -1235,8 +1235,8 @@ DECLARE_EVENT_CLASS(nfs4_idmap_event, len = 0; __entry->error = error < 0 ? error : 0; __entry->id = id; - memcpy(__get_dynamic_array(name), name, len); - ((char *)__get_dynamic_array(name))[len] = 0; + memcpy(__get_str(name), name, len); + __get_str(name)[len] = 0; ), TP_printk( diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 0b9e5cc9a747..31c7763b94d5 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -707,9 +707,9 @@ TRACE_EVENT(nfs_sillyrename_unlink, __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); __entry->error = error; - memcpy(__get_dynamic_array(name), + memcpy(__get_str(name), data->args.name.name, len); - ((char *)__get_dynamic_array(name))[len] = 0; + __get_str(name)[len] = 0; ), TP_printk( diff --git a/include/trace/events/printk.h b/include/trace/events/printk.h index c008bc99f9fa..542a7558154a 100644 --- a/include/trace/events/printk.h +++ b/include/trace/events/printk.h @@ -16,8 +16,8 @@ TRACE_EVENT(console, ), TP_fast_assign( - memcpy(__get_dynamic_array(msg), text, len); - ((char *)__get_dynamic_array(msg))[len] = 0; + memcpy(__get_str(msg), text, len); + __get_str(msg)[len] = 0; ), TP_printk("%s", __get_str(msg)) From 78aebca2c955c1c5aeb48e12645e13fe3c3461f2 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 1 Jul 2016 20:44:37 -0300 Subject: [PATCH 20/20] printk, tracing: Avoiding unneeded blank lines Printk messages often finish with '\n' to cause a new line. But as each tracepoint is already printed in a new line, printk messages that finish with '\n' end up adding a blank line to the trace output. For example: kworker/0:1-86 [000] d... 46.006949: console: [ 46.006946] usb 1-3: USB disconnect, device number 3 kworker/2:2-374 [002] d... 48.699342: console: [ 48.699339] usb 1-3: new high-speed USB device number 4 using ehci-pci kworker/2:2-374 [002] d... 49.041450: console: [ 49.041448] usb 1-3: New USB device found, idVendor=5986, idProduct=0 To avoid unneeded blank lines, this patch checks if the printk message finishes with '\n' and, if so, cuts off the '\n' before recording it. In a patched kernel, the same messages are printed without extra blank lines. For example: kworker/0:4-185 [000] d... 23.641738: console: [ 23.641736] usb 1-3: USB disconnect, device number 3 kworker/0:4-185 [000] d... 24.918703: console: [ 24.918700] usb 1-3: new high-speed USB device number 4 using ehci-pci kworker/0:4-185 [000] d...
25.228308: console: [ 25.228306] usb 1-3: New USB device found, idVendor=5986, idProduct=02d5 Link: http://lkml.kernel.org/r/c350fb2521baaf681a1b4d67981ca0e900108e8e.1467407618.git.bristot@redhat.com Cc: Trond Myklebust Cc: Anna Schumaker Cc: Ingo Molnar Reviewed-by: Steven Rostedt Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt --- include/trace/events/printk.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/trace/events/printk.h b/include/trace/events/printk.h index 542a7558154a..f350170059c6 100644 --- a/include/trace/events/printk.h +++ b/include/trace/events/printk.h @@ -16,6 +16,14 @@ TRACE_EVENT(console, ), TP_fast_assign( + /* + * Each trace entry is printed in a new line. + * If the msg finishes with '\n', cut it off + * to avoid blank lines in the trace. + */ + if ((len > 0) && (text[len-1] == '\n')) + len -= 1; + memcpy(__get_str(msg), text, len); __get_str(msg)[len] = 0; ),
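The trim logic itself is small and easy to check in isolation. Below is a stand-alone user-space sketch of the same copy-and-trim pattern (plain C, not the kernel code):

	#include <stdio.h>
	#include <string.h>

	/* Copy 'text' into 'dst' (of size dstsz), dropping one trailing '\n'
	 * so the copied message never adds its own line break. */
	static void copy_trimmed(char *dst, size_t dstsz,
				 const char *text, size_t len)
	{
		if (len > 0 && text[len - 1] == '\n')
			len -= 1;
		if (len >= dstsz)
			len = dstsz - 1;
		memcpy(dst, text, len);
		dst[len] = '\0';
	}

	int main(void)
	{
		char buf[64];

		copy_trimmed(buf, sizeof(buf), "usb 1-3: USB disconnect\n", 24);
		printf("[%s]\n", buf);	/* prints [usb 1-3: USB disconnect] */
		return 0;
	}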