Merge branch 'perfcounters-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'perfcounters-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (27 commits)
  perf_counter: Zero dead bytes from ftrace raw samples size alignment
  perf_counter: Subtract the buffer size field from the event record size
  perf_counter: Require CAP_SYS_ADMIN for raw tracepoint data
  perf_counter: Correct PERF_SAMPLE_RAW output
  perf tools: callchain: Fix bad rounding of minimum rate
  perf_counter tools: Fix libbfd detection for systems with libz dependency
  perf: "Longum est iter per praecepta, breve et efficax per exempla"
  perf_counter: Fix a race on perf_counter_ctx
  perf_counter: Fix tracepoint sampling to be part of generic sampling
  perf_counter: Work around gcc warning by initializing tracepoint record unconditionally
  perf tools: callchain: Fix sum of percentages to be 100% by displaying amount of ignored chains in fractal mode
  perf tools: callchain: Fix 'perf report' display to be callchain by default
  perf tools: callchain: Fix spurious 'perf report' warnings: ignore empty callchains
  perf record: Fix the -A UI for empty or non-existent perf.data
  perf util: Fix do_read() to fail on EOF instead of busy-looping
  perf list: Fix the output to not include tracepoints without an id
  perf_counter/powerpc: Fix oops on cpus without perf_counter hardware support
  perf stat: Fix tool option consistency: rename -S/--scale to -c/--scale
  perf report: Add debug help for the finding of symbol bugs - show the symtab origin (DSO, build-id, kernel, etc)
  perf report: Fix per task mult-counter stat reporting
  ...
This commit is contained in:
Linus Torvalds
2009-08-10 11:48:51 -07:00
19 changed files with 1223 additions and 212 deletions

View File

@@ -2646,7 +2646,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
u64 counter;
} group_entry;
struct perf_callchain_entry *callchain = NULL;
struct perf_tracepoint_record *tp;
int callchain_size = 0;
u64 time;
struct {
@@ -2715,9 +2714,16 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
header.size += sizeof(u64);
}
if (sample_type & PERF_SAMPLE_TP_RECORD) {
tp = data->private;
header.size += tp->size;
if (sample_type & PERF_SAMPLE_RAW) {
int size = sizeof(u32);
if (data->raw)
size += data->raw->size;
else
size += sizeof(u32);
WARN_ON_ONCE(size & (sizeof(u64)-1));
header.size += size;
}
ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
@@ -2783,8 +2789,21 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
}
}
if (sample_type & PERF_SAMPLE_TP_RECORD)
perf_output_copy(&handle, tp->record, tp->size);
if (sample_type & PERF_SAMPLE_RAW) {
if (data->raw) {
perf_output_put(&handle, data->raw->size);
perf_output_copy(&handle, data->raw->data, data->raw->size);
} else {
struct {
u32 size;
u32 data;
} raw = {
.size = sizeof(u32),
.data = 0,
};
perf_output_put(&handle, raw);
}
}
perf_output_end(&handle);
}
@@ -2849,7 +2868,8 @@ perf_counter_read_event(struct perf_counter *counter,
*/
struct perf_task_event {
struct task_struct *task;
struct task_struct *task;
struct perf_counter_context *task_ctx;
struct {
struct perf_event_header header;
@@ -2909,24 +2929,23 @@ static void perf_counter_task_ctx(struct perf_counter_context *ctx,
static void perf_counter_task_event(struct perf_task_event *task_event)
{
struct perf_cpu_context *cpuctx;
struct perf_counter_context *ctx;
struct perf_counter_context *ctx = task_event->task_ctx;
cpuctx = &get_cpu_var(perf_cpu_context);
perf_counter_task_ctx(&cpuctx->ctx, task_event);
put_cpu_var(perf_cpu_context);
rcu_read_lock();
/*
* It doesn't really matter which of the child contexts the
* event ends up in.
*/
ctx = rcu_dereference(current->perf_counter_ctxp);
if (!ctx)
ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
if (ctx)
perf_counter_task_ctx(ctx, task_event);
rcu_read_unlock();
}
static void perf_counter_task(struct task_struct *task, int new)
static void perf_counter_task(struct task_struct *task,
struct perf_counter_context *task_ctx,
int new)
{
struct perf_task_event task_event;
@@ -2936,8 +2955,9 @@ static void perf_counter_task(struct task_struct *task, int new)
return;
task_event = (struct perf_task_event){
.task = task,
.event = {
.task = task,
.task_ctx = task_ctx,
.event = {
.header = {
.type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
.misc = 0,
@@ -2955,7 +2975,7 @@ static void perf_counter_task(struct task_struct *task, int new)
void perf_counter_fork(struct task_struct *task)
{
perf_counter_task(task, 1);
perf_counter_task(task, NULL, 1);
}
/*
@@ -3344,87 +3364,81 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
* Generic software counter infrastructure
*/
static void perf_swcounter_update(struct perf_counter *counter)
/*
* We directly increment counter->count and keep a second value in
* counter->hw.period_left to count intervals. This period counter
* is kept in the range [-sample_period, 0] so that we can use the
* sign as trigger.
*/
static u64 perf_swcounter_set_period(struct perf_counter *counter)
{
struct hw_perf_counter *hwc = &counter->hw;
u64 prev, now;
s64 delta;
u64 period = hwc->last_period;
u64 nr, offset;
s64 old, val;
hwc->last_period = hwc->sample_period;
again:
prev = atomic64_read(&hwc->prev_count);
now = atomic64_read(&hwc->count);
if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
old = val = atomic64_read(&hwc->period_left);
if (val < 0)
return 0;
nr = div64_u64(period + val, period);
offset = nr * period;
val -= offset;
if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
goto again;
delta = now - prev;
atomic64_add(delta, &counter->count);
atomic64_sub(delta, &hwc->period_left);
}
static void perf_swcounter_set_period(struct perf_counter *counter)
{
struct hw_perf_counter *hwc = &counter->hw;
s64 left = atomic64_read(&hwc->period_left);
s64 period = hwc->sample_period;
if (unlikely(left <= -period)) {
left = period;
atomic64_set(&hwc->period_left, left);
hwc->last_period = period;
}
if (unlikely(left <= 0)) {
left += period;
atomic64_add(period, &hwc->period_left);
hwc->last_period = period;
}
atomic64_set(&hwc->prev_count, -left);
atomic64_set(&hwc->count, -left);
}
static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
{
enum hrtimer_restart ret = HRTIMER_RESTART;
struct perf_sample_data data;
struct perf_counter *counter;
u64 period;
counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
counter->pmu->read(counter);
data.addr = 0;
data.regs = get_irq_regs();
/*
* In case we exclude kernel IPs or are somehow not in interrupt
* context, provide the next best thing, the user IP.
*/
if ((counter->attr.exclude_kernel || !data.regs) &&
!counter->attr.exclude_user)
data.regs = task_pt_regs(current);
if (data.regs) {
if (perf_counter_overflow(counter, 0, &data))
ret = HRTIMER_NORESTART;
}
period = max_t(u64, 10000, counter->hw.sample_period);
hrtimer_forward_now(hrtimer, ns_to_ktime(period));
return ret;
return nr;
}
static void perf_swcounter_overflow(struct perf_counter *counter,
int nmi, struct perf_sample_data *data)
{
data->period = counter->hw.last_period;
struct hw_perf_counter *hwc = &counter->hw;
u64 overflow;
perf_swcounter_update(counter);
perf_swcounter_set_period(counter);
if (perf_counter_overflow(counter, nmi, data))
/* soft-disable the counter */
;
data->period = counter->hw.last_period;
overflow = perf_swcounter_set_period(counter);
if (hwc->interrupts == MAX_INTERRUPTS)
return;
for (; overflow; overflow--) {
if (perf_counter_overflow(counter, nmi, data)) {
/*
* We inhibit the overflow from happening when
* hwc->interrupts == MAX_INTERRUPTS.
*/
break;
}
}
}
/*
 * Unthrottle hook for software counters.
 *
 * Deliberately empty: no per-counter work is performed here.
 */
static void perf_swcounter_unthrottle(struct perf_counter *counter)
{
	/* hwc->interrupts has already been reset, so nothing is left to do. */
}
/*
 * Account @nr occurrences on a software counter.
 *
 * counter->count is always incremented by @nr. The rest only happens for
 * counters with a non-zero hw.sample_period and when @data carries
 * register state: @nr is also added to hw.period_left and, if that sum is
 * not negative, perf_swcounter_overflow() is called with @nmi and @data.
 */
static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
			       int nmi, struct perf_sample_data *data)
{
	struct hw_perf_counter *hw = &counter->hw;

	atomic64_add(nr, &counter->count);

	/* Nothing to sample without a period or without registers. */
	if (!hw->sample_period || !data->regs)
		return;

	/* Sum still negative: do not trigger the overflow path. */
	if (atomic64_add_negative(nr, &hw->period_left))
		return;

	perf_swcounter_overflow(counter, nmi, data);
}
static int perf_swcounter_is_counting(struct perf_counter *counter)
@@ -3488,15 +3502,6 @@ static int perf_swcounter_match(struct perf_counter *counter,
return 1;
}
/*
 * Add @nr to counter->hw.count and, when the counter has a sample period,
 * the resulting sum is not negative and @data carries register state,
 * call perf_swcounter_overflow() with @nmi and @data.
 *
 * NOTE(review): a second perf_swcounter_add() definition appears earlier
 * in this listing. The listing is a diff shown without +/- markers, so
 * this variant is presumably the removed one -- confirm against the
 * merged file.
 */
static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
int nmi, struct perf_sample_data *data)
{
/* Remember whether the sum ended up negative. */
int neg = atomic64_add_negative(nr, &counter->hw.count);
if (counter->hw.sample_period && !neg && data->regs)
perf_swcounter_overflow(counter, nmi, data);
}
static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
enum perf_type_id type,
u32 event, u64 nr, int nmi,
@@ -3575,26 +3580,65 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi,
/*
 * Read hook of the generic software counter PMU (installed as .read in
 * perf_ops_generic). As listed here it only calls perf_swcounter_update().
 *
 * NOTE(review): this listing is a diff shown without +/- markers, so the
 * call below may be a removed line -- confirm against the merged file.
 */
static void perf_swcounter_read(struct perf_counter *counter)
{
perf_swcounter_update(counter);
}
static int perf_swcounter_enable(struct perf_counter *counter)
{
perf_swcounter_set_period(counter);
struct hw_perf_counter *hwc = &counter->hw;
if (hwc->sample_period) {
hwc->last_period = hwc->sample_period;
perf_swcounter_set_period(counter);
}
return 0;
}
/*
 * Disable hook of the generic software counter PMU (installed as .disable
 * in perf_ops_generic). As listed here it only calls
 * perf_swcounter_update().
 *
 * NOTE(review): this listing is a diff shown without +/- markers, so the
 * call below may be a removed line -- confirm against the merged file.
 */
static void perf_swcounter_disable(struct perf_counter *counter)
{
perf_swcounter_update(counter);
}
/*
 * PMU operations for generic software counters: enable, disable and read,
 * plus the unthrottle hook, all provided by the perf_swcounter_* helpers.
 */
static const struct pmu perf_ops_generic = {
	.enable		= perf_swcounter_enable,
	.disable	= perf_swcounter_disable,
	.read		= perf_swcounter_read,
	.unthrottle	= perf_swcounter_unthrottle,
};
/*
 * hrtimer based swcounter callback
 *
 * Refreshes the counter through its pmu->read() hook, builds the sample
 * data for this tick and reports it via perf_counter_overflow(). The timer
 * is always pushed forward by the sample period, but by no less than
 * 10000 ns.
 *
 * Returns HRTIMER_NORESTART when perf_counter_overflow() returns non-zero
 * and HRTIMER_RESTART otherwise (including when no register state could
 * be obtained and no overflow was reported).
 */
static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
{
	enum hrtimer_restart ret = HRTIMER_RESTART;
	struct perf_sample_data data;
	struct perf_counter *counter;
	u64 period;

	counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
	counter->pmu->read(counter);

	data.addr = 0;
	/*
	 * No raw record accompanies a timer sample. The output path tests
	 * and dereferences data->raw for PERF_SAMPLE_RAW, so it must not be
	 * left as stack garbage.
	 */
	data.raw = NULL;
	/*
	 * NOTE(review): data.period is not set on this path either;
	 * perf_swcounter_overflow() sets it from hw.last_period before
	 * calling perf_counter_overflow() -- confirm whether the same is
	 * wanted here.
	 */
	data.regs = get_irq_regs();

	/*
	 * In case we exclude kernel IPs or are somehow not in interrupt
	 * context, provide the next best thing, the user IP.
	 */
	if ((counter->attr.exclude_kernel || !data.regs) &&
	    !counter->attr.exclude_user)
		data.regs = task_pt_regs(current);

	/* Without any register state there is nothing to report. */
	if (data.regs) {
		if (perf_counter_overflow(counter, 0, &data))
			ret = HRTIMER_NORESTART;
	}

	/* Clamp the interval so the timer never fires faster than 10000 ns. */
	period = max_t(u64, 10000, counter->hw.sample_period);
	hrtimer_forward_now(hrtimer, ns_to_ktime(period));

	return ret;
}
/*
* Software counter: cpu wall time clock
*/
@@ -3715,15 +3759,15 @@ static const struct pmu perf_ops_task_clock = {
void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
int entry_size)
{
struct perf_tracepoint_record tp = {
struct perf_raw_record raw = {
.size = entry_size,
.record = record,
.data = record,
};
struct perf_sample_data data = {
.regs = get_irq_regs(),
.addr = addr,
.private = &tp,
.raw = &raw,
};
if (!data.regs)
@@ -3743,6 +3787,14 @@ static void tp_perf_counter_destroy(struct perf_counter *counter)
static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
{
/*
* Raw tracepoint data is a severe data leak; only allow root to
* have it.
*/
if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
if (ftrace_profile_enable(counter->attr.config))
return NULL;
@@ -4285,7 +4337,7 @@ void perf_counter_exit_task(struct task_struct *child)
unsigned long flags;
if (likely(!child->perf_counter_ctxp)) {
perf_counter_task(child, 0);
perf_counter_task(child, NULL, 0);
return;
}
@@ -4305,6 +4357,7 @@ void perf_counter_exit_task(struct task_struct *child)
* incremented the context's refcount before we do put_ctx below.
*/
spin_lock(&child_ctx->lock);
child->perf_counter_ctxp = NULL;
/*
* If this context is a clone; unclone it so it can't get
* swapped to another process while we're removing all
@@ -4318,9 +4371,7 @@ void perf_counter_exit_task(struct task_struct *child)
* won't get any samples after PERF_EVENT_EXIT. We can however still
* get a few PERF_EVENT_READ events.
*/
perf_counter_task(child, 0);
child->perf_counter_ctxp = NULL;
perf_counter_task(child, child_ctx, 0);
/*
* We can recurse on the same lock type through: