rcu: RCU-based detection of stalled CPUs for Classic RCU
This patch adds stalled-CPU detection to Classic RCU. This capability
is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, which
is disabled by default.
This is a debugging feature to detect infinite loops in kernel code, not
something that non-kernel-hackers would be expected to care about.
This feature can detect looping CPUs in !PREEMPT builds and looping CPUs
with preemption disabled in PREEMPT builds. This is essentially a port of
this functionality from the treercu patch, replacing the stall debug patch
that is already in tip/core/rcu (commit 67182ae1c4).
The changes from the patch in tip/core/rcu include making the config
variable name match that in treercu, changing from seconds to jiffies to
avoid spurious warnings, and printing a boot message when this feature
is enabled.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
This commit is contained in:
committed by
Ingo Molnar
parent
b5259d9442
commit
2133b5d7ff
@@ -40,15 +40,21 @@
|
|||||||
#include <linux/cpumask.h>
|
#include <linux/cpumask.h>
|
||||||
#include <linux/seqlock.h>
|
#include <linux/seqlock.h>
|
||||||
|
|
||||||
|
#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
/*
 * Offsets added to jiffies to compute rcp->jiffies_stall.  Despite the
 * "SECONDS" in the names, both values are expressed in jiffies.
 */
#define RCU_SECONDS_TILL_STALL_CHECK	(3 * HZ)	/* first check */
#define RCU_SECONDS_TILL_STALL_RECHECK	(30 * HZ)	/* after a report */
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
|
||||||
|
|
||||||
/* Global control variables for rcupdate callback mechanism. */
|
/* Global control variables for rcupdate callback mechanism. */
|
||||||
struct rcu_ctrlblk {
|
struct rcu_ctrlblk {
|
||||||
long cur; /* Current batch number. */
|
long cur; /* Current batch number. */
|
||||||
long completed; /* Number of the last completed batch */
|
long completed; /* Number of the last completed batch */
|
||||||
long pending; /* Number of the last pending batch */
|
long pending; /* Number of the last pending batch */
|
||||||
#ifdef CONFIG_DEBUG_RCU_STALL
|
#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
|
||||||
unsigned long gp_check; /* Time grace period should end, in seconds. */
|
unsigned long gp_start; /* Time at which GP started in jiffies. */
|
||||||
#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */
|
unsigned long jiffies_stall;
|
||||||
|
/* Time at which to check for CPU stalls. */
|
||||||
|
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
|
||||||
|
|
||||||
int signaled;
|
int signaled;
|
||||||
|
|
||||||
|
@@ -164,6 +164,87 @@ static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
|
||||||
|
|
||||||
|
static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
|
||||||
|
{
|
||||||
|
rcp->gp_start = jiffies;
|
||||||
|
rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
|
||||||
|
{
|
||||||
|
int cpu;
|
||||||
|
long delta;
|
||||||
|
unsigned long flags;
|
||||||
|
|
||||||
|
/* Only let one CPU complain about others per time interval. */
|
||||||
|
|
||||||
|
spin_lock_irqsave(&rcp->lock, flags);
|
||||||
|
delta = jiffies - rcp->jiffies_stall;
|
||||||
|
if (delta < 2 || rcp->cur != rcp->completed) {
|
||||||
|
spin_unlock_irqrestore(&rcp->lock, flags);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
|
||||||
|
spin_unlock_irqrestore(&rcp->lock, flags);
|
||||||
|
|
||||||
|
/* OK, time to rat on our buddy... */
|
||||||
|
|
||||||
|
printk(KERN_ERR "RCU detected CPU stalls:");
|
||||||
|
for_each_possible_cpu(cpu) {
|
||||||
|
if (cpu_isset(cpu, rcp->cpumask))
|
||||||
|
printk(" %d", cpu);
|
||||||
|
}
|
||||||
|
printk(" (detected by %d, t=%ld jiffies)\n",
|
||||||
|
smp_processor_id(), (long)(jiffies - rcp->gp_start));
|
||||||
|
}
|
||||||
|
|
||||||
|
static void print_cpu_stall(struct rcu_ctrlblk *rcp)
|
||||||
|
{
|
||||||
|
unsigned long flags;
|
||||||
|
|
||||||
|
printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
|
||||||
|
smp_processor_id(), jiffies,
|
||||||
|
jiffies - rcp->gp_start);
|
||||||
|
dump_stack();
|
||||||
|
spin_lock_irqsave(&rcp->lock, flags);
|
||||||
|
if ((long)(jiffies - rcp->jiffies_stall) >= 0)
|
||||||
|
rcp->jiffies_stall =
|
||||||
|
jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
|
||||||
|
spin_unlock_irqrestore(&rcp->lock, flags);
|
||||||
|
set_need_resched(); /* kick ourselves to get things going. */
|
||||||
|
}
|
||||||
|
|
||||||
|
static void check_cpu_stall(struct rcu_ctrlblk *rcp)
|
||||||
|
{
|
||||||
|
long delta;
|
||||||
|
|
||||||
|
delta = jiffies - rcp->jiffies_stall;
|
||||||
|
if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) {
|
||||||
|
|
||||||
|
/* We haven't checked in, so go dump stack. */
|
||||||
|
print_cpu_stall(rcp);
|
||||||
|
|
||||||
|
} else if (rcp->cur != rcp->completed && delta >= 2) {
|
||||||
|
|
||||||
|
/* They had two seconds to dump stack, so complain. */
|
||||||
|
print_other_cpu_stall(rcp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
|
||||||
|
|
||||||
|
/* Stall detection is configured out, so there is nothing to record. */
static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
{
}
|
||||||
|
|
||||||
|
/*
 * Stall detection is configured out, so there is nothing to check.
 * The signature matches the CONFIG_RCU_CPU_STALL_DETECTOR variant, so
 * the single-argument call in __rcu_pending() builds either way.
 */
static void check_cpu_stall(struct rcu_ctrlblk *rcp)
{
}
|
||||||
|
|
||||||
|
#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* call_rcu - Queue an RCU callback for invocation after a grace period.
|
* call_rcu - Queue an RCU callback for invocation after a grace period.
|
||||||
* @head: structure to be used for queueing the RCU updates.
|
* @head: structure to be used for queueing the RCU updates.
|
||||||
@@ -293,84 +374,6 @@ static void rcu_do_batch(struct rcu_data *rdp)
|
|||||||
* period (if necessary).
|
* period (if necessary).
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifdef CONFIG_DEBUG_RCU_STALL
|
|
||||||
|
|
||||||
static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
|
|
||||||
{
|
|
||||||
rcp->gp_check = get_seconds() + 3;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
|
|
||||||
{
|
|
||||||
int cpu;
|
|
||||||
long delta;
|
|
||||||
unsigned long flags;
|
|
||||||
|
|
||||||
/* Only let one CPU complain about others per time interval. */
|
|
||||||
|
|
||||||
spin_lock_irqsave(&rcp->lock, flags);
|
|
||||||
delta = get_seconds() - rcp->gp_check;
|
|
||||||
if (delta < 2L || cpus_empty(rcp->cpumask)) {
|
|
||||||
spin_unlock(&rcp->lock);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
rcp->gp_check = get_seconds() + 30;
|
|
||||||
spin_unlock_irqrestore(&rcp->lock, flags);
|
|
||||||
|
|
||||||
/* OK, time to rat on our buddy... */
|
|
||||||
|
|
||||||
printk(KERN_ERR "RCU detected CPU stalls:");
|
|
||||||
for_each_cpu_mask(cpu, rcp->cpumask)
|
|
||||||
printk(" %d", cpu);
|
|
||||||
printk(" (detected by %d, t=%lu/%lu)\n",
|
|
||||||
smp_processor_id(), get_seconds(), rcp->gp_check);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void print_cpu_stall(struct rcu_ctrlblk *rcp)
|
|
||||||
{
|
|
||||||
unsigned long flags;
|
|
||||||
|
|
||||||
printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n",
|
|
||||||
smp_processor_id(), get_seconds(), rcp->gp_check);
|
|
||||||
dump_stack();
|
|
||||||
spin_lock_irqsave(&rcp->lock, flags);
|
|
||||||
if ((long)(get_seconds() - rcp->gp_check) >= 0L)
|
|
||||||
rcp->gp_check = get_seconds() + 30;
|
|
||||||
spin_unlock_irqrestore(&rcp->lock, flags);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
|
|
||||||
{
|
|
||||||
long delta;
|
|
||||||
|
|
||||||
delta = get_seconds() - rcp->gp_check;
|
|
||||||
if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) {
|
|
||||||
|
|
||||||
/* We haven't checked in, so go dump stack. */
|
|
||||||
|
|
||||||
print_cpu_stall(rcp);
|
|
||||||
|
|
||||||
} else {
|
|
||||||
if (!cpus_empty(rcp->cpumask) && delta >= 2L) {
|
|
||||||
/* They had two seconds to dump stack, so complain. */
|
|
||||||
print_other_cpu_stall(rcp);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#else /* #ifdef CONFIG_DEBUG_RCU_STALL */
|
|
||||||
|
|
||||||
static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline void
|
|
||||||
check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Register a new batch of callbacks, and start it up if there is currently no
|
* Register a new batch of callbacks, and start it up if there is currently no
|
||||||
* active batch and the batch to be registered has not already occurred.
|
* active batch and the batch to be registered has not already occurred.
|
||||||
@@ -381,7 +384,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
|
|||||||
if (rcp->cur != rcp->pending &&
|
if (rcp->cur != rcp->pending &&
|
||||||
rcp->completed == rcp->cur) {
|
rcp->completed == rcp->cur) {
|
||||||
rcp->cur++;
|
rcp->cur++;
|
||||||
record_gp_check_time(rcp);
|
record_gp_stall_check_time(rcp);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Accessing nohz_cpu_mask before incrementing rcp->cur needs a
|
* Accessing nohz_cpu_mask before incrementing rcp->cur needs a
|
||||||
@@ -603,7 +606,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
|
|||||||
static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
|
static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
|
||||||
{
|
{
|
||||||
/* Check for CPU stalls, if enabled. */
|
/* Check for CPU stalls, if enabled. */
|
||||||
check_cpu_stall(rcp, rdp);
|
check_cpu_stall(rcp);
|
||||||
|
|
||||||
if (rdp->nxtlist) {
|
if (rdp->nxtlist) {
|
||||||
long completed_snap = ACCESS_ONCE(rcp->completed);
|
long completed_snap = ACCESS_ONCE(rcp->completed);
|
||||||
@@ -769,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = {
|
|||||||
*/
|
*/
|
||||||
void __init __rcu_init(void)
|
void __init __rcu_init(void)
|
||||||
{
|
{
|
||||||
|
#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
|
||||||
|
printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
|
||||||
|
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
|
||||||
rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
|
rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
|
||||||
(void *)(long)smp_processor_id());
|
(void *)(long)smp_processor_id());
|
||||||
/* Register notifier for non-boot CPUs */
|
/* Register notifier for non-boot CPUs */
|
||||||
|
@@ -597,7 +597,7 @@ config RCU_TORTURE_TEST_RUNNABLE
|
|||||||
Say N here if you want the RCU torture tests to start only
|
Say N here if you want the RCU torture tests to start only
|
||||||
after being manually enabled via /proc.
|
after being manually enabled via /proc.
|
||||||
|
|
||||||
config RCU_CPU_STALL
|
config RCU_CPU_STALL_DETECTOR
|
||||||
bool "Check for stalled CPUs delaying RCU grace periods"
|
bool "Check for stalled CPUs delaying RCU grace periods"
|
||||||
depends on CLASSIC_RCU
|
depends on CLASSIC_RCU
|
||||||
default n
|
default n
|
||||||
|
Reference in New Issue
Block a user