sched: Dynamically allocate sched_domain/sched_group data-structures
Instead of relying on static allocations for the sched_domain and sched_group trees, dynamically allocate and RCU free them.

Allocating this dynamically also allows for some build_sched_groups() simplification, since we can now (like with other simplifications) rely on the sched_domain tree instead of hard-coded knowledge.

One tricky thing to note is that detach_destroy_domains() needs to hold rcu_read_lock() over the entire tear-down; per-cpu is not sufficient, since that can lead to partial sched_group existence (this could possibly be solved by doing the tear-down backwards, but holding the lock over the whole thing is much more robust).

A consequence of the above is that we can no longer print the sched_domain debug output from cpu_attach_domain(), since that might now run with preemption disabled (due to classic RCU etc.) and sched_domain_debug() does some GFP_KERNEL allocations.

Another thing to note is that we now fully rely on normal RCU and not RCU-sched; with the new and exciting RCU flavours we have grown over the years, BH does not necessarily hold off RCU-sched grace periods (-rt is known to break this). This would in fact already cause us grief, since we do sched_domain/sched_group iterations from softirq context.

This patch is somewhat larger than I would like it to be, but I didn't find any means of shrinking/splitting it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/20110407122942.245307941@chello.nl
Signed-off-by: Ingo Molnar <mingo@elte.hu>
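The allocation/teardown side is easiest to see in isolation. Below is a minimal, kernel-style sketch of RCU-deferred freeing of a domain chain; it is not the patch's code, and the example_* names and simplified structure layout are assumptions made purely for illustration. The point is that each level is handed to call_rcu(), so a reader that walks the chain inside a single rcu_read_lock()/rcu_read_unlock() section (as the hunks below arrange) can never see a partially freed tree.

/*
 * Illustrative sketch only, not the patch itself: tearing down a
 * dynamically allocated domain chain with call_rcu() so that concurrent
 * readers holding rcu_read_lock() never observe freed memory.
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>

struct example_domain {
	struct example_domain	*parent;	/* next level up the tree */
	struct rcu_head		rcu;		/* for deferred freeing */
};

/* RCU callback: runs only after all pre-existing read-side sections end. */
static void example_free_domain(struct rcu_head *rcu)
{
	struct example_domain *d = container_of(rcu, struct example_domain, rcu);

	kfree(d);
}

/* Queue every level of the chain for RCU-deferred freeing. */
static void example_destroy_domains(struct example_domain *d)
{
	struct example_domain *parent;

	for (; d; d = parent) {
		parent = d->parent;		/* read before queueing d */
		call_rcu(&d->rcu, example_free_domain);
	}
}

A tear-down path in the style of detach_destroy_domains() then only has to ensure that readers keep rcu_read_lock() held across their whole walk; the grace period machinery does the rest.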
commit dce840a087
parent a9c9a9b6bf
committed by Ingo Molnar
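For reference, the read-side pattern that every hunk below introduces looks like this when distilled. This is a sketch, not kernel source, reusing the simplified example_domain type from the sketch above; in the real code, for_each_domain() is essentially an rcu_dereference()-based walk from rq->sd up the ->parent chain.

/*
 * Illustrative reader-side sketch: the read-side critical section spans
 * the entire walk, so a concurrent tear-down cannot free any level out
 * from under us mid-iteration.
 */
#include <linux/rcupdate.h>

/* RCU-protected root of the example domain chain (assumption). */
static struct example_domain __rcu *example_root;

static void example_walk_domains(void)
{
	struct example_domain *d;

	rcu_read_lock();
	for (d = rcu_dereference(example_root); d; d = d->parent) {
		/* inspect this level's flags, spans, groups, ... */
	}
	rcu_read_unlock();
}

Because this uses normal RCU rather than RCU-sched, the same walk remains safe when performed from softirq context, which is exactly where several of the load-balancing paths changed below run.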
@@ -1622,6 +1622,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	/*
 	 * Otherwise, iterate the domains and find an elegible idle cpu.
 	 */
+	rcu_read_lock();
 	for_each_domain(target, sd) {
 		if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
 			break;
@@ -1641,6 +1642,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
 		    cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
 			break;
 	}
+	rcu_read_unlock();
 
 	return target;
 }
@@ -1673,6 +1675,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 		new_cpu = prev_cpu;
 	}
 
+	rcu_read_lock();
 	for_each_domain(cpu, tmp) {
 		if (!(tmp->flags & SD_LOAD_BALANCE))
 			continue;
@@ -1723,9 +1726,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 
 	if (affine_sd) {
 		if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
-			return select_idle_sibling(p, cpu);
-		else
-			return select_idle_sibling(p, prev_cpu);
+			prev_cpu = cpu;
+
+		new_cpu = select_idle_sibling(p, prev_cpu);
+		goto unlock;
 	}
 
 	while (sd) {
@@ -1766,6 +1770,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 		}
 		/* while loop will break here if sd == NULL */
 	}
+unlock:
+	rcu_read_unlock();
 
 	return new_cpu;
 }
@@ -3462,6 +3468,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 	raw_spin_unlock(&this_rq->lock);
 
 	update_shares(this_cpu);
+	rcu_read_lock();
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
 		int balance = 1;
@@ -3483,6 +3490,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 			break;
 		}
 	}
+	rcu_read_unlock();
 
 	raw_spin_lock(&this_rq->lock);
 
@@ -3531,6 +3539,7 @@ static int active_load_balance_cpu_stop(void *data)
 	double_lock_balance(busiest_rq, target_rq);
 
 	/* Search for an sd spanning us and the target CPU. */
+	rcu_read_lock();
 	for_each_domain(target_cpu, sd) {
 		if ((sd->flags & SD_LOAD_BALANCE) &&
 		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
@@ -3546,6 +3555,7 @@ static int active_load_balance_cpu_stop(void *data)
 		else
 			schedstat_inc(sd, alb_failed);
 	}
+	rcu_read_unlock();
 	double_unlock_balance(busiest_rq, target_rq);
 out_unlock:
 	busiest_rq->active_balance = 0;
@@ -3672,6 +3682,7 @@ static int find_new_ilb(int cpu)
 {
 	struct sched_domain *sd;
 	struct sched_group *ilb_group;
+	int ilb = nr_cpu_ids;
 
 	/*
 	 * Have idle load balancer selection from semi-idle packages only
@@ -3687,20 +3698,25 @@ static int find_new_ilb(int cpu)
 	if (cpumask_weight(nohz.idle_cpus_mask) < 2)
 		goto out_done;
 
+	rcu_read_lock();
 	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
 		ilb_group = sd->groups;
 
 		do {
-			if (is_semi_idle_group(ilb_group))
-				return cpumask_first(nohz.grp_idle_mask);
+			if (is_semi_idle_group(ilb_group)) {
+				ilb = cpumask_first(nohz.grp_idle_mask);
+				goto unlock;
+			}
 
 			ilb_group = ilb_group->next;
 
 		} while (ilb_group != sd->groups);
 	}
+unlock:
+	rcu_read_unlock();
 
 out_done:
-	return nr_cpu_ids;
+	return ilb;
 }
 #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
 static inline int find_new_ilb(int call_cpu)
@@ -3845,6 +3861,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 
 	update_shares(cpu);
 
+	rcu_read_lock();
 	for_each_domain(cpu, sd) {
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;
@@ -3890,6 +3907,7 @@ out:
 		if (!balance)
 			break;
 	}
+	rcu_read_unlock();
 
 	/*
 	 * next_balance will be updated only when there is a need.