Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (46 commits)
  llist: Add back llist_add_batch() and llist_del_first() prototypes
  sched: Don't use tasklist_lock for debug prints
  sched: Warn on rt throttling
  sched: Unify the ->cpus_allowed mask copy
  sched: Wrap scheduler p->cpus_allowed access
  sched: Request for idle balance during nohz idle load balance
  sched: Use resched IPI to kick off the nohz idle balance
  sched: Fix idle_cpu()
  llist: Remove cpu_relax() usage in cmpxchg loops
  sched: Convert to struct llist
  llist: Add llist_next()
  irq_work: Use llist in the struct irq_work logic
  llist: Return whether list is empty before adding in llist_add()
  llist: Move cpu_relax() to after the cmpxchg()
  llist: Remove the platform-dependent NMI checks
  llist: Make some llist functions inline
  sched, tracing: Show PREEMPT_ACTIVE state in trace_sched_switch
  sched: Remove redundant test in check_preempt_tick()
  sched: Add documentation for bandwidth control
  sched: Return unused runtime on group dequeue
  ...
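The bulk of the diff below wires up CFS bandwidth control: each task group gains cpu.cfs_quota_us and cpu.cfs_period_us cgroup files plus a cpu.stat map (nr_periods, nr_throttled, throttled_time). As a rough usage sketch, assuming a cgroup v1 cpu controller mounted at /sys/fs/cgroup/cpu and an already-created group named "demo" (both illustrative, not part of this commit), a group could be capped to a quarter of one CPU like so:

/* Illustrative userspace helper, not part of this commit: cap the
 * (hypothetical) "demo" cpu cgroup to 25ms of runtime per 100ms period
 * via the cfs_period_us/cfs_quota_us files added below. */
#include <stdio.h>

static int write_val(const char *path, long val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%ld\n", val);
	return fclose(f);
}

int main(void)
{
	write_val("/sys/fs/cgroup/cpu/demo/cpu.cfs_period_us", 100000);
	write_val("/sys/fs/cgroup/cpu/demo/cpu.cfs_quota_us", 25000);
	return 0;
}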
kernel/sched.c
@@ -196,10 +196,28 @@ static inline int rt_bandwidth_enabled(void)
	return sysctl_sched_rt_runtime >= 0;
}

static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
	unsigned long delta;
	ktime_t soft, hard, now;

	for (;;) {
		if (hrtimer_active(period_timer))
			break;

		now = hrtimer_cb_get_time(period_timer);
		hrtimer_forward(period_timer, now, period);

		soft = hrtimer_get_softexpires(period_timer);
		hard = hrtimer_get_expires(period_timer);
		delta = ktime_to_ns(ktime_sub(hard, soft));
		__hrtimer_start_range_ns(period_timer, soft, delta,
					 HRTIMER_MODE_ABS_PINNED, 0);
	}
}

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	ktime_t now;

	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
		return;

@@ -207,22 +225,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
		return;

	raw_spin_lock(&rt_b->rt_runtime_lock);
	for (;;) {
		unsigned long delta;
		ktime_t soft, hard;

		if (hrtimer_active(&rt_b->rt_period_timer))
			break;

		now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
		hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);

		soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
		hard = hrtimer_get_expires(&rt_b->rt_period_timer);
		delta = ktime_to_ns(ktime_sub(hard, soft));
		__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
					 HRTIMER_MODE_ABS_PINNED, 0);
	}
	start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}

@@ -247,6 +250,24 @@ struct cfs_rq;

static LIST_HEAD(task_groups);

struct cfs_bandwidth {
#ifdef CONFIG_CFS_BANDWIDTH
	raw_spinlock_t lock;
	ktime_t period;
	u64 quota, runtime;
	s64 hierarchal_quota;
	u64 runtime_expires;

	int idle, timer_active;
	struct hrtimer period_timer, slack_timer;
	struct list_head throttled_cfs_rq;

	/* statistics */
	int nr_periods, nr_throttled;
	u64 throttled_time;
#endif
};

/* task group related information */
struct task_group {
	struct cgroup_subsys_state css;
@@ -278,6 +299,8 @@ struct task_group {
#ifdef CONFIG_SCHED_AUTOGROUP
	struct autogroup *autogroup;
#endif

	struct cfs_bandwidth cfs_bandwidth;
};

/* task_group_lock serializes the addition/removal of task groups */
@@ -311,7 +334,7 @@ struct task_group root_task_group;
/* CFS-related fields in a runqueue */
struct cfs_rq {
	struct load_weight load;
	unsigned long nr_running;
	unsigned long nr_running, h_nr_running;

	u64 exec_clock;
	u64 min_vruntime;
@@ -377,9 +400,120 @@ struct cfs_rq {

	unsigned long load_contribution;
#endif
#ifdef CONFIG_CFS_BANDWIDTH
	int runtime_enabled;
	u64 runtime_expires;
	s64 runtime_remaining;

	u64 throttled_timestamp;
	int throttled, throttle_count;
	struct list_head throttled_list;
#endif
#endif
};

#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_CFS_BANDWIDTH
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
{
	return &tg->cfs_bandwidth;
}

static inline u64 default_cfs_period(void);
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);

static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
{
	struct cfs_bandwidth *cfs_b =
		container_of(timer, struct cfs_bandwidth, slack_timer);
	do_sched_cfs_slack_timer(cfs_b);

	return HRTIMER_NORESTART;
}

static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
	struct cfs_bandwidth *cfs_b =
		container_of(timer, struct cfs_bandwidth, period_timer);
	ktime_t now;
	int overrun;
	int idle = 0;

	for (;;) {
		now = hrtimer_cb_get_time(timer);
		overrun = hrtimer_forward(timer, now, cfs_b->period);

		if (!overrun)
			break;

		idle = do_sched_cfs_period_timer(cfs_b, overrun);
	}

	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
	raw_spin_lock_init(&cfs_b->lock);
	cfs_b->runtime = 0;
	cfs_b->quota = RUNTIME_INF;
	cfs_b->period = ns_to_ktime(default_cfs_period());

	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	cfs_b->period_timer.function = sched_cfs_period_timer;
	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	cfs_b->slack_timer.function = sched_cfs_slack_timer;
}

static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
	cfs_rq->runtime_enabled = 0;
	INIT_LIST_HEAD(&cfs_rq->throttled_list);
}

/* requires cfs_b->lock, may release to reprogram timer */
static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
	/*
	 * The timer may be active because we're trying to set a new bandwidth
	 * period or because we're racing with the tear-down path
	 * (timer_active==0 becomes visible before the hrtimer call-back
	 * terminates).  In either case we ensure that it's re-programmed
	 */
	while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
		raw_spin_unlock(&cfs_b->lock);
		/* ensure cfs_b->lock is available while we wait */
		hrtimer_cancel(&cfs_b->period_timer);

		raw_spin_lock(&cfs_b->lock);
		/* if someone else restarted the timer then we're done */
		if (cfs_b->timer_active)
			return;
	}

	cfs_b->timer_active = 1;
	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
}

static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
	hrtimer_cancel(&cfs_b->period_timer);
	hrtimer_cancel(&cfs_b->slack_timer);
}
#else
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}

static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
{
	return NULL;
}
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */

/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
	struct rt_prio_array active;
@@ -510,7 +644,7 @@ struct rq {

	unsigned long cpu_power;

	unsigned char idle_at_tick;
	unsigned char idle_balance;
	/* For active balancing */
	int post_schedule;
	int active_balance;
@@ -520,8 +654,6 @@ struct rq {
	int cpu;
	int online;

	unsigned long avg_load_per_task;

	u64 rt_avg;
	u64 age_stamp;
	u64 idle_stamp;
@@ -570,7 +702,7 @@ struct rq {
#endif

#ifdef CONFIG_SMP
	struct task_struct *wake_list;
	struct llist_head wake_list;
#endif
};

@@ -1272,6 +1404,18 @@ void wake_up_idle_cpu(int cpu)
	smp_send_reschedule(cpu);
}

static inline bool got_nohz_idle_kick(void)
{
	return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick;
}

#else /* CONFIG_NO_HZ */

static inline bool got_nohz_idle_kick(void)
{
	return false;
}

#endif /* CONFIG_NO_HZ */

static u64 sched_avg_period(void)
@@ -1471,24 +1615,28 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
	update_load_sub(&rq->load, load);
}

#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
typedef int (*tg_visitor)(struct task_group *, void *);

/*
 * Iterate the full tree, calling @down when first entering a node and @up when
 * leaving it for the final time.
 * Iterate task_group tree rooted at *from, calling @down when first entering a
 * node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
 */
static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
static int walk_tg_tree_from(struct task_group *from,
			     tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *parent, *child;
	int ret;

	rcu_read_lock();
	parent = &root_task_group;
	parent = from;

down:
	ret = (*down)(parent, data);
	if (ret)
		goto out_unlock;
		goto out;
	list_for_each_entry_rcu(child, &parent->children, siblings) {
		parent = child;
		goto down;
@@ -1497,19 +1645,29 @@ up:
		continue;
	}
	ret = (*up)(parent, data);
	if (ret)
		goto out_unlock;
	if (ret || parent == from)
		goto out;

	child = parent;
	parent = parent->parent;
	if (parent)
		goto up;
out_unlock:
	rcu_read_unlock();

out:
	return ret;
}

/*
 * Iterate the full tree, calling @down when first entering a node and @up when
 * leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
 */

static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
{
	return walk_tg_tree_from(&root_task_group, down, up, data);
}

static int tg_nop(struct task_group *tg, void *data)
{
	return 0;
@@ -1569,11 +1727,9 @@ static unsigned long cpu_avg_load_per_task(int cpu)
	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);

	if (nr_running)
		rq->avg_load_per_task = rq->load.weight / nr_running;
	else
		rq->avg_load_per_task = 0;
		return rq->load.weight / nr_running;

	return rq->avg_load_per_task;
	return 0;
}

#ifdef CONFIG_PREEMPT
@@ -1806,7 +1962,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, flags);
	inc_nr_running(rq);
}

/*
@@ -1818,7 +1973,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, flags);
	dec_nr_running(rq);
}

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -2390,11 +2544,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p)

	/* Look for allowed, online CPU in same node. */
	for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
		if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
		if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
			return dest_cpu;

	/* Any allowed, online CPU? */
	dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
	dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
	if (dest_cpu < nr_cpu_ids)
		return dest_cpu;

@@ -2431,7 +2585,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
	 * [ this allows ->select_task() to simply return task_cpu(p) and
	 *   not worry about this generic constraint ]
	 */
	if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
	if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
		     !cpu_online(cpu)))
		cpu = select_fallback_rq(task_cpu(p), p);

@@ -2556,42 +2710,26 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
}

#ifdef CONFIG_SMP
static void sched_ttwu_do_pending(struct task_struct *list)
static void sched_ttwu_pending(void)
{
	struct rq *rq = this_rq();
	struct llist_node *llist = llist_del_all(&rq->wake_list);
	struct task_struct *p;

	raw_spin_lock(&rq->lock);

	while (list) {
		struct task_struct *p = list;
		list = list->wake_entry;
	while (llist) {
		p = llist_entry(llist, struct task_struct, wake_entry);
		llist = llist_next(llist);
		ttwu_do_activate(rq, p, 0);
	}

	raw_spin_unlock(&rq->lock);
}

#ifdef CONFIG_HOTPLUG_CPU

static void sched_ttwu_pending(void)
{
	struct rq *rq = this_rq();
	struct task_struct *list = xchg(&rq->wake_list, NULL);

	if (!list)
		return;

	sched_ttwu_do_pending(list);
}

#endif /* CONFIG_HOTPLUG_CPU */

void scheduler_ipi(void)
{
	struct rq *rq = this_rq();
	struct task_struct *list = xchg(&rq->wake_list, NULL);

	if (!list)
	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
		return;

	/*
@@ -2608,25 +2746,21 @@ void scheduler_ipi(void)
	 * somewhat pessimize the simple resched case.
	 */
	irq_enter();
	sched_ttwu_do_pending(list);
	sched_ttwu_pending();

	/*
	 * Check if someone kicked us for doing the nohz idle load balance.
	 */
	if (unlikely(got_nohz_idle_kick() && !need_resched())) {
		this_rq()->idle_balance = 1;
		raise_softirq_irqoff(SCHED_SOFTIRQ);
	}
	irq_exit();
}

static void ttwu_queue_remote(struct task_struct *p, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct task_struct *next = rq->wake_list;

	for (;;) {
		struct task_struct *old = next;

		p->wake_entry = next;
		next = cmpxchg(&rq->wake_list, old, p);
		if (next == old)
			break;
	}

	if (!next)
	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
		smp_send_reschedule(cpu);
}

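/*
 * Aside (not part of this patch): the wake-list hunks above replace an
 * open-coded cmpxchg loop with llist_add(), whose return value -- "was
 * the list empty before this add?" -- decides whether ttwu_queue_remote()
 * still needs to send the reschedule IPI.  A minimal userspace sketch of
 * that pattern using GCC atomic builtins (all names here are illustrative):
 */
#include <stdbool.h>
#include <stddef.h>

struct lnode {
	struct lnode *next;
};

struct lhead {
	struct lnode *first;
};

/* Returns true if the list was empty before this add. */
bool llist_add_sketch(struct lnode *new, struct lhead *head)
{
	struct lnode *first = __atomic_load_n(&head->first, __ATOMIC_RELAXED);

	do {
		new->next = first;
		/* on failure, 'first' is refreshed with the current head */
	} while (!__atomic_compare_exchange_n(&head->first, &first, new, false,
					      __ATOMIC_RELEASE, __ATOMIC_RELAXED));

	return first == NULL;
}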
@@ -2847,20 +2981,24 @@ void sched_fork(struct task_struct *p)
	 */
	p->state = TASK_RUNNING;

	/*
	 * Make sure we do not leak PI boosting priority to the child.
	 */
	p->prio = current->normal_prio;

	/*
	 * Revert to default priority/policy on fork if requested.
	 */
	if (unlikely(p->sched_reset_on_fork)) {
		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
		if (task_has_rt_policy(p)) {
			p->policy = SCHED_NORMAL;
			p->normal_prio = p->static_prio;
		}

		if (PRIO_TO_NICE(p->static_prio) < 0) {
			p->static_prio = NICE_TO_PRIO(0);
			p->normal_prio = p->static_prio;
			set_load_weight(p);
		}
			p->rt_priority = 0;
		} else if (PRIO_TO_NICE(p->static_prio) < 0)
			p->static_prio = NICE_TO_PRIO(0);

		p->prio = p->normal_prio = __normal_prio(p);
		set_load_weight(p);

		/*
		 * We don't need the reset flag anymore after the fork. It has
@@ -2869,11 +3007,6 @@ void sched_fork(struct task_struct *p)
		p->sched_reset_on_fork = 0;
	}

	/*
	 * Make sure we do not leak PI boosting priority to the child.
	 */
	p->prio = current->normal_prio;

	if (!rt_prio(p->prio))
		p->sched_class = &fair_sched_class;

@@ -4116,7 +4249,7 @@ void scheduler_tick(void)
	perf_event_task_tick();

#ifdef CONFIG_SMP
	rq->idle_at_tick = idle_cpu(cpu);
	rq->idle_balance = idle_cpu(cpu);
	trigger_load_balance(rq, cpu);
#endif
}
@@ -4240,7 +4373,7 @@ pick_next_task(struct rq *rq)
	 * Optimization: we know that if all tasks are in
	 * the fair class we can call that function directly:
	 */
	if (likely(rq->nr_running == rq->cfs.nr_running)) {
	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
		p = fair_sched_class.pick_next_task(rq);
		if (likely(p))
			return p;
@@ -5026,7 +5159,20 @@ EXPORT_SYMBOL(task_nice);
 */
int idle_cpu(int cpu)
{
	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
	struct rq *rq = cpu_rq(cpu);

	if (rq->curr != rq->idle)
		return 0;

	if (rq->nr_running)
		return 0;

#ifdef CONFIG_SMP
	if (!llist_empty(&rq->wake_list))
		return 0;
#endif

	return 1;
}

/**
@@ -5876,7 +6022,7 @@ void show_state_filter(unsigned long state_filter)
	printk(KERN_INFO
		"  task                        PC stack   pid father\n");
#endif
	read_lock(&tasklist_lock);
	rcu_read_lock();
	do_each_thread(g, p) {
		/*
		 * reset the NMI-timeout, listing all files on a slow
@@ -5892,7 +6038,7 @@ void show_state_filter(unsigned long state_filter)
#ifdef CONFIG_SCHED_DEBUG
	sysrq_sched_debug_show();
#endif
	read_unlock(&tasklist_lock);
	rcu_read_unlock();
	/*
	 * Only show locks if all tasks are dumped:
	 */
@@ -6007,10 +6153,9 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
	if (p->sched_class && p->sched_class->set_cpus_allowed)
		p->sched_class->set_cpus_allowed(p, new_mask);
	else {
		cpumask_copy(&p->cpus_allowed, new_mask);
		p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
	}

	cpumask_copy(&p->cpus_allowed, new_mask);
	p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
}

/*
@@ -6108,7 +6253,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
	if (task_cpu(p) != src_cpu)
		goto done;
	/* Affinity changed (again). */
	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
		goto fail;

	/*
@@ -6189,6 +6334,30 @@ static void calc_global_load_remove(struct rq *rq)
	rq->calc_load_active = 0;
}

#ifdef CONFIG_CFS_BANDWIDTH
static void unthrottle_offline_cfs_rqs(struct rq *rq)
{
	struct cfs_rq *cfs_rq;

	for_each_leaf_cfs_rq(rq, cfs_rq) {
		struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

		if (!cfs_rq->runtime_enabled)
			continue;

		/*
		 * clock_task is not advancing so we just need to make sure
		 * there's some valid quota amount
		 */
		cfs_rq->runtime_remaining = cfs_b->quota;
		if (cfs_rq_throttled(cfs_rq))
			unthrottle_cfs_rq(cfs_rq);
	}
}
#else
static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
#endif

/*
 * Migrate all tasks from the rq, sleeping tasks will be migrated by
 * try_to_wake_up()->select_task_rq().
@@ -6214,6 +6383,9 @@ static void migrate_tasks(unsigned int dead_cpu)
	 */
	rq->stop = NULL;

	/* Ensure any throttled groups are reachable by pick_next_task */
	unthrottle_offline_cfs_rqs(rq);

	for ( ; ; ) {
		/*
		 * There's this thread running, bail when that's the only
@@ -7957,6 +8129,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
	/* allow initial update_cfs_load() to truncate */
	cfs_rq->load_stamp = 1;
#endif
	init_cfs_rq_runtime(cfs_rq);

	tg->cfs_rq[cpu] = cfs_rq;
	tg->se[cpu] = se;
@@ -8096,6 +8269,7 @@ void __init sched_init(void)
		 * We achieve this by letting root_task_group's tasks sit
		 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
		 */
		init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */

@@ -8125,7 +8299,6 @@ void __init sched_init(void)
		rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ
		rq->nohz_balance_kick = 0;
		init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
#endif
#endif
		init_rq_hrtick(rq);
@@ -8336,6 +8509,8 @@ static void free_fair_sched_group(struct task_group *tg)
{
	int i;

	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));

	for_each_possible_cpu(i) {
		if (tg->cfs_rq)
			kfree(tg->cfs_rq[i]);
@@ -8363,6 +8538,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)

	tg->shares = NICE_0_LOAD;

	init_cfs_bandwidth(tg_cfs_bandwidth(tg));

	for_each_possible_cpu(i) {
		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
				      GFP_KERNEL, cpu_to_node(i));
@@ -8638,12 +8815,7 @@ unsigned long sched_group_shares(struct task_group *tg)
}
#endif

#ifdef CONFIG_RT_GROUP_SCHED
/*
 * Ensure that the real time constraints are schedulable.
 */
static DEFINE_MUTEX(rt_constraints_mutex);

#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
static unsigned long to_ratio(u64 period, u64 runtime)
{
	if (runtime == RUNTIME_INF)
@@ -8651,6 +8823,13 @@ static unsigned long to_ratio(u64 period, u64 runtime)

	return div64_u64(runtime << 20, period);
}
#endif

#ifdef CONFIG_RT_GROUP_SCHED
/*
 * Ensure that the real time constraints are schedulable.
 */
static DEFINE_MUTEX(rt_constraints_mutex);

/* Must be called with tasklist_lock held */
static inline int tg_has_rt_tasks(struct task_group *tg)
@@ -8671,7 +8850,7 @@ struct rt_schedulable_data {
	u64 rt_runtime;
};

static int tg_schedulable(struct task_group *tg, void *data)
static int tg_rt_schedulable(struct task_group *tg, void *data)
{
	struct rt_schedulable_data *d = data;
	struct task_group *child;
@@ -8729,16 +8908,22 @@ static int tg_schedulable(struct task_group *tg, void *data)

static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{
	int ret;

	struct rt_schedulable_data data = {
		.tg = tg,
		.rt_period = period,
		.rt_runtime = runtime,
	};

	return walk_tg_tree(tg_schedulable, tg_nop, &data);
	rcu_read_lock();
	ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
	rcu_read_unlock();

	return ret;
}

static int tg_set_bandwidth(struct task_group *tg,
static int tg_set_rt_bandwidth(struct task_group *tg,
		u64 rt_period, u64 rt_runtime)
{
	int i, err = 0;
@@ -8777,7 +8962,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
	if (rt_runtime_us < 0)
		rt_runtime = RUNTIME_INF;

	return tg_set_bandwidth(tg, rt_period, rt_runtime);
	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}

long sched_group_rt_runtime(struct task_group *tg)
@@ -8802,7 +8987,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
	if (rt_period == 0)
		return -EINVAL;

	return tg_set_bandwidth(tg, rt_period, rt_runtime);
	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}

long sched_group_rt_period(struct task_group *tg)
@@ -8992,6 +9177,238 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)

	return (u64) scale_load_down(tg->shares);
}

#ifdef CONFIG_CFS_BANDWIDTH
static DEFINE_MUTEX(cfs_constraints_mutex);

const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */

static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);

static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
{
	int i, ret = 0, runtime_enabled;
	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);

	if (tg == &root_task_group)
		return -EINVAL;

	/*
	 * Ensure we have at some amount of bandwidth every period.  This is
	 * to prevent reaching a state of large arrears when throttled via
	 * entity_tick() resulting in prolonged exit starvation.
	 */
	if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
		return -EINVAL;

	/*
	 * Likewise, bound things on the otherside by preventing insane quota
	 * periods.  This also allows us to normalize in computing quota
	 * feasibility.
	 */
	if (period > max_cfs_quota_period)
		return -EINVAL;

	mutex_lock(&cfs_constraints_mutex);
	ret = __cfs_schedulable(tg, period, quota);
	if (ret)
		goto out_unlock;

	runtime_enabled = quota != RUNTIME_INF;
	raw_spin_lock_irq(&cfs_b->lock);
	cfs_b->period = ns_to_ktime(period);
	cfs_b->quota = quota;

	__refill_cfs_bandwidth_runtime(cfs_b);
	/* restart the period timer (if active) to handle new period expiry */
	if (runtime_enabled && cfs_b->timer_active) {
		/* force a reprogram */
		cfs_b->timer_active = 0;
		__start_cfs_bandwidth(cfs_b);
	}
	raw_spin_unlock_irq(&cfs_b->lock);

	for_each_possible_cpu(i) {
		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
		struct rq *rq = rq_of(cfs_rq);

		raw_spin_lock_irq(&rq->lock);
		cfs_rq->runtime_enabled = runtime_enabled;
		cfs_rq->runtime_remaining = 0;

		if (cfs_rq_throttled(cfs_rq))
			unthrottle_cfs_rq(cfs_rq);
		raw_spin_unlock_irq(&rq->lock);
	}
out_unlock:
	mutex_unlock(&cfs_constraints_mutex);

	return ret;
}

int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
{
	u64 quota, period;

	period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
	if (cfs_quota_us < 0)
		quota = RUNTIME_INF;
	else
		quota = (u64)cfs_quota_us * NSEC_PER_USEC;

	return tg_set_cfs_bandwidth(tg, period, quota);
}

long tg_get_cfs_quota(struct task_group *tg)
{
	u64 quota_us;

	if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
		return -1;

	quota_us = tg_cfs_bandwidth(tg)->quota;
	do_div(quota_us, NSEC_PER_USEC);

	return quota_us;
}

int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
{
	u64 quota, period;

	period = (u64)cfs_period_us * NSEC_PER_USEC;
	quota = tg_cfs_bandwidth(tg)->quota;

	if (period <= 0)
		return -EINVAL;

	return tg_set_cfs_bandwidth(tg, period, quota);
}

long tg_get_cfs_period(struct task_group *tg)
{
	u64 cfs_period_us;

	cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
	do_div(cfs_period_us, NSEC_PER_USEC);

	return cfs_period_us;
}

static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
{
	return tg_get_cfs_quota(cgroup_tg(cgrp));
}

static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
				   s64 cfs_quota_us)
{
	return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
}

static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
{
	return tg_get_cfs_period(cgroup_tg(cgrp));
}

static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
				    u64 cfs_period_us)
{
	return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
}

struct cfs_schedulable_data {
	struct task_group *tg;
	u64 period, quota;
};

/*
 * normalize group quota/period to be quota/max_period
 * note: units are usecs
 */
static u64 normalize_cfs_quota(struct task_group *tg,
			       struct cfs_schedulable_data *d)
{
	u64 quota, period;

	if (tg == d->tg) {
		period = d->period;
		quota = d->quota;
	} else {
		period = tg_get_cfs_period(tg);
		quota = tg_get_cfs_quota(tg);
	}

	/* note: these should typically be equivalent */
	if (quota == RUNTIME_INF || quota == -1)
		return RUNTIME_INF;

	return to_ratio(period, quota);
}

static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
{
	struct cfs_schedulable_data *d = data;
	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
	s64 quota = 0, parent_quota = -1;

	if (!tg->parent) {
		quota = RUNTIME_INF;
	} else {
		struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent);

		quota = normalize_cfs_quota(tg, d);
		parent_quota = parent_b->hierarchal_quota;

		/*
		 * ensure max(child_quota) <= parent_quota, inherit when no
		 * limit is set
		 */
		if (quota == RUNTIME_INF)
			quota = parent_quota;
		else if (parent_quota != RUNTIME_INF && quota > parent_quota)
			return -EINVAL;
	}
	cfs_b->hierarchal_quota = quota;

	return 0;
}

static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
{
	int ret;
	struct cfs_schedulable_data data = {
		.tg = tg,
		.period = period,
		.quota = quota,
	};

	if (quota != RUNTIME_INF) {
		do_div(data.period, NSEC_PER_USEC);
		do_div(data.quota, NSEC_PER_USEC);
	}

	rcu_read_lock();
	ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
	rcu_read_unlock();

	return ret;
}

static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
			  struct cgroup_map_cb *cb)
{
	struct task_group *tg = cgroup_tg(cgrp);
	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);

	cb->fill(cb, "nr_periods", cfs_b->nr_periods);
	cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
	cb->fill(cb, "throttled_time", cfs_b->throttled_time);

	return 0;
}
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */

#ifdef CONFIG_RT_GROUP_SCHED
@@ -9026,6 +9443,22 @@ static struct cftype cpu_files[] = {
		.write_u64 = cpu_shares_write_u64,
	},
#endif
#ifdef CONFIG_CFS_BANDWIDTH
	{
		.name = "cfs_quota_us",
		.read_s64 = cpu_cfs_quota_read_s64,
		.write_s64 = cpu_cfs_quota_write_s64,
	},
	{
		.name = "cfs_period_us",
		.read_u64 = cpu_cfs_period_read_u64,
		.write_u64 = cpu_cfs_period_write_u64,
	},
	{
		.name = "stat",
		.read_map = cpu_stats_show,
	},
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	{
		.name = "rt_runtime_us",
@@ -9335,4 +9768,3 @@ struct cgroup_subsys cpuacct_subsys = {
	.subsys_id = cpuacct_subsys_id,
};
#endif /* CONFIG_CGROUP_CPUACCT */
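
For context, the schedulability checks in the hunks above (tg_rt_schedulable() and tg_cfs_schedulable_down()) compare bandwidth limits via to_ratio(), which expresses runtime/period as a 2^20 fixed-point fraction so a child's normalized quota can be checked against its parent's hierarchal_quota without floating point. A minimal standalone illustration (userspace C, not part of the patch):

#include <stdio.h>
#include <stdint.h>

/* Mirrors the kernel's to_ratio(): div64_u64(runtime << 20, period). */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	return (runtime << 20) / period;
}

int main(void)
{
	/* 25ms of quota every 100ms is a quarter of a CPU: 0.25 * 2^20 */
	printf("%llu\n", (unsigned long long)to_ratio(100000, 25000)); /* 262144 */
	return 0;
}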