--- 0c91bf6544bf5d79b3d67ceea1f6a5cdf7c71104
+++ 95297f2c284ebeed72c63256397c783efbc289d4
@@ -1,3 +1,6 @@
+#ifdef CONFIG_SCHED_BFS
+#include "sched_bfs.c"
+#else
 /*
  * kernel/sched.c
  *
@@ -76,6 +79,7 @@
 #include
 #include "sched_cpupri.h"
+#include "sched_autogroup.h"
 #define CREATE_TRACE_POINTS
 #include
@@ -352,13 +356,20 @@ static inline struct task_group *task_gr
 	rcu_read_lock();
 	tg = __task_cred(p)->user->tg;
 	rcu_read_unlock();
+
+	return tg;
 #elif defined(CONFIG_CGROUP_SCHED)
-	tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
-			  struct task_group, css);
+	struct cgroup_subsys_state *css;
+
+	css = task_subsys_state(p, cpu_cgroup_subsys_id);
+	tg = container_of(css, struct task_group, css);
+
+	return autogroup_task_group(p, tg);
 #else
 	tg = &init_task_group;
-#endif
+	return tg;
+#endif
 }
 
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -542,7 +553,6 @@ struct rq {
 	struct load_weight load;
 	unsigned long nr_load_updates;
 	u64 nr_switches;
-	u64 nr_migrations_in;
 	struct cfs_rq cfs;
 	struct rt_rq rt;
@@ -742,7 +752,7 @@ sched_feat_write(struct file *filp, cons
 		size_t cnt, loff_t *ppos)
 {
 	char buf[64];
-	char *cmp = buf;
+	char *cmp;
 	int neg = 0;
 	int i;
@@ -753,6 +763,7 @@ sched_feat_write(struct file *filp, cons
 		return -EFAULT;
 
 	buf[cnt] = 0;
+	cmp = strstrip(buf);
 
 	if (strncmp(buf, "NO_", 3) == 0) {
 		neg = 1;
@@ -760,9 +771,7 @@ sched_feat_write(struct file *filp, cons
 	for (i = 0; sched_feat_names[i]; i++) {
-		int len = strlen(sched_feat_names[i]);
-
-		if (strncmp(cmp, sched_feat_names[i], len) == 0) {
+		if (strcmp(cmp, sched_feat_names[i]) == 0) {
 			if (neg)
 				sysctl_sched_features &= ~(1UL << i);
 			else
@@ -943,14 +952,25 @@ static inline void finish_lock_switch(st
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
 /*
+ * Check whether the task is waking, we use this to synchronize ->cpus_allowed
+ * against ttwu().
+ */
+static inline int task_is_waking(struct task_struct *p)
+{
+	return unlikely(p->state == TASK_WAKING);
+}
+
+/*
  * __task_rq_lock - lock the runqueue a given task resides on.
  * Must be called interrupts disabled.
  */
 static inline struct rq *__task_rq_lock(struct task_struct *p)
 	__acquires(rq->lock)
 {
+	struct rq *rq;
+
 	for (;;) {
-		struct rq *rq = task_rq(p);
+		rq = task_rq(p);
 		spin_lock(&rq->lock);
 		if (likely(rq == task_rq(p)))
 			return rq;
@@ -1623,7 +1643,7 @@ static void update_group_shares_cpu(stru
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-	unsigned long weight, rq_weight = 0, shares = 0;
+	unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
 	unsigned long *usd_rq_weight;
 	struct sched_domain *sd = data;
 	unsigned long flags;
@@ -1639,6 +1659,7 @@ static int tg_shares_up(struct task_grou
 		weight = tg->cfs_rq[i]->load.weight;
 		usd_rq_weight[i] = weight;
 
+		rq_weight += weight;
 		/*
 		 * If there are currently no tasks on the cpu pretend there
 		 * is one of average load so that when a new task gets to
@@ -1647,10 +1668,13 @@ static int tg_shares_up(struct task_grou
 		if (!weight)
 			weight = NICE_0_LOAD;
 
-		rq_weight += weight;
+		sum_weight += weight;
 		shares += tg->cfs_rq[i]->shares;
 	}
 
+	if (!rq_weight)
+		rq_weight = sum_weight;
+
 	if ((!shares && rq_weight) || shares > tg->shares)
 		shares = tg->shares;
@@ -1818,10 +1842,25 @@ static void cfs_rq_set_shares(struct cfs
 static void calc_load_account_active(struct rq *this_rq);
 static void update_sysctl(void);
 
+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+	set_task_rq(p, cpu);
+#ifdef CONFIG_SMP
+	/*
+	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
+	 * successfuly executed on another CPU. We must ensure that updates of
+	 * per-task data have been completed by this moment.
+	 */
+	smp_wmb();
+	task_thread_info(p)->cpu = cpu;
+#endif
+}
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
 #include "sched_rt.c"
+#include "sched_autogroup.c"
 #ifdef CONFIG_SCHED_DEBUG
 # include "sched_debug.c"
 #endif
@@ -1867,13 +1906,14 @@ static void update_avg(u64 *avg, u64 sam
 	*avg += diff >> 3;
 }
 
-static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
+static void
+enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
 {
 	if (wakeup)
 		p->se.start_runtime = p->se.sum_exec_runtime;
 
 	sched_info_queued(p);
-	p->sched_class->enqueue_task(rq, p, wakeup);
+	p->sched_class->enqueue_task(rq, p, wakeup, head);
 	p->se.on_rq = 1;
 }
@@ -1949,7 +1989,7 @@ static void activate_task(struct rq *rq,
 	if (task_contributes_to_load(p))
 		rq->nr_uninterruptible--;
 
-	enqueue_task(rq, p, wakeup);
+	enqueue_task(rq, p, wakeup, false);
 	inc_nr_running(rq);
 }
@@ -1974,20 +2014,6 @@ inline int task_curr(const struct task_s
 	return cpu_curr(task_cpu(p)) == p;
 }
 
-static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
-{
-	set_task_rq(p, cpu);
-#ifdef CONFIG_SMP
-	/*
-	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
-	 * successfuly executed on another CPU. We must ensure that updates of
-	 * per-task data have been completed by this moment.
-	 */
-	smp_wmb();
-	task_thread_info(p)->cpu = cpu;
-#endif
-}
-
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 				       const struct sched_class *prev_class,
 				       int oldprio, int running)
@@ -2014,21 +2040,15 @@ static inline void check_class_changed(s
  */
 void kthread_bind(struct task_struct *p, unsigned int cpu)
 {
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long flags;
-
 	/* Must have done schedule() in kthread() before we set_task_cpu */
 	if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
 		WARN_ON(1);
 		return;
 	}
 
-	spin_lock_irqsave(&rq->lock, flags);
-	set_task_cpu(p, cpu);
 	p->cpus_allowed = cpumask_of_cpu(cpu);
 	p->rt.nr_cpus_allowed = 1;
 	p->flags |= PF_THREAD_BOUND;
-	spin_unlock_irqrestore(&rq->lock, flags);
 }
 EXPORT_SYMBOL(kthread_bind);
@@ -2066,35 +2086,23 @@ task_hot(struct task_struct *p, u64 now,
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
 	int old_cpu = task_cpu(p);
-	struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
-	struct cfs_rq *old_cfsrq = task_cfs_rq(p),
-		      *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
-	u64 clock_offset;
-	clock_offset = old_rq->clock - new_rq->clock;
+#ifdef CONFIG_SCHED_DEBUG
+	/*
+	 * We should never call set_task_cpu() on a blocked task,
+	 * ttwu() will sort out the placement.
+	 */
+	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
+			!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+#endif
 
 	trace_sched_migrate_task(p, new_cpu);
 
-#ifdef CONFIG_SCHEDSTATS
-	if (p->se.wait_start)
-		p->se.wait_start -= clock_offset;
-	if (p->se.sleep_start)
-		p->se.sleep_start -= clock_offset;
-	if (p->se.block_start)
-		p->se.block_start -= clock_offset;
-#endif
 	if (old_cpu != new_cpu) {
 		p->se.nr_migrations++;
-		new_rq->nr_migrations_in++;
-#ifdef CONFIG_SCHEDSTATS
-		if (task_hot(p, old_rq->clock, NULL))
-			schedstat_inc(p, se.nr_forced2_migrations);
-#endif
 		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
 			      1, 1, NULL, 0);
 	}
-	p->se.vruntime -= old_cfsrq->min_vruntime -
-					 new_cfsrq->min_vruntime;
 
 	__set_task_cpu(p, new_cpu);
 }
@@ -2327,6 +2335,69 @@ void task_oncpu_function_call(struct tas
 	preempt_enable();
 }
 
+#ifdef CONFIG_SMP
+/*
+ * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
+ */
+static int select_fallback_rq(int cpu, struct task_struct *p)
+{
+	int dest_cpu;
+	const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
+
+	/* Look for allowed, online CPU in same node. */
+	for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
+		if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+			return dest_cpu;
+
+	/* Any allowed, online CPU? */
+	dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
+	if (dest_cpu < nr_cpu_ids)
+		return dest_cpu;
+
+	/* No more Mr. Nice Guy. */
+	if (unlikely(dest_cpu >= nr_cpu_ids)) {
+		dest_cpu = cpuset_cpus_allowed_fallback(p);
+		/*
+		 * Don't tell them about moving exiting tasks or
+		 * kernel threads (both mm NULL), since they never
+		 * leave kernel.
+		 */
+		if (p->mm && printk_ratelimit()) {
+			printk(KERN_INFO "process %d (%s) no "
+					"longer affine to cpu%d\n",
+					task_pid_nr(p), p->comm, cpu);
+		}
+	}
+
+	return dest_cpu;
+}
+
+/*
+ * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
+ */
+static inline
+int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
+{
+	int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
+
+	/*
+	 * In order not to call set_task_cpu() on a blocking task we need
+	 * to rely on ttwu() to place the task on a valid ->cpus_allowed
+	 * cpu.
+	 *
+	 * Since this is common to all placement strategies, this lives here.
+	 *
+	 * [ this allows ->select_task() to simply return task_cpu(p) and
+	 *   not worry about this generic constraint ]
+	 */
+	if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
+		     !cpu_online(cpu)))
+		cpu = select_fallback_rq(task_cpu(p), p);
+
+	return cpu;
+}
+#endif
+
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
@@ -2375,22 +2446,34 @@ static int try_to_wake_up(struct task_st
 	 *
 	 * First fix up the nr_uninterruptible count:
 	 */
-	if (task_contributes_to_load(p))
-		rq->nr_uninterruptible--;
+	if (task_contributes_to_load(p)) {
+		if (likely(cpu_online(orig_cpu)))
+			rq->nr_uninterruptible--;
+		else
+			this_rq()->nr_uninterruptible--;
+	}
 	p->state = TASK_WAKING;
-	task_rq_unlock(rq, &flags);
 
-	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+	if (p->sched_class->task_waking)
+		p->sched_class->task_waking(rq, p);
+
+	cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
 	if (cpu != orig_cpu)
 		set_task_cpu(p, cpu);
+	__task_rq_unlock(rq);
 
-	rq = task_rq_lock(p, &flags);
-
-	if (rq != orig_rq)
-		update_rq_clock(rq);
+	rq = cpu_rq(cpu);
+	spin_lock(&rq->lock);
+	update_rq_clock(rq);
 
+	/*
+	 * We migrated the task without holding either rq->lock, however
+	 * since the task is not on the task list itself, nobody else
+	 * will try and migrate the task, hence the rq should match the
+	 * cpu we just moved it to.
+	 */
+	WARN_ON(task_cpu(p) != cpu);
 	WARN_ON(p->state != TASK_WAKING);
-	cpu = task_cpu(p);
 
 #ifdef CONFIG_SCHEDSTATS
 	schedstat_inc(rq, ttwu_count);
@@ -2443,8 +2526,8 @@ out_running:
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
-	if (p->sched_class->task_wake_up)
-		p->sched_class->task_wake_up(rq, p);
+	if (p->sched_class->task_woken)
+		p->sched_class->task_woken(rq, p);
 
 	if (unlikely(rq->idle_stamp)) {
 		u64 delta = rq->clock - rq->idle_stamp;
@@ -2524,7 +2607,6 @@ static void __sched_fork(struct task_str
 	p->se.nr_failed_migrations_running = 0;
 	p->se.nr_failed_migrations_hot = 0;
 	p->se.nr_forced_migrations = 0;
-	p->se.nr_forced2_migrations = 0;
 	p->se.nr_wakeups = 0;
 	p->se.nr_wakeups_sync = 0;
@@ -2545,14 +2627,6 @@ static void __sched_fork(struct task_str
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
-
-	/*
-	 * We mark the process as running here, but have not actually
-	 * inserted it onto the runqueue yet. This guarantees that
-	 * nobody will actually run it, and a signal or other external
-	 * event cannot wake it up and insert it on the runqueue either.
-	 */
-	p->state = TASK_RUNNING;
 }
 
 /*
@@ -2563,6 +2637,12 @@ void sched_fork(struct task_struct *p, i
 	int cpu = get_cpu();
 
 	__sched_fork(p);
+	/*
+	 * We mark the process as running here. This guarantees that
+	 * nobody will actually run it, and a signal or other external
+	 * event cannot wake it up and insert it on the runqueue either.
+	 */
+	p->state = TASK_RUNNING;
 
 	/*
 	 * Revert to default priority/policy on fork if requested.
@@ -2594,9 +2674,9 @@ void sched_fork(struct task_struct *p, i
 	if (!rt_prio(p->prio))
 		p->sched_class = &fair_sched_class;
 
-#ifdef CONFIG_SMP
-	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
-#endif
+	if (p->sched_class->task_fork)
+		p->sched_class->task_fork(p);
+
 	set_task_cpu(p, cpu);
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -2626,28 +2706,38 @@ void wake_up_new_task(struct task_struct
 {
 	unsigned long flags;
 	struct rq *rq;
+	int cpu = get_cpu();
 
+#ifdef CONFIG_SMP
 	rq = task_rq_lock(p, &flags);
-	BUG_ON(p->state != TASK_RUNNING);
-	update_rq_clock(rq);
+	p->state = TASK_WAKING;
 
-	if (!p->sched_class->task_new || !current->se.on_rq) {
-		activate_task(rq, p, 0);
-	} else {
-		/*
-		 * Let the scheduling class do new task startup
-		 * management (if any):
-		 */
-		p->sched_class->task_new(rq, p);
-		inc_nr_running(rq);
-	}
+	/*
+	 * Fork balancing, do it here and not earlier because:
+	 *  - cpus_allowed can change in the fork path
+	 *  - any previously selected cpu might disappear through hotplug
+	 *
+	 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
+	 * without people poking at ->cpus_allowed.
+	 */
+	cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
+	set_task_cpu(p, cpu);
+
+	p->state = TASK_RUNNING;
+	task_rq_unlock(rq, &flags);
+#endif
+
+	rq = task_rq_lock(p, &flags);
+	update_rq_clock(rq);
+	activate_task(rq, p, 0);
 	trace_sched_wakeup_new(rq, p, 1);
 	check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
-	if (p->sched_class->task_wake_up)
-		p->sched_class->task_wake_up(rq, p);
+	if (p->sched_class->task_woken)
+		p->sched_class->task_woken(rq, p);
 #endif
 	task_rq_unlock(rq, &flags);
+	put_cpu();
 }
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -3034,15 +3124,6 @@ static void calc_load_account_active(str
 }
 
 /*
- * Externally visible per-cpu scheduler statistics:
- * cpu_nr_migrations(cpu) - number of migrations into that cpu
- */
-u64 cpu_nr_migrations(int cpu)
-{
-	return cpu_rq(cpu)->nr_migrations_in;
-}
-
-/*
  * Update rq->cpu_load[] statistics. This function is usually called every
  * scheduler tick (TICK_NSEC).
  */
@@ -3124,24 +3205,28 @@ static void double_rq_unlock(struct rq *
 }
 
 /*
- * If dest_cpu is allowed for this process, migrate the task to it.
- * This is accomplished by forcing the cpu_allowed mask to only
- * allow dest_cpu, which will force the cpu onto dest_cpu. Then
- * the cpu_allowed mask is restored.
+ * sched_exec - execve() is a valuable balancing opportunity, because at
+ * this point the task has the smallest effective memory and cache footprint.
  */
-static void sched_migrate_task(struct task_struct *p, int dest_cpu)
+void sched_exec(void)
 {
+	struct task_struct *p = current;
 	struct migration_req req;
 	unsigned long flags;
 	struct rq *rq;
+	int dest_cpu;
 
 	rq = task_rq_lock(p, &flags);
-	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
-	    || unlikely(!cpu_active(dest_cpu)))
-		goto out;
+	dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
+	if (dest_cpu == smp_processor_id())
+		goto unlock;
 
-	/* force the process onto the specified CPU */
-	if (migrate_task(p, dest_cpu, &req)) {
+	/*
+	 * select_task_rq() can race against ->cpus_allowed
+	 */
+	if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
+	    likely(cpu_active(dest_cpu)) &&
+	    migrate_task(p, dest_cpu, &req)) {
 		/* Need to wait for migration thread (might exit: take ref).
		 */
 		struct task_struct *mt = rq->migration_thread;
@@ -3153,24 +3238,11 @@ static void sched_migrate_task(struct ta
 		return;
 	}
-out:
+unlock:
 	task_rq_unlock(rq, &flags);
 }
 
 /*
- * sched_exec - execve() is a valuable balancing opportunity, because at
- * this point the task has the smallest effective memory and cache footprint.
- */
-void sched_exec(void)
-{
-	int new_cpu, this_cpu = get_cpu();
-	new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
-	put_cpu();
-	if (new_cpu != this_cpu)
-		sched_migrate_task(current, new_cpu);
-}
-
-/*
  * pull_task - move a task from a remote runqueue to the local runqueue.
 * Both runqueues must be locked.
 */
@@ -3617,7 +3689,7 @@ unsigned long __weak arch_scale_freq_pow
 unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
 {
-	unsigned long weight = cpumask_weight(sched_domain_span(sd));
+	unsigned long weight = sd->span_weight;
 	unsigned long smt_gain = sd->smt_gain;
 
 	smt_gain /= weight;
@@ -3650,7 +3722,7 @@ unsigned long scale_rt_power(int cpu)
 static void update_cpu_power(struct sched_domain *sd, int cpu)
 {
-	unsigned long weight = cpumask_weight(sched_domain_span(sd));
+	unsigned long weight = sd->span_weight;
 	unsigned long power = SCHED_LOAD_SCALE;
 	struct sched_group *sdg = sd->groups;
@@ -5149,21 +5221,9 @@ void account_idle_time(cputime_t cputime
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 	cputime64_t cputime64 = cputime_to_cputime64(cputime);
 	struct rq *rq = this_rq();
-	struct task_struct *task;
 
-	if (atomic_read(&rq->nr_iowait) > 0) {
-		for (task = current; task != &init_task; task = task->parent)
-			;
-		/* task now points to init */
-		for_each_process(task) {
-			/* this pointlessly prints the name and PID of each task */
-			if (task->in_iowait) {
-				task->iowait = cputime64_add(task->iowait, cputime64);
-				//printk("%s[%d]\n", task->comm, task->pid);
-			}
-		}
+	if (atomic_read(&rq->nr_iowait) > 0)
 		cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
-	}
 	else
 		cpustat->idle = cputime64_add(cpustat->idle, cputime64);
 }
@@ -5223,45 +5283,90 @@ cputime_t task_stime(struct task_struct
 {
 	return p->stime;
 }
+
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	struct task_cputime cputime;
+
+	thread_group_cputime(p, &cputime);
+
+	*ut = cputime.utime;
+	*st = cputime.stime;
+}
 #else
+
+#ifndef nsecs_to_cputime
+# define nsecs_to_cputime(__nsecs) \
+	msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC))
+#endif
+
 cputime_t task_utime(struct task_struct *p)
 {
-	clock_t utime = cputime_to_clock_t(p->utime),
-		total = utime + cputime_to_clock_t(p->stime);
+	cputime_t utime = p->utime, total = utime + p->stime;
 	u64 temp;
 
 	/*
 	 * Use CFS's precise accounting:
 	 */
-	temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
+	temp = (u64)nsecs_to_cputime(p->se.sum_exec_runtime);
 
 	if (total) {
 		temp *= utime;
 		do_div(temp, total);
 	}
-	utime = (clock_t)temp;
+	utime = (cputime_t)temp;
 
-	p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
+	p->prev_utime = max(p->prev_utime, utime);
 
 	return p->prev_utime;
 }
 
 cputime_t task_stime(struct task_struct *p)
 {
-	clock_t stime;
+	cputime_t stime;
 
 	/*
 	 * Use CFS's precise accounting.
	 * (we subtract utime from
 	 * the total, to make sure the total observed by userspace
 	 * grows monotonically - apps rely on that):
 	 */
-	stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
-			cputime_to_clock_t(task_utime(p));
+	stime = nsecs_to_cputime(p->se.sum_exec_runtime) - task_utime(p);
 
 	if (stime >= 0)
-		p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
+		p->prev_stime = max(p->prev_stime, stime);
 
 	return p->prev_stime;
 }
+
+/*
+ * Must be called with siglock held.
+ */
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	struct signal_struct *sig = p->signal;
+	struct task_cputime cputime;
+	cputime_t rtime, utime, total;
+
+	thread_group_cputime(p, &cputime);
+
+	total = cputime_add(cputime.utime, cputime.stime);
+	rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
+
+	if (total) {
+		u64 temp = rtime;
+
+		temp *= cputime.utime;
+		do_div(temp, total);
+		utime = (cputime_t)temp;
+	} else
+		utime = rtime;
+
+	sig->prev_utime = max(sig->prev_utime, utime);
+	sig->prev_stime = max(sig->prev_stime,
+			cputime_sub(rtime, sig->prev_utime));
+
+	*ut = sig->prev_utime;
+	*st = sig->prev_stime;
+}
 #endif
 
 inline cputime_t task_gtime(struct task_struct *p)
@@ -5553,7 +5658,7 @@ int mutex_spin_on_owner(struct mutex *lo
 	 * the mutex owner just released it and exited.
 	 */
 	if (probe_kernel_address(&owner->cpu, cpu))
-		goto out;
+		return 0;
 #else
 	cpu = owner->cpu;
 #endif
@@ -5563,14 +5668,14 @@ int mutex_spin_on_owner(struct mutex *lo
 	 * the cpu field may no longer be valid.
 	 */
 	if (cpu >= nr_cpumask_bits)
-		goto out;
+		return 0;
 
 	/*
 	 * We need to validate that we can do a
 	 * get_cpu() and that we have the percpu area.
 	 */
 	if (!cpu_online(cpu))
-		goto out;
+		return 0;
 
 	rq = cpu_rq(cpu);
@@ -5589,7 +5694,7 @@ int mutex_spin_on_owner(struct mutex *lo
 		cpu_relax();
 	}
-out:
+
 	return 1;
 }
 #endif
@@ -5937,14 +6042,15 @@ EXPORT_SYMBOL(wait_for_completion_killab
  */
 bool try_wait_for_completion(struct completion *x)
 {
+	unsigned long flags;
 	int ret = 1;
 
-	spin_lock_irq(&x->wait.lock);
+	spin_lock_irqsave(&x->wait.lock, flags);
 	if (!x->done)
 		ret = 0;
 	else
 		x->done--;
-	spin_unlock_irq(&x->wait.lock);
+	spin_unlock_irqrestore(&x->wait.lock, flags);
 	return ret;
 }
 EXPORT_SYMBOL(try_wait_for_completion);
@@ -5959,12 +6065,13 @@ EXPORT_SYMBOL(try_wait_for_completion);
  */
 bool completion_done(struct completion *x)
 {
+	unsigned long flags;
 	int ret = 1;
 
-	spin_lock_irq(&x->wait.lock);
+	spin_lock_irqsave(&x->wait.lock, flags);
 	if (!x->done)
 		ret = 0;
-	spin_unlock_irq(&x->wait.lock);
+	spin_unlock_irqrestore(&x->wait.lock, flags);
 	return ret;
 }
 EXPORT_SYMBOL(completion_done);
@@ -6058,7 +6165,7 @@ void rt_mutex_setprio(struct task_struct
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (on_rq) {
-		enqueue_task(rq, p, 0);
+		enqueue_task(rq, p, 0, oldprio < prio);
 
 		check_class_changed(rq, p, prev_class, oldprio, running);
 	}
@@ -6102,7 +6209,7 @@ void set_user_nice(struct task_struct *p
 	delta = p->prio - old_prio;
 
 	if (on_rq) {
-		enqueue_task(rq, p, 0);
+		enqueue_task(rq, p, 0, false);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -6493,7 +6600,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_
 		return -EINVAL;
 
 	retval = -ESRCH;
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	p = find_process_by_pid(pid);
 	if (p) {
 		retval = security_task_getscheduler(p);
@@ -6501,7 +6608,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_
 			retval = p->policy | (p->sched_reset_on_fork ?
				SCHED_RESET_ON_FORK : 0);
 	}
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	return retval;
 }
@@ -6519,7 +6626,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, p
 	if (!param || pid < 0)
 		return -EINVAL;
 
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	p = find_process_by_pid(pid);
 	retval = -ESRCH;
 	if (!p)
@@ -6530,7 +6637,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, p
 		goto out_unlock;
 
 	lp.sched_priority = p->rt_priority;
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 
 	/*
 	 * This one might sleep, we cannot do it with a spinlock held ...
@@ -6540,7 +6647,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, p
 	return retval;
 
 out_unlock:
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	return retval;
 }
@@ -6551,22 +6658,18 @@ long sched_setaffinity(pid_t pid, const
 	int retval;
 
 	get_online_cpus();
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 
 	p = find_process_by_pid(pid);
 	if (!p) {
-		read_unlock(&tasklist_lock);
+		rcu_read_unlock();
 		put_online_cpus();
 		return -ESRCH;
 	}
 
-	/*
-	 * It is not safe to call set_cpus_allowed with the
-	 * tasklist_lock held. We will bump the task_struct's
-	 * usage count and then drop tasklist_lock.
-	 */
+	/* Prevent p going away */
 	get_task_struct(p);
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 
 	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
 		retval = -ENOMEM;
@@ -6647,10 +6750,12 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t
 long sched_getaffinity(pid_t pid, struct cpumask *mask)
 {
 	struct task_struct *p;
+	unsigned long flags;
+	struct rq *rq;
 	int retval;
 
 	get_online_cpus();
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 
 	retval = -ESRCH;
 	p = find_process_by_pid(pid);
@@ -6661,10 +6766,12 @@ long sched_getaffinity(pid_t pid, struct
 	if (retval)
 		goto out_unlock;
 
+	rq = task_rq_lock(p, &flags);
 	cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
+	task_rq_unlock(rq, &flags);
 
 out_unlock:
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	put_online_cpus();
 
 	return retval;
@@ -6903,6 +7010,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, p
 {
 	struct task_struct *p;
 	unsigned int time_slice;
+	unsigned long flags;
+	struct rq *rq;
 	int retval;
 	struct timespec t;
@@ -6910,7 +7019,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, p
 		return -EINVAL;
 
 	retval = -ESRCH;
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	p = find_process_by_pid(pid);
 	if (!p)
 		goto out_unlock;
@@ -6919,15 +7028,17 @@ SYSCALL_DEFINE2(sched_rr_get_interval, p
 	if (retval)
 		goto out_unlock;
 
-	time_slice = p->sched_class->get_rr_interval(p);
+	rq = task_rq_lock(p, &flags);
+	time_slice = p->sched_class->get_rr_interval(rq, p);
+	task_rq_unlock(rq, &flags);
 
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	jiffies_to_timespec(time_slice, &t);
 	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
 	return retval;
 
 out_unlock:
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	return retval;
 }
@@ -6939,7 +7050,7 @@ void sched_show_task(struct task_struct
 	unsigned state;
 
 	state = p->state ? __ffs(p->state) + 1 : 0;
-	printk(KERN_INFO "%-15.15s %c", p->comm,
+	printk(KERN_INFO "%-13.13s %c", p->comm,
 		state < sizeof(stat_nam) - 1 ?
		stat_nam[state] : '?');
 #if BITS_PER_LONG == 32
 	if (state == TASK_RUNNING)
@@ -7018,6 +7129,7 @@ void __cpuinit init_idle(struct task_str
 	spin_lock_irqsave(&rq->lock, flags);
 
 	__sched_fork(idle);
+	idle->state = TASK_RUNNING;
 	idle->se.exec_start = sched_clock();
 
 	cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
@@ -7112,7 +7224,19 @@ int set_cpus_allowed_ptr(struct task_str
 	struct rq *rq;
 	int ret = 0;
 
+	/*
+	 * Serialize against TASK_WAKING so that ttwu() and wunt() can
+	 * drop the rq->lock and still rely on ->cpus_allowed.
+	 */
+again:
+	while (task_is_waking(p))
+		cpu_relax();
 	rq = task_rq_lock(p, &flags);
+	if (task_is_waking(p)) {
+		task_rq_unlock(rq, &flags);
+		goto again;
+	}
+
 	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
 		ret = -EINVAL;
 		goto out;
@@ -7141,7 +7265,7 @@ int set_cpus_allowed_ptr(struct task_str
 		get_task_struct(mt);
 		task_rq_unlock(rq, &flags);
-		wake_up_process(rq->migration_thread);
+		wake_up_process(mt);
 		put_task_struct(mt);
 		wait_for_completion(&req.done);
 		tlb_migrate_finish(p->mm);
@@ -7168,7 +7292,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 {
 	struct rq *rq_dest, *rq_src;
-	int ret = 0, on_rq;
+	int ret = 0;
 
 	if (unlikely(!cpu_active(dest_cpu)))
 		return ret;
@@ -7180,19 +7304,17 @@ static int __migrate_task(struct task_st
 	/* Already moved. */
 	if (task_cpu(p) != src_cpu)
 		goto done;
-	/* Waking up, don't get in the way of try_to_wake_up(). */
-	if (p->state == TASK_WAKING)
-		goto fail;
 	/* Affinity changed (again). */
 	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
 		goto fail;
 
-	on_rq = p->se.on_rq;
-	if (on_rq)
+	/*
+	 * If we're not on a rq, the next wake-up will ensure we're
+	 * placed properly.
+	 */
+	if (p->se.on_rq) {
 		deactivate_task(rq_src, p, 0);
-
-	set_task_cpu(p, dest_cpu);
-	if (on_rq) {
+		set_task_cpu(p, dest_cpu);
 		activate_task(rq_dest, p, 0);
 		check_preempt_curr(rq_dest, p, 0);
 	}
@@ -7271,57 +7393,29 @@ static int migration_thread(void *data)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-
-static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
-{
-	int ret;
-
-	local_irq_disable();
-	ret = __migrate_task(p, src_cpu, dest_cpu);
-	local_irq_enable();
-	return ret;
-}
-
 /*
  * Figure out where task on dead CPU should go, use force if necessary.
 */
-static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
+void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 {
-	int dest_cpu;
-	const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
-
-again:
-	/* Look for allowed, online CPU in same node. */
-	for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
-		if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
-			goto move;
-
-	/* Any allowed, online CPU? */
-	dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
-	if (dest_cpu < nr_cpu_ids)
-		goto move;
-
-	/* No more Mr. Nice Guy. */
-	if (dest_cpu >= nr_cpu_ids) {
-		cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
-		dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
+	struct rq *rq = cpu_rq(dead_cpu);
+	int needs_cpu, uninitialized_var(dest_cpu);
+	unsigned long flags;
 
-		/*
-		 * Don't tell them about moving exiting tasks or
-		 * kernel threads (both mm NULL), since they never
-		 * leave kernel.
-		 */
-		if (p->mm && printk_ratelimit()) {
-			printk(KERN_INFO "process %d (%s) no "
-					"longer affine to cpu%d\n",
-					task_pid_nr(p), p->comm, dead_cpu);
-		}
-	}
+	local_irq_save(flags);
 
-move:
-	/* It can have affinity changed while we were choosing.
-	 */
-		goto again;
+	spin_lock(&rq->lock);
+	needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
+	if (needs_cpu)
+		dest_cpu = select_fallback_rq(dead_cpu, p);
+	spin_unlock(&rq->lock);
+	/*
+	 * It can only fail if we race with set_cpus_allowed(),
+	 * in the racer should migrate the task anyway.
+	 */
+	if (needs_cpu)
+		__migrate_task(p, dead_cpu, dest_cpu);
+	local_irq_restore(flags);
 }
 
 /*
@@ -7669,10 +7763,9 @@ migration_call(struct notifier_block *nf
 	unsigned long flags;
 	struct rq *rq;
 
-	switch (action) {
+	switch (action & ~CPU_TASKS_FROZEN) {
 
 	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
 		p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
 		if (IS_ERR(p))
 			return NOTIFY_BAD;
@@ -7687,7 +7780,6 @@ migration_call(struct notifier_block *nf
 		break;
 
 	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
 		/* Strictly unnecessary, as first user will wake it. */
 		wake_up_process(cpu_rq(cpu)->migration_thread);
@@ -7704,7 +7796,6 @@ migration_call(struct notifier_block *nf
 
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
 		if (!cpu_rq(cpu)->migration_thread)
 			break;
 		/* Unbind it from offline cpu so it can run. Fall thru. */
@@ -7715,14 +7806,22 @@ migration_call(struct notifier_block *nf
 		cpu_rq(cpu)->migration_thread = NULL;
 		break;
 
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
-		migrate_live_tasks(cpu);
+	case CPU_POST_DEAD:
+		/*
+		 * Bring the migration thread down in CPU_POST_DEAD event,
+		 * since the timers should have got migrated by now and thus
+		 * we should not see a deadlock between trying to kill the
+		 * migration thread and the sched_rt_period_timer.
+		 */
 		rq = cpu_rq(cpu);
 		kthread_stop(rq->migration_thread);
 		put_task_struct(rq->migration_thread);
 		rq->migration_thread = NULL;
+		break;
+
+	case CPU_DEAD:
+		migrate_live_tasks(cpu);
+		rq = cpu_rq(cpu);
 		/* Idle task back to normal (off runqueue, low prio) */
 		spin_lock_irq(&rq->lock);
 		update_rq_clock(rq);
@@ -7731,7 +7830,6 @@ migration_call(struct notifier_block *nf
 		rq->idle->sched_class = &idle_sched_class;
 		migrate_dead_tasks(cpu);
 		spin_unlock_irq(&rq->lock);
-		cpuset_unlock();
 		migrate_nr_uninterruptible(rq);
 		BUG_ON(rq->nr_running != 0);
 		calc_global_load_remove(rq);
@@ -7755,7 +7853,6 @@ migration_call(struct notifier_block *nf
 		break;
 
 	case CPU_DYING:
-	case CPU_DYING_FROZEN:
 		/* Update our root-domain */
 		rq = cpu_rq(cpu);
 		spin_lock_irqsave(&rq->lock, flags);
@@ -8075,6 +8172,9 @@ cpu_attach_domain(struct sched_domain *s
 	struct rq *rq = cpu_rq(cpu);
 	struct sched_domain *tmp;
 
+	for (tmp = sd; tmp; tmp = tmp->parent)
+		tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
+
 	/* Remove the sched domains which do not contribute to scheduling.
	 */
 	for (tmp = sd; tmp; ) {
 		struct sched_domain *parent = tmp->parent;
@@ -9499,6 +9599,8 @@ void __init sched_init(void)
 	init_task_group.parent = &root_task_group;
 	list_add(&init_task_group.siblings, &root_task_group.children);
 #endif /* CONFIG_USER_SCHED */
+
+	autogroup_init(&init_task);
 #endif /* CONFIG_GROUP_SCHED */
 
 #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
@@ -9656,24 +9758,13 @@ static inline int preempt_count_equals(i
 	return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
 }
 
-static int __might_sleep_init_called;
-int __init __might_sleep_init(void)
-{
-	__might_sleep_init_called = 1;
-	return 0;
-}
-early_initcall(__might_sleep_init);
-
 void __might_sleep(char *file, int line, int preempt_offset)
 {
 #ifdef in_atomic
 	static unsigned long prev_jiffy;	/* ratelimiting */
 
 	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
-	    oops_in_progress)
-		return;
-	if (system_state != SYSTEM_RUNNING &&
-	    (!__might_sleep_init_called || system_state != SYSTEM_BOOTING))
+	    system_state != SYSTEM_RUNNING || oops_in_progress)
 		return;
 	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
 		return;
@@ -10049,15 +10140,11 @@ void sched_destroy_group(struct task_gro
 /* change task's runqueue when it moves between groups.
  *	The caller of this function should have put the task in its new group
  *	by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
- *	reflect its new group.
+ *	reflect its new group. Called with the runqueue lock held.
  */
-void sched_move_task(struct task_struct *tsk)
+void __sched_move_task(struct task_struct *tsk, struct rq *rq)
 {
 	int on_rq, running;
-	unsigned long flags;
-	struct rq *rq;
-
-	rq = task_rq_lock(tsk, &flags);
 
 	update_rq_clock(rq);
@@ -10073,13 +10160,22 @@ void sched_move_task(struct task_struct
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	if (tsk->sched_class->moved_group)
-		tsk->sched_class->moved_group(tsk);
+		tsk->sched_class->moved_group(tsk, on_rq);
 #endif
 
 	if (unlikely(running))
 		tsk->sched_class->set_curr_task(rq);
 	if (on_rq)
-		enqueue_task(rq, tsk, 0);
+		enqueue_task(rq, tsk, 0, false);
+}
+
+void sched_move_task(struct task_struct *tsk)
+{
+	struct rq *rq;
+	unsigned long flags;
+
+	rq = task_rq_lock(tsk, &flags);
+	__sched_move_task(tsk, rq);
 	task_rq_unlock(rq, &flags);
 }
@@ -10493,15 +10589,6 @@ cpu_cgroup_destroy(struct cgroup_subsys
 static int
 cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
-	if ((current != tsk) && (!capable(CAP_SYS_NICE))) {
-		const struct cred *cred = current_cred(), *tcred;
-
-		tcred = __task_cred(tsk);
-
-		if (cred->euid != tcred->uid && cred->euid != tcred->suid)
-			return -EPERM;
-	}
-
 #ifdef CONFIG_RT_GROUP_SCHED
 	if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
 		return -EINVAL;
@@ -10860,12 +10947,30 @@ static void cpuacct_charge(struct task_s
 }
 
 /*
+ * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
+ * in cputime_t units. As a result, cpuacct_update_stats calls
+ * percpu_counter_add with values large enough to always overflow the
+ * per cpu batch limit causing bad SMP scalability.
+ *
+ * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
+ * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
+ * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
+ */
+#ifdef CONFIG_SMP
+#define CPUACCT_BATCH	\
+	min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
+#else
+#define CPUACCT_BATCH	0
+#endif
+
+/*
  * Charge the system/user time to the task's accounting group.
  */
 static void cpuacct_update_stats(struct task_struct *tsk,
 		enum cpuacct_stat_index idx, cputime_t val)
 {
 	struct cpuacct *ca;
+	int batch = CPUACCT_BATCH;
 
 	if (unlikely(!cpuacct_subsys.active))
 		return;
@@ -10874,7 +10979,7 @@ static void cpuacct_update_stats(struct
 	ca = task_ca(tsk);
 
 	do {
-		percpu_counter_add(&ca->cpustat[idx], val);
+		__percpu_counter_add(&ca->cpustat[idx], val, batch);
 		ca = ca->parent;
 	} while (ca);
 	rcu_read_unlock();
@@ -10998,3 +11103,4 @@ void synchronize_sched_expedited(void)
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
 
 #endif /* #else #ifndef CONFIG_SMP */
+#endif /* CONFIG_SCHED_BFS */
\ No newline at end of file