git - ziggy471-frankenstein-kernel.git/blobdiff

blob:0c91bf6544bf5d79b3d67ceea1f6a5cdf7c71104 -> blob:95297f2c284ebeed72c63256397c783efbc289d4

--- kernel/sched.c

+++ kernel/sched.c

@@ -1,3 +1,6 @@

+#ifdef CONFIG_SCHED_BFS

+#include "sched_bfs.c"

+#else

* kernel/sched.c

@@ -76,6 +79,7 @@

#include <asm/irq_regs.h>

#include "sched_cpupri.h"

+#include "sched_autogroup.h"

#define CREATE_TRACE_POINTS

#include <trace/events/sched.h>

@@ -352,13 +356,20 @@ static inline struct task_group *task_gr

rcu_read_lock();

tg = __task_cred(p)->user->tg;

rcu_read_unlock();

+ return tg;

#elif defined(CONFIG_CGROUP_SCHED)

- tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),

- struct task_group, css);

+ struct cgroup_subsys_state *css;

+ css = task_subsys_state(p, cpu_cgroup_subsys_id);

+ tg = container_of(css, struct task_group, css);

+ return autogroup_task_group(p, tg);

#else

tg = &init_task_group;

-#endif

return tg;

+#endif

}

/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */

@@ -542,7 +553,6 @@ struct rq {

struct load_weight load;

unsigned long nr_load_updates;

u64 nr_switches;

- u64 nr_migrations_in;

struct cfs_rq cfs;

struct rt_rq rt;

@@ -742,7 +752,7 @@ sched_feat_write(struct file *filp, cons

size_t cnt, loff_t *ppos)

{

char buf[64];

- char *cmp = buf;

+ char *cmp;

int neg = 0;

int i;

@@ -753,6 +763,7 @@ sched_feat_write(struct file *filp, cons

return -EFAULT;

buf[cnt] = 0;

+ cmp = strstrip(buf);

if (strncmp(buf, "NO_", 3) == 0) {

neg = 1;

@@ -760,9 +771,7 @@ sched_feat_write(struct file *filp, cons

}

for (i = 0; sched_feat_names[i]; i++) {

- int len = strlen(sched_feat_names[i]);

- if (strncmp(cmp, sched_feat_names[i], len) == 0) {

+ if (strcmp(cmp, sched_feat_names[i]) == 0) {

if (neg)

sysctl_sched_features &= ~(1UL << i);

else

@@ -943,14 +952,25 @@ static inline void finish_lock_switch(st

#endif /* __ARCH_WANT_UNLOCKED_CTXSW */

+ * Check whether the task is waking, we use this to synchronize ->cpus_allowed

+ * against ttwu().

+ */

+static inline int task_is_waking(struct task_struct *p)

+ return unlikely(p->state == TASK_WAKING);

+/*

* __task_rq_lock - lock the runqueue a given task resides on.

* Must be called interrupts disabled.

static inline struct rq *__task_rq_lock(struct task_struct *p)

__acquires(rq->lock)

{

+ struct rq *rq;

for (;;) {

- struct rq *rq = task_rq(p);

+ rq = task_rq(p);

spin_lock(&rq->lock);

if (likely(rq == task_rq(p)))

return rq;

@@ -1623,7 +1643,7 @@ static void update_group_shares_cpu(stru

static int tg_shares_up(struct task_group *tg, void *data)

{

- unsigned long weight, rq_weight = 0, shares = 0;

+ unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;

unsigned long *usd_rq_weight;

struct sched_domain *sd = data;

unsigned long flags;

@@ -1639,6 +1659,7 @@ static int tg_shares_up(struct task_grou

weight = tg->cfs_rq[i]->load.weight;

usd_rq_weight[i] = weight;

+ rq_weight += weight;

* If there are currently no tasks on the cpu pretend there

* is one of average load so that when a new task gets to

@@ -1647,10 +1668,13 @@ static int tg_shares_up(struct task_grou

if (!weight)

weight = NICE_0_LOAD;

- rq_weight += weight;

+ sum_weight += weight;

shares += tg->cfs_rq[i]->shares;

}

+ if (!rq_weight)

+ rq_weight = sum_weight;

if ((!shares && rq_weight) || shares > tg->shares)

shares = tg->shares;

@@ -1818,10 +1842,25 @@ static void cfs_rq_set_shares(struct cfs

static void calc_load_account_active(struct rq *this_rq);

static void update_sysctl(void);

+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)

+ set_task_rq(p, cpu);

+#ifdef CONFIG_SMP

+ /*

+ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be

+ * successfully executed on another CPU. We must ensure that updates of

+ * per-task data have been completed by this moment.

+ */

+ smp_wmb();

+ task_thread_info(p)->cpu = cpu;

+#endif

#include "sched_stats.h"

#include "sched_idletask.c"

#include "sched_fair.c"

#include "sched_rt.c"

+#include "sched_autogroup.c"

#ifdef CONFIG_SCHED_DEBUG

# include "sched_debug.c"

#endif

@@ -1867,13 +1906,14 @@ static void update_avg(u64 *avg, u64 sam

*avg += diff >> 3;

}

-static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)

+static void

+enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)

{

if (wakeup)

p->se.start_runtime = p->se.sum_exec_runtime;

sched_info_queued(p);

- p->sched_class->enqueue_task(rq, p, wakeup);

+ p->sched_class->enqueue_task(rq, p, wakeup, head);

p->se.on_rq = 1;

}

@@ -1949,7 +1989,7 @@ static void activate_task(struct rq *rq,

if (task_contributes_to_load(p))

rq->nr_uninterruptible--;

- enqueue_task(rq, p, wakeup);

+ enqueue_task(rq, p, wakeup, false);

inc_nr_running(rq);

}

@@ -1974,20 +2014,6 @@ inline int task_curr(const struct task_s

return cpu_curr(task_cpu(p)) == p;

}

-static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)

- set_task_rq(p, cpu);

-#ifdef CONFIG_SMP

- /*

- * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be

- * successfuly executed on another CPU. We must ensure that updates of

- * per-task data have been completed by this moment.

- */

- smp_wmb();

- task_thread_info(p)->cpu = cpu;

-#endif

static inline void check_class_changed(struct rq *rq, struct task_struct *p,

const struct sched_class *prev_class,

int oldprio, int running)

@@ -2014,21 +2040,15 @@ static inline void check_class_changed(s

void kthread_bind(struct task_struct *p, unsigned int cpu)

{

- struct rq *rq = cpu_rq(cpu);

- unsigned long flags;

/* Must have done schedule() in kthread() before we set_task_cpu */

if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {

WARN_ON(1);

return;

}

- spin_lock_irqsave(&rq->lock, flags);

- set_task_cpu(p, cpu);

p->cpus_allowed = cpumask_of_cpu(cpu);

p->rt.nr_cpus_allowed = 1;

p->flags |= PF_THREAD_BOUND;

- spin_unlock_irqrestore(&rq->lock, flags);

}

EXPORT_SYMBOL(kthread_bind);

@@ -2066,35 +2086,23 @@ task_hot(struct task_struct *p, u64 now,

void set_task_cpu(struct task_struct *p, unsigned int new_cpu)

{

int old_cpu = task_cpu(p);

- struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);

- struct cfs_rq *old_cfsrq = task_cfs_rq(p),

- *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);

- u64 clock_offset;

- clock_offset = old_rq->clock - new_rq->clock;

+#ifdef CONFIG_SCHED_DEBUG

+ /*

+ * We should never call set_task_cpu() on a blocked task,

+ * ttwu() will sort out the placement.

+ */

+ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&

+ !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));

+#endif

trace_sched_migrate_task(p, new_cpu);

-#ifdef CONFIG_SCHEDSTATS

- if (p->se.wait_start)

- p->se.wait_start -= clock_offset;

- if (p->se.sleep_start)

- p->se.sleep_start -= clock_offset;

- if (p->se.block_start)

- p->se.block_start -= clock_offset;

-#endif

if (old_cpu != new_cpu) {

p->se.nr_migrations++;

- new_rq->nr_migrations_in++;

-#ifdef CONFIG_SCHEDSTATS

- if (task_hot(p, old_rq->clock, NULL))

- schedstat_inc(p, se.nr_forced2_migrations);

-#endif

perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,

1, 1, NULL, 0);

}

- p->se.vruntime -= old_cfsrq->min_vruntime -

- new_cfsrq->min_vruntime;

__set_task_cpu(p, new_cpu);

}

@@ -2327,6 +2335,69 @@ void task_oncpu_function_call(struct tas

preempt_enable();

}

+#ifdef CONFIG_SMP

+/*

+ * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.

+ */

+static int select_fallback_rq(int cpu, struct task_struct *p)

+ int dest_cpu;

+ const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));

+ /* Look for allowed, online CPU in same node. */

+ for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)

+ if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))

+ return dest_cpu;

+ /* Any allowed, online CPU? */

+ dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);

+ if (dest_cpu < nr_cpu_ids)

+ return dest_cpu;

+ /* No more Mr. Nice Guy. */

+ if (unlikely(dest_cpu >= nr_cpu_ids)) {

+ dest_cpu = cpuset_cpus_allowed_fallback(p);

+ /*

+ * Don't tell them about moving exiting tasks or

+ * kernel threads (both mm NULL), since they never

+ * leave kernel.

+ */

+ if (p->mm && printk_ratelimit()) {

+ printk(KERN_INFO "process %d (%s) no "

+ "longer affine to cpu%d\n",

+ task_pid_nr(p), p->comm, cpu);

+ }

+ return dest_cpu;

+/*

+ * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.

+ */

+static inline

+int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)

+ int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);

+ /*

+ * In order not to call set_task_cpu() on a blocking task we need

+ * to rely on ttwu() to place the task on a valid ->cpus_allowed

+ * cpu.

+ *

+ * Since this is common to all placement strategies, this lives here.

+ *

+ * [ this allows ->select_task() to simply return task_cpu(p) and

+ * not worry about this generic constraint ]

+ */

+ if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||

+ !cpu_online(cpu)))

+ cpu = select_fallback_rq(task_cpu(p), p);

+ return cpu;

+#endif

/***

* try_to_wake_up - wake up a thread

* @p: the to-be-woken-up thread

@@ -2375,22 +2446,34 @@ static int try_to_wake_up(struct task_st

* First fix up the nr_uninterruptible count:

- if (task_contributes_to_load(p))

- rq->nr_uninterruptible--;

+ if (task_contributes_to_load(p)) {

+ if (likely(cpu_online(orig_cpu)))

+ rq->nr_uninterruptible--;

+ else

+ this_rq()->nr_uninterruptible--;

+ }

p->state = TASK_WAKING;

- task_rq_unlock(rq, &flags);

- cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);

+ if (p->sched_class->task_waking)

+ p->sched_class->task_waking(rq, p);

+ cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);

if (cpu != orig_cpu)

set_task_cpu(p, cpu);

+ __task_rq_unlock(rq);

- rq = task_rq_lock(p, &flags);

- if (rq != orig_rq)

- update_rq_clock(rq);

+ rq = cpu_rq(cpu);

+ spin_lock(&rq->lock);

+ update_rq_clock(rq);

+ /*

+ * We migrated the task without holding either rq->lock, however

+ * since the task is not on the task list itself, nobody else

+ * will try and migrate the task, hence the rq should match the

+ * cpu we just moved it to.

+ */

+ WARN_ON(task_cpu(p) != cpu);

WARN_ON(p->state != TASK_WAKING);

- cpu = task_cpu(p);

#ifdef CONFIG_SCHEDSTATS

schedstat_inc(rq, ttwu_count);

@@ -2443,8 +2526,8 @@ out_running:

p->state = TASK_RUNNING;

#ifdef CONFIG_SMP

- if (p->sched_class->task_wake_up)

- p->sched_class->task_wake_up(rq, p);

+ if (p->sched_class->task_woken)

+ p->sched_class->task_woken(rq, p);

if (unlikely(rq->idle_stamp)) {

u64 delta = rq->clock - rq->idle_stamp;

@@ -2524,7 +2607,6 @@ static void __sched_fork(struct task_str

p->se.nr_failed_migrations_running = 0;

p->se.nr_failed_migrations_hot = 0;

p->se.nr_forced_migrations = 0;

- p->se.nr_forced2_migrations = 0;

p->se.nr_wakeups = 0;

p->se.nr_wakeups_sync = 0;

@@ -2545,14 +2627,6 @@ static void __sched_fork(struct task_str

#ifdef CONFIG_PREEMPT_NOTIFIERS

INIT_HLIST_HEAD(&p->preempt_notifiers);

#endif

- /*

- * We mark the process as running here, but have not actually

- * inserted it onto the runqueue yet. This guarantees that

- * nobody will actually run it, and a signal or other external

- * event cannot wake it up and insert it on the runqueue either.

- */

- p->state = TASK_RUNNING;

}

@@ -2563,6 +2637,12 @@ void sched_fork(struct task_struct *p, i

int cpu = get_cpu();

__sched_fork(p);

+ /*

+ * We mark the process as running here. This guarantees that

+ * nobody will actually run it, and a signal or other external

+ * event cannot wake it up and insert it on the runqueue either.

+ */

+ p->state = TASK_RUNNING;

* Revert to default priority/policy on fork if requested.

@@ -2594,9 +2674,9 @@ void sched_fork(struct task_struct *p, i

if (!rt_prio(p->prio))

p->sched_class = &fair_sched_class;

-#ifdef CONFIG_SMP

- cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);

-#endif

+ if (p->sched_class->task_fork)

+ p->sched_class->task_fork(p);

set_task_cpu(p, cpu);

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)

@@ -2626,28 +2706,38 @@ void wake_up_new_task(struct task_struct

{

unsigned long flags;

struct rq *rq;

+ int cpu = get_cpu();

+#ifdef CONFIG_SMP

rq = task_rq_lock(p, &flags);

- BUG_ON(p->state != TASK_RUNNING);

- update_rq_clock(rq);

+ p->state = TASK_WAKING;

- if (!p->sched_class->task_new || !current->se.on_rq) {

- activate_task(rq, p, 0);

- } else {

- /*

- * Let the scheduling class do new task startup

- * management (if any):

- */

- p->sched_class->task_new(rq, p);

- inc_nr_running(rq);

- }

+ /*

+ * Fork balancing, do it here and not earlier because:

+ * - cpus_allowed can change in the fork path

+ * - any previously selected cpu might disappear through hotplug

+ *

+ * We set TASK_WAKING so that select_task_rq() can drop rq->lock

+ * without people poking at ->cpus_allowed.

+ */

+ cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);

+ set_task_cpu(p, cpu);

+ p->state = TASK_RUNNING;

+ task_rq_unlock(rq, &flags);

+#endif

+ rq = task_rq_lock(p, &flags);

+ update_rq_clock(rq);

+ activate_task(rq, p, 0);

trace_sched_wakeup_new(rq, p, 1);

check_preempt_curr(rq, p, WF_FORK);

#ifdef CONFIG_SMP

- if (p->sched_class->task_wake_up)

- p->sched_class->task_wake_up(rq, p);

+ if (p->sched_class->task_woken)

+ p->sched_class->task_woken(rq, p);

#endif

task_rq_unlock(rq, &flags);

+ put_cpu();

}

#ifdef CONFIG_PREEMPT_NOTIFIERS

@@ -3034,15 +3124,6 @@ static void calc_load_account_active(str

}

- * Externally visible per-cpu scheduler statistics:

- * cpu_nr_migrations(cpu) - number of migrations into that cpu

- */

-u64 cpu_nr_migrations(int cpu)

- return cpu_rq(cpu)->nr_migrations_in;

-/*

* Update rq->cpu_load[] statistics. This function is usually called every

* scheduler tick (TICK_NSEC).

@@ -3124,24 +3205,28 @@ static void double_rq_unlock(struct rq *

}

- * If dest_cpu is allowed for this process, migrate the task to it.

- * This is accomplished by forcing the cpu_allowed mask to only

- * allow dest_cpu, which will force the cpu onto dest_cpu. Then

- * the cpu_allowed mask is restored.

+ * sched_exec - execve() is a valuable balancing opportunity, because at

+ * this point the task has the smallest effective memory and cache footprint.

-static void sched_migrate_task(struct task_struct *p, int dest_cpu)

+void sched_exec(void)

{

+ struct task_struct *p = current;

struct migration_req req;

unsigned long flags;

struct rq *rq;

+ int dest_cpu;

rq = task_rq_lock(p, &flags);

- if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)

- || unlikely(!cpu_active(dest_cpu)))

- goto out;

+ dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);

+ if (dest_cpu == smp_processor_id())

+ goto unlock;

- /* force the process onto the specified CPU */

- if (migrate_task(p, dest_cpu, &req)) {

+ /*

+ * select_task_rq() can race against ->cpus_allowed

+ */

+ if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&

+ likely(cpu_active(dest_cpu)) &&

+ migrate_task(p, dest_cpu, &req)) {

/* Need to wait for migration thread (might exit: take ref). */

struct task_struct *mt = rq->migration_thread;

@@ -3153,24 +3238,11 @@ static void sched_migrate_task(struct ta

return;

}

-out:

+unlock:

task_rq_unlock(rq, &flags);

}

- * sched_exec - execve() is a valuable balancing opportunity, because at

- * this point the task has the smallest effective memory and cache footprint.

- */

-void sched_exec(void)

- int new_cpu, this_cpu = get_cpu();

- new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);

- put_cpu();

- if (new_cpu != this_cpu)

- sched_migrate_task(current, new_cpu);

-/*

* pull_task - move a task from a remote runqueue to the local runqueue.

* Both runqueues must be locked.

@@ -3617,7 +3689,7 @@ unsigned long __weak arch_scale_freq_pow

unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)

{

- unsigned long weight = cpumask_weight(sched_domain_span(sd));

+ unsigned long weight = sd->span_weight;

unsigned long smt_gain = sd->smt_gain;

smt_gain /= weight;

@@ -3650,7 +3722,7 @@ unsigned long scale_rt_power(int cpu)

static void update_cpu_power(struct sched_domain *sd, int cpu)

{

- unsigned long weight = cpumask_weight(sched_domain_span(sd));

+ unsigned long weight = sd->span_weight;

unsigned long power = SCHED_LOAD_SCALE;

struct sched_group *sdg = sd->groups;

@@ -5149,21 +5221,9 @@ void account_idle_time(cputime_t cputime

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

cputime64_t cputime64 = cputime_to_cputime64(cputime);

struct rq *rq = this_rq();

- struct task_struct *task;

- if (atomic_read(&rq->nr_iowait) > 0) {

- for (task = current; task != &init_task; task = task->parent)

- ;

- /* task now points to init */

- for_each_process(task) {

- /* this pointlessly prints the name and PID of each task */

- if (task->in_iowait) {

- task->iowait = cputime64_add(task->iowait, cputime64);

- //printk("%s[%d]\n", task->comm, task->pid);

- }

+ if (atomic_read(&rq->nr_iowait) > 0)

cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);

- }

else

cpustat->idle = cputime64_add(cpustat->idle, cputime64);

}

@@ -5223,45 +5283,90 @@ cputime_t task_stime(struct task_struct

{

return p->stime;

}

+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)

+ struct task_cputime cputime;

+ thread_group_cputime(p, &cputime);

+ *ut = cputime.utime;

+ *st = cputime.stime;

#else

+#ifndef nsecs_to_cputime

+# define nsecs_to_cputime(__nsecs) \

+ msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC))

+#endif

cputime_t task_utime(struct task_struct *p)

{

- clock_t utime = cputime_to_clock_t(p->utime),

- total = utime + cputime_to_clock_t(p->stime);

+ cputime_t utime = p->utime, total = utime + p->stime;

u64 temp;

* Use CFS's precise accounting:

- temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);

+ temp = (u64)nsecs_to_cputime(p->se.sum_exec_runtime);

if (total) {

temp *= utime;

do_div(temp, total);

}

- utime = (clock_t)temp;

+ utime = (cputime_t)temp;

- p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));

+ p->prev_utime = max(p->prev_utime, utime);

return p->prev_utime;

}

cputime_t task_stime(struct task_struct *p)

{

- clock_t stime;

+ cputime_t stime;

* Use CFS's precise accounting. (we subtract utime from

* the total, to make sure the total observed by userspace

* grows monotonically - apps rely on that):

- stime = nsec_to_clock_t(p->se.sum_exec_runtime) -

- cputime_to_clock_t(task_utime(p));

+ stime = nsecs_to_cputime(p->se.sum_exec_runtime) - task_utime(p);

if (stime >= 0)

- p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));

+ p->prev_stime = max(p->prev_stime, stime);

return p->prev_stime;

}

+/*

+ * Must be called with siglock held.

+ */

+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)

+ struct signal_struct *sig = p->signal;

+ struct task_cputime cputime;

+ cputime_t rtime, utime, total;

+ thread_group_cputime(p, &cputime);

+ total = cputime_add(cputime.utime, cputime.stime);

+ rtime = nsecs_to_cputime(cputime.sum_exec_runtime);

+ if (total) {

+ u64 temp = rtime;

+ temp *= cputime.utime;

+ do_div(temp, total);

+ utime = (cputime_t)temp;

+ } else

+ utime = rtime;

+ sig->prev_utime = max(sig->prev_utime, utime);

+ sig->prev_stime = max(sig->prev_stime,

+ cputime_sub(rtime, sig->prev_utime));

+ *ut = sig->prev_utime;

+ *st = sig->prev_stime;

#endif

inline cputime_t task_gtime(struct task_struct *p)

@@ -5553,7 +5658,7 @@ int mutex_spin_on_owner(struct mutex *lo

* the mutex owner just released it and exited.

if (probe_kernel_address(&owner->cpu, cpu))

- goto out;

+ return 0;

#else

cpu = owner->cpu;

#endif

@@ -5563,14 +5668,14 @@ int mutex_spin_on_owner(struct mutex *lo

* the cpu field may no longer be valid.

if (cpu >= nr_cpumask_bits)

- goto out;

+ return 0;

* We need to validate that we can do a

* get_cpu() and that we have the percpu area.

if (!cpu_online(cpu))

- goto out;

+ return 0;

rq = cpu_rq(cpu);

@@ -5589,7 +5694,7 @@ int mutex_spin_on_owner(struct mutex *lo

cpu_relax();

}

-out:

return 1;

}

#endif

@@ -5937,14 +6042,15 @@ EXPORT_SYMBOL(wait_for_completion_killab

bool try_wait_for_completion(struct completion *x)

{

+ unsigned long flags;

int ret = 1;

- spin_lock_irq(&x->wait.lock);

+ spin_lock_irqsave(&x->wait.lock, flags);

if (!x->done)

ret = 0;

else

x->done--;

- spin_unlock_irq(&x->wait.lock);

+ spin_unlock_irqrestore(&x->wait.lock, flags);

return ret;

}

EXPORT_SYMBOL(try_wait_for_completion);

@@ -5959,12 +6065,13 @@ EXPORT_SYMBOL(try_wait_for_completion);

bool completion_done(struct completion *x)

{

+ unsigned long flags;

int ret = 1;

- spin_lock_irq(&x->wait.lock);

+ spin_lock_irqsave(&x->wait.lock, flags);

if (!x->done)

ret = 0;

- spin_unlock_irq(&x->wait.lock);

+ spin_unlock_irqrestore(&x->wait.lock, flags);

return ret;

}

EXPORT_SYMBOL(completion_done);

@@ -6058,7 +6165,7 @@ void rt_mutex_setprio(struct task_struct

if (running)

p->sched_class->set_curr_task(rq);

if (on_rq) {

- enqueue_task(rq, p, 0);

+ enqueue_task(rq, p, 0, oldprio < prio);

check_class_changed(rq, p, prev_class, oldprio, running);

}

@@ -6102,7 +6209,7 @@ void set_user_nice(struct task_struct *p

delta = p->prio - old_prio;

if (on_rq) {

- enqueue_task(rq, p, 0);

+ enqueue_task(rq, p, 0, false);

* If the task increased its priority or is running and

* lowered its priority, then reschedule its CPU:

@@ -6493,7 +6600,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_

return -EINVAL;

retval = -ESRCH;

- read_lock(&tasklist_lock);

+ rcu_read_lock();

p = find_process_by_pid(pid);

if (p) {

retval = security_task_getscheduler(p);

@@ -6501,7 +6608,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_

retval = p->policy

| (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);

}

- read_unlock(&tasklist_lock);

+ rcu_read_unlock();

return retval;

}

@@ -6519,7 +6626,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, p

if (!param || pid < 0)

return -EINVAL;

- read_lock(&tasklist_lock);

+ rcu_read_lock();

p = find_process_by_pid(pid);

retval = -ESRCH;

if (!p)

@@ -6530,7 +6637,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, p

goto out_unlock;

lp.sched_priority = p->rt_priority;

- read_unlock(&tasklist_lock);

+ rcu_read_unlock();

* This one might sleep, we cannot do it with a spinlock held ...

@@ -6540,7 +6647,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, p

return retval;

out_unlock:

- read_unlock(&tasklist_lock);

+ rcu_read_unlock();

return retval;

}

@@ -6551,22 +6658,18 @@ long sched_setaffinity(pid_t pid, const

int retval;

get_online_cpus();

- read_lock(&tasklist_lock);

+ rcu_read_lock();

p = find_process_by_pid(pid);

if (!p) {

- read_unlock(&tasklist_lock);

+ rcu_read_unlock();

put_online_cpus();

return -ESRCH;

}

- /*

- * It is not safe to call set_cpus_allowed with the

- * tasklist_lock held. We will bump the task_struct's

- * usage count and then drop tasklist_lock.

- */

+ /* Prevent p going away */

get_task_struct(p);

- read_unlock(&tasklist_lock);

+ rcu_read_unlock();

if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {

retval = -ENOMEM;

@@ -6647,10 +6750,12 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t

long sched_getaffinity(pid_t pid, struct cpumask *mask)

{

struct task_struct *p;

+ unsigned long flags;

+ struct rq *rq;

int retval;

get_online_cpus();

- read_lock(&tasklist_lock);

+ rcu_read_lock();

retval = -ESRCH;

p = find_process_by_pid(pid);

@@ -6661,10 +6766,12 @@ long sched_getaffinity(pid_t pid, struct

if (retval)

goto out_unlock;

+ rq = task_rq_lock(p, &flags);

cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);

+ task_rq_unlock(rq, &flags);

out_unlock:

- read_unlock(&tasklist_lock);

+ rcu_read_unlock();

put_online_cpus();

return retval;

@@ -6903,6 +7010,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, p

{

struct task_struct *p;

unsigned int time_slice;

+ unsigned long flags;

+ struct rq *rq;

int retval;

struct timespec t;

@@ -6910,7 +7019,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, p

return -EINVAL;

retval = -ESRCH;

- read_lock(&tasklist_lock);

+ rcu_read_lock();

p = find_process_by_pid(pid);

if (!p)

goto out_unlock;

@@ -6919,15 +7028,17 @@ SYSCALL_DEFINE2(sched_rr_get_interval, p

if (retval)

goto out_unlock;

- time_slice = p->sched_class->get_rr_interval(p);

+ rq = task_rq_lock(p, &flags);

+ time_slice = p->sched_class->get_rr_interval(rq, p);

+ task_rq_unlock(rq, &flags);

- read_unlock(&tasklist_lock);

+ rcu_read_unlock();

jiffies_to_timespec(time_slice, &t);

retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;

return retval;

out_unlock:

- read_unlock(&tasklist_lock);

+ rcu_read_unlock();

return retval;

}

@@ -6939,7 +7050,7 @@ void sched_show_task(struct task_struct

unsigned state;

state = p->state ? __ffs(p->state) + 1 : 0;

- printk(KERN_INFO "%-15.15s %c", p->comm,

+ printk(KERN_INFO "%-13.13s %c", p->comm,

state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');

#if BITS_PER_LONG == 32

if (state == TASK_RUNNING)

@@ -7018,6 +7129,7 @@ void __cpuinit init_idle(struct task_str

spin_lock_irqsave(&rq->lock, flags);

__sched_fork(idle);

+ idle->state = TASK_RUNNING;

idle->se.exec_start = sched_clock();

cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));

@@ -7112,7 +7224,19 @@ int set_cpus_allowed_ptr(struct task_str

struct rq *rq;

int ret = 0;

+ /*

+ * Serialize against TASK_WAKING so that ttwu() and wunt() can

+ * drop the rq->lock and still rely on ->cpus_allowed.

+ */

+again:

+ while (task_is_waking(p))

+ cpu_relax();

rq = task_rq_lock(p, &flags);

+ if (task_is_waking(p)) {

+ task_rq_unlock(rq, &flags);

+ goto again;

+ }

if (!cpumask_intersects(new_mask, cpu_active_mask)) {

ret = -EINVAL;

goto out;

@@ -7141,7 +7265,7 @@ int set_cpus_allowed_ptr(struct task_str

get_task_struct(mt);

task_rq_unlock(rq, &flags);

- wake_up_process(rq->migration_thread);

+ wake_up_process(mt);

put_task_struct(mt);

wait_for_completion(&req.done);

tlb_migrate_finish(p->mm);

@@ -7168,7 +7292,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);

static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)

{

struct rq *rq_dest, *rq_src;

- int ret = 0, on_rq;

+ int ret = 0;

if (unlikely(!cpu_active(dest_cpu)))

return ret;

@@ -7180,19 +7304,17 @@ static int __migrate_task(struct task_st

/* Already moved. */

if (task_cpu(p) != src_cpu)

goto done;

- /* Waking up, don't get in the way of try_to_wake_up(). */

- if (p->state == TASK_WAKING)

- goto fail;

/* Affinity changed (again). */

if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))

goto fail;

- on_rq = p->se.on_rq;

- if (on_rq)

+ /*

+ * If we're not on a rq, the next wake-up will ensure we're

+ * placed properly.

+ */

+ if (p->se.on_rq) {

deactivate_task(rq_src, p, 0);

- set_task_cpu(p, dest_cpu);

- if (on_rq) {

+ set_task_cpu(p, dest_cpu);

activate_task(rq_dest, p, 0);

check_preempt_curr(rq_dest, p, 0);

}

@@ -7271,57 +7393,29 @@ static int migration_thread(void *data)

}

#ifdef CONFIG_HOTPLUG_CPU

-static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)

- int ret;

- local_irq_disable();

- ret = __migrate_task(p, src_cpu, dest_cpu);

- local_irq_enable();

- return ret;

* Figure out where task on dead CPU should go, use force if necessary.

-static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)

+void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)

{

- int dest_cpu;

- const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));

-again:

- /* Look for allowed, online CPU in same node. */

- for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)

- if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))

- goto move;

- /* Any allowed, online CPU? */

- dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);

- if (dest_cpu < nr_cpu_ids)

- goto move;

- /* No more Mr. Nice Guy. */

- if (dest_cpu >= nr_cpu_ids) {

- cpuset_cpus_allowed_locked(p, &p->cpus_allowed);

- dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);

+ struct rq *rq = cpu_rq(dead_cpu);

+ int needs_cpu, uninitialized_var(dest_cpu);

+ unsigned long flags;

- /*

- * Don't tell them about moving exiting tasks or

- * kernel threads (both mm NULL), since they never

- * leave kernel.

- */

- if (p->mm && printk_ratelimit()) {

- printk(KERN_INFO "process %d (%s) no "

- "longer affine to cpu%d\n",

- task_pid_nr(p), p->comm, dead_cpu);

- }

+ local_irq_save(flags);

-move:

- /* It can have affinity changed while we were choosing. */

- if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))

- goto again;

+ spin_lock(&rq->lock);

+ needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);

+ if (needs_cpu)

+ dest_cpu = select_fallback_rq(dead_cpu, p);

+ spin_unlock(&rq->lock);

+ /*

+ * It can only fail if we race with set_cpus_allowed(),

+ * in which case the racer should migrate the task anyway.

+ */

+ if (needs_cpu)

+ __migrate_task(p, dead_cpu, dest_cpu);

+ local_irq_restore(flags);

}

@@ -7669,10 +7763,9 @@ migration_call(struct notifier_block *nf

unsigned long flags;

struct rq *rq;

- switch (action) {

+ switch (action & ~CPU_TASKS_FROZEN) {

case CPU_UP_PREPARE:

- case CPU_UP_PREPARE_FROZEN:

p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);

if (IS_ERR(p))

return NOTIFY_BAD;

@@ -7687,7 +7780,6 @@ migration_call(struct notifier_block *nf

break;

case CPU_ONLINE:

- case CPU_ONLINE_FROZEN:

/* Strictly unnecessary, as first user will wake it. */

wake_up_process(cpu_rq(cpu)->migration_thread);

@@ -7704,7 +7796,6 @@ migration_call(struct notifier_block *nf

#ifdef CONFIG_HOTPLUG_CPU

case CPU_UP_CANCELED:

- case CPU_UP_CANCELED_FROZEN:

if (!cpu_rq(cpu)->migration_thread)

break;

/* Unbind it from offline cpu so it can run. Fall thru. */

@@ -7715,14 +7806,22 @@ migration_call(struct notifier_block *nf

cpu_rq(cpu)->migration_thread = NULL;

break;

- case CPU_DEAD:

- case CPU_DEAD_FROZEN:

- cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */

- migrate_live_tasks(cpu);

+ case CPU_POST_DEAD:

+ /*

+ * Bring the migration thread down in CPU_POST_DEAD event,

+ * since the timers should have got migrated by now and thus

+ * we should not see a deadlock between trying to kill the

+ * migration thread and the sched_rt_period_timer.

+ */

rq = cpu_rq(cpu);

kthread_stop(rq->migration_thread);

put_task_struct(rq->migration_thread);

rq->migration_thread = NULL;

+ break;

+ case CPU_DEAD:

+ migrate_live_tasks(cpu);

+ rq = cpu_rq(cpu);

/* Idle task back to normal (off runqueue, low prio) */

spin_lock_irq(&rq->lock);

update_rq_clock(rq);

@@ -7731,7 +7830,6 @@ migration_call(struct notifier_block *nf

rq->idle->sched_class = &idle_sched_class;

migrate_dead_tasks(cpu);

spin_unlock_irq(&rq->lock);

- cpuset_unlock();

migrate_nr_uninterruptible(rq);

BUG_ON(rq->nr_running != 0);

calc_global_load_remove(rq);

@@ -7755,7 +7853,6 @@ migration_call(struct notifier_block *nf

break;

case CPU_DYING:

- case CPU_DYING_FROZEN:

/* Update our root-domain */

rq = cpu_rq(cpu);

spin_lock_irqsave(&rq->lock, flags);

@@ -8075,6 +8172,9 @@ cpu_attach_domain(struct sched_domain *s

struct rq *rq = cpu_rq(cpu);

struct sched_domain *tmp;

+ for (tmp = sd; tmp; tmp = tmp->parent)

+ tmp->span_weight = cpumask_weight(sched_domain_span(tmp));

/* Remove the sched domains which do not contribute to scheduling. */

for (tmp = sd; tmp; ) {

struct sched_domain *parent = tmp->parent;

@@ -9499,6 +9599,8 @@ void __init sched_init(void)

init_task_group.parent = &root_task_group;

list_add(&init_task_group.siblings, &root_task_group.children);

#endif /* CONFIG_USER_SCHED */

+ autogroup_init(&init_task);

#endif /* CONFIG_GROUP_SCHED */

#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP

@@ -9656,24 +9758,13 @@ static inline int preempt_count_equals(i

return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);

}

-static int __might_sleep_init_called;

-int __init __might_sleep_init(void)

- __might_sleep_init_called = 1;

- return 0;

-early_initcall(__might_sleep_init);

void __might_sleep(char *file, int line, int preempt_offset)

{

#ifdef in_atomic

static unsigned long prev_jiffy; /* ratelimiting */

if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||

- oops_in_progress)

- return;

- if (system_state != SYSTEM_RUNNING &&

- (!__might_sleep_init_called || system_state != SYSTEM_BOOTING))

+ system_state != SYSTEM_RUNNING || oops_in_progress)

return;

if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)

return;

@@ -10049,15 +10140,11 @@ void sched_destroy_group(struct task_gro

/* change task's runqueue when it moves between groups.

* The caller of this function should have put the task in its new group

* by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to

- * reflect its new group.

+ * reflect its new group. Called with the runqueue lock held.

*/

-void sched_move_task(struct task_struct *tsk)

+void __sched_move_task(struct task_struct *tsk, struct rq *rq)

{

int on_rq, running;

- unsigned long flags;

- struct rq *rq;

- rq = task_rq_lock(tsk, &flags);

update_rq_clock(rq);

@@ -10073,13 +10160,22 @@ void sched_move_task(struct task_struct

#ifdef CONFIG_FAIR_GROUP_SCHED

if (tsk->sched_class->moved_group)

- tsk->sched_class->moved_group(tsk);

+ tsk->sched_class->moved_group(tsk, on_rq);

#endif

if (unlikely(running))

tsk->sched_class->set_curr_task(rq);

if (on_rq)

- enqueue_task(rq, tsk, 0);

+ enqueue_task(rq, tsk, 0, false);

+void sched_move_task(struct task_struct *tsk)

+ struct rq *rq;

+ unsigned long flags;

+ rq = task_rq_lock(tsk, &flags);

+ __sched_move_task(tsk, rq);

task_rq_unlock(rq, &flags);

}

@@ -10493,15 +10589,6 @@ cpu_cgroup_destroy(struct cgroup_subsys

static int

cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)

{

- if ((current != tsk) && (!capable(CAP_SYS_NICE))) {

- const struct cred *cred = current_cred(), *tcred;

- tcred = __task_cred(tsk);

- if (cred->euid != tcred->uid && cred->euid != tcred->suid)

- return -EPERM;

- }

#ifdef CONFIG_RT_GROUP_SCHED

if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))

return -EINVAL;

@@ -10860,12 +10947,30 @@ static void cpuacct_charge(struct task_s

}

/*

+ * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large

+ * in cputime_t units. As a result, cpuacct_update_stats calls

+ * percpu_counter_add with values large enough to always overflow the

+ * per cpu batch limit causing bad SMP scalability.

+ *

+ * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we

+ * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled

+ * and enabled. We cap it at INT_MAX which is the largest allowed batch value.

+ */

+#ifdef CONFIG_SMP

+#define CPUACCT_BATCH \

+ min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)

+#else

+#define CPUACCT_BATCH 0

+#endif

+/*

* Charge the system/user time to the task's accounting group.

*/

static void cpuacct_update_stats(struct task_struct *tsk,

enum cpuacct_stat_index idx, cputime_t val)

{

struct cpuacct *ca;

+ int batch = CPUACCT_BATCH;

if (unlikely(!cpuacct_subsys.active))

return;

@@ -10874,7 +10979,7 @@ static void cpuacct_update_stats(struct

ca = task_ca(tsk);

do {

- percpu_counter_add(&ca->cpustat[idx], val);

+ __percpu_counter_add(&ca->cpustat[idx], val, batch);

ca = ca->parent;

} while (ca);

rcu_read_unlock();

@@ -10998,3 +11103,4 @@ void synchronize_sched_expedited(void)

EXPORT_SYMBOL_GPL(synchronize_sched_expedited);

#endif /* #else #ifndef CONFIG_SMP */

+#endif /* CONFIG_SCHED_BFS */