--- 6319f828298861b37d9c74097fe26e9aa7bb3947 +++ 168e84a745778904e5a69cafe0c13396ba938602 @@ -85,7 +85,7 @@ #define idleprio_task(p) unlikely((p)->policy == SCHED_IDLEPRIO) #define iso_task(p) unlikely((p)->policy == SCHED_ISO) #define iso_queue(rq) unlikely((rq)->rq_policy == SCHED_ISO) -#define ISO_PERIOD ((5 * HZ * num_online_cpus()) + 1) +#define ISO_PERIOD ((5 * HZ * grq.noc) + 1) /* * Convert user-nice values [ -20 ... 0 ... 19 ] @@ -120,7 +120,7 @@ #define NS_TO_MS(TIME) ((TIME) >> 20) #define NS_TO_US(TIME) ((TIME) >> 10) -#define RESCHED_US (100) /* Reschedule if less than this many us left */ +#define RESCHED_US (100) /* Reschedule if less than this many μs left */ /* * This is the time all tasks within the same priority round robin. @@ -145,7 +145,7 @@ static int prio_ratios[PRIO_RANGE] __rea * The quota handed out to tasks of all priority levels when refilling their * time_slice. */ -static inline unsigned long timeslice(void) +static inline int timeslice(void) { return MS_TO_US(rr_interval); } @@ -167,6 +167,7 @@ struct global_rq { cpumask_t cpu_idle_map; int idle_cpus; #endif + int noc; /* num_online_cpus stored and updated when it changes */ u64 niffies; /* Nanosecond jiffies */ unsigned long last_jiffy; /* Last jiffy we updated niffies */ @@ -187,7 +188,6 @@ struct rq { #ifdef CONFIG_NO_HZ unsigned char in_nohz_recently; #endif - struct task_struct *last_task; #endif struct task_struct *curr, *idle; @@ -210,6 +210,8 @@ struct rq { #ifdef CONFIG_SMP int cpu; /* cpu of this runqueue */ int online; + int scaling; /* This CPU is managed by a scaling CPU freq governor */ + struct task_struct *sticky_task; struct root_domain *rd; struct sched_domain *sd; @@ -226,7 +228,11 @@ struct rq { #endif u64 last_niffy; /* Last time this RQ updated grq.niffies */ #endif +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + u64 prev_irq_time; +#endif u64 clock, old_clock, last_tick; + u64 clock_task; int dither; #ifdef CONFIG_SCHEDSTATS @@ -398,9 +404,17 @@ static inline void update_clocks(struct * when we're not updating niffies. * Looking up task_rq must be done under grq.lock to be safe. */ +static u64 irq_time_cpu(int cpu); + static inline void update_rq_clock(struct rq *rq) { - rq->clock = sched_clock_cpu(cpu_of(rq)); + int cpu = cpu_of(rq); + u64 irq_time; + + rq->clock = sched_clock_cpu(cpu); + irq_time = irq_time_cpu(cpu); + if (rq->clock - irq_time > rq->clock_task) + rq->clock_task = rq->clock - irq_time; } static inline int task_running(struct task_struct *p) @@ -743,26 +757,17 @@ static int suitable_idle_cpus(struct tas static void resched_task(struct task_struct *p); -/* - * last_task stores the last non-idle task scheduled on the local rq for - * cache warmth testing. - */ -static inline void set_last_task(struct rq *rq, struct task_struct *p) -{ - rq->last_task = p; -} - -#define CPUIDLE_CACHE_BUSY (1) -#define CPUIDLE_DIFF_CPU (2) -#define CPUIDLE_THREAD_BUSY (4) -#define CPUIDLE_DIFF_NODE (8) +#define CPUIDLE_DIFF_THREAD (1) +#define CPUIDLE_DIFF_CORE (2) +#define CPUIDLE_CACHE_BUSY (4) +#define CPUIDLE_DIFF_CPU (8) +#define CPUIDLE_THREAD_BUSY (16) +#define CPUIDLE_DIFF_NODE (32) /* * The best idle CPU is chosen according to the CPUIDLE ranking above where the - * lowest value would give the most suitable CPU to schedule p onto next. We - * iterate from the last CPU upwards instead of using for_each_cpu_mask so as - * to be able to break out immediately if the last CPU is idle. 
The order works - * out to be the following: + * lowest value would give the most suitable CPU to schedule p onto next. The + * order works out to be the following: * * Same core, idle or busy cache, idle threads * Other core, same cache, idle or busy cache, idle threads. @@ -774,96 +779,82 @@ static inline void set_last_task(struct * Other node, other CPU, idle cache, idle threads. * Other node, other CPU, busy cache, idle threads. * Other node, other CPU, busy threads. - * - * If p was the last task running on this rq, then regardless of where - * it has been running since then, it is cache warm on this rq. */ -static void resched_best_idle(struct task_struct *p) +static void +resched_best_mask(unsigned long best_cpu, struct rq *rq, cpumask_t *tmpmask) { - unsigned long cpu_tmp, best_cpu, best_ranking; - cpumask_t tmpmask; - struct rq *rq; - int iterate; + unsigned long cpu_tmp, best_ranking; - cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map); - iterate = cpus_weight(tmpmask); - best_cpu = task_cpu(p); - /* - * Start below the last CPU and work up with next_cpu as the last - * CPU might not be idle or affinity might not allow it. - */ - cpu_tmp = best_cpu - 1; - rq = cpu_rq(best_cpu); best_ranking = ~0UL; - do { + for_each_cpu_mask(cpu_tmp, *tmpmask) { unsigned long ranking; struct rq *tmp_rq; ranking = 0; - cpu_tmp = next_cpu(cpu_tmp, tmpmask); - if (cpu_tmp >= nr_cpu_ids) { - cpu_tmp = -1; - cpu_tmp = next_cpu(cpu_tmp, tmpmask); - } tmp_rq = cpu_rq(cpu_tmp); - if (rq->cpu_locality[cpu_tmp]) { - /* Check rq->last_task hasn't been dereferenced */ - if (rq->last_task && p != rq->last_task) { #ifdef CONFIG_NUMA - if (rq->cpu_locality[cpu_tmp] > 1) - ranking |= CPUIDLE_DIFF_NODE; + if (rq->cpu_locality[cpu_tmp] > 3) + ranking |= CPUIDLE_DIFF_NODE; + else #endif - ranking |= CPUIDLE_DIFF_CPU; - } - } + if (rq->cpu_locality[cpu_tmp] > 2) + ranking |= CPUIDLE_DIFF_CPU; #ifdef CONFIG_SCHED_MC + if (rq->cpu_locality[cpu_tmp] == 2) + ranking |= CPUIDLE_DIFF_CORE; if (!(tmp_rq->cache_idle(cpu_tmp))) ranking |= CPUIDLE_CACHE_BUSY; #endif #ifdef CONFIG_SCHED_SMT + if (rq->cpu_locality[cpu_tmp] == 1) + ranking |= CPUIDLE_DIFF_THREAD; if (!(tmp_rq->siblings_idle(cpu_tmp))) ranking |= CPUIDLE_THREAD_BUSY; #endif if (ranking < best_ranking) { best_cpu = cpu_tmp; - if (ranking <= 1) + if (ranking == 0) break; best_ranking = ranking; } - } while (--iterate > 0); + } resched_task(cpu_rq(best_cpu)->curr); } +static void resched_best_idle(struct task_struct *p) +{ + cpumask_t tmpmask; + + cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map); + resched_best_mask(task_cpu(p), task_rq(p), &tmpmask); +} + static inline void resched_suitable_idle(struct task_struct *p) { if (suitable_idle_cpus(p)) resched_best_idle(p); } - /* - * The cpu cache locality difference between CPUs is used to determine how far - * to offset the virtual deadline. "One" difference in locality means that one - * timeslice difference is allowed longer for the cpu local tasks. This is - * enough in the common case when tasks are up to 2* number of CPUs to keep - * tasks within their shared cache CPUs only. CPUs on different nodes or not - * even in this domain (NUMA) have "3" difference, allowing 4 times longer - * deadlines before being taken onto another cpu, allowing for 2* the double - * seen by separate CPUs above. - * Simple summary: Virtual deadlines are equal on shared cache CPUs, double - * on separate CPUs and quadruple in separate NUMA nodes. 
+ * Flags to tell us whether this CPU is running a CPU frequency governor that + * has slowed its speed or not. No locking required as the very rare wrongly + * read value would be harmless. */ -static inline int -cache_distance(struct rq *task_rq, struct rq *rq, struct task_struct *p) +void cpu_scaling(int cpu) { - /* Check rq->last_task hasn't been dereferenced */ - if (likely(rq->last_task)) { - if (rq->last_task == p) - return 0; - } - return rq->cpu_locality[cpu_of(task_rq)] * task_timeslice(p); + cpu_rq(cpu)->scaling = 1; +} + +void cpu_nonscaling(int cpu) +{ + cpu_rq(cpu)->scaling = 0; +} + +static inline int scaling_rq(struct rq *rq) +{ + return rq->scaling; } #else /* CONFIG_SMP */ static inline void inc_qnr(void) @@ -896,16 +887,25 @@ static inline void resched_suitable_idle { } -static inline int -cache_distance(struct rq *task_rq, struct rq *rq, struct task_struct *p) +void cpu_scaling(int __unused) { - return 0; } -static inline void set_last_task(struct rq *rq, struct task_struct *p) +void cpu_nonscaling(int __unused) { } + +/* + * Although CPUs can scale in UP, there is nowhere else for tasks to go so this + * always returns 0. + */ +static inline int scaling_rq(struct rq *rq) +{ + return 0; +} #endif /* CONFIG_SMP */ +EXPORT_SYMBOL_GPL(cpu_scaling); +EXPORT_SYMBOL_GPL(cpu_nonscaling); /* * activate_idle_task - move idle task to the _front_ of runqueue. @@ -1001,6 +1001,82 @@ void set_task_cpu(struct task_struct *p, smp_wmb(); task_thread_info(p)->cpu = cpu; } + +static inline void clear_sticky(struct task_struct *p) +{ + p->sticky = 0; +} + +static inline int task_sticky(struct task_struct *p) +{ + return p->sticky; +} + +/* Reschedule the best idle CPU that is not this one. */ +static void +resched_closest_idle(struct rq *rq, unsigned long cpu, struct task_struct *p) +{ + cpumask_t tmpmask; + + cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map); + cpu_clear(cpu, tmpmask); + if (cpus_empty(tmpmask)) + return; + resched_best_mask(cpu, rq, &tmpmask); +} + +/* + * We set the sticky flag on a task that is descheduled involuntarily meaning + * it is awaiting further CPU time. If the last sticky task is still sticky + * but unlucky enough to not be the next task scheduled, we unstick it and try + * to find it an idle CPU. Realtime tasks do not stick to minimise their + * latency at all times. 
+ */ +static inline void +swap_sticky(struct rq *rq, unsigned long cpu, struct task_struct *p) +{ + if (rq->sticky_task) { + if (rq->sticky_task == p) { + p->sticky = 1; + return; + } + if (rq->sticky_task->sticky) { + rq->sticky_task->sticky = 0; + resched_closest_idle(rq, cpu, rq->sticky_task); + } + } + if (!rt_task(p)) { + p->sticky = 1; + rq->sticky_task = p; + } else { + resched_closest_idle(rq, cpu, p); + rq->sticky_task = NULL; + } +} + +static inline void unstick_task(struct rq *rq, struct task_struct *p) +{ + rq->sticky_task = NULL; + clear_sticky(p); +} +#else +static inline void clear_sticky(struct task_struct *p) +{ +} + +static inline int task_sticky(struct task_struct *p) +{ + return 0; +} + +static inline void +swap_sticky(struct rq *rq, unsigned long cpu, struct task_struct *p) +{ +} + +static inline void unstick_task(struct rq *rq, struct task_struct *p) +{ +} #endif /* @@ -1011,6 +1087,7 @@ static inline void take_task(struct rq * { set_task_cpu(p, cpu_of(rq)); dequeue_task(p); + clear_sticky(p); dec_qnr(); } @@ -1348,7 +1425,7 @@ static inline int online_cpus(struct tas */ static inline int needs_other_cpu(struct task_struct *p, int cpu) { - if (unlikely(!cpu_isset(cpu, p->cpus_allowed) && online_cpus(p))) + if (unlikely(!cpu_isset(cpu, p->cpus_allowed))) return 1; return 0; } @@ -1365,25 +1442,31 @@ static void try_preempt(struct task_stru int highest_prio; cpumask_t tmp; - /* IDLEPRIO tasks never preempt anything */ - if (p->policy == SCHED_IDLEPRIO) - return; + /* + * We clear the sticky flag here because for a task to have called + * try_preempt with the sticky flag enabled means some complicated + * re-scheduling has occurred and we should ignore the sticky flag. + */ + clear_sticky(p); if (suitable_idle_cpus(p)) { resched_best_idle(p); return; } - if (online_cpus(p)) + /* IDLEPRIO tasks never preempt anything */ + if (p->policy == SCHED_IDLEPRIO) + return; + + if (likely(online_cpus(p))) cpus_and(tmp, cpu_online_map, p->cpus_allowed); else - (cpumask_copy(&tmp, &cpu_online_map)); + return; latest_deadline = 0; highest_prio = -1; for_each_cpu_mask(cpu, tmp) { - u64 offset_deadline; struct rq *rq; int rq_prio; @@ -1392,12 +1475,9 @@ static void try_preempt(struct task_stru if (rq_prio < highest_prio) continue; - offset_deadline = rq->rq_deadline - - cache_distance(this_rq, rq, p); - - if (rq_prio > highest_prio || (rq_prio == highest_prio && - deadline_after(offset_deadline, latest_deadline))) { - latest_deadline = offset_deadline; + if (rq_prio > highest_prio || + deadline_after(rq->rq_deadline, latest_deadline)) { + latest_deadline = rq->rq_deadline; highest_prio = rq_prio; highest_prio_rq = rq; } @@ -1591,6 +1671,7 @@ void sched_fork(struct task_struct *p, i #endif p->oncpu = 0; + clear_sticky(p); #ifdef CONFIG_PREEMPT /* Want to start with kernel preemption disabled. 
*/ @@ -1836,14 +1917,14 @@ context_switch(struct rq *rq, struct tas */ arch_start_context_switch(prev); - if (unlikely(!mm)) { + if (!mm) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else switch_mm(oldmm, mm, next); - if (unlikely(!prev->mm)) { + if (!prev->mm) { prev->active_mm = NULL; rq->prev_mm = oldmm; } @@ -1931,8 +2012,7 @@ unsigned long nr_active(void) unsigned long this_cpu_load(void) { return this_rq()->rq_running + - (queued_notrunning() + nr_uninterruptible()) / - (1 + num_online_cpus()); + ((queued_notrunning() + nr_uninterruptible()) / grq.noc); } /* Variables and functions for calc_load */ @@ -1985,6 +2065,81 @@ DEFINE_PER_CPU(struct kernel_stat, kstat EXPORT_PER_CPU_SYMBOL(kstat); +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +/* + * There are no locks covering percpu hardirq/softirq time. + * They are only modified in account_system_vtime, on corresponding CPU + * with interrupts disabled. So, writes are safe. + * They are read and saved off onto struct rq in update_rq_clock(). + * This may result in other CPU reading this CPU's irq time and can + * race with irq/account_system_vtime on this CPU. We would either get old + * or new value (or semi updated value on 32 bit) with a side effect of + * accounting a slice of irq time to wrong task when irq is in progress + * while we read rq->clock. That is a worthy compromise in place of having + * locks on each irq in account_system_time. + */ +static DEFINE_PER_CPU(u64, cpu_hardirq_time); +static DEFINE_PER_CPU(u64, cpu_softirq_time); + +static DEFINE_PER_CPU(u64, irq_start_time); +static int sched_clock_irqtime; + +void enable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 1; +} + +void disable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 0; +} + +static u64 irq_time_cpu(int cpu) +{ + if (!sched_clock_irqtime) + return 0; + + return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); +} + +void account_system_vtime(struct task_struct *curr) +{ + unsigned long flags; + int cpu; + u64 now, delta; + + if (!sched_clock_irqtime) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + now = sched_clock_cpu(cpu); + delta = now - per_cpu(irq_start_time, cpu); + per_cpu(irq_start_time, cpu) = now; + /* + * We do not account for softirq time from ksoftirqd here. + * We want to continue accounting softirq time to ksoftirqd thread + * in that case, so as not to confuse scheduler with a special task + * that do not consume any time, but still wants to run. + */ + if (hardirq_count()) + per_cpu(cpu_hardirq_time, cpu) += delta; + else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) + per_cpu(cpu_softirq_time, cpu) += delta; + + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(account_system_vtime); +#else + +static u64 irq_time_cpu(int cpu) +{ + return 0; +} +#endif + /* * On each tick, see what percentage of that tick was attributed to each * component and add the percentage to the _pc values. 
Once a _pc value has @@ -2029,9 +2184,13 @@ pc_system_time(struct rq *rq, struct tas } p->sched_time += ns; - if (hardirq_count() - hardirq_offset) + if (hardirq_count() - hardirq_offset) { rq->irq_pc += pc; - else if (softirq_count()) { + if (rq->irq_pc >= 100) { + rq->irq_pc %= 100; + cpustat->irq = cputime64_add(cpustat->irq, tmp); + } + } else if (softirq_count()) { rq->softirq_pc += pc; if (rq->softirq_pc >= 100) { rq->softirq_pc %= 100; @@ -2416,7 +2575,7 @@ static void task_running_tick(struct rq * Tasks that were scheduled in the first half of a tick are not * allowed to run into the 2nd half of the next tick if they will * run out of time slice in the interim. Otherwise, if they have - * less than 100us of time slice left they will be rescheduled. + * less than RESCHED_US μs of time slice left they will be rescheduled. */ if (rq->dither) { if (rq->rq_time_slice > HALF_JIFFY_US) @@ -2539,9 +2698,14 @@ static inline u64 static_deadline_diff(i return prio_deadline_diff(USER_PRIO(static_prio)); } +static inline int longest_deadline_diff(void) +{ + return prio_deadline_diff(39); +} + static inline int ms_longest_deadline_diff(void) { - return NS_TO_MS(prio_deadline_diff(39)); + return NS_TO_MS(longest_deadline_diff()); } /* @@ -2611,7 +2775,19 @@ retry: goto out_take; } - dl = p->deadline + cache_distance(task_rq(p), rq, p); + /* + * Soft affinity happens here by not scheduling a task with + * its sticky flag set that ran on a different CPU last when + * the CPU is scaling, or by greatly biasing against its + * deadline when not. + */ + if (task_rq(p) != rq && task_sticky(p)) { + if (scaling_rq(rq)) + continue; + else + dl = p->deadline + longest_deadline_diff(); + } else + dl = p->deadline; /* * No rt tasks. Find the earliest deadline task. Now we're in @@ -2689,7 +2865,7 @@ static inline void set_rq_task(struct rq { rq->rq_time_slice = p->time_slice; rq->rq_deadline = p->deadline; - rq->rq_last_ran = p->last_ran; + rq->rq_last_ran = p->last_ran = rq->clock; rq->rq_policy = p->policy; rq->rq_prio = p->prio; if (p != rq->idle) @@ -2768,14 +2944,8 @@ need_resched_nonpreemptible: */ grq_unlock_irq(); goto rerun_prev_unlocked; - } else { - /* - * If prev got kicked off by a task that has to - * run on this CPU for affinity reasons then - * there may be an idle CPU it can go to. - */ - resched_suitable_idle(prev); - } + } else + swap_sticky(rq, cpu, prev); } return_task(prev, deactivate); } @@ -2790,17 +2960,24 @@ need_resched_nonpreemptible: set_cpuidle_map(cpu); } else { next = earliest_deadline_task(rq, idle); - prefetch(next); - prefetch_stack(next); - clear_cpuidle_map(cpu); + if (likely(next->prio != PRIO_LIMIT)) { + prefetch(next); + prefetch_stack(next); + clear_cpuidle_map(cpu); + } else + set_cpuidle_map(cpu); } if (likely(prev != next)) { + /* + * Don't stick tasks when a real time task is going to run as + * they may literally get stuck. + */ + if (rt_task(next)) + unstick_task(rq, prev); sched_info_switch(prev, next); perf_event_task_sched_out(prev, next, cpu); - if (prev != idle) - set_last_task(rq, prev); set_rq_task(rq, next); grq.nr_switches++; prev->oncpu = 0; @@ -3627,8 +3804,8 @@ recheck: * SCHED_BATCH is 0. 
*/ if (param->sched_priority < 0 || - (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || - (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) + (p->mm && param->sched_priority > MAX_USER_RT_PRIO - 1) || + (!p->mm && param->sched_priority > MAX_RT_PRIO - 1)) return -EINVAL; if (is_rt_policy(policy) != (param->sched_priority != 0)) return -EINVAL; @@ -4349,10 +4526,12 @@ void init_idle(struct task_struct *idle, idle->prio = PRIO_LIMIT; set_rq_task(rq, idle); idle->cpus_allowed = cpumask_of_cpu(cpu); + /* Silence PROVE_RCU */ + rcu_read_lock(); set_task_cpu(idle, cpu); + rcu_read_unlock(); rq->curr = rq->idle = idle; idle->oncpu = 1; - set_cpuidle_map(cpu); grq_unlock_irqrestore(&flags); /* Set the preempt count _outside_ the spinlocks! */ @@ -4579,6 +4758,30 @@ void move_task_off_dead_cpu(int dead_cpu } +/* Run through task list and find tasks affined to just the dead cpu, then + * allocate a new affinity */ +static void break_sole_affinity(int src_cpu) +{ + struct task_struct *p, *t; + + do_each_thread(t, p) { + if (!online_cpus(p)) { + cpumask_copy(&p->cpus_allowed, cpu_possible_mask); + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu %d\n", + task_pid_nr(p), p->comm, src_cpu); + } + } + clear_sticky(p); + } while_each_thread(t, p); +} + /* * Schedules idle task to be the next runnable task on current CPU. * It does so by boosting its priority to highest possible. @@ -4599,6 +4802,7 @@ void sched_idle_next(void) * and interrupts disabled on the current cpu. */ grq_lock_irqsave(&flags); + break_sole_affinity(this_cpu); __setscheduler(idle, rq, SCHED_FIFO, MAX_RT_PRIO - 1); @@ -4836,6 +5040,7 @@ migration_call(struct notifier_block *nf set_rq_online(rq); } + grq.noc = num_online_cpus(); grq_unlock_irqrestore(&flags); break; @@ -4866,6 +5071,7 @@ migration_call(struct notifier_block *nf BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } + grq.noc = num_online_cpus(); grq_unlock_irqrestore(&flags); break; #endif @@ -6389,7 +6595,7 @@ static int cache_cpu_idle(unsigned long void __init sched_init_smp(void) { struct sched_domain *sd; - int cpu, cpus; + int cpu; cpumask_var_t non_isolated_cpus; @@ -6423,14 +6629,6 @@ void __init sched_init_smp(void) BUG(); free_cpumask_var(non_isolated_cpus); - /* - * Assume that every added cpu gives us slightly less overall latency - * allowing us to increase the base rr_interval, non-linearly and with - * an upper bound. 
- */ - cpus = num_online_cpus(); - rr_interval = rr_interval * (4 * cpus + 4) / (cpus + 6); - grq_lock_irq(); /* * Set up the relative cache distance of each online cpu from each @@ -6459,10 +6657,12 @@ void __init sched_init_smp(void) cpumask_set_cpu(other_cpu, &rq->cache_siblings); } #endif - if (sd->level <= SD_LV_MC) - locality = 0; - else if (sd->level <= SD_LV_NODE) + if (sd->level <= SD_LV_SIBLING) locality = 1; + else if (sd->level <= SD_LV_MC) + locality = 2; + else if (sd->level <= SD_LV_NODE) + locality = 3; else continue; @@ -6517,6 +6717,7 @@ void __init sched_init(void) grq.last_jiffy = jiffies; spin_lock_init(&grq.iso_lock); grq.iso_ticks = grq.iso_refractory = 0; + grq.noc = 1; #ifdef CONFIG_SMP init_defrootdomain(); grq.qnr = grq.idle_cpus = 0; @@ -6530,6 +6731,7 @@ void __init sched_init(void) rq->iowait_pc = rq->idle_pc = 0; rq->dither = 0; #ifdef CONFIG_SMP + rq->sticky_task = NULL; rq->last_niffy = 0; rq->sd = NULL; rq->rd = NULL; @@ -6568,7 +6770,7 @@ void __init sched_init(void) if (i == j) rq->cpu_locality[j] = 0; else - rq->cpu_locality[j] = 3; + rq->cpu_locality[j] = 4; } } #endif