--- 168e84a745778904e5a69cafe0c13396ba938602 +++ 0f0e14f8cdb956035e72fd5ccb35f1278844858a @@ -85,7 +85,7 @@ #define idleprio_task(p) unlikely((p)->policy == SCHED_IDLEPRIO) #define iso_task(p) unlikely((p)->policy == SCHED_ISO) #define iso_queue(rq) unlikely((rq)->rq_policy == SCHED_ISO) -#define ISO_PERIOD ((5 * HZ * grq.noc) + 1) +#define ISO_PERIOD ((5 * HZ * num_online_cpus()) + 1) /* * Convert user-nice values [ -20 ... 0 ... 19 ] @@ -145,7 +145,7 @@ static int prio_ratios[PRIO_RANGE] __rea * The quota handed out to tasks of all priority levels when refilling their * time_slice. */ -static inline int timeslice(void) +static inline unsigned long timeslice(void) { return MS_TO_US(rr_interval); } @@ -167,7 +167,6 @@ struct global_rq { cpumask_t cpu_idle_map; int idle_cpus; #endif - int noc; /* num_online_cpus stored and updated when it changes */ u64 niffies; /* Nanosecond jiffies */ unsigned long last_jiffy; /* Last jiffy we updated niffies */ @@ -210,8 +209,6 @@ struct rq { #ifdef CONFIG_SMP int cpu; /* cpu of this runqueue */ int online; - int scaling; /* This CPU is managed by a scaling CPU freq governor */ - struct task_struct *sticky_task; struct root_domain *rd; struct sched_domain *sd; @@ -228,11 +225,7 @@ struct rq { #endif u64 last_niffy; /* Last time this RQ updated grq.niffies */ #endif -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - u64 prev_irq_time; -#endif u64 clock, old_clock, last_tick; - u64 clock_task; int dither; #ifdef CONFIG_SCHEDSTATS @@ -404,17 +397,9 @@ static inline void update_clocks(struct * when we're not updating niffies. * Looking up task_rq must be done under grq.lock to be safe. */ -static u64 irq_time_cpu(int cpu); - static inline void update_rq_clock(struct rq *rq) { - int cpu = cpu_of(rq); - u64 irq_time; - - rq->clock = sched_clock_cpu(cpu); - irq_time = irq_time_cpu(cpu); - if (rq->clock - irq_time > rq->clock_task) - rq->clock_task = rq->clock - irq_time; + rq->clock = sched_clock_cpu(cpu_of(rq)); } static inline int task_running(struct task_struct *p) @@ -766,8 +751,10 @@ static void resched_task(struct task_str /* * The best idle CPU is chosen according to the CPUIDLE ranking above where the - * lowest value would give the most suitable CPU to schedule p onto next. The - * order works out to be the following: + * lowest value would give the most suitable CPU to schedule p onto next. We + * iterate from the last CPU upwards instead of using for_each_cpu_mask so as + * to be able to break out immediately if the last CPU is idle. The order works + * out to be the following: * * Same core, idle or busy cache, idle threads * Other core, same cache, idle or busy cache, idle threads. @@ -779,19 +766,38 @@ static void resched_task(struct task_str * Other node, other CPU, idle cache, idle threads. * Other node, other CPU, busy cache, idle threads. * Other node, other CPU, busy threads. + * + * If p was the last task running on this rq, then regardless of where + * it has been running since then, it is cache warm on this rq. */ -static void -resched_best_mask(unsigned long best_cpu, struct rq *rq, cpumask_t *tmpmask) +static void resched_best_idle(struct task_struct *p) { - unsigned long cpu_tmp, best_ranking; + unsigned long cpu_tmp, best_cpu, best_ranking; + cpumask_t tmpmask; + struct rq *rq; + int iterate; + cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map); + iterate = cpus_weight(tmpmask); + best_cpu = task_cpu(p); + /* + * Start below the last CPU and work up with next_cpu as the last + * CPU might not be idle or affinity might not allow it. 
+ */ + cpu_tmp = best_cpu - 1; + rq = cpu_rq(best_cpu); best_ranking = ~0UL; - for_each_cpu_mask(cpu_tmp, *tmpmask) { + do { unsigned long ranking; struct rq *tmp_rq; ranking = 0; + cpu_tmp = next_cpu(cpu_tmp, tmpmask); + if (cpu_tmp >= nr_cpu_ids) { + cpu_tmp = -1; + cpu_tmp = next_cpu(cpu_tmp, tmpmask); + } tmp_rq = cpu_rq(cpu_tmp); #ifdef CONFIG_NUMA @@ -819,42 +825,37 @@ resched_best_mask(unsigned long best_cpu break; best_ranking = ranking; } - } + } while (--iterate > 0); resched_task(cpu_rq(best_cpu)->curr); } -static void resched_best_idle(struct task_struct *p) -{ - cpumask_t tmpmask; - - cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map); - resched_best_mask(task_cpu(p), task_rq(p), &tmpmask); -} - static inline void resched_suitable_idle(struct task_struct *p) { if (suitable_idle_cpus(p)) resched_best_idle(p); } + /* - * Flags to tell us whether this CPU is running a CPU frequency governor that - * has slowed its speed or not. No locking required as the very rare wrongly - * read value would be harmless. + * The cpu cache locality difference between CPUs is used to determine how far + * to offset the virtual deadline. <2 difference in locality means that one + * timeslice difference is allowed longer for the cpu local tasks. This is + * enough in the common case when tasks are up to 2* number of CPUs to keep + * tasks within their shared cache CPUs only. CPUs on different nodes or not + * even in this domain (NUMA) have "4" difference, allowing 4 times longer + * deadlines before being taken onto another cpu, allowing for 2* the double + * seen by separate CPUs above. + * Simple summary: Virtual deadlines are equal on shared cache CPUs, double + * on separate CPUs and quadruple in separate NUMA nodes. */ -void cpu_scaling(int cpu) -{ - cpu_rq(cpu)->scaling = 1; -} - -void cpu_nonscaling(int cpu) +static inline int +cache_distance(struct rq *task_rq, struct rq *rq, struct task_struct *p) { - cpu_rq(cpu)->scaling = 0; -} + int locality = rq->cpu_locality[cpu_of(task_rq)] - 2; -static inline int scaling_rq(struct rq *rq) -{ - return rq->scaling; + if (locality > 0) + return task_timeslice(p) << locality; + return 0; } #else /* CONFIG_SMP */ static inline void inc_qnr(void) @@ -887,25 +888,12 @@ static inline void resched_suitable_idle { } -void cpu_scaling(int __unused) -{ -} - -void cpu_nonscaling(int __unused) -{ -} - -/* - * Although CPUs can scale in UP, there is nowhere else for tasks to go so this - * always returns 0. - */ -static inline int scaling_rq(struct rq *rq) +static inline int +cache_distance(struct rq *task_rq, struct rq *rq, struct task_struct *p) { return 0; } #endif /* CONFIG_SMP */ -EXPORT_SYMBOL_GPL(cpu_scaling); -EXPORT_SYMBOL_GPL(cpu_nonscaling); /* * activate_idle_task - move idle task to the _front_ of runqueue. @@ -1001,82 +989,6 @@ void set_task_cpu(struct task_struct *p, smp_wmb(); task_thread_info(p)->cpu = cpu; } - -static inline void clear_sticky(struct task_struct *p) -{ - p->sticky = 0; -} - -static inline int task_sticky(struct task_struct *p) -{ - return p->sticky; -} - -/* Reschedule the best idle CPU that is not this one. */ -static void -resched_closest_idle(struct rq *rq, unsigned long cpu, struct task_struct *p) -{ - cpumask_t tmpmask; - - cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map); - cpu_clear(cpu, tmpmask); - if (cpus_empty(tmpmask)) - return; - resched_best_mask(cpu, rq, &tmpmask); -} - -/* - * We set the sticky flag on a task that is descheduled involuntarily meaning - * it is awaiting further CPU time. 
If the last sticky task is still sticky - * but unlucky enough to not be the next task scheduled, we unstick it and try - * to find it an idle CPU. Realtime tasks do not stick to minimise their - * latency at all times. - */ -static inline void -swap_sticky(struct rq *rq, unsigned long cpu, struct task_struct *p) -{ - if (rq->sticky_task) { - if (rq->sticky_task == p) { - p->sticky = 1; - return; - } - if (rq->sticky_task->sticky) { - rq->sticky_task->sticky = 0; - resched_closest_idle(rq, cpu, rq->sticky_task); - } - } - if (!rt_task(p)) { - p->sticky = 1; - rq->sticky_task = p; - } else { - resched_closest_idle(rq, cpu, p); - rq->sticky_task = NULL; - } -} - -static inline void unstick_task(struct rq *rq, struct task_struct *p) -{ - rq->sticky_task = NULL; - clear_sticky(p); -} -#else -static inline void clear_sticky(struct task_struct *p) -{ -} - -static inline int task_sticky(struct task_struct *p) -{ - return 0; -} - -static inline void -swap_sticky(struct rq *rq, unsigned long cpu, struct task_struct *p) -{ -} - -static inline void unstick_task(struct rq *rq, struct task_struct *p) -{ -} #endif /* @@ -1087,7 +999,6 @@ static inline void take_task(struct rq * { set_task_cpu(p, cpu_of(rq)); dequeue_task(p); - clear_sticky(p); dec_qnr(); } @@ -1442,13 +1353,6 @@ static void try_preempt(struct task_stru int highest_prio; cpumask_t tmp; - /* - * We clear the sticky flag here because for a task to have called - * try_preempt with the sticky flag enabled means some complicated - * re-scheduling has occurred and we should ignore the sticky flag. - */ - clear_sticky(p); - if (suitable_idle_cpus(p)) { resched_best_idle(p); return; @@ -1467,6 +1371,7 @@ static void try_preempt(struct task_stru highest_prio = -1; for_each_cpu_mask(cpu, tmp) { + u64 offset_deadline; struct rq *rq; int rq_prio; @@ -1475,9 +1380,12 @@ static void try_preempt(struct task_stru if (rq_prio < highest_prio) continue; - if (rq_prio > highest_prio || - deadline_after(rq->rq_deadline, latest_deadline)) { - latest_deadline = rq->rq_deadline; + offset_deadline = rq->rq_deadline - + cache_distance(this_rq, rq, p); + + if (rq_prio > highest_prio || (rq_prio == highest_prio && + deadline_after(offset_deadline, latest_deadline))) { + latest_deadline = offset_deadline; highest_prio = rq_prio; highest_prio_rq = rq; } @@ -1671,7 +1579,6 @@ void sched_fork(struct task_struct *p, i #endif p->oncpu = 0; - clear_sticky(p); #ifdef CONFIG_PREEMPT /* Want to start with kernel preemption disabled. */ @@ -2012,7 +1919,8 @@ unsigned long nr_active(void) unsigned long this_cpu_load(void) { return this_rq()->rq_running + - ((queued_notrunning() + nr_uninterruptible()) / grq.noc); + (queued_notrunning() + nr_uninterruptible()) / + (1 + num_online_cpus()); } /* Variables and functions for calc_load */ @@ -2065,81 +1973,6 @@ DEFINE_PER_CPU(struct kernel_stat, kstat EXPORT_PER_CPU_SYMBOL(kstat); -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - -/* - * There are no locks covering percpu hardirq/softirq time. - * They are only modified in account_system_vtime, on corresponding CPU - * with interrupts disabled. So, writes are safe. - * They are read and saved off onto struct rq in update_rq_clock(). - * This may result in other CPU reading this CPU's irq time and can - * race with irq/account_system_vtime on this CPU. We would either get old - * or new value (or semi updated value on 32 bit) with a side effect of - * accounting a slice of irq time to wrong task when irq is in progress - * while we read rq->clock. 
That is a worthy compromise in place of having - * locks on each irq in account_system_time. - */ -static DEFINE_PER_CPU(u64, cpu_hardirq_time); -static DEFINE_PER_CPU(u64, cpu_softirq_time); - -static DEFINE_PER_CPU(u64, irq_start_time); -static int sched_clock_irqtime; - -void enable_sched_clock_irqtime(void) -{ - sched_clock_irqtime = 1; -} - -void disable_sched_clock_irqtime(void) -{ - sched_clock_irqtime = 0; -} - -static u64 irq_time_cpu(int cpu) -{ - if (!sched_clock_irqtime) - return 0; - - return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); -} - -void account_system_vtime(struct task_struct *curr) -{ - unsigned long flags; - int cpu; - u64 now, delta; - - if (!sched_clock_irqtime) - return; - - local_irq_save(flags); - - cpu = smp_processor_id(); - now = sched_clock_cpu(cpu); - delta = now - per_cpu(irq_start_time, cpu); - per_cpu(irq_start_time, cpu) = now; - /* - * We do not account for softirq time from ksoftirqd here. - * We want to continue accounting softirq time to ksoftirqd thread - * in that case, so as not to confuse scheduler with a special task - * that do not consume any time, but still wants to run. - */ - if (hardirq_count()) - per_cpu(cpu_hardirq_time, cpu) += delta; - else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) - per_cpu(cpu_softirq_time, cpu) += delta; - - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(account_system_vtime); -#else - -static u64 irq_time_cpu(int cpu) -{ - return 0; -} -#endif - /* * On each tick, see what percentage of that tick was attributed to each * component and add the percentage to the _pc values. Once a _pc value has @@ -2698,14 +2531,9 @@ static inline u64 static_deadline_diff(i return prio_deadline_diff(USER_PRIO(static_prio)); } -static inline int longest_deadline_diff(void) -{ - return prio_deadline_diff(39); -} - static inline int ms_longest_deadline_diff(void) { - return NS_TO_MS(longest_deadline_diff()); + return NS_TO_MS(prio_deadline_diff(39)); } /* @@ -2775,19 +2603,7 @@ retry: goto out_take; } - /* - * Soft affinity happens here by not scheduling a task with - * its sticky flag set that ran on a different CPU last when - * the CPU is scaling, or by greatly biasing against its - * deadline when not. - */ - if (task_rq(p) != rq && task_sticky(p)) { - if (scaling_rq(rq)) - continue; - else - dl = p->deadline + longest_deadline_diff(); - } else - dl = p->deadline; + dl = p->deadline + cache_distance(task_rq(p), rq, p); /* * No rt tasks. Find the earliest deadline task. Now we're in @@ -2865,7 +2681,7 @@ static inline void set_rq_task(struct rq { rq->rq_time_slice = p->time_slice; rq->rq_deadline = p->deadline; - rq->rq_last_ran = p->last_ran = rq->clock; + rq->rq_last_ran = p->last_ran; rq->rq_policy = p->policy; rq->rq_prio = p->prio; if (p != rq->idle) @@ -2944,8 +2760,14 @@ need_resched_nonpreemptible: */ grq_unlock_irq(); goto rerun_prev_unlocked; - } else - swap_sticky(rq, cpu, prev); + } else { + /* + * If prev got kicked off by a task that has to + * run on this CPU for affinity reasons then + * there may be an idle CPU it can go to. 
+ */ + resched_suitable_idle(prev); + } } return_task(prev, deactivate); } @@ -2960,21 +2782,12 @@ need_resched_nonpreemptible: set_cpuidle_map(cpu); } else { next = earliest_deadline_task(rq, idle); - if (likely(next->prio != PRIO_LIMIT)) { - prefetch(next); - prefetch_stack(next); - clear_cpuidle_map(cpu); - } else - set_cpuidle_map(cpu); + prefetch(next); + prefetch_stack(next); + clear_cpuidle_map(cpu); } if (likely(prev != next)) { - /* - * Don't stick tasks when a real time task is going to run as - * they may literally get stuck. - */ - if (rt_task(next)) - unstick_task(rq, prev); sched_info_switch(prev, next); perf_event_task_sched_out(prev, next, cpu); @@ -4532,6 +4345,7 @@ void init_idle(struct task_struct *idle, rcu_read_unlock(); rq->curr = rq->idle = idle; idle->oncpu = 1; + set_cpuidle_map(cpu); grq_unlock_irqrestore(&flags); /* Set the preempt count _outside_ the spinlocks! */ @@ -4778,7 +4592,6 @@ static void break_sole_affinity(int src_ task_pid_nr(p), p->comm, src_cpu); } } - clear_sticky(p); } while_each_thread(t, p); } @@ -5040,7 +4853,6 @@ migration_call(struct notifier_block *nf set_rq_online(rq); } - grq.noc = num_online_cpus(); grq_unlock_irqrestore(&flags); break; @@ -5071,7 +4883,6 @@ migration_call(struct notifier_block *nf BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - grq.noc = num_online_cpus(); grq_unlock_irqrestore(&flags); break; #endif @@ -6595,7 +6406,7 @@ static int cache_cpu_idle(unsigned long void __init sched_init_smp(void) { struct sched_domain *sd; - int cpu; + int cpu, cpus; cpumask_var_t non_isolated_cpus; @@ -6629,6 +6440,14 @@ void __init sched_init_smp(void) BUG(); free_cpumask_var(non_isolated_cpus); + /* + * Assume that every added cpu gives us slightly less overall latency + * allowing us to increase the base rr_interval, non-linearly and with + * an upper bound. + */ + cpus = num_online_cpus(); + rr_interval = rr_interval * (4 * cpus + 4) / (cpus + 6); + grq_lock_irq(); /* * Set up the relative cache distance of each online cpu from each @@ -6717,7 +6536,6 @@ void __init sched_init(void) grq.last_jiffy = jiffies; spin_lock_init(&grq.iso_lock); grq.iso_ticks = grq.iso_refractory = 0; - grq.noc = 1; #ifdef CONFIG_SMP init_defrootdomain(); grq.qnr = grq.idle_cpus = 0; @@ -6731,7 +6549,6 @@ void __init sched_init(void) rq->iowait_pc = rq->idle_pc = 0; rq->dither = 0; #ifdef CONFIG_SMP - rq->sticky_task = NULL; rq->last_niffy = 0; rq->sd = NULL; rq->rd = NULL;
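
Illustration (not part of the diff): the sched_init_smp() hunk above rescales
rr_interval by (4 * cpus + 4) / (cpus + 6), so the base quantum grows
sub-linearly with the number of online CPUs and is bounded by four times its
original value. The standalone sketch below simply evaluates that expression
for a few CPU counts; the 6ms base value is the usual BFS default and is an
assumption here, as is the program name.

/*
 * rrscale.c - evaluate the rr_interval scaling applied in the
 * sched_init_smp() hunk for a range of CPU counts, assuming a base
 * rr_interval of 6ms.
 */
#include <stdio.h>

int main(void)
{
	int base_rr_interval = 6;	/* assumed BFS default, in milliseconds */
	int cpus;

	for (cpus = 1; cpus <= 64; cpus *= 2) {
		/* same integer arithmetic as the patched sched_init_smp() */
		int scaled = base_rr_interval * (4 * cpus + 4) / (cpus + 6);

		printf("%2d online cpus -> rr_interval %2d ms\n", cpus, scaled);
	}
	return 0;
}

Compiled with e.g. "cc -o rrscale rrscale.c", this prints 6, 9, 12, 15, 18, 20
and 22ms for 1 through 64 CPUs, approaching the 24ms upper bound non-linearly,
which matches the comment in the hunk about slightly lower overall latency per
added CPU.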