[BFS] Update to release 401

file:c0282002a079131b16f8256080158247e75e02d5 -> file:c10d956018f990109f30d189c4d322829e2e5703
--- a/Documentation/scheduler/sched-BFS.txt
+++ b/Documentation/scheduler/sched-BFS.txt
@@ -177,29 +177,26 @@ The first is the local copy of the runni
on to allow that data to be updated lockless where possible. Then there is
deference paid to the last CPU a task was running on, by trying that CPU first
when looking for an idle CPU to use the next time it's scheduled. Finally there
-is the notion of cache locality beyond the last running CPU. The sched_domains
-information is used to determine the relative virtual "cache distance" that
-other CPUs have from the last CPU a task was running on. CPUs with shared
-caches, such as SMT siblings, or multicore CPUs with shared caches, are treated
-as cache local. CPUs without shared caches are treated as not cache local, and
-CPUs on different NUMA nodes are treated as very distant. This "relative cache
-distance" is used by modifying the virtual deadline value when doing lookups.
-Effectively, the deadline is unaltered between "cache local" CPUs, doubled for
-"cache distant" CPUs, and quadrupled for "very distant" CPUs. The reasoning
-behind the doubling of deadlines is as follows. The real cost of migrating a
-task from one CPU to another is entirely dependant on the cache footprint of
-the task, how cache intensive the task is, how long it's been running on that
-CPU to take up the bulk of its cache, how big the CPU cache is, how fast and
-how layered the CPU cache is, how fast a context switch is... and so on. In
-other words, it's close to random in the real world where we do more than just
-one sole workload. The only thing we can be sure of is that it's not free. So
-BFS uses the principle that an idle CPU is a wasted CPU and utilising idle CPUs
-is more important than cache locality, and cache locality only plays a part
-after that. Doubling the effective deadline is based on the premise that the
-"cache local" CPUs will tend to work on the same tasks up to double the number
-of cache local CPUs, and once the workload is beyond that amount, it is likely
-that none of the tasks are cache warm anywhere anyway. The quadrupling for NUMA
-is a value I pulled out of my arse.
+is the notion of "sticky" tasks that are flagged when they are involuntarily
+descheduled, meaning they still want further CPU time. This sticky flag is
+used to bias heavily against those tasks being scheduled on a different CPU
+unless that CPU would be otherwise idle. When a CPU frequency governor that
+scales with CPU load, such as ondemand, is in use, sticky tasks are not
+scheduled on a different CPU at all and the other CPU is left to go idle.
+The CPU they are soft affined to is then more likely to increase its speed,
+speeding up total task execution time and likely decreasing power usage.
+This is the only scenario where BFS will allow a CPU to go idle in
+preference to scheduling a task on the earliest available spare CPU.
+
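
The net effect of the sticky flag and the scaling hint can be illustrated with
a small stand-alone C sketch of the rule the task selection code applies (see
the earliest_deadline_task() changes in kernel/sched_bfs.c below). The names
used here (fake_task, cpu_is_scaling, effective_deadline) are illustrative
stand-ins, not the kernel code itself:

    #include <stdio.h>
    #include <stdint.h>

    struct fake_task {
        uint64_t deadline;      /* virtual deadline */
        int sticky;             /* set when involuntarily descheduled */
        int last_cpu;           /* CPU the task last ran on */
    };

    /* Mirrors the per-runqueue "scaling" flag set by the cpufreq hook. */
    static const int cpu_is_scaling[3] = { 0, 1, 0 };

    /* Stand-in for longest_deadline_diff() in the patch. */
    #define LONGEST_DEADLINE_DIFF 100000ULL

    /*
     * Effective deadline of the task as seen from a given CPU, or
     * UINT64_MAX when it should be skipped entirely (sticky task on a
     * scaling CPU).
     */
    static uint64_t effective_deadline(const struct fake_task *p, int cpu)
    {
        if (p->last_cpu != cpu && p->sticky) {
            if (cpu_is_scaling[cpu])
                return UINT64_MAX;      /* never pull it here */
            return p->deadline + LONGEST_DEADLINE_DIFF;
        }
        return p->deadline;             /* local CPU or not sticky */
    }

    int main(void)
    {
        struct fake_task t = { .deadline = 5000, .sticky = 1, .last_cpu = 0 };
        int cpu;

        for (cpu = 0; cpu < 3; cpu++)
            printf("CPU%d sees deadline %llu\n", cpu,
                   (unsigned long long)effective_deadline(&t, cpu));
        return 0;
    }

Run on its own this prints the unmodified deadline for CPU0 (the task's last
CPU), UINT64_MAX for the scaling CPU1 (the task is never pulled there) and a
heavily penalised deadline for the non-scaling CPU2.
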
+The real cost of migrating a task from one CPU to another is entirely
+dependent on the cache footprint of the task, how cache intensive the task
+is, how long it's been running on that CPU and so how much of that cache it
+occupies, how big the CPU cache is, how fast and how layered it is, how fast
+a context switch is... and so on. In other words, in the real world, where
+more than one workload runs at a time, it's close to random. The only thing
+we can be sure of is that it's not free. So BFS uses the principle that an
+idle CPU is a wasted CPU, that utilising idle CPUs is more important than
+cache locality, and that cache locality only plays a part after that.
When choosing an idle CPU for a waking task, the cache locality is determined
according to where the task last ran and then idle CPUs are ranked from best
@@ -252,22 +249,21 @@ accessed in
/proc/sys/kernel/rr_interval
-The value is in milliseconds, and the default value is set to 6 on a
-uniprocessor machine, and automatically set to a progressively higher value on
-multiprocessor machines. The reasoning behind increasing the value on more CPUs
-is that the effective latency is decreased by virtue of there being more CPUs on
-BFS (for reasons explained above), and increasing the value allows for less
-cache contention and more throughput. Valid values are from 1 to 1000
-Decreasing the value will decrease latencies at the cost of decreasing
-throughput, while increasing it will improve throughput, but at the cost of
-worsening latencies. The accuracy of the rr interval is limited by HZ resolution
-of the kernel configuration. Thus, the worst case latencies are usually slightly
-higher than this actual value. The default value of 6 is not an arbitrary one.
-It is based on the fact that humans can detect jitter at approximately 7ms, so
-aiming for much lower latencies is pointless under most circumstances. It is
-worth noting this fact when comparing the latency performance of BFS to other
-schedulers. Worst case latencies being higher than 7ms are far worse than
-average latencies not being in the microsecond range.
+The value is in milliseconds, and the default value is set to 6ms. Valid values
+are from 1 to 1000. Decreasing the value will decrease latencies at the cost of
+decreasing throughput, while increasing it will improve throughput, but at the
+cost of worsening latencies. The accuracy of the rr interval is limited by HZ
+resolution of the kernel configuration. Thus, the worst case latencies are
+usually slightly higher than this actual value. BFS uses "dithering" to try to
+minimise the effect the HZ limitation has. The default value of 6 is not an
+arbitrary one. It is based on the fact that humans can detect jitter at
+approximately 7ms, so aiming for much lower latencies is pointless under most
+circumstances. It is worth noting this fact when comparing the latency
+performance of BFS to other schedulers. Worst case latencies being higher than
+7ms are far worse than average latencies not being in the microsecond range.
+Experimentation has shown that increasing the rr interval up to 300 can
+improve throughput, but beyond that point scheduling noise from elsewhere
+prevents any further demonstrable throughput gain.
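
For reference, rr_interval can be inspected and changed at runtime through the
proc path named above. A minimal user-space C sketch, with error handling
trimmed (writing requires root, and the value 300 written here is only the
throughput-oriented figure from the experimentation noted above):

    #include <stdio.h>

    #define RR_PATH "/proc/sys/kernel/rr_interval"

    int main(void)
    {
        int ms = 0;
        FILE *f = fopen(RR_PATH, "r");

        if (f) {
            if (fscanf(f, "%d", &ms) == 1)
                printf("current rr_interval: %dms\n", ms);
            fclose(f);
        }

        f = fopen(RR_PATH, "w");            /* needs root */
        if (f) {
            fprintf(f, "%d\n", 300);        /* valid values are 1 to 1000 */
            fclose(f);
        }
        return 0;
    }
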
Isochronous scheduling.
@@ -348,4 +344,4 @@ of total wall clock time taken and total
"cpu usage".
-Con Kolivas <kernel@kolivas.org> Fri Aug 27 2010
+Con Kolivas <kernel@kolivas.org> Tue, 5 Apr 2011
file:505e35815ef8eae678ce909c6889911ae25ca45b -> file:4fd8c860b7ca7d83dc068450b34ce202d3e49459
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -28,6 +28,7 @@
#include <linux/cpu.h>
#include <linux/completion.h>
#include <linux/mutex.h>
+#include <linux/sched.h>
#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_CORE, \
"cpufreq-core", msg)
@@ -1609,6 +1610,12 @@ int __cpufreq_driver_target(struct cpufr
target_freq, relation);
if (cpu_online(policy->cpu) && cpufreq_driver->target)
retval = cpufreq_driver->target(policy, target_freq, relation);
+ if (likely(retval != -EINVAL)) {
+ if (target_freq == policy->max)
+ cpu_nonscaling(policy->cpu);
+ else
+ cpu_scaling(policy->cpu);
+ }
return retval;
}
file:25688fd1585667512a560e7153df3376d8698504 -> file:7002ac0cae5a4a16fa6b49d34dc2f0e3c48e9902
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1252,7 +1252,9 @@ struct task_struct {
struct list_head run_list;
u64 last_ran;
u64 sched_time; /* sched_clock time spent running */
-
+#ifdef CONFIG_SMP
+ int sticky; /* Soft affined flag */
+#endif
unsigned long rt_timeout;
#else /* CONFIG_SCHED_BFS */
const struct sched_class *sched_class;
@@ -1574,6 +1576,8 @@ struct task_struct {
#ifdef CONFIG_SCHED_BFS
extern int grunqueue_is_locked(void);
extern void grq_unlock_wait(void);
+extern void cpu_scaling(int cpu);
+extern void cpu_nonscaling(int cpu);
#define tsk_seruntime(t) ((t)->sched_time)
#define tsk_rttimeout(t) ((t)->rt_timeout)
#define task_rq_unlock_wait(tsk) grq_unlock_wait()
@@ -1591,7 +1595,7 @@ static inline void tsk_cpus_current(stru
static inline void print_scheduler_version(void)
{
- printk(KERN_INFO"BFS CPU scheduler v0.363 by Con Kolivas.\n");
+ printk(KERN_INFO"BFS CPU scheduler v0.401 by Con Kolivas.\n");
}
static inline int iso_task(struct task_struct *p)
@@ -1601,6 +1605,13 @@ static inline int iso_task(struct task_s
#else
extern int runqueue_is_locked(int cpu);
extern void task_rq_unlock_wait(struct task_struct *p);
+static inline void cpu_scaling(int cpu)
+{
+}
+
+static inline void cpu_nonscaling(int cpu)
+{
+}
#define tsk_seruntime(t) ((t)->se.sum_exec_runtime)
#define tsk_rttimeout(t) ((t)->rt.timeout)
file:0f0e14f8cdb956035e72fd5ccb35f1278844858a -> file:168e84a745778904e5a69cafe0c13396ba938602
--- a/kernel/sched_bfs.c
+++ b/kernel/sched_bfs.c
@@ -85,7 +85,7 @@
#define idleprio_task(p) unlikely((p)->policy == SCHED_IDLEPRIO)
#define iso_task(p) unlikely((p)->policy == SCHED_ISO)
#define iso_queue(rq) unlikely((rq)->rq_policy == SCHED_ISO)
-#define ISO_PERIOD ((5 * HZ * num_online_cpus()) + 1)
+#define ISO_PERIOD ((5 * HZ * grq.noc) + 1)
/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
@@ -145,7 +145,7 @@ static int prio_ratios[PRIO_RANGE] __rea
* The quota handed out to tasks of all priority levels when refilling their
* time_slice.
*/
-static inline unsigned long timeslice(void)
+static inline int timeslice(void)
{
return MS_TO_US(rr_interval);
}
@@ -167,6 +167,7 @@ struct global_rq {
cpumask_t cpu_idle_map;
int idle_cpus;
#endif
+ int noc; /* num_online_cpus stored and updated when it changes */
u64 niffies; /* Nanosecond jiffies */
unsigned long last_jiffy; /* Last jiffy we updated niffies */
@@ -209,6 +210,8 @@ struct rq {
#ifdef CONFIG_SMP
int cpu; /* cpu of this runqueue */
int online;
+ int scaling; /* This CPU is managed by a scaling CPU freq governor */
+ struct task_struct *sticky_task;
struct root_domain *rd;
struct sched_domain *sd;
@@ -225,7 +228,11 @@ struct rq {
#endif
u64 last_niffy; /* Last time this RQ updated grq.niffies */
#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ u64 prev_irq_time;
+#endif
u64 clock, old_clock, last_tick;
+ u64 clock_task;
int dither;
#ifdef CONFIG_SCHEDSTATS
@@ -397,9 +404,17 @@ static inline void update_clocks(struct
* when we're not updating niffies.
* Looking up task_rq must be done under grq.lock to be safe.
*/
+static u64 irq_time_cpu(int cpu);
+
static inline void update_rq_clock(struct rq *rq)
{
- rq->clock = sched_clock_cpu(cpu_of(rq));
+ int cpu = cpu_of(rq);
+ u64 irq_time;
+
+ rq->clock = sched_clock_cpu(cpu);
+ irq_time = irq_time_cpu(cpu);
+ if (rq->clock - irq_time > rq->clock_task)
+ rq->clock_task = rq->clock - irq_time;
}
static inline int task_running(struct task_struct *p)
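
To illustrate what the clock_task bookkeeping in update_rq_clock() achieves,
here is a stand-alone C sketch with made-up numbers (not kernel code): the
task clock is the raw clock minus the accumulated irq time, and the guard
keeps it from ever moving backwards when irq time grows faster than the clock
between two updates.

    #include <stdio.h>
    #include <stdint.h>

    struct fake_rq {
        uint64_t clock;         /* raw sched clock */
        uint64_t clock_task;    /* clock minus time spent servicing irqs */
    };

    /* Same guard as update_rq_clock(): clock_task only moves forwards. */
    static void update_clock(struct fake_rq *rq, uint64_t now, uint64_t irq)
    {
        rq->clock = now;
        if (now - irq > rq->clock_task)
            rq->clock_task = now - irq;
    }

    int main(void)
    {
        struct fake_rq rq = { 0, 0 };

        update_clock(&rq, 1000, 100);   /* clock_task becomes 900 */
        update_clock(&rq, 1100, 300);   /* 800 < 900, left unchanged */
        update_clock(&rq, 1500, 350);   /* clock_task becomes 1150 */
        printf("clock=%llu clock_task=%llu\n",
               (unsigned long long)rq.clock,
               (unsigned long long)rq.clock_task);
        return 0;
    }
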
@@ -751,10 +766,8 @@ static void resched_task(struct task_str
/*
* The best idle CPU is chosen according to the CPUIDLE ranking above where the
- * lowest value would give the most suitable CPU to schedule p onto next. We
- * iterate from the last CPU upwards instead of using for_each_cpu_mask so as
- * to be able to break out immediately if the last CPU is idle. The order works
- * out to be the following:
+ * lowest value would give the most suitable CPU to schedule p onto next. The
+ * order works out to be the following:
*
* Same core, idle or busy cache, idle threads
* Other core, same cache, idle or busy cache, idle threads.
@@ -766,38 +779,19 @@ static void resched_task(struct task_str
* Other node, other CPU, idle cache, idle threads.
* Other node, other CPU, busy cache, idle threads.
* Other node, other CPU, busy threads.
- *
- * If p was the last task running on this rq, then regardless of where
- * it has been running since then, it is cache warm on this rq.
*/
-static void resched_best_idle(struct task_struct *p)
+static void
+resched_best_mask(unsigned long best_cpu, struct rq *rq, cpumask_t *tmpmask)
{
- unsigned long cpu_tmp, best_cpu, best_ranking;
- cpumask_t tmpmask;
- struct rq *rq;
- int iterate;
+ unsigned long cpu_tmp, best_ranking;
- cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map);
- iterate = cpus_weight(tmpmask);
- best_cpu = task_cpu(p);
- /*
- * Start below the last CPU and work up with next_cpu as the last
- * CPU might not be idle or affinity might not allow it.
- */
- cpu_tmp = best_cpu - 1;
- rq = cpu_rq(best_cpu);
best_ranking = ~0UL;
- do {
+ for_each_cpu_mask(cpu_tmp, *tmpmask) {
unsigned long ranking;
struct rq *tmp_rq;
ranking = 0;
- cpu_tmp = next_cpu(cpu_tmp, tmpmask);
- if (cpu_tmp >= nr_cpu_ids) {
- cpu_tmp = -1;
- cpu_tmp = next_cpu(cpu_tmp, tmpmask);
- }
tmp_rq = cpu_rq(cpu_tmp);
#ifdef CONFIG_NUMA
@@ -825,37 +819,42 @@ static void resched_best_idle(struct tas
break;
best_ranking = ranking;
}
- } while (--iterate > 0);
+ }
resched_task(cpu_rq(best_cpu)->curr);
}
+static void resched_best_idle(struct task_struct *p)
+{
+ cpumask_t tmpmask;
+
+ cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map);
+ resched_best_mask(task_cpu(p), task_rq(p), &tmpmask);
+}
+
static inline void resched_suitable_idle(struct task_struct *p)
{
if (suitable_idle_cpus(p))
resched_best_idle(p);
}
-
/*
- * The cpu cache locality difference between CPUs is used to determine how far
- * to offset the virtual deadline. <2 difference in locality means that one
- * timeslice difference is allowed longer for the cpu local tasks. This is
- * enough in the common case when tasks are up to 2* number of CPUs to keep
- * tasks within their shared cache CPUs only. CPUs on different nodes or not
- * even in this domain (NUMA) have "4" difference, allowing 4 times longer
- * deadlines before being taken onto another cpu, allowing for 2* the double
- * seen by separate CPUs above.
- * Simple summary: Virtual deadlines are equal on shared cache CPUs, double
- * on separate CPUs and quadruple in separate NUMA nodes.
+ * Flags to tell us whether this CPU is running a CPU frequency governor that
+ * has slowed its speed or not. No locking required as the very rare wrongly
+ * read value would be harmless.
*/
-static inline int
-cache_distance(struct rq *task_rq, struct rq *rq, struct task_struct *p)
+void cpu_scaling(int cpu)
{
- int locality = rq->cpu_locality[cpu_of(task_rq)] - 2;
+ cpu_rq(cpu)->scaling = 1;
+}
- if (locality > 0)
- return task_timeslice(p) << locality;
- return 0;
+void cpu_nonscaling(int cpu)
+{
+ cpu_rq(cpu)->scaling = 0;
+}
+
+static inline int scaling_rq(struct rq *rq)
+{
+ return rq->scaling;
}
#else /* CONFIG_SMP */
static inline void inc_qnr(void)
@@ -888,12 +887,25 @@ static inline void resched_suitable_idle
{
}
-static inline int
-cache_distance(struct rq *task_rq, struct rq *rq, struct task_struct *p)
+void cpu_scaling(int __unused)
+{
+}
+
+void cpu_nonscaling(int __unused)
+{
+}
+
+/*
+ * Although CPUs can scale in UP, there is nowhere else for tasks to go so this
+ * always returns 0.
+ */
+static inline int scaling_rq(struct rq *rq)
{
return 0;
}
#endif /* CONFIG_SMP */
+EXPORT_SYMBOL_GPL(cpu_scaling);
+EXPORT_SYMBOL_GPL(cpu_nonscaling);
/*
* activate_idle_task - move idle task to the _front_ of runqueue.
@@ -989,6 +1001,82 @@ void set_task_cpu(struct task_struct *p,
smp_wmb();
task_thread_info(p)->cpu = cpu;
}
+
+static inline void clear_sticky(struct task_struct *p)
+{
+ p->sticky = 0;
+}
+
+static inline int task_sticky(struct task_struct *p)
+{
+ return p->sticky;
+}
+
+/* Reschedule the best idle CPU that is not this one. */
+static void
+resched_closest_idle(struct rq *rq, unsigned long cpu, struct task_struct *p)
+{
+ cpumask_t tmpmask;
+
+ cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map);
+ cpu_clear(cpu, tmpmask);
+ if (cpus_empty(tmpmask))
+ return;
+ resched_best_mask(cpu, rq, &tmpmask);
+}
+
+/*
+ * We set the sticky flag on a task that is descheduled involuntarily meaning
+ * it is awaiting further CPU time. If the last sticky task is still sticky
+ * but unlucky enough to not be the next task scheduled, we unstick it and try
+ * to find it an idle CPU. Realtime tasks do not stick to minimise their
+ * latency at all times.
+ */
+static inline void
+swap_sticky(struct rq *rq, unsigned long cpu, struct task_struct *p)
+{
+ if (rq->sticky_task) {
+ if (rq->sticky_task == p) {
+ p->sticky = 1;
+ return;
+ }
+ if (rq->sticky_task->sticky) {
+ rq->sticky_task->sticky = 0;
+ resched_closest_idle(rq, cpu, rq->sticky_task);
+ }
+ }
+ if (!rt_task(p)) {
+ p->sticky = 1;
+ rq->sticky_task = p;
+ } else {
+ resched_closest_idle(rq, cpu, p);
+ rq->sticky_task = NULL;
+ }
+}
+
+static inline void unstick_task(struct rq *rq, struct task_struct *p)
+{
+ rq->sticky_task = NULL;
+ clear_sticky(p);
+}
+#else
+static inline void clear_sticky(struct task_struct *p)
+{
+}
+
+static inline int task_sticky(struct task_struct *p)
+{
+ return 0;
+}
+
+static inline void
+swap_sticky(struct rq *rq, unsigned long cpu, struct task_struct *p)
+{
+}
+
+static inline void unstick_task(struct rq *rq, struct task_struct *p)
+{
+}
#endif
/*
@@ -999,6 +1087,7 @@ static inline void take_task(struct rq *
{
set_task_cpu(p, cpu_of(rq));
dequeue_task(p);
+ clear_sticky(p);
dec_qnr();
}
@@ -1353,6 +1442,13 @@ static void try_preempt(struct task_stru
int highest_prio;
cpumask_t tmp;
+ /*
+ * We clear the sticky flag here because for a task to have called
+ * try_preempt with the sticky flag enabled means some complicated
+ * re-scheduling has occurred and we should ignore the sticky flag.
+ */
+ clear_sticky(p);
+
if (suitable_idle_cpus(p)) {
resched_best_idle(p);
return;
@@ -1371,7 +1467,6 @@ static void try_preempt(struct task_stru
highest_prio = -1;
for_each_cpu_mask(cpu, tmp) {
- u64 offset_deadline;
struct rq *rq;
int rq_prio;
@@ -1380,12 +1475,9 @@ static void try_preempt(struct task_stru
if (rq_prio < highest_prio)
continue;
- offset_deadline = rq->rq_deadline -
- cache_distance(this_rq, rq, p);
-
- if (rq_prio > highest_prio || (rq_prio == highest_prio &&
- deadline_after(offset_deadline, latest_deadline))) {
- latest_deadline = offset_deadline;
+ if (rq_prio > highest_prio ||
+ deadline_after(rq->rq_deadline, latest_deadline)) {
+ latest_deadline = rq->rq_deadline;
highest_prio = rq_prio;
highest_prio_rq = rq;
}
@@ -1579,6 +1671,7 @@ void sched_fork(struct task_struct *p, i
#endif
p->oncpu = 0;
+ clear_sticky(p);
#ifdef CONFIG_PREEMPT
/* Want to start with kernel preemption disabled. */
@@ -1919,8 +2012,7 @@ unsigned long nr_active(void)
unsigned long this_cpu_load(void)
{
return this_rq()->rq_running +
- (queued_notrunning() + nr_uninterruptible()) /
- (1 + num_online_cpus());
+ ((queued_notrunning() + nr_uninterruptible()) / grq.noc);
}
/* Variables and functions for calc_load */
@@ -1973,6 +2065,81 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
EXPORT_PER_CPU_SYMBOL(kstat);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in account_system_vtime, on corresponding CPU
+ * with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in another CPU reading this CPU's irq time and can
+ * race with irq/account_system_vtime on this CPU. We would either get the
+ * old or new value (or a semi-updated value on 32 bit) with a side effect of
+ * accounting a slice of irq time to the wrong task when irq is in progress
+ * while we read rq->clock. That is a worthy compromise in place of having
+ * locks on each irq in account_system_time.
+ */
+static DEFINE_PER_CPU(u64, cpu_hardirq_time);
+static DEFINE_PER_CPU(u64, cpu_softirq_time);
+
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 0;
+}
+
+static u64 irq_time_cpu(int cpu)
+{
+ if (!sched_clock_irqtime)
+ return 0;
+
+ return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+
+void account_system_vtime(struct task_struct *curr)
+{
+ unsigned long flags;
+ int cpu;
+ u64 now, delta;
+
+ if (!sched_clock_irqtime)
+ return;
+
+ local_irq_save(flags);
+
+ cpu = smp_processor_id();
+ now = sched_clock_cpu(cpu);
+ delta = now - per_cpu(irq_start_time, cpu);
+ per_cpu(irq_start_time, cpu) = now;
+ /*
+ * We do not account for softirq time from ksoftirqd here.
+ * We want to continue accounting softirq time to the ksoftirqd thread
+ * in that case, so as not to confuse the scheduler with a special task
+ * that does not consume any time, but still wants to run.
+ */
+ if (hardirq_count())
+ per_cpu(cpu_hardirq_time, cpu) += delta;
+ else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+ per_cpu(cpu_softirq_time, cpu) += delta;
+
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(account_system_vtime);
+#else
+
+static u64 irq_time_cpu(int cpu)
+{
+ return 0;
+}
+#endif
+
/*
* On each tick, see what percentage of that tick was attributed to each
* component and add the percentage to the _pc values. Once a _pc value has
@@ -2531,9 +2698,14 @@ static inline u64 static_deadline_diff(i
return prio_deadline_diff(USER_PRIO(static_prio));
}
+static inline int longest_deadline_diff(void)
+{
+ return prio_deadline_diff(39);
+}
+
static inline int ms_longest_deadline_diff(void)
{
- return NS_TO_MS(prio_deadline_diff(39));
+ return NS_TO_MS(longest_deadline_diff());
}
/*
@@ -2603,7 +2775,19 @@ retry:
goto out_take;
}
- dl = p->deadline + cache_distance(task_rq(p), rq, p);
+ /*
+ * Soft affinity happens here by not scheduling a task with
+ * its sticky flag set that ran on a different CPU last when
+ * the CPU is scaling, or by greatly biasing against its
+ * deadline when not.
+ */
+ if (task_rq(p) != rq && task_sticky(p)) {
+ if (scaling_rq(rq))
+ continue;
+ else
+ dl = p->deadline + longest_deadline_diff();
+ } else
+ dl = p->deadline;
/*
* No rt tasks. Find the earliest deadline task. Now we're in
@@ -2681,7 +2865,7 @@ static inline void set_rq_task(struct rq
{
rq->rq_time_slice = p->time_slice;
rq->rq_deadline = p->deadline;
- rq->rq_last_ran = p->last_ran;
+ rq->rq_last_ran = p->last_ran = rq->clock;
rq->rq_policy = p->policy;
rq->rq_prio = p->prio;
if (p != rq->idle)
@@ -2760,14 +2944,8 @@ need_resched_nonpreemptible:
*/
grq_unlock_irq();
goto rerun_prev_unlocked;
- } else {
- /*
- * If prev got kicked off by a task that has to
- * run on this CPU for affinity reasons then
- * there may be an idle CPU it can go to.
- */
- resched_suitable_idle(prev);
- }
+ } else
+ swap_sticky(rq, cpu, prev);
}
return_task(prev, deactivate);
}
@@ -2782,12 +2960,21 @@ need_resched_nonpreemptible:
set_cpuidle_map(cpu);
} else {
next = earliest_deadline_task(rq, idle);
- prefetch(next);
- prefetch_stack(next);
- clear_cpuidle_map(cpu);
+ if (likely(next->prio != PRIO_LIMIT)) {
+ prefetch(next);
+ prefetch_stack(next);
+ clear_cpuidle_map(cpu);
+ } else
+ set_cpuidle_map(cpu);
}
if (likely(prev != next)) {
+ /*
+ * Don't stick tasks when a real time task is going to run as
+ * they may literally get stuck.
+ */
+ if (rt_task(next))
+ unstick_task(rq, prev);
sched_info_switch(prev, next);
perf_event_task_sched_out(prev, next, cpu);
@@ -4345,7 +4532,6 @@ void init_idle(struct task_struct *idle,
rcu_read_unlock();
rq->curr = rq->idle = idle;
idle->oncpu = 1;
- set_cpuidle_map(cpu);
grq_unlock_irqrestore(&flags);
/* Set the preempt count _outside_ the spinlocks! */
@@ -4592,6 +4778,7 @@ static void break_sole_affinity(int src_
task_pid_nr(p), p->comm, src_cpu);
}
}
+ clear_sticky(p);
} while_each_thread(t, p);
}
@@ -4853,6 +5040,7 @@ migration_call(struct notifier_block *nf
set_rq_online(rq);
}
+ grq.noc = num_online_cpus();
grq_unlock_irqrestore(&flags);
break;
@@ -4883,6 +5071,7 @@ migration_call(struct notifier_block *nf
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
+ grq.noc = num_online_cpus();
grq_unlock_irqrestore(&flags);
break;
#endif
@@ -6406,7 +6595,7 @@ static int cache_cpu_idle(unsigned long
void __init sched_init_smp(void)
{
struct sched_domain *sd;
- int cpu, cpus;
+ int cpu;
cpumask_var_t non_isolated_cpus;
@@ -6440,14 +6629,6 @@ void __init sched_init_smp(void)
BUG();
free_cpumask_var(non_isolated_cpus);
- /*
- * Assume that every added cpu gives us slightly less overall latency
- * allowing us to increase the base rr_interval, non-linearly and with
- * an upper bound.
- */
- cpus = num_online_cpus();
- rr_interval = rr_interval * (4 * cpus + 4) / (cpus + 6);
-
grq_lock_irq();
/*
* Set up the relative cache distance of each online cpu from each
@@ -6536,6 +6717,7 @@ void __init sched_init(void)
grq.last_jiffy = jiffies;
spin_lock_init(&grq.iso_lock);
grq.iso_ticks = grq.iso_refractory = 0;
+ grq.noc = 1;
#ifdef CONFIG_SMP
init_defrootdomain();
grq.qnr = grq.idle_cpus = 0;
@@ -6549,6 +6731,7 @@ void __init sched_init(void)
rq->iowait_pc = rq->idle_pc = 0;
rq->dither = 0;
#ifdef CONFIG_SMP
+ rq->sticky_task = NULL;
rq->last_niffy = 0;
rq->sd = NULL;
rq->rd = NULL;