--- a/Documentation/scheduler/sched-BFS.txt
+++ b/Documentation/scheduler/sched-BFS.txt
@@ -177,29 +177,26 @@ The first is the local copy of the runni
on to allow that data to be updated lockless where possible. Then there is
deference paid to the last CPU a task was running on, by trying that CPU first
when looking for an idle CPU to use the next time it's scheduled. Finally there
-is the notion of cache locality beyond the last running CPU. The sched_domains
-information is used to determine the relative virtual "cache distance" that
-other CPUs have from the last CPU a task was running on. CPUs with shared
-caches, such as SMT siblings, or multicore CPUs with shared caches, are treated
-as cache local. CPUs without shared caches are treated as not cache local, and
-CPUs on different NUMA nodes are treated as very distant. This "relative cache
-distance" is used by modifying the virtual deadline value when doing lookups.
-Effectively, the deadline is unaltered between "cache local" CPUs, doubled for
-"cache distant" CPUs, and quadrupled for "very distant" CPUs. The reasoning
-behind the doubling of deadlines is as follows. The real cost of migrating a
-task from one CPU to another is entirely dependant on the cache footprint of
-the task, how cache intensive the task is, how long it's been running on that
-CPU to take up the bulk of its cache, how big the CPU cache is, how fast and
-how layered the CPU cache is, how fast a context switch is... and so on. In
-other words, it's close to random in the real world where we do more than just
-one sole workload. The only thing we can be sure of is that it's not free. So
-BFS uses the principle that an idle CPU is a wasted CPU and utilising idle CPUs
-is more important than cache locality, and cache locality only plays a part
-after that. Doubling the effective deadline is based on the premise that the
-"cache local" CPUs will tend to work on the same tasks up to double the number
-of cache local CPUs, and once the workload is beyond that amount, it is likely
-that none of the tasks are cache warm anywhere anyway. The quadrupling for NUMA
-is a value I pulled out of my arse.
+is the notion of "sticky" tasks, which are flagged when they are involuntarily
+descheduled, meaning they still want further CPU time. This sticky flag is
+used to bias heavily against those tasks being scheduled on a different CPU
+unless that CPU would otherwise be idle. When a CPU frequency governor that
+scales with CPU load, such as ondemand, is in use, sticky tasks are not
+scheduled on a different CPU at all, preferring instead to go idle. This means
+the CPU they were sticky on is more likely to increase its speed while the
+other CPU goes idle, thus speeding up total task execution time and likely
+decreasing power usage. This is the only scenario where BFS will allow a CPU
+to go idle in preference to scheduling a task on the earliest available spare
+CPU.
+
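+A simplified sketch of how this soft affinity is applied when a CPU picks its
+next task, mirroring the logic in earliest_deadline_task() in
+kernel/sched_bfs.c:
+
+	/* p last ran on a different CPU and is still flagged sticky there */
+	if (task_rq(p) != rq && task_sticky(p)) {
+		if (scaling_rq(rq))
+			continue;	/* scaling CPU: skip p, may go idle */
+		else
+			dl = p->deadline + longest_deadline_diff();
+	} else
+		dl = p->deadline;
+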
+The real cost of migrating a task from one CPU to another is entirely dependent
+on the cache footprint of the task, how cache intensive the task is, how long
+it's been running on that CPU to take up the bulk of its cache, how big the CPU
+cache is, how fast and how layered the CPU cache is, how fast a context switch
+is... and so on. In other words, it's close to random in the real world, where
+we do more than just one sole workload. The only thing we can be sure of is
+that it's not free. So BFS uses the principle that an idle CPU is a wasted CPU:
+utilising idle CPUs is more important than cache locality, and cache locality
+only plays a part after that.
When choosing an idle CPU for a waking task, the cache locality is determined
according to where the task last ran and then idle CPUs are ranked from best
@@ -252,22 +249,21 @@ accessed in
/proc/sys/kernel/rr_interval
-The value is in milliseconds, and the default value is set to 6 on a
-uniprocessor machine, and automatically set to a progressively higher value on
-multiprocessor machines. The reasoning behind increasing the value on more CPUs
-is that the effective latency is decreased by virtue of there being more CPUs on
-BFS (for reasons explained above), and increasing the value allows for less
-cache contention and more throughput. Valid values are from 1 to 1000
-Decreasing the value will decrease latencies at the cost of decreasing
-throughput, while increasing it will improve throughput, but at the cost of
-worsening latencies. The accuracy of the rr interval is limited by HZ resolution
-of the kernel configuration. Thus, the worst case latencies are usually slightly
-higher than this actual value. The default value of 6 is not an arbitrary one.
-It is based on the fact that humans can detect jitter at approximately 7ms, so
-aiming for much lower latencies is pointless under most circumstances. It is
-worth noting this fact when comparing the latency performance of BFS to other
-schedulers. Worst case latencies being higher than 7ms are far worse than
-average latencies not being in the microsecond range.
+The value is in milliseconds, and the default is 6. Valid values are from 1 to
+1000. Decreasing the value will decrease latencies at the cost of decreasing
+throughput, while increasing it will improve throughput, but at the cost of
+worsening latencies. The accuracy of the rr interval is limited by the HZ
+resolution of the kernel configuration. Thus, the worst case latencies are
+usually slightly higher than this actual value. BFS uses "dithering" to try to
+minimise the effect this HZ limitation has. The default value of 6 is not an
+arbitrary one. It is based on the fact that humans can detect jitter at
+approximately 7ms, so aiming for much lower latencies is pointless under most
+circumstances. It is worth noting this fact when comparing the latency
+performance of BFS to other schedulers. Worst case latencies being higher than
+7ms are far worse than average latencies not being in the microsecond range.
+Experimentation has shown that increasing rr_interval up to 300 can improve
+throughput, but beyond that, scheduling noise from elsewhere prevents any
+further demonstrable throughput gain.
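+
+A minimal userspace sketch of reading and adjusting the tunable (run as root;
+error handling omitted):
+
+	#include <stdio.h>
+
+	int main(void)
+	{
+		FILE *f = fopen("/proc/sys/kernel/rr_interval", "r");
+		int rr;
+
+		fscanf(f, "%d", &rr);
+		fclose(f);
+		printf("rr_interval is %dms\n", rr);
+
+		f = fopen("/proc/sys/kernel/rr_interval", "w");
+		fprintf(f, "%d\n", 300);	/* favour throughput over latency */
+		fclose(f);
+		return 0;
+	}
+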
Isochronous scheduling.
@@ -348,4 +344,4 @@ of total wall clock time taken and total
"cpu usage".
-Con Kolivas <kernel@kolivas.org> Fri Aug 27 2010
+Con Kolivas <kernel@kolivas.org> Tue, 5 Apr 2011
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -28,6 +28,7 @@
#include <linux/cpu.h>
#include <linux/completion.h>
#include <linux/mutex.h>
+#include <linux/sched.h>
#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_CORE, \
"cpufreq-core", msg)
@@ -1609,6 +1610,12 @@ int __cpufreq_driver_target(struct cpufr
target_freq, relation);
if (cpu_online(policy->cpu) && cpufreq_driver->target)
retval = cpufreq_driver->target(policy, target_freq, relation);
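+	/*
+	 * Let the scheduler know whether this CPU is now at its maximum
+	 * frequency or at a scaled-down one, so BFS can decide whether sticky
+	 * tasks from other CPUs may be scheduled on it.
+	 */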
+ if (likely(retval != -EINVAL)) {
+ if (target_freq == policy->max)
+ cpu_nonscaling(policy->cpu);
+ else
+ cpu_scaling(policy->cpu);
+ }
return retval;
}
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1252,7 +1252,9 @@ struct task_struct {
struct list_head run_list;
u64 last_ran;
u64 sched_time; /* sched_clock time spent running */
-
+#ifdef CONFIG_SMP
+ int sticky; /* Soft affined flag */
+#endif
unsigned long rt_timeout;
#else /* CONFIG_SCHED_BFS */
const struct sched_class *sched_class;
@@ -1574,6 +1576,8 @@ struct task_struct {
#ifdef CONFIG_SCHED_BFS
extern int grunqueue_is_locked(void);
extern void grq_unlock_wait(void);
+extern void cpu_scaling(int cpu);
+extern void cpu_nonscaling(int cpu);
#define tsk_seruntime(t) ((t)->sched_time)
#define tsk_rttimeout(t) ((t)->rt_timeout)
#define task_rq_unlock_wait(tsk) grq_unlock_wait()
@@ -1591,7 +1595,7 @@ static inline void tsk_cpus_current(stru
static inline void print_scheduler_version(void)
{
- printk(KERN_INFO"BFS CPU scheduler v0.363 by Con Kolivas.\n");
+ printk(KERN_INFO"BFS CPU scheduler v0.401 by Con Kolivas.\n");
}
static inline int iso_task(struct task_struct *p)
@@ -1601,6 +1605,13 @@ static inline int iso_task(struct task_s
#else
extern int runqueue_is_locked(int cpu);
extern void task_rq_unlock_wait(struct task_struct *p);
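+/*
+ * Dummy versions for when BFS is not the configured scheduler: the CPU
+ * frequency scaling hints are simply ignored.
+ */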
+static inline void cpu_scaling(int cpu)
+{
+}
+
+static inline void cpu_nonscaling(int cpu)
+{
+}
#define tsk_seruntime(t) ((t)->se.sum_exec_runtime)
#define tsk_rttimeout(t) ((t)->rt.timeout)
--- a/kernel/sched_bfs.c
+++ b/kernel/sched_bfs.c
@@ -85,7 +85,7 @@
#define idleprio_task(p) unlikely((p)->policy == SCHED_IDLEPRIO)
#define iso_task(p) unlikely((p)->policy == SCHED_ISO)
#define iso_queue(rq) unlikely((rq)->rq_policy == SCHED_ISO)
-#define ISO_PERIOD ((5 * HZ * num_online_cpus()) + 1)
+#define ISO_PERIOD ((5 * HZ * grq.noc) + 1)
/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
@@ -145,7 +145,7 @@ static int prio_ratios[PRIO_RANGE] __rea
* The quota handed out to tasks of all priority levels when refilling their
* time_slice.
*/
-static inline unsigned long timeslice(void)
+static inline int timeslice(void)
{
return MS_TO_US(rr_interval);
}
@@ -167,6 +167,7 @@ struct global_rq {
cpumask_t cpu_idle_map;
int idle_cpus;
#endif
+ int noc; /* num_online_cpus stored and updated when it changes */
u64 niffies; /* Nanosecond jiffies */
unsigned long last_jiffy; /* Last jiffy we updated niffies */
@@ -209,6 +210,8 @@ struct rq {
#ifdef CONFIG_SMP
int cpu; /* cpu of this runqueue */
int online;
+ int scaling; /* This CPU is managed by a scaling CPU freq governor */
+	struct task_struct *sticky_task; /* Last task made sticky on this CPU */
struct root_domain *rd;
struct sched_domain *sd;
@@ -225,7 +228,11 @@ struct rq {
#endif
u64 last_niffy; /* Last time this RQ updated grq.niffies */
#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ u64 prev_irq_time;
+#endif
u64 clock, old_clock, last_tick;
+ u64 clock_task;
int dither;
#ifdef CONFIG_SCHEDSTATS
@@ -397,9 +404,17 @@ static inline void update_clocks(struct
* when we're not updating niffies.
* Looking up task_rq must be done under grq.lock to be safe.
*/
+static u64 irq_time_cpu(int cpu);
+
static inline void update_rq_clock(struct rq *rq)
{
- rq->clock = sched_clock_cpu(cpu_of(rq));
+ int cpu = cpu_of(rq);
+ u64 irq_time;
+
+ rq->clock = sched_clock_cpu(cpu);
+ irq_time = irq_time_cpu(cpu);
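+	/* clock_task excludes time spent servicing IRQs and only moves forward */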
+ if (rq->clock - irq_time > rq->clock_task)
+ rq->clock_task = rq->clock - irq_time;
}
static inline int task_running(struct task_struct *p)
@@ -751,10 +766,8 @@ static void resched_task(struct task_str
/*
* The best idle CPU is chosen according to the CPUIDLE ranking above where the
- * lowest value would give the most suitable CPU to schedule p onto next. We
- * iterate from the last CPU upwards instead of using for_each_cpu_mask so as
- * to be able to break out immediately if the last CPU is idle. The order works
- * out to be the following:
+ * lowest value would give the most suitable CPU to schedule p onto next. The
+ * order works out to be the following:
*
* Same core, idle or busy cache, idle threads
* Other core, same cache, idle or busy cache, idle threads.
@@ -766,38 +779,19 @@ static void resched_task(struct task_str
* Other node, other CPU, idle cache, idle threads.
* Other node, other CPU, busy cache, idle threads.
* Other node, other CPU, busy threads.
- *
- * If p was the last task running on this rq, then regardless of where
- * it has been running since then, it is cache warm on this rq.
*/
-static void resched_best_idle(struct task_struct *p)
+static void
+resched_best_mask(unsigned long best_cpu, struct rq *rq, cpumask_t *tmpmask)
{
- unsigned long cpu_tmp, best_cpu, best_ranking;
- cpumask_t tmpmask;
- struct rq *rq;
- int iterate;
+ unsigned long cpu_tmp, best_ranking;
- cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map);
- iterate = cpus_weight(tmpmask);
- best_cpu = task_cpu(p);
- /*
- * Start below the last CPU and work up with next_cpu as the last
- * CPU might not be idle or affinity might not allow it.
- */
- cpu_tmp = best_cpu - 1;
- rq = cpu_rq(best_cpu);
best_ranking = ~0UL;
- do {
+ for_each_cpu_mask(cpu_tmp, *tmpmask) {
unsigned long ranking;
struct rq *tmp_rq;
ranking = 0;
- cpu_tmp = next_cpu(cpu_tmp, tmpmask);
- if (cpu_tmp >= nr_cpu_ids) {
- cpu_tmp = -1;
- cpu_tmp = next_cpu(cpu_tmp, tmpmask);
- }
tmp_rq = cpu_rq(cpu_tmp);
#ifdef CONFIG_NUMA
@@ -825,37 +819,42 @@ static void resched_best_idle(struct tas
break;
best_ranking = ranking;
}
- } while (--iterate > 0);
+ }
resched_task(cpu_rq(best_cpu)->curr);
}
+static void resched_best_idle(struct task_struct *p)
+{
+ cpumask_t tmpmask;
+
+ cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map);
+ resched_best_mask(task_cpu(p), task_rq(p), &tmpmask);
+}
+
static inline void resched_suitable_idle(struct task_struct *p)
{
if (suitable_idle_cpus(p))
resched_best_idle(p);
}
-
/*
- * The cpu cache locality difference between CPUs is used to determine how far
- * to offset the virtual deadline. <2 difference in locality means that one
- * timeslice difference is allowed longer for the cpu local tasks. This is
- * enough in the common case when tasks are up to 2* number of CPUs to keep
- * tasks within their shared cache CPUs only. CPUs on different nodes or not
- * even in this domain (NUMA) have "4" difference, allowing 4 times longer
- * deadlines before being taken onto another cpu, allowing for 2* the double
- * seen by separate CPUs above.
- * Simple summary: Virtual deadlines are equal on shared cache CPUs, double
- * on separate CPUs and quadruple in separate NUMA nodes.
+ * Flags to tell us whether this CPU is running a CPU frequency governor that
+ * has slowed its speed or not. No locking required as the very rare wrongly
+ * read value would be harmless.
*/
-static inline int
-cache_distance(struct rq *task_rq, struct rq *rq, struct task_struct *p)
+void cpu_scaling(int cpu)
{
- int locality = rq->cpu_locality[cpu_of(task_rq)] - 2;
+ cpu_rq(cpu)->scaling = 1;
+}
- if (locality > 0)
- return task_timeslice(p) << locality;
- return 0;
+void cpu_nonscaling(int cpu)
+{
+ cpu_rq(cpu)->scaling = 0;
+}
+
+static inline int scaling_rq(struct rq *rq)
+{
+ return rq->scaling;
}
#else /* CONFIG_SMP */
static inline void inc_qnr(void)
@@ -888,12 +887,25 @@ static inline void resched_suitable_idle
{
}
-static inline int
-cache_distance(struct rq *task_rq, struct rq *rq, struct task_struct *p)
+void cpu_scaling(int __unused)
+{
+}
+
+void cpu_nonscaling(int __unused)
+{
+}
+
+/*
+ * Although CPUs can scale in UP, there is nowhere else for tasks to go so this
+ * always returns 0.
+ */
+static inline int scaling_rq(struct rq *rq)
{
return 0;
}
#endif /* CONFIG_SMP */
+EXPORT_SYMBOL_GPL(cpu_scaling);
+EXPORT_SYMBOL_GPL(cpu_nonscaling);
/*
* activate_idle_task - move idle task to the _front_ of runqueue.
@@ -989,6 +1001,82 @@ void set_task_cpu(struct task_struct *p,
smp_wmb();
task_thread_info(p)->cpu = cpu;
}
+
+static inline void clear_sticky(struct task_struct *p)
+{
+ p->sticky = 0;
+}
+
+static inline int task_sticky(struct task_struct *p)
+{
+ return p->sticky;
+}
+
+/* Reschedule the best idle CPU that is not this one. */
+static void
+resched_closest_idle(struct rq *rq, unsigned long cpu, struct task_struct *p)
+{
+ cpumask_t tmpmask;
+
+ cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map);
+ cpu_clear(cpu, tmpmask);
+ if (cpus_empty(tmpmask))
+ return;
+ resched_best_mask(cpu, rq, &tmpmask);
+}
+
+/*
+ * We set the sticky flag on a task that is descheduled involuntarily meaning
+ * it is awaiting further CPU time. If the last sticky task is still sticky
+ * but unlucky enough to not be the next task scheduled, we unstick it and try
+ * to find it an idle CPU. Realtime tasks do not stick to minimise their
+ * latency at all times.
+ */
+static inline void
+swap_sticky(struct rq *rq, unsigned long cpu, struct task_struct *p)
+{
+ if (rq->sticky_task) {
+ if (rq->sticky_task == p) {
+ p->sticky = 1;
+ return;
+ }
+ if (rq->sticky_task->sticky) {
+ rq->sticky_task->sticky = 0;
+ resched_closest_idle(rq, cpu, rq->sticky_task);
+ }
+ }
+ if (!rt_task(p)) {
+ p->sticky = 1;
+ rq->sticky_task = p;
+ } else {
+ resched_closest_idle(rq, cpu, p);
+ rq->sticky_task = NULL;
+ }
+}
+
+static inline void unstick_task(struct rq *rq, struct task_struct *p)
+{
+ rq->sticky_task = NULL;
+ clear_sticky(p);
+}
+#else
+static inline void clear_sticky(struct task_struct *p)
+{
+}
+
+static inline int task_sticky(struct task_struct *p)
+{
+ return 0;
+}
+
+static inline void
+swap_sticky(struct rq *rq, unsigned long cpu, struct task_struct *p)
+{
+}
+
+static inline void unstick_task(struct rq *rq, struct task_struct *p)
+{
+}
#endif
/*
@@ -999,6 +1087,7 @@ static inline void take_task(struct rq *
{
set_task_cpu(p, cpu_of(rq));
dequeue_task(p);
+ clear_sticky(p);
dec_qnr();
}
@@ -1353,6 +1442,13 @@ static void try_preempt(struct task_stru
int highest_prio;
cpumask_t tmp;
+ /*
+ * We clear the sticky flag here because for a task to have called
+ * try_preempt with the sticky flag enabled means some complicated
+ * re-scheduling has occurred and we should ignore the sticky flag.
+ */
+ clear_sticky(p);
+
if (suitable_idle_cpus(p)) {
resched_best_idle(p);
return;
@@ -1371,7 +1467,6 @@ static void try_preempt(struct task_stru
highest_prio = -1;
for_each_cpu_mask(cpu, tmp) {
- u64 offset_deadline;
struct rq *rq;
int rq_prio;
@@ -1380,12 +1475,9 @@ static void try_preempt(struct task_stru
if (rq_prio < highest_prio)
continue;
- offset_deadline = rq->rq_deadline -
- cache_distance(this_rq, rq, p);
-
- if (rq_prio > highest_prio || (rq_prio == highest_prio &&
- deadline_after(offset_deadline, latest_deadline))) {
- latest_deadline = offset_deadline;
+ if (rq_prio > highest_prio ||
+ deadline_after(rq->rq_deadline, latest_deadline)) {
+ latest_deadline = rq->rq_deadline;
highest_prio = rq_prio;
highest_prio_rq = rq;
}
@@ -1579,6 +1671,7 @@ void sched_fork(struct task_struct *p, i
#endif
p->oncpu = 0;
+ clear_sticky(p);
#ifdef CONFIG_PREEMPT
/* Want to start with kernel preemption disabled. */
@@ -1919,8 +2012,7 @@ unsigned long nr_active(void)
unsigned long this_cpu_load(void)
{
return this_rq()->rq_running +
- (queued_notrunning() + nr_uninterruptible()) /
- (1 + num_online_cpus());
+ ((queued_notrunning() + nr_uninterruptible()) / grq.noc);
}
/* Variables and functions for calc_load */
@@ -1973,6 +2065,81 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
EXPORT_PER_CPU_SYMBOL(kstat);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in account_system_vtime, on the corresponding CPU
+ * with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in another CPU reading this CPU's irq time and can
+ * race with irq/account_system_vtime on this CPU. We would either get the
+ * old or the new value (or a semi-updated value on 32 bit) with a side
+ * effect of accounting a slice of irq time to the wrong task when an irq
+ * is in progress while we read rq->clock. That is a worthy compromise in
+ * place of having locks on each irq in account_system_time.
+ */
+static DEFINE_PER_CPU(u64, cpu_hardirq_time);
+static DEFINE_PER_CPU(u64, cpu_softirq_time);
+
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 0;
+}
+
+static u64 irq_time_cpu(int cpu)
+{
+ if (!sched_clock_irqtime)
+ return 0;
+
+ return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+
+void account_system_vtime(struct task_struct *curr)
+{
+ unsigned long flags;
+ int cpu;
+ u64 now, delta;
+
+ if (!sched_clock_irqtime)
+ return;
+
+ local_irq_save(flags);
+
+ cpu = smp_processor_id();
+ now = sched_clock_cpu(cpu);
+ delta = now - per_cpu(irq_start_time, cpu);
+ per_cpu(irq_start_time, cpu) = now;
+ /*
+	 * We do not account for softirq time from ksoftirqd here.
+	 * We want to continue accounting softirq time to the ksoftirqd thread
+	 * in that case, so as not to confuse the scheduler with a special task
+	 * that does not consume any time, but still wants to run.
+ */
+ if (hardirq_count())
+ per_cpu(cpu_hardirq_time, cpu) += delta;
+ else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+ per_cpu(cpu_softirq_time, cpu) += delta;
+
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(account_system_vtime);
+#else
+
+static u64 irq_time_cpu(int cpu)
+{
+ return 0;
+}
+#endif
+
/*
* On each tick, see what percentage of that tick was attributed to each
* component and add the percentage to the _pc values. Once a _pc value has
@@ -2531,9 +2698,14 @@ static inline u64 static_deadline_diff(i
return prio_deadline_diff(USER_PRIO(static_prio));
}
+static inline int longest_deadline_diff(void)
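+/*
+ * The longest deadline offset handed out, that of a nice +19 task. Used to
+ * bias against picking up a sticky task that last ran on a different CPU.
+ */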
+{
+ return prio_deadline_diff(39);
+}
+
static inline int ms_longest_deadline_diff(void)
{
- return NS_TO_MS(prio_deadline_diff(39));
+ return NS_TO_MS(longest_deadline_diff());
}
/*
@@ -2603,7 +2775,19 @@ retry:
goto out_take;
}
- dl = p->deadline + cache_distance(task_rq(p), rq, p);
+ /*
+ * Soft affinity happens here by not scheduling a task with
+ * its sticky flag set that ran on a different CPU last when
+ * the CPU is scaling, or by greatly biasing against its
+ * deadline when not.
+ */
+ if (task_rq(p) != rq && task_sticky(p)) {
+ if (scaling_rq(rq))
+ continue;
+ else
+ dl = p->deadline + longest_deadline_diff();
+ } else
+ dl = p->deadline;
/*
* No rt tasks. Find the earliest deadline task. Now we're in
@@ -2681,7 +2865,7 @@ static inline void set_rq_task(struct rq
{
rq->rq_time_slice = p->time_slice;
rq->rq_deadline = p->deadline;
- rq->rq_last_ran = p->last_ran;
+ rq->rq_last_ran = p->last_ran = rq->clock;
rq->rq_policy = p->policy;
rq->rq_prio = p->prio;
if (p != rq->idle)
@@ -2760,14 +2944,8 @@ need_resched_nonpreemptible:
*/
grq_unlock_irq();
goto rerun_prev_unlocked;
- } else {
- /*
- * If prev got kicked off by a task that has to
- * run on this CPU for affinity reasons then
- * there may be an idle CPU it can go to.
- */
- resched_suitable_idle(prev);
- }
+ } else
+ swap_sticky(rq, cpu, prev);
}
return_task(prev, deactivate);
}
@@ -2782,12 +2960,21 @@ need_resched_nonpreemptible:
set_cpuidle_map(cpu);
} else {
next = earliest_deadline_task(rq, idle);
- prefetch(next);
- prefetch_stack(next);
- clear_cpuidle_map(cpu);
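+		/*
+		 * earliest_deadline_task() can return the idle task when the
+		 * only queued tasks are sticky on other CPUs and this CPU is
+		 * frequency scaled; in that case mark this CPU idle instead
+		 * of taking one of them.
+		 */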
+ if (likely(next->prio != PRIO_LIMIT)) {
+ prefetch(next);
+ prefetch_stack(next);
+ clear_cpuidle_map(cpu);
+ } else
+ set_cpuidle_map(cpu);
}
if (likely(prev != next)) {
+ /*
+ * Don't stick tasks when a real time task is going to run as
+ * they may literally get stuck.
+ */
+ if (rt_task(next))
+ unstick_task(rq, prev);
sched_info_switch(prev, next);
perf_event_task_sched_out(prev, next, cpu);
@@ -4345,7 +4532,6 @@ void init_idle(struct task_struct *idle,
rcu_read_unlock();
rq->curr = rq->idle = idle;
idle->oncpu = 1;
- set_cpuidle_map(cpu);
grq_unlock_irqrestore(&flags);
/* Set the preempt count _outside_ the spinlocks! */
@@ -4592,6 +4778,7 @@ static void break_sole_affinity(int src_
task_pid_nr(p), p->comm, src_cpu);
}
}
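+		/* Make sure no task remains soft affined to the departing CPU */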
+ clear_sticky(p);
} while_each_thread(t, p);
}
@@ -4853,6 +5040,7 @@ migration_call(struct notifier_block *nf
set_rq_online(rq);
}
+ grq.noc = num_online_cpus();
grq_unlock_irqrestore(&flags);
break;
@@ -4883,6 +5071,7 @@ migration_call(struct notifier_block *nf
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
+ grq.noc = num_online_cpus();
grq_unlock_irqrestore(&flags);
break;
#endif
@@ -6406,7 +6595,7 @@ static int cache_cpu_idle(unsigned long
void __init sched_init_smp(void)
{
struct sched_domain *sd;
- int cpu, cpus;
+ int cpu;
cpumask_var_t non_isolated_cpus;
@@ -6440,14 +6629,6 @@ void __init sched_init_smp(void)
BUG();
free_cpumask_var(non_isolated_cpus);
- /*
- * Assume that every added cpu gives us slightly less overall latency
- * allowing us to increase the base rr_interval, non-linearly and with
- * an upper bound.
- */
- cpus = num_online_cpus();
- rr_interval = rr_interval * (4 * cpus + 4) / (cpus + 6);
-
grq_lock_irq();
/*
* Set up the relative cache distance of each online cpu from each
@@ -6536,6 +6717,7 @@ void __init sched_init(void)
grq.last_jiffy = jiffies;
spin_lock_init(&grq.iso_lock);
grq.iso_ticks = grq.iso_refractory = 0;
+ grq.noc = 1;
#ifdef CONFIG_SMP
init_defrootdomain();
grq.qnr = grq.idle_cpus = 0;
@@ -6549,6 +6731,7 @@ void __init sched_init(void)
rq->iowait_pc = rq->idle_pc = 0;
rq->dither = 0;
#ifdef CONFIG_SMP
+ rq->sticky_task = NULL;
rq->last_niffy = 0;
rq->sd = NULL;
rq->rd = NULL;