--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1598,7 +1598,7 @@ static inline void tsk_cpus_current(stru
static inline void print_scheduler_version(void)
{
- printk(KERN_INFO"BFS CPU scheduler v0.357 by Con Kolivas.\n");
+ printk(KERN_INFO"BFS CPU scheduler v0.363 by Con Kolivas.\n");
}
static inline int iso_task(struct task_struct *p)
--- a/kernel/sched_bfs.c
+++ b/kernel/sched_bfs.c
@@ -120,7 +120,7 @@
#define NS_TO_MS(TIME) ((TIME) >> 20)
#define NS_TO_US(TIME) ((TIME) >> 10)
-#define RESCHED_US (100) /* Reschedule if less than this many us left */
+#define RESCHED_US (100) /* Reschedule if less than this many μs left */
/*
* This is the time all tasks within the same priority round robin.
@@ -187,7 +187,6 @@ struct rq {
#ifdef CONFIG_NO_HZ
unsigned char in_nohz_recently;
#endif
- struct task_struct *last_task;
#endif
struct task_struct *curr, *idle;
@@ -743,19 +742,12 @@ static int suitable_idle_cpus(struct tas
static void resched_task(struct task_struct *p);
-/*
- * last_task stores the last non-idle task scheduled on the local rq for
- * cache warmth testing.
- */
-static inline void set_last_task(struct rq *rq, struct task_struct *p)
-{
- rq->last_task = p;
-}
-
-#define CPUIDLE_CACHE_BUSY (1)
-#define CPUIDLE_DIFF_CPU (2)
-#define CPUIDLE_THREAD_BUSY (4)
-#define CPUIDLE_DIFF_NODE (8)
+#define CPUIDLE_DIFF_THREAD (1)
+#define CPUIDLE_DIFF_CORE (2)
+#define CPUIDLE_CACHE_BUSY (4)
+#define CPUIDLE_DIFF_CPU (8)
+#define CPUIDLE_THREAD_BUSY (16)
+#define CPUIDLE_DIFF_NODE (32)
/*
* The best idle CPU is chosen according to the CPUIDLE ranking above where the
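Since the flag weights above do all the work, a brief illustration may help: each less desirable property sets a higher bit than any milder one, so comparing the accumulated flags as plain integers picks the most cache-friendly idle CPU, and a ranking of 0 can end the search early. In the made-up candidates below, a busy-cache SMT sibling (ranking 5) still beats a fully idle CPU in another package (ranking 8), which is exactly the preference the bit ordering encodes. A minimal userspace sketch, not kernel code:

/*
 * Illustrative sketch only: flag values are copied from this patch,
 * the candidate CPUs and their properties are invented.
 */
#include <limits.h>
#include <stdio.h>

#define CPUIDLE_DIFF_THREAD	(1)
#define CPUIDLE_DIFF_CORE	(2)
#define CPUIDLE_CACHE_BUSY	(4)
#define CPUIDLE_DIFF_CPU	(8)
#define CPUIDLE_THREAD_BUSY	(16)
#define CPUIDLE_DIFF_NODE	(32)

int main(void)
{
	/* Hypothetical idle candidates and the flags they would accumulate. */
	const struct { int cpu; int ranking; } cand[] = {
		{ 1, CPUIDLE_DIFF_THREAD | CPUIDLE_CACHE_BUSY }, /* sibling, cache busy */
		{ 4, CPUIDLE_DIFF_CPU },                         /* other package, idle */
		{ 8, CPUIDLE_DIFF_NODE },                        /* remote NUMA node    */
	};
	int best_cpu = -1, best_ranking = INT_MAX;
	unsigned int i;

	for (i = 0; i < sizeof(cand) / sizeof(cand[0]); i++) {
		if (cand[i].ranking < best_ranking) {
			best_cpu = cand[i].cpu;
			best_ranking = cand[i].ranking;
			if (best_ranking == 0)
				break;	/* nothing beats a fully suitable idle CPU */
		}
	}
	printf("best idle candidate: cpu %d (ranking %d)\n", best_cpu, best_ranking);
	return 0;
}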
@@ -808,27 +800,28 @@ static void resched_best_idle(struct tas
}
tmp_rq = cpu_rq(cpu_tmp);
- if (rq->cpu_locality[cpu_tmp]) {
- /* Check rq->last_task hasn't been dereferenced */
- if (rq->last_task && p != rq->last_task) {
#ifdef CONFIG_NUMA
- if (rq->cpu_locality[cpu_tmp] > 1)
- ranking |= CPUIDLE_DIFF_NODE;
+ if (rq->cpu_locality[cpu_tmp] > 3)
+ ranking |= CPUIDLE_DIFF_NODE;
+ else
#endif
- ranking |= CPUIDLE_DIFF_CPU;
- }
- }
+ if (rq->cpu_locality[cpu_tmp] > 2)
+ ranking |= CPUIDLE_DIFF_CPU;
#ifdef CONFIG_SCHED_MC
+ if (rq->cpu_locality[cpu_tmp] == 2)
+ ranking |= CPUIDLE_DIFF_CORE;
if (!(tmp_rq->cache_idle(cpu_tmp)))
ranking |= CPUIDLE_CACHE_BUSY;
#endif
#ifdef CONFIG_SCHED_SMT
+ if (rq->cpu_locality[cpu_tmp] == 1)
+ ranking |= CPUIDLE_DIFF_THREAD;
if (!(tmp_rq->siblings_idle(cpu_tmp)))
ranking |= CPUIDLE_THREAD_BUSY;
#endif
if (ranking < best_ranking) {
best_cpu = cpu_tmp;
- if (ranking <= 1)
+ if (ranking == 0)
break;
best_ranking = ranking;
}
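For reference, this is how the rewritten loop classifies the new 0..4 locality scale into base flags on a CONFIG_NUMA build; without NUMA any locality above 2 simply ranks as CPUIDLE_DIFF_CPU. The busy-cache and busy-sibling flags depend on runtime idleness and are left out. A small standalone sketch, not kernel code:

/*
 * Illustrative sketch only: flag values are copied from this patch,
 * the helper name is invented.
 */
#include <stdio.h>

#define CPUIDLE_DIFF_THREAD	(1)
#define CPUIDLE_DIFF_CORE	(2)
#define CPUIDLE_DIFF_CPU	(8)
#define CPUIDLE_DIFF_NODE	(32)

static int locality_flags(int locality)
{
	if (locality > 3)		/* different NUMA node */
		return CPUIDLE_DIFF_NODE;
	if (locality > 2)		/* other package on the same node */
		return CPUIDLE_DIFF_CPU;
	if (locality == 2)		/* other core behind the same cache */
		return CPUIDLE_DIFF_CORE;
	if (locality == 1)		/* SMT sibling of this core */
		return CPUIDLE_DIFF_THREAD;
	return 0;			/* locality 0: the task's own CPU */
}

int main(void)
{
	int l;

	for (l = 0; l <= 4; l++)
		printf("locality %d -> base ranking %d\n", l, locality_flags(l));
	return 0;
}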
@@ -845,11 +838,11 @@ static inline void resched_suitable_idle
/*
* The cpu cache locality difference between CPUs is used to determine how far
- * to offset the virtual deadline. "One" difference in locality means that one
+ * to offset the virtual deadline. A difference in locality of less than 2 means that one
* timeslice difference is allowed longer for the cpu local tasks. This is
* enough in the common case when tasks are up to 2* number of CPUs to keep
* tasks within their shared cache CPUs only. CPUs on different nodes or not
- * even in this domain (NUMA) have "3" difference, allowing 4 times longer
+ * even in this domain (NUMA) have "4" difference, allowing 4 times longer
* deadlines before being taken onto another cpu, allowing for 2* the double
* seen by separate CPUs above.
* Simple summary: Virtual deadlines are equal on shared cache CPUs, double
@@ -858,12 +851,11 @@ static inline void resched_suitable_idle
static inline int
cache_distance(struct rq *task_rq, struct rq *rq, struct task_struct *p)
{
- /* Check rq->last_task hasn't been dereferenced */
- if (likely(rq->last_task)) {
- if (rq->last_task == p)
- return 0;
- }
- return rq->cpu_locality[cpu_of(task_rq)] * task_timeslice(p);
+ int locality = rq->cpu_locality[cpu_of(task_rq)] - 2;
+
+ if (locality > 0)
+ return task_timeslice(p) << locality;
+ return 0;
}
#else /* CONFIG_SMP */
static inline void inc_qnr(void)
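Worked through, the new cache_distance() grants no offset for localities 0-2 (same CPU, SMT sibling, or shared-cache core), twice the timeslice for locality 3 and four times for locality 4, matching the comment in the previous hunk. A standalone sketch of the arithmetic, assuming a hypothetical 6 ms timeslice (the real value comes from task_timeslice() and rr_interval):

/* Same arithmetic as cache_distance() above, outside the kernel. */
#include <stdio.h>

static int cache_distance_us(int locality, int timeslice_us)
{
	locality -= 2;			/* localities 0..2 share cache: no offset */
	if (locality > 0)
		return timeslice_us << locality;
	return 0;
}

int main(void)
{
	int timeslice_us = 6000;	/* hypothetical 6 ms timeslice */
	int l;

	for (l = 0; l <= 4; l++)
		printf("locality %d -> deadline offset %d us\n",
		       l, cache_distance_us(l, timeslice_us));
	return 0;
}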
@@ -901,10 +893,6 @@ cache_distance(struct rq *task_rq, struc
{
return 0;
}
-
-static inline void set_last_task(struct rq *rq, struct task_struct *p)
-{
-}
#endif /* CONFIG_SMP */
/*
@@ -1348,7 +1336,7 @@ static inline int online_cpus(struct tas
*/
static inline int needs_other_cpu(struct task_struct *p, int cpu)
{
- if (unlikely(!cpu_isset(cpu, p->cpus_allowed) && online_cpus(p)))
+ if (unlikely(!cpu_isset(cpu, p->cpus_allowed)))
return 1;
return 0;
}
@@ -1365,19 +1353,19 @@ static void try_preempt(struct task_stru
int highest_prio;
cpumask_t tmp;
- /* IDLEPRIO tasks never preempt anything */
- if (p->policy == SCHED_IDLEPRIO)
- return;
-
if (suitable_idle_cpus(p)) {
resched_best_idle(p);
return;
}
- if (online_cpus(p))
+ /* IDLEPRIO tasks never preempt anything */
+ if (p->policy == SCHED_IDLEPRIO)
+ return;
+
+ if (likely(online_cpus(p)))
cpus_and(tmp, cpu_online_map, p->cpus_allowed);
else
- (cpumask_copy(&tmp, &cpu_online_map));
+ return;
latest_deadline = 0;
highest_prio = -1;
@@ -1836,14 +1824,14 @@ context_switch(struct rq *rq, struct tas
*/
arch_start_context_switch(prev);
- if (unlikely(!mm)) {
+ if (!mm) {
next->active_mm = oldmm;
atomic_inc(&oldmm->mm_count);
enter_lazy_tlb(oldmm, next);
} else
switch_mm(oldmm, mm, next);
- if (unlikely(!prev->mm)) {
+ if (!prev->mm) {
prev->active_mm = NULL;
rq->prev_mm = oldmm;
}
@@ -2029,9 +2017,13 @@ pc_system_time(struct rq *rq, struct tas
}
p->sched_time += ns;
- if (hardirq_count() - hardirq_offset)
+ if (hardirq_count() - hardirq_offset) {
rq->irq_pc += pc;
- else if (softirq_count()) {
+ if (rq->irq_pc >= 100) {
+ rq->irq_pc %= 100;
+ cpustat->irq = cputime64_add(cpustat->irq, tmp);
+ }
+ } else if (softirq_count()) {
rq->softirq_pc += pc;
if (rq->softirq_pc >= 100) {
rq->softirq_pc %= 100;
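The new irq branch mirrors the existing softirq accounting: each call adds a percentage of a tick to rq->irq_pc, and only once a whole tick's worth has accumulated is one unit of cputime charged, with the remainder carried forward. A small userspace sketch of that carry-over scheme (the per-tick percentages are invented):

/*
 * Illustrative sketch only: accumulate per-tick percentage charges and
 * fold them into whole ticks once they reach 100, as irq_pc/softirq_pc
 * do above.  irq_ticks stands in for cpustat->irq.
 */
#include <stdio.h>

int main(void)
{
	int pcs[] = { 30, 45, 50, 20, 80 };	/* hypothetical irq share per tick */
	unsigned int irq_pc = 0;
	unsigned long long irq_ticks = 0;
	int i;

	for (i = 0; i < 5; i++) {
		irq_pc += pcs[i];
		if (irq_pc >= 100) {
			irq_pc %= 100;		/* keep the remainder for later */
			irq_ticks++;		/* charge one full tick of irq time */
		}
		printf("tick %d: accumulated %u%%, charged %llu tick(s)\n",
		       i, irq_pc, irq_ticks);
	}
	return 0;
}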
@@ -2416,7 +2408,7 @@ static void task_running_tick(struct rq
* Tasks that were scheduled in the first half of a tick are not
* allowed to run into the 2nd half of the next tick if they will
* run out of time slice in the interim. Otherwise, if they have
- * less than 100us of time slice left they will be rescheduled.
+ * less than RESCHED_US μs of time slice left they will be rescheduled.
*/
if (rq->dither) {
if (rq->rq_time_slice > HALF_JIFFY_US)
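The code following this comment is cut off by the hunk, so the sketch below is only a reading of the comment, not the kernel function: with dithering (a tick that may land half a jiffy late) anything at or under half a jiffy of slice left is treated as expired, otherwise the RESCHED_US threshold applies. HALF_JIFFY_US is assumed to be 5000 for HZ=100 and the helper name is invented:

#include <stdio.h>

#define RESCHED_US	(100)	/* from the hunk near the top of this patch */
#define HALF_JIFFY_US	(5000)	/* assumed: half of a 10 ms jiffy (HZ=100) */

/* Hypothetical helper, only to spell out the comment's two cases. */
static int should_resched(int time_slice_us, int dither)
{
	if (dither)		/* tick may arrive half a jiffy late */
		return time_slice_us <= HALF_JIFFY_US;
	return time_slice_us < RESCHED_US;
}

int main(void)
{
	printf("dither, 6000us left:   %d\n", should_resched(6000, 1));
	printf("dither, 3000us left:   %d\n", should_resched(3000, 1));
	printf("no dither, 900us left: %d\n", should_resched(900, 0));
	printf("no dither, 80us left:  %d\n", should_resched(80, 0));
	return 0;
}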
@@ -2799,8 +2791,6 @@ need_resched_nonpreemptible:
sched_info_switch(prev, next);
perf_event_task_sched_out(prev, next, cpu);
- if (prev != idle)
- set_last_task(rq, prev);
set_rq_task(rq, next);
grq.nr_switches++;
prev->oncpu = 0;
@@ -3627,8 +3617,8 @@ recheck:
* SCHED_BATCH is 0.
*/
if (param->sched_priority < 0 ||
- (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
- (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
+ (p->mm && param->sched_priority > MAX_USER_RT_PRIO - 1) ||
+ (!p->mm && param->sched_priority > MAX_RT_PRIO - 1))
return -EINVAL;
if (is_rt_policy(policy) != (param->sched_priority != 0))
return -EINVAL;
@@ -4349,7 +4339,10 @@ void init_idle(struct task_struct *idle,
idle->prio = PRIO_LIMIT;
set_rq_task(rq, idle);
idle->cpus_allowed = cpumask_of_cpu(cpu);
+ /* Silence PROVE_RCU */
+ rcu_read_lock();
set_task_cpu(idle, cpu);
+ rcu_read_unlock();
rq->curr = rq->idle = idle;
idle->oncpu = 1;
set_cpuidle_map(cpu);
@@ -4579,6 +4572,29 @@ void move_task_off_dead_cpu(int dead_cpu
}
+/* Run through the task list, find tasks affined only to the dead cpu, and
+ * allocate them a new affinity. */
+static void break_sole_affinity(int src_cpu)
+{
+ struct task_struct *p, *t;
+
+ do_each_thread(t, p) {
+ if (!online_cpus(p)) {
+ cpumask_copy(&p->cpus_allowed, cpu_possible_mask);
+ /*
+ * Don't tell them about moving exiting tasks or
+ * kernel threads (both mm NULL), since they never
+ * leave kernel.
+ */
+ if (p->mm && printk_ratelimit()) {
+ printk(KERN_INFO "process %d (%s) no "
+ "longer affine to cpu %d\n",
+ task_pid_nr(p), p->comm, src_cpu);
+ }
+ }
+ } while_each_thread(t, p);
+}
+
/*
* Schedules idle task to be the next runnable task on current CPU.
* It does so by boosting its priority to highest possible.
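break_sole_affinity() and the needs_other_cpu() change earlier both lean on online_cpus(), which this patch does not show. From its call sites it presumably reduces to an intersection test between the task's allowed mask and the online map, roughly the following (an assumption, not code taken from the file):

/*
 * Assumed shape of the online_cpus() helper used above and in
 * needs_other_cpu(); treat this as a reading of its call sites rather
 * than the file's actual code.
 */
static inline int online_cpus(struct task_struct *p)
{
	return cpus_intersects(cpu_online_map, p->cpus_allowed);
}

Once break_sole_affinity() has widened the mask of any task left with only the dead CPU, every task intersects the online map again, which is presumably why needs_other_cpu() could drop its online_cpus() guard.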
@@ -4599,6 +4615,7 @@ void sched_idle_next(void)
* and interrupts disabled on the current cpu.
*/
grq_lock_irqsave(&flags);
+ break_sole_affinity(this_cpu);
__setscheduler(idle, rq, SCHED_FIFO, MAX_RT_PRIO - 1);
@@ -6459,10 +6476,12 @@ void __init sched_init_smp(void)
cpumask_set_cpu(other_cpu, &rq->cache_siblings);
}
#endif
- if (sd->level <= SD_LV_MC)
- locality = 0;
- else if (sd->level <= SD_LV_NODE)
+ if (sd->level <= SD_LV_SIBLING)
locality = 1;
+ else if (sd->level <= SD_LV_MC)
+ locality = 2;
+ else if (sd->level <= SD_LV_NODE)
+ locality = 3;
else
continue;
@@ -6568,7 +6587,7 @@ void __init sched_init(void)
if (i == j)
rq->cpu_locality[j] = 0;
else
- rq->cpu_locality[j] = 3;
+ rq->cpu_locality[j] = 4;
}
}
#endif
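Putting the last two hunks together with the earlier ones, the new locality scale and its effect on cache_distance() can be summarised in one place. A standalone sketch of the mapping (the relation strings are descriptive only; the locality values and offsets come from this patch):

/*
 * Illustrative sketch only: the locality each relationship ends up with
 * after the two hunks above, and the extra deadline slack (in timeslices)
 * that cache_distance() grants it.
 */
#include <stdio.h>

int main(void)
{
	static const struct {
		const char *relation;
		int locality;		/* rq->cpu_locality[] value */
		int offset_timeslices;	/* added by cache_distance() */
	} map[] = {
		{ "same cpu",                              0, 0 },
		{ "SMT sibling (SD_LV_SIBLING)",           1, 0 },
		{ "shared cache, other core (SD_LV_MC)",   2, 0 },
		{ "same node, other package (SD_LV_NODE)", 3, 2 },
		{ "other node / default",                  4, 4 },
	};
	unsigned int i;

	for (i = 0; i < sizeof(map) / sizeof(map[0]); i++)
		printf("%-42s locality %d, +%d timeslice(s)\n",
		       map[i].relation, map[i].locality, map[i].offset_timeslices);
	return 0;
}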