--- 168e84a745778904e5a69cafe0c13396ba938602 +++ 0f0e14f8cdb956035e72fd5ccb35f1278844858a @@ -85,7 +85,7 @@ #define idleprio_task(p) unlikely((p)->policy == SCHED_IDLEPRIO) #define iso_task(p) unlikely((p)->policy == SCHED_ISO) #define iso_queue(rq) unlikely((rq)->rq_policy == SCHED_ISO) -#define ISO_PERIOD ((5 * HZ * grq.noc) + 1) +#define ISO_PERIOD ((5 * HZ * num_online_cpus()) + 1) /* * Convert user-nice values [ -20 ... 0 ... 19 ] @@ -145,7 +145,7 @@ static int prio_ratios[PRIO_RANGE] __rea * The quota handed out to tasks of all priority levels when refilling their * time_slice. */ -static inline int timeslice(void) +static inline unsigned long timeslice(void) { return MS_TO_US(rr_interval); } @@ -167,7 +167,6 @@ struct global_rq { cpumask_t cpu_idle_map; int idle_cpus; #endif - int noc; /* num_online_cpus stored and updated when it changes */ u64 niffies; /* Nanosecond jiffies */ unsigned long last_jiffy; /* Last jiffy we updated niffies */ @@ -210,8 +209,6 @@ struct rq { #ifdef CONFIG_SMP int cpu; /* cpu of this runqueue */ int online; - int scaling; /* This CPU is managed by a scaling CPU freq governor */ - struct task_struct *sticky_task; struct root_domain *rd; struct sched_domain *sd; @@ -228,11 +225,7 @@ struct rq { #endif u64 last_niffy; /* Last time this RQ updated grq.niffies */ #endif -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - u64 prev_irq_time; -#endif u64 clock, old_clock, last_tick; - u64 clock_task; int dither; #ifdef CONFIG_SCHEDSTATS @@ -404,17 +397,9 @@ static inline void update_clocks(struct * when we're not updating niffies. * Looking up task_rq must be done under grq.lock to be safe. */ -static u64 irq_time_cpu(int cpu); - static inline void update_rq_clock(struct rq *rq) { - int cpu = cpu_of(rq); - u64 irq_time; - - rq->clock = sched_clock_cpu(cpu); - irq_time = irq_time_cpu(cpu); - if (rq->clock - irq_time > rq->clock_task) - rq->clock_task = rq->clock - irq_time; + rq->clock = sched_clock_cpu(cpu_of(rq)); } static inline int task_running(struct task_struct *p) @@ -766,8 +751,10 @@ static void resched_task(struct task_str /* * The best idle CPU is chosen according to the CPUIDLE ranking above where the - * lowest value would give the most suitable CPU to schedule p onto next. The - * order works out to be the following: + * lowest value would give the most suitable CPU to schedule p onto next. We + * iterate from the last CPU upwards instead of using for_each_cpu_mask so as + * to be able to break out immediately if the last CPU is idle. The order works + * out to be the following: * * Same core, idle or busy cache, idle threads * Other core, same cache, idle or busy cache, idle threads. @@ -779,19 +766,38 @@ static void resched_task(struct task_str * Other node, other CPU, idle cache, idle threads. * Other node, other CPU, busy cache, idle threads. * Other node, other CPU, busy threads. + * + * If p was the last task running on this rq, then regardless of where + * it has been running since then, it is cache warm on this rq. */ -static void -resched_best_mask(unsigned long best_cpu, struct rq *rq, cpumask_t *tmpmask) +static void resched_best_idle(struct task_struct *p) { - unsigned long cpu_tmp, best_ranking; + unsigned long cpu_tmp, best_cpu, best_ranking; + cpumask_t tmpmask; + struct rq *rq; + int iterate; + cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map); + iterate = cpus_weight(tmpmask); + best_cpu = task_cpu(p); + /* + * Start below the last CPU and work up with next_cpu as the last + * CPU might not be idle or affinity might not allow it. 
+ */ + cpu_tmp = best_cpu - 1; + rq = cpu_rq(best_cpu); best_ranking = ~0UL; - for_each_cpu_mask(cpu_tmp, *tmpmask) { + do { unsigned long ranking; struct rq *tmp_rq; ranking = 0; + cpu_tmp = next_cpu(cpu_tmp, tmpmask); + if (cpu_tmp >= nr_cpu_ids) { + cpu_tmp = -1; + cpu_tmp = next_cpu(cpu_tmp, tmpmask); + } tmp_rq = cpu_rq(cpu_tmp); #ifdef CONFIG_NUMA @@ -819,42 +825,37 @@ resched_best_mask(unsigned long best_cpu break; best_ranking = ranking; } - } + } while (--iterate > 0); resched_task(cpu_rq(best_cpu)->curr); } -static void resched_best_idle(struct task_struct *p) -{ - cpumask_t tmpmask; - - cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map); - resched_best_mask(task_cpu(p), task_rq(p), &tmpmask); -} - static inline void resched_suitable_idle(struct task_struct *p) { if (suitable_idle_cpus(p)) resched_best_idle(p); } + /* - * Flags to tell us whether this CPU is running a CPU frequency governor that - * has slowed its speed or not. No locking required as the very rare wrongly - * read value would be harmless. + * The cpu cache locality difference between CPUs is used to determine how far + * to offset the virtual deadline. <2 difference in locality means that one + * timeslice difference is allowed longer for the cpu local tasks. This is + * enough in the common case when tasks are up to 2* number of CPUs to keep + * tasks within their shared cache CPUs only. CPUs on different nodes or not + * even in this domain (NUMA) have "4" difference, allowing 4 times longer + * deadlines before being taken onto another cpu, allowing for 2* the double + * seen by separate CPUs above. + * Simple summary: Virtual deadlines are equal on shared cache CPUs, double + * on separate CPUs and quadruple in separate NUMA nodes. */ -void cpu_scaling(int cpu) -{ - cpu_rq(cpu)->scaling = 1; -} - -void cpu_nonscaling(int cpu) +static inline int +cache_distance(struct rq *task_rq, struct rq *rq, struct task_struct *p) { - cpu_rq(cpu)->scaling = 0; -} + int locality = rq->cpu_locality[cpu_of(task_rq)] - 2; -static inline int scaling_rq(struct rq *rq) -{ - return rq->scaling; + if (locality > 0) + return task_timeslice(p) << locality; + return 0; } #else /* CONFIG_SMP */ static inline void inc_qnr(void) @@ -887,25 +888,12 @@ static inline void resched_suitable_idle { } -void cpu_scaling(int __unused) -{ -} - -void cpu_nonscaling(int __unused) -{ -} - -/* - * Although CPUs can scale in UP, there is nowhere else for tasks to go so this - * always returns 0. - */ -static inline int scaling_rq(struct rq *rq) +static inline int +cache_distance(struct rq *task_rq, struct rq *rq, struct task_struct *p) { return 0; } #endif /* CONFIG_SMP */ -EXPORT_SYMBOL_GPL(cpu_scaling); -EXPORT_SYMBOL_GPL(cpu_nonscaling); /* * activate_idle_task - move idle task to the _front_ of runqueue. @@ -1001,82 +989,6 @@ void set_task_cpu(struct task_struct *p, smp_wmb(); task_thread_info(p)->cpu = cpu; } - -static inline void clear_sticky(struct task_struct *p) -{ - p->sticky = 0; -} - -static inline int task_sticky(struct task_struct *p) -{ - return p->sticky; -} - -/* Reschedule the best idle CPU that is not this one. */ -static void -resched_closest_idle(struct rq *rq, unsigned long cpu, struct task_struct *p) -{ - cpumask_t tmpmask; - - cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map); - cpu_clear(cpu, tmpmask); - if (cpus_empty(tmpmask)) - return; - resched_best_mask(cpu, rq, &tmpmask); -} - -/* - * We set the sticky flag on a task that is descheduled involuntarily meaning - * it is awaiting further CPU time. 
If the last sticky task is still sticky - * but unlucky enough to not be the next task scheduled, we unstick it and try - * to find it an idle CPU. Realtime tasks do not stick to minimise their - * latency at all times. - */ -static inline void -swap_sticky(struct rq *rq, unsigned long cpu, struct task_struct *p) -{ - if (rq->sticky_task) { - if (rq->sticky_task == p) { - p->sticky = 1; - return; - } - if (rq->sticky_task->sticky) { - rq->sticky_task->sticky = 0; - resched_closest_idle(rq, cpu, rq->sticky_task); - } - } - if (!rt_task(p)) { - p->sticky = 1; - rq->sticky_task = p; - } else { - resched_closest_idle(rq, cpu, p); - rq->sticky_task = NULL; - } -} - -static inline void unstick_task(struct rq *rq, struct task_struct *p) -{ - rq->sticky_task = NULL; - clear_sticky(p); -} -#else -static inline void clear_sticky(struct task_struct *p) -{ -} - -static inline int task_sticky(struct task_struct *p) -{ - return 0; -} - -static inline void -swap_sticky(struct rq *rq, unsigned long cpu, struct task_struct *p) -{ -} - -static inline void unstick_task(struct rq *rq, struct task_struct *p) -{ -} #endif /* @@ -1087,7 +999,6 @@ static inline void take_task(struct rq * { set_task_cpu(p, cpu_of(rq)); dequeue_task(p); - clear_sticky(p); dec_qnr(); } @@ -1442,13 +1353,6 @@ static void try_preempt(struct task_stru int highest_prio; cpumask_t tmp; - /* - * We clear the sticky flag here because for a task to have called - * try_preempt with the sticky flag enabled means some complicated - * re-scheduling has occurred and we should ignore the sticky flag. - */ - clear_sticky(p); - if (suitable_idle_cpus(p)) { resched_best_idle(p); return; @@ -1467,6 +1371,7 @@ static void try_preempt(struct task_stru highest_prio = -1; for_each_cpu_mask(cpu, tmp) { + u64 offset_deadline; struct rq *rq; int rq_prio; @@ -1475,9 +1380,12 @@ static void try_preempt(struct task_stru if (rq_prio < highest_prio) continue; - if (rq_prio > highest_prio || - deadline_after(rq->rq_deadline, latest_deadline)) { - latest_deadline = rq->rq_deadline; + offset_deadline = rq->rq_deadline - + cache_distance(this_rq, rq, p); + + if (rq_prio > highest_prio || (rq_prio == highest_prio && + deadline_after(offset_deadline, latest_deadline))) { + latest_deadline = offset_deadline; highest_prio = rq_prio; highest_prio_rq = rq; } @@ -1671,7 +1579,6 @@ void sched_fork(struct task_struct *p, i #endif p->oncpu = 0; - clear_sticky(p); #ifdef CONFIG_PREEMPT /* Want to start with kernel preemption disabled. */ @@ -2012,7 +1919,8 @@ unsigned long nr_active(void) unsigned long this_cpu_load(void) { return this_rq()->rq_running + - ((queued_notrunning() + nr_uninterruptible()) / grq.noc); + (queued_notrunning() + nr_uninterruptible()) / + (1 + num_online_cpus()); } /* Variables and functions for calc_load */ @@ -2065,81 +1973,6 @@ DEFINE_PER_CPU(struct kernel_stat, kstat EXPORT_PER_CPU_SYMBOL(kstat); -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - -/* - * There are no locks covering percpu hardirq/softirq time. - * They are only modified in account_system_vtime, on corresponding CPU - * with interrupts disabled. So, writes are safe. - * They are read and saved off onto struct rq in update_rq_clock(). - * This may result in other CPU reading this CPU's irq time and can - * race with irq/account_system_vtime on this CPU. We would either get old - * or new value (or semi updated value on 32 bit) with a side effect of - * accounting a slice of irq time to wrong task when irq is in progress - * while we read rq->clock. 
That is a worthy compromise in place of having - * locks on each irq in account_system_time. - */ -static DEFINE_PER_CPU(u64, cpu_hardirq_time); -static DEFINE_PER_CPU(u64, cpu_softirq_time); - -static DEFINE_PER_CPU(u64, irq_start_time); -static int sched_clock_irqtime; - -void enable_sched_clock_irqtime(void) -{ - sched_clock_irqtime = 1; -} - -void disable_sched_clock_irqtime(void) -{ - sched_clock_irqtime = 0; -} - -static u64 irq_time_cpu(int cpu) -{ - if (!sched_clock_irqtime) - return 0; - - return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); -} - -void account_system_vtime(struct task_struct *curr) -{ - unsigned long flags; - int cpu; - u64 now, delta; - - if (!sched_clock_irqtime) - return; - - local_irq_save(flags); - - cpu = smp_processor_id(); - now = sched_clock_cpu(cpu); - delta = now - per_cpu(irq_start_time, cpu); - per_cpu(irq_start_time, cpu) = now; - /* - * We do not account for softirq time from ksoftirqd here. - * We want to continue accounting softirq time to ksoftirqd thread - * in that case, so as not to confuse scheduler with a special task - * that do not consume any time, but still wants to run. - */ - if (hardirq_count()) - per_cpu(cpu_hardirq_time, cpu) += delta; - else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) - per_cpu(cpu_softirq_time, cpu) += delta; - - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(account_system_vtime); -#else - -static u64 irq_time_cpu(int cpu) -{ - return 0; -} -#endif - /* * On each tick, see what percentage of that tick was attributed to each * component and add the percentage to the _pc values. Once a _pc value has @@ -2698,14 +2531,9 @@ static inline u64 static_deadline_diff(i return prio_deadline_diff(USER_PRIO(static_prio)); } -static inline int longest_deadline_diff(void) -{ - return prio_deadline_diff(39); -} - static inline int ms_longest_deadline_diff(void) { - return NS_TO_MS(longest_deadline_diff()); + return NS_TO_MS(prio_deadline_diff(39)); } /* @@ -2775,19 +2603,7 @@ retry: goto out_take; } - /* - * Soft affinity happens here by not scheduling a task with - * its sticky flag set that ran on a different CPU last when - * the CPU is scaling, or by greatly biasing against its - * deadline when not. - */ - if (task_rq(p) != rq && task_sticky(p)) { - if (scaling_rq(rq)) - continue; - else - dl = p->deadline + longest_deadline_diff(); - } else - dl = p->deadline; + dl = p->deadline + cache_distance(task_rq(p), rq, p); /* * No rt tasks. Find the earliest deadline task. Now we're in @@ -2865,7 +2681,7 @@ static inline void set_rq_task(struct rq { rq->rq_time_slice = p->time_slice; rq->rq_deadline = p->deadline; - rq->rq_last_ran = p->last_ran = rq->clock; + rq->rq_last_ran = p->last_ran; rq->rq_policy = p->policy; rq->rq_prio = p->prio; if (p != rq->idle) @@ -2944,8 +2760,14 @@ need_resched_nonpreemptible: */ grq_unlock_irq(); goto rerun_prev_unlocked; - } else - swap_sticky(rq, cpu, prev); + } else { + /* + * If prev got kicked off by a task that has to + * run on this CPU for affinity reasons then + * there may be an idle CPU it can go to. 
+ */ + resched_suitable_idle(prev); + } } return_task(prev, deactivate); } @@ -2960,21 +2782,12 @@ need_resched_nonpreemptible: set_cpuidle_map(cpu); } else { next = earliest_deadline_task(rq, idle); - if (likely(next->prio != PRIO_LIMIT)) { - prefetch(next); - prefetch_stack(next); - clear_cpuidle_map(cpu); - } else - set_cpuidle_map(cpu); + prefetch(next); + prefetch_stack(next); + clear_cpuidle_map(cpu); } if (likely(prev != next)) { - /* - * Don't stick tasks when a real time task is going to run as - * they may literally get stuck. - */ - if (rt_task(next)) - unstick_task(rq, prev); sched_info_switch(prev, next); perf_event_task_sched_out(prev, next, cpu); @@ -4532,6 +4345,7 @@ void init_idle(struct task_struct *idle, rcu_read_unlock(); rq->curr = rq->idle = idle; idle->oncpu = 1; + set_cpuidle_map(cpu); grq_unlock_irqrestore(&flags); /* Set the preempt count _outside_ the spinlocks! */ @@ -4778,7 +4592,6 @@ static void break_sole_affinity(int src_ task_pid_nr(p), p->comm, src_cpu); } } - clear_sticky(p); } while_each_thread(t, p); } @@ -5040,7 +4853,6 @@ migration_call(struct notifier_block *nf set_rq_online(rq); } - grq.noc = num_online_cpus(); grq_unlock_irqrestore(&flags); break; @@ -5071,7 +4883,6 @@ migration_call(struct notifier_block *nf BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - grq.noc = num_online_cpus(); grq_unlock_irqrestore(&flags); break; #endif @@ -6595,7 +6406,7 @@ static int cache_cpu_idle(unsigned long void __init sched_init_smp(void) { struct sched_domain *sd; - int cpu; + int cpu, cpus; cpumask_var_t non_isolated_cpus; @@ -6629,6 +6440,14 @@ void __init sched_init_smp(void) BUG(); free_cpumask_var(non_isolated_cpus); + /* + * Assume that every added cpu gives us slightly less overall latency + * allowing us to increase the base rr_interval, non-linearly and with + * an upper bound. + */ + cpus = num_online_cpus(); + rr_interval = rr_interval * (4 * cpus + 4) / (cpus + 6); + grq_lock_irq(); /* * Set up the relative cache distance of each online cpu from each @@ -6717,7 +6536,6 @@ void __init sched_init(void) grq.last_jiffy = jiffies; spin_lock_init(&grq.iso_lock); grq.iso_ticks = grq.iso_refractory = 0; - grq.noc = 1; #ifdef CONFIG_SMP init_defrootdomain(); grq.qnr = grq.idle_cpus = 0; @@ -6731,7 +6549,6 @@ void __init sched_init(void) rq->iowait_pc = rq->idle_pc = 0; rq->dither = 0; #ifdef CONFIG_SMP - rq->sticky_task = NULL; rq->last_niffy = 0; rq->sd = NULL; rq->rd = NULL;
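
Illustration (not part of the diff): the sched_init_smp() hunk above rescales
rr_interval by (4 * cpus + 4) / (cpus + 6), so the base quantum grows
sub-linearly with the number of online CPUs and is bounded by four times its
original value. The standalone sketch below simply evaluates that expression
for a few CPU counts; the 6ms base value is the usual BFS default and is an
assumption here, as is the program name.

/*
 * rrscale.c - evaluate the rr_interval scaling applied in the
 * sched_init_smp() hunk for a range of CPU counts, assuming a base
 * rr_interval of 6ms.
 */
#include <stdio.h>

int main(void)
{
	int base_rr_interval = 6;	/* assumed BFS default, in milliseconds */
	int cpus;

	for (cpus = 1; cpus <= 64; cpus *= 2) {
		/* same integer arithmetic as the patched sched_init_smp() */
		int scaled = base_rr_interval * (4 * cpus + 4) / (cpus + 6);

		printf("%2d online cpus -> rr_interval %2d ms\n", cpus, scaled);
	}
	return 0;
}

Compiled with e.g. "cc -o rrscale rrscale.c", this prints 6, 9, 12, 15, 18, 20
and 22ms for 1 through 64 CPUs, approaching the 24ms upper bound non-linearly,
which matches the comment in the hunk about slightly lower overall latency per
added CPU.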