--- cd9a40b5d50e08cdb61e417fbec5485497f73c16
+++ 6bd5ab842f55af19e388049663e2964acddc4448
@@ -488,7 +488,6 @@ __update_curr(struct cfs_rq *cfs_rq, str
 	curr->sum_exec_runtime += delta_exec;
 	schedstat_add(cfs_rq, exec_clock, delta_exec);
 	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
-	curr->vruntime += delta_exec_weighted;
 	update_min_vruntime(cfs_rq);
 }
 
@@ -496,7 +495,7 @@ __update_curr(struct cfs_rq *cfs_rq, str
 static void update_curr(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr = cfs_rq->curr;
-	u64 now = rq_of(cfs_rq)->clock_task;
+	u64 now = rq_of(cfs_rq)->clock;
 	unsigned long delta_exec;
 
 	if (unlikely(!curr))
@@ -579,7 +578,7 @@ update_stats_curr_start(struct cfs_rq *c
 	/*
 	 * We are starting a new run period:
 	 */
-	se->exec_start = rq_of(cfs_rq)->clock_task;
+	se->exec_start = rq_of(cfs_rq)->clock;
 }
 
 /**************************************************
@@ -744,26 +743,16 @@ place_entity(struct cfs_rq *cfs_rq, stru
 	se->vruntime = vruntime;
 }
 
-#define ENQUEUE_WAKEUP	1
-#define ENQUEUE_MIGRATE 2
-
 static void
-enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 {
 	/*
-	 * Update the normalized vruntime before updating min_vruntime
-	 * through callig update_curr().
-	 */
-	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE))
-		se->vruntime += cfs_rq->min_vruntime;
-
-	/*
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
 	account_entity_enqueue(cfs_rq, se);
 
-	if (flags & ENQUEUE_WAKEUP) {
+	if (wakeup) {
 		place_entity(cfs_rq, se, 0);
 		enqueue_sleeper(cfs_rq, se);
 	}
@@ -817,14 +806,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
 		__dequeue_entity(cfs_rq, se);
 	account_entity_dequeue(cfs_rq, se);
 	update_min_vruntime(cfs_rq);
-
-	/*
-	 * Normalize the entity after updating the min_vruntime because the
-	 * update can refer to the ->curr item and we need to reflect this
-	 * movement in our normalized position.
-	 */
-	if (!sleep)
-		se->vruntime -= cfs_rq->min_vruntime;
 }
 
 /*
@@ -1031,24 +1012,17 @@ static inline void hrtick_update(struct
  * increased. Here we update the fair scheduling stats and
  * then put the task into the rbtree:
  */
-static void
-enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
+static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
-	int flags = 0;
-
-	if (wakeup)
-		flags |= ENQUEUE_WAKEUP;
-	if (p->state == TASK_WAKING)
-		flags |= ENQUEUE_MIGRATE;
 
 	for_each_sched_entity(se) {
 		if (se->on_rq)
 			break;
 		cfs_rq = cfs_rq_of(se);
-		enqueue_entity(cfs_rq, se, flags);
-		flags = ENQUEUE_WAKEUP;
+		enqueue_entity(cfs_rq, se, wakeup);
+		wakeup = 1;
 	}
 
 	hrtick_update(rq);
@@ -1124,14 +1098,6 @@ static void yield_task_fair(struct rq *r
 
 #ifdef CONFIG_SMP
 
-static void task_waking_fair(struct rq *rq, struct task_struct *p)
-{
-	struct sched_entity *se = &p->se;
-	struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
-	se->vruntime -= cfs_rq->min_vruntime;
-}
-
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /*
  * effective_load() calculates the load change as seen from the root_task_group
@@ -1222,6 +1188,7 @@ static int wake_affine(struct sched_doma
 	unsigned long this_load, load;
 	int idx, this_cpu, prev_cpu;
 	unsigned long tl_per_task;
+	unsigned int imbalance;
 	struct task_group *tg;
 	unsigned long weight;
 	int balanced;
@@ -1249,7 +1216,6 @@ static int wake_affine(struct sched_doma
 	 * effect of the currently running task from the load
 	 * of the current CPU:
 	 */
-	rcu_read_lock();
 	if (sync) {
 		tg = task_group(current);
 		weight = current->se.load.weight;
@@ -1261,6 +1227,8 @@ static int wake_affine(struct sched_doma
 	tg = task_group(p);
 	weight = p->se.load.weight;
 
+	imbalance = 100 + (sd->imbalance_pct - 100) / 2;
+
 	/*
 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
 	 * due to the sync cause above having dropped this_load to 0, we'll
@@ -1270,23 +1238,9 @@ static int wake_affine(struct sched_doma
 	 * Otherwise check if either cpus are near enough in load to allow this
 	 * task to be woken on this_cpu.
 	 */
-	if (this_load) {
-		unsigned long this_eff_load, prev_eff_load;
-
-		this_eff_load = 100;
-		this_eff_load *= power_of(prev_cpu);
-		this_eff_load *= this_load +
-			effective_load(tg, this_cpu, weight, weight);
-
-		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-		prev_eff_load *= power_of(this_cpu);
-		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
-
-		balanced = this_eff_load <= prev_eff_load;
-	} else
-		balanced = true;
-
-	rcu_read_unlock();
+	balanced = !this_load ||
+		100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
+		imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
 
 	/*
 	 * If the currently running task will sleep within
@@ -1394,56 +1348,6 @@ find_idlest_cpu(struct sched_group *grou
 }
 
 /*
- * Try and locate an idle CPU in the sched_domain.
- */
-static int select_idle_sibling(struct task_struct *p, int target)
-{
-	int cpu = smp_processor_id();
-	int prev_cpu = task_cpu(p);
-	struct sched_domain *sd;
-	int i;
-
-	/*
-	 * If the task is going to be woken-up on this cpu and if it is
-	 * already idle, then it is the right target.
-	 */
-	if (target == cpu && idle_cpu(cpu))
-		return cpu;
-
-	/*
-	 * If the task is going to be woken-up on the cpu where it previously
-	 * ran and if it is currently idle, then it the right target.
-	 */
-	if (target == prev_cpu && idle_cpu(prev_cpu))
-		return prev_cpu;
-
-	/*
-	 * Otherwise, iterate the domains and find an elegible idle cpu.
-	 */
-	for_each_domain(target, sd) {
-		if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
-			break;
-
-		for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
-			if (idle_cpu(i)) {
-				target = i;
-				break;
-			}
-		}
-
-		/*
-		 * Lets stop looking for an idle sibling when we reached
-		 * the domain that spans the current cpu and prev_cpu.
-		 */
-		if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
-		    cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
-			break;
-	}
-
-	return target;
-}
-
-/*
  * sched_balance_self: balance the current task (running on cpu) in domains
  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
  * SD_BALANCE_EXEC.
@@ -1454,8 +1358,7 @@ static int select_idle_sibling(struct ta
  *
  * preempt must be disabled.
  */
-static int
-select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
+static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
 {
 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
 	int cpu = smp_processor_id();
@@ -1472,6 +1375,7 @@ select_task_rq_fair(struct rq *rq, struc
 		new_cpu = prev_cpu;
 	}
 
+	rcu_read_lock();
 	for_each_domain(cpu, tmp) {
 		if (!(tmp->flags & SD_LOAD_BALANCE))
 			continue;
@@ -1500,14 +1404,38 @@ select_task_rq_fair(struct rq *rq, struc
 				want_sd = 0;
 		}
 
-		/*
-		 * If both cpu and prev_cpu are part of this domain,
-		 * cpu is a valid SD_WAKE_AFFINE target.
-		 */
-		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
-		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
-			affine_sd = tmp;
-			want_affine = 0;
+		if (want_affine && (tmp->flags & SD_WAKE_AFFINE)) {
+			int candidate = -1, i;
+
+			if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
+				candidate = cpu;
+
+			/*
+			 * Check for an idle shared cache.
+			 */
+			if (tmp->flags & SD_PREFER_SIBLING) {
+				if (candidate == cpu) {
+					if (!cpu_rq(prev_cpu)->cfs.nr_running)
+						candidate = prev_cpu;
+				}
+
+				if (candidate == -1 || candidate == cpu) {
+					for_each_cpu(i, sched_domain_span(tmp)) {
+						if (!cpumask_test_cpu(i, &p->cpus_allowed))
+							continue;
+						if (!cpu_rq(i)->cfs.nr_running) {
+							candidate = i;
+							break;
+						}
+					}
+				}
+			}
+
+			if (candidate >= 0) {
+				affine_sd = tmp;
+				want_affine = 0;
+				cpu = candidate;
+			}
 		}
 
 		if (!want_sd && !want_affine)
@@ -1520,28 +1448,23 @@ select_task_rq_fair(struct rq *rq, struc
 			sd = tmp;
 	}
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
 	if (sched_feat(LB_SHARES_UPDATE)) {
 		/*
 		 * Pick the largest domain to update shares over
 		 */
 		tmp = sd;
-		if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
+		if (affine_sd && (!tmp ||
+				  cpumask_weight(sched_domain_span(affine_sd)) >
+				  cpumask_weight(sched_domain_span(sd))))
 			tmp = affine_sd;
 
-		if (tmp) {
-			spin_unlock(&rq->lock);
+		if (tmp)
 			update_shares(tmp);
-			spin_lock(&rq->lock);
-		}
 	}
-#endif
-
-	if (affine_sd) {
-		if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
-			return select_idle_sibling(p, cpu);
-		else
-			return select_idle_sibling(p, prev_cpu);
+
+	if (affine_sd && wake_affine(affine_sd, p, sync)) {
+		new_cpu = cpu;
+		goto out;
 	}
 
 	while (sd) {
@@ -1572,10 +1495,10 @@ select_task_rq_fair(struct rq *rq, struc
 
 		/* Now try balancing at a lower domain level of new_cpu */
 		cpu = new_cpu;
-		weight = sd->span_weight;
+		weight = cpumask_weight(sched_domain_span(sd));
 		sd = NULL;
 		for_each_domain(cpu, tmp) {
-			if (weight <= tmp->span_weight)
+			if (weight <= cpumask_weight(sched_domain_span(tmp)))
 				break;
 			if (tmp->flags & sd_flag)
 				sd = tmp;
@@ -1583,6 +1506,8 @@ select_task_rq_fair(struct rq *rq, struc
 		/* while loop will break here if sd == NULL */
 	}
 
+out:
+	rcu_read_unlock();
 	return new_cpu;
 }
 #endif /* CONFIG_SMP */
@@ -1986,35 +1911,28 @@ static void task_tick_fair(struct rq *rq
 }
 
 /*
- * called on fork with the child task as argument from the parent's context
- *  - child not yet on the tasklist
- *  - preemption disabled
+ * Share the fairness runtime between parent and child, thus the
+ * total amount of pressure for CPU stays equal - new tasks
+ * get a chance to run but frequent forkers are not allowed to
+ * monopolize the CPU. Note: the parent runqueue is locked,
+ * the child is not running yet.
  */
-static void task_fork_fair(struct task_struct *p)
+static void task_new_fair(struct rq *rq, struct task_struct *p)
 {
-	struct cfs_rq *cfs_rq = task_cfs_rq(current);
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
 	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
 	int this_cpu = smp_processor_id();
-	struct rq *rq = this_rq();
-	unsigned long flags;
-
-	spin_lock_irqsave(&rq->lock, flags);
-
-	update_rq_clock(rq);
-
-	if (unlikely(task_cpu(p) != this_cpu)) {
-		rcu_read_lock();
-		__set_task_cpu(p, this_cpu);
-		rcu_read_unlock();
-	}
+
+	sched_info_queued(p);
 
 	update_curr(cfs_rq);
-
 	if (curr)
 		se->vruntime = curr->vruntime;
 	place_entity(cfs_rq, se, 1);
 
-	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
+	/* 'curr' will be NULL if the child belongs to a different group */
+	if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
+			curr && entity_before(curr, se)) {
 		/*
 		 * Upon rescheduling, sched_class::put_prev_task() will place
 		 * 'current' within the tree based on its new key value.
@@ -2023,9 +1941,7 @@ static void task_fork_fair(struct task_s
 		resched_task(rq->curr);
 	}
 
-	se->vruntime -= cfs_rq->min_vruntime;
-
-	spin_unlock_irqrestore(&rq->lock, flags);
+	enqueue_task_fair(rq, p, 0);
 }
 
 /*
@@ -2078,40 +1994,34 @@ static void set_curr_task_fair(struct rq
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void task_move_group_fair(struct task_struct *p, int on_rq)
+static void moved_group_fair(struct task_struct *p)
 {
-	/*
-	 * If the task was not on the rq at the time of this cgroup movement
-	 * it must have been asleep, sleeping tasks keep their ->vruntime
-	 * absolute on their old rq until wakeup (needed for the fair sleeper
-	 * bonus in place_entity()).
-	 *
-	 * If it was on the rq, we've just 'preempted' it, which does convert
-	 * ->vruntime to a relative base.
-	 *
-	 * Make sure both cases convert their relative position when migrating
-	 * to another cgroup's rq. This does somewhat interfere with the
-	 * fair sleeper stuff for the first placement, but who cares.
-	 */
-	if (!on_rq)
-		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
-	set_task_rq(p, task_cpu(p));
-	if (!on_rq)
-		p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	s64 delta;
+
+	update_curr(cfs_rq);
+	delta = (s64) (p->se.vruntime - cfs_rq->min_vruntime);
+	if(delta > 0)
+		p->se.vruntime = cfs_rq->min_vruntime;
+	place_entity(cfs_rq, &p->se, 1);
 }
 #endif
 
-unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
+unsigned int get_rr_interval_fair(struct task_struct *task)
 {
 	struct sched_entity *se = &task->se;
+	unsigned long flags;
+	struct rq *rq;
 	unsigned int rr_interval = 0;
 
 	/*
	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
 	 * idle runqueue:
 	 */
+	rq = task_rq_lock(task, &flags);
 	if (rq->cfs.load.weight)
 		rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
+	task_rq_unlock(rq, &flags);
 
 	return rr_interval;
 }
@@ -2137,13 +2047,11 @@ static const struct sched_class fair_sch
 	.move_one_task		= move_one_task_fair,
 	.rq_online		= rq_online_fair,
 	.rq_offline		= rq_offline_fair,
-
-	.task_waking		= task_waking_fair,
 #endif
 
 	.set_curr_task		= set_curr_task_fair,
 	.task_tick		= task_tick_fair,
-	.task_fork		= task_fork_fair,
+	.task_new		= task_new_fair,
 
 	.prio_changed		= prio_changed_fair,
 	.switched_to		= switched_to_fair,
@@ -2151,7 +2059,7 @@ static const struct sched_class fair_sch
 	.get_rr_interval	= get_rr_interval_fair,
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	.task_move_group	= task_move_group_fair,
+	.moved_group		= moved_group_fair,
 #endif
 };
 