--- 7da55168dc4ffea7b1ea4a135214c4a5a430019a
+++ 4af4d9fb5ce0d09175355fadc4ab1dc8268080ed
@@ -37,6 +37,7 @@
 #include
 #include
 #include
+#include	/* NOTE(review): header name lost in extraction - presumably <linux/timer.h>; confirm */
 #include
 #include
 #include
@@ -148,7 +149,7 @@ struct scan_control {
 /*
  * From 0 .. 100. Higher means more swappy.
  */
-int vm_swappiness = 60;
+int vm_swappiness;	/* no initializer: starts at 0 (static storage) */
 long vm_total_pages;	/* The total number of pages which the VM controls */
 
 static LIST_HEAD(shrinker_list);
@@ -965,7 +966,7 @@ cull_mlocked:
 
 activate_locked:
 		/* Not a candidate for swapping, so reclaim swap space. */
-		if (PageSwapCache(page) && vm_swap_full())
+		if (PageSwapCache(page))	/* no longer gated on vm_swap_full() */
 			try_to_free_swap(page);
 		VM_BUG_ON(PageActive(page));
 		SetPageActive(page);
@@ -2044,6 +2045,35 @@ restart:
 }
 
 /*
+ * Helper functions to adjust nice level of kswapd, based on the priority of
+ * the calling task (current). If kswapd's nice value is already at or below
+ * the caller's it is left alone, unless @active is zero, in which case the
+ * caller's value is always applied. Callers without an mm count as nice 0.
+ *
+ * We don't ever run kswapd real time, so a real time caller maps to nice -20,
+ * the highest SCHED_NORMAL priority. A SCHED_IDLEPRIO caller maps to nice 19.
+ */
+static inline int effective_sc_prio(struct task_struct *p)
+{
+	if (likely(p->mm)) {
+		if (rt_task(p))
+			return -20;
+		if (p->policy == SCHED_IDLEPRIO)
+			return 19;
+		return task_nice(p);
+	}
+	return 0;
+}
+
+static void set_kswapd_nice(struct task_struct *kswapd, int active)
+{
+	long nice = effective_sc_prio(current);
+
+	if (task_nice(kswapd) > nice || !active)
+		set_user_nice(kswapd, nice);
+}
+
+/*
  * This is the direct reclaim path, for page-allocating processes. We only
  * try to reclaim pages from zones which will satisfy the caller's allocation
  * request.
@@ -2792,6 +2822,8 @@ static void kswapd_try_to_sleep(pg_data_
 	finish_wait(&pgdat->kswapd_wait, &wait);
 }
 
+#define WT_EXPIRY	(HZ * 5)	/* watermark_timer interval, in jiffies */
+
 /*
  * The background pageout daemon, started as a kernel thread
  * from the init process.
@@ -2847,6 +2879,9 @@ static int kswapd(void *p)
 	for ( ; ; ) {
 		int ret;
 
+		/* kswapd is running: push watermark_timer out by WT_EXPIRY */
+		mod_timer(&pgdat->watermark_timer, jiffies + WT_EXPIRY);
+
 		/*
 		 * If the last balance_pgdat was unsuccessful it's unlikely a
 		 * new request of a similar or harder type will succeed soon
@@ -2900,6 +2935,7 @@ static int kswapd(void *p)
 void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
 {
 	pg_data_t *pgdat;
+	int active;	/* result of waitqueue_active(&pgdat->kswapd_wait) */
 
 	if (!populated_zone(zone))
 		return;
@@ -2911,7 +2947,9 @@ void wakeup_kswapd(struct zone *zone, in
 		pgdat->kswapd_max_order = order;
 		pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
 	}
-	if (!waitqueue_active(&pgdat->kswapd_wait))
+	active = waitqueue_active(&pgdat->kswapd_wait);
+	set_kswapd_nice(pgdat->kswapd, active);	/* NOTE(review): dereferences pgdat->kswapd before the early return - confirm it is never NULL/ERR_PTR here */
+	if (!active)
 		return;
 	if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
 		return;
@@ -3024,20 +3062,57 @@ static int __devinit cpu_callback(struct
 }
 
 /*
+ * Every WT_EXPIRY: wake kswapd if a non-highmem zone fails lots_wmark_pages
+ */
+static void watermark_wakeup(unsigned long data)
+{
+	pg_data_t *pgdat = (pg_data_t *)data;
+	struct timer_list *wt = &pgdat->watermark_timer;
+	int i;
+
+	if (!waitqueue_active(&pgdat->kswapd_wait) || above_background_load())
+		goto out;	/* nobody on kswapd_wait, or above_background_load(): just re-arm */
+	for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+		struct zone *z = pgdat->node_zones + i;
+
+		if (!populated_zone(z) || is_highmem(z)) {
+			/* We are better off leaving highmem full */
+			continue;
+		}
+		if (!zone_watermark_ok(z, 0, lots_wmark_pages(z), 0, 0)) {
+			wake_up_interruptible(&pgdat->kswapd_wait);
+			goto out;	/* one wakeup is enough; stop scanning zones */
+		}
+	}
+out:
+	mod_timer(wt, jiffies + WT_EXPIRY);	/* the timer always re-arms itself */
+	return;
+}
+
+/*
  * This kswapd start function will be called by init and node-hot-add.
  * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
  */
 int kswapd_run(int nid)
 {
 	pg_data_t *pgdat = NODE_DATA(nid);
+	struct timer_list *wt;
 	int ret = 0;
 
 	if (pgdat->kswapd)
 		return 0;
 
+	wt = &pgdat->watermark_timer;
+	init_timer(wt);
+	wt->data = (unsigned long)pgdat;	/* handed back to watermark_wakeup() as its argument */
+	wt->function = watermark_wakeup;
+	wt->expires = jiffies + WT_EXPIRY;
+	add_timer(wt);	/* armed before the kswapd thread is created */
+
 	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
 	if (IS_ERR(pgdat->kswapd)) {
 		/* failure at boot is fatal */
+		del_timer(wt);	/* NOTE(review): does not wait for a running, self re-arming handler - consider del_timer_sync(); confirm */
 		BUG_ON(system_state == SYSTEM_BOOTING);
 		printk("Failed to start kswapd on node %d\n",nid);
 		ret = -1;