From: Ziggy Date: Sat, 18 Aug 2012 17:30:46 +0000 (-0400) Subject: Implement ck1 patchset X-Git-Url: https://ziggy471.com/git/gitweb.cgi?p=ziggy471-l710-kernel.git;a=commitdiff;h=db9cdec8081f972d390ed97af9ccfe68af2dea04 Implement ck1 patchset --- --- a/Makefile +++ b/Makefile @@ -10,6 +10,10 @@ NAME = Sneaky Weasel # Comments in this file are targeted only to the developer, do not # expect to learn how to build the kernel reading this file. +CKVERSION = -ck1 +CKNAME = BFS Powered +EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION) + # Do not: # o use make's built-in rules and variables # (this increases performance and avoids hard-to-debug behaviour); --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -64,11 +64,6 @@ static struct timer_list spusched_timer; static struct timer_list spuloadavg_timer; /* - * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). - */ -#define NORMAL_PRIO 120 - -/* * Frequency of the spu scheduler tick. By default we do one SPU scheduler * tick for every 10 CPU scheduler ticks. */ --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1072,7 +1072,7 @@ endchoice choice depends on EXPERIMENTAL - prompt "Memory split" if EXPERT + prompt "Memory split" default VMSPLIT_3G depends on X86_32 ---help--- @@ -1092,17 +1092,17 @@ choice option alone! config VMSPLIT_3G - bool "3G/1G user/kernel split" + bool "Default 896MB lowmem (3G/1G user/kernel split)" config VMSPLIT_3G_OPT depends on !X86_PAE - bool "3G/1G user/kernel split (for full 1G low memory)" + bool "1GB lowmem (3G/1G user/kernel split)" config VMSPLIT_2G - bool "2G/2G user/kernel split" + bool "2GB lowmem (2G/2G user/kernel split)" config VMSPLIT_2G_OPT depends on !X86_PAE - bool "2G/2G user/kernel split (for full 2G low memory)" + bool "2GB lowmem (2G/2G user/kernel split)" config VMSPLIT_1G - bool "1G/3G user/kernel split" + bool "3GB lowmem (1G/3G user/kernel split)" endchoice config PAGE_OFFSET --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -109,7 +109,7 @@ static int show_cpuinfo(struct seq_file seq_printf(m, "\nbogomips\t: %lu.%02lu\n", c->loops_per_jiffy/(500000/HZ), - (c->loops_per_jiffy/(5000/HZ)) % 100); + (c->loops_per_jiffy * 10 /(50000/HZ)) % 100); #ifdef CONFIG_X86_64 if (c->x86_tlbsize > 0) --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -430,7 +430,7 @@ static void impress_friends(void) "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", num_online_cpus(), bogosum/(500000/HZ), - (bogosum/(5000/HZ))%100); + (bogosum * 10/(50000/HZ))%100); pr_debug("Before bogocount - setting activated=1.\n"); } --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -1611,6 +1612,12 @@ int __cpufreq_driver_target(struct cpufr target_freq, relation); if (cpu_online(policy->cpu) && cpufreq_driver->target) retval = cpufreq_driver->target(policy, target_freq, relation); + if (likely(retval != -EINVAL)) { + if (target_freq == policy->max) + cpu_nonscaling(policy->cpu); + else + cpu_scaling(policy->cpu); + } return retval; } --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -29,8 +29,8 @@ * It helps to keep variable names smaller, simpler */ -#define DEF_FREQUENCY_UP_THRESHOLD (80) -#define DEF_FREQUENCY_DOWN_THRESHOLD (20) +#define DEF_FREQUENCY_UP_THRESHOLD (63) +#define DEF_FREQUENCY_DOWN_THRESHOLD (26) /* * The polling frequency of this governor depends on the capability of --- 
a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -31,8 +31,8 @@ * It helps to keep variable names smaller, simpler */ -#define DEF_FREQUENCY_DOWN_DIFFERENTIAL (10) -#define DEF_FREQUENCY_UP_THRESHOLD (80) +#define DEF_FREQUENCY_DOWN_DIFFERENTIAL (26) +#define DEF_FREQUENCY_UP_THRESHOLD (63) #define DEF_SAMPLING_DOWN_FACTOR (1) #define MAX_SAMPLING_DOWN_FACTOR (100000) #define MICRO_FREQUENCY_DOWN_DIFFERENTIAL (3) @@ -549,10 +549,10 @@ static void dbs_check_cpu(struct cpu_dbs /* * Every sampling_rate, we check, if current idle time is less - * than 20% (default), then we try to increase frequency + * than 37% (default), then we try to increase frequency * Every sampling_rate, we look for a the lowest * frequency which can sustain the load while keeping idle time over - * 30%. If such a frequency exist, we try to decrease to this frequency. + * 63%. If such a frequency exist, we try to decrease to this frequency. * * Any frequency increase takes it to the maximum frequency. * Frequency reduction happens at minimum steps of --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -418,7 +418,7 @@ static int proc_pid_stack(struct seq_fil static int proc_pid_schedstat(struct task_struct *task, char *buffer) { return sprintf(buffer, "%llu %llu %lu\n", - (unsigned long long)task->se.sum_exec_runtime, + (unsigned long long)tsk_seruntime(task), (unsigned long long)task->sched_info.run_delay, task->sched_info.pcount); } --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -64,6 +64,8 @@ static inline int task_ioprio_class(stru static inline int task_nice_ioprio(struct task_struct *task) { + if (iso_task(task)) + return 0; return (task_nice(task) + 20) / 5; } --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -164,7 +164,7 @@ static inline u64 get_jiffies_64(void) * Have the 32 bit jiffies value wrap 5 minutes after boot * so jiffies wrap bugs show up earlier. 
*/ -#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) +#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ)) /* * Change timeval to jiffies, trying to avoid the --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -23,9 +23,12 @@ static inline int page_is_file_cache(str static inline void __add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l, - struct list_head *head) + struct list_head *head, int tail) { - list_add(&page->lru, head); + if (tail) + list_add_tail(&page->lru, head); + else + list_add(&page->lru, head); __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page)); mem_cgroup_add_lru_list(page, l); } @@ -33,7 +36,13 @@ __add_page_to_lru_list(struct zone *zone static inline void add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) { - __add_page_to_lru_list(zone, page, l, &zone->lru[l].list); + __add_page_to_lru_list(zone, page, l, &zone->lru[l].list, 0); +} + +static inline void +add_page_to_lru_list_tail(struct zone *zone, struct page *page, enum lru_list l) +{ + __add_page_to_lru_list(zone, page, l, &zone->lru[l].list, 1); } static inline void --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -162,12 +163,14 @@ enum zone_watermarks { WMARK_MIN, WMARK_LOW, WMARK_HIGH, + WMARK_LOTS, NR_WMARK }; #define min_wmark_pages(z) (z->watermark[WMARK_MIN]) #define low_wmark_pages(z) (z->watermark[WMARK_LOW]) #define high_wmark_pages(z) (z->watermark[WMARK_HIGH]) +#define lots_wmark_pages(z) (z->watermark[WMARK_LOTS]) struct per_cpu_pages { int count; /* number of pages in the list */ @@ -339,7 +342,7 @@ struct zone { ZONE_PADDING(_pad1_) /* Fields commonly accessed by the page reclaim scanner */ - spinlock_t lru_lock; + spinlock_t lru_lock; struct zone_lru { struct list_head list; } lru[NR_LRU_LISTS]; @@ -641,6 +644,7 @@ typedef struct pglist_data { wait_queue_head_t kswapd_wait; struct task_struct *kswapd; int kswapd_max_order; + struct timer_list watermark_timer; enum zone_type classzone_idx; } pg_data_t; --- a/include/linux/nfsd/stats.h +++ b/include/linux/nfsd/stats.h @@ -11,8 +11,8 @@ #include -/* thread usage wraps very million seconds (approx one fortnight) */ -#define NFSD_USAGE_WRAP (HZ*1000000) +/* thread usage wraps every one hundred thousand seconds (approx one day) */ +#define NFSD_USAGE_WRAP (HZ*100000) #ifdef __KERNEL__ --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -458,6 +458,8 @@ int add_to_page_cache_locked(struct page pgoff_t index, gfp_t gfp_mask); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); +int __add_to_page_cache_lru(struct page *page, struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask, int tail); extern void delete_from_page_cache(struct page *page); extern void __delete_from_page_cache(struct page *page); int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -39,6 +39,8 @@ #define SCHED_BATCH 3 /* SCHED_ISO: reserved but not implemented yet */ #define SCHED_IDLE 5 +#define SCHED_IDLEPRIO SCHED_IDLE + /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ #define SCHED_RESET_ON_FORK 0x40000000 @@ -268,8 +270,6 @@ extern asmlinkage void schedule_tail(str extern void init_idle(struct task_struct *idle, int cpu); extern void init_idle_bootup_task(struct task_struct *idle); 
-extern int runqueue_is_locked(int cpu); - extern cpumask_var_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) extern void select_nohz_load_balancer(int stop_tick); @@ -1226,9 +1226,12 @@ struct task_struct { #ifdef CONFIG_SMP struct task_struct *wake_entry; - int on_cpu; #endif - int on_rq; +#if defined(CONFIG_SMP) + bool on_cpu; +#endif +#endif + bool on_rq; int prio, static_prio, normal_prio; unsigned int rt_priority; @@ -1572,6 +1575,42 @@ struct task_struct { #endif }; +extern int runqueue_is_locked(int cpu); +static inline void cpu_scaling(int cpu) +{ +} + +static inline void cpu_nonscaling(int cpu) +{ +} +#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) +#define tsk_rttimeout(t) ((t)->rt.timeout) + +static inline void tsk_cpus_current(struct task_struct *p) +{ + p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed; +} + +static inline void print_scheduler_version(void) +{ + printk(KERN_INFO"CFS CPU scheduler.\n"); +} + +static inline bool iso_task(struct task_struct *p) +{ + return false; +} + +static inline void remove_cpu(int cpu) +{ +} + +/* Anyone feel like implementing this? */ +static inline int above_background_load(void) +{ + return 1; +} + /* Future-safe accessor for struct task_struct's cpus_allowed. */ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) @@ -1589,10 +1628,11 @@ struct task_struct { */ #define MAX_USER_RT_PRIO 100 -#define MAX_RT_PRIO MAX_USER_RT_PRIO +#define MAX_RT_PRIO (MAX_USER_RT_PRIO + 1) +#define DEFAULT_PRIO (MAX_RT_PRIO + 20) #define MAX_PRIO (MAX_RT_PRIO + 40) -#define DEFAULT_PRIO (MAX_RT_PRIO + 20) +#define NORMAL_PRIO DEFAULT_PRIO static inline int rt_prio(int prio) { @@ -1942,7 +1982,7 @@ extern unsigned long long task_sched_runtime(struct task_struct *task); /* sched_exec is called by processes performing an exec */ -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) extern void sched_exec(void); #else #define sched_exec() {} @@ -2571,7 +2611,7 @@ extern void signal_wake_up(struct task_s */ #ifdef CONFIG_SMP -static inline unsigned int task_cpu(const struct task_struct *p) +static inline int task_cpu(const struct task_struct *p) { return task_thread_info(p)->cpu; } @@ -2580,12 +2620,12 @@ extern void set_task_cpu(struct task_str #else -static inline unsigned int task_cpu(const struct task_struct *p) +static inline int task_cpu(const struct task_struct *p) { return 0; } -static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) +static inline void set_task_cpu(struct task_struct *p, int cpu) { } @@ -2699,5 +2739,3 @@ static inline unsigned long rlimit_max(u } #endif /* __KERNEL__ */ - -#endif --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -201,7 +201,7 @@ struct swap_list_t { int next; /* swapfile to be used next */ }; -/* Swap 50% full? Release swapcache more aggressively.. */ +/* Swap 50% full? 
*/ #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages) /* linux/mm/page_alloc.c */ @@ -215,6 +215,7 @@ extern unsigned int nr_free_pagecache_pa /* linux/mm/swap.c */ +extern void ____lru_cache_add(struct page *, enum lru_list lru, int tail); extern void __lru_cache_add(struct page *, enum lru_list lru); extern void lru_cache_add_lru(struct page *, enum lru_list lru); extern void lru_add_page_tail(struct zone* zone, @@ -238,9 +239,14 @@ static inline void lru_cache_add_anon(st __lru_cache_add(page, LRU_INACTIVE_ANON); } +static inline void lru_cache_add_file_tail(struct page *page, int tail) +{ + ____lru_cache_add(page, LRU_INACTIVE_FILE, tail); +} + static inline void lru_cache_add_file(struct page *page) { - __lru_cache_add(page, LRU_INACTIVE_FILE); + ____lru_cache_add(page, LRU_INACTIVE_FILE, 0); } /* LRU Isolation modes. */ @@ -350,9 +356,10 @@ extern void grab_swap_token(struct mm_st extern void __put_swap_token(struct mm_struct *); extern void disable_swap_token(struct mem_cgroup *memcg); +/* Only allow swap token to have effect if swap is full */ static inline int has_swap_token(struct mm_struct *mm) { - return (mm == swap_token_mm); + return (mm == swap_token_mm && vm_swap_full()); } static inline void put_swap_token(struct mm_struct *mm) --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -39,8 +39,8 @@ struct inet_hashinfo; * If time > 4sec, it is "slow" path, no recycling is required, * so that we select tick to get range about 4 seconds. */ -#if HZ <= 16 || HZ > 4096 -# error Unsupported: HZ <= 16 or HZ > 4096 +#if HZ <= 16 || HZ > 16384 +# error Unsupported: HZ <= 16 or HZ > 16384 #elif HZ <= 32 # define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) #elif HZ <= 64 @@ -55,8 +55,12 @@ struct inet_hashinfo; # define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) #elif HZ <= 2048 # define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) -#else +#elif HZ <= 4096 # define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +#elif HZ <= 8192 +# define INET_TWDR_RECYCLE_TICK (13 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +#else +# define INET_TWDR_RECYCLE_TICK (14 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) #endif /* TIME_WAIT reaping mechanism. */ --- a/init/calibrate.c +++ b/init/calibrate.c @@ -269,7 +269,7 @@ void __cpuinit calibrate_delay(void) if (!printed) pr_cont("%lu.%02lu BogoMIPS (lpj=%lu)\n", lpj/(500000/HZ), - (lpj/(5000/HZ)) % 100, lpj); + (lpj * 10 /(50000 / HZ)) % 100, lpj); loops_per_jiffy = lpj; printed = true; --- a/init/main.c +++ b/init/main.c @@ -775,6 +775,7 @@ static noinline int init_post(void) system_state = SYSTEM_RUNNING; numa_default_policy(); + print_scheduler_version(); current->signal->flags |= SIGNAL_UNKILLABLE; --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -4,7 +4,7 @@ choice prompt "Timer frequency" - default HZ_250 + default HZ_1000 help Allows the configuration of the timer frequency. It is customary to have the timer interrupt run at 1000 Hz but 100 Hz may be more @@ -23,13 +23,14 @@ choice with lots of processors that may show reduced performance if too many timer interrupts are occurring. - config HZ_250 + config HZ_250_NODEFAULT bool "250 HZ" help - 250 Hz is a good compromise choice allowing server performance - while also showing good interactive responsiveness even - on SMP and NUMA systems. If you are going to be using NTSC video - or multimedia, selected 300Hz instead. 
+ 250 HZ is a lousy compromise choice allowing server interactivity + while also showing desktop throughput and no extra power saving on + laptops. No good for anything. + + Recommend 100 or 1000 instead. config HZ_300 bool "300 HZ" @@ -43,16 +44,82 @@ choice bool "1000 HZ" help 1000 Hz is the preferred choice for desktop systems and other - systems requiring fast interactive responses to events. + systems requiring fast interactive responses to events. Laptops + can also benefit from this choice without sacrificing battery life + if dynticks is also enabled. + + config HZ_1500 + bool "1500 HZ" + help + 1500 Hz is an insane value to use to run broken software that is Hz + limited. + + Being over 1000, driver breakage is likely. + + config HZ_2000 + bool "2000 HZ" + help + 2000 Hz is an insane value to use to run broken software that is Hz + limited. + + Being over 1000, driver breakage is likely. + + config HZ_3000 + bool "3000 HZ" + help + 3000 Hz is an insane value to use to run broken software that is Hz + limited. + + Being over 1000, driver breakage is likely. + + config HZ_4000 + bool "4000 HZ" + help + 4000 Hz is an insane value to use to run broken software that is Hz + limited. + + Being over 1000, driver breakage is likely. + + config HZ_5000 + bool "5000 HZ" + help + 5000 Hz is an obscene value to use to run broken software that is Hz + limited. + + Being over 1000, driver breakage is likely. + + config HZ_7500 + bool "7500 HZ" + help + 7500 Hz is an obscene value to use to run broken software that is Hz + limited. + + Being over 1000, driver breakage is likely. + + config HZ_10000 + bool "10000 HZ" + help + 10000 Hz is an obscene value to use to run broken software that is Hz + limited. + + Being over 1000, driver breakage is likely. + endchoice config HZ int default 100 if HZ_100 - default 250 if HZ_250 + default 250 if HZ_250_NODEFAULT default 300 if HZ_300 default 1000 if HZ_1000 + default 1500 if HZ_1500 + default 2000 if HZ_2000 + default 3000 if HZ_3000 + default 4000 if HZ_4000 + default 5000 if HZ_5000 + default 7500 if HZ_7500 + default 10000 if HZ_10000 config SCHED_HRTICK def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS) --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -1,7 +1,7 @@ choice prompt "Preemption Model" - default PREEMPT_NONE + default PREEMPT config PREEMPT_NONE bool "No Forced Preemption (Server)" @@ -17,7 +17,7 @@ config PREEMPT_NONE latencies. config PREEMPT_VOLUNTARY - bool "Voluntary Kernel Preemption (Desktop)" + bool "Voluntary Kernel Preemption (Nothing)" help This option reduces the latency of the kernel by adding more "explicit preemption points" to the kernel code. These new @@ -31,7 +31,8 @@ config PREEMPT_VOLUNTARY applications to run more 'smoothly' even when the system is under load. - Select this if you are building a kernel for a desktop system. + Select this for no system in particular (choose Preemptible + instead on a desktop if you know what's good for you). 
config PREEMPT bool "Preemptible Kernel (Low-Latency Desktop)" --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -128,7 +128,7 @@ int __delayacct_add_tsk(struct taskstats */ t1 = tsk->sched_info.pcount; t2 = tsk->sched_info.run_delay; - t3 = tsk->se.sum_exec_runtime; + t3 = tsk_seruntime(tsk); d->cpu_count += t1; --- a/kernel/exit.c +++ b/kernel/exit.c @@ -132,7 +132,7 @@ static void __exit_signal(struct task_st sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; + sig->sum_sched_runtime += tsk_seruntime(tsk); } sig->nr_threads--; --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -250,7 +250,7 @@ void thread_group_cputime(struct task_st do { times->utime = cputime_add(times->utime, t->utime); times->stime = cputime_add(times->stime, t->stime); - times->sum_exec_runtime += task_sched_runtime(t); + times->sum_exec_runtime += tsk_seruntime(t); } while_each_thread(tsk, t); out: rcu_read_unlock(); @@ -512,7 +512,7 @@ static void cleanup_timers(struct list_h void posix_cpu_timers_exit(struct task_struct *tsk) { cleanup_timers(tsk->cpu_timers, - tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); + tsk->utime, tsk->stime, tsk_seruntime(tsk)); } void posix_cpu_timers_exit_group(struct task_struct *tsk) @@ -522,7 +522,7 @@ void posix_cpu_timers_exit_group(struct cleanup_timers(tsk->signal->cpu_timers, cputime_add(tsk->utime, sig->utime), cputime_add(tsk->stime, sig->stime), - tsk->se.sum_exec_runtime + sig->sum_sched_runtime); + tsk_seruntime(tsk) + sig->sum_sched_runtime); } static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) @@ -953,7 +953,7 @@ static void check_thread_timers(struct t struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { + if (!--maxfire || tsk_seruntime(tsk) < t->expires.sched) { tsk->cputime_expires.sched_exp = t->expires.sched; break; } @@ -970,7 +970,7 @@ static void check_thread_timers(struct t ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); if (hard != RLIM_INFINITY && - tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { + tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { /* * At the hard limit, we just die. * No need to calculate anything else now. @@ -978,7 +978,7 @@ static void check_thread_timers(struct t __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } - if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { + if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { /* * At the soft limit, send a SIGXCPU every second. 
*/ @@ -1280,7 +1280,7 @@ static inline int fastpath_timer_check(s struct task_cputime task_sample = { .utime = tsk->utime, .stime = tsk->stime, - .sum_exec_runtime = tsk->se.sum_exec_runtime + .sum_exec_runtime = tsk_seruntime(tsk) }; if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9450,4 +9450,3 @@ struct cgroup_subsys cpuacct_subsys = { .subsys_id = cpuacct_subsys_id, }; #endif /* CONFIG_CGROUP_CPUACCT */ - --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -121,7 +121,7 @@ static int __maybe_unused one = 1; static int __maybe_unused two = 2; static int __maybe_unused three = 3; static unsigned long one_ul = 1; -static int one_hundred = 100; +static int __maybe_unused one_hundred = 100; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif @@ -258,7 +258,7 @@ static struct ctl_table root_table[] = { { } }; -#ifdef CONFIG_SCHED_DEBUG +#if defined(CONFIG_SCHED_DEBUG) static int min_sched_granularity_ns = 100000; /* 100 usecs */ static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_wakeup_granularity_ns; /* 0 usecs */ --- a/mm/filemap.c +++ b/mm/filemap.c @@ -498,8 +498,8 @@ out: } EXPORT_SYMBOL(add_to_page_cache_locked); -int add_to_page_cache_lru(struct page *page, struct address_space *mapping, - pgoff_t offset, gfp_t gfp_mask) +int __add_to_page_cache_lru(struct page *page, struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask, int tail) { int ret; @@ -515,12 +515,18 @@ int add_to_page_cache_lru(struct page *p ret = add_to_page_cache(page, mapping, offset, gfp_mask); if (ret == 0) { if (page_is_file_cache(page)) - lru_cache_add_file(page); + lru_cache_add_file_tail(page, tail); else lru_cache_add_anon(page); } return ret; } + +int add_to_page_cache_lru(struct page *page, struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask) +{ + return __add_to_page_cache_lru(page, mapping, offset, gfp_mask, 0); +} EXPORT_SYMBOL_GPL(add_to_page_cache_lru); #ifdef CONFIG_NUMA --- a/mm/memory.c +++ b/mm/memory.c @@ -2995,7 +2995,7 @@ static int do_swap_page(struct mm_struct mem_cgroup_commit_charge_swapin(page, ptr); swap_free(entry); - if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) + if ((vma->vm_flags & VM_LOCKED) || PageMlocked(page)) try_to_free_swap(page); unlock_page(page); if (swapcache) { --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -78,7 +78,7 @@ int vm_highmem_is_dirtyable; /* * The generator of dirty data starts writeback at this percentage */ -int vm_dirty_ratio = 20; +int vm_dirty_ratio = 1; /* * vm_dirty_bytes starts at 0 (disabled) so that it is a function of --- a/mm/readahead.c +++ b/mm/readahead.c @@ -17,6 +17,7 @@ #include #include #include +#include /* * Initialise a struct file's readahead state. 
Assumes that the caller has @@ -107,7 +108,7 @@ int read_cache_pages(struct address_spac EXPORT_SYMBOL(read_cache_pages); static int read_pages(struct address_space *mapping, struct file *filp, - struct list_head *pages, unsigned nr_pages) + struct list_head *pages, unsigned nr_pages, int tail) { struct blk_plug plug; unsigned page_idx; @@ -125,8 +126,8 @@ static int read_pages(struct address_spa for (page_idx = 0; page_idx < nr_pages; page_idx++) { struct page *page = list_to_page(pages); list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, - page->index, GFP_KERNEL)) { + if (!__add_to_page_cache_lru(page, mapping, + page->index, GFP_KERNEL, tail)) { mapping->a_ops->readpage(filp, page); } page_cache_release(page); @@ -139,6 +140,28 @@ out: return ret; } +static inline int nr_mapped(void) +{ + return global_page_state(NR_FILE_MAPPED) + + global_page_state(NR_ANON_PAGES); +} + +/* + * This examines how large in pages a file size is and returns 1 if it is + * more than half the unmapped ram. Avoid doing read_page_state which is + * expensive unless we already know it is likely to be large enough. + */ +static int large_isize(unsigned long nr_pages) +{ + if (nr_pages * 6 > vm_total_pages) { + unsigned long unmapped_ram = vm_total_pages - nr_mapped(); + + if (nr_pages * 2 > unmapped_ram) + return 1; + } + return 0; +} + /* * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all * the pages first, then submits them all for I/O. This avoids the very bad @@ -196,7 +219,8 @@ __do_page_cache_readahead(struct address * will then handle the error. */ if (ret) - read_pages(mapping, filp, &page_pool, ret); + read_pages(mapping, filp, &page_pool, ret, + large_isize(end_index)); BUG_ON(!list_empty(&page_pool)); out: return ret; --- a/mm/swap.c +++ b/mm/swap.c @@ -348,15 +348,23 @@ void mark_page_accessed(struct page *pag EXPORT_SYMBOL(mark_page_accessed); -void __lru_cache_add(struct page *page, enum lru_list lru) +void ______pagevec_lru_add(struct pagevec *pvec, enum lru_list lru, int tail); + +void ____lru_cache_add(struct page *page, enum lru_list lru, int tail) { struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; page_cache_get(page); if (!pagevec_add(pvec, page)) - ____pagevec_lru_add(pvec, lru); + ______pagevec_lru_add(pvec, lru, tail); put_cpu_var(lru_add_pvecs); } +EXPORT_SYMBOL(____lru_cache_add); + +void __lru_cache_add(struct page *page, enum lru_list lru) +{ + ____lru_cache_add(page, lru, 0); +} EXPORT_SYMBOL(__lru_cache_add); /** @@ -364,7 +372,7 @@ EXPORT_SYMBOL(__lru_cache_add); * @page: the page to be added to the LRU. * @lru: the LRU list to which the page is added. 
*/ -void lru_cache_add_lru(struct page *page, enum lru_list lru) +void __lru_cache_add_lru(struct page *page, enum lru_list lru, int tail) { if (PageActive(page)) { VM_BUG_ON(PageUnevictable(page)); @@ -375,7 +383,12 @@ void lru_cache_add_lru(struct page *page } VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page)); - __lru_cache_add(page, lru); + ____lru_cache_add(page, lru, tail); +} + +void lru_cache_add_lru(struct page *page, enum lru_list lru) +{ + __lru_cache_add_lru(page, lru, 0); } /** @@ -662,7 +675,7 @@ void lru_add_page_tail(struct zone* zone head = page->lru.prev; else head = &zone->lru[lru].list; - __add_page_to_lru_list(zone, page_tail, lru, head); + __add_page_to_lru_list(zone, page_tail, lru, head, 0); } else { SetPageUnevictable(page_tail); add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE); @@ -691,13 +704,18 @@ static void ____pagevec_lru_add_fn(struc * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. */ -void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) +void ______pagevec_lru_add(struct pagevec *pvec, enum lru_list lru, int tail) { VM_BUG_ON(is_unevictable_lru(lru)); pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru); } +void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) +{ + ______pagevec_lru_add(pvec, lru, 0); +} + EXPORT_SYMBOL(____pagevec_lru_add); /* --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -289,7 +289,7 @@ checks: scan_base = offset = si->lowest_bit; /* reuse swap entry of cache-only swap if not busy. */ - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + if (si->swap_map[offset] == SWAP_HAS_CACHE) { int swap_was_freed; spin_unlock(&swap_lock); swap_was_freed = __try_to_reclaim_swap(si, offset); @@ -378,7 +378,7 @@ scan: spin_lock(&swap_lock); goto checks; } - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + if (si->swap_map[offset] == SWAP_HAS_CACHE) { spin_lock(&swap_lock); goto checks; } @@ -393,7 +393,7 @@ scan: spin_lock(&swap_lock); goto checks; } - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + if (si->swap_map[offset] == SWAP_HAS_CACHE) { spin_lock(&swap_lock); goto checks; } @@ -707,8 +707,7 @@ int free_swap_and_cache(swp_entry_t entr * Not mapped elsewhere, or swap space full? Free it! * Also recheck PageSwapCache now page is locked (above). */ - if (PageSwapCache(page) && !PageWriteback(page) && - (!page_mapped(page) || vm_swap_full())) { + if (PageSwapCache(page) && !PageWriteback(page)) { delete_from_swap_cache(page); SetPageDirty(page); } --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -148,7 +149,7 @@ struct scan_control { /* * From 0 .. 100. Higher means more swappy. */ -int vm_swappiness = 60; +int vm_swappiness; long vm_total_pages; /* The total number of pages which the VM controls */ static LIST_HEAD(shrinker_list); @@ -932,7 +933,7 @@ cull_mlocked: activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ - if (PageSwapCache(page) && vm_swap_full()) + if (PageSwapCache(page)) try_to_free_swap(page); VM_BUG_ON(PageActive(page)); SetPageActive(page); @@ -1986,6 +1987,35 @@ restart: } /* + * Helper functions to adjust nice level of kswapd, based on the priority of + * the task (p) that called it. If it is already higher priority we do not + * demote its nice level since it is still working on behalf of a higher + * priority task. With kernel threads we leave it at nice 0. 
+ * + * We don't ever run kswapd real time, so if a real time task calls kswapd we + * set it to highest SCHED_NORMAL priority. + */ +static inline int effective_sc_prio(struct task_struct *p) +{ + if (likely(p->mm)) { + if (rt_task(p)) + return -20; + if (p->policy == SCHED_IDLEPRIO) + return 19; + return task_nice(p); + } + return 0; +} + +static void set_kswapd_nice(struct task_struct *kswapd, int active) +{ + long nice = effective_sc_prio(current); + + if (task_nice(kswapd) > nice || !active) + set_user_nice(kswapd, nice); +} + +/* * This is the direct reclaim path, for page-allocating processes. We only * try to reclaim pages from zones which will satisfy the caller's allocation * request. @@ -2706,6 +2736,8 @@ static void kswapd_try_to_sleep(pg_data_ finish_wait(&pgdat->kswapd_wait, &wait); } +#define WT_EXPIRY (HZ * 5) /* Time to wakeup watermark_timer */ + /* * The background pageout daemon, started as a kernel thread * from the init process. @@ -2757,6 +2789,9 @@ static int kswapd(void *p) for ( ; ; ) { int ret; + /* kswapd has been busy so delay watermark_timer */ + mod_timer(&pgdat->watermark_timer, jiffies + WT_EXPIRY); + /* * If the last balance_pgdat was unsuccessful it's unlikely a * new request of a similar or harder type will succeed soon @@ -2806,6 +2841,7 @@ static int kswapd(void *p) void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) { pg_data_t *pgdat; + int active; if (!populated_zone(zone)) return; @@ -2817,7 +2853,9 @@ void wakeup_kswapd(struct zone *zone, in pgdat->kswapd_max_order = order; pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); } - if (!waitqueue_active(&pgdat->kswapd_wait)) + active = waitqueue_active(&pgdat->kswapd_wait); + set_kswapd_nice(pgdat->kswapd, active); + if (!active) return; if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) return; @@ -2930,20 +2968,57 @@ static int __devinit cpu_callback(struct } /* + * We wake up kswapd every WT_EXPIRY till free ram is above pages_lots + */ +static void watermark_wakeup(unsigned long data) +{ + pg_data_t *pgdat = (pg_data_t *)data; + struct timer_list *wt = &pgdat->watermark_timer; + int i; + + if (!waitqueue_active(&pgdat->kswapd_wait) || above_background_load()) + goto out; + for (i = pgdat->nr_zones - 1; i >= 0; i--) { + struct zone *z = pgdat->node_zones + i; + + if (!populated_zone(z) || is_highmem(z)) { + /* We are better off leaving highmem full */ + continue; + } + if (!zone_watermark_ok(z, 0, lots_wmark_pages(z), 0, 0)) { + wake_up_interruptible(&pgdat->kswapd_wait); + goto out; + } + } +out: + mod_timer(wt, jiffies + WT_EXPIRY); + return; +} + +/* * This kswapd start function will be called by init and node-hot-add. * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. */ int kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); + struct timer_list *wt; int ret = 0; if (pgdat->kswapd) return 0; + wt = &pgdat->watermark_timer; + init_timer(wt); + wt->data = (unsigned long)pgdat; + wt->function = watermark_wakeup; + wt->expires = jiffies + WT_EXPIRY; + add_timer(wt); + pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ + del_timer(wt); BUG_ON(system_state == SYSTEM_BOOTING); printk("Failed to start kswapd on node %d\n",nid); ret = -1;
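
A note on the BogoMIPS arithmetic touched above in arch/x86/kernel/cpu/proc.c, arch/x86/kernel/smpboot.c and init/calibrate.c: BogoMIPS is roughly lpj * HZ / 500000, printed as lpj/(500000/HZ) whole units plus (lpj/(5000/HZ)) % 100 hundredths. Once the new Kconfig.hz choices above 5000 Hz become selectable, the old divisor 5000/HZ truncates to zero, so those sites are rewritten as (lpj * 10 / (50000/HZ)) % 100, which stays valid up to HZ = 50000. The standalone sketch below is not part of the patch; the lpj values are invented for the demo and hz is a runtime parameter rather than the kernel's compile-time HZ.

#include <stdio.h>

/* Print the BogoMIPS value with both the old and the new fractional formula. */
static void show(unsigned long lpj, unsigned long hz)
{
	unsigned long whole = lpj / (500000 / hz);
	unsigned long new_frac = (lpj * 10 / (50000 / hz)) % 100;

	if (5000 / hz)	/* old formula divides by zero once hz > 5000 */
		printf("HZ=%-5lu old: %lu.%02lu  new: %lu.%02lu\n", hz,
		       whole, (lpj / (5000 / hz)) % 100, whole, new_frac);
	else
		printf("HZ=%-5lu old: divide-by-zero  new: %lu.%02lu\n",
		       hz, whole, new_frac);
}

int main(void)
{
	show(4999500UL, 100);	/* ~999.90 BogoMIPS at HZ=100 */
	show(499950UL, 1000);	/* same machine at HZ=1000 */
	show(49995UL, 10000);	/* old formula breaks here, new one still works */
	return 0;
}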
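
The mm/readahead.c hunk adds large_isize(), which makes read_pages() queue a readahead batch at the tail of the inactive file LRU once the file being read covers more than half of the currently unmapped RAM, so a single large streaming read cannot evict the existing working set. A minimal userspace sketch of that check follows; it is an illustration only, and total_pages / mapped_pages are hypothetical stand-ins for vm_total_pages and the NR_FILE_MAPPED + NR_ANON_PAGES counters the patch reads.

#include <stdbool.h>

extern unsigned long total_pages;	/* stand-in for vm_total_pages */
extern unsigned long mapped_pages;	/* stand-in for file-mapped + anon pages */

/* Return true when a file of file_pages pages is "large": bigger than half
 * of the unmapped RAM.  The cheap 1/6-of-RAM pre-check avoids touching the
 * page-state counters for small files, mirroring the comment in the patch. */
static bool file_is_large(unsigned long file_pages)
{
	if (file_pages * 6 > total_pages) {
		unsigned long unmapped = total_pages - mapped_pages;

		return file_pages * 2 > unmapped;
	}
	return false;
}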