From: Ziggy Date: Sat, 18 Aug 2012 17:30:46 +0000 (-0400) Subject: Implement ck1 patchset X-Git-Url: https://ziggy471.com/git/gitweb.cgi?p=ziggy471-l710-kernel.git;a=commitdiff;h=db9cdec8081f972d390ed97af9ccfe68af2dea04 Implement ck1 patchset --- --- a/Makefile +++ b/Makefile @@ -10,6 +10,10 @@ NAME = Sneaky Weasel # Comments in this file are targeted only to the developer, do not # expect to learn how to build the kernel reading this file. +CKVERSION = -ck1 +CKNAME = BFS Powered +EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION) + # Do not: # o use make's built-in rules and variables # (this increases performance and avoids hard-to-debug behaviour); --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -64,11 +64,6 @@ static struct timer_list spusched_timer; static struct timer_list spuloadavg_timer; /* - * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). - */ -#define NORMAL_PRIO 120 - -/* * Frequency of the spu scheduler tick. By default we do one SPU scheduler * tick for every 10 CPU scheduler ticks. */ --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1072,7 +1072,7 @@ endchoice choice depends on EXPERIMENTAL - prompt "Memory split" if EXPERT + prompt "Memory split" default VMSPLIT_3G depends on X86_32 ---help--- @@ -1092,17 +1092,17 @@ choice option alone! config VMSPLIT_3G - bool "3G/1G user/kernel split" + bool "Default 896MB lowmem (3G/1G user/kernel split)" config VMSPLIT_3G_OPT depends on !X86_PAE - bool "3G/1G user/kernel split (for full 1G low memory)" + bool "1GB lowmem (3G/1G user/kernel split)" config VMSPLIT_2G - bool "2G/2G user/kernel split" + bool "2GB lowmem (2G/2G user/kernel split)" config VMSPLIT_2G_OPT depends on !X86_PAE - bool "2G/2G user/kernel split (for full 2G low memory)" + bool "2GB lowmem (2G/2G user/kernel split)" config VMSPLIT_1G - bool "1G/3G user/kernel split" + bool "3GB lowmem (1G/3G user/kernel split)" endchoice config PAGE_OFFSET --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -109,7 +109,7 @@ static int show_cpuinfo(struct seq_file seq_printf(m, "\nbogomips\t: %lu.%02lu\n", c->loops_per_jiffy/(500000/HZ), - (c->loops_per_jiffy/(5000/HZ)) % 100); + (c->loops_per_jiffy * 10 /(50000/HZ)) % 100); #ifdef CONFIG_X86_64 if (c->x86_tlbsize > 0) --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -430,7 +430,7 @@ static void impress_friends(void) "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", num_online_cpus(), bogosum/(500000/HZ), - (bogosum/(5000/HZ))%100); + (bogosum * 10/(50000/HZ))%100); pr_debug("Before bogocount - setting activated=1.\n"); } --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -1611,6 +1612,12 @@ int __cpufreq_driver_target(struct cpufr target_freq, relation); if (cpu_online(policy->cpu) && cpufreq_driver->target) retval = cpufreq_driver->target(policy, target_freq, relation); + if (likely(retval != -EINVAL)) { + if (target_freq == policy->max) + cpu_nonscaling(policy->cpu); + else + cpu_scaling(policy->cpu); + } return retval; } --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -29,8 +29,8 @@ * It helps to keep variable names smaller, simpler */ -#define DEF_FREQUENCY_UP_THRESHOLD (80) -#define DEF_FREQUENCY_DOWN_THRESHOLD (20) +#define DEF_FREQUENCY_UP_THRESHOLD (63) +#define DEF_FREQUENCY_DOWN_THRESHOLD (26) /* * The polling frequency of this governor depends on the capability of --- 
a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -31,8 +31,8 @@ * It helps to keep variable names smaller, simpler */ -#define DEF_FREQUENCY_DOWN_DIFFERENTIAL (10) -#define DEF_FREQUENCY_UP_THRESHOLD (80) +#define DEF_FREQUENCY_DOWN_DIFFERENTIAL (26) +#define DEF_FREQUENCY_UP_THRESHOLD (63) #define DEF_SAMPLING_DOWN_FACTOR (1) #define MAX_SAMPLING_DOWN_FACTOR (100000) #define MICRO_FREQUENCY_DOWN_DIFFERENTIAL (3) @@ -549,10 +549,10 @@ static void dbs_check_cpu(struct cpu_dbs /* * Every sampling_rate, we check, if current idle time is less - * than 20% (default), then we try to increase frequency + * than 37% (default), then we try to increase frequency * Every sampling_rate, we look for a the lowest * frequency which can sustain the load while keeping idle time over - * 30%. If such a frequency exist, we try to decrease to this frequency. + * 63%. If such a frequency exist, we try to decrease to this frequency. * * Any frequency increase takes it to the maximum frequency. * Frequency reduction happens at minimum steps of --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -418,7 +418,7 @@ static int proc_pid_stack(struct seq_fil static int proc_pid_schedstat(struct task_struct *task, char *buffer) { return sprintf(buffer, "%llu %llu %lu\n", - (unsigned long long)task->se.sum_exec_runtime, + (unsigned long long)tsk_seruntime(task), (unsigned long long)task->sched_info.run_delay, task->sched_info.pcount); } --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -64,6 +64,8 @@ static inline int task_ioprio_class(stru static inline int task_nice_ioprio(struct task_struct *task) { + if (iso_task(task)) + return 0; return (task_nice(task) + 20) / 5; } --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -164,7 +164,7 @@ static inline u64 get_jiffies_64(void) * Have the 32 bit jiffies value wrap 5 minutes after boot * so jiffies wrap bugs show up earlier. 
*/ -#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) +#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ)) /* * Change timeval to jiffies, trying to avoid the --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -23,9 +23,12 @@ static inline int page_is_file_cache(str static inline void __add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l, - struct list_head *head) + struct list_head *head, int tail) { - list_add(&page->lru, head); + if (tail) + list_add_tail(&page->lru, head); + else + list_add(&page->lru, head); __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page)); mem_cgroup_add_lru_list(page, l); } @@ -33,7 +36,13 @@ __add_page_to_lru_list(struct zone *zone static inline void add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) { - __add_page_to_lru_list(zone, page, l, &zone->lru[l].list); + __add_page_to_lru_list(zone, page, l, &zone->lru[l].list, 0); +} + +static inline void +add_page_to_lru_list_tail(struct zone *zone, struct page *page, enum lru_list l) +{ + __add_page_to_lru_list(zone, page, l, &zone->lru[l].list, 1); } static inline void --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -162,12 +163,14 @@ enum zone_watermarks { WMARK_MIN, WMARK_LOW, WMARK_HIGH, + WMARK_LOTS, NR_WMARK }; #define min_wmark_pages(z) (z->watermark[WMARK_MIN]) #define low_wmark_pages(z) (z->watermark[WMARK_LOW]) #define high_wmark_pages(z) (z->watermark[WMARK_HIGH]) +#define lots_wmark_pages(z) (z->watermark[WMARK_LOTS]) struct per_cpu_pages { int count; /* number of pages in the list */ @@ -339,7 +342,7 @@ struct zone { ZONE_PADDING(_pad1_) /* Fields commonly accessed by the page reclaim scanner */ - spinlock_t lru_lock; + spinlock_t lru_lock; struct zone_lru { struct list_head list; } lru[NR_LRU_LISTS]; @@ -641,6 +644,7 @@ typedef struct pglist_data { wait_queue_head_t kswapd_wait; struct task_struct *kswapd; int kswapd_max_order; + struct timer_list watermark_timer; enum zone_type classzone_idx; } pg_data_t; --- a/include/linux/nfsd/stats.h +++ b/include/linux/nfsd/stats.h @@ -11,8 +11,8 @@ #include -/* thread usage wraps very million seconds (approx one fortnight) */ -#define NFSD_USAGE_WRAP (HZ*1000000) +/* thread usage wraps every one hundred thousand seconds (approx one day) */ +#define NFSD_USAGE_WRAP (HZ*100000) #ifdef __KERNEL__ --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -458,6 +458,8 @@ int add_to_page_cache_locked(struct page pgoff_t index, gfp_t gfp_mask); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); +int __add_to_page_cache_lru(struct page *page, struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask, int tail); extern void delete_from_page_cache(struct page *page); extern void __delete_from_page_cache(struct page *page); int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -39,6 +39,8 @@ #define SCHED_BATCH 3 /* SCHED_ISO: reserved but not implemented yet */ #define SCHED_IDLE 5 +#define SCHED_IDLEPRIO SCHED_IDLE + /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ #define SCHED_RESET_ON_FORK 0x40000000 @@ -268,8 +270,6 @@ extern asmlinkage void schedule_tail(str extern void init_idle(struct task_struct *idle, int cpu); extern void init_idle_bootup_task(struct task_struct *idle); 
-extern int runqueue_is_locked(int cpu); - extern cpumask_var_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) extern void select_nohz_load_balancer(int stop_tick); @@ -1226,9 +1226,12 @@ struct task_struct { #ifdef CONFIG_SMP struct task_struct *wake_entry; - int on_cpu; #endif - int on_rq; +#if defined(CONFIG_SMP) + bool on_cpu; +#endif +#endif + bool on_rq; int prio, static_prio, normal_prio; unsigned int rt_priority; @@ -1572,6 +1575,42 @@ struct task_struct { #endif }; +extern int runqueue_is_locked(int cpu); +static inline void cpu_scaling(int cpu) +{ +} + +static inline void cpu_nonscaling(int cpu) +{ +} +#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) +#define tsk_rttimeout(t) ((t)->rt.timeout) + +static inline void tsk_cpus_current(struct task_struct *p) +{ + p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed; +} + +static inline void print_scheduler_version(void) +{ + printk(KERN_INFO"CFS CPU scheduler.\n"); +} + +static inline bool iso_task(struct task_struct *p) +{ + return false; +} + +static inline void remove_cpu(int cpu) +{ +} + +/* Anyone feel like implementing this? */ +static inline int above_background_load(void) +{ + return 1; +} + /* Future-safe accessor for struct task_struct's cpus_allowed. */ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) @@ -1589,10 +1628,11 @@ struct task_struct { */ #define MAX_USER_RT_PRIO 100 -#define MAX_RT_PRIO MAX_USER_RT_PRIO +#define MAX_RT_PRIO (MAX_USER_RT_PRIO + 1) +#define DEFAULT_PRIO (MAX_RT_PRIO + 20) #define MAX_PRIO (MAX_RT_PRIO + 40) -#define DEFAULT_PRIO (MAX_RT_PRIO + 20) +#define NORMAL_PRIO DEFAULT_PRIO static inline int rt_prio(int prio) { @@ -1942,7 +1982,7 @@ extern unsigned long long task_sched_runtime(struct task_struct *task); /* sched_exec is called by processes performing an exec */ -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) extern void sched_exec(void); #else #define sched_exec() {} @@ -2571,7 +2611,7 @@ extern void signal_wake_up(struct task_s */ #ifdef CONFIG_SMP -static inline unsigned int task_cpu(const struct task_struct *p) +static inline int task_cpu(const struct task_struct *p) { return task_thread_info(p)->cpu; } @@ -2580,12 +2620,12 @@ extern void set_task_cpu(struct task_str #else -static inline unsigned int task_cpu(const struct task_struct *p) +static inline int task_cpu(const struct task_struct *p) { return 0; } -static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) +static inline void set_task_cpu(struct task_struct *p, int cpu) { } @@ -2699,5 +2739,3 @@ static inline unsigned long rlimit_max(u } #endif /* __KERNEL__ */ - -#endif --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -201,7 +201,7 @@ struct swap_list_t { int next; /* swapfile to be used next */ }; -/* Swap 50% full? Release swapcache more aggressively.. */ +/* Swap 50% full? 
*/ #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages) /* linux/mm/page_alloc.c */ @@ -215,6 +215,7 @@ extern unsigned int nr_free_pagecache_pa /* linux/mm/swap.c */ +extern void ____lru_cache_add(struct page *, enum lru_list lru, int tail); extern void __lru_cache_add(struct page *, enum lru_list lru); extern void lru_cache_add_lru(struct page *, enum lru_list lru); extern void lru_add_page_tail(struct zone* zone, @@ -238,9 +239,14 @@ static inline void lru_cache_add_anon(st __lru_cache_add(page, LRU_INACTIVE_ANON); } +static inline void lru_cache_add_file_tail(struct page *page, int tail) +{ + ____lru_cache_add(page, LRU_INACTIVE_FILE, tail); +} + static inline void lru_cache_add_file(struct page *page) { - __lru_cache_add(page, LRU_INACTIVE_FILE); + ____lru_cache_add(page, LRU_INACTIVE_FILE, 0); } /* LRU Isolation modes. */ @@ -350,9 +356,10 @@ extern void grab_swap_token(struct mm_st extern void __put_swap_token(struct mm_struct *); extern void disable_swap_token(struct mem_cgroup *memcg); +/* Only allow swap token to have effect if swap is full */ static inline int has_swap_token(struct mm_struct *mm) { - return (mm == swap_token_mm); + return (mm == swap_token_mm && vm_swap_full()); } static inline void put_swap_token(struct mm_struct *mm) --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -39,8 +39,8 @@ struct inet_hashinfo; * If time > 4sec, it is "slow" path, no recycling is required, * so that we select tick to get range about 4 seconds. */ -#if HZ <= 16 || HZ > 4096 -# error Unsupported: HZ <= 16 or HZ > 4096 +#if HZ <= 16 || HZ > 16384 +# error Unsupported: HZ <= 16 or HZ > 16384 #elif HZ <= 32 # define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) #elif HZ <= 64 @@ -55,8 +55,12 @@ struct inet_hashinfo; # define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) #elif HZ <= 2048 # define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) -#else +#elif HZ <= 4096 # define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +#elif HZ <= 8192 +# define INET_TWDR_RECYCLE_TICK (13 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +#else +# define INET_TWDR_RECYCLE_TICK (14 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) #endif /* TIME_WAIT reaping mechanism. */ --- a/init/calibrate.c +++ b/init/calibrate.c @@ -269,7 +269,7 @@ void __cpuinit calibrate_delay(void) if (!printed) pr_cont("%lu.%02lu BogoMIPS (lpj=%lu)\n", lpj/(500000/HZ), - (lpj/(5000/HZ)) % 100, lpj); + (lpj * 10 /(50000 / HZ)) % 100, lpj); loops_per_jiffy = lpj; printed = true; --- a/init/main.c +++ b/init/main.c @@ -775,6 +775,7 @@ static noinline int init_post(void) system_state = SYSTEM_RUNNING; numa_default_policy(); + print_scheduler_version(); current->signal->flags |= SIGNAL_UNKILLABLE; --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -4,7 +4,7 @@ choice prompt "Timer frequency" - default HZ_250 + default HZ_1000 help Allows the configuration of the timer frequency. It is customary to have the timer interrupt run at 1000 Hz but 100 Hz may be more @@ -23,13 +23,14 @@ choice with lots of processors that may show reduced performance if too many timer interrupts are occurring. - config HZ_250 + config HZ_250_NODEFAULT bool "250 HZ" help - 250 Hz is a good compromise choice allowing server performance - while also showing good interactive responsiveness even - on SMP and NUMA systems. If you are going to be using NTSC video - or multimedia, selected 300Hz instead. 
+ 250 HZ is a lousy compromise choice allowing server interactivity + while also showing desktop throughput and no extra power saving on + laptops. No good for anything. + + Recommend 100 or 1000 instead. config HZ_300 bool "300 HZ" @@ -43,16 +44,82 @@ choice bool "1000 HZ" help 1000 Hz is the preferred choice for desktop systems and other - systems requiring fast interactive responses to events. + systems requiring fast interactive responses to events. Laptops + can also benefit from this choice without sacrificing battery life + if dynticks is also enabled. + + config HZ_1500 + bool "1500 HZ" + help + 1500 Hz is an insane value to use to run broken software that is Hz + limited. + + Being over 1000, driver breakage is likely. + + config HZ_2000 + bool "2000 HZ" + help + 2000 Hz is an insane value to use to run broken software that is Hz + limited. + + Being over 1000, driver breakage is likely. + + config HZ_3000 + bool "3000 HZ" + help + 3000 Hz is an insane value to use to run broken software that is Hz + limited. + + Being over 1000, driver breakage is likely. + + config HZ_4000 + bool "4000 HZ" + help + 4000 Hz is an insane value to use to run broken software that is Hz + limited. + + Being over 1000, driver breakage is likely. + + config HZ_5000 + bool "5000 HZ" + help + 5000 Hz is an obscene value to use to run broken software that is Hz + limited. + + Being over 1000, driver breakage is likely. + + config HZ_7500 + bool "7500 HZ" + help + 7500 Hz is an obscene value to use to run broken software that is Hz + limited. + + Being over 1000, driver breakage is likely. + + config HZ_10000 + bool "10000 HZ" + help + 10000 Hz is an obscene value to use to run broken software that is Hz + limited. + + Being over 1000, driver breakage is likely. + endchoice config HZ int default 100 if HZ_100 - default 250 if HZ_250 + default 250 if HZ_250_NODEFAULT default 300 if HZ_300 default 1000 if HZ_1000 + default 1500 if HZ_1500 + default 2000 if HZ_2000 + default 3000 if HZ_3000 + default 4000 if HZ_4000 + default 5000 if HZ_5000 + default 7500 if HZ_7500 + default 10000 if HZ_10000 config SCHED_HRTICK def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS) --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -1,7 +1,7 @@ choice prompt "Preemption Model" - default PREEMPT_NONE + default PREEMPT config PREEMPT_NONE bool "No Forced Preemption (Server)" @@ -17,7 +17,7 @@ config PREEMPT_NONE latencies. config PREEMPT_VOLUNTARY - bool "Voluntary Kernel Preemption (Desktop)" + bool "Voluntary Kernel Preemption (Nothing)" help This option reduces the latency of the kernel by adding more "explicit preemption points" to the kernel code. These new @@ -31,7 +31,8 @@ config PREEMPT_VOLUNTARY applications to run more 'smoothly' even when the system is under load. - Select this if you are building a kernel for a desktop system. + Select this for no system in particular (choose Preemptible + instead on a desktop if you know what's good for you). 
config PREEMPT bool "Preemptible Kernel (Low-Latency Desktop)" --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -128,7 +128,7 @@ int __delayacct_add_tsk(struct taskstats */ t1 = tsk->sched_info.pcount; t2 = tsk->sched_info.run_delay; - t3 = tsk->se.sum_exec_runtime; + t3 = tsk_seruntime(tsk); d->cpu_count += t1; --- a/kernel/exit.c +++ b/kernel/exit.c @@ -132,7 +132,7 @@ static void __exit_signal(struct task_st sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; + sig->sum_sched_runtime += tsk_seruntime(tsk); } sig->nr_threads--; --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -250,7 +250,7 @@ void thread_group_cputime(struct task_st do { times->utime = cputime_add(times->utime, t->utime); times->stime = cputime_add(times->stime, t->stime); - times->sum_exec_runtime += task_sched_runtime(t); + times->sum_exec_runtime += tsk_seruntime(t); } while_each_thread(tsk, t); out: rcu_read_unlock(); @@ -512,7 +512,7 @@ static void cleanup_timers(struct list_h void posix_cpu_timers_exit(struct task_struct *tsk) { cleanup_timers(tsk->cpu_timers, - tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); + tsk->utime, tsk->stime, tsk_seruntime(tsk)); } void posix_cpu_timers_exit_group(struct task_struct *tsk) @@ -522,7 +522,7 @@ void posix_cpu_timers_exit_group(struct cleanup_timers(tsk->signal->cpu_timers, cputime_add(tsk->utime, sig->utime), cputime_add(tsk->stime, sig->stime), - tsk->se.sum_exec_runtime + sig->sum_sched_runtime); + tsk_seruntime(tsk) + sig->sum_sched_runtime); } static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) @@ -953,7 +953,7 @@ static void check_thread_timers(struct t struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { + if (!--maxfire || tsk_seruntime(tsk) < t->expires.sched) { tsk->cputime_expires.sched_exp = t->expires.sched; break; } @@ -970,7 +970,7 @@ static void check_thread_timers(struct t ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); if (hard != RLIM_INFINITY && - tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { + tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { /* * At the hard limit, we just die. * No need to calculate anything else now. @@ -978,7 +978,7 @@ static void check_thread_timers(struct t __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } - if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { + if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { /* * At the soft limit, send a SIGXCPU every second. 
*/ @@ -1280,7 +1280,7 @@ static inline int fastpath_timer_check(s struct task_cputime task_sample = { .utime = tsk->utime, .stime = tsk->stime, - .sum_exec_runtime = tsk->se.sum_exec_runtime + .sum_exec_runtime = tsk_seruntime(tsk) }; if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9450,4 +9450,3 @@ struct cgroup_subsys cpuacct_subsys = { .subsys_id = cpuacct_subsys_id, }; #endif /* CONFIG_CGROUP_CPUACCT */ - --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -121,7 +121,7 @@ static int __maybe_unused one = 1; static int __maybe_unused two = 2; static int __maybe_unused three = 3; static unsigned long one_ul = 1; -static int one_hundred = 100; +static int __maybe_unused one_hundred = 100; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif @@ -258,7 +258,7 @@ static struct ctl_table root_table[] = { { } }; -#ifdef CONFIG_SCHED_DEBUG +#if defined(CONFIG_SCHED_DEBUG) static int min_sched_granularity_ns = 100000; /* 100 usecs */ static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_wakeup_granularity_ns; /* 0 usecs */ --- a/mm/filemap.c +++ b/mm/filemap.c @@ -498,8 +498,8 @@ out: } EXPORT_SYMBOL(add_to_page_cache_locked); -int add_to_page_cache_lru(struct page *page, struct address_space *mapping, - pgoff_t offset, gfp_t gfp_mask) +int __add_to_page_cache_lru(struct page *page, struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask, int tail) { int ret; @@ -515,12 +515,18 @@ int add_to_page_cache_lru(struct page *p ret = add_to_page_cache(page, mapping, offset, gfp_mask); if (ret == 0) { if (page_is_file_cache(page)) - lru_cache_add_file(page); + lru_cache_add_file_tail(page, tail); else lru_cache_add_anon(page); } return ret; } + +int add_to_page_cache_lru(struct page *page, struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask) +{ + return __add_to_page_cache_lru(page, mapping, offset, gfp_mask, 0); +} EXPORT_SYMBOL_GPL(add_to_page_cache_lru); #ifdef CONFIG_NUMA --- a/mm/memory.c +++ b/mm/memory.c @@ -2995,7 +2995,7 @@ static int do_swap_page(struct mm_struct mem_cgroup_commit_charge_swapin(page, ptr); swap_free(entry); - if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) + if ((vma->vm_flags & VM_LOCKED) || PageMlocked(page)) try_to_free_swap(page); unlock_page(page); if (swapcache) { --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -78,7 +78,7 @@ int vm_highmem_is_dirtyable; /* * The generator of dirty data starts writeback at this percentage */ -int vm_dirty_ratio = 20; +int vm_dirty_ratio = 1; /* * vm_dirty_bytes starts at 0 (disabled) so that it is a function of --- a/mm/readahead.c +++ b/mm/readahead.c @@ -17,6 +17,7 @@ #include #include #include +#include /* * Initialise a struct file's readahead state. 
Assumes that the caller has @@ -107,7 +108,7 @@ int read_cache_pages(struct address_spac EXPORT_SYMBOL(read_cache_pages); static int read_pages(struct address_space *mapping, struct file *filp, - struct list_head *pages, unsigned nr_pages) + struct list_head *pages, unsigned nr_pages, int tail) { struct blk_plug plug; unsigned page_idx; @@ -125,8 +126,8 @@ static int read_pages(struct address_spa for (page_idx = 0; page_idx < nr_pages; page_idx++) { struct page *page = list_to_page(pages); list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, - page->index, GFP_KERNEL)) { + if (!__add_to_page_cache_lru(page, mapping, + page->index, GFP_KERNEL, tail)) { mapping->a_ops->readpage(filp, page); } page_cache_release(page); @@ -139,6 +140,28 @@ out: return ret; } +static inline int nr_mapped(void) +{ + return global_page_state(NR_FILE_MAPPED) + + global_page_state(NR_ANON_PAGES); +} + +/* + * This examines how large in pages a file size is and returns 1 if it is + * more than half the unmapped ram. Avoid doing read_page_state which is + * expensive unless we already know it is likely to be large enough. + */ +static int large_isize(unsigned long nr_pages) +{ + if (nr_pages * 6 > vm_total_pages) { + unsigned long unmapped_ram = vm_total_pages - nr_mapped(); + + if (nr_pages * 2 > unmapped_ram) + return 1; + } + return 0; +} + /* * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all * the pages first, then submits them all for I/O. This avoids the very bad @@ -196,7 +219,8 @@ __do_page_cache_readahead(struct address * will then handle the error. */ if (ret) - read_pages(mapping, filp, &page_pool, ret); + read_pages(mapping, filp, &page_pool, ret, + large_isize(end_index)); BUG_ON(!list_empty(&page_pool)); out: return ret; --- a/mm/swap.c +++ b/mm/swap.c @@ -348,15 +348,23 @@ void mark_page_accessed(struct page *pag EXPORT_SYMBOL(mark_page_accessed); -void __lru_cache_add(struct page *page, enum lru_list lru) +void ______pagevec_lru_add(struct pagevec *pvec, enum lru_list lru, int tail); + +void ____lru_cache_add(struct page *page, enum lru_list lru, int tail) { struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; page_cache_get(page); if (!pagevec_add(pvec, page)) - ____pagevec_lru_add(pvec, lru); + ______pagevec_lru_add(pvec, lru, tail); put_cpu_var(lru_add_pvecs); } +EXPORT_SYMBOL(____lru_cache_add); + +void __lru_cache_add(struct page *page, enum lru_list lru) +{ + ____lru_cache_add(page, lru, 0); +} EXPORT_SYMBOL(__lru_cache_add); /** @@ -364,7 +372,7 @@ EXPORT_SYMBOL(__lru_cache_add); * @page: the page to be added to the LRU. * @lru: the LRU list to which the page is added. 
*/ -void lru_cache_add_lru(struct page *page, enum lru_list lru) +void __lru_cache_add_lru(struct page *page, enum lru_list lru, int tail) { if (PageActive(page)) { VM_BUG_ON(PageUnevictable(page)); @@ -375,7 +383,12 @@ void lru_cache_add_lru(struct page *page } VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page)); - __lru_cache_add(page, lru); + ____lru_cache_add(page, lru, tail); +} + +void lru_cache_add_lru(struct page *page, enum lru_list lru) +{ + __lru_cache_add_lru(page, lru, 0); } /** @@ -662,7 +675,7 @@ void lru_add_page_tail(struct zone* zone head = page->lru.prev; else head = &zone->lru[lru].list; - __add_page_to_lru_list(zone, page_tail, lru, head); + __add_page_to_lru_list(zone, page_tail, lru, head, 0); } else { SetPageUnevictable(page_tail); add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE); @@ -691,13 +704,18 @@ static void ____pagevec_lru_add_fn(struc * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. */ -void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) +void ______pagevec_lru_add(struct pagevec *pvec, enum lru_list lru, int tail) { VM_BUG_ON(is_unevictable_lru(lru)); pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru); } +void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) +{ + ______pagevec_lru_add(pvec, lru, 0); +} + EXPORT_SYMBOL(____pagevec_lru_add); /* --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -289,7 +289,7 @@ checks: scan_base = offset = si->lowest_bit; /* reuse swap entry of cache-only swap if not busy. */ - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + if (si->swap_map[offset] == SWAP_HAS_CACHE) { int swap_was_freed; spin_unlock(&swap_lock); swap_was_freed = __try_to_reclaim_swap(si, offset); @@ -378,7 +378,7 @@ scan: spin_lock(&swap_lock); goto checks; } - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + if (si->swap_map[offset] == SWAP_HAS_CACHE) { spin_lock(&swap_lock); goto checks; } @@ -393,7 +393,7 @@ scan: spin_lock(&swap_lock); goto checks; } - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + if (si->swap_map[offset] == SWAP_HAS_CACHE) { spin_lock(&swap_lock); goto checks; } @@ -707,8 +707,7 @@ int free_swap_and_cache(swp_entry_t entr * Not mapped elsewhere, or swap space full? Free it! * Also recheck PageSwapCache now page is locked (above). */ - if (PageSwapCache(page) && !PageWriteback(page) && - (!page_mapped(page) || vm_swap_full())) { + if (PageSwapCache(page) && !PageWriteback(page)) { delete_from_swap_cache(page); SetPageDirty(page); } --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -148,7 +149,7 @@ struct scan_control { /* * From 0 .. 100. Higher means more swappy. */ -int vm_swappiness = 60; +int vm_swappiness; long vm_total_pages; /* The total number of pages which the VM controls */ static LIST_HEAD(shrinker_list); @@ -932,7 +933,7 @@ cull_mlocked: activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ - if (PageSwapCache(page) && vm_swap_full()) + if (PageSwapCache(page)) try_to_free_swap(page); VM_BUG_ON(PageActive(page)); SetPageActive(page); @@ -1986,6 +1987,35 @@ restart: } /* + * Helper functions to adjust nice level of kswapd, based on the priority of + * the task (p) that called it. If it is already higher priority we do not + * demote its nice level since it is still working on behalf of a higher + * priority task. With kernel threads we leave it at nice 0. 
+ * + * We don't ever run kswapd real time, so if a real time task calls kswapd we + * set it to highest SCHED_NORMAL priority. + */ +static inline int effective_sc_prio(struct task_struct *p) +{ + if (likely(p->mm)) { + if (rt_task(p)) + return -20; + if (p->policy == SCHED_IDLEPRIO) + return 19; + return task_nice(p); + } + return 0; +} + +static void set_kswapd_nice(struct task_struct *kswapd, int active) +{ + long nice = effective_sc_prio(current); + + if (task_nice(kswapd) > nice || !active) + set_user_nice(kswapd, nice); +} + +/* * This is the direct reclaim path, for page-allocating processes. We only * try to reclaim pages from zones which will satisfy the caller's allocation * request. @@ -2706,6 +2736,8 @@ static void kswapd_try_to_sleep(pg_data_ finish_wait(&pgdat->kswapd_wait, &wait); } +#define WT_EXPIRY (HZ * 5) /* Time to wakeup watermark_timer */ + /* * The background pageout daemon, started as a kernel thread * from the init process. @@ -2757,6 +2789,9 @@ static int kswapd(void *p) for ( ; ; ) { int ret; + /* kswapd has been busy so delay watermark_timer */ + mod_timer(&pgdat->watermark_timer, jiffies + WT_EXPIRY); + /* * If the last balance_pgdat was unsuccessful it's unlikely a * new request of a similar or harder type will succeed soon @@ -2806,6 +2841,7 @@ static int kswapd(void *p) void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) { pg_data_t *pgdat; + int active; if (!populated_zone(zone)) return; @@ -2817,7 +2853,9 @@ void wakeup_kswapd(struct zone *zone, in pgdat->kswapd_max_order = order; pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); } - if (!waitqueue_active(&pgdat->kswapd_wait)) + active = waitqueue_active(&pgdat->kswapd_wait); + set_kswapd_nice(pgdat->kswapd, active); + if (!active) return; if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) return; @@ -2930,20 +2968,57 @@ static int __devinit cpu_callback(struct } /* + * We wake up kswapd every WT_EXPIRY till free ram is above pages_lots + */ +static void watermark_wakeup(unsigned long data) +{ + pg_data_t *pgdat = (pg_data_t *)data; + struct timer_list *wt = &pgdat->watermark_timer; + int i; + + if (!waitqueue_active(&pgdat->kswapd_wait) || above_background_load()) + goto out; + for (i = pgdat->nr_zones - 1; i >= 0; i--) { + struct zone *z = pgdat->node_zones + i; + + if (!populated_zone(z) || is_highmem(z)) { + /* We are better off leaving highmem full */ + continue; + } + if (!zone_watermark_ok(z, 0, lots_wmark_pages(z), 0, 0)) { + wake_up_interruptible(&pgdat->kswapd_wait); + goto out; + } + } +out: + mod_timer(wt, jiffies + WT_EXPIRY); + return; +} + +/* * This kswapd start function will be called by init and node-hot-add. * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. */ int kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); + struct timer_list *wt; int ret = 0; if (pgdat->kswapd) return 0; + wt = &pgdat->watermark_timer; + init_timer(wt); + wt->data = (unsigned long)pgdat; + wt->function = watermark_wakeup; + wt->expires = jiffies + WT_EXPIRY; + add_timer(wt); + pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ + del_timer(wt); BUG_ON(system_state == SYSTEM_BOOTING); printk("Failed to start kswapd on node %d\n",nid); ret = -1;
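
A note on the BogoMIPS arithmetic touched above in arch/x86/kernel/cpu/proc.c, arch/x86/kernel/smpboot.c and init/calibrate.c: BogoMIPS is roughly lpj * HZ / 500000, printed as lpj/(500000/HZ) whole units plus (lpj/(5000/HZ)) % 100 hundredths. Once the new Kconfig.hz choices above 5000 Hz become selectable, the old divisor 5000/HZ truncates to zero, so those sites are rewritten as (lpj * 10 / (50000/HZ)) % 100, which stays valid up to HZ = 50000. The standalone sketch below is not part of the patch; the lpj values are invented for the demo and hz is a runtime parameter rather than the kernel's compile-time HZ.

#include <stdio.h>

/* Print the BogoMIPS value with both the old and the new fractional formula. */
static void show(unsigned long lpj, unsigned long hz)
{
	unsigned long whole = lpj / (500000 / hz);
	unsigned long new_frac = (lpj * 10 / (50000 / hz)) % 100;

	if (5000 / hz)	/* old formula divides by zero once hz > 5000 */
		printf("HZ=%-5lu old: %lu.%02lu  new: %lu.%02lu\n", hz,
		       whole, (lpj / (5000 / hz)) % 100, whole, new_frac);
	else
		printf("HZ=%-5lu old: divide-by-zero  new: %lu.%02lu\n",
		       hz, whole, new_frac);
}

int main(void)
{
	show(4999500UL, 100);	/* ~999.90 BogoMIPS at HZ=100 */
	show(499950UL, 1000);	/* same machine at HZ=1000 */
	show(49995UL, 10000);	/* old formula breaks here, new one still works */
	return 0;
}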
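
The mm/readahead.c hunk adds large_isize(), which makes read_pages() queue a readahead batch at the tail of the inactive file LRU once the file being read covers more than half of the currently unmapped RAM, so a single large streaming read cannot evict the existing working set. A minimal userspace sketch of that check follows; it is an illustration only, and total_pages / mapped_pages are hypothetical stand-ins for vm_total_pages and the NR_FILE_MAPPED + NR_ANON_PAGES counters the patch reads.

#include <stdbool.h>

extern unsigned long total_pages;	/* stand-in for vm_total_pages */
extern unsigned long mapped_pages;	/* stand-in for file-mapped + anon pages */

/* Return true when a file of file_pages pages is "large": bigger than half
 * of the unmapped RAM.  The cheap 1/6-of-RAM pre-check avoids touching the
 * page-state counters for small files, mirroring the comment in the patch. */
static bool file_is_large(unsigned long file_pages)
{
	if (file_pages * 6 > total_pages) {
		unsigned long unmapped = total_pages - mapped_pages;

		return file_pages * 2 > unmapped;
	}
	return false;
}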