--- a/Makefile
+++ b/Makefile
@@ -10,6 +10,10 @@ NAME = Sneaky Weasel
# Comments in this file are targeted only to the developer, do not
# expect to learn how to build the kernel reading this file.
+CKVERSION = -ck1
+CKNAME = BFS Powered
+EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION)
+
# Do not:
# o use make's built-in rules and variables
# (this increases performance and avoids hard-to-debug behaviour);
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -64,11 +64,6 @@ static struct timer_list spusched_timer;
static struct timer_list spuloadavg_timer;
/*
- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0).
- */
-#define NORMAL_PRIO 120
-
-/*
* Frequency of the spu scheduler tick. By default we do one SPU scheduler
* tick for every 10 CPU scheduler ticks.
*/
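The hard-coded copy is removed presumably because the include/linux/sched.h hunk later in this patch defines NORMAL_PRIO globally. Reading that hunk's macros together gives the following values (a derivation for orientation, not part of the patch):

    #define MAX_USER_RT_PRIO 100
    #define MAX_RT_PRIO      (MAX_USER_RT_PRIO + 1)   /* 101 */
    #define DEFAULT_PRIO     (MAX_RT_PRIO + 20)       /* 121 */
    #define NORMAL_PRIO      DEFAULT_PRIO             /* 121 */
    #define MAX_PRIO         (MAX_RT_PRIO + 40)       /* 141 */

so spufs now picks up 121 instead of the hard-coded 120, reflecting the extra priority level inserted above the RT range.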
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1072,7 +1072,7 @@ endchoice
choice
depends on EXPERIMENTAL
- prompt "Memory split" if EXPERT
+ prompt "Memory split"
default VMSPLIT_3G
depends on X86_32
---help---
@@ -1092,17 +1092,17 @@ choice
option alone!
config VMSPLIT_3G
- bool "3G/1G user/kernel split"
+ bool "Default 896MB lowmem (3G/1G user/kernel split)"
config VMSPLIT_3G_OPT
depends on !X86_PAE
- bool "3G/1G user/kernel split (for full 1G low memory)"
+ bool "1GB lowmem (3G/1G user/kernel split)"
config VMSPLIT_2G
- bool "2G/2G user/kernel split"
+ bool "2GB lowmem (2G/2G user/kernel split)"
config VMSPLIT_2G_OPT
depends on !X86_PAE
- bool "2G/2G user/kernel split (for full 2G low memory)"
+ bool "2GB lowmem (2G/2G user/kernel split)"
config VMSPLIT_1G
- bool "1G/3G user/kernel split"
+ bool "3GB lowmem (1G/3G user/kernel split)"
endchoice
config PAGE_OFFSET
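The renamed prompts spell out how much directly mapped lowmem each split leaves. As a rough reading (the ~128 MiB vmalloc reservation below is an assumption and is configuration dependent):

    /*
     * 32-bit x86 splits a 4 GiB virtual address space between user and kernel:
     *   3G/1G split : kernel window = 4 GiB - 3 GiB = 1 GiB
     *                 minus ~128 MiB reserved for vmalloc/fixmap
     *                 => ~896 MiB of direct-mapped lowmem (hence the default label)
     *   The *_OPT variants (non-PAE only, per the depends lines) lower the
     *   split point slightly so a full 1 GiB / 2 GiB of RAM fits in the
     *   direct mapping; the 2G/2G and 1G/3G cases scale the same way.
     */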
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -109,7 +109,7 @@ static int show_cpuinfo(struct seq_file
seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
c->loops_per_jiffy/(500000/HZ),
- (c->loops_per_jiffy/(5000/HZ)) % 100);
+ (c->loops_per_jiffy * 10 /(50000/HZ)) % 100);
#ifdef CONFIG_X86_64
if (c->x86_tlbsize > 0)
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -430,7 +430,7 @@ static void impress_friends(void)
"Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
num_online_cpus(),
bogosum/(500000/HZ),
- (bogosum/(5000/HZ))%100);
+ (bogosum * 10/(50000/HZ))%100);
pr_debug("Before bogocount - setting activated=1.\n");
}
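Both BogoMIPS printouts (and the init/calibrate.c one later in the patch) switch the fractional term from lpj/(5000/HZ) to lpj*10/(50000/HZ). The two compute the same hundredths, but the old form divides by zero once HZ exceeds 5000, which the new Kconfig.hz choices below make possible. Worked with concrete values (illustrative only):

    /* old: (lpj / (5000 / HZ)) % 100
     *   HZ = 1000  -> 5000/HZ = 5   -> lpj/5              (fine)
     *   HZ = 10000 -> 5000/HZ = 0   -> division by zero
     * new: (lpj * 10 / (50000 / HZ)) % 100
     *   HZ = 1000  -> 50000/HZ = 50 -> lpj*10/50 = lpj/5  (same result)
     *   HZ = 10000 -> 50000/HZ = 5  -> lpj*10/5           (well defined)
     */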
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -28,6 +28,7 @@
#include <linux/cpu.h>
#include <linux/completion.h>
#include <linux/mutex.h>
+#include <linux/sched.h>
#include <mach/perflock.h>
#include <linux/syscore_ops.h>
@@ -1449,6 +1450,12 @@ int __cpufreq_driver_target(struct cpufr
target_freq, relation);
if (cpu_online(policy->cpu) && cpufreq_driver->target)
retval = cpufreq_driver->target(policy, target_freq, relation);
+ if (likely(retval != -EINVAL)) {
+ if (target_freq == policy->max)
+ cpu_nonscaling(policy->cpu);
+ else
+ cpu_scaling(policy->cpu);
+ }
return retval;
}
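cpu_scaling() and cpu_nonscaling() are not cpufreq symbols; they are the empty inline stubs this patch adds to include/linux/sched.h (see that hunk below), which is also why <linux/sched.h> is now included here. The hook records whether the governor left the CPU at policy->max or somewhere below it; under the stock scheduler the calls compile away to nothing, and a scheduler that cares about frequency scaling can supply real bodies. The stubs, as added later in this patch:

    static inline void cpu_scaling(int cpu)
    {
    }

    static inline void cpu_nonscaling(int cpu)
    {
    }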
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -29,8 +29,8 @@
* It helps to keep variable names smaller, simpler
*/
-#define DEF_FREQUENCY_UP_THRESHOLD (80)
-#define DEF_FREQUENCY_DOWN_THRESHOLD (20)
+#define DEF_FREQUENCY_UP_THRESHOLD (63)
+#define DEF_FREQUENCY_DOWN_THRESHOLD (26)
/*
* The polling frequency of this governor depends on the capability of
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -31,8 +31,8 @@
* It helps to keep variable names smaller, simpler
*/
-#define DEF_FREQUENCY_DOWN_DIFFERENTIAL (10)
-#define DEF_FREQUENCY_UP_THRESHOLD (80)
+#define DEF_FREQUENCY_DOWN_DIFFERENTIAL (26)
+#define DEF_FREQUENCY_UP_THRESHOLD (63)
#define DEF_SAMPLING_DOWN_FACTOR (1)
#define MAX_SAMPLING_DOWN_FACTOR (100000)
#define MICRO_FREQUENCY_DOWN_DIFFERENTIAL (3)
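Taken together with the conservative hunk above, the new defaults put the trip points here (a worked reading of the defines, matching the comment hunk that follows):

    /* ondemand:
     *   raise frequency when load > UP_THRESHOLD                = 63%  (idle < 37%)
     *   consider lowering when load < UP_THRESHOLD
     *                             - DOWN_DIFFERENTIAL = 63 - 26 = 37%  (idle > 63%)
     * conservative:
     *   step up   when load > UP_THRESHOLD   = 63%
     *   step down when load < DOWN_THRESHOLD = 26%
     * i.e. both governors now ramp up noticeably earlier than the stock 80% trigger.
     */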
@@ -486,10 +486,10 @@ static void dbs_check_cpu(struct cpu_dbs
/*
* Every sampling_rate, we check, if current idle time is less
- * than 20% (default), then we try to increase frequency
+ * than 37% (default), then we try to increase frequency
* Every sampling_rate, we look for the lowest
* frequency which can sustain the load while keeping idle time over
- * 30%. If such a frequency exist, we try to decrease to this frequency.
+ * 63%. If such a frequency exist, we try to decrease to this frequency.
*
* Any frequency increase takes it to the maximum frequency.
* Frequency reduction happens at minimum steps of
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -418,7 +418,7 @@ static int proc_pid_stack(struct seq_fil
static int proc_pid_schedstat(struct task_struct *task, char *buffer)
{
return sprintf(buffer, "%llu %llu %lu\n",
- (unsigned long long)task->se.sum_exec_runtime,
+ (unsigned long long)tsk_seruntime(task),
(unsigned long long)task->sched_info.run_delay,
task->sched_info.pcount);
}
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -64,6 +64,8 @@ static inline int task_ioprio_class(stru
static inline int task_nice_ioprio(struct task_struct *task)
{
+ if (iso_task(task))
+ return 0;
return (task_nice(task) + 20) / 5;
}
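iso_task() is another of the sched.h stubs added later in this patch; with the stock scheduler it returns false, so the early return only fires when an ISO-capable scheduler backs it, and such a task then gets the best best-effort I/O priority. The existing nice-to-ioprio mapping underneath is unchanged; for reference (worked values, not patch content):

    /* task_nice_ioprio() = (task_nice(task) + 20) / 5
     *   nice -20 ->  0/5 = 0   (best best-effort level)
     *   nice   0 -> 20/5 = 4   (default)
     *   nice  19 -> 39/5 = 7   (worst)
     * An ISO task short-circuits straight to 0.
     */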
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -164,7 +164,7 @@ static inline u64 get_jiffies_64(void)
* Have the 32 bit jiffies value wrap 5 minutes after boot
* so jiffies wrap bugs show up earlier.
*/
-#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ))
+#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ))
/*
* Change timeval to jiffies, trying to avoid the
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -23,9 +23,12 @@ static inline int page_is_file_cache(str
static inline void
__add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l,
- struct list_head *head)
+ struct list_head *head, int tail)
{
- list_add(&page->lru, head);
+ if (tail)
+ list_add_tail(&page->lru, head);
+ else
+ list_add(&page->lru, head);
__mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page));
mem_cgroup_add_lru_list(page, l);
}
@@ -33,7 +36,13 @@ __add_page_to_lru_list(struct zone *zone
static inline void
add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l)
{
- __add_page_to_lru_list(zone, page, l, &zone->lru[l].list);
+ __add_page_to_lru_list(zone, page, l, &zone->lru[l].list, 0);
+}
+
+static inline void
+add_page_to_lru_list_tail(struct zone *zone, struct page *page, enum lru_list l)
+{
+ __add_page_to_lru_list(zone, page, l, &zone->lru[l].list, 1);
}
static inline void
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -15,6 +15,7 @@
#include <linux/seqlock.h>
#include <linux/nodemask.h>
#include <linux/pageblock-flags.h>
+#include <linux/timer.h>
#include <generated/bounds.h>
#include <asm/atomic.h>
#include <asm/page.h>
@@ -173,12 +174,14 @@ enum zone_watermarks {
WMARK_MIN,
WMARK_LOW,
WMARK_HIGH,
+ WMARK_LOTS,
NR_WMARK
};
#define min_wmark_pages(z) (z->watermark[WMARK_MIN])
#define low_wmark_pages(z) (z->watermark[WMARK_LOW])
#define high_wmark_pages(z) (z->watermark[WMARK_HIGH])
+#define lots_wmark_pages(z) (z->watermark[WMARK_LOTS])
struct per_cpu_pages {
int count; /* number of pages in the list */
@@ -350,7 +353,7 @@ struct zone {
ZONE_PADDING(_pad1_)
/* Fields commonly accessed by the page reclaim scanner */
- spinlock_t lru_lock;
+ spinlock_t lru_lock;
struct zone_lru {
struct list_head list;
} lru[NR_LRU_LISTS];
@@ -652,6 +655,7 @@ typedef struct pglist_data {
wait_queue_head_t kswapd_wait;
struct task_struct *kswapd;
int kswapd_max_order;
+ struct timer_list watermark_timer;
enum zone_type classzone_idx;
} pg_data_t;
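WMARK_LOTS adds a fourth watermark above min/low/high, and the new per-node watermark_timer is what consumes it: the mm/vmscan.c hunks near the end of this patch arm the timer in kswapd_run() and, every few seconds, nudge kswapd whenever a zone has fallen below lots_wmark_pages(). The check, as it appears in the watermark_wakeup() handler added there:

    if (!zone_watermark_ok(z, 0, lots_wmark_pages(z), 0, 0))
            wake_up_interruptible(&pgdat->kswapd_wait);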
--- a/include/linux/nfsd/stats.h
+++ b/include/linux/nfsd/stats.h
@@ -11,8 +11,8 @@
#include <linux/nfs4.h>
-/* thread usage wraps very million seconds (approx one fortnight) */
-#define NFSD_USAGE_WRAP (HZ*1000000)
+/* thread usage wraps every one hundred thousand seconds (approx one day) */
+#define NFSD_USAGE_WRAP (HZ*100000)
#ifdef __KERNEL__
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -458,6 +458,8 @@ int add_to_page_cache_locked(struct page
pgoff_t index, gfp_t gfp_mask);
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
+int __add_to_page_cache_lru(struct page *page, struct address_space *mapping,
+ pgoff_t offset, gfp_t gfp_mask, int tail);
extern void delete_from_page_cache(struct page *page);
extern void __delete_from_page_cache(struct page *page);
int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -39,6 +39,8 @@
#define SCHED_BATCH 3
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE 5
+#define SCHED_IDLEPRIO SCHED_IDLE
+
/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
#define SCHED_RESET_ON_FORK 0x40000000
@@ -268,8 +270,6 @@ extern asmlinkage void schedule_tail(str
extern void init_idle(struct task_struct *idle, int cpu);
extern void init_idle_bootup_task(struct task_struct *idle);
-extern int runqueue_is_locked(int cpu);
-
extern cpumask_var_t nohz_cpu_mask;
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
extern void select_nohz_load_balancer(int stop_tick);
@@ -1226,9 +1226,12 @@ struct task_struct {
#ifdef CONFIG_SMP
struct task_struct *wake_entry;
- int on_cpu;
#endif
- int on_rq;
+#if defined(CONFIG_SMP)
+ bool on_cpu;
+#endif
+#endif
+ bool on_rq;
int prio, static_prio, normal_prio;
unsigned int rt_priority;
@@ -1572,6 +1575,42 @@ struct task_struct {
#endif
};
+extern int runqueue_is_locked(int cpu);
+static inline void cpu_scaling(int cpu)
+{
+}
+
+static inline void cpu_nonscaling(int cpu)
+{
+}
+#define tsk_seruntime(t) ((t)->se.sum_exec_runtime)
+#define tsk_rttimeout(t) ((t)->rt.timeout)
+
+static inline void tsk_cpus_current(struct task_struct *p)
+{
+ p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
+}
+
+static inline void print_scheduler_version(void)
+{
+ printk(KERN_INFO"CFS CPU scheduler.\n");
+}
+
+static inline bool iso_task(struct task_struct *p)
+{
+ return false;
+}
+
+static inline void remove_cpu(int cpu)
+{
+}
+
+/* Anyone feel like implementing this? */
+static inline int above_background_load(void)
+{
+ return 1;
+}
+
/* Future-safe accessor for struct task_struct's cpus_allowed. */
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
@@ -1589,10 +1628,11 @@ struct task_struct {
*/
#define MAX_USER_RT_PRIO 100
-#define MAX_RT_PRIO MAX_USER_RT_PRIO
+#define MAX_RT_PRIO (MAX_USER_RT_PRIO + 1)
+#define DEFAULT_PRIO (MAX_RT_PRIO + 20)
#define MAX_PRIO (MAX_RT_PRIO + 40)
-#define DEFAULT_PRIO (MAX_RT_PRIO + 20)
+#define NORMAL_PRIO DEFAULT_PRIO
static inline int rt_prio(int prio)
{
@@ -1942,7 +1982,7 @@ extern unsigned long long
task_sched_runtime(struct task_struct *task);
/* sched_exec is called by processes performing an exec */
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP)
extern void sched_exec(void);
#else
#define sched_exec() {}
@@ -2571,7 +2611,7 @@ extern void signal_wake_up(struct task_s
*/
#ifdef CONFIG_SMP
-static inline unsigned int task_cpu(const struct task_struct *p)
+static inline int task_cpu(const struct task_struct *p)
{
return task_thread_info(p)->cpu;
}
@@ -2580,12 +2620,12 @@ extern void set_task_cpu(struct task_str
#else
-static inline unsigned int task_cpu(const struct task_struct *p)
+static inline int task_cpu(const struct task_struct *p)
{
return 0;
}
-static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
+static inline void set_task_cpu(struct task_struct *p, int cpu)
{
}
@@ -2699,5 +2739,3 @@ static inline unsigned long rlimit_max(u
}
#endif /* __KERNEL__ */
-
-#endif
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -201,7 +201,7 @@ struct swap_list_t {
int next; /* swapfile to be used next */
};
-/* Swap 50% full? Release swapcache more aggressively.. */
+/* Swap 50% full? */
#define vm_swap_full() (nr_swap_pages*2 < total_swap_pages)
/* linux/mm/page_alloc.c */
@@ -215,6 +215,7 @@ extern unsigned int nr_free_pagecache_pa
/* linux/mm/swap.c */
+extern void ____lru_cache_add(struct page *, enum lru_list lru, int tail);
extern void __lru_cache_add(struct page *, enum lru_list lru);
extern void lru_cache_add_lru(struct page *, enum lru_list lru);
extern void lru_add_page_tail(struct zone* zone,
@@ -238,9 +239,14 @@ static inline void lru_cache_add_anon(st
__lru_cache_add(page, LRU_INACTIVE_ANON);
}
+static inline void lru_cache_add_file_tail(struct page *page, int tail)
+{
+ ____lru_cache_add(page, LRU_INACTIVE_FILE, tail);
+}
+
static inline void lru_cache_add_file(struct page *page)
{
- __lru_cache_add(page, LRU_INACTIVE_FILE);
+ ____lru_cache_add(page, LRU_INACTIVE_FILE, 0);
}
/* linux/mm/vmscan.c */
@@ -345,9 +351,10 @@ extern void grab_swap_token(struct mm_st
extern void __put_swap_token(struct mm_struct *);
extern void disable_swap_token(struct mem_cgroup *memcg);
+/* Only allow swap token to have effect if swap is full */
static inline int has_swap_token(struct mm_struct *mm)
{
- return (mm == swap_token_mm);
+ return (mm == swap_token_mm && vm_swap_full());
}
static inline void put_swap_token(struct mm_struct *mm)
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -39,8 +39,8 @@ struct inet_hashinfo;
* If time > 4sec, it is "slow" path, no recycling is required,
* so that we select tick to get range about 4 seconds.
*/
-#if HZ <= 16 || HZ > 4096
-# error Unsupported: HZ <= 16 or HZ > 4096
+#if HZ <= 16 || HZ > 16384
+# error Unsupported: HZ <= 16 or HZ > 16384
#elif HZ <= 32
# define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#elif HZ <= 64
@@ -55,8 +55,12 @@ struct inet_hashinfo;
# define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#elif HZ <= 2048
# define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
-#else
+#elif HZ <= 4096
# define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
+#elif HZ <= 8192
+# define INET_TWDR_RECYCLE_TICK (13 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
+#else
+# define INET_TWDR_RECYCLE_TICK (14 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#endif
/* TIME_WAIT reaping mechanism. */
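The extra branches keep the recycle wheel spanning roughly four seconds at the new, larger HZ values. Assuming INET_TWDR_RECYCLE_SLOTS_LOG is 5 (32 slots), its value elsewhere in this header, the new top branch works out as follows (a worked example, not patch content):

    /* HZ <= 16384: INET_TWDR_RECYCLE_TICK = 14 + 2 - 5 = 11
     *   slot width = 2^11 jiffies
     *   wheel span = 32 slots * 2^11 jiffies = 2^16 jiffies
     *              = 65536 / 16384 Hz = 4 seconds
     * The HZ <= 8192 branch gives 32 * 2^10 = 2^15 jiffies = 4 s at 8192 Hz,
     * matching the "range about 4 seconds" comment above.
     */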
--- a/init/calibrate.c
+++ b/init/calibrate.c
@@ -269,7 +269,7 @@ void __cpuinit calibrate_delay(void)
if (!printed)
pr_cont("%lu.%02lu BogoMIPS (lpj=%lu)\n",
lpj/(500000/HZ),
- (lpj/(5000/HZ)) % 100, lpj);
+ (lpj * 10 /(50000 / HZ)) % 100, lpj);
loops_per_jiffy = lpj;
printed = true;
--- a/init/main.c
+++ b/init/main.c
@@ -748,6 +748,7 @@ static noinline int init_post(void)
system_state = SYSTEM_RUNNING;
numa_default_policy();
+ print_scheduler_version();
current->signal->flags |= SIGNAL_UNKILLABLE;
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -4,7 +4,7 @@
choice
prompt "Timer frequency"
- default HZ_250
+ default HZ_1000
help
Allows the configuration of the timer frequency. It is customary
to have the timer interrupt run at 1000 Hz but 100 Hz may be more
@@ -23,13 +23,14 @@ choice
with lots of processors that may show reduced performance if
too many timer interrupts are occurring.
- config HZ_250
+ config HZ_250_NODEFAULT
bool "250 HZ"
help
- 250 Hz is a good compromise choice allowing server performance
- while also showing good interactive responsiveness even
- on SMP and NUMA systems. If you are going to be using NTSC video
- or multimedia, selected 300Hz instead.
+ 250 HZ is a lousy compromise choice allowing server interactivity
+ while also showing desktop throughput and no extra power saving on
+ laptops. No good for anything.
+
+ Recommend 100 or 1000 instead.
config HZ_300
bool "300 HZ"
@@ -43,16 +44,82 @@ choice
bool "1000 HZ"
help
1000 Hz is the preferred choice for desktop systems and other
- systems requiring fast interactive responses to events.
+ systems requiring fast interactive responses to events. Laptops
+ can also benefit from this choice without sacrificing battery life
+ if dynticks is also enabled.
+
+ config HZ_1500
+ bool "1500 HZ"
+ help
+ 1500 Hz is an insane value to use to run broken software that is Hz
+ limited.
+
+ Being over 1000, driver breakage is likely.
+
+ config HZ_2000
+ bool "2000 HZ"
+ help
+ 2000 Hz is an insane value to use to run broken software that is Hz
+ limited.
+
+ Being over 1000, driver breakage is likely.
+
+ config HZ_3000
+ bool "3000 HZ"
+ help
+ 3000 Hz is an insane value to use to run broken software that is Hz
+ limited.
+
+ Being over 1000, driver breakage is likely.
+
+ config HZ_4000
+ bool "4000 HZ"
+ help
+ 4000 Hz is an insane value to use to run broken software that is Hz
+ limited.
+
+ Being over 1000, driver breakage is likely.
+
+ config HZ_5000
+ bool "5000 HZ"
+ help
+ 5000 Hz is an obscene value to use to run broken software that is Hz
+ limited.
+
+ Being over 1000, driver breakage is likely.
+
+ config HZ_7500
+ bool "7500 HZ"
+ help
+ 7500 Hz is an obscene value to use to run broken software that is Hz
+ limited.
+
+ Being over 1000, driver breakage is likely.
+
+ config HZ_10000
+ bool "10000 HZ"
+ help
+ 10000 Hz is an obscene value to use to run broken software that is Hz
+ limited.
+
+ Being over 1000, driver breakage is likely.
+
endchoice
config HZ
int
default 100 if HZ_100
- default 250 if HZ_250
+ default 250 if HZ_250_NODEFAULT
default 300 if HZ_300
default 1000 if HZ_1000
+ default 1500 if HZ_1500
+ default 2000 if HZ_2000
+ default 3000 if HZ_3000
+ default 4000 if HZ_4000
+ default 5000 if HZ_5000
+ default 7500 if HZ_7500
+ default 10000 if HZ_10000
config SCHED_HRTICK
def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS)
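For orientation, the timer tick period is simply 1/HZ, so the new choices trade interrupt overhead for granularity (illustrative values):

    /*   HZ =   100 -> 10 ms tick
     *   HZ =  1000 ->  1 ms tick (the new default)
     *   HZ = 10000 -> 100 us tick
     * HZ above 4096 is why the inet_timewait table gained branches earlier,
     * and HZ above 5000 is why the BogoMIPS fraction was reworked.
     */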
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,7 +1,7 @@
choice
prompt "Preemption Model"
- default PREEMPT_NONE
+ default PREEMPT
config PREEMPT_NONE
bool "No Forced Preemption (Server)"
@@ -17,7 +17,7 @@ config PREEMPT_NONE
latencies.
config PREEMPT_VOLUNTARY
- bool "Voluntary Kernel Preemption (Desktop)"
+ bool "Voluntary Kernel Preemption (Nothing)"
help
This option reduces the latency of the kernel by adding more
"explicit preemption points" to the kernel code. These new
@@ -31,7 +31,8 @@ config PREEMPT_VOLUNTARY
applications to run more 'smoothly' even when the system is
under load.
- Select this if you are building a kernel for a desktop system.
+ Select this for no system in particular (choose Preemptible
+ instead on a desktop if you know what's good for you).
config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)"
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -128,7 +128,7 @@ int __delayacct_add_tsk(struct taskstats
*/
t1 = tsk->sched_info.pcount;
t2 = tsk->sched_info.run_delay;
- t3 = tsk->se.sum_exec_runtime;
+ t3 = tsk_seruntime(tsk);
d->cpu_count += t1;
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -132,7 +132,7 @@ static void __exit_signal(struct task_st
sig->inblock += task_io_get_inblock(tsk);
sig->oublock += task_io_get_oublock(tsk);
task_io_accounting_add(&sig->ioac, &tsk->ioac);
- sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
+ sig->sum_sched_runtime += tsk_seruntime(tsk);
}
sig->nr_threads--;
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -250,7 +250,7 @@ void thread_group_cputime(struct task_st
do {
times->utime = cputime_add(times->utime, t->utime);
times->stime = cputime_add(times->stime, t->stime);
- times->sum_exec_runtime += task_sched_runtime(t);
+ times->sum_exec_runtime += tsk_seruntime(t);
} while_each_thread(tsk, t);
out:
rcu_read_unlock();
@@ -512,7 +512,7 @@ static void cleanup_timers(struct list_h
void posix_cpu_timers_exit(struct task_struct *tsk)
{
cleanup_timers(tsk->cpu_timers,
- tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
+ tsk->utime, tsk->stime, tsk_seruntime(tsk));
}
void posix_cpu_timers_exit_group(struct task_struct *tsk)
@@ -522,7 +522,7 @@ void posix_cpu_timers_exit_group(struct
cleanup_timers(tsk->signal->cpu_timers,
cputime_add(tsk->utime, sig->utime),
cputime_add(tsk->stime, sig->stime),
- tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
+ tsk_seruntime(tsk) + sig->sum_sched_runtime);
}
static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
@@ -953,7 +953,7 @@ static void check_thread_timers(struct t
struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
- if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
+ if (!--maxfire || tsk_seruntime(tsk) < t->expires.sched) {
tsk->cputime_expires.sched_exp = t->expires.sched;
break;
}
@@ -970,7 +970,7 @@ static void check_thread_timers(struct t
ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
if (hard != RLIM_INFINITY &&
- tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
+ tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
/*
* At the hard limit, we just die.
* No need to calculate anything else now.
@@ -978,7 +978,7 @@ static void check_thread_timers(struct t
__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
return;
}
- if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
+ if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
/*
* At the soft limit, send a SIGXCPU every second.
*/
@@ -1280,7 +1280,7 @@ static inline int fastpath_timer_check(s
struct task_cputime task_sample = {
.utime = tsk->utime,
.stime = tsk->stime,
- .sum_exec_runtime = tsk->se.sum_exec_runtime
+ .sum_exec_runtime = tsk_seruntime(tsk)
};
if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
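All of the se.sum_exec_runtime and rt.timeout accesses in delayacct.c, exit.c and posix-cpu-timers.c above now go through the tsk_seruntime()/tsk_rttimeout() wrappers. With the stock definitions added to include/linux/sched.h earlier in this patch they expand straight back to the original fields, so an alternative scheduler only needs to redefine two macros rather than touch the accounting code:

    #define tsk_seruntime(t)  ((t)->se.sum_exec_runtime)
    #define tsk_rttimeout(t)  ((t)->rt.timeout)

One behavioural nuance: in thread_group_cputime() this also replaces the task_sched_runtime() helper with a plain read of the accumulated runtime.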
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9449,4 +9449,3 @@ struct cgroup_subsys cpuacct_subsys = {
.subsys_id = cpuacct_subsys_id,
};
#endif /* CONFIG_CGROUP_CPUACCT */
-
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -121,7 +121,7 @@ static int __maybe_unused one = 1;
static int __maybe_unused two = 2;
static int __maybe_unused three = 3;
static unsigned long one_ul = 1;
-static int one_hundred = 100;
+static int __maybe_unused one_hundred = 100;
#ifdef CONFIG_PRINTK
static int ten_thousand = 10000;
#endif
@@ -258,7 +258,7 @@ static struct ctl_table root_table[] = {
{ }
};
-#ifdef CONFIG_SCHED_DEBUG
+#if defined(CONFIG_SCHED_DEBUG)
static int min_sched_granularity_ns = 100000; /* 100 usecs */
static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
static int min_wakeup_granularity_ns; /* 0 usecs */
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -498,8 +498,8 @@ out:
}
EXPORT_SYMBOL(add_to_page_cache_locked);
-int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
- pgoff_t offset, gfp_t gfp_mask)
+int __add_to_page_cache_lru(struct page *page, struct address_space *mapping,
+ pgoff_t offset, gfp_t gfp_mask, int tail)
{
int ret;
@@ -515,12 +515,18 @@ int add_to_page_cache_lru(struct page *p
ret = add_to_page_cache(page, mapping, offset, gfp_mask);
if (ret == 0) {
if (page_is_file_cache(page))
- lru_cache_add_file(page);
+ lru_cache_add_file_tail(page, tail);
else
lru_cache_add_anon(page);
}
return ret;
}
+
+int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
+ pgoff_t offset, gfp_t gfp_mask)
+{
+ return __add_to_page_cache_lru(page, mapping, offset, gfp_mask, 0);
+}
EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
#ifdef CONFIG_NUMA
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2995,7 +2995,7 @@ static int do_swap_page(struct mm_struct
mem_cgroup_commit_charge_swapin(page, ptr);
swap_free(entry);
- if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
+ if ((vma->vm_flags & VM_LOCKED) || PageMlocked(page))
try_to_free_swap(page);
unlock_page(page);
if (swapcache) {
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -78,7 +78,7 @@ int vm_highmem_is_dirtyable;
/*
* The generator of dirty data starts writeback at this percentage
*/
-int vm_dirty_ratio = 20;
+int vm_dirty_ratio = 1;
/*
* vm_dirty_bytes starts at 0 (disabled) so that it is a function of
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -17,6 +17,7 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
#include <linux/pagemap.h>
+#include <linux/swap.h>
/*
* Initialise a struct file's readahead state. Assumes that the caller has
@@ -107,7 +108,7 @@ int read_cache_pages(struct address_spac
EXPORT_SYMBOL(read_cache_pages);
static int read_pages(struct address_space *mapping, struct file *filp,
- struct list_head *pages, unsigned nr_pages)
+ struct list_head *pages, unsigned nr_pages, int tail)
{
struct blk_plug plug;
unsigned page_idx;
@@ -125,8 +126,8 @@ static int read_pages(struct address_spa
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
struct page *page = list_to_page(pages);
list_del(&page->lru);
- if (!add_to_page_cache_lru(page, mapping,
- page->index, GFP_KERNEL)) {
+ if (!__add_to_page_cache_lru(page, mapping,
+ page->index, GFP_KERNEL, tail)) {
mapping->a_ops->readpage(filp, page);
}
page_cache_release(page);
@@ -139,6 +140,28 @@ out:
return ret;
}
+static inline int nr_mapped(void)
+{
+ return global_page_state(NR_FILE_MAPPED) +
+ global_page_state(NR_ANON_PAGES);
+}
+
+/*
+ * This examines how large a file is in pages and returns 1 if it is larger
+ * than half the unmapped ram. The cheap size check up front avoids the
+ * expensive global_page_state() reads unless the file is already likely to
+ * be large enough to matter.
+ */
+static int large_isize(unsigned long nr_pages)
+{
+ if (nr_pages * 6 > vm_total_pages) {
+ unsigned long unmapped_ram = vm_total_pages - nr_mapped();
+
+ if (nr_pages * 2 > unmapped_ram)
+ return 1;
+ }
+ return 0;
+}
+
/*
* __do_page_cache_readahead() actually reads a chunk of disk. It allocates all
* the pages first, then submits them all for I/O. This avoids the very bad
@@ -196,7 +219,8 @@ __do_page_cache_readahead(struct address
* will then handle the error.
*/
if (ret)
- read_pages(mapping, filp, &page_pool, ret);
+ read_pages(mapping, filp, &page_pool, ret,
+ large_isize(end_index));
BUG_ON(!list_empty(&page_pool));
out:
return ret;
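A worked pass through large_isize() with made-up numbers (purely illustrative):

    /* Machine with vm_total_pages = 1,000,000 and 300,000 mapped pages
     * (NR_FILE_MAPPED + NR_ANON_PAGES), so unmapped_ram = 700,000.
     * Readahead within a 400,000-page file:
     *   400,000 * 6 = 2,400,000 > 1,000,000  -> worth reading the counters
     *   400,000 * 2 =   800,000 >   700,000  -> large_isize() returns 1
     * read_pages() then passes tail = 1, and the intent is that
     * __add_to_page_cache_lru() -> lru_cache_add_file_tail() ->
     * ____lru_cache_add(..., tail) queues these pages at the tail of the
     * inactive file LRU, where they are reclaimed first instead of pushing
     * out the existing working set.
     */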
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -371,15 +371,23 @@ void mark_page_accessed(struct page *pag
EXPORT_SYMBOL(mark_page_accessed);
-void __lru_cache_add(struct page *page, enum lru_list lru)
+void ______pagevec_lru_add(struct pagevec *pvec, enum lru_list lru, int tail);
+
+void ____lru_cache_add(struct page *page, enum lru_list lru, int tail)
{
struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];
page_cache_get(page);
if (!pagevec_add(pvec, page))
- ____pagevec_lru_add(pvec, lru);
+ ______pagevec_lru_add(pvec, lru, tail);
put_cpu_var(lru_add_pvecs);
}
+EXPORT_SYMBOL(____lru_cache_add);
+
+void __lru_cache_add(struct page *page, enum lru_list lru)
+{
+ ____lru_cache_add(page, lru, 0);
+}
EXPORT_SYMBOL(__lru_cache_add);
/**
@@ -387,7 +395,7 @@ EXPORT_SYMBOL(__lru_cache_add);
* @page: the page to be added to the LRU.
* @lru: the LRU list to which the page is added.
*/
-void lru_cache_add_lru(struct page *page, enum lru_list lru)
+void __lru_cache_add_lru(struct page *page, enum lru_list lru, int tail)
{
if (PageActive(page)) {
VM_BUG_ON(PageUnevictable(page));
@@ -398,7 +406,12 @@ void lru_cache_add_lru(struct page *page
}
VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page));
- __lru_cache_add(page, lru);
+ ____lru_cache_add(page, lru, tail);
+}
+
+void lru_cache_add_lru(struct page *page, enum lru_list lru)
+{
+ __lru_cache_add_lru(page, lru, 0);
}
/**
@@ -685,7 +698,7 @@ void lru_add_page_tail(struct zone* zone
head = page->lru.prev;
else
head = &zone->lru[lru].list;
- __add_page_to_lru_list(zone, page_tail, lru, head);
+ __add_page_to_lru_list(zone, page_tail, lru, head, 0);
} else {
SetPageUnevictable(page_tail);
add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE);
@@ -714,13 +727,18 @@ static void ____pagevec_lru_add_fn(struc
* Add the passed pages to the LRU, then drop the caller's refcount
* on them. Reinitialises the caller's pagevec.
*/
-void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
+void ______pagevec_lru_add(struct pagevec *pvec, enum lru_list lru, int tail)
{
VM_BUG_ON(is_unevictable_lru(lru));
pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru);
}
+void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
+{
+ ______pagevec_lru_add(pvec, lru, 0);
+}
+
EXPORT_SYMBOL(____pagevec_lru_add);
/*
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -289,7 +289,7 @@ checks:
scan_base = offset = si->lowest_bit;
/* reuse swap entry of cache-only swap if not busy. */
- if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+ if (si->swap_map[offset] == SWAP_HAS_CACHE) {
int swap_was_freed;
spin_unlock(&swap_lock);
swap_was_freed = __try_to_reclaim_swap(si, offset);
@@ -378,7 +378,7 @@ scan:
spin_lock(&swap_lock);
goto checks;
}
- if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+ if (si->swap_map[offset] == SWAP_HAS_CACHE) {
spin_lock(&swap_lock);
goto checks;
}
@@ -393,7 +393,7 @@ scan:
spin_lock(&swap_lock);
goto checks;
}
- if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+ if (si->swap_map[offset] == SWAP_HAS_CACHE) {
spin_lock(&swap_lock);
goto checks;
}
@@ -707,8 +707,7 @@ int free_swap_and_cache(swp_entry_t entr
* Not mapped elsewhere, or swap space full? Free it!
* Also recheck PageSwapCache now page is locked (above).
*/
- if (PageSwapCache(page) && !PageWriteback(page) &&
- (!page_mapped(page) || vm_swap_full())) {
+ if (PageSwapCache(page) && !PageWriteback(page)) {
delete_from_swap_cache(page);
SetPageDirty(page);
}
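The mm/memory.c and mm/swapfile.c hunks above, together with the activate_locked change in mm/vmscan.c just below, all drop their vm_swap_full() tests, so swap-cache and swap-slot handling no longer changes behaviour according to how full swap is. The macro itself stays (see the include/linux/swap.h hunk above) and now chiefly gates the swap token:

    /* Swap 50% full? */
    #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages)

    /* Only allow swap token to have effect if swap is full */
    static inline int has_swap_token(struct mm_struct *mm)
    {
            return (mm == swap_token_mm && vm_swap_full());
    }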
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -37,6 +37,7 @@
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
+#include <linux/timer.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
@@ -148,7 +149,7 @@ struct scan_control {
/*
* From 0 .. 100. Higher means more swappy.
*/
-int vm_swappiness = 60;
+int vm_swappiness;
long vm_total_pages; /* The total number of pages which the VM controls */
static LIST_HEAD(shrinker_list);
@@ -965,7 +966,7 @@ cull_mlocked:
activate_locked:
/* Not a candidate for swapping, so reclaim swap space. */
- if (PageSwapCache(page) && vm_swap_full())
+ if (PageSwapCache(page))
try_to_free_swap(page);
VM_BUG_ON(PageActive(page));
SetPageActive(page);
@@ -2044,6 +2045,35 @@ restart:
}
/*
+ * Helper functions to adjust nice level of kswapd, based on the priority of
+ * the task (p) that called it. If it is already higher priority we do not
+ * demote its nice level since it is still working on behalf of a higher
+ * priority task. With kernel threads we leave it at nice 0.
+ *
+ * We don't ever run kswapd real time, so if a real time task calls kswapd we
+ * set it to highest SCHED_NORMAL priority.
+ */
+static inline int effective_sc_prio(struct task_struct *p)
+{
+ if (likely(p->mm)) {
+ if (rt_task(p))
+ return -20;
+ if (p->policy == SCHED_IDLEPRIO)
+ return 19;
+ return task_nice(p);
+ }
+ return 0;
+}
+
+static void set_kswapd_nice(struct task_struct *kswapd, int active)
+{
+ long nice = effective_sc_prio(current);
+
+ if (task_nice(kswapd) > nice || !active)
+ set_user_nice(kswapd, nice);
+}
+
+/*
* This is the direct reclaim path, for page-allocating processes. We only
* try to reclaim pages from zones which will satisfy the caller's allocation
* request.
@@ -2792,6 +2822,8 @@ static void kswapd_try_to_sleep(pg_data_
finish_wait(&pgdat->kswapd_wait, &wait);
}
+#define WT_EXPIRY (HZ * 5) /* Time to wakeup watermark_timer */
+
/*
* The background pageout daemon, started as a kernel thread
* from the init process.
@@ -2847,6 +2879,9 @@ static int kswapd(void *p)
for ( ; ; ) {
int ret;
+ /* kswapd has been busy so delay watermark_timer */
+ mod_timer(&pgdat->watermark_timer, jiffies + WT_EXPIRY);
+
/*
* If the last balance_pgdat was unsuccessful it's unlikely a
* new request of a similar or harder type will succeed soon
@@ -2900,6 +2935,7 @@ static int kswapd(void *p)
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
{
pg_data_t *pgdat;
+ int active;
if (!populated_zone(zone))
return;
@@ -2911,7 +2947,9 @@ void wakeup_kswapd(struct zone *zone, in
pgdat->kswapd_max_order = order;
pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
}
- if (!waitqueue_active(&pgdat->kswapd_wait))
+ active = waitqueue_active(&pgdat->kswapd_wait);
+ set_kswapd_nice(pgdat->kswapd, active);
+ if (!active)
return;
if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
return;
@@ -3024,20 +3062,57 @@ static int __devinit cpu_callback(struct
}
/*
+ * We wake up kswapd every WT_EXPIRY until free ram is above the lots watermark
+ */
+static void watermark_wakeup(unsigned long data)
+{
+ pg_data_t *pgdat = (pg_data_t *)data;
+ struct timer_list *wt = &pgdat->watermark_timer;
+ int i;
+
+ if (!waitqueue_active(&pgdat->kswapd_wait) || above_background_load())
+ goto out;
+ for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+ struct zone *z = pgdat->node_zones + i;
+
+ if (!populated_zone(z) || is_highmem(z)) {
+ /* We are better off leaving highmem full */
+ continue;
+ }
+ if (!zone_watermark_ok(z, 0, lots_wmark_pages(z), 0, 0)) {
+ wake_up_interruptible(&pgdat->kswapd_wait);
+ goto out;
+ }
+ }
+out:
+ mod_timer(wt, jiffies + WT_EXPIRY);
+ return;
+}
+
+/*
* This kswapd start function will be called by init and node-hot-add.
* On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
*/
int kswapd_run(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
+ struct timer_list *wt;
int ret = 0;
if (pgdat->kswapd)
return 0;
+ wt = &pgdat->watermark_timer;
+ init_timer(wt);
+ wt->data = (unsigned long)pgdat;
+ wt->function = watermark_wakeup;
+ wt->expires = jiffies + WT_EXPIRY;
+ add_timer(wt);
+
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
+ del_timer(wt);
BUG_ON(system_state == SYSTEM_BOOTING);
printk("Failed to start kswapd on node %d\n",nid);
ret = -1;