diff -purN -X /home/mbligh/.diff.exclude 000-virgin/Documentation/filesystems/proc.txt 141-no_vma_sort/Documentation/filesystems/proc.txt
--- 000-virgin/Documentation/filesystems/proc.txt	2003-04-21 14:13:56.000000000 -0700
+++ 141-no_vma_sort/Documentation/filesystems/proc.txt	2003-06-09 10:16:47.000000000 -0700
@@ -37,6 +37,7 @@ Table of Contents
   2.8	/proc/sys/net/ipv4 - IPV4 settings
   2.9	Appletalk
   2.10	IPX
+  2.11  /proc/sys/sched - scheduler tunables
 
 ------------------------------------------------------------------------------
 Preface
@@ -1750,6 +1751,104 @@ The /proc/net/ipx_route  table  holds  a
 gives the  destination  network, the router node (or Directly) and the network
 address of the router (or Connected) for internal networks.
 
+2.11 /proc/sys/sched - scheduler tunables
+-----------------------------------------
+
+Useful knobs for tuning the scheduler live in /proc/sys/sched.
+
+child_penalty
+-------------
+
+Percentage of the parent's sleep_avg that children inherit.  sleep_avg is
+a running average of the time a process spends sleeping.  Tasks with high
+sleep_avg values are considered interactive and given a higher dynamic
+priority and a larger timeslice.  You typically want this to be some value
+just under 100.
+
+exit_weight
+-----------
+
+When a CPU hog task exits, its parent's sleep_avg is reduced by a factor of
+exit_weight against the exiting task's sleep_avg.
+
+interactive_delta
+-----------------
+
+If a task is "interactive" it is reinserted into the active array after it
+has expired its timeslice, instead of being inserted into the expired array.
+How "interactive" a task must be in order to be deemed interactive is a
+function of its nice value.  This interactive limit is scaled linearly by nice
+value and is offset by the interactive_delta.
+
+max_sleep_avg
+-------------
+
+max_sleep_avg is the largest value (in ms) stored for a task's running sleep
+average.  The larger this value, the longer a task needs to sleep to be
+considered interactive (maximum interactive bonus is a function of
+max_sleep_avg).
+
+max_timeslice
+-------------
+
+Maximum timeslice, in milliseconds.  This is the value given to tasks of the
+highest dynamic priority.
+
+min_timeslice
+-------------
+
+Minimum timeslice, in milliseconds.  This is the value given to tasks of the
+lowest dynamic priority.  Every task gets at least this slice of the processor
+per array switch.
+
+parent_penalty
+--------------
+
+Percentage of the parent's sleep_avg that it retains across a fork().
+sleep_avg is a running average of the time a process spends sleeping.  Tasks
+with high sleep_avg values are considered interactive and given a higher
+dynamic priority and a larger timeslice.  Normally, this value is 100 and thus
+tasks retain their sleep_avg on fork.  If you want to punish interactive
+tasks for forking, set this below 100.
+
+prio_bonus_ratio
+----------------
+
+Middle percentage of the priority range that tasks can receive as a dynamic
+priority.  The default value of 25% ensures that nice values at the
+extremes are still enforced.  For example, nice +19 interactive tasks will
+never be able to preempt a nice 0 CPU hog.  Setting this higher will increase
+the size of the priority range the tasks can receive as a bonus.  Setting
+this lower will decrease this range, making the interactivity bonus less
+apparent and user nice values more applicable.
+
+starvation_limit
+----------------
+
+Sufficiently interactive tasks are reinserted into the active array when they
+run out of timeslice.  Normally, tasks are inserted into the expired array.
+Reinserting interactive tasks into the active array allows them to remain
+runnable, which is important to interactive performance.  This could starve
+expired tasks, however, since the interactive task could prevent the array
+switch.  To prevent starving the tasks on the expired array for too long, the
+starvation_limit is the longest (in ms) we will let the expired array starve
+at the expense of reinserting interactive tasks back into active.  Higher
+values here give more preference to running interactive tasks, at the expense
+of expired tasks.  Lower values provide more fair scheduling behavior, at the
+expense of interactivity.  The units are in milliseconds.
+
+idle_node_rebalance_ratio
+-------------------------
+
+On NUMA machines, we normally rebalance within nodes, but we also rebalance
+globally every N idle rebalance ticks, where N = idle_node_rebalance_ratio.
+
+busy_node_rebalance_ratio
+-------------------------
+
+On NUMA machines, we normally rebalance within nodes, but we also rebalance
+globally every N busy rebalance ticks, where N = busy_node_rebalance_ratio.
+
 ------------------------------------------------------------------------------
 Summary
 ------------------------------------------------------------------------------
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/Kconfig 141-no_vma_sort/arch/i386/Kconfig
--- 000-virgin/arch/i386/Kconfig	2003-06-05 14:51:24.000000000 -0700
+++ 141-no_vma_sort/arch/i386/Kconfig	2003-06-09 10:15:30.000000000 -0700
@@ -666,6 +666,44 @@ config HIGHMEM64G
 
 endchoice
 
+choice
+	help
+	  On i386, a process can only virtually address 4GB of memory.  This
+	  lets you select how much of that virtual space you would like to
+	  devote to userspace, and how much to the kernel.
+
+	  Some userspace programs would like to address as much as possible and 
+	  have few demands of the kernel other than it get out of the way.  These
+	  users may opt to use the 3.5GB option to give their userspace program 
+	  as much room as possible.  Due to alignment issues imposed by PAE, 
+	  the "3.5GB" option is unavailable if "64GB" high memory support is 
+	  enabled.
+
+	  Other users (especially those who use PAE) may be running out of
+	  ZONE_NORMAL memory.  Those users may benefit from increasing the
+	  kernel's virtual address space size by taking it away from userspace, 
+	  which may not need all of its space.  An indicator that this is 
+	  happening is when /proc/meminfo's "LowFree:" is a small percentage of
+	  "LowTotal:" while "HighFree:" is very large.
+
+	  If unsure, say "3GB"
+	prompt "User address space size"
+        default 1GB
+	
+config	05GB
+	bool "3.5 GB"
+	depends on !HIGHMEM64G
+	
+config	1GB
+	bool "3 GB"
+	
+config	2GB
+	bool "2 GB"
+	
+config	3GB
+	bool "1 GB"
+endchoice
+
 config HIGHMEM
 	bool
 	depends on HIGHMEM64G || HIGHMEM4G
@@ -768,6 +806,25 @@ config MTRR
 
 	  See <file:Documentation/mtrr.txt> for more information.
 
+choice
+	help
+	  This is unrelated to your processor's speed.  This variable alters
+	  how often the system is asked to generate timer interrupts.  A larger
+	  value can lead to a more responsive system, but also causes extra 
+	  overhead from the increased number of context switches.
+	    
+	  If in doubt, leave it at the default of 1000. 
+
+	prompt "Kernel HZ"
+	default 1000HZ
+
+config	100HZ
+	bool "100 Hz"
+
+config	1000HZ
+	bool "1000 Hz"
+endchoice
+
 config HAVE_DEC_LOCK
 	bool
 	depends on (SMP || PREEMPT) && X86_CMPXCHG
@@ -1499,6 +1556,26 @@ config MAGIC_SYSRQ
 	  keys are documented in <file:Documentation/sysrq.txt>. Don't say Y
 	  unless you really know what this hack does.
 
+config EARLY_PRINTK
+	bool "Early console support"
+	default n
+	depends on DEBUG_KERNEL
+	help
+	  Write kernel log output directly into the VGA buffer or serial port. 
+	  This is useful for kernel debugging when your machine crashes very 
+	  early before the console code is initialized. For normal operation 
+	  it is not recommended because it looks ugly and doesn't cooperate 
+	  with klogd/syslogd or the X server. You should normally say N here,
+	  unless you want to debug such a crash.
+
+	  Syntax: earlyprintk=vga
+		  earlyprintk=serial[,ttySn[,baudrate]] 
+	  Append ,keep to not disable it when the real console takes over.
+	  Only vga or serial at a time, not both.
+	  Currently only ttyS0 and ttyS1 are supported. 
+	  Interaction with the standard serial driver is not very good. 
+	  The VGA output is eventually overwritten by the real console.
+
 config DEBUG_SPINLOCK
 	bool "Spinlock debugging"
 	depends on DEBUG_KERNEL
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/Makefile 141-no_vma_sort/arch/i386/Makefile
--- 000-virgin/arch/i386/Makefile	2003-06-05 14:51:24.000000000 -0700
+++ 141-no_vma_sort/arch/i386/Makefile	2003-06-09 10:15:30.000000000 -0700
@@ -94,6 +94,7 @@ drivers-$(CONFIG_OPROFILE)		+= arch/i386
 
 CFLAGS += $(mflags-y)
 AFLAGS += $(mflags-y)
+AFLAGS_vmlinux.lds.o += -imacros $(TOPDIR)/include/asm-i386/page.h
 
 boot := arch/i386/boot
 
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/kernel/irq.c 141-no_vma_sort/arch/i386/kernel/irq.c
--- 000-virgin/arch/i386/kernel/irq.c	2003-06-05 14:51:26.000000000 -0700
+++ 141-no_vma_sort/arch/i386/kernel/irq.c	2003-06-09 10:16:51.000000000 -0700
@@ -898,8 +898,9 @@ static int irq_affinity_write_proc (stru
 		return -EINVAL;
 
 	irq_affinity[irq] = new_value;
+#ifndef CONFIG_X86_SUMMIT
 	irq_desc[irq].handler->set_affinity(irq, new_value);
-
+#endif
 	return full_count;
 }
 
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/kernel/smpboot.c 141-no_vma_sort/arch/i386/kernel/smpboot.c
--- 000-virgin/arch/i386/kernel/smpboot.c	2003-06-05 14:51:27.000000000 -0700
+++ 141-no_vma_sort/arch/i386/kernel/smpboot.c	2003-06-09 10:15:18.000000000 -0700
@@ -62,7 +62,7 @@ int smp_num_siblings = 1;
 int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */
 
 /* Bitmask of currently online CPUs */
-unsigned long cpu_online_map;
+unsigned long cpu_online_map = 1;
 
 static volatile unsigned long cpu_callin_map;
 volatile unsigned long cpu_callout_map;
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/vmlinux.lds.S 141-no_vma_sort/arch/i386/vmlinux.lds.S
--- 000-virgin/arch/i386/vmlinux.lds.S	2003-06-05 14:51:28.000000000 -0700
+++ 141-no_vma_sort/arch/i386/vmlinux.lds.S	2003-06-09 10:15:30.000000000 -0700
@@ -10,7 +10,7 @@ ENTRY(startup_32)
 jiffies = jiffies_64;
 SECTIONS
 {
-  . = 0xC0000000 + 0x100000;
+  . = __PAGE_OFFSET + 0x100000;
   /* read-only */
   _text = .;			/* Text and read-only data */
   .text : {
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/exec.c 141-no_vma_sort/fs/exec.c
--- 000-virgin/fs/exec.c	2003-06-05 14:55:26.000000000 -0700
+++ 141-no_vma_sort/fs/exec.c	2003-06-09 10:16:53.000000000 -0700
@@ -316,6 +316,7 @@ void put_dirty_page(struct task_struct *
 	}
 	lru_cache_add_active(page);
 	flush_dcache_page(page);
+	SetPageAnon(page);
 	set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot))));
 	pte_chain = page_add_rmap(page, pte, pte_chain);
 	pte_unmap(pte);
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/proc/proc_misc.c 141-no_vma_sort/fs/proc/proc_misc.c
--- 000-virgin/fs/proc/proc_misc.c	2003-06-05 14:55:42.000000000 -0700
+++ 141-no_vma_sort/fs/proc/proc_misc.c	2003-06-09 10:15:46.000000000 -0700
@@ -303,6 +303,9 @@ static struct file_operations proc_vmsta
 	.release	= seq_release,
 };
 
+extern int schedstats_read_proc(char *page, char **start, off_t off,
+				 int count, int *eof, void *data);
+
 #ifdef CONFIG_PROC_HARDWARE
 static int hardware_read_proc(char *page, char **start, off_t off,
 				 int count, int *eof, void *data)
@@ -359,6 +362,71 @@ static struct file_operations proc_modul
 };
 #endif
 
+#ifdef CONFIG_NUMA
+#define K(x) ((x) << (PAGE_SHIFT - 10))	/* scale by PAGE_SIZE/1024; printed as kB */
+/* seq_file ->show: print the memory totals of the node whose id is stored in *v. */
+static int show_meminfo_numa (struct seq_file *m, void *v)
+{
+	int nid = *(loff_t *)v;	/* v is the loff_t *pos handed back by ->start/->next */
+	struct sysinfo i;
+	si_meminfo_node(&i, nid);
+	seq_printf(m, "\n"
+			"Node %d MemTotal:     %8lu kB\n"
+			"Node %d MemFree:      %8lu kB\n"
+			"Node %d MemUsed:      %8lu kB\n"
+			"Node %d HighTotal:    %8lu kB\n"
+			"Node %d HighFree:     %8lu kB\n"
+			"Node %d LowTotal:     %8lu kB\n"
+			"Node %d LowFree:      %8lu kB\n",
+			nid, K(i.totalram),
+			nid, K(i.freeram),
+			nid, K(i.totalram-i.freeram),
+			nid, K(i.totalhigh),
+			nid, K(i.freehigh),
+			nid, K(i.totalram-i.totalhigh),
+			nid, K(i.freeram-i.freehigh));
+
+	return 0;
+}
+#undef K 
+
+extern struct seq_operations meminfo_numa_op;	/* defined further down in this file */
+static int meminfo_numa_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file,&meminfo_numa_op);	/* hand the per-node iterator ops to seq_file */
+}
+
+static struct file_operations proc_meminfo_numa_operations = {	/* "meminfo.numa" entry */
+	.open		= meminfo_numa_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static void *meminfo_numa_start(struct seq_file *m, loff_t *pos)
+{
+	return (*pos >= numnodes) ? NULL : pos;	/* one record per node id below numnodes */
+}
+
+static void *meminfo_numa_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	*pos += 1;	/* step to the following node id */
+	return meminfo_numa_start(m, pos);	/* NULL once *pos reaches numnodes */
+}
+
+static void meminfo_numa_stop(struct seq_file *m, void *v)
+{	/* empty: meminfo_numa_start() acquires nothing that needs releasing */
+}
+
+struct seq_operations meminfo_numa_op = {
+	.start = meminfo_numa_start,	/* returns pos while *pos < numnodes */
+	.next  = meminfo_numa_next,	/* increments *pos, then re-checks it */
+	.stop  = meminfo_numa_stop,	/* empty */
+	.show  = show_meminfo_numa,	/* prints one node's totals */
+};
+
+#endif
+
 extern struct seq_operations slabinfo_op;
 extern ssize_t slabinfo_write(struct file *, const char *, size_t, loff_t *);
 static int slabinfo_open(struct inode *inode, struct file *file)
@@ -636,6 +704,7 @@ void __init proc_misc_init(void)
 #endif
 		{"locks",	locks_read_proc},
 		{"execdomains",	execdomains_read_proc},
+		{"schedstat",	schedstats_read_proc},
 		{NULL,}
 	};
 	for (p = simple_ones; p->name; p++)
@@ -659,6 +728,9 @@ void __init proc_misc_init(void)
 #ifdef CONFIG_MODULES
 	create_seq_entry("modules", 0, &proc_modules_operations);
 #endif
+#ifdef CONFIG_NUMA
+	create_seq_entry("meminfo.numa",0,&proc_meminfo_numa_operations);
+#endif
 	proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL);
 	if (proc_root_kcore) {
 		proc_root_kcore->proc_fops = &proc_kcore_operations;
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/early_printk.h 141-no_vma_sort/include/asm-i386/early_printk.h
--- 000-virgin/include/asm-i386/early_printk.h	1969-12-31 16:00:00.000000000 -0800
+++ 141-no_vma_sort/include/asm-i386/early_printk.h	2003-06-09 10:15:18.000000000 -0700
@@ -0,0 +1,8 @@
+#ifndef __EARLY_PRINTK_H_I386_
+#define __EARLY_PRINTK_H_I386_	/* spelling must match the #ifndef exactly */
+
+#define VGABASE  0xB8000	/* base address used by early_vga_write() */
+#define SERIAL_BASES { 0x3f8, 0x2f8 }	/* I/O port bases, indexed by ttyS number */
+#define SERIAL_BASES_LEN 2	/* number of entries in SERIAL_BASES */
+
+#endif /* __EARLY_PRINTK_H_I386_ */
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/page.h 141-no_vma_sort/include/asm-i386/page.h
--- 000-virgin/include/asm-i386/page.h	2003-04-09 11:48:05.000000000 -0700
+++ 141-no_vma_sort/include/asm-i386/page.h	2003-06-09 10:15:30.000000000 -0700
@@ -115,9 +115,26 @@ static __inline__ int get_order(unsigned
 #endif /* __ASSEMBLY__ */
 
 #ifdef __ASSEMBLY__
-#define __PAGE_OFFSET		(0xC0000000)
+#include <linux/config.h>
+#ifdef CONFIG_05GB
+#define __PAGE_OFFSET          (0xE0000000)
+#elif defined(CONFIG_1GB)
+#define __PAGE_OFFSET          (0xC0000000)
+#elif defined(CONFIG_2GB)
+#define __PAGE_OFFSET          (0x80000000)
+#elif defined(CONFIG_3GB)
+#define __PAGE_OFFSET          (0x40000000)
+#endif
 #else
-#define __PAGE_OFFSET		(0xC0000000UL)
+#ifdef CONFIG_05GB
+#define __PAGE_OFFSET          (0xE0000000UL)
+#elif defined(CONFIG_1GB)
+#define __PAGE_OFFSET          (0xC0000000UL)
+#elif defined(CONFIG_2GB)
+#define __PAGE_OFFSET          (0x80000000UL)
+#elif defined(CONFIG_3GB)
+#define __PAGE_OFFSET          (0x40000000UL)
+#endif
 #endif
 
 
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/param.h 141-no_vma_sort/include/asm-i386/param.h
--- 000-virgin/include/asm-i386/param.h	2002-12-09 18:45:45.000000000 -0800
+++ 141-no_vma_sort/include/asm-i386/param.h	2003-06-09 10:15:23.000000000 -0700
@@ -2,11 +2,19 @@
 #define _ASMi386_PARAM_H
 
 #ifdef __KERNEL__
-# define HZ		1000		/* Internal kernel timer frequency */
-# define USER_HZ	100		/* .. some user interfaces are in "ticks" */
-# define CLOCKS_PER_SEC	(USER_HZ)	/* like times() */
+#include <linux/config.h>
+
+#ifdef CONFIG_1000HZ
+# define HZ	1000		/* Internal kernel timer frequency */
+#else
+# define HZ	100
 #endif
 
+#define USER_HZ	100		/* .. some user interfaces are in "ticks" */
+#define CLOCKS_PER_SEC	(USER_HZ)	/* like times() */
+
+#endif	/* __KERNEL__ */
+
 #ifndef HZ
 #define HZ 100
 #endif
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/processor.h 141-no_vma_sort/include/asm-i386/processor.h
--- 000-virgin/include/asm-i386/processor.h	2003-06-05 14:56:10.000000000 -0700
+++ 141-no_vma_sort/include/asm-i386/processor.h	2003-06-09 10:15:30.000000000 -0700
@@ -288,7 +288,11 @@ extern unsigned int mca_pentium_flag;
 /* This decides where the kernel will search for a free chunk of vm
  * space during mmap's.
  */
+#ifdef CONFIG_05GB
+#define TASK_UNMAPPED_BASE	(PAGE_ALIGN(TASK_SIZE / 16))
+#else
 #define TASK_UNMAPPED_BASE	(PAGE_ALIGN(TASK_SIZE / 3))
+#endif
 
 /*
  * Size of io_bitmap in longwords: 32 is ports 0-0x3ff.
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-x86_64/early_printk.h 141-no_vma_sort/include/asm-x86_64/early_printk.h
--- 000-virgin/include/asm-x86_64/early_printk.h	1969-12-31 16:00:00.000000000 -0800
+++ 141-no_vma_sort/include/asm-x86_64/early_printk.h	2003-06-09 10:15:18.000000000 -0700
@@ -0,0 +1,8 @@
+#ifndef __EARLY_PRINTK_H_X86_64_	/* include guard */
+#define __EARLY_PRINTK_H_X86_64_
+
+#define VGABASE	0xffffffff800b8000UL	/* base address used by early_vga_write() */
+#define SERIAL_BASES { 0x3f8, 0x2f8 }	/* I/O port bases, indexed by ttyS number */
+#define SERIAL_BASES_LEN 2	/* number of entries in SERIAL_BASES */
+
+#endif /* __EARLY_PRINTK_H_X86_64_ */
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/early_printk.h 141-no_vma_sort/include/linux/early_printk.h
--- 000-virgin/include/linux/early_printk.h	1969-12-31 16:00:00.000000000 -0800
+++ 141-no_vma_sort/include/linux/early_printk.h	2003-06-09 10:15:18.000000000 -0700
@@ -0,0 +1,47 @@
+#ifndef __EARLY_PRINTK_H_
+#define __EARLY_PRINTK_H_
+
+#ifdef CONFIG_EARLY_PRINTK
+#include <linux/console.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <asm/io.h>
+#include <asm/early_printk.h>
+
+/* Simple VGA output */
+
+#define MAX_YPOS	25	/* text rows */
+#define MAX_XPOS	80	/* text columns */
+
+/* Simple serial port output */
+
+#define DEFAULT_BAUD	57600	/* used when no valid baud rate is given */
+#define XMTRDY		0x20	/* LSR bit polled before each transmit */
+
+#define DLAB		0x80	/* LCR bit set while DLL/DLH are written */
+/* Register offsets, added to the serial port base */
+#define TXR		0	/* Transmit register (WRITE) */
+#define RXR		0	/* Receive register  (READ)  */
+#define IER		1	/* Interrupt Enable */
+#define IIR		2	/* Interrupt ID */
+#define FCR		2	/* FIFO control */
+#define LCR		3	/* Line control */
+#define MCR		4	/* Modem control */
+#define LSR		5	/* Line Status */
+#define MSR		6	/* Modem Status */
+#define DLL		0	/* Divisor Latch Low */
+#define DLH		1	/* Divisor Latch High */
+
+
+void early_printk(const char *fmt, ...);
+int __init setup_early_printk(char *opt); 
+
+#else /* !CONFIG_EARLY_PRINTK */
+/* Empty stubs so that callers need no #ifdef of their own */
+#define early_printk(...) do {} while(0)
+#define setup_early_printk(X) do {} while(0)
+
+#endif /* CONFIG_EARLY_PRINTK */
+
+#endif /* __EARLY_PRINTK_H_ */
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/mm.h 141-no_vma_sort/include/linux/mm.h
--- 000-virgin/include/linux/mm.h	2003-06-05 14:56:33.000000000 -0700
+++ 141-no_vma_sort/include/linux/mm.h	2003-06-09 10:16:54.000000000 -0700
@@ -179,6 +179,7 @@ struct page {
 		struct pte_chain *chain;/* Reverse pte mapping pointer.
 					 * protected by PG_chainlock */
 		pte_addr_t direct;
+		int mapcount;
 	} pte;
 	unsigned long private;		/* mapping-private opaque data */
 
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/page-flags.h 141-no_vma_sort/include/linux/page-flags.h
--- 000-virgin/include/linux/page-flags.h	2003-04-21 14:14:50.000000000 -0700
+++ 141-no_vma_sort/include/linux/page-flags.h	2003-06-09 10:16:54.000000000 -0700
@@ -74,6 +74,7 @@
 #define PG_mappedtodisk		17	/* Has blocks allocated on-disk */
 #define PG_reclaim		18	/* To be reclaimed asap */
 #define PG_compound		19	/* Part of a compound page */
+#define PG_anon			20	/* Anonymous page */
 
 
 /*
@@ -257,6 +258,10 @@ extern void get_full_page_state(struct p
 #define SetPageCompound(page)	set_bit(PG_compound, &(page)->flags)
 #define ClearPageCompound(page)	clear_bit(PG_compound, &(page)->flags)
 
+#define PageAnon(page)		test_bit(PG_anon, &(page)->flags)
+#define SetPageAnon(page)	set_bit(PG_anon, &(page)->flags)
+#define ClearPageAnon(page)	clear_bit(PG_anon, &(page)->flags)
+
 /*
  * The PageSwapCache predicate doesn't use a PG_flag at this time,
  * but it may again do so one day.
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/swap.h 141-no_vma_sort/include/linux/swap.h
--- 000-virgin/include/linux/swap.h	2003-06-05 14:56:37.000000000 -0700
+++ 141-no_vma_sort/include/linux/swap.h	2003-06-09 10:16:54.000000000 -0700
@@ -186,6 +186,8 @@ struct pte_chain *FASTCALL(page_add_rmap
 void FASTCALL(page_remove_rmap(struct page *, pte_t *));
 int FASTCALL(try_to_unmap(struct page *));
 
+int page_convert_anon(struct page *);
+
 /* linux/mm/shmem.c */
 extern int shmem_unuse(swp_entry_t entry, struct page *page);
 #else
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/sysctl.h 141-no_vma_sort/include/linux/sysctl.h
--- 000-virgin/include/linux/sysctl.h	2003-06-05 14:56:37.000000000 -0700
+++ 141-no_vma_sort/include/linux/sysctl.h	2003-06-09 10:16:47.000000000 -0700
@@ -66,7 +66,8 @@ enum
 	CTL_DEV=7,		/* Devices */
 	CTL_BUS=8,		/* Busses */
 	CTL_ABI=9,		/* Binary emulation */
-	CTL_CPU=10		/* CPU stuff (speed scaling, etc) */
+	CTL_CPU=10,		/* CPU stuff (speed scaling, etc) */
+	CTL_SCHED=11,		/* scheduler tunables */
 };
 
 /* CTL_BUS names: */
@@ -158,6 +159,21 @@ enum
 	VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */
 };
 
+/* Tunable scheduler parameters in /proc/sys/sched/ */
+enum {
+	SCHED_MIN_TIMESLICE=1,		/* minimum process timeslice */
+	SCHED_MAX_TIMESLICE=2,		/* maximum process timeslice */
+	SCHED_CHILD_PENALTY=3,		/* penalty on fork to child */
+	SCHED_PARENT_PENALTY=4,		/* penalty on fork to parent */
+	SCHED_EXIT_WEIGHT=5,		/* penalty to parent of CPU hog child */
+	SCHED_PRIO_BONUS_RATIO=6,	/* percent of max prio given as bonus */
+	SCHED_INTERACTIVE_DELTA=7,	/* delta used to scale interactivity */
+	SCHED_MAX_SLEEP_AVG=8,		/* maximum sleep avg attainable */
+	SCHED_STARVATION_LIMIT=9,	/* no re-active if expired is starved */
+	SCHED_NODE_THRESHOLD=10,	/* NUMA node rebalance threshold */
+	SCHED_IDLE_NODE_REBALANCE_RATIO=11,  /* how often to global balance */
+	SCHED_BUSY_NODE_REBALANCE_RATIO=12,  /* how often to global balance */
+};
 
 /* CTL_NET names: */
 enum
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/timex.h 141-no_vma_sort/include/linux/timex.h
--- 000-virgin/include/linux/timex.h	2003-06-05 14:39:26.000000000 -0700
+++ 141-no_vma_sort/include/linux/timex.h	2003-06-09 10:15:23.000000000 -0700
@@ -75,7 +75,7 @@
 #elif HZ >= 768 && HZ < 1536
 # define SHIFT_HZ	10
 #else
-# error You lose.
+# error Please use a HZ value which is between 12 and 1536 
 #endif
 
 /*
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/init/main.c 141-no_vma_sort/init/main.c
--- 000-virgin/init/main.c	2003-06-05 14:56:42.000000000 -0700
+++ 141-no_vma_sort/init/main.c	2003-06-09 10:15:18.000000000 -0700
@@ -37,6 +37,7 @@
 #include <linux/rcupdate.h>
 #include <linux/moduleparam.h>
 #include <linux/writeback.h>
+#include <linux/early_printk.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -387,6 +388,7 @@ asmlinkage void __init start_kernel(void
  */
 	lock_kernel();
 	printk(linux_banner);
+	setup_early_printk(&command_line);
 	setup_arch(&command_line);
 	setup_per_cpu_areas();
 
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/Makefile 141-no_vma_sort/kernel/Makefile
--- 000-virgin/kernel/Makefile	2003-06-05 14:56:42.000000000 -0700
+++ 141-no_vma_sort/kernel/Makefile	2003-06-09 10:15:18.000000000 -0700
@@ -19,6 +19,7 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o
 obj-$(CONFIG_COMPAT) += compat.o
+obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
 
 ifneq ($(CONFIG_IA64),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/early_printk.c 141-no_vma_sort/kernel/early_printk.c
--- 000-virgin/kernel/early_printk.c	1969-12-31 16:00:00.000000000 -0800
+++ 141-no_vma_sort/kernel/early_printk.c	2003-06-09 10:15:18.000000000 -0700
@@ -0,0 +1,209 @@
+#include <linux/console.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/early_printk.h>
+#include <asm/io.h>
+
+/* Simple VGA output */
+
+#define MAX_YPOS	25
+#define MAX_XPOS	80
+
+static int current_ypos = 1, current_xpos = 0; 
+
+/* Console ->write: put up to n chars of str (or until NUL) into VGA text memory. */
+static void early_vga_write(struct console *con, const char *str, unsigned n)
+{
+	char c;
+	int  i, k, j;
+
+	while ((c = *str++) != '\0' && n-- > 0) {
+		if (current_ypos >= MAX_YPOS) {
+			/* scroll 1 line up: copy each row k onto row j = k - 1 */
+			for(k = 1, j = 0; k < MAX_YPOS; k++, j++) {
+				for(i = 0; i < MAX_XPOS; i++)
+					writew(readw(VGABASE + 2*(MAX_XPOS*k + i)),
+					       VGABASE + 2*(MAX_XPOS*j + i));
+			}
+			for(i = 0; i < MAX_XPOS; i++)	/* j is now the last row: fill with blanks */
+				writew(0x720, VGABASE + 2*(MAX_XPOS*j + i));
+			current_ypos = MAX_YPOS-1;
+		}
+		if (c == '\n') {
+			current_xpos = 0;
+			current_ypos++;
+		} else if (c != '\r')  {
+			/* unsigned char: a negative c must not sign-extend into the attribute byte */
+			writew(((0x7 << 8) | (unsigned char) c),
+			       VGABASE + 2*(MAX_XPOS*current_ypos + current_xpos++));
+			if (current_xpos >= MAX_XPOS) {
+				current_xpos = 0;
+				current_ypos++;
+			}
+		}
+	}
+}
+
+static struct console early_vga_console = {
+	.name =		"earlyvga",
+	.write =	early_vga_write,
+	.flags =	CON_PRINTBUFFER,
+	.index =	-1,
+};
+
+/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
+
+int early_serial_base;  /* ttyS0 */ 
+
+static int early_serial_putc(unsigned char ch)
+{
+	unsigned spins = 0xffff;	/* upper bound on status polls */
+	while (!(inb(early_serial_base + LSR) & XMTRDY) && --spins)
+		rep_nop();
+	outb(ch, early_serial_base + TXR);	/* written even if the wait ran out */
+	return spins == 0 ? -1 : 0;
+}
+
+/* Console ->write: send up to n chars of s (or until NUL), adding '\r' after '\n'. */
+static void early_serial_write(struct console *con, const char *s, unsigned n)
+{
+	for (; *s && n-- > 0; s++) {
+		early_serial_putc(*s);
+		if (*s == '\n')
+			early_serial_putc('\r');
+	}
+}
+
+static __init void early_serial_init(char *opt)
+{
+	unsigned char c; 
+	unsigned divisor, baud = DEFAULT_BAUD;
+	static int bases[] = SERIAL_BASES;
+	char *s, *e;
+	/* Set up the UART selected by opt = "[,]<port>[,<baud>]": 8n1, no interrupts. */
+	early_serial_base = bases[0];	/* start from the first base */
+	
+	if (*opt == ',') 
+		++opt;
+
+	s = strsep(&opt, ","); 	/* first field: the port */
+	if (s != NULL) { 
+		unsigned port; 
+		if (!strncmp(s,"0x",2))	/* explicit I/O port base in hex */
+			early_serial_base = simple_strtoul(s, &e, 16);
+		else {	
+			if (!strncmp(s,"ttyS",4)) 
+				s+=4; 
+			port = simple_strtoul(s, &e, 10); 
+			if (port > (SERIAL_BASES_LEN-1) || s == e) 
+				port = 0; 	/* out of range or not a number */
+			early_serial_base = bases[port];
+		}
+	}
+
+	outb(0x3, early_serial_base + LCR); /* 8n1 */
+	outb(0, early_serial_base + IER); /* no interrupt */ 
+	outb(0, early_serial_base + FCR); /* no fifo */ 
+	outb(0x3, early_serial_base + MCR); /* DTR + RTS */ 
+
+	s = strsep(&opt, ","); 	/* second field: the baud rate */
+	if (s != NULL) { 
+		baud = simple_strtoul(s, &e, 0); 
+		if (baud == 0 || s == e) 
+			baud = DEFAULT_BAUD;	/* not a usable number, e.g. "keep" */
+	} 
+	
+	divisor = 115200 / baud; 
+	c = inb(early_serial_base + LCR); 
+	outb(c | DLAB, early_serial_base + LCR); 	/* DLAB set around the DLL/DLH writes */
+	outb(divisor & 0xff, early_serial_base + DLL); 
+	outb((divisor >> 8) & 0xff, early_serial_base +	DLH);
+	outb(c & ~DLAB, early_serial_base + LCR);	/* restore LCR with DLAB cleared */
+}
+
+static struct console early_serial_console = {
+	.name =		"earlyser",
+	.write =	early_serial_write,
+	.flags =	CON_PRINTBUFFER,
+	.index =	-1,
+};
+
+struct console *early_console = &early_vga_console;	/* NULL: output disabled */
+static int early_console_initialized = 0;
+
+/* Direct interface for emergencies: printf-style output to early_console. */
+void early_printk(const char *fmt, ...)
+{
+	char buf[512];	/* longer messages are truncated by vsnprintf() */
+	va_list ap;
+	va_start(ap, fmt);
+	vsnprintf(buf, sizeof(buf), fmt, ap);
+	va_end(ap);
+	if (early_console)	/* NULL after a rejected earlyprintk= option */
+		early_console->write(early_console, buf, strlen(buf));
+}
+
+static int keep_early; 
+
+int __init setup_early_printk(char *opt) 
+{  
+	char *space, *s;
+	char buf[256];
+	/* Returns 0 once a console is registered; -1 if absent, repeated or unknown. */
+	s = strstr(opt, "earlyprintk=");
+	if (s == NULL)
+		return -1;
+	opt = s+12;	/* skip the 12 characters of "earlyprintk=" */
+	
+	if (early_console_initialized)
+		return -1;
+
+	strncpy(buf,opt,256); 	/* work on a private copy ... */
+	buf[255] = 0; 
+	space = strchr(buf, ' '); 
+	if (space)
+		*space = 0; 	/* ... cut at the first space */
+
+	if (strstr(buf,"keep"))	/* matched anywhere in the option value */
+		keep_early = 1; 
+
+	if (!strncmp(buf, "serial", 6)) { 
+		early_serial_init(buf + 6);
+		early_console = &early_serial_console;
+	} else if (!strncmp(buf, "ttyS", 4)) { 	/* bare "ttySn[,baud]" form */
+		early_serial_init(buf);
+		early_console = &early_serial_console;		
+	} else if (!strncmp(buf, "vga", 3)) {
+		early_console = &early_vga_console; 
+	} else {
+		early_console = NULL; 		/* unrecognised value: no early console */
+		return -1; 
+	}
+	early_console_initialized = 1;
+	register_console(early_console);
+	early_printk( "early printk console registered\n" );
+	return 0;
+}
+
+void __init disable_early_printk(void)
+{
+	if (!early_console_initialized || !early_console)
+		return;		/* nothing was ever registered */
+	if (keep_early) {	/* the option value contained "keep" */
+		printk("keeping early console.\n");
+		return;
+	}
+	printk("disabling early console...\n");
+	unregister_console(early_console);
+	early_console_initialized = 0;
+}
+
+/* syntax: earlyprintk=vga
+           earlyprintk=serial[,ttySn[,baudrate]] 
+   Append ,keep to not disable it when the real console takes over.
+   Only vga or serial at a time, not both.
+   Currently only ttyS0 and ttyS1 are supported. 
+   Interaction with the standard serial driver is not very good. 
+   The VGA output is eventually overwritten by the real console. */
+__setup("earlyprintk=", setup_early_printk);  
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/sched.c 141-no_vma_sort/kernel/sched.c
--- 000-virgin/kernel/sched.c	2003-06-05 14:56:43.000000000 -0700
+++ 141-no_vma_sort/kernel/sched.c	2003-06-09 10:16:47.000000000 -0700
@@ -64,16 +64,27 @@
  * maximum timeslice is 200 msecs. Timeslices get refilled after
  * they expire.
  */
-#define MIN_TIMESLICE		( 10 * HZ / 1000)
-#define MAX_TIMESLICE		(200 * HZ / 1000)
-#define CHILD_PENALTY		50
-#define PARENT_PENALTY		100
-#define EXIT_WEIGHT		3
-#define PRIO_BONUS_RATIO	25
-#define INTERACTIVE_DELTA	2
-#define MAX_SLEEP_AVG		(10*HZ)
-#define STARVATION_LIMIT	(10*HZ)
-#define NODE_THRESHOLD		125
+int min_timeslice = (10 * HZ) / 1000;
+int max_timeslice = (200 * HZ) / 1000;
+int child_penalty = 50;
+int parent_penalty = 100;
+int exit_weight = 3;
+int prio_bonus_ratio = 25;
+int interactive_delta = 2;
+int max_sleep_avg = 10 * HZ;
+int starvation_limit = 10 * HZ;
+int node_threshold = 125;
+
+#define MIN_TIMESLICE		(min_timeslice)
+#define MAX_TIMESLICE		(max_timeslice)
+#define CHILD_PENALTY		(child_penalty)
+#define PARENT_PENALTY		(parent_penalty)
+#define EXIT_WEIGHT		(exit_weight)
+#define PRIO_BONUS_RATIO	(prio_bonus_ratio)
+#define INTERACTIVE_DELTA	(interactive_delta)
+#define MAX_SLEEP_AVG		(max_sleep_avg)
+#define STARVATION_LIMIT	(starvation_limit)
+#define NODE_THRESHOLD		(node_threshold)
 
 /*
  * If a task is 'interactive' then we reinsert it in the active
@@ -230,6 +241,111 @@ __init void node_nr_running_init(void)
 
 #endif /* CONFIG_NUMA */
 
+
+struct schedstat {
+	/* sys_sched_yield stats */
+	unsigned long yld_exp_empty;
+	unsigned long yld_act_empty;
+	unsigned long yld_both_empty;
+	unsigned long yld_cnt;
+
+	/* schedule stats */
+	unsigned long sched_noswitch;
+	unsigned long sched_switch;
+	unsigned long sched_cnt;
+
+	/* load_balance stats */
+	unsigned long lb_imbalance;
+	unsigned long lb_idle;
+	unsigned long lb_busy;
+	unsigned long lb_resched;
+	unsigned long lb_cnt;
+	unsigned long lb_nobusy;
+	unsigned long lb_bnode;
+
+	/* pull_task stats */
+	unsigned long pt_gained;
+	unsigned long pt_lost;
+	unsigned long pt_node_gained;
+	unsigned long pt_node_lost;
+
+	/* balance_node stats */
+	unsigned long bn_cnt;
+	unsigned long bn_idle;
+} ____cacheline_aligned;
+
+/*
+ * bump this up when changing the output format or the meaning of an existing
+ * format, so that tools can adapt (or abort)
+ */
+#define SCHEDSTAT_VERSION	2
+
+struct schedstat schedstats[NR_CPUS];
+
+/*
+ * This could conceivably exceed a page's worth of output on machines with
+ * large number of cpus, where large == about 4096/100 or 40ish. Start
+ * worrying when we pass 32, probably. Then this has to stop being a
+ * "simple" entry in proc/proc_misc.c and needs to be an actual seq_file.
+ */
+int schedstats_read_proc(char *page, char **start, off_t off,
+				 int count, int *eof, void *data)
+{
+	struct schedstat sums;
+	int i, len;
+
+	memset(&sums, 0, sizeof(sums));
+	len = sprintf(page, "version %d\n", SCHEDSTAT_VERSION);
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_online(i)) continue;
+		sums.yld_exp_empty += schedstats[i].yld_exp_empty;
+		sums.yld_act_empty += schedstats[i].yld_act_empty;
+		sums.yld_both_empty += schedstats[i].yld_both_empty;
+		sums.yld_cnt += schedstats[i].yld_cnt;
+		sums.sched_noswitch += schedstats[i].sched_noswitch;
+		sums.sched_switch += schedstats[i].sched_switch;
+		sums.sched_cnt += schedstats[i].sched_cnt;
+		sums.lb_idle += schedstats[i].lb_idle;
+		sums.lb_busy += schedstats[i].lb_busy;
+		sums.lb_resched += schedstats[i].lb_resched;
+		sums.lb_cnt += schedstats[i].lb_cnt;
+		sums.lb_imbalance += schedstats[i].lb_imbalance;
+		sums.lb_nobusy += schedstats[i].lb_nobusy;
+		sums.lb_bnode += schedstats[i].lb_bnode;
+		sums.pt_node_gained += schedstats[i].pt_node_gained;
+		sums.pt_node_lost += schedstats[i].pt_node_lost;
+		sums.pt_gained += schedstats[i].pt_gained;
+		sums.pt_lost += schedstats[i].pt_lost;
+		sums.bn_cnt += schedstats[i].bn_cnt;
+		sums.bn_idle += schedstats[i].bn_idle;
+		len += sprintf(page + len,
+		    "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
+		    "%lu %lu %lu %lu %lu %lu %lu %lu\n",
+		    i, schedstats[i].yld_both_empty,
+		    schedstats[i].yld_act_empty, schedstats[i].yld_exp_empty,
+		    schedstats[i].yld_cnt, schedstats[i].sched_noswitch,
+		    schedstats[i].sched_switch, schedstats[i].sched_cnt,
+		    schedstats[i].lb_idle, schedstats[i].lb_busy,
+		    schedstats[i].lb_resched,
+		    schedstats[i].lb_cnt, schedstats[i].lb_imbalance,
+		    schedstats[i].lb_nobusy, schedstats[i].lb_bnode,
+		    schedstats[i].pt_gained, schedstats[i].pt_lost,
+		    schedstats[i].pt_node_gained, schedstats[i].pt_node_lost,
+		    schedstats[i].bn_cnt, schedstats[i].bn_idle);
+	}
+	len += sprintf(page + len,
+	    "totals %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
+	    "%lu %lu %lu %lu %lu %lu %lu\n",
+	    sums.yld_both_empty, sums.yld_act_empty, sums.yld_exp_empty,
+	    sums.yld_cnt, sums.sched_noswitch, sums.sched_switch,
+	    sums.sched_cnt, sums.lb_idle, sums.lb_busy, sums.lb_resched,
+	    sums.lb_cnt, sums.lb_imbalance, sums.lb_nobusy, sums.lb_bnode,
+	    sums.pt_gained, sums.pt_lost, sums.pt_node_gained,
+	    sums.pt_node_lost, sums.bn_cnt, sums.bn_idle);
+
+	return len;
+}
+
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
  * interrupts.  Note the ordering: we can safely lookup the task_rq without
@@ -656,7 +772,6 @@ static inline task_t * context_switch(ru
 
 	return prev;
 }
-
 /*
  * nr_running, nr_uninterruptible and nr_context_switches:
  *
@@ -840,6 +955,9 @@ static int find_busiest_node(int this_no
 
 #endif /* CONFIG_NUMA */
 
+int idle_node_rebalance_ratio = 10;
+int busy_node_rebalance_ratio = 2;
+
 #ifdef CONFIG_SMP
 
 /*
@@ -951,6 +1069,12 @@ out:
  */
 static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu)
 {
+	if (cpu_to_node(this_cpu) != cpu_to_node(src_rq - runqueues)) {
+		schedstats[this_cpu].pt_node_gained++;
+		schedstats[src_rq - runqueues].pt_node_lost++;
+	}
+	schedstats[this_cpu].pt_gained++;
+	schedstats[src_rq - runqueues].pt_lost++;
 	dequeue_task(p, src_array);
 	nr_running_dec(src_rq);
 	set_task_cpu(p, this_cpu);
@@ -985,10 +1109,14 @@ static void load_balance(runqueue_t *thi
 	struct list_head *head, *curr;
 	task_t *tmp;
 
+	schedstats[this_cpu].lb_cnt++;
 	busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask);
-	if (!busiest)
+	if (!busiest) {
+		schedstats[this_cpu].lb_nobusy++;
 		goto out;
+	}
 
+	schedstats[this_cpu].lb_imbalance += imbalance;
 	/*
 	 * We first consider expired tasks. Those will likely not be
 	 * executed in the near future, and they are most likely to
@@ -1067,8 +1195,8 @@ out:
  */
 #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
 #define BUSY_REBALANCE_TICK (HZ/5 ?: 1)
-#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5)
-#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2)
+#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * idle_node_rebalance_ratio)
+#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * busy_node_rebalance_ratio)
 
 #ifdef CONFIG_NUMA
 static void balance_node(runqueue_t *this_rq, int idle, int this_cpu)
@@ -1076,9 +1204,13 @@ static void balance_node(runqueue_t *thi
 	int node = find_busiest_node(cpu_to_node(this_cpu));
 	unsigned long cpumask, this_cpumask = 1UL << this_cpu;
 
+	schedstats[this_cpu].bn_cnt++;
+	if (idle)
+	    schedstats[this_cpu].bn_idle++;
 	if (node >= 0) {
 		cpumask = node_to_cpumask(node) | this_cpumask;
 		spin_lock(&this_rq->lock);
+		schedstats[this_cpu].lb_bnode++;
 		load_balance(this_rq, idle, cpumask);
 		spin_unlock(&this_rq->lock);
 	}
@@ -1087,9 +1219,7 @@ static void balance_node(runqueue_t *thi
 
 static void rebalance_tick(runqueue_t *this_rq, int idle)
 {
-#ifdef CONFIG_NUMA
 	int this_cpu = smp_processor_id();
-#endif
 	unsigned long j = jiffies;
 
 	/*
@@ -1107,6 +1237,7 @@ static void rebalance_tick(runqueue_t *t
 #endif
 		if (!(j % IDLE_REBALANCE_TICK)) {
 			spin_lock(&this_rq->lock);
+			schedstats[this_cpu].lb_idle++;
 			load_balance(this_rq, 0, cpu_to_node_mask(this_cpu));
 			spin_unlock(&this_rq->lock);
 		}
@@ -1118,6 +1249,7 @@ static void rebalance_tick(runqueue_t *t
 #endif
 	if (!(j % BUSY_REBALANCE_TICK)) {
 		spin_lock(&this_rq->lock);
+		schedstats[this_cpu].lb_busy++;
 		load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
 		spin_unlock(&this_rq->lock);
 	}
@@ -1243,13 +1375,14 @@ asmlinkage void schedule(void)
 	runqueue_t *rq;
 	prio_array_t *array;
 	struct list_head *queue;
-	int idx;
+	int idx, mycpu = smp_processor_id();
 
 	/*
 	 * Test if we are atomic.  Since do_exit() needs to call into
 	 * schedule() atomically, we ignore that path for now.
 	 * Otherwise, whine if we are scheduling when we should not be.
 	 */
+	schedstats[mycpu].sched_cnt++;
 	if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) {
 		if (unlikely(in_atomic())) {
 			printk(KERN_ERR "bad: scheduling while atomic!\n");
@@ -1288,6 +1421,7 @@ need_resched:
 pick_next_task:
 	if (unlikely(!rq->nr_running)) {
 #ifdef CONFIG_SMP
+		schedstats[mycpu].lb_resched++;
 		load_balance(rq, 1, cpu_to_node_mask(smp_processor_id()));
 		if (rq->nr_running)
 			goto pick_next_task;
@@ -1302,11 +1436,13 @@ pick_next_task:
 		/*
 		 * Switch the active and expired arrays.
 		 */
+		schedstats[mycpu].sched_switch++;
 		rq->active = rq->expired;
 		rq->expired = array;
 		array = rq->active;
 		rq->expired_timestamp = 0;
 	}
+	schedstats[mycpu].sched_noswitch++;
 
 	idx = sched_find_first_bit(array->bitmap);
 	queue = array->queue + idx;
@@ -1958,6 +2094,7 @@ asmlinkage long sys_sched_yield(void)
 {
 	runqueue_t *rq = this_rq_lock();
 	prio_array_t *array = current->array;
+	int mycpu = smp_processor_id();
 
 	/*
 	 * We implement yielding by moving the task into the expired
@@ -1966,7 +2103,15 @@ asmlinkage long sys_sched_yield(void)
 	 * (special rule: RT tasks will just roundrobin in the active
 	 *  array.)
 	 */
+	schedstats[mycpu].yld_cnt++;
 	if (likely(!rt_task(current))) {
+		if (current->array->nr_active == 1) {
+		    schedstats[mycpu].yld_act_empty++;
+		    if (!rq->expired->nr_active)
+			schedstats[mycpu].yld_both_empty++;
+		} else if (!rq->expired->nr_active) {
+			schedstats[mycpu].yld_exp_empty++;
+		}
 		dequeue_task(current, array);
 		enqueue_task(current, rq->expired);
 	} else {
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/sysctl.c 141-no_vma_sort/kernel/sysctl.c
--- 000-virgin/kernel/sysctl.c	2003-06-05 14:56:44.000000000 -0700
+++ 141-no_vma_sort/kernel/sysctl.c	2003-06-09 10:16:48.000000000 -0700
@@ -57,6 +57,18 @@ extern char core_pattern[];
 extern int cad_pid;
 extern int pid_max;
 extern int sysctl_lower_zone_protection;
+extern int min_timeslice;
+extern int max_timeslice;
+extern int child_penalty;
+extern int parent_penalty;
+extern int exit_weight;
+extern int prio_bonus_ratio;
+extern int interactive_delta;
+extern int max_sleep_avg;
+extern int starvation_limit;
+extern int node_threshold;
+extern int idle_node_rebalance_ratio;
+extern int busy_node_rebalance_ratio;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -114,6 +126,7 @@ static struct ctl_table_header root_tabl
 
 static ctl_table kern_table[];
 static ctl_table vm_table[];
+static ctl_table sched_table[];
 #ifdef CONFIG_NET
 extern ctl_table net_table[];
 #endif
@@ -158,6 +171,7 @@ static ctl_table root_table[] = {
 	{CTL_FS, "fs", NULL, 0, 0555, fs_table},
 	{CTL_DEBUG, "debug", NULL, 0, 0555, debug_table},
         {CTL_DEV, "dev", NULL, 0, 0555, dev_table},
+	{CTL_SCHED, "sched", NULL, 0, 0555, sched_table},
 	{0}
 };
 
@@ -362,7 +376,49 @@ static ctl_table debug_table[] = {
 
 static ctl_table dev_table[] = {
 	{0}
-};  
+};
+
+static ctl_table sched_table[] = {	/* /proc/sys/sched tunables; range-checked by proc_dointvec_minmax */
+	{SCHED_MAX_TIMESLICE, "max_timeslice", &max_timeslice,
+	 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
+	 &sysctl_intvec, NULL, &one, NULL},
+	{SCHED_MIN_TIMESLICE, "min_timeslice", &min_timeslice,
+	 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
+	 &sysctl_intvec, NULL, &one, NULL},
+	{SCHED_CHILD_PENALTY, "child_penalty", &child_penalty,
+	 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
+	 &sysctl_intvec, NULL, &zero, NULL},
+	{SCHED_PARENT_PENALTY, "parent_penalty", &parent_penalty,
+	 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
+	 &sysctl_intvec, NULL, &zero, NULL},
+	{SCHED_EXIT_WEIGHT, "exit_weight", &exit_weight,
+	 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
+	 &sysctl_intvec, NULL, &zero, NULL},
+	{SCHED_PRIO_BONUS_RATIO, "prio_bonus_ratio", &prio_bonus_ratio,
+	 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
+	 &sysctl_intvec, NULL, &zero, NULL},
+	{SCHED_INTERACTIVE_DELTA, "interactive_delta", &interactive_delta,
+	 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
+	 &sysctl_intvec, NULL, &zero, NULL},
+	{SCHED_MAX_SLEEP_AVG, "max_sleep_avg", &max_sleep_avg,
+	 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
+	 &sysctl_intvec, NULL, &one, NULL},
+	{SCHED_STARVATION_LIMIT, "starvation_limit", &starvation_limit,
+	 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
+	 &sysctl_intvec, NULL, &zero, NULL},
+	{SCHED_NODE_THRESHOLD, "node_threshold", &node_threshold,
+	 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
+	 &sysctl_intvec, NULL, &one, NULL},
+	{SCHED_IDLE_NODE_REBALANCE_RATIO, "idle_node_rebalance_ratio",
+						&idle_node_rebalance_ratio,
+	 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
+	 &sysctl_intvec, NULL, &one, NULL},	/* min 1: 0 would make IDLE_NODE_REBALANCE_TICK zero */
+	{SCHED_BUSY_NODE_REBALANCE_RATIO, "busy_node_rebalance_ratio",
+						&busy_node_rebalance_ratio,
+	 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
+	 &sysctl_intvec, NULL, &one, NULL},	/* min 1: 0 would make BUSY_NODE_REBALANCE_TICK zero */
+	{0}
+};
 
 extern void init_irq_proc (void);
 
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/filemap.c 141-no_vma_sort/mm/filemap.c
--- 000-virgin/mm/filemap.c	2003-06-05 14:56:44.000000000 -0700
+++ 141-no_vma_sort/mm/filemap.c	2003-06-09 10:16:54.000000000 -0700
@@ -63,6 +63,9 @@
  *  ->mmap_sem
  *    ->i_shared_sem		(various places)
  *
+ *  ->lock_page
+ *    ->i_shared_sem		(page_convert_anon)
+ *
  *  ->inode_lock
  *    ->sb_lock			(fs/fs-writeback.c)
  *    ->mapping->page_lock	(__sync_single_inode)
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/fremap.c 141-no_vma_sort/mm/fremap.c
--- 000-virgin/mm/fremap.c	2003-06-05 14:56:44.000000000 -0700
+++ 141-no_vma_sort/mm/fremap.c	2003-06-09 10:16:54.000000000 -0700
@@ -60,10 +60,26 @@ int install_page(struct mm_struct *mm, s
 	pgd_t *pgd;
 	pmd_t *pmd;
 	struct pte_chain *pte_chain;
+	unsigned long pgidx;
 
 	pte_chain = pte_chain_alloc(GFP_KERNEL);
 	if (!pte_chain)
 		goto err;
+
+	/*
+	 * Convert this page to anon for objrmap if it's nonlinear
+	 */
+	pgidx = (addr - vma->vm_start) >> PAGE_SHIFT;
+	pgidx += vma->vm_pgoff;
+	pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
+	if (!PageAnon(page) && (page->index != pgidx)) {
+		lock_page(page);
+		err = page_convert_anon(page);
+		unlock_page(page);
+		if (err < 0)
+			goto err_free;
+	}
+
 	pgd = pgd_offset(mm, addr);
 	spin_lock(&mm->page_table_lock);
 
@@ -85,12 +101,11 @@ int install_page(struct mm_struct *mm, s
 	if (flush)
 		flush_tlb_page(vma, addr);
 	update_mmu_cache(vma, addr, *pte);
-	spin_unlock(&mm->page_table_lock);
-	pte_chain_free(pte_chain);
-	return 0;
 
+	err = 0;
 err_unlock:
 	spin_unlock(&mm->page_table_lock);
+err_free:
 	pte_chain_free(pte_chain);
 err:
 	return err;
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/memory.c 141-no_vma_sort/mm/memory.c
--- 000-virgin/mm/memory.c	2003-06-05 14:56:44.000000000 -0700
+++ 141-no_vma_sort/mm/memory.c	2003-06-09 10:16:54.000000000 -0700
@@ -102,8 +102,7 @@ static inline void free_one_pmd(struct m
 
 static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir)
 {
-	int j;
-	pmd_t * pmd;
+	pmd_t * pmd, * md, * emd;
 
 	if (pgd_none(*dir))
 		return;
@@ -114,8 +113,21 @@ static inline void free_one_pgd(struct m
 	}
 	pmd = pmd_offset(dir, 0);
 	pgd_clear(dir);
-	for (j = 0; j < PTRS_PER_PMD ; j++)
-		free_one_pmd(tlb, pmd+j);
+	/*
+	 * Beware if changing the loop below.  It once used int j,
+	 * 	for (j = 0; j < PTRS_PER_PMD; j++)
+	 * 		free_one_pmd(pmd+j);
+	 * but some older i386 compilers (e.g. egcs-2.91.66, gcc-2.95.3)
+	 * terminated the loop with a _signed_ address comparison
+	 * using "jle", when configured for HIGHMEM64GB (X86_PAE).
+	 * If also configured for 3GB of kernel virtual address space,
+	 * if page at physical 0x3ffff000 virtual 0x7ffff000 is used as
+	 * a pmd, when that mm exits the loop goes on to free "entries"
+	 * found at 0x80000000 onwards.  The loop below compiles instead
+	 * to be terminated by unsigned address comparison using "jb".
+	 */
+	for (md = pmd, emd = pmd + PTRS_PER_PMD; md < emd; md++)
+		free_one_pmd(tlb,md);
 	pmd_free_tlb(tlb, pmd);
 }
 
@@ -1038,6 +1050,7 @@ static int do_wp_page(struct mm_struct *
 			++mm->rss;
 		page_remove_rmap(old_page, page_table);
 		break_cow(vma, new_page, address, page_table);
+		SetPageAnon(new_page);
 		pte_chain = page_add_rmap(new_page, page_table, pte_chain);
 		lru_cache_add_active(new_page);
 
@@ -1241,6 +1254,7 @@ static int do_swap_page(struct mm_struct
 
 	flush_icache_page(vma, page);
 	set_pte(page_table, pte);
+	SetPageAnon(page);
 	pte_chain = page_add_rmap(page, page_table, pte_chain);
 
 	/* No need to invalidate - it was non-present before */
@@ -1306,6 +1320,7 @@ do_anonymous_page(struct mm_struct *mm, 
 		entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 		lru_cache_add_active(page);
 		mark_page_accessed(page);
+		SetPageAnon(page);
 	}
 
 	set_pte(page_table, entry);
@@ -1365,6 +1380,10 @@ do_no_page(struct mm_struct *mm, struct 
 	if (!pte_chain)
 		goto oom;
 
+	/* See if nopage returned an anon page */
+	if (!new_page->mapping || PageSwapCache(new_page))
+		SetPageAnon(new_page);
+
 	/*
 	 * Should we do an early C-O-W break?
 	 */
@@ -1377,6 +1396,7 @@ do_no_page(struct mm_struct *mm, struct 
 		copy_user_highpage(page, new_page, address);
 		page_cache_release(new_page);
 		lru_cache_add_active(page);
+		SetPageAnon(page);
 		new_page = page;
 	}
 
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/mmap.c 141-no_vma_sort/mm/mmap.c
--- 000-virgin/mm/mmap.c	2003-06-05 14:56:44.000000000 -0700
+++ 141-no_vma_sort/mm/mmap.c	2003-06-09 10:16:58.000000000 -0700
@@ -377,6 +377,28 @@ static inline int is_mergeable_vma(struc
 	return 1;
 }
 
+static void move_vma_start(struct vm_area_struct *vma, unsigned long addr)
+{
+	spinlock_t *lock = &vma->vm_mm->page_table_lock;
+	struct inode *inode = NULL;
+	
+	if (vma->vm_file) {
+		inode = vma->vm_file->f_dentry->d_inode;
+		down(&inode->i_mapping->i_shared_sem);
+	}
+	spin_lock(lock);
+	if (inode)
+		__remove_shared_vm_struct(vma, inode);
+	/* If no vm_file, perhaps we should always keep vm_pgoff at 0?? */
+	vma->vm_pgoff += (long)(addr - vma->vm_start) >> PAGE_SHIFT;
+	vma->vm_start = addr;
+	if (inode) {
+		__vma_link_file(vma);
+		up(&inode->i_mapping->i_shared_sem);
+	}
+	spin_unlock(lock);
+}
+
 /*
  * Return true if we can merge this (vm_flags,file,vm_pgoff,size)
  * in front of (at a lower virtual address and file offset than) the vma.
@@ -429,8 +451,6 @@ static int vma_merge(struct mm_struct *m
 			unsigned long end, unsigned long vm_flags,
 			struct file *file, unsigned long pgoff)
 {
-	spinlock_t * lock = &mm->page_table_lock;
-
 	/*
 	 * We later require that vma->vm_flags == vm_flags, so this tests
 	 * vma->vm_flags & VM_SPECIAL, too.
@@ -450,6 +470,7 @@ static int vma_merge(struct mm_struct *m
 			is_mergeable_vma(prev, file, vm_flags) &&
 			can_vma_merge_after(prev, vm_flags, file, pgoff)) {
 		struct vm_area_struct *next;
+		spinlock_t *lock = &mm->page_table_lock;
 		struct inode *inode = file ? file->f_dentry->d_inode : NULL;
 		int need_up = 0;
 
@@ -497,10 +518,7 @@ static int vma_merge(struct mm_struct *m
 				pgoff, (end - addr) >> PAGE_SHIFT))
 			return 0;
 		if (end == prev->vm_start) {
-			spin_lock(lock);
-			prev->vm_start = addr;
-			prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT;
-			spin_unlock(lock);
+			move_vma_start(prev, addr);
 			return 1;
 		}
 	}
@@ -1220,8 +1238,7 @@ int split_vma(struct mm_struct * mm, str
 
 	if (new_below) {
 		new->vm_end = addr;
-		vma->vm_start = addr;
-		vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT);
+		move_vma_start(vma, addr);
 	} else {
 		vma->vm_end = addr;
 		new->vm_start = addr;
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/page_alloc.c 141-no_vma_sort/mm/page_alloc.c
--- 000-virgin/mm/page_alloc.c	2003-06-05 14:56:45.000000000 -0700
+++ 141-no_vma_sort/mm/page_alloc.c	2003-06-09 10:16:54.000000000 -0700
@@ -220,6 +220,8 @@ static inline void free_pages_check(cons
 		bad_page(function, page);
 	if (PageDirty(page))
 		ClearPageDirty(page);
+	if (PageAnon(page))
+		ClearPageAnon(page);
 }
 
 /*
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/rmap.c 141-no_vma_sort/mm/rmap.c
--- 000-virgin/mm/rmap.c	2003-04-21 14:14:53.000000000 -0700
+++ 141-no_vma_sort/mm/rmap.c	2003-06-09 10:16:58.000000000 -0700
@@ -102,6 +102,136 @@ pte_chain_encode(struct pte_chain *pte_c
  **/
 
 /**
+ * find_pte - Find a pte pointer given a vma and a struct page.
+ * @vma: the vma to search
+ * @page: the page to find
+ *
+ * Determine if this page is mapped in this vma.  If it is, map and return
+ * the pte pointer associated with it.  Return null if the page is not
+ * mapped in this vma for any reason.
+ *
+ * This is strictly an internal helper function for the object-based rmap
+ * functions.
+ * 
+ * It is the caller's responsibility to unmap the pte if it is returned.
+ */
+static inline pte_t *
+find_pte(struct vm_area_struct *vma, struct page *page, unsigned long *addr)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgd_t *pgd;
+	pmd_t *pmd;
+	pte_t *pte;
+	unsigned long loffset;
+	unsigned long address;
+
+	loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT));
+	address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT);
+	if (address < vma->vm_start || address >= vma->vm_end)
+		goto out;
+
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd))
+		goto out;
+
+	pmd = pmd_offset(pgd, address);
+	if (!pmd_present(*pmd))
+		goto out;
+
+	pte = pte_offset_map(pmd, address);
+	if (!pte_present(*pte))
+		goto out_unmap;
+
+	if (page_to_pfn(page) != pte_pfn(*pte))
+		goto out_unmap;
+
+	if (addr)
+		*addr = address;
+
+	return pte;
+
+out_unmap:
+	pte_unmap(pte);
+out:
+	return NULL;
+}
+
+/**
+ * page_referenced_obj_one - referenced check for object-based rmap
+ * @vma: the vma to look in.
+ * @page: the page we're working on.
+ *
+ * Find a pte entry for a page/vma pair, then check and clear the referenced
+ * bit.
+ *
+ * This is strictly a helper function for page_referenced_obj.
+ */
+static int
+page_referenced_obj_one(struct vm_area_struct *vma, struct page *page)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pte_t *pte;
+	int referenced = 0;
+
+	if (!spin_trylock(&mm->page_table_lock))
+		return 1;
+
+	pte = find_pte(vma, page, NULL);
+	if (pte) {
+		if (ptep_test_and_clear_young(pte))
+			referenced++;
+		pte_unmap(pte);
+	}
+
+	spin_unlock(&mm->page_table_lock);
+	return referenced;
+}
+
+/**
+ * page_referenced_obj - referenced check for object-based rmap
+ * @page: the page we're checking references on.
+ *
+ * For an object-based mapped page, find all the places it is mapped and
+ * check/clear the referenced flag.  This is done by following the page->mapping
+ * pointer, then walking the chain of vmas it holds.  It returns the number
+ * of references it found.
+ *
+ * This function is only called from page_referenced for object-based pages.
+ *
+ * The semaphore address_space->i_shared_sem is tried.  If it can't be gotten,
+ * assume a reference count of 1.
+ */
+static int
+page_referenced_obj(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct vm_area_struct *vma;
+	int referenced = 0;
+
+	if (!page->pte.mapcount)
+		return 0;
+
+	if (!mapping)
+		BUG();
+
+	if (PageSwapCache(page))
+		BUG();
+
+	if (down_trylock(&mapping->i_shared_sem))
+		return 1;
+	
+	list_for_each_entry(vma, &mapping->i_mmap, shared)
+		referenced += page_referenced_obj_one(vma, page);
+
+	list_for_each_entry(vma, &mapping->i_mmap_shared, shared)
+		referenced += page_referenced_obj_one(vma, page);
+
+	up(&mapping->i_shared_sem);
+
+	return referenced;
+}
+
+/**
  * page_referenced - test if the page was referenced
  * @page: the page to test
  *
@@ -120,6 +250,10 @@ int page_referenced(struct page * page)
 	if (TestClearPageReferenced(page))
 		referenced++;
 
+	if (!PageAnon(page)) {
+		referenced += page_referenced_obj(page);
+		goto out;
+	}
 	if (PageDirect(page)) {
 		pte_t *pte = rmap_ptep_map(page->pte.direct);
 		if (ptep_test_and_clear_young(pte))
@@ -153,6 +287,7 @@ int page_referenced(struct page * page)
 			__pte_chain_free(pc);
 		}
 	}
+out:
 	return referenced;
 }
 
@@ -175,6 +310,21 @@ page_add_rmap(struct page *page, pte_t *
 
 	pte_chain_lock(page);
 
+	/*
+	 * If this is an object-based page, just count it.  We can
+ 	 * find the mappings by walking the object vma chain for that object.
+	 */
+	if (!PageAnon(page)) {
+		if (!page->mapping)
+			BUG();
+		if (PageSwapCache(page))
+			BUG();
+		if (!page->pte.mapcount)
+			inc_page_state(nr_mapped);
+		page->pte.mapcount++;
+		goto out;
+	}
+
 	if (page->pte.direct == 0) {
 		page->pte.direct = pte_paddr;
 		SetPageDirect(page);
@@ -231,8 +381,25 @@ void page_remove_rmap(struct page *page,
 	pte_chain_lock(page);
 
 	if (!page_mapped(page))
-		goto out_unlock;	/* remap_page_range() from a driver? */
+		goto out_unlock;
 
+	/*
+	 * If this is an object-based page, just uncount it.  We can
+	 * find the mappings by walking the object vma chain for that object.
+	 */
+	if (!PageAnon(page)) {
+		if (!page->mapping)
+			BUG();
+		if (PageSwapCache(page))
+			BUG();
+		if (!page->pte.mapcount)
+			BUG();
+		page->pte.mapcount--;
+		if (!page->pte.mapcount)
+			dec_page_state(nr_mapped);
+		goto out_unlock;
+	}
+  
 	if (PageDirect(page)) {
 		if (page->pte.direct == pte_paddr) {
 			page->pte.direct = 0;
@@ -279,6 +446,102 @@ out_unlock:
 }
 
 /**
+ * try_to_unmap_obj_one - unmap a page from one vma using object-based rmap
+ * @page: the page to unmap
+ *
+ * Determine whether a page is mapped in a given vma and unmap it if it's found.
+ *
+ * This function is strictly a helper function for try_to_unmap_obj.
+ */
+static inline int
+try_to_unmap_obj_one(struct vm_area_struct *vma, struct page *page)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long address;
+	pte_t *pte;
+	pte_t pteval;
+	int ret = SWAP_AGAIN;
+
+	if (!spin_trylock(&mm->page_table_lock))
+		return ret;
+
+	pte = find_pte(vma, page, &address);
+	if (!pte)
+		goto out;
+
+	if (vma->vm_flags & VM_LOCKED) {
+		ret =  SWAP_FAIL;
+		goto out_unmap;
+	}
+
+	flush_cache_page(vma, address);
+	pteval = ptep_get_and_clear(pte);
+	flush_tlb_page(vma, address);
+
+	if (pte_dirty(pteval))
+		set_page_dirty(page);
+
+	if (!page->pte.mapcount)
+		BUG();
+
+	mm->rss--;
+	page->pte.mapcount--;
+	page_cache_release(page);
+
+out_unmap:
+	pte_unmap(pte);
+
+out:
+	spin_unlock(&mm->page_table_lock);
+	return ret;
+}
+
+/**
+ * try_to_unmap_obj - unmap a page using the object-based rmap method
+ * @page: the page to unmap
+ *
+ * Find all the mappings of a page using the mapping pointer and the vma chains
+ * contained in the address_space struct it points to.
+ *
+ * This function is only called from try_to_unmap for object-based pages.
+ *
+ * The semaphore address_space->i_shared_sem is tried.  If it can't be gotten,
+ * return a temporary error.
+ */
+static int
+try_to_unmap_obj(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct vm_area_struct *vma;
+	int ret = SWAP_AGAIN;
+
+	if (!mapping)
+		BUG();
+
+	if (PageSwapCache(page))
+		BUG();
+
+	if (down_trylock(&mapping->i_shared_sem))
+		return ret;
+	
+	list_for_each_entry(vma, &mapping->i_mmap, shared) {
+		ret = try_to_unmap_obj_one(vma, page);
+		if (ret == SWAP_FAIL || !page->pte.mapcount)
+			goto out;
+	}
+
+	list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
+		ret = try_to_unmap_obj_one(vma, page);
+		if (ret == SWAP_FAIL || !page->pte.mapcount)
+			goto out;
+	}
+
+out:
+	up(&mapping->i_shared_sem);
+	return ret;
+}
+
+/**
  * try_to_unmap_one - worker function for try_to_unmap
  * @page: page to unmap
  * @ptep: page table entry to unmap from page
@@ -397,6 +660,15 @@ int try_to_unmap(struct page * page)
 	if (!page->mapping)
 		BUG();
 
+	/*
+	 * If it's an object-based page, use the object vma chain to find all
+	 * the mappings.
+	 */
+	if (!PageAnon(page)) {
+		ret = try_to_unmap_obj(page);
+		goto out;
+	}
+
 	if (PageDirect(page)) {
 		ret = try_to_unmap_one(page, page->pte.direct);
 		if (ret == SWAP_SUCCESS) {
@@ -452,12 +724,115 @@ int try_to_unmap(struct page * page)
 		}
 	}
 out:
-	if (!page_mapped(page))
+	if (!page_mapped(page)) {
 		dec_page_state(nr_mapped);
+		ret = SWAP_SUCCESS;
+	}
 	return ret;
 }
 
 /**
+ * page_convert_anon - Convert an object-based mapped page to pte_chain-based.
+ * @page: the page to convert
+ *
+ * Find all the mappings for an object-based page and convert them
+ * to 'anonymous', ie create a pte_chain and store all the pte pointers there.
+ *
+ * This function takes the address_space->i_shared_sem, sets the PageAnon flag,
+ * then sets the mm->page_table_lock for each vma and calls page_add_rmap. This
+ * means there is a period when PageAnon is set, but still has some mappings
+ * with no pte_chain entry.  This is in fact safe, since page_remove_rmap will
+ * simply not find it.  try_to_unmap might erroneously return success, but it
+ * will never be called because the page_convert_anon() caller has locked the
+ * page.
+ *
+ * page_referenced() may fail to scan all the appropriate pte's and may return
+ * an inaccurate result.  This is so rare that it does not matter.
+ */
+int page_convert_anon(struct page *page)
+{
+	struct address_space *mapping;
+	struct vm_area_struct *vma;
+	struct pte_chain *pte_chain = NULL;
+	pte_t *pte;
+	int err = 0;
+
+	mapping = page->mapping;
+	if (mapping == NULL)
+		goto out;		/* truncate won the lock_page() race */
+
+	down(&mapping->i_shared_sem);
+	pte_chain_lock(page);
+
+	/*
+	 * Has someone else done it for us before we got the lock?
+	 * If so, pte.direct or pte.chain has replaced pte.mapcount.
+	 */
+	if (PageAnon(page)) {
+		pte_chain_unlock(page);
+		goto out_unlock;
+	}
+
+	SetPageAnon(page);
+	if (page->pte.mapcount == 0) {
+		pte_chain_unlock(page);
+		goto out_unlock;
+	}
+	/* This is gonna get incremented by page_add_rmap */
+	dec_page_state(nr_mapped);
+	page->pte.mapcount = 0;
+
+	/*
+	 * Now that the page is marked as anon, unlock it.  page_add_rmap will
+	 * lock it as necessary.
+	 */
+	pte_chain_unlock(page);
+
+	list_for_each_entry(vma, &mapping->i_mmap, shared) {
+		if (!pte_chain) {
+			pte_chain = pte_chain_alloc(GFP_KERNEL);
+			if (!pte_chain) {
+				err = -ENOMEM;
+				goto out_unlock;
+			}
+		}
+		spin_lock(&vma->vm_mm->page_table_lock);
+		pte = find_pte(vma, page, NULL);
+		if (pte) {
+			/* Make sure this isn't a duplicate */
+			page_remove_rmap(page, pte);
+			pte_chain = page_add_rmap(page, pte, pte_chain);
+			pte_unmap(pte);
+		}
+		spin_unlock(&vma->vm_mm->page_table_lock);
+	}
+	list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
+		if (!pte_chain) {
+			pte_chain = pte_chain_alloc(GFP_KERNEL);
+			if (!pte_chain) {
+				err = -ENOMEM;
+				goto out_unlock;
+			}
+		}
+		spin_lock(&vma->vm_mm->page_table_lock);
+		pte = find_pte(vma, page, NULL);
+		if (pte) {
+			/* Make sure this isn't a duplicate */
+			page_remove_rmap(page, pte);
+			pte_chain = page_add_rmap(page, pte, pte_chain);
+			pte_unmap(pte);
+		}
+		spin_unlock(&vma->vm_mm->page_table_lock);
+	}
+
+out_unlock:
+	pte_chain_free(pte_chain);
+	up(&mapping->i_shared_sem);
+out:
+	return err;
+}
+
+/**
  ** No more VM stuff below this comment, only pte_chain helper
  ** functions.
  **/
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/swapfile.c 141-no_vma_sort/mm/swapfile.c
--- 000-virgin/mm/swapfile.c	2003-06-05 14:56:45.000000000 -0700
+++ 141-no_vma_sort/mm/swapfile.c	2003-06-09 10:16:54.000000000 -0700
@@ -385,6 +385,7 @@ unuse_pte(struct vm_area_struct *vma, un
 	vma->vm_mm->rss++;
 	get_page(page);
 	set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
+	SetPageAnon(page);
 	*pte_chainp = page_add_rmap(page, dir, *pte_chainp);
 	swap_free(entry);
 }
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/scripts/schedcapture 141-no_vma_sort/scripts/schedcapture
--- 000-virgin/scripts/schedcapture	1969-12-31 16:00:00.000000000 -0800
+++ 141-no_vma_sort/scripts/schedcapture	2003-06-09 10:16:45.000000000 -0700
@@ -0,0 +1,6 @@
+while true
+do
+	cat /proc/schedstat
+	echo
+	sleep 20
+done
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/scripts/schedstat 141-no_vma_sort/scripts/schedstat
--- 000-virgin/scripts/schedstat	1969-12-31 16:00:00.000000000 -0800
+++ 141-no_vma_sort/scripts/schedstat	2003-06-09 10:16:45.000000000 -0700
@@ -0,0 +1,210 @@
+#!/usr/bin/perl
+#
+# schedstat - report per-interval deltas of the /proc/schedstat counters.
+#
+# Input (stdin or files named on the command line) is a series of
+# /proc/schedstat snapshots, e.g. the output of scripts/schedcapture.
+# For every aggregate line read, the change in each counter since the
+# previous aggregate line is printed; the first report is relative to
+# zero because there is no earlier snapshot to subtract.
+#
+
+use strict;
+use warnings;
+
+my $slice = 20;		# seconds per snapshot, used for the time stamp
+my $nfields = 20;	# counters following the tag on each line
+
+my $tick = 0;		# reports printed so far
+my $max_cpu = 0;	# highest N seen on a "cpuN" line
+my @prev;		# aggregate fields of the previous snapshot
+my @per_cpu_curr;	# [N] => fields of the latest "cpuN" line
+my @per_cpu_prev;	# [N] => fields of "cpuN" one snapshot back
+
+# Percentage of $part in $whole; 0 when $whole is 0, so an interval in
+# which a counter did not move cannot cause a division by zero.
+sub pct {
+    my ($part, $whole) = @_;
+    return $whole ? 100 * $part / $whole : 0;
+}
+
+while (<>) {
+    my @curr = split;
+    next unless @curr;		# blank separator between snapshots
+
+    # Per-cpu line: remember it for the next report.  The pattern is
+    # anchored and takes all digits so that cpu10 and above are not
+    # filed under cpu1.
+    if ($curr[0] =~ /^cpu(\d+)$/) {
+	$per_cpu_curr[$1] = [ @curr ];
+	$max_cpu = $1 if ($1 > $max_cpu);
+	next;
+    }
+    if ($curr[0] eq "version") {
+	if (!defined($curr[1]) || $curr[1] != 2) {
+	    die "Version mismatch. Update this tool.\n";
+	}
+	next;
+    }
+
+    #
+    # format of line in /proc/schedstat
+    #
+    # tag 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
+    #
+    # tag is "cpuN" or "cpu".  "cpuN" lines were stored above and feed
+    # only the per-cpu part of the pull_task() report; every other line
+    # that reaches this point is treated as the aggregate line.
+    #
+    # NOTE(review): the field list below is labelled version == 1 while
+    # the check above requires version 2, and the pull_task() report
+    # prints fields 15-18 as lost/gained counts -- confirm against the
+    # kernel's schedstat output.
+    #
+    # version == 1
+    # NOTE: the active queue is considered empty if it has only one process
+    #	in it, since obviously the process calling sched_yield is that process.
+    #
+    # First four are sched_yield statistics:
+    #     1) # of times both the active and the expired queue were empty
+    #     2) # of times just the active queue was empty
+    #     3) # of times just the expired queue was empty
+    #     4) # of times sched_yield() was called
+    #
+    # Next three are schedule() statistics:
+    #     5) # of times the active queue had at least one other process on it.
+    #     6) # of times we switched to the expired queue and reused it
+    #     7) # of times schedule() was called
+    #
+    # Next seven are statistics dealing with load balancing:
+    #     8) # of times load_balance was called at an idle tick
+    #     9) # of times load_balance was called at a busy tick
+    #    10) # of times load_balance was called from schedule()
+    #    11) # of times load_balance was called
+    #    12) sum of imbalances discovered (if any) with each call to
+    #        load_balance
+    #    13) # of times load_balance was called when we did not find a
+    #        "busiest" queue
+    #    14) # of times load_balance was called from balance_node()
+    #
+    # Next four are statistics dealing with pull_task():
+    #    15) # of times pull_task was called at an idle tick
+    #    16) # of times pull_task was called at a busy tick
+    #    17) # of times pull_task was called from schedule()
+    #    18) # of times pull_task was called
+    #
+    # Next two are statistics dealing with balance_node():
+    #    19) # of times balance_node was called
+    #    20) # of times balance_node was called at an idle tick
+    #
+
+    # Change in each aggregate counter since the previous snapshot.
+    # Missing fields count as 0.
+    my @diff = (0);
+    foreach my $i (1..$nfields) {
+	$diff[$i] = ($curr[$i] || 0) - ($prev[$i] || 0);
+    }
+
+    # Same for every cpu.  A cpu that has not been seen yet gets all
+    # zeroes instead of dereferencing an undefined value.
+    my @per_cpu_diff;
+    for my $cpu (0..$max_cpu) {
+	my $now = $per_cpu_curr[$cpu] || [];
+	my $then = $per_cpu_prev[$cpu] || [];
+	$per_cpu_diff[$cpu] =
+	    [ 0, map { ($now->[$_] || 0) - ($then->[$_] || 0) } 1..$nfields ];
+    }
+
+    # Time stamp: hh:mm:ss since the first report, assuming $slice
+    # seconds between snapshots.
+    my $elapsed = $tick * $slice;
+    printf "%02d:%02d:%02d--------------------------------------------------------------\n",
+	$elapsed/3600, ($elapsed/60)%60, $elapsed%60;
+
+    #
+    # sched_yield() stats
+    #
+    my $only_active = $diff[2] - $diff[1];
+    my $neither = $diff[4] - ($diff[3] + $diff[2]);
+    printf "    %7d          sys_sched_yield()\n", $diff[4];
+    printf "    %7d(%6.2f%%) found (only) active queue empty on current cpu\n",
+	$only_active, pct($only_active, $diff[4]);
+    printf "    %7d(%6.2f%%) found (only) expired queue empty on current cpu\n",
+	$diff[3], pct($diff[3], $diff[4]);
+    printf "    %7d(%6.2f%%) found both queues empty on current cpu\n",
+	$diff[1], pct($diff[1], $diff[4]);
+    printf "    %7d(%6.2f%%) found neither queue empty on current cpu\n\n",
+	$neither, pct($neither, $diff[4]);
+
+    #
+    # schedule() stats
+    #
+    printf "    %7d          schedule()\n", $diff[7];
+    printf "    %7d(%6.2f%%) switched active and expired queues\n",
+	$diff[6], pct($diff[6], $diff[7]);
+    printf "    %7d(%6.2f%%) used existing active queue\n\n",
+	$diff[5]-$diff[6], pct($diff[5]-$diff[6], $diff[7]);
+
+    #
+    # load_balance() stats
+    #
+    printf "    %7d          load_balance()\n", $diff[11];
+    printf "    %7d(%6.2f%%) called while idle\n",
+	$diff[8], pct($diff[8], $diff[11]);
+    printf "    %7d(%6.2f%%) called while busy\n",
+	$diff[9], pct($diff[9], $diff[11]);
+    printf "    %7d(%6.2f%%) called from schedule()\n",
+	$diff[10], pct($diff[10], $diff[11]);
+    printf "    %7d(%6.2f%%) called from balance_node()\n",
+	$diff[14], pct($diff[14], $diff[11]);
+    printf "             %7d no \"busiest\" queue found\n", $diff[13];
+    my $found_busiest = $diff[11] - $diff[13];
+    if ($found_busiest) {
+	my $imbalance = $diff[12] / $found_busiest;
+	if ($imbalance < 10) {
+	    printf "             %7.3f average imbalance (over %d)\n",
+		$imbalance, $found_busiest;
+	} elsif ($imbalance < 100) {
+	    printf "            %8.2f average imbalance (over %d)\n",
+		$imbalance, $found_busiest;
+	} else {
+	    printf "           %9.1f average imbalance (over %d)\n",
+		$imbalance, $found_busiest;
+	}
+    }
+    else {
+	printf "                     no imbalances\n";
+    }
+
+    #
+    # pull_task() stats
+    #
+    print "\n";
+    printf "    %7d          pull_task()\n", $diff[15];
+    for my $cpu (0..$max_cpu) {
+	my @arr = @{$per_cpu_diff[$cpu]};
+	if ($arr[15] || $arr[16]) {
+	    printf "    %7d/%-7d  cpu %d lost/gained task to/from another cpu\n",
+		$arr[15], $arr[16], $cpu;
+	}
+	if ($arr[17] || $arr[18]) {
+	    printf "    %7d/%-7d  cpu %d lost/gained task to/from another node\n",
+		$arr[17], $arr[18], $cpu;
+	}
+    }
+    print "\n";
+
+    #
+    # balance_node() stats
+    #
+    printf "    %7d          balance_node()\n", $diff[19];
+    printf "    %7d(%6.2f%%) called while idle\n",
+	$diff[20], pct($diff[20], $diff[19]);
+    printf "    %7d(%6.2f%%) called while busy\n",
+	$diff[19]-$diff[20], pct($diff[19]-$diff[20], $diff[19]);
+
+    printf("\n");
+    @prev = @curr;
+    @per_cpu_prev = @per_cpu_curr;
+    $tick++;
+}