diff -purN -X /home/mbligh/.diff.exclude 000-virgin/Documentation/filesystems/proc.txt 999-mjb/Documentation/filesystems/proc.txt --- 000-virgin/Documentation/filesystems/proc.txt 2003-10-01 11:46:27.000000000 -0700 +++ 999-mjb/Documentation/filesystems/proc.txt 2003-10-02 16:39:40.000000000 -0700 @@ -38,6 +38,7 @@ Table of Contents 2.8 /proc/sys/net/ipv4 - IPV4 settings 2.9 Appletalk 2.10 IPX + 2.11 /proc/sys/sched - scheduler tunables ------------------------------------------------------------------------------ Preface @@ -1805,6 +1806,104 @@ The /proc/net/ipx_route table holds a gives the destination network, the router node (or Directly) and the network address of the router (or Connected) for internal networks. +2.11 /proc/sys/sched - scheduler tunables +----------------------------------------- + +Useful knobs for tuning the scheduler live in /proc/sys/sched. + +child_penalty +------------- + +Percentage of the parent's sleep_avg that children inherit. sleep_avg is +a running average of the time a process spends sleeping. Tasks with high +sleep_avg values are considered interactive and given a higher dynamic +priority and a larger timeslice. You typically want this to be some value just +under 100. + +exit_weight +----------- + +When a CPU hog task exits, its parent's sleep_avg is reduced by a factor of +exit_weight against the exiting task's sleep_avg. + +interactive_delta +----------------- + +If a task is "interactive" it is reinserted into the active array after it +has expired its timeslice, instead of being inserted into the expired array. +How "interactive" a task must be in order to be deemed interactive is a +function of its nice value. This interactive limit is scaled linearly by nice +value and is offset by the interactive_delta. + +max_sleep_avg +------------- + +max_sleep_avg is the largest value (in ms) stored for a task's running sleep +average. The larger this value, the longer a task needs to sleep to be +considered interactive (maximum interactive bonus is a function of +max_sleep_avg). + +max_timeslice +------------- + +Maximum timeslice, in milliseconds. This is the value given to tasks of the +highest dynamic priority. + +min_timeslice +------------- + +Minimum timeslice, in milliseconds. This is the value given to tasks of the +lowest dynamic priority. Every task gets at least this slice of the processor +per array switch. + +parent_penalty +-------------- + +Percentage of the parent's sleep_avg that it retains across a fork(). +sleep_avg is a running average of the time a process spends sleeping. Tasks +with high sleep_avg values are considered interactive and given a higher +dynamic priority and a larger timeslice. Normally, this value is 100 and thus +tasks retain their sleep_avg on fork. If you want to punish interactive +tasks for forking, set this below 100. + +prio_bonus_ratio +---------------- + +Middle percentage of the priority range that tasks can receive as a dynamic +priority. The default value of 25% ensures that nice values at the +extremes are still enforced. For example, nice +19 interactive tasks will +never be able to preempt a nice 0 CPU hog. Setting this higher will increase +the size of the priority range the tasks can receive as a bonus. Setting +this lower will decrease this range, making the interactivity bonus less +apparent and user nice values more applicable. + +starvation_limit +---------------- + +Sufficiently interactive tasks are reinserted into the active array when they +run out of timeslice. Normally, tasks are inserted into the expired array. +Reinserting interactive tasks into the active array allows them to remain +runnable, which is important to interactive performance. This could starve +expired tasks, however, since the interactive task could prevent the array +switch. To prevent starving the tasks on the expired array for too long, the +starvation_limit is the longest (in ms) we will let the expired array starve +at the expense of reinserting interactive tasks back into the active array. Higher +values here give more preference to running interactive tasks, at the expense +of expired tasks. Lower values provide fairer scheduling behavior, at the +expense of interactivity. The units are in milliseconds. + +idle_node_rebalance_ratio +------------------------- + +On NUMA machines, we normally rebalance within nodes, but we also rebalance +globally every N idle rebalance ticks, where N = idle_node_rebalance_ratio. + +busy_node_rebalance_ratio +------------------------- + +On NUMA machines, we normally rebalance within nodes, but we also rebalance +globally every N busy rebalance ticks, where N = busy_node_rebalance_ratio. + ------------------------------------------------------------------------------ Summary ------------------------------------------------------------------------------ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/Makefile 999-mjb/Makefile --- 000-virgin/Makefile 2003-10-01 11:47:28.000000000 -0700 +++ 999-mjb/Makefile 2003-10-02 16:54:34.000000000 -0700 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 0 -EXTRAVERSION = -test6 +EXTRAVERSION = -test6-mjb1 # *DOCUMENTATION* # To see a list of typical targets execute "make help" @@ -156,6 +156,8 @@ HOSTCXX = g++ HOSTCFLAGS = -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer HOSTCXXFLAGS = -O2 +GCOV_FLAGS = -fprofile-arcs -ftest-coverage + # That's our default target when none is given on the command line # Note that 'modules' will be added as a prerequisite as well, @@ -286,6 +288,8 @@ export VERSION PATCHLEVEL SUBLEVEL EXTRA export CPPFLAGS NOSTDINC_FLAGS OBJCOPYFLAGS LDFLAGS export CFLAGS CFLAGS_KERNEL CFLAGS_MODULE export AFLAGS AFLAGS_KERNEL AFLAGS_MODULE +export CFLAGS_NOGCOV + export MODVERDIR := .tmp_versions @@ -655,6 +659,11 @@ depend dep: # --------------------------------------------------------------------------- # Modules +CFLAGS_NOGCOV := $(CFLAGS) +ifdef CONFIG_GCOV_ALL +CFLAGS += $(GCOV_FLAGS) +endif + ifdef CONFIG_MODULES # By default, build modules as well @@ -777,6 +786,7 @@ clean: archclean $(clean-dirs) $(call cmd,rmclean) @find . $(RCS_FIND_IGNORE) \ \( -name '*.[oas]' -o -name '*.ko' -o -name '.*.cmd' \ + -o -name '*.bb' -o -name '*.bbg' -o -name '*.da' \ -o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \) \ -type f -print | xargs rm -f diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/Kconfig 999-mjb/arch/i386/Kconfig --- 000-virgin/arch/i386/Kconfig 2003-10-01 11:47:33.000000000 -0700 +++ 999-mjb/arch/i386/Kconfig 2003-10-02 16:43:03.000000000 -0700 @@ -453,17 +453,17 @@ config NR_CPUS This is purely to save memory - each supported CPU adds approximately eight kilobytes to the kernel image. -config PREEMPT - bool "Preemptible Kernel" - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load.
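As a quick illustration of the /proc/sys/sched interface documented above, here is a minimal userspace sketch (not part of the patch) that reads two of the tunables and adjusts one of them. It assumes a kernel built with this patch and the file names documented above; the value written is arbitrary and writes require root.

#include <stdio.h>

static long read_tunable(const char *name)
{
	char path[128];
	long val = -1;
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/sched/%s", name);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%ld", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

static int write_tunable(const char *name, long val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/sched/%s", name);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%ld\n", val);
	return fclose(f);
}

int main(void)
{
	printf("min_timeslice = %ld ms\n", read_tunable("min_timeslice"));
	printf("max_timeslice = %ld ms\n", read_tunable("max_timeslice"));

	/* Example only: raise max_timeslice; 300 is an arbitrary value. */
	if (write_tunable("max_timeslice", 300) < 0)
		perror("write max_timeslice");
	return 0;
}

The same files can of course also be read and written from a shell with cat and echo.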
- - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure. +# config PREEMPT +# bool "Preemptible Kernel" +# help +# This option reduces the latency of the kernel when reacting to +# real-time or interactive events by allowing a low priority process to +# be preempted even if it is in kernel mode executing a system call. +# This allows applications to run more reliably even when the system is +# under load. +# +# Say Y here if you are building a kernel for a desktop, embedded +# or real-time system. Say N if you are unsure. config X86_UP_APIC bool "Local APIC support on uniprocessors" if !SMP @@ -682,6 +682,44 @@ config HIGHMEM64G endchoice +choice + help + On i386, a process can only virtually address 4GB of memory. This + lets you select how much of that virtual space you would like to + devote to userspace, and how much to the kernel. + + Some userspace programs would like to address as much as possible and + have few demands of the kernel other than it get out of the way. These + users may opt to use the 3.5GB option to give their userspace program + as much room as possible. Due to alignment issues imposed by PAE, + the "3.5GB" option is unavailable if "64GB" high memory support is + enabled. + + Other users (especially those who use PAE) may be running out of + ZONE_NORMAL memory. Those users may benefit from increasing the + kernel's virtual address space size by taking it away from userspace, + which may not need all of its space. An indicator that this is + happening is when /proc/meminfo's "LowFree:" is a small percentage of + "LowTotal:" while "HighFree:" is very large. + + If unsure, say "3GB". + prompt "User address space size" + default 1GB + +config 05GB + bool "3.5 GB" + depends on !HIGHMEM64G + +config 1GB + bool "3 GB" + +config 2GB + bool "2 GB" + +config 3GB + bool "1 GB" +endchoice + config HIGHMEM bool depends on HIGHMEM64G || HIGHMEM4G @@ -699,6 +737,11 @@ config NUMA default n if X86_PC default y if (X86_NUMAQ || X86_SUMMIT) +config NUMA_SCHED + bool "Numa Scheduling Support" + depends on NUMA + default y + # Need comments to help the hapless user trying to turn on NUMA support comment "NUMA (NUMA-Q) requires SMP, 64GB highmem support" depends on X86_NUMAQ && (!HIGHMEM64G || !SMP) @@ -784,6 +827,33 @@ config MTRR See for more information. +choice + help + This is unrelated to your processor's speed. This variable alters + how often the system is asked to generate timer interrupts. A larger + value can lead to a more responsive system, but also causes extra + overhead from the increased number of context switches. + + If in doubt, leave it at the default of 1000. + + prompt "Kernel HZ" + default 1000HZ + +config 100HZ + bool "100 Hz" + +config 1000HZ + bool "1000 Hz" +endchoice + +config IRQBALANCE + bool "Enable kernel irq balancing" + depends on SMP + default y + help + The default of yes will allow the kernel to do irq load balancing. + Saying no will keep the kernel from doing irq load balancing. + config HAVE_DEC_LOCK bool depends on (SMP || PREEMPT) && X86_CMPXCHG @@ -1168,6 +1238,36 @@ source "drivers/usb/Kconfig" source "arch/i386/oprofile/Kconfig" +menu "GCOV coverage profiling" + +config GCOV_PROFILE + bool "GCOV coverage profiling" + ---help--- + Provide infrastructure for coverage support for the kernel. This + will not compile the kernel by default with the necessary flags. + To obtain coverage information for the entire kernel, one should + enable the subsequent option (Profile entire kernel).
If only + particular files or directories of the kernel are desired, then + one must provide the following compile options for such targets: + "-fprofile-arcs -ftest-coverage" in the CFLAGS. To obtain + access to the coverage data one must insmod the gcov-proc kernel + module. + +config GCOV_ALL + bool "GCOV_ALL" + depends on GCOV_PROFILE + ---help--- + If you say Y here, it will compile the entire kernel with coverage + option enabled. + +config GCOV_PROC + tristate "gcov-proc module" + depends on GCOV_PROFILE && PROC_FS + ---help--- + This is the gcov-proc module that exposes gcov data through the + /proc filesystem + +endmenu menu "Kernel hacking" @@ -1214,6 +1314,26 @@ config MAGIC_SYSRQ keys are documented in . Don't say Y unless you really know what this hack does. +config X86_EARLY_PRINTK + bool "Early console support" + default n + depends on DEBUG_KERNEL + help + Write kernel log output directly into the VGA buffer or serial port. + This is useful for kernel debugging when your machine crashes very + early before the console code is initialized. For normal operation + it is not recommended because it looks ugly and doesn't cooperate + with klogd/syslogd or the X server.You should normally N here, + unless you want to debug such a crash. + + Syntax: earlyprintk=vga + earlyprintk=serial[,ttySn[,baudrate]] + Append ,keep to not disable it when the real console takes over. + Only vga or serial at a time, not both. + Currently only ttyS0 and ttyS1 are supported. + Interaction with the standard serial driver is not very good. + The VGA output is eventually overwritten by the real console. + config DEBUG_SPINLOCK bool "Spinlock debugging" depends on DEBUG_KERNEL @@ -1231,6 +1351,15 @@ config DEBUG_PAGEALLOC This results in a large slowdown, but helps to find certain types of memory corruptions. +config SPINLINE + bool "Spinlock inlining" + depends on DEBUG_KERNEL + help + This will change spinlocks from out of line to inline, making them + account cost to the callers in readprofile, rather than the lock + itself (as ".text.lock.filename"). This can be helpful for finding + the callers of locks. + config DEBUG_HIGHMEM bool "Highmem debugging" depends on DEBUG_KERNEL && HIGHMEM @@ -1253,6 +1382,14 @@ config DEBUG_SPINLOCK_SLEEP If you say Y here, various routines which may sleep will become very noisy if they are called with a spinlock held. 
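The GCOV options above are backed by the gcov-proc module added later in this patch (drivers/gcov/gcov-proc.c): any write to /proc/gcov/vmlinux clears all coverage counters, and a read returns the combined trace data for the instrumented kernel. Here is a minimal userspace sketch (not part of the patch) of that reset/measure/dump cycle, assuming the module is loaded; the output file name is arbitrary.

#include <stdio.h>

int main(void)
{
	FILE *in, *out;
	char buf[4096];
	size_t n;

	/* Any write to /proc/gcov/vmlinux resets all coverage counters. */
	out = fopen("/proc/gcov/vmlinux", "w");
	if (!out) {
		perror("/proc/gcov/vmlinux");
		return 1;
	}
	fputs("0\n", out);
	fclose(out);

	/* ... run the workload to be measured here ... */

	/* Reading the same file dumps the aggregate trace data. */
	in = fopen("/proc/gcov/vmlinux", "r");
	out = fopen("vmlinux.trace", "w");
	if (!in || !out) {
		perror("open");
		return 1;
	}
	while ((n = fread(buf, 1, sizeof(buf), in)) > 0)
		fwrite(buf, 1, n, out);
	fclose(in);
	fclose(out);
	return 0;
}

The per-file entries that gcov-proc creates under /proc/gcov/, mirroring the kernel source tree, can be read and reset in the same way.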
+config LOCKMETER + bool "Kernel lock metering" + depends on SMP + help + Say Y to enable kernel lock metering, which adds overhead to SMP + locks, but allows you to see various statistics using the lockstat + command + config FRAME_POINTER bool "Compile the kernel with frame pointers" help diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/Makefile 999-mjb/arch/i386/Makefile --- 000-virgin/arch/i386/Makefile 2003-10-01 11:47:33.000000000 -0700 +++ 999-mjb/arch/i386/Makefile 2003-10-02 16:39:38.000000000 -0700 @@ -98,6 +98,7 @@ drivers-$(CONFIG_PM) += arch/i386/powe CFLAGS += $(mflags-y) AFLAGS += $(mflags-y) +AFLAGS_vmlinux.lds.o += -imacros $(TOPDIR)/include/asm-i386/page.h boot := arch/i386/boot diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/boot/compressed/Makefile 999-mjb/arch/i386/boot/compressed/Makefile --- 000-virgin/arch/i386/boot/compressed/Makefile 2003-03-20 11:25:38.000000000 -0800 +++ 999-mjb/arch/i386/boot/compressed/Makefile 2003-10-02 16:43:03.000000000 -0700 @@ -7,6 +7,7 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o EXTRA_AFLAGS := -traditional +CFLAGS := $(CFLAGS_NOGCOV) LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup_32 $(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/kernel/apic.c 999-mjb/arch/i386/kernel/apic.c --- 000-virgin/arch/i386/kernel/apic.c 2003-10-01 11:46:30.000000000 -0700 +++ 999-mjb/arch/i386/kernel/apic.c 2003-10-02 16:41:02.000000000 -0700 @@ -1017,7 +1017,7 @@ int setup_profiling_timer(unsigned int m * multiplier is 1 and it can be changed by writing the new multiplier * value into /proc/profile. */ - +extern void calc_load_cpu(int cpu); inline void smp_local_timer_interrupt(struct pt_regs * regs) { int cpu = smp_processor_id(); @@ -1045,6 +1045,7 @@ inline void smp_local_timer_interrupt(st #ifdef CONFIG_SMP update_process_times(user_mode(regs)); + calc_load_cpu(cpu); #endif } diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/kernel/entry.S 999-mjb/arch/i386/kernel/entry.S --- 000-virgin/arch/i386/kernel/entry.S 2003-10-01 11:40:40.000000000 -0700 +++ 999-mjb/arch/i386/kernel/entry.S 2003-10-02 16:41:14.000000000 -0700 @@ -829,7 +829,7 @@ ENTRY(sys_call_table) .long sys_getdents64 /* 220 */ .long sys_fcntl64 .long sys_ni_syscall /* reserved for TUX */ - .long sys_ni_syscall + .long sys_mbind .long sys_gettid .long sys_readahead /* 225 */ .long sys_setxattr diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/kernel/head.S 999-mjb/arch/i386/kernel/head.S --- 000-virgin/arch/i386/kernel/head.S 2003-10-01 11:40:40.000000000 -0700 +++ 999-mjb/arch/i386/kernel/head.S 2003-10-02 16:43:03.000000000 -0700 @@ -487,3 +487,24 @@ ENTRY(cpu_gdt_table) .fill (NR_CPUS-1)*GDT_ENTRIES,8,0 /* other CPU's GDT */ #endif +#ifdef CONFIG_GCOV_PROFILE +/* + * The .ctors-section contains a list of pointers to constructor + * functions which are used to initialize gcov structures. + * + * Because there is no NULL at the end of the constructor list + * in the kernel we need the addresses of both the constructor + * as well as the destructor list which are supposed to be + * adjacent. 
+ */ + +.section ".ctors","aw" +.globl __CTOR_LIST__ +.type __CTOR_LIST__,@object +__CTOR_LIST__: +.section ".dtors","aw" +.globl __DTOR_LIST__ +.type __DTOR_LIST__,@object +__DTOR_LIST__: +#endif + diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/kernel/io_apic.c 999-mjb/arch/i386/kernel/io_apic.c --- 000-virgin/arch/i386/kernel/io_apic.c 2003-10-01 11:47:33.000000000 -0700 +++ 999-mjb/arch/i386/kernel/io_apic.c 2003-10-02 16:40:46.000000000 -0700 @@ -272,7 +272,7 @@ static void set_ioapic_affinity(unsigned spin_unlock_irqrestore(&ioapic_lock, flags); } -#if defined(CONFIG_SMP) +#if defined(CONFIG_IRQBALANCE) # include /* kernel_thread() */ # include /* kstat */ # include /* kmalloc() */ @@ -670,8 +670,6 @@ static int __init irqbalance_disable(cha __setup("noirqbalance", irqbalance_disable); -static void set_ioapic_affinity(unsigned int irq, cpumask_t mask); - static inline void move_irq(int irq) { /* note - we hold the desc->lock */ @@ -683,9 +681,11 @@ static inline void move_irq(int irq) __initcall(balanced_irq_init); -#else /* !SMP */ +#else /* !CONFIG_IRQBALANCE */ static inline void move_irq(int irq) { } +#endif /* CONFIG_IRQBALANCE */ +#ifndef CONFIG_SMP void send_IPI_self(int vector) { unsigned int cfg; @@ -700,7 +700,7 @@ void send_IPI_self(int vector) */ apic_write_around(APIC_ICR, cfg); } -#endif /* defined(CONFIG_SMP) */ +#endif /* !CONFIG_SMP */ /* diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/kernel/vmlinux.lds.S 999-mjb/arch/i386/kernel/vmlinux.lds.S --- 000-virgin/arch/i386/kernel/vmlinux.lds.S 2003-10-01 11:40:41.000000000 -0700 +++ 999-mjb/arch/i386/kernel/vmlinux.lds.S 2003-10-02 16:39:38.000000000 -0700 @@ -10,7 +10,7 @@ ENTRY(startup_32) jiffies = jiffies_64; SECTIONS { - . = 0xC0000000 + 0x100000; + . 
= __PAGE_OFFSET + 0x100000; /* read-only */ _text = .; /* Text and read-only data */ .text : { diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/lib/dec_and_lock.c 999-mjb/arch/i386/lib/dec_and_lock.c --- 000-virgin/arch/i386/lib/dec_and_lock.c 2002-12-09 18:45:50.000000000 -0800 +++ 999-mjb/arch/i386/lib/dec_and_lock.c 2003-10-02 16:39:44.000000000 -0700 @@ -10,6 +10,7 @@ #include #include +#ifndef ATOMIC_DEC_AND_LOCK int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) { int counter; @@ -38,3 +39,5 @@ slow_path: spin_unlock(lock); return 0; } +#endif + diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/mm/hugetlbpage.c 999-mjb/arch/i386/mm/hugetlbpage.c --- 000-virgin/arch/i386/mm/hugetlbpage.c 2003-10-01 11:47:33.000000000 -0700 +++ 999-mjb/arch/i386/mm/hugetlbpage.c 2003-10-02 16:42:17.000000000 -0700 @@ -61,6 +61,27 @@ static struct page *alloc_fresh_huge_pag void free_huge_page(struct page *page); +#ifdef CONFIG_NUMA + +static inline void huge_inc_rss(struct mm_struct *mm, struct page *page) +{ + mm->rss += (HPAGE_SIZE / PAGE_SIZE); + mm->pernode_rss[page_to_nid(page)] += (HPAGE_SIZE / PAGE_SIZE); +} + +static inline void huge_dec_rss(struct mm_struct *mm, struct page *page) +{ + mm->rss -= (HPAGE_SIZE / PAGE_SIZE); + mm->pernode_rss[page_to_nid(page)] -= (HPAGE_SIZE / PAGE_SIZE); +} + +#else /* !CONFIG_NUMA */ + +#define huge_inc_rss(mm, page) ((mm)->rss += (HPAGE_SIZE / PAGE_SIZE)) +#define huge_dec_rss(mm, page) ((mm)->rss -= (HPAGE_SIZE / PAGE_SIZE)) + +#endif /* CONFIG_NUMA */ + static struct page *alloc_hugetlb_page(void) { int i; @@ -105,7 +126,7 @@ static void set_huge_pte(struct mm_struc { pte_t entry; - mm->rss += (HPAGE_SIZE / PAGE_SIZE); + huge_inc_rss(mm, page); if (write_access) { entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); @@ -145,7 +166,7 @@ int copy_hugetlb_page_range(struct mm_st ptepage = pte_page(entry); get_page(ptepage); set_pte(dst_pte, entry); - dst->rss += (HPAGE_SIZE / PAGE_SIZE); + huge_inc_rss(dst, ptepage); addr += HPAGE_SIZE; } return 0; @@ -314,8 +335,8 @@ void unmap_hugepage_range(struct vm_area page = pte_page(*pte); huge_page_release(page); pte_clear(pte); + huge_dec_rss(mm, page); } - mm->rss -= (end - start) >> PAGE_SHIFT; flush_tlb_range(vma, start, end); } diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/ppc/Kconfig 999-mjb/arch/ppc/Kconfig --- 000-virgin/arch/ppc/Kconfig 2003-10-01 11:47:34.000000000 -0700 +++ 999-mjb/arch/ppc/Kconfig 2003-10-02 16:43:03.000000000 -0700 @@ -1288,6 +1288,36 @@ source "drivers/usb/Kconfig" source "lib/Kconfig" +menu "GCOV coverage profiling" + +config GCOV_PROFILE + bool "GCOV coverage profiling" + ---help--- + Provide infrastructure for coverage support for the kernel. This + will not compile the kernel by default with the necessary flags. + To obtain coverage information for the entire kernel, one should + enable the subsequent option (Profile entire kernel). If only + particular files or directories of the kernel are desired, then + one must provide the following compile options for such targets: + "-fprofile-arcs -ftest-coverage" in the CFLAGS. To obtain + access to the coverage data one must insmod the gcov-prof kernel + module. + +config GCOV_ALL + bool "GCOV_ALL" + depends on GCOV_PROFILE + ---help--- + If you say Y here, it will compile the entire kernel with coverage + option enabled. 
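The head.S hunks in this patch (i386 above, ppc/ppc64/x86_64 below) add empty __CTOR_LIST__ and __DTOR_LIST__ markers, and gcov-proc.c later calls an external do_global_ctors() to run the constructors that register each object file's coverage data. That implementation is not part of this section; the sketch below is only an illustration, with a made-up function name, of the walk the comments describe: because the kernel's .ctors section has no terminating NULL, the start of .dtors serves as the end marker, which is why both symbols are defined.

typedef void (*ctor_fn)(void);

extern ctor_fn __CTOR_LIST__[];	/* start of .ctors, from head.S */
extern ctor_fn __DTOR_LIST__[];	/* start of .dtors, used as the end marker */

static void run_kernel_ctors(void)	/* illustrative name, not the patch's do_global_ctors() */
{
	ctor_fn *fn;

	for (fn = __CTOR_LIST__; fn < __DTOR_LIST__; fn++)
		(*fn)();	/* each constructor registers one object file's bb data */
}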
+ +config GCOV_PROC + tristate "gcov-proc module" + depends on GCOV_PROFILE && PROC_FS + ---help--- + This is the gcov-proc module that exposes gcov data through the + /proc filesystem + +endmenu menu "Kernel hacking" diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/ppc/boot/openfirmware/common.c 999-mjb/arch/ppc/boot/openfirmware/common.c --- 000-virgin/arch/ppc/boot/openfirmware/common.c 2002-12-09 18:46:16.000000000 -0800 +++ 999-mjb/arch/ppc/boot/openfirmware/common.c 2003-10-02 16:43:03.000000000 -0700 @@ -30,6 +30,10 @@ struct memchunk { static struct memchunk *freechunks; +#ifdef CONFIG_GCOV_PROFILE +void __bb_init_func (void *ptr /* struct bb *blocks */) { } +#endif + static void *zalloc(void *x, unsigned items, unsigned size) { void *p; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/ppc/boot/prep/misc.c 999-mjb/arch/ppc/boot/prep/misc.c --- 000-virgin/arch/ppc/boot/prep/misc.c 2003-01-13 16:04:55.000000000 -0800 +++ 999-mjb/arch/ppc/boot/prep/misc.c 2003-10-02 16:43:03.000000000 -0700 @@ -71,6 +71,10 @@ extern unsigned long serial_init(int cha extern void serial_fixups(void); extern unsigned long get_mem_size(void); +#ifdef CONFIG_GCOV_PROFILE +void __bb_init_func (void *ptr /* struct bb *blocks */) { } +#endif + void writel(unsigned int val, unsigned int address) { diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/ppc/kernel/Makefile 999-mjb/arch/ppc/kernel/Makefile --- 000-virgin/arch/ppc/kernel/Makefile 2003-10-01 11:47:36.000000000 -0700 +++ 999-mjb/arch/ppc/kernel/Makefile 2003-10-02 16:43:03.000000000 -0700 @@ -18,8 +18,8 @@ extra-$(CONFIG_6xx) += idle_6xx.o extra-$(CONFIG_POWER4) += idle_power4.o extra-y += vmlinux.lds.s -obj-y := entry.o traps.o irq.o idle.o time.o misc.o \ - process.o signal.o ptrace.o align.o \ +obj-y := entry.o ptrace.o traps.o irq.o idle.o time.o misc.o \ + process.o signal.o align.o \ semaphore.o syscalls.o setup.o \ cputable.o ppc_htab.o obj-$(CONFIG_6xx) += l2cr.o cpu_setup_6xx.o diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/ppc/kernel/entry.S 999-mjb/arch/ppc/kernel/entry.S --- 000-virgin/arch/ppc/kernel/entry.S 2003-10-01 11:47:36.000000000 -0700 +++ 999-mjb/arch/ppc/kernel/entry.S 2003-10-02 16:43:03.000000000 -0700 @@ -106,10 +106,26 @@ transfer_to_handler: mfspr r11,SPRN_HID0 mtcr r11 BEGIN_FTR_SECTION +#ifdef CONFIG_GCOV_PROFILE + bt- 8,near1_power_save_6xx_restore /* Check DOZE */ + b skip1_power_save_6xx_restore +near1_power_save_6xx_restore: + b power_save_6xx_restore +skip1_power_save_6xx_restore: +#else bt- 8,power_save_6xx_restore /* Check DOZE */ +#endif END_FTR_SECTION_IFSET(CPU_FTR_CAN_DOZE) BEGIN_FTR_SECTION +#ifdef CONFIG_GCOV_PROFILE + bt- 9,near2_power_save_6xx_restore /* Check NAP */ + b skip2_power_save_6xx_restore +near2_power_save_6xx_restore: + b power_save_6xx_restore +skip2_power_save_6xx_restore: +#else bt- 9,power_save_6xx_restore /* Check NAP */ +#endif END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) #endif /* CONFIG_6xx */ .globl transfer_to_handler_cont diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/ppc/kernel/head.S 999-mjb/arch/ppc/kernel/head.S --- 000-virgin/arch/ppc/kernel/head.S 2003-10-01 11:47:36.000000000 -0700 +++ 999-mjb/arch/ppc/kernel/head.S 2003-10-02 16:43:03.000000000 -0700 @@ -1742,3 +1742,25 @@ intercept_table: */ abatron_pteptrs: .space 8 + +#ifdef CONFIG_GCOV_PROFILE +/* + * The .ctors-section contains a list of pointers to constructor + * functions which are used to initialize gcov structures. 
+ * + * Because there is no NULL at the end of the constructor list + * in the kernel we need the addresses of both the constructor + * as well as the destructor list which are supposed to be + * adjacent. + */ + +.section ".ctors","aw" +.globl __CTOR_LIST__ +.type __CTOR_LIST__,@object +__CTOR_LIST__: +.section ".dtors","aw" +.globl __DTOR_LIST__ +.type __DTOR_LIST__,@object +__DTOR_LIST__: +#endif + diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/ppc/syslib/prom_init.c 999-mjb/arch/ppc/syslib/prom_init.c --- 000-virgin/arch/ppc/syslib/prom_init.c 2003-10-01 11:47:37.000000000 -0700 +++ 999-mjb/arch/ppc/syslib/prom_init.c 2003-10-02 16:43:03.000000000 -0700 @@ -737,7 +737,11 @@ prom_instantiate_rtas(void) * Actually OF has bugs so we just arbitrarily * use memory at the 6MB point. */ +#ifdef CONFIG_GCOV_PROFILE + rtas_data = 0x990000; +#else rtas_data = 6 << 20; +#endif prom_print(" at "); prom_print_hex(rtas_data); } diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/ppc64/Kconfig 999-mjb/arch/ppc64/Kconfig --- 000-virgin/arch/ppc64/Kconfig 2003-10-01 11:47:38.000000000 -0700 +++ 999-mjb/arch/ppc64/Kconfig 2003-10-02 16:43:03.000000000 -0700 @@ -323,6 +323,37 @@ config VIOPATH source "arch/ppc64/oprofile/Kconfig" +menu "GCOV coverage profiling" + +config GCOV_PROFILE + bool "GCOV coverage profiling" + ---help--- + Provide infrastructure for coverage support for the kernel. This + will not compile the kernel by default with the necessary flags. + To obtain coverage information for the entire kernel, one should + enable the subsequent option (Profile entire kernel). If only + particular files or directories of the kernel are desired, then + one must provide the following compile options for such targets: + "-fprofile-arcs -ftest-coverage" in the CFLAGS. To obtain + access to the coverage data one must insmod the gcov-prof kernel + module. + +config GCOV_ALL + bool "GCOV_ALL" + depends on GCOV_PROFILE + ---help--- + If you say Y here, it will compile the entire kernel with coverage + option enabled. + +config GCOV_PROC + tristate "gcov-proc module" + depends on GCOV_PROFILE && PROC_FS + ---help--- + This is the gcov-proc module that exposes gcov data through the + /proc filesystem + +endmenu + menu "Kernel hacking" config DEBUG_KERNEL diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/ppc64/kernel/head.S 999-mjb/arch/ppc64/kernel/head.S --- 000-virgin/arch/ppc64/kernel/head.S 2003-10-01 11:47:38.000000000 -0700 +++ 999-mjb/arch/ppc64/kernel/head.S 2003-10-02 16:43:03.000000000 -0700 @@ -1926,3 +1926,24 @@ stab_array: .globl cmd_line cmd_line: .space 512 + +#ifdef CONFIG_GCOV_PROFILE +/* + * The .ctors-section contains a list of pointers to constructor + * functions which are used to initialize gcov structures. + * + * Because there is no NULL at the end of the constructor list + * in the kernel we need the addresses of both the constructor + * as well as the destructor list which are supposed to be + * adjacent. 
+ */ + +.section ".ctors","aw" +.globl __CTOR_LIST__ +.type __CTOR_LIST__,@object +__CTOR_LIST__: +.section ".dtors","aw" +.globl __DTOR_LIST__ +.type __DTOR_LIST__,@object +__DTOR_LIST__: +#endif diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/sparc64/kernel/devices.c 999-mjb/arch/sparc64/kernel/devices.c --- 000-virgin/arch/sparc64/kernel/devices.c 2003-10-01 11:40:45.000000000 -0700 +++ 999-mjb/arch/sparc64/kernel/devices.c 2003-10-02 16:39:44.000000000 -0700 @@ -117,6 +117,8 @@ int cpu_find_by_mid(int mid, int *prom_n prom_node, NULL); } +unsigned long cpu_hz; + void __init device_scan(void) { /* FIX ME FAST... -DaveM */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/sparc64/lib/rwlock.S 999-mjb/arch/sparc64/lib/rwlock.S --- 000-virgin/arch/sparc64/lib/rwlock.S 2002-12-09 18:45:55.000000000 -0800 +++ 999-mjb/arch/sparc64/lib/rwlock.S 2003-10-02 16:39:44.000000000 -0700 @@ -63,5 +63,33 @@ __write_lock: /* %o0 = lock_ptr */ be,pt %icc, 99b membar #StoreLoad | #StoreStore ba,a,pt %xcc, 1b + + .globl __read_trylock +__read_trylock: /* %o0 = lock_ptr */ + ldsw [%o0], %g5 + brlz,pn %g5, 100f + add %g5, 1, %g7 + cas [%o0], %g5, %g7 + cmp %g5, %g7 + bne,pn %icc, __read_trylock + membar #StoreLoad | #StoreStore + retl + mov 1, %o0 + + .globl __write_trylock +__write_trylock: /* %o0 = lock_ptr */ + sethi %hi(0x80000000), %g2 +1: lduw [%o0], %g5 +4: brnz,pn %g5, 100f + or %g5, %g2, %g7 + cas [%o0], %g5, %g7 + cmp %g5, %g7 + bne,pn %icc, 1b + membar #StoreLoad | #StoreStore + retl + mov 1, %o0 +100: retl + mov 0, %o0 + rwlock_impl_end: diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/x86_64/Kconfig 999-mjb/arch/x86_64/Kconfig --- 000-virgin/arch/x86_64/Kconfig 2003-10-01 11:47:39.000000000 -0700 +++ 999-mjb/arch/x86_64/Kconfig 2003-10-02 16:43:03.000000000 -0700 @@ -435,6 +435,37 @@ source "drivers/usb/Kconfig" source "arch/x86_64/oprofile/Kconfig" +menu "GCOV coverage profiling" + +config GCOV_PROFILE + bool "GCOV coverage profiling" + ---help--- + Provide infrastructure for coverage support for the kernel. This + will not compile the kernel by default with the necessary flags. + To obtain coverage information for the entire kernel, one should + enable the subsequent option (Profile entire kernel). If only + particular files or directories of the kernel are desired, then + one must provide the following compile options for such targets: + "-fprofile-arcs -ftest-coverage" in the CFLAGS. To obtain + access to the coverage data one must insmod the gcov-prof kernel + module. + +config GCOV_ALL + bool "GCOV_ALL" + depends on GCOV_PROFILE + ---help--- + If you say Y here, it will compile the entire kernel with coverage + option enabled. + +config GCOV_PROC + tristate "gcov-proc module" + depends on GCOV_PROFILE && PROC_FS + ---help--- + This is the gcov-proc module that exposes gcov data through the + /proc filesystem + +endmenu + menu "Kernel hacking" config DEBUG_KERNEL diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/x86_64/kernel/head.S 999-mjb/arch/x86_64/kernel/head.S --- 000-virgin/arch/x86_64/kernel/head.S 2003-10-01 11:34:39.000000000 -0700 +++ 999-mjb/arch/x86_64/kernel/head.S 2003-10-02 16:43:03.000000000 -0700 @@ -383,3 +383,23 @@ ENTRY(idt_table) .quad 0 .endr +#ifdef CONFIG_GCOV_PROFILE +/* + * The .ctors-section contains a list of pointers to constructor + * functions which are used to initialize gcov structures. 
+ * + * Because there is no NULL at the end of the constructor list + * in the kernel we need the addresses of both the constructor + * as well as the destructor list which are supposed to be + * adjacent. + */ + +.section ".ctors","aw" +.globl __CTOR_LIST__ +.type __CTOR_LIST__,@object +__CTOR_LIST__: +.section ".dtors","aw" +.globl __DTOR_LIST__ +.type __DTOR_LIST__,@object +__DTOR_LIST__: +#endif diff -purN -X /home/mbligh/.diff.exclude 000-virgin/drivers/Makefile 999-mjb/drivers/Makefile --- 000-virgin/drivers/Makefile 2003-10-01 11:46:32.000000000 -0700 +++ 999-mjb/drivers/Makefile 2003-10-02 16:43:03.000000000 -0700 @@ -49,3 +49,4 @@ obj-$(CONFIG_ISDN_BOOL) += isdn/ obj-$(CONFIG_MCA) += mca/ obj-$(CONFIG_EISA) += eisa/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ +obj-$(CONFIG_GCOV_PROC) += gcov/ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/drivers/gcov/Makefile 999-mjb/drivers/gcov/Makefile --- 000-virgin/drivers/gcov/Makefile 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/drivers/gcov/Makefile 2003-10-02 16:43:03.000000000 -0700 @@ -0,0 +1,8 @@ +# +# Makefile for GCOV profiling kernel module +# + +obj-$(CONFIG_GCOV_PROC) += gcov-proc.o + +$(obj)/gcov-proc.o: $(obj)/gcov-proc.c + diff -purN -X /home/mbligh/.diff.exclude 000-virgin/drivers/gcov/gcov-proc.c 999-mjb/drivers/gcov/gcov-proc.c --- 000-virgin/drivers/gcov/gcov-proc.c 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/drivers/gcov/gcov-proc.c 2003-10-02 16:43:03.000000000 -0700 @@ -0,0 +1,713 @@ +/* + * This kernel module provides access to coverage data produced by + * an instrumented kernel via an entry in the proc file system + * at /proc/gcov/. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (c) International Business Machines Corp., 2002 + * + * Author: Hubertus Franke + * Rajan Ravindran + * + * Bugfixes by Peter.Oberparleiter@de.ibm.com: + * Changes by Paul Larson + * Automatically detect gcc version for gcov_type + * + */ + +#include +#include +#include + +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +#define GCOV_PROF_PROC "gcov" + +static DECLARE_MUTEX_LOCKED(gcov_lock); +#define DOWN() down(&gcov_lock); +#define UP() up(&gcov_lock); +#define PAD8(x) ((x + 7) & ~7) + +//#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,4)) +//static inline struct proc_dir_entry *PDE(const struct inode *inode) +//{ +// return ((struct proc_dir_entry *) inode->u.generic_ip); +//} +//#endif + +/* ################################################################### + # NOTICE ########################################################## + ################################################################### + + GCOV_TYPE defines the count type used by the instrumentation code. + Kernels compiled with a gcc version prior to 3.1 should use LONG, + otherwise LONG LONG. 
*/ + +#if __GNUC__ >= 3 && __GNUC_MINOR__ >= 1 +typedef long long gcov_type; +#else +typedef long gcov_type; +#endif + + +struct bb +{ + long zero_word; + const char *filename; + gcov_type *counts; + long ncounts; + struct bb *next; + const unsigned long *addresses; + + /* Older GCC's did not emit these fields. */ + long nwords; + const char **functions; + const long *line_nums; + const char **filenames; + char *flags; +}; + +extern struct bb *bb_head; +static struct file_operations proc_gcov_operations; +extern char *gcov_kernelpath; +extern void (*gcov_callback)(int cmd, struct bb *); +extern void do_global_ctors(char *, char *, struct module *, int); + +static int create_bb_links = 1; +static int kernel_path_len; + +int debug = 0; +#define PPRINTK(x) do { if (debug) { printk x ; } } while (0) + +struct gcov_ftree_node +{ + int isdir; /* directory or file */ + char *fname; /* only the name within the hierachy */ + struct gcov_ftree_node *sibling; /* sibling of tree */ + struct gcov_ftree_node *files; /* children of tree */ + struct gcov_ftree_node *parent; /* parent of current gcov_ftree_node */ + struct proc_dir_entry *proc[4]; + struct bb *bb; + /* below only valid for leaf nodes == files */ + unsigned long offset; /* offset in global file */ + struct gcov_ftree_node *next; /* next leave node */ +}; + +static struct proc_dir_entry *proc_vmlinux = NULL; +static struct gcov_ftree_node *leave_nodes = NULL; +static struct gcov_ftree_node *dumpall_cached_node = NULL; +static struct gcov_ftree_node tree_root = + { 1, GCOV_PROF_PROC, NULL, NULL, NULL, + { NULL, NULL, NULL, NULL} , NULL, 0,NULL }; +static char *endings[3] = { ".bb", ".bbg", ".c" }; + + +/* Calculate the header size of an entry in the vmlinux-tracefile which + contains the collection of trace data of all instrumented kernel objects. + + An entry header is defined as: + 0: length of filename of the respective .da file padded to 8 bytes + 8: filename padded to 8 bytes + + */ + +static inline unsigned long +hdr_ofs (struct gcov_ftree_node *tptr) +{ + return 8 + PAD8(strlen (tptr->bb->filename) + 1); +} + + +/* Calculate the total size of an entry in the vmlinux-tracefile. + An entry consists of the header, an 8 byte word for the number + of counts in this entry and the actual array of 8 byte counts. */ + +static inline unsigned long +dump_size(struct gcov_ftree_node *tptr) +{ + return (hdr_ofs(tptr) + (tptr->bb->ncounts+1)*8); +} + + +/* Store a portable representation of VALUE in DEST using BYTES*8-1 bits. + Return a non-zero value if VALUE requires more than BYTES*8-1 bits + to store (this is adapted code from gcc/gcov-io.h). */ + +static int +store_gcov_type (gcov_type value, void *buf, int offset, int len) +{ + const size_t bytes = 8; + char dest[10]; + int upper_bit = (value < 0 ? 128 : 0); + size_t i; + + if (value < 0) { + gcov_type oldvalue = value; + value = -value; + if (oldvalue != -value) + return 1; + } + + for(i = 0 ; + i < (sizeof (value) < bytes ? sizeof (value) : bytes) ; + i++) { + dest[i] = value & (i == (bytes - 1) ? 127 : 255); + value = value / 256; + } + + if (value && value != -1) + return 1; + + for(; i < bytes ; i++) + dest[i] = 0; + dest[bytes - 1] |= upper_bit; + copy_to_user(buf,&dest[offset],len); + return 0; +} + + +/* Create a directory entry in the proc file system and fill in + the respective fields in the provided tree node. Return a + non-zero value on error. 
*/ + +int +create_dir_proc (struct gcov_ftree_node *bt, char *fname) +{ + bt->proc[0] = proc_mkdir(fname, bt->parent->proc[0]); + bt->proc[1] = bt->proc[2] = bt->proc[3] = NULL; + return (bt->proc[0] == NULL); +} + + +/* Replace file ending in with . Return a new + string containing the new filename or NULL on error. */ + +static +char* replace_ending (const char *fname,char *end, char *newend) +{ + char *newfname; + char *cptr = strstr(fname,end); + int len; + if (cptr == NULL) + return NULL; + len = cptr - fname; + newfname = (char*)kmalloc(len+strlen(newend)+1,GFP_KERNEL); + if (newfname == NULL) + return NULL; + memcpy(newfname,fname,len); + strcpy(newfname+len,newend); + return newfname; +} + + +/* Create a file entry in the proc file system and update the respective + fields on the tree node. Optionally try to create links to the + source, .bb and .bbg files. Return a non-zero value on error. */ + +int +create_file_proc (struct gcov_ftree_node *bt, struct bb *bptr, char *fname, + const char *fullname) +{ + bt->proc[0] = create_proc_entry(fname, S_IWUSR | S_IRUGO, + bt->parent->proc[0]); + if (!bt->proc[0]) { + PPRINTK(("error creating file proc <%s>\n", fname)); + return 1; + } + + bt->proc[0]->proc_fops = &proc_gcov_operations; + bt->proc[0]->size = 8 + (8 * bptr->ncounts); + + if (create_bb_links) { + int i; + for (i=0;i<3;i++) { + char *newfname; + char *newfullname; + newfname = replace_ending(fname,".da",endings[i]); + newfullname = replace_ending(fullname,".da",endings[i]); + if ((newfname) && (newfullname)) { + bt->proc[i+1] = proc_symlink(newfname,bt->parent->proc[0],newfullname); + } + if (newfname) kfree(newfname); + if (newfullname) kfree(newfullname); + } + } else { + bt->proc[1] = bt->proc[2] = bt->proc[3] = NULL; + } + return 0; +} + + +/* Recursively check and if necessary create the file specified by + and all its path components, both in the proc file-system as + well as in the internal tree structure. */ + +void +check_proc_fs(const char *fullname, struct gcov_ftree_node *parent, + char *name, struct bb *bbptr) +{ + char dirname[128]; + char *localname = name; + char *tname; + int isdir; + struct gcov_ftree_node *tptr; + + tname = strstr(name, "/"); + if ((isdir = (tname != NULL))) { + memcpy(dirname,name,tname-name); + dirname[tname-name] = '\0'; + localname = dirname; + } + + /* search the list of files in gcov_ftree_node and + * see whether file already exists in this directory level */ + for ( tptr = parent->files ; tptr ; tptr = tptr->sibling) { + if (!strcmp(tptr->fname,localname)) + break; + } + if (!tptr) { + /* no entry yet */ + tptr = (struct gcov_ftree_node*) + kmalloc(sizeof(struct gcov_ftree_node),GFP_KERNEL); + tptr->parent = parent; + + if (!isdir) { + if (create_file_proc(tptr, bbptr, localname,fullname)) { + kfree(tptr); + return; + } + tptr->bb = bbptr; + tptr->proc[0]->data = tptr; + tptr->next = leave_nodes; + leave_nodes = tptr; + } else { + int len = strlen(dirname)+1; + localname = (char*)kmalloc(len,GFP_KERNEL); + strncpy(localname,dirname,len); + if (create_dir_proc(tptr,localname)) { + kfree(tptr); + kfree(localname); + return; + } + tptr->bb = NULL; + tptr->proc[0]->data = NULL; + tptr->next = NULL; + } + tptr->isdir = isdir; + tptr->fname = localname; + tptr->files = NULL; + tptr->sibling = parent->files; + parent->files = tptr; + } + if (isdir) + check_proc_fs(fullname,tptr,tname+1,bbptr); +} + + +/* Read out tracefile data to user space. Return the number of bytes + read. 
*/ + +static ssize_t +read_gcov(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + unsigned long p = *ppos; + ssize_t read; + gcov_type ncnt; + struct bb *bbptr; + gcov_type slen; + gcov_type *wptr; + struct gcov_ftree_node *treeptr; + struct proc_dir_entry * de; + int dumpall; + unsigned int hdrofs; + unsigned long poffs; + + DOWN(); + + read = 0; + hdrofs = 0; + poffs = 0; + de = PDE(file->f_dentry->d_inode); + + /* Check whether this is a request to /proc/gcov/vmlinux in + which case we should dump the complete tracefile. */ + dumpall = (de == proc_vmlinux); + + + /* Have treeptr point to the tree node to be dumped. */ + + if (!dumpall) + treeptr = (struct gcov_ftree_node*) (de ? de->data : NULL); + else { + /* dumpall_cached_node will speed up things in case + of a sequential read. */ + if (dumpall_cached_node && (p >= dumpall_cached_node->offset)) { + treeptr = dumpall_cached_node; + } + else + treeptr = leave_nodes; + + /* Search the tree node that covers the requested + tracefile offset. */ + while (treeptr) { + struct gcov_ftree_node *next = treeptr->next; + if ((next == NULL) || (p < next->offset)) { + hdrofs = hdr_ofs(treeptr); + poffs = treeptr->offset; + break; + } + treeptr = next; + } + dumpall_cached_node = treeptr; + } + + bbptr = treeptr ? treeptr->bb : NULL; + + if (bbptr == NULL) + goto out; + + ncnt = (gcov_type) bbptr->ncounts; + p -= poffs; + + do { + if (p < hdrofs) { + /* User wants to read parts of the header. */ + + slen = PAD8(strlen(treeptr->bb->filename)+1); + + if (p >= 8) { + /* Read filename */ + if (slen > (gcov_type) count) slen = count; + copy_to_user (buf, &treeptr->bb->filename[p-8], + slen); + count-=slen;buf+= slen;read+=slen;p+=slen; + continue; + } + wptr = &slen; + } + else if (p < (hdrofs + 8)) { + /* User wants to read the number of counts in this + entry. */ + + wptr = &ncnt; + } + else if (p < (hdrofs) + (unsigned long) (ncnt+1)*8) { + /* User wants to read actual counters */ + + wptr = &bbptr->counts[((p-hdrofs)/8)-1]; + } + else + break; + + /* do we have to write partial word */ + + if ((count < 8) || (p & 0x7)) { + /* partial write */ + unsigned long offset = p & 0x7; + unsigned long length = (count+offset)<8?count:(8-offset); + + store_gcov_type(*wptr,buf, offset, length); + buf+=length;p+=length;count-=length;read+=length; + break; + } else { + store_gcov_type(*wptr,buf, 0, 8); + buf+=8;p+=8;count-=8;read+=8; + } + } while (count > 0); + *ppos = p + poffs; +out: + UP(); + return read; +} + + +/* A write to any of our proc file-system entries is interpreted + as a request to reset the data from that node. 
*/ + +static ssize_t +write_gcov(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + struct bb *ptr; + struct proc_dir_entry * de; + int resetall, i; + struct gcov_ftree_node *tptr; + + DOWN(); + + de = PDE(file->f_dentry->d_inode); + + if (de == NULL) { + count = 0; + goto out; + } + + /* Check for a write to /proc/gcov/vmlinux */ + resetall = (de == proc_vmlinux); + + if (resetall) { + /* Reset all nodes */ + for (ptr = bb_head; ptr != (struct bb *) 0; ptr = ptr->next) + { + int i; + if (ptr->counts == NULL) continue; + for (i = 0; i < ptr->ncounts; i++) + ptr->counts[i]=0; + } + } else { + /* Reset a single node */ + tptr = (struct gcov_ftree_node*)(de->data); + if (tptr == NULL) + goto out; + ptr = tptr->bb; + if (ptr->ncounts != 0) { + for (i = 0; i < ptr->ncounts; i++) + ptr->counts[i]=0; + } + } +out: + UP(); + return count; +} + + +/* This struct identifies the functions to be used for proc file-system + interaction. */ + +static struct file_operations proc_gcov_operations = { + read: read_gcov, + write: write_gcov +}; + + +/* Recursively remove a node and all its children from the internal + data tree and from the proc file-system. */ + +void +cleanup_node(struct gcov_ftree_node *node, int delname, int del_in_parent) +{ + struct gcov_ftree_node *next,*tptr; + struct proc_dir_entry *par_proc; + + PPRINTK(("parent n:%p p:%p f:%p s:%p <%s>\n", node, + node->parent, node->files, node->sibling, node->fname)); + if ((tptr = node->parent)) { + if (del_in_parent) { + /* Remove node from parent's list of children */ + struct gcov_ftree_node *cptr,*prev_cptr; + for ( prev_cptr = NULL, cptr = tptr->files; cptr && (cptr != node); + prev_cptr = cptr, cptr = cptr->sibling); + if (prev_cptr == NULL) + tptr->files = cptr->sibling; + else + prev_cptr->sibling = cptr->sibling; + } + par_proc = (struct proc_dir_entry*)(tptr->proc[0]); + } else + par_proc = &proc_root; + + if (node->isdir) { + /* In case of a directory, clean up all child nodes. */ + next = node->files; + node->files = NULL; + for (tptr = next ; tptr; ) { + next = tptr->sibling; + cleanup_node(tptr,1,0); + tptr = next; + } + remove_proc_entry(node->fname, par_proc); + if (delname) kfree(node->fname); + } else { + /* Remove file entry and optional links. */ + remove_proc_entry(node->fname, par_proc); + if (create_bb_links) { + int i; + for (i=0;i<3;i++) { + char *newfname; + if (node->proc[i+1] == NULL) continue; + newfname = replace_ending(node->fname,".da",endings[i]); + if (newfname) { + PPRINTK(("remove_proc_entry <%s>\n", node->fname)); + remove_proc_entry(newfname, par_proc); + kfree(newfname); + } + } + } + } + /* free the data */ + if (node != &tree_root) + kfree(node); +} + + +/* Create a tree node for the given bb struct and initiate the + creation of a corresponding proc file-system entry. */ + +static void +create_node_tree(struct bb *bbptr) +{ + const char *tmp; + const char *filename = bbptr->filename; + char *modname; + int len; + + PPRINTK(("kernelpath <%s> <%s>\n", gcov_kernelpath, filename)); + + /* Check whether this is a file located in the kernel source + directory. */ + if (!strncmp (filename, gcov_kernelpath, kernel_path_len)) + { + /* Remove kernel path and create relative proc-file-system + entry. */ + tmp = filename + kernel_path_len+1; + if (*tmp == '0') return; + check_proc_fs(filename, &tree_root, (char*)tmp, bbptr); + } + else { + /* Insert entry to module sub-directory. 
*/ + len = strlen(filename); + modname = (char *)kmalloc (len + 7, GFP_KERNEL); + strcpy(modname, "module"); + strcat (modname, filename); + check_proc_fs(filename, &tree_root, modname, bbptr); + } +} + + +/* This function will be used as gcov_callback, i.e. it is + called from constructor and destructor code of all instrumented + object files. It updates the local tree structure and the proc + file-system entries. */ + +static void +gcov_cleanup(int cmd, struct bb *bbptr) +{ + unsigned long offset = 0; + struct gcov_ftree_node *tptr; + struct gcov_ftree_node *parent; + struct gcov_ftree_node *prev_cptr; + + DOWN(); + switch (cmd) { + case 0: + /* remove leave node */ + prev_cptr = NULL; + for (tptr = leave_nodes; tptr ; prev_cptr = tptr, tptr = tptr->next) { + if (tptr->bb == bbptr) break; + } + if (!tptr) { + PPRINTK(("Can't find module in /proc/gcov\n")); + UP(); + return; + } + if (prev_cptr) + prev_cptr->next = tptr->next; + else + leave_nodes = tptr->next; + dumpall_cached_node = NULL; + + + /* Find highest level node without further siblings */ + + parent = tptr->parent; + do { + if (parent->files->sibling != NULL) break; + tptr = parent; + parent = parent->parent; + } while (parent); + cleanup_node(tptr,0,1); + + /* Update the offsets at which a certain node can + be found in the tracefile. */ + for (tptr = leave_nodes; tptr; tptr = tptr->next) { + tptr->offset = offset; + offset += dump_size(tptr); + } + break; + + case 1: + /* insert node */ + create_node_tree(bbptr); + + /* Update the offsets at which a certain node can + be found in the tracefile. */ + for (tptr = leave_nodes; tptr; tptr = tptr->next) { + tptr->offset = offset; + offset += dump_size(tptr); + } + + break; + } + UP(); +} + + +/* Initialize the data structure by calling the constructor code + of all instrumented object files and creating the proc + file-system entries. */ + +int +init_module(void) +{ + struct bb *bbptr; + unsigned long offset = 0; + struct gcov_ftree_node *tptr; + + PPRINTK(("init module <%s>\n\n", GCOV_PROF_PROC)); + + do_global_ctors(NULL, NULL, NULL, 0); + + tree_root.proc[0] = proc_mkdir(GCOV_PROF_PROC, 0); + kernel_path_len = strlen(gcov_kernelpath); + + for (bbptr = bb_head; bbptr ; bbptr = bbptr->next) { + create_node_tree(bbptr); + } + + /* Fill in the offset at which a certain node can + be found in the tracefile. */ + for (tptr = leave_nodes; tptr; tptr = tptr->next) { + tptr->offset = offset; + offset += dump_size(tptr); + } + + proc_vmlinux = create_proc_entry("vmlinux",S_IWUSR | S_IRUGO, + tree_root.proc[0]); + if (proc_vmlinux) + proc_vmlinux->proc_fops = &proc_gcov_operations; + + gcov_callback = gcov_cleanup; + UP(); + return 0; +} + + +void +cleanup_module(void) +{ + PPRINTK(("remove module <%s>\n\n", GCOV_PROF_PROC)); + gcov_callback = NULL; + DOWN(); + cleanup_node(&tree_root,0,0); +} + +//module_init(gcov_init_module); +//module_exit(gcov_cleanup_module); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/drivers/net/loopback.c 999-mjb/drivers/net/loopback.c --- 000-virgin/drivers/net/loopback.c 2003-10-01 11:46:39.000000000 -0700 +++ 999-mjb/drivers/net/loopback.c 2003-10-02 16:39:46.000000000 -0700 @@ -184,7 +184,7 @@ struct net_device loopback_dev = { .rebuild_header = eth_rebuild_header, .flags = IFF_LOOPBACK, .features = NETIF_F_SG|NETIF_F_FRAGLIST - |NETIF_F_NO_CSUM|NETIF_F_HIGHDMA|NETIF_F_TSO, + |NETIF_F_NO_CSUM|NETIF_F_HIGHDMA, }; /* Setup and register the of the LOOPBACK device. 
*/ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/drivers/pci/probe.c 999-mjb/drivers/pci/probe.c --- 000-virgin/drivers/pci/probe.c 2003-10-01 11:35:11.000000000 -0700 +++ 999-mjb/drivers/pci/probe.c 2003-10-02 16:39:49.000000000 -0700 @@ -176,7 +176,7 @@ void __devinit pci_read_bridge_bases(str limit |= (io_limit_hi << 16); } - if (base && base <= limit) { + if (base <= limit) { res->flags = (io_base_lo & PCI_IO_RANGE_TYPE_MASK) | IORESOURCE_IO; res->start = base; res->end = limit + 0xfff; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/aio.c 999-mjb/fs/aio.c --- 000-virgin/fs/aio.c 2003-10-01 11:48:15.000000000 -0700 +++ 999-mjb/fs/aio.c 2003-10-02 16:39:54.000000000 -0700 @@ -203,6 +203,7 @@ static struct kioctx *ioctx_alloc(unsign { struct mm_struct *mm; struct kioctx *ctx; + int ret = 0; /* Prevent overflows */ if ((nr_events > (0x10000000U / sizeof(struct io_event))) || @@ -232,7 +233,8 @@ static struct kioctx *ioctx_alloc(unsign INIT_LIST_HEAD(&ctx->run_list); INIT_WORK(&ctx->wq, aio_kick_handler, ctx); - if (aio_setup_ring(ctx) < 0) + ret = aio_setup_ring(ctx); + if (unlikely(ret < 0)) goto out_freectx; /* limit the number of system wide aios */ @@ -259,7 +261,7 @@ out_cleanup: out_freectx: mmdrop(mm); kmem_cache_free(kioctx_cachep, ctx); - ctx = ERR_PTR(-ENOMEM); + ctx = ERR_PTR(ret); dprintk("aio: error allocating ioctx %p\n", ctx); return ctx; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/binfmt_aout.c 999-mjb/fs/binfmt_aout.c --- 000-virgin/fs/binfmt_aout.c 2003-10-01 11:48:15.000000000 -0700 +++ 999-mjb/fs/binfmt_aout.c 2003-10-02 16:42:17.000000000 -0700 @@ -309,7 +309,7 @@ static int load_aout_binary(struct linux (current->mm->start_brk = N_BSSADDR(ex)); current->mm->free_area_cache = TASK_UNMAPPED_BASE; - current->mm->rss = 0; + zero_rss(current->mm); current->mm->mmap = NULL; compute_creds(bprm); current->flags &= ~PF_FORKNOEXEC; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/binfmt_elf.c 999-mjb/fs/binfmt_elf.c --- 000-virgin/fs/binfmt_elf.c 2003-10-01 11:48:15.000000000 -0700 +++ 999-mjb/fs/binfmt_elf.c 2003-10-02 16:42:17.000000000 -0700 @@ -634,7 +634,7 @@ static int load_elf_binary(struct linux_ /* Do this so that we can load the interpreter, if need be. 
We will change some of these later */ - current->mm->rss = 0; + zero_rss(current->mm); current->mm->free_area_cache = TASK_UNMAPPED_BASE; retval = setup_arg_pages(bprm); if (retval < 0) { diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/binfmt_flat.c 999-mjb/fs/binfmt_flat.c --- 000-virgin/fs/binfmt_flat.c 2003-10-01 11:35:23.000000000 -0700 +++ 999-mjb/fs/binfmt_flat.c 2003-10-02 16:42:17.000000000 -0700 @@ -643,7 +643,7 @@ static int load_flat_file(struct linux_b current->mm->start_brk = datapos + data_len + bss_len; current->mm->brk = (current->mm->start_brk + 3) & ~3; current->mm->context.end_brk = memp + ksize((void *) memp) - stack_len; - current->mm->rss = 0; + zero_rss(current->mm); } if (flags & FLAT_FLAG_KTRACE) diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/binfmt_som.c 999-mjb/fs/binfmt_som.c --- 000-virgin/fs/binfmt_som.c 2003-02-13 16:36:36.000000000 -0800 +++ 999-mjb/fs/binfmt_som.c 2003-10-02 16:42:17.000000000 -0700 @@ -259,7 +259,7 @@ load_som_binary(struct linux_binprm * bp create_som_tables(bprm); current->mm->start_stack = bprm->p; - current->mm->rss = 0; + zero_rss(current->mm); #if 0 printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/buffer.c 999-mjb/fs/buffer.c --- 000-virgin/fs/buffer.c 2003-10-01 11:41:12.000000000 -0700 +++ 999-mjb/fs/buffer.c 2003-10-02 16:53:55.000000000 -0700 @@ -865,14 +865,14 @@ int __set_page_dirty_buffers(struct page spin_unlock(&mapping->private_lock); if (!TestSetPageDirty(page)) { - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); if (page->mapping) { /* Race with truncate? */ if (!mapping->backing_dev_info->memory_backed) inc_page_state(nr_dirty); list_del(&page->list); list_add(&page->list, &mapping->dirty_pages); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/exec.c 999-mjb/fs/exec.c --- 000-virgin/fs/exec.c 2003-10-01 11:48:15.000000000 -0700 +++ 999-mjb/fs/exec.c 2003-10-02 16:42:17.000000000 -0700 @@ -317,10 +317,11 @@ void put_dirty_page(struct task_struct * } lru_cache_add_active(page); flush_dcache_page(page); + SetPageAnon(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot)))); pte_chain = page_add_rmap(page, pte, pte_chain); pte_unmap(pte); - tsk->mm->rss++; + inc_rss(tsk->mm, page); spin_unlock(&tsk->mm->page_table_lock); /* no need for flush_tlb */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/fs-writeback.c 999-mjb/fs/fs-writeback.c --- 000-virgin/fs/fs-writeback.c 2003-07-28 15:31:09.000000000 -0700 +++ 999-mjb/fs/fs-writeback.c 2003-10-02 16:53:55.000000000 -0700 @@ -150,10 +150,10 @@ __sync_single_inode(struct inode *inode, * read speculatively by this cpu before &= ~I_DIRTY -- mikulas */ - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); if (wait || !wbc->for_kupdate || list_empty(&mapping->io_pages)) list_splice_init(&mapping->dirty_pages, &mapping->io_pages); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); spin_unlock(&inode_lock); do_writepages(mapping, wbc); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/inode.c 999-mjb/fs/inode.c --- 000-virgin/fs/inode.c 2003-10-01 11:47:01.000000000 -0700 +++ 999-mjb/fs/inode.c 2003-10-02 16:53:55.000000000 -0700 @@ -147,6 +147,9 @@ static struct inode *alloc_inode(struct mapping->dirtied_when = 0; mapping->assoc_mapping = NULL; mapping->backing_dev_info 
= &default_backing_dev_info; +#ifdef CONFIG_NUMA + mapping->binding = NULL; +#endif if (sb->s_bdev) mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; memset(&inode->u, 0, sizeof(inode->u)); @@ -184,7 +187,7 @@ void inode_init_once(struct inode *inode INIT_LIST_HEAD(&inode->i_devices); sema_init(&inode->i_sem, 1); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); - spin_lock_init(&inode->i_data.page_lock); + mapping_rwlock_init(&inode->i_data.page_lock); init_MUTEX(&inode->i_data.i_shared_sem); atomic_set(&inode->i_data.truncate_count, 0); INIT_LIST_HEAD(&inode->i_data.private_list); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/mpage.c 999-mjb/fs/mpage.c --- 000-virgin/fs/mpage.c 2003-10-01 11:41:13.000000000 -0700 +++ 999-mjb/fs/mpage.c 2003-10-02 16:53:55.000000000 -0700 @@ -635,7 +635,7 @@ mpage_writepages(struct address_space *m if (get_block == NULL) writepage = mapping->a_ops->writepage; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); while (!list_empty(&mapping->io_pages) && !done) { struct page *page = list_entry(mapping->io_pages.prev, struct page, list); @@ -655,7 +655,7 @@ mpage_writepages(struct address_space *m list_add(&page->list, &mapping->locked_pages); page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); /* * At this point we hold neither mapping->page_lock nor @@ -695,12 +695,12 @@ mpage_writepages(struct address_space *m unlock_page(page); } page_cache_release(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); } /* * Leave any remaining dirty pages on ->io_pages */ - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); if (bio) mpage_bio_submit(WRITE, bio); return ret; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/proc/proc_misc.c 999-mjb/fs/proc/proc_misc.c --- 000-virgin/fs/proc/proc_misc.c 2003-10-01 11:48:19.000000000 -0700 +++ 999-mjb/fs/proc/proc_misc.c 2003-10-02 16:41:02.000000000 -0700 @@ -134,6 +134,41 @@ static struct vmalloc_info get_vmalloc_i return vmi; } +static int real_loadavg_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int a, b, c, cpu; + int len; + + a = tasks_running[0] + (FIXED_1/200); + b = tasks_running[1] + (FIXED_1/200); + c = tasks_running[2] + (FIXED_1/200); + len = sprintf(page,"Domain load1 load2 load3 nr_run/nr_thrd\n"); + len += sprintf(page+len,"SYSTEM %5d.%02d %5d.%02d %5d.%02d %7ld/%7d\n", + LOAD_INT(a), LOAD_FRAC(a), + LOAD_INT(b), LOAD_FRAC(b), + LOAD_INT(c), LOAD_FRAC(c), + nr_running(), nr_threads); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + unsigned long nr_running; + if (!cpu_online(cpu)) + continue; + preempt_disable(); + a = per_cpu(cpu_tasks_running,cpu)[0] + (FIXED_1/200); + b = per_cpu(cpu_tasks_running,cpu)[1] + (FIXED_1/200); + c = per_cpu(cpu_tasks_running,cpu)[2] + (FIXED_1/200); + nr_running = nr_running_cpu(cpu); + preempt_enable(); + len += sprintf(page+len, "%5d %5d.%02d %5d.%02d %5d.%02d %7ld/%7d\n", + cpu, + LOAD_INT(a), LOAD_FRAC(a), + LOAD_INT(b), LOAD_FRAC(b), + LOAD_INT(c), LOAD_FRAC(c), + nr_running, nr_threads); + } + return proc_calc_metrics(page, start, off, count, eof, len); +} + static int uptime_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -342,6 +377,71 @@ static struct file_operations proc_modul }; #endif +#ifdef CONFIG_NUMA +#define K(x) ((x) << (PAGE_SHIFT - 10)) +static int show_meminfo_numa (struct seq_file *m, void *v) +{ + int *d = v; + int nid = 
*d; + struct sysinfo i; + si_meminfo_node(&i, nid); + seq_printf(m, "\n" + "Node %d MemTotal: %8lu kB\n" + "Node %d MemFree: %8lu kB\n" + "Node %d MemUsed: %8lu kB\n" + "Node %d HighTotal: %8lu kB\n" + "Node %d HighFree: %8lu kB\n" + "Node %d LowTotal: %8lu kB\n" + "Node %d LowFree: %8lu kB\n", + nid, K(i.totalram), + nid, K(i.freeram), + nid, K(i.totalram-i.freeram), + nid, K(i.totalhigh), + nid, K(i.freehigh), + nid, K(i.totalram-i.totalhigh), + nid, K(i.freeram-i.freehigh)); + + return 0; +} +#undef K + +extern struct seq_operations meminfo_numa_op; +static int meminfo_numa_open(struct inode *inode, struct file *file) +{ + return seq_open(file,&meminfo_numa_op); +} + +static struct file_operations proc_meminfo_numa_operations = { + open: meminfo_numa_open, + read: seq_read, + llseek: seq_lseek, + release: seq_release, +}; + +static void *meminfo_numa_start(struct seq_file *m, loff_t *pos) +{ + return *pos < numnodes ? pos : NULL; +} + +static void *meminfo_numa_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return meminfo_numa_start(m, pos); +} + +static void meminfo_numa_stop(struct seq_file *m, void *v) +{ +} + +struct seq_operations meminfo_numa_op = { + .start = meminfo_numa_start, + .next = meminfo_numa_next, + .stop = meminfo_numa_stop, + .show = show_meminfo_numa, +}; + +#endif + extern struct seq_operations slabinfo_op; extern ssize_t slabinfo_write(struct file *, const char __user *, size_t, loff_t *); static int slabinfo_open(struct inode *inode, struct file *file) @@ -638,6 +738,36 @@ static void create_seq_entry(char *name, entry->proc_fops = f; } +#ifdef CONFIG_LOCKMETER +extern ssize_t get_lockmeter_info(char *, size_t, loff_t *); +extern ssize_t put_lockmeter_info(const char *, size_t); +extern int get_lockmeter_info_size(void); + +/* + * This function accesses lock metering information. 
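
The real_loadavg handler added to proc_misc.c above prints its per-domain and
per-cpu run-queue averages in the same fixed-point format the stock
/proc/loadavg uses: each average carries FSHIFT fractional bits, and
LOAD_INT()/LOAD_FRAC() split it into an integer part and two decimal digits
(the FIXED_1/200 term rounds to the nearest hundredth). A minimal decoding
sketch, assuming the standard definitions from <linux/sched.h>:

	/* Stock fixed-point helpers, repeated here only for illustration. */
	#define FSHIFT		11
	#define FIXED_1		(1 << FSHIFT)		/* 1.0 == 2048 */
	#define LOAD_INT(x)	((x) >> FSHIFT)
	#define LOAD_FRAC(x)	LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

	/* A raw average of 3072 (1.5 * FIXED_1) plus the rounding term of
	 * FIXED_1/200 = 10 decodes as LOAD_INT() = 1, LOAD_FRAC() = 50,
	 * i.e. it is printed as "1.50".
	 */
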
+ */ +static ssize_t read_lockmeter(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + return get_lockmeter_info(buf, count, ppos); +} + +/* + * Writing to /proc/lockmeter resets the counters + */ +static ssize_t write_lockmeter(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + return put_lockmeter_info(buf, count); +} + +static struct file_operations proc_lockmeter_operations = { + NULL, /* lseek */ + read: read_lockmeter, + write: write_lockmeter, +}; +#endif /* CONFIG_LOCKMETER */ + void __init proc_misc_init(void) { struct proc_dir_entry *entry; @@ -646,6 +776,7 @@ void __init proc_misc_init(void) int (*read_proc)(char*,char**,off_t,int,int*,void*); } *p, simple_ones[] = { {"loadavg", loadavg_read_proc}, + {"real_loadavg",real_loadavg_read_proc}, {"uptime", uptime_read_proc}, {"meminfo", meminfo_read_proc}, {"version", version_read_proc}, @@ -685,6 +816,9 @@ void __init proc_misc_init(void) #ifdef CONFIG_MODULES create_seq_entry("modules", 0, &proc_modules_operations); #endif +#ifdef CONFIG_NUMA + create_seq_entry("meminfo.numa",0,&proc_meminfo_numa_operations); +#endif #ifdef CONFIG_PROC_KCORE proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL); if (proc_root_kcore) { @@ -705,6 +839,13 @@ void __init proc_misc_init(void) if (entry) entry->proc_fops = &proc_sysrq_trigger_operations; #endif +#ifdef CONFIG_LOCKMETER + entry = create_proc_entry("lockmeter", S_IWUSR | S_IRUGO, NULL); + if (entry) { + entry->proc_fops = &proc_lockmeter_operations; + entry->size = get_lockmeter_info_size(); + } +#endif #ifdef CONFIG_PPC32 { extern struct file_operations ppc_htab_operations; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/proc/task_mmu.c 999-mjb/fs/proc/task_mmu.c --- 000-virgin/fs/proc/task_mmu.c 2003-10-01 11:47:04.000000000 -0700 +++ 999-mjb/fs/proc/task_mmu.c 2003-10-02 16:42:17.000000000 -0700 @@ -3,6 +3,22 @@ #include #include +#ifdef CONFIG_NUMA +char *task_mem_pernode(struct mm_struct *mm, char *buffer) +{ + int nid; + + for (nid = 0; nid < MAX_NUMNODES; nid++){ + buffer += sprintf(buffer, "VmRSS-node_%d:\t%8lu kb\n", + nid, mm->pernode_rss[nid] << (PAGE_SHIFT-10)); + } + + return buffer; +} +#else /* !CONFIG_NUMA */ +#define task_mem_pernode(mm, buffer) (buffer) +#endif /* CONFIG_NUMA */ + char *task_mem(struct mm_struct *mm, char *buffer) { unsigned long data = 0, stack = 0, exec = 0, lib = 0; @@ -39,6 +55,7 @@ char *task_mem(struct mm_struct *mm, cha mm->rss << (PAGE_SHIFT-10), data - stack, stack, exec - lib, lib); + buffer = task_mem_pernode(mm, buffer); up_read(&mm->mmap_sem); return buffer; } diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-alpha/lockmeter.h 999-mjb/include/asm-alpha/lockmeter.h --- 000-virgin/include/asm-alpha/lockmeter.h 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/include/asm-alpha/lockmeter.h 2003-10-02 16:39:44.000000000 -0700 @@ -0,0 +1,90 @@ +/* + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + * + * Modified by Peter Rival (frival@zk3.dec.com) + */ + +#ifndef _ALPHA_LOCKMETER_H +#define _ALPHA_LOCKMETER_H + +#include +#define CPU_CYCLE_FREQUENCY hwrpb->cycle_freq + +#define get_cycles64() get_cycles() + +#define THIS_CPU_NUMBER smp_processor_id() + +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0) +#define local_irq_save(x) \ + __save_and_cli(x) +#define local_irq_restore(x) \ + __restore_flags(x) +#endif /* Linux version 2.2.x */ + +#define SPINLOCK_MAGIC_INIT /**/ + +/* + * Macros to cache and retrieve an index value inside 
of a lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. + * We also assume that the hash table has less than 32767 entries. + * the high order bit is used for write locking a rw_lock + * Note: although these defines and macros are the same as what is being used + * in include/asm-i386/lockmeter.h, they are present here to easily + * allow an alternate Alpha implementation. + */ +/* + * instrumented spinlock structure -- never used to allocate storage + * only used in macros below to overlay a spinlock_t + */ +typedef struct inst_spinlock_s { + /* remember, Alpha is little endian */ + unsigned short lock; + unsigned short index; +} inst_spinlock_t; +#define PUT_INDEX(lock_ptr,indexv) ((inst_spinlock_t *)(lock_ptr))->index = indexv +#define GET_INDEX(lock_ptr) ((inst_spinlock_t *)(lock_ptr))->index + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int lock; + unsigned short index; + unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return true if rwlock is write locked + * (note that other lock attempts can cause the lock value to be negative) + */ +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) (((inst_rwlock_t *)rwlock_ptr)->lock & 1) +#define IABS(x) ((x) > 0 ? (x) : -(x)) + +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + int tmp = (int) ((inst_rwlock_t *)rwlock_ptr)->lock; + /* readers subtract 2, so we have to: */ + /* - andnot off a possible writer (bit 0) */ + /* - get the absolute value */ + /* - divide by 2 (right shift by one) */ + /* to find the number of readers */ + if (tmp == 0) return(0); + else return(IABS(tmp & ~1)>>1); +} + +#endif /* _ALPHA_LOCKMETER_H */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-alpha/spinlock.h 999-mjb/include/asm-alpha/spinlock.h --- 000-virgin/include/asm-alpha/spinlock.h 2003-06-05 14:55:52.000000000 -0700 +++ 999-mjb/include/asm-alpha/spinlock.h 2003-10-02 16:39:44.000000000 -0700 @@ -6,6 +6,10 @@ #include #include +#ifdef CONFIG_LOCKMETER +#undef DEBUG_SPINLOCK +#undef DEBUG_RWLOCK +#endif /* * Simple spin lock operations. There are two variants, one clears IRQ's @@ -95,9 +99,18 @@ static inline int _raw_spin_trylock(spin typedef struct { volatile int write_lock:1, read_counter:31; +#ifdef CONFIG_LOCKMETER + /* required for LOCKMETER since all bits in lock are used */ + /* need this storage for CPU and lock INDEX ............. 
*/ + unsigned magic; +#endif } /*__attribute__((aligned(32)))*/ rwlock_t; +#ifdef CONFIG_LOCKMETER +#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0, 0 } +#else #define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0 } +#endif #define rwlock_init(x) do { *(x) = RW_LOCK_UNLOCKED; } while(0) #define rwlock_is_locked(x) (*(volatile int *)(x) != 0) @@ -169,4 +182,41 @@ static inline void _raw_read_unlock(rwlo : "m" (*lock) : "memory"); } +#ifdef CONFIG_LOCKMETER +static inline int _raw_write_trylock(rwlock_t *lock) +{ + long temp,result; + + __asm__ __volatile__( + " ldl_l %1,%0\n" + " mov $31,%2\n" + " bne %1,1f\n" + " or $31,1,%2\n" + " stl_c %2,%0\n" + "1: mb\n" + : "=m" (*(volatile int *)lock), "=&r" (temp), "=&r" (result) + : "m" (*(volatile int *)lock) + ); + + return (result); +} + +static inline int _raw_read_trylock(rwlock_t *lock) +{ + unsigned long temp,result; + + __asm__ __volatile__( + " ldl_l %1,%0\n" + " mov $31,%2\n" + " blbs %1,1f\n" + " subl %1,2,%2\n" + " stl_c %2,%0\n" + "1: mb\n" + : "=m" (*(volatile int *)lock), "=&r" (temp), "=&r" (result) + : "m" (*(volatile int *)lock) + ); + return (result); +} +#endif /* CONFIG_LOCKMETER */ + #endif /* _ALPHA_SPINLOCK_H */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-generic/tlb.h 999-mjb/include/asm-generic/tlb.h --- 000-virgin/include/asm-generic/tlb.h 2003-10-01 11:41:15.000000000 -0700 +++ 999-mjb/include/asm-generic/tlb.h 2003-10-02 16:42:17.000000000 -0700 @@ -39,7 +39,6 @@ struct mmu_gather { unsigned int nr; /* set to ~0U means fast mode */ unsigned int need_flush;/* Really unmapped some ptes? */ unsigned int fullmm; /* non-zero means full mm flush */ - unsigned long freed; struct page * pages[FREE_PTE_NR]; }; @@ -60,7 +59,6 @@ tlb_gather_mmu(struct mm_struct *mm, uns tlb->nr = num_online_cpus() > 1 ? 0U : ~0U; tlb->fullmm = full_mm_flush; - tlb->freed = 0; return tlb; } @@ -85,13 +83,6 @@ tlb_flush_mmu(struct mmu_gather *tlb, un static inline void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) { - int freed = tlb->freed; - struct mm_struct *mm = tlb->mm; - int rss = mm->rss; - - if (rss < freed) - freed = rss; - mm->rss = rss - freed; tlb_flush_mmu(tlb, start, end); /* keep the page table cache within bounds */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/early_printk.h 999-mjb/include/asm-i386/early_printk.h --- 000-virgin/include/asm-i386/early_printk.h 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/include/asm-i386/early_printk.h 2003-10-02 16:39:35.000000000 -0700 @@ -0,0 +1,8 @@ +#ifndef __X86_EARLY_PRINTK_H_I386_ +#define __X86_EARLY_PRINTK_H_I386_ + +#define VGABASE 0xB8000 +#define SERIAL_BASES { 0x3f8, 0x2f8 } +#define SERIAL_BASES_LEN 2 + +#endif diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/lockmeter.h 999-mjb/include/asm-i386/lockmeter.h --- 000-virgin/include/asm-i386/lockmeter.h 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/include/asm-i386/lockmeter.h 2003-10-02 16:39:44.000000000 -0700 @@ -0,0 +1,127 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + * + * Modified by Ray Bryant (raybry@us.ibm.com) + * Changes Copyright (C) 2000 IBM, Inc. + * Added save of index in spinlock_t to improve efficiency + * of "hold" time reporting for spinlocks. + * Added support for hold time statistics for read and write + * locks. + * Moved machine dependent code here from include/lockmeter.h. 
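
The binfmt_*.c and exec.c hunks above replace open-coded updates of mm->rss
with zero_rss() and inc_rss(), and task_mmu.c grows a per-node "VmRSS-node_N"
display backed by mm->pernode_rss[]. The helpers themselves are defined in a
hunk outside this section; the sketch below only illustrates the shape implied
by those call sites, and everything in it other than the pernode_rss[] field
and pfn_to_nid() should be read as an assumption.

	/* Hypothetical sketch -- the real zero_rss()/inc_rss() may differ.
	 * Intent implied by the call sites: keep mm->rss and the per-node
	 * counters in step for every page mapped into the mm.
	 */
	static inline void inc_rss(struct mm_struct *mm, struct page *page)
	{
		mm->rss++;
		mm->pernode_rss[pfn_to_nid(page_to_pfn(page))]++;
	}

	static inline void zero_rss(struct mm_struct *mm)
	{
		int nid;

		mm->rss = 0;
		for (nid = 0; nid < MAX_NUMNODES; nid++)
			mm->pernode_rss[nid] = 0;
	}
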
+ * + */ + +#ifndef _I386_LOCKMETER_H +#define _I386_LOCKMETER_H + +#include +#include + +#include + +#ifdef __KERNEL__ +extern unsigned long cpu_khz; +#define CPU_CYCLE_FREQUENCY (cpu_khz * 1000) +#else +#define CPU_CYCLE_FREQUENCY 450000000 +#endif + +#define THIS_CPU_NUMBER smp_processor_id() + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0) +#define local_irq_save(x) \ + __asm__ __volatile__("pushfl ; popl %0 ; cli":"=g" (x): /* no input */ :"memory") + +#define local_irq_restore(x) \ + __asm__ __volatile__("pushl %0 ; popfl": /* no output */ :"g" (x):"memory") +#endif /* Linux version 2.2.x */ + +/* + * macros to cache and retrieve an index value inside of a spin lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. Not normally a problem!! + * we also assume that the hash table has less than 65535 entries. + */ +/* + * instrumented spinlock structure -- never used to allocate storage + * only used in macros below to overlay a spinlock_t + */ +typedef struct inst_spinlock_s { + /* remember, Intel is little endian */ + unsigned short lock; + unsigned short index; +} inst_spinlock_t; +#define PUT_INDEX(lock_ptr,indexv) ((inst_spinlock_t *)(lock_ptr))->index = indexv +#define GET_INDEX(lock_ptr) ((inst_spinlock_t *)(lock_ptr))->index + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int lock; + unsigned short index; + unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return the number of readers for a rwlock_t + */ +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) + +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + int tmp = (int) rwlock_ptr->lock; + /* read and write lock attempts may cause the lock value to temporarily */ + /* be negative. Until it is >= 0 we know nothing (i. e. can't tell if */ + /* is -1 because it was write locked and somebody tried to read lock it */ + /* or if it is -1 because it was read locked and somebody tried to write*/ + /* lock it. ........................................................... */ + do { + tmp = (int) rwlock_ptr->lock; + } while (tmp < 0); + if (tmp == 0) return(0); + else return(RW_LOCK_BIAS-tmp); +} + +/* + * return true if rwlock is write locked + * (note that other lock attempts can cause the lock value to be negative) + */ +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock <= 0) +#define IABS(x) ((x) > 0 ? 
(x) : -(x)) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((IABS((rwlock_ptr)->lock) % RW_LOCK_BIAS) != 0) + +/* this is a lot of typing just to get gcc to emit "rdtsc" */ +static inline long long get_cycles64 (void) +{ +#ifndef CONFIG_X86_TSC + #error this code requires CONFIG_X86_TSC +#else + union longlong_u { + long long intlong; + struct intint_s { + uint32_t eax; + uint32_t edx; + } intint; + } longlong; + + rdtsc(longlong.intint.eax,longlong.intint.edx); + return longlong.intlong; +#endif +} + +#endif /* _I386_LOCKMETER_H */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/mmzone.h 999-mjb/include/asm-i386/mmzone.h --- 000-virgin/include/asm-i386/mmzone.h 2003-10-01 11:48:22.000000000 -0700 +++ 999-mjb/include/asm-i386/mmzone.h 2003-10-02 16:42:48.000000000 -0700 @@ -10,7 +10,49 @@ #ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA + #ifdef CONFIG_X86_NUMAQ + #include + #else /* summit or generic arch */ + #include + #endif +#else /* !CONFIG_NUMA */ + #define get_memcfg_numa get_memcfg_numa_flat + #define get_zholes_size(n) (0) +#endif /* CONFIG_NUMA */ + extern struct pglist_data *node_data[]; +#define NODE_DATA(nid) (node_data[nid]) + +/* + * generic node memory support, the following assumptions apply: + * + * 1) memory comes in 256Mb contigious chunks which are either present or not + * 2) we will not have more than 64Gb in total + * + * for now assume that 64Gb is max amount of RAM for whole system + * 64Gb / 4096bytes/page = 16777216 pages + */ +#define MAX_NR_PAGES 16777216 +#define MAX_ELEMENTS 256 +#define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS) + +extern u8 physnode_map[]; + +static inline int pfn_to_nid(unsigned long pfn) +{ +#ifdef CONFIG_NUMA + return(physnode_map[(pfn) / PAGES_PER_ELEMENT]); +#else + return 0; +#endif +} + +static inline struct pglist_data *pfn_to_pgdat(unsigned long pfn) +{ + return(NODE_DATA(pfn_to_nid(pfn))); +} + /* * Following are macros that are specific to this numa platform. @@ -43,11 +85,6 @@ extern struct pglist_data *node_data[]; */ #define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) -/* - * Return a pointer to the node data for node n. 
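
The mmzone.h rework above makes the physnode_map[] lookup available whenever
CONFIG_NUMA is set rather than only for NUMA-Q: physical memory is treated as
256 MB elements (MAX_NR_PAGES / MAX_ELEMENTS = 65536 pages of 4 KB each), and
physnode_map[] records which node owns each element. A worked example using
only the constants from the hunk:

	/* 64 GB max / 256 elements => 65536 pages (256 MB) per element.
	 *
	 * A page at physical address 0x60000000 (1.5 GB):
	 *	pfn	= 0x60000000 >> PAGE_SHIFT = 0x60000 (393216)
	 *	element	= 393216 / PAGES_PER_ELEMENT = 393216 / 65536 = 6
	 * so pfn_to_nid() simply returns physnode_map[6].
	 */
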
- */ -#define NODE_DATA(nid) (node_data[nid]) - #define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) \ @@ -93,40 +130,6 @@ extern struct pglist_data *node_data[]; */ #define pfn_valid(pfn) ((pfn) < num_physpages) -/* - * generic node memory support, the following assumptions apply: - * - * 1) memory comes in 256Mb contigious chunks which are either present or not - * 2) we will not have more than 64Gb in total - * - * for now assume that 64Gb is max amount of RAM for whole system - * 64Gb / 4096bytes/page = 16777216 pages - */ -#define MAX_NR_PAGES 16777216 -#define MAX_ELEMENTS 256 -#define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS) - -extern u8 physnode_map[]; - -static inline int pfn_to_nid(unsigned long pfn) -{ - return(physnode_map[(pfn) / PAGES_PER_ELEMENT]); -} -static inline struct pglist_data *pfn_to_pgdat(unsigned long pfn) -{ - return(NODE_DATA(pfn_to_nid(pfn))); -} - -#ifdef CONFIG_X86_NUMAQ -#include -#elif CONFIG_ACPI_SRAT -#include -#elif CONFIG_X86_PC -#define get_zholes_size(n) (0) -#else -#define pfn_to_nid(pfn) (0) -#endif /* CONFIG_X86_NUMAQ */ - extern int get_memcfg_numa_flat(void ); /* * This allows any one NUMA architecture to be compiled diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/page.h 999-mjb/include/asm-i386/page.h --- 000-virgin/include/asm-i386/page.h 2003-04-09 11:48:05.000000000 -0700 +++ 999-mjb/include/asm-i386/page.h 2003-10-02 16:39:38.000000000 -0700 @@ -115,9 +115,26 @@ static __inline__ int get_order(unsigned #endif /* __ASSEMBLY__ */ #ifdef __ASSEMBLY__ -#define __PAGE_OFFSET (0xC0000000) +#include +#ifdef CONFIG_05GB +#define __PAGE_OFFSET (0xE0000000) +#elif defined(CONFIG_1GB) +#define __PAGE_OFFSET (0xC0000000) +#elif defined(CONFIG_2GB) +#define __PAGE_OFFSET (0x80000000) +#elif defined(CONFIG_3GB) +#define __PAGE_OFFSET (0x40000000) +#endif #else -#define __PAGE_OFFSET (0xC0000000UL) +#ifdef CONFIG_05GB +#define __PAGE_OFFSET (0xE0000000UL) +#elif defined(CONFIG_1GB) +#define __PAGE_OFFSET (0xC0000000UL) +#elif defined(CONFIG_2GB) +#define __PAGE_OFFSET (0x80000000UL) +#elif defined(CONFIG_3GB) +#define __PAGE_OFFSET (0x40000000UL) +#endif #endif diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/param.h 999-mjb/include/asm-i386/param.h --- 000-virgin/include/asm-i386/param.h 2002-12-09 18:45:45.000000000 -0800 +++ 999-mjb/include/asm-i386/param.h 2003-10-02 16:39:36.000000000 -0700 @@ -2,11 +2,19 @@ #define _ASMi386_PARAM_H #ifdef __KERNEL__ -# define HZ 1000 /* Internal kernel timer frequency */ -# define USER_HZ 100 /* .. some user interfaces are in "ticks" */ -# define CLOCKS_PER_SEC (USER_HZ) /* like times() */ +#include + +#ifdef CONFIG_1000HZ +# define HZ 1000 /* Internal kernel timer frequency */ +#else +# define HZ 100 #endif +#define USER_HZ 100 /* .. some user interfaces are in "ticks" */ +#define CLOCKS_PER_SEC (USER_HZ) /* like times() */ + +#endif /* __KERNEL__ */ + #ifndef HZ #define HZ 100 #endif diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/processor.h 999-mjb/include/asm-i386/processor.h --- 000-virgin/include/asm-i386/processor.h 2003-10-01 11:48:22.000000000 -0700 +++ 999-mjb/include/asm-i386/processor.h 2003-10-02 16:39:38.000000000 -0700 @@ -299,7 +299,11 @@ extern unsigned int mca_pentium_flag; /* This decides where the kernel will search for a free chunk of vm * space during mmap's. 
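
The page.h and processor.h hunks above make the user/kernel split of the 4 GB
virtual address space configurable. The CONFIG_*GB names describe the
kernel's share; since TASK_SIZE equals PAGE_OFFSET on i386, the user portion
and the mmap() search start work out as follows:

	/* Worked values (TASK_SIZE == PAGE_OFFSET on stock i386):
	 *
	 *	CONFIG_3GB	__PAGE_OFFSET 0x40000000   user space 1 GB
	 *	CONFIG_2GB	__PAGE_OFFSET 0x80000000   user space 2 GB
	 *	CONFIG_1GB	__PAGE_OFFSET 0xC0000000   user space 3 GB (default)
	 *	CONFIG_05GB	__PAGE_OFFSET 0xE0000000   user space 3.5 GB
	 *
	 * TASK_UNMAPPED_BASE = PAGE_ALIGN(TASK_SIZE / 3), i.e. 0x40000000 for
	 * the default 3 GB split.  With CONFIG_05GB the divisor becomes 16
	 * (0xE0000000 / 16 = 0x0E000000), keeping the mmap search start low,
	 * presumably so most of the enlarged user space stays usable for
	 * mappings.
	 */
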
*/ +#ifdef CONFIG_05GB +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 16)) +#else #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) +#endif /* * Size of io_bitmap, covering ports 0 to 0x3ff. diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/rwlock.h 999-mjb/include/asm-i386/rwlock.h --- 000-virgin/include/asm-i386/rwlock.h 2002-12-09 18:46:25.000000000 -0800 +++ 999-mjb/include/asm-i386/rwlock.h 2003-10-02 16:39:42.000000000 -0700 @@ -20,28 +20,52 @@ #define RW_LOCK_BIAS 0x01000000 #define RW_LOCK_BIAS_STR "0x01000000" -#define __build_read_lock_ptr(rw, helper) \ - asm volatile(LOCK "subl $1,(%0)\n\t" \ - "js 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tcall " helper "\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - ::"a" (rw) : "memory") - -#define __build_read_lock_const(rw, helper) \ - asm volatile(LOCK "subl $1,%0\n\t" \ - "js 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tpushl %%eax\n\t" \ - "leal %0,%%eax\n\t" \ - "call " helper "\n\t" \ - "popl %%eax\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - :"=m" (*(volatile int *)rw) : : "memory") +#ifdef CONFIG_SPINLINE + + #define __build_read_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $1,(%0)\n\t" \ + "jns 1f\n\t" \ + "call " helper "\n\t" \ + "1:\t" \ + ::"a" (rw) : "memory") + + #define __build_read_lock_const(rw, helper) \ + asm volatile(LOCK "subl $1,%0\n\t" \ + "jns 1f\n\t" \ + "pushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "1:\t" \ + :"=m" (*(volatile int *)rw) : : "memory") + +#else /* !CONFIG_SPINLINE */ + + #define __build_read_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $1,(%0)\n\t" \ + "js 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tcall " helper "\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + ::"a" (rw) : "memory") + + #define __build_read_lock_const(rw, helper) \ + asm volatile(LOCK "subl $1,%0\n\t" \ + "js 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tpushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + :"=m" (*(volatile int *)rw) : : "memory") + +#endif /* CONFIG_SPINLINE */ + #define __build_read_lock(rw, helper) do { \ if (__builtin_constant_p(rw)) \ @@ -50,28 +74,51 @@ __build_read_lock_ptr(rw, helper); \ } while (0) -#define __build_write_lock_ptr(rw, helper) \ - asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ - "jnz 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tcall " helper "\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - ::"a" (rw) : "memory") - -#define __build_write_lock_const(rw, helper) \ - asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ - "jnz 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tpushl %%eax\n\t" \ - "leal %0,%%eax\n\t" \ - "call " helper "\n\t" \ - "popl %%eax\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - :"=m" (*(volatile int *)rw) : : "memory") +#ifdef CONFIG_SPINLINE + + #define __build_write_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ + "jz 1f\n\t" \ + "call " helper "\n\t" \ + "1:\n" \ + ::"a" (rw) : "memory") + + #define __build_write_lock_const(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ + "jz 1f\n\t" \ + "pushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "1:\n" \ + :"=m" (*(volatile int *)rw) : : "memory") + +#else /* !CONFIG_SPINLINE */ + + #define __build_write_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ + "jnz 2f\n" \ + "1:\n" \ + 
LOCK_SECTION_START("") \ + "2:\tcall " helper "\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + ::"a" (rw) : "memory") + + #define __build_write_lock_const(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ + "jnz 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tpushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + :"=m" (*(volatile int *)rw) : : "memory") + +#endif /* CONFIG_SPINLINE */ #define __build_write_lock(rw, helper) do { \ if (__builtin_constant_p(rw)) \ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/spinlock.h 999-mjb/include/asm-i386/spinlock.h --- 000-virgin/include/asm-i386/spinlock.h 2003-06-05 14:56:10.000000000 -0700 +++ 999-mjb/include/asm-i386/spinlock.h 2003-10-02 16:39:44.000000000 -0700 @@ -43,18 +43,35 @@ typedef struct { #define spin_is_locked(x) (*(volatile signed char *)(&(x)->lock) <= 0) #define spin_unlock_wait(x) do { barrier(); } while(spin_is_locked(x)) -#define spin_lock_string \ - "\n1:\t" \ - "lock ; decb %0\n\t" \ - "js 2f\n" \ - LOCK_SECTION_START("") \ - "2:\t" \ - "rep;nop\n\t" \ - "cmpb $0,%0\n\t" \ - "jle 2b\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END +#ifdef CONFIG_SPINLINE + #define spin_lock_string \ + "\n1:\t" \ + "lock ; decb %0\n\t" \ + "js 2f\n" \ + "jmp 3f\n" \ + "2:\t" \ + "rep;nop\n\t" \ + "cmpb $0,%0\n\t" \ + "jle 2b\n\t" \ + "jmp 1b\n" \ + "3:\t" + +#else /* !CONFIG_SPINLINE */ + + #define spin_lock_string \ + "\n1:\t" \ + "lock ; decb %0\n\t" \ + "js 2f\n" \ + LOCK_SECTION_START("") \ + "2:\t" \ + "rep;nop\n\t" \ + "cmpb $0,%0\n\t" \ + "jle 2b\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END + +#endif /* CONFIG_SPINLINE */ /* * This works. Despite all the confusion. * (except on PPro SMP or if we are using OOSTORE) @@ -138,6 +155,11 @@ here: */ typedef struct { volatile unsigned int lock; +#if CONFIG_LOCKMETER + /* required for LOCKMETER since all bits in lock are used */ + /* and we need this storage for CPU and lock INDEX */ + unsigned lockmeter_magic; +#endif #ifdef CONFIG_DEBUG_SPINLOCK unsigned magic; #endif @@ -145,11 +167,19 @@ typedef struct { #define RWLOCK_MAGIC 0xdeaf1eed +#ifdef CONFIG_LOCKMETER +#if CONFIG_DEBUG_SPINLOCK +#define RWLOCK_MAGIC_INIT , 0, RWLOCK_MAGIC +#else +#define RWLOCK_MAGIC_INIT , 0 +#endif +#else /* !CONFIG_LOCKMETER */ #ifdef CONFIG_DEBUG_SPINLOCK #define RWLOCK_MAGIC_INIT , RWLOCK_MAGIC #else #define RWLOCK_MAGIC_INIT /* */ #endif +#endif /* !CONFIG_LOCKMETER */ #define RW_LOCK_UNLOCKED (rwlock_t) { RW_LOCK_BIAS RWLOCK_MAGIC_INIT } @@ -196,4 +226,58 @@ static inline int _raw_write_trylock(rwl return 0; } +#ifdef CONFIG_LOCKMETER +static inline int _raw_read_trylock(rwlock_t *lock) +{ +/* FIXME -- replace with assembler */ + atomic_t *count = (atomic_t *)lock; + atomic_dec(count); + if (count->counter > 0) + return 1; + atomic_inc(count); + return 0; +} +#endif + +#if defined(CONFIG_LOCKMETER) && defined(CONFIG_HAVE_DEC_LOCK) +extern void _metered_spin_lock (spinlock_t *lock); +extern void _metered_spin_unlock(spinlock_t *lock); + +/* + * Matches what is in arch/i386/lib/dec_and_lock.c, except this one is + * "static inline" so that the spin_lock(), if actually invoked, is charged + * against the real caller, not against the catch-all atomic_dec_and_lock + */ +static inline int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +{ + int counter; + int newcount; + +repeat: + counter = atomic_read(atomic); + newcount = counter-1; + + if (!newcount) + goto slow_path; + + asm volatile("lock; 
cmpxchgl %1,%2" + :"=a" (newcount) + :"r" (newcount), "m" (atomic->counter), "0" (counter)); + + /* If the above failed, "eax" will have changed */ + if (newcount != counter) + goto repeat; + return 0; + +slow_path: + _metered_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + _metered_spin_unlock(lock); + return 0; +} + +#define ATOMIC_DEC_AND_LOCK +#endif + #endif /* __ASM_SPINLOCK_H */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/unistd.h 999-mjb/include/asm-i386/unistd.h --- 000-virgin/include/asm-i386/unistd.h 2003-10-01 11:41:15.000000000 -0700 +++ 999-mjb/include/asm-i386/unistd.h 2003-10-02 16:41:14.000000000 -0700 @@ -228,7 +228,7 @@ #define __NR_madvise1 219 /* delete when C lib stub is removed */ #define __NR_getdents64 220 #define __NR_fcntl64 221 -/* 223 is unused */ +#define __NR_mbind 223 #define __NR_gettid 224 #define __NR_readahead 225 #define __NR_setxattr 226 diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-ia64/lockmeter.h 999-mjb/include/asm-ia64/lockmeter.h --- 000-virgin/include/asm-ia64/lockmeter.h 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/include/asm-ia64/lockmeter.h 2003-10-02 16:39:44.000000000 -0700 @@ -0,0 +1,72 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + */ + +#ifndef _IA64_LOCKMETER_H +#define _IA64_LOCKMETER_H + +#ifdef local_cpu_data +#define CPU_CYCLE_FREQUENCY local_cpu_data->itc_freq +#else +#define CPU_CYCLE_FREQUENCY my_cpu_data.itc_freq +#endif +#define get_cycles64() get_cycles() + +#define THIS_CPU_NUMBER smp_processor_id() + +/* + * macros to cache and retrieve an index value inside of a lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. + * we also assume that the hash table has less than 32767 entries. 
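
The spinlock.h hunk above supplies a lockmetered atomic_dec_and_lock() as a
static inline so that any spin_lock() taken on the slow path is charged to the
real caller instead of to the shared library routine. Its contract is the
usual one: it returns 1 with the lock held exactly when the decrement brings
the counter to zero, and 0 without touching the lock otherwise. A generic
caller sketch (the object and lock names are made up for illustration, not
taken from this patch):

	/* Classic refcount-drop idiom built on atomic_dec_and_lock(). */
	static void put_object(struct my_object *obj)
	{
		if (atomic_dec_and_lock(&obj->refcount, &object_list_lock)) {
			/* count reached zero: the list lock is now held */
			list_del(&obj->list);
			spin_unlock(&object_list_lock);
			kfree(obj);
		}
		/* otherwise the count is still positive and no lock was taken */
	}
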
+ */ +/* + * instrumented spinlock structure -- never used to allocate storage + * only used in macros below to overlay a spinlock_t + */ +typedef struct inst_spinlock_s { + /* remember, Intel is little endian */ + volatile unsigned short lock; + volatile unsigned short index; +} inst_spinlock_t; +#define PUT_INDEX(lock_ptr,indexv) ((inst_spinlock_t *)(lock_ptr))->index = indexv +#define GET_INDEX(lock_ptr) ((inst_spinlock_t *)(lock_ptr))->index + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int read_counter:31; + volatile int write_lock:1; + volatile unsigned short index; + volatile unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return the number of readers for a rwlock_t + */ +#define RWLOCK_READERS(rwlock_ptr) ((rwlock_ptr)->read_counter) + +/* + * return true if rwlock is write locked + * (note that other lock attempts can cause the lock value to be negative) + */ +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->write_lock) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((rwlock_ptr)->read_counter) + +#endif /* _IA64_LOCKMETER_H */ + diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-ia64/spinlock.h 999-mjb/include/asm-ia64/spinlock.h --- 000-virgin/include/asm-ia64/spinlock.h 2003-10-01 11:48:23.000000000 -0700 +++ 999-mjb/include/asm-ia64/spinlock.h 2003-10-02 16:39:44.000000000 -0700 @@ -190,4 +190,25 @@ do { \ clear_bit(31, (x)); \ }) +#ifdef CONFIG_LOCKMETER +extern void _metered_spin_lock (spinlock_t *lock); +extern void _metered_spin_unlock(spinlock_t *lock); + +/* + * Use a less efficient, and inline, atomic_dec_and_lock() if lockmetering + * so we can see the callerPC of who is actually doing the spin_lock(). + * Otherwise, all we see is the generic rollup of all locks done by + * atomic_dec_and_lock(). + */ +static inline int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +{ + _metered_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + _metered_spin_unlock(lock); + return 0; +} +#define ATOMIC_DEC_AND_LOCK +#endif + #endif /* _ASM_IA64_SPINLOCK_H */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-mips/lockmeter.h 999-mjb/include/asm-mips/lockmeter.h --- 000-virgin/include/asm-mips/lockmeter.h 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/include/asm-mips/lockmeter.h 2003-10-02 16:39:44.000000000 -0700 @@ -0,0 +1,126 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + * Ported to mips32 for Asita Technologies + * by D.J. 
Barrow ( dj.barrow@asitatechnologies.com ) + */ +#ifndef _ASM_LOCKMETER_H +#define _ASM_LOCKMETER_H + +/* do_gettimeoffset is a function pointer on mips */ +/* & it is not included by */ +#include +#include +#include + +#define SPINLOCK_MAGIC_INIT /* */ + +#define CPU_CYCLE_FREQUENCY get_cpu_cycle_frequency() + +#define THIS_CPU_NUMBER smp_processor_id() + +static uint32_t cpu_cycle_frequency = 0; + +static uint32_t get_cpu_cycle_frequency(void) +{ + /* a total hack, slow and invasive, but ... it works */ + int sec; + uint32_t start_cycles; + struct timeval tv; + + if (cpu_cycle_frequency == 0) { /* uninitialized */ + do_gettimeofday(&tv); + sec = tv.tv_sec; /* set up to catch the tv_sec rollover */ + while (sec == tv.tv_sec) { do_gettimeofday(&tv); } + sec = tv.tv_sec; /* rolled over to a new sec value */ + start_cycles = get_cycles(); + while (sec == tv.tv_sec) { do_gettimeofday(&tv); } + cpu_cycle_frequency = get_cycles() - start_cycles; + } + + return cpu_cycle_frequency; +} + +extern struct timeval xtime; + +static uint64_t get_cycles64(void) +{ + static uint64_t last_get_cycles64 = 0; + uint64_t ret; + unsigned long sec; + unsigned long usec, usec_offset; + +again: + sec = xtime.tv_sec; + usec = xtime.tv_usec; + usec_offset = do_gettimeoffset(); + if ((xtime.tv_sec != sec) || + (xtime.tv_usec != usec)|| + (usec_offset >= 20000)) + goto again; + + ret = ((uint64_t)(usec + usec_offset) * cpu_cycle_frequency); + /* We can't do a normal 64 bit division on mips without libgcc.a */ + do_div(ret,1000000); + ret += ((uint64_t)sec * cpu_cycle_frequency); + + /* XXX why does time go backwards? do_gettimeoffset? general time adj? */ + if (ret <= last_get_cycles64) + ret = last_get_cycles64+1; + last_get_cycles64 = ret; + + return ret; +} + +/* + * macros to cache and retrieve an index value inside of a lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. + * we also assume that the hash table has less than 32767 entries. + * the high order bit is used for write locking a rw_lock + */ +#define INDEX_MASK 0x7FFF0000 +#define READERS_MASK 0x0000FFFF +#define INDEX_SHIFT 16 +#define PUT_INDEX(lockp,index) \ + lockp->lock = (((lockp->lock) & ~INDEX_MASK) | (index) << INDEX_SHIFT) +#define GET_INDEX(lockp) \ + (((lockp->lock) & INDEX_MASK) >> INDEX_SHIFT) + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int lock; + unsigned short index; + unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return the number of readers for a rwlock_t + */ +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) + +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + int tmp = (int) rwlock_ptr->lock; + return (tmp >= 0) ? 
tmp : 0; +} + +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock < 0) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock > 0) + +#endif /* _ASM_LOCKMETER_H */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-mips/spinlock.h 999-mjb/include/asm-mips/spinlock.h --- 000-virgin/include/asm-mips/spinlock.h 2003-07-02 14:44:56.000000000 -0700 +++ 999-mjb/include/asm-mips/spinlock.h 2003-10-02 16:39:44.000000000 -0700 @@ -91,9 +91,18 @@ static inline unsigned int _raw_spin_try typedef struct { volatile unsigned int lock; +#if CONFIG_LOCKMETER + /* required for LOCKMETER since all bits in lock are used */ + /* and we need this storage for CPU and lock INDEX */ + unsigned lockmeter_magic; +#endif } rwlock_t; +#ifdef CONFIG_LOCKMETER +#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0 } +#else #define RW_LOCK_UNLOCKED (rwlock_t) { 0 } +#endif #define rwlock_init(x) do { *(x) = RW_LOCK_UNLOCKED; } while(0) diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-mips64/lockmeter.h 999-mjb/include/asm-mips64/lockmeter.h --- 000-virgin/include/asm-mips64/lockmeter.h 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/include/asm-mips64/lockmeter.h 2003-10-02 16:39:44.000000000 -0700 @@ -0,0 +1,120 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + */ + +#ifndef _ASM_LOCKMETER_H +#define _ASM_LOCKMETER_H + +#include + +#define SPINLOCK_MAGIC_INIT /* */ + +#define CPU_CYCLE_FREQUENCY get_cpu_cycle_frequency() + +#define THIS_CPU_NUMBER smp_processor_id() + +static uint32_t cpu_cycle_frequency = 0; + +static uint32_t get_cpu_cycle_frequency(void) +{ + /* a total hack, slow and invasive, but ... it works */ + int sec; + uint32_t start_cycles; + struct timeval tv; + + if (cpu_cycle_frequency == 0) { /* uninitialized */ + do_gettimeofday(&tv); + sec = tv.tv_sec; /* set up to catch the tv_sec rollover */ + while (sec == tv.tv_sec) { do_gettimeofday(&tv); } + sec = tv.tv_sec; /* rolled over to a new sec value */ + start_cycles = get_cycles(); + while (sec == tv.tv_sec) { do_gettimeofday(&tv); } + cpu_cycle_frequency = get_cycles() - start_cycles; + } + + return cpu_cycle_frequency; +} + +extern struct timeval xtime; +extern long do_gettimeoffset(void); + +static uint64_t get_cycles64(void) +{ + static uint64_t last_get_cycles64 = 0; + uint64_t ret; + unsigned long sec; + unsigned long usec, usec_offset; + +again: + sec = xtime.tv_sec; + usec = xtime.tv_usec; + usec_offset = do_gettimeoffset(); + if ((xtime.tv_sec != sec) || + (xtime.tv_usec != usec)|| + (usec_offset >= 20000)) + goto again; + + ret = ((uint64_t)sec * cpu_cycle_frequency) + + ( ((uint64_t)(usec + usec_offset) * cpu_cycle_frequency) / 1000000 ); + + /* XXX why does time go backwards? do_gettimeoffset? general time adj? */ + if (ret <= last_get_cycles64) + ret = last_get_cycles64+1; + last_get_cycles64 = ret; + + return ret; +} + +/* + * macros to cache and retrieve an index value inside of a lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. + * we also assume that the hash table has less than 32767 entries. 
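
The mips and mips64 lockmeter headers above calibrate cpu_cycle_frequency once
(by counting cycles across a tv_sec rollover) and then build a monotonic
64-bit cycle counter out of xtime and do_gettimeoffset(). Hold and wait
statistics are accumulated in these cycle units and can be converted back to
wall-clock time with the cycles-per-second value the interface exports; the
conversion is plain arithmetic, sketched here (not code from the patch):

	/* Convert a measured cycle delta into microseconds. */
	uint64_t start = get_cycles64();
	/* ... critical section being measured ... */
	uint64_t delta = get_cycles64() - start;
	uint64_t usecs = (delta * 1000000ULL) / CPU_CYCLE_FREQUENCY;
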
+ * the high order bit is used for write locking a rw_lock + */ +#define INDEX_MASK 0x7FFF0000 +#define READERS_MASK 0x0000FFFF +#define INDEX_SHIFT 16 +#define PUT_INDEX(lockp,index) \ + lockp->lock = (((lockp->lock) & ~INDEX_MASK) | (index) << INDEX_SHIFT) +#define GET_INDEX(lockp) \ + (((lockp->lock) & INDEX_MASK) >> INDEX_SHIFT) + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int lock; + unsigned short index; + unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return the number of readers for a rwlock_t + */ +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) + +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + int tmp = (int) rwlock_ptr->lock; + return (tmp >= 0) ? tmp : 0; +} + +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock < 0) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock > 0) + +#endif /* _ASM_LOCKMETER_H */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-sparc64/lockmeter.h 999-mjb/include/asm-sparc64/lockmeter.h --- 000-virgin/include/asm-sparc64/lockmeter.h 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/include/asm-sparc64/lockmeter.h 2003-10-02 16:39:44.000000000 -0700 @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2000 Anton Blanchard (anton@linuxcare.com) + */ + +#ifndef _SPARC64_LOCKMETER_H +#define _SPARC64_LOCKMETER_H + +#include + +#include + +extern unsigned long cpu_hz; +#define CPU_CYCLE_FREQUENCY cpu_hz + +#define THIS_CPU_NUMBER __cpu_number_map[smp_processor_id()] + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0) +#define local_irq_save(x) __save_and_cli(x) +#define local_irq_restore(x) __restore_flags(x) +#endif /* Linux version 2.2.x */ + +#define PUT_INDEX(lock_ptr,indexv) (lock_ptr)->index = (indexv) +#define GET_INDEX(lock_ptr) (lock_ptr)->index + +#define PUT_RWINDEX(rwlock_ptr,indexv) (rwlock_ptr)->index = (indexv) +#define GET_RWINDEX(rwlock_ptr) (rwlock_ptr)->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) (rwlock_ptr)->cpu = (cpuv) +#define GET_RW_CPU(rwlock_ptr) (rwlock_ptr)->cpu + +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) + +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + signed int tmp = rwlock_ptr->lock; + + if (tmp > 0) + return tmp; + else + return 0; +} + +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((signed int)((rwlock_ptr)->lock) < 0) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((signed int)((rwlock_ptr)->lock) > 0) + +#define get_cycles64() get_cycles() + +#endif /* _SPARC64_LOCKMETER_H */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-sparc64/spinlock.h 999-mjb/include/asm-sparc64/spinlock.h --- 000-virgin/include/asm-sparc64/spinlock.h 2002-12-09 18:45:48.000000000 -0800 +++ 999-mjb/include/asm-sparc64/spinlock.h 2003-10-02 16:39:44.000000000 -0700 @@ -30,15 +30,23 @@ #ifndef CONFIG_DEBUG_SPINLOCK -typedef unsigned char spinlock_t; -#define SPIN_LOCK_UNLOCKED 0 +typedef struct { + 
unsigned char lock; + unsigned int index; +} spinlock_t; -#define spin_lock_init(lock) (*((unsigned char *)(lock)) = 0) -#define spin_is_locked(lock) (*((volatile unsigned char *)(lock)) != 0) +#ifdef CONFIG_LOCKMETER +#define SPIN_LOCK_UNLOCKED (spinlock_t) {0, 0} +#else +#define SPIN_LOCK_UNLOCKED (spinlock_t) { 0 } +#endif -#define spin_unlock_wait(lock) \ +#define spin_lock_init(__lock) do { *(__lock) = SPIN_LOCK_UNLOCKED; } while(0) +#define spin_is_locked(__lock) (*((volatile unsigned char *)(&((__lock)->lock))) != 0) + +#define spin_unlock_wait(__lock) \ do { membar("#LoadLoad"); \ -} while(*((volatile unsigned char *)lock)) +} while(*((volatile unsigned char *)(&(((spinlock_t *)__lock)->lock)))) static __inline__ void _raw_spin_lock(spinlock_t *lock) { @@ -109,8 +117,20 @@ extern int _spin_trylock (spinlock_t *lo #ifndef CONFIG_DEBUG_SPINLOCK -typedef unsigned int rwlock_t; -#define RW_LOCK_UNLOCKED 0 +#ifdef CONFIG_LOCKMETER +typedef struct { + unsigned int lock; + unsigned int index; + unsigned int cpu; +} rwlock_t; +#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0, 0xff } +#else +typedef struct { + unsigned int lock; +} rwlock_t; +#define RW_LOCK_UNLOCKED (rwlock_t) { 0 } +#endif + #define rwlock_init(lp) do { *(lp) = RW_LOCK_UNLOCKED; } while(0) #define rwlock_is_locked(x) (*(x) != RW_LOCK_UNLOCKED) diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-x86_64/early_printk.h 999-mjb/include/asm-x86_64/early_printk.h --- 000-virgin/include/asm-x86_64/early_printk.h 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/include/asm-x86_64/early_printk.h 2003-10-02 16:39:35.000000000 -0700 @@ -0,0 +1,8 @@ +#ifndef __X86_EARLY_PRINTK_H_X86_64_ +#define __X86_EARLY_PRINTK_H_X86_64_ + +#define VGABASE 0xffffffff800b8000UL +#define SERIAL_BASES { 0x3f8, 0x2f8 } +#define SERIAL_BASES_LEN 2 + +#endif diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/early_printk.h 999-mjb/include/linux/early_printk.h --- 000-virgin/include/linux/early_printk.h 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/include/linux/early_printk.h 2003-10-02 16:39:35.000000000 -0700 @@ -0,0 +1,47 @@ +#ifndef __X86_EARLY_PRINTK_H_ +#define __X86_EARLY_PRINTK_H_ + +#ifdef CONFIG_X86_EARLY_PRINTK +#include +#include +#include +#include +#include +#include + +/* Simple VGA output */ + +#define MAX_YPOS 25 +#define MAX_XPOS 80 + +/* Simple serial port output */ + +#define DEFAULT_BAUD 57600 +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ +#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define DLL 0 /* Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + + +void early_printk(const char *fmt, ...); +int __init setup_early_printk(); + +#else + +#define early_printk(...) 
do {} while(0) +#define setup_early_printk() do {} while(0) + +#endif + +#endif diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/fs.h 999-mjb/include/linux/fs.h --- 000-virgin/include/linux/fs.h 2003-10-01 11:48:25.000000000 -0700 +++ 999-mjb/include/linux/fs.h 2003-10-02 16:53:55.000000000 -0700 @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include struct iovec; @@ -315,11 +317,29 @@ struct address_space_operations { loff_t offset, unsigned long nr_segs); }; +#if NR_CPUS > 8 +typedef rwlock_t mapping_rwlock_t; +#define mapping_rdlock(lock) read_lock(lock) +#define mapping_rdunlock(lock) read_unlock(lock) +#define mapping_wrlock(lock) write_lock(lock) +#define mapping_wrunlock(lock) write_unlock(lock) +#define mapping_rwlock_init(lock) rwlock_init(lock) +#define MAPPING_RW_LOCK_UNLOCKED RW_LOCK_UNLOCKED +#else +typedef spinlock_t mapping_rwlock_t; +#define mapping_rdlock(lock) spin_lock(lock) +#define mapping_rdunlock(lock) spin_unlock(lock) +#define mapping_wrlock(lock) spin_lock(lock) +#define mapping_wrunlock(lock) spin_unlock(lock) +#define mapping_rwlock_init(lock) spin_lock_init(lock) +#define MAPPING_RW_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED +#endif + struct backing_dev_info; struct address_space { struct inode *host; /* owner: inode, block_device */ struct radix_tree_root page_tree; /* radix tree of all pages */ - spinlock_t page_lock; /* and spinlock protecting it */ + mapping_rwlock_t page_lock; /* and spinlock protecting it */ struct list_head clean_pages; /* list of clean pages */ struct list_head dirty_pages; /* list of dirty pages */ struct list_head locked_pages; /* list of locked pages */ @@ -336,6 +356,9 @@ struct address_space { spinlock_t private_lock; /* for use by the address_space */ struct list_head private_list; /* ditto */ struct address_space *assoc_mapping; /* ditto */ +#ifdef CONFIG_NUMA + struct binding *binding; /* for memory bindings */ +#endif }; struct block_device { diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/gfp.h 999-mjb/include/linux/gfp.h --- 000-virgin/include/linux/gfp.h 2003-10-01 11:41:17.000000000 -0700 +++ 999-mjb/include/linux/gfp.h 2003-10-02 16:44:09.000000000 -0700 @@ -32,6 +32,7 @@ #define __GFP_NOFAIL 0x800 /* Retry for ever. Cannot fail */ #define __GFP_NORETRY 0x1000 /* Do not retry. Might fail */ #define __GFP_NO_GROW 0x2000 /* Slab internal usage */ +#define __GFP_NODE_STRICT 0x4000 /* Do not fall back to other nodes */ #define __GFP_BITS_SHIFT 16 /* Room for 16 __GFP_FOO bits */ #define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1) @@ -69,7 +70,7 @@ static inline struct page * alloc_pages_ if (unlikely(order >= MAX_ORDER)) return NULL; - return __alloc_pages(gfp_mask, order, NODE_DATA(nid)->node_zonelists + (gfp_mask & GFP_ZONEMASK)); + return __alloc_pages(gfp_mask, order, get_node_zonelist(nid, gfp_mask)); } #define alloc_pages(gfp_mask, order) \ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/lockmeter.h 999-mjb/include/linux/lockmeter.h --- 000-virgin/include/linux/lockmeter.h 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/include/linux/lockmeter.h 2003-10-02 16:39:44.000000000 -0700 @@ -0,0 +1,320 @@ +/* + * Copyright (C) 1999-2002 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + * + * Modified by Ray Bryant (raybry@us.ibm.com) Feb-Apr 2000 + * Changes Copyright (C) 2000 IBM, Inc. 
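
The fs.h hunk above hides address_space.page_lock behind mapping_rwlock_t: on
configurations with more than 8 CPUs it becomes a rwlock_t so page-cache
lookups can run in parallel, while smaller systems keep the plain spinlock and
pay no extra cost. The write-side conversions appear in the buffer.c,
fs-writeback.c and mpage.c hunks earlier in the patch; a read-side user would
follow the usual find_get_page() pattern, sketched below with mapping and
index assumed to be in scope:

	/* Reader-side page-cache lookup under the new wrappers. */
	struct page *page;

	mapping_rdlock(&mapping->page_lock);
	page = radix_tree_lookup(&mapping->page_tree, index);
	if (page)
		page_cache_get(page);	/* pin it before dropping the lock */
	mapping_rdunlock(&mapping->page_lock);
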
+ * Added save of index in spinlock_t to improve efficiency + * of "hold" time reporting for spinlocks + * Added support for hold time statistics for read and write + * locks. + * Moved machine dependent code to include/asm/lockmeter.h. + * + */ + +#ifndef _LINUX_LOCKMETER_H +#define _LINUX_LOCKMETER_H + + +/*--------------------------------------------------- + * architecture-independent lockmeter.h + *-------------------------------------------------*/ + +/* + * raybry -- version 2: added efficient hold time statistics + * requires lstat recompile, so flagged as new version + * raybry -- version 3: added global reader lock data + * hawkes -- version 4: removed some unnecessary fields to simplify mips64 port + */ +#define LSTAT_VERSION 5 + +int lstat_update(void*, void*, int); +int lstat_update_time(void*, void*, int, uint32_t); + +/* + * Currently, the mips64 and sparc64 kernels talk to a 32-bit lockstat, so we + * need to force compatibility in the inter-communication data structure. + */ + +#if defined(CONFIG_MIPS32_COMPAT) +#define TIME_T uint32_t +#elif defined(CONFIG_SPARC32_COMPAT) +#define TIME_T uint64_t +#else +#define TIME_T time_t +#endif + +#if defined(__KERNEL__) || (!defined(CONFIG_MIPS32_COMPAT) && !defined(CONFIG_SPARC32_COMPAT)) || (_MIPS_SZLONG==32) +#define POINTER void * +#else +#define POINTER int64_t +#endif + +/* + * Values for the "action" parameter passed to lstat_update. + * ZZZ - do we want a try-success status here??? + */ +#define LSTAT_ACT_NO_WAIT 0 +#define LSTAT_ACT_SPIN 1 +#define LSTAT_ACT_REJECT 2 +#define LSTAT_ACT_WW_SPIN 3 +#define LSTAT_ACT_SLEPT 4 /* UNUSED */ + +#define LSTAT_ACT_MAX_VALUES 4 /* NOTE: Increase to 5 if use ACT_SLEPT */ + +/* + * Special values for the low 2 bits of an RA passed to + * lstat_update. + */ +/* we use these values to figure out what kind of lock data */ +/* is stored in the statistics table entry at index ....... */ +#define LSTAT_RA_SPIN 0 /* spin lock data */ +#define LSTAT_RA_READ 1 /* read lock statistics */ +#define LSTAT_RA_SEMA 2 /* RESERVED */ +#define LSTAT_RA_WRITE 3 /* write lock statistics*/ + +#define LSTAT_RA(n) \ + ((void*)( ((unsigned long)__builtin_return_address(0) & ~3) | n) ) + +/* + * Constants used for lock addresses in the lstat_directory + * to indicate special values of the lock address. + */ +#define LSTAT_MULTI_LOCK_ADDRESS NULL + +/* + * Maximum size of the lockstats tables. Increase this value + * if its not big enough. (Nothing bad happens if its not + * big enough although some locks will not be monitored.) + * We record overflows of this quantity in lstat_control.dir_overflows + * + * Note: The max value here must fit into the field set + * and obtained by the macro's PUT_INDEX() and GET_INDEX(). + * This value depends on how many bits are available in the + * lock word in the particular machine implementation we are on. + */ +#define LSTAT_MAX_STAT_INDEX 2000 + +/* + * Size and mask for the hash table into the directory. + */ +#define LSTAT_HASH_TABLE_SIZE 4096 /* must be 2**N */ +#define LSTAT_HASH_TABLE_MASK (LSTAT_HASH_TABLE_SIZE-1) + +#define DIRHASH(ra) ((unsigned long)(ra)>>2 & LSTAT_HASH_TABLE_MASK) + +/* + * This defines an entry in the lockstat directory. It contains + * information about a lock being monitored. + * A directory entry only contains the lock identification - + * counts on usage of the lock are kept elsewhere in a per-cpu + * data structure to minimize cache line pinging. 
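
lockmeter identifies a lock acquisition by the return address of the caller:
LSTAT_RA() overwrites the low two bits of that address with a tag saying
whether the event was a spin, read or write lock, and DIRHASH() folds the
address into the 4096-entry hash table that fronts the lock directory (the
right shift by 2 discards the tag bits again). A worked example with the
macros as defined above:

	/* Caller return address 0xc0123458 taking a write lock:
	 *
	 *	LSTAT_RA(LSTAT_RA_WRITE) = (0xc0123458 & ~3) | 3 = 0xc012345b
	 *	DIRHASH(0xc0123458)	 = (0xc0123458 >> 2) & 0xfff = 0xd16
	 *
	 * so hashtab[] slot 0xd16 leads to this call site's
	 * lstat_directory_entry_t.
	 */
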
+ */ +typedef struct { + POINTER caller_ra; /* RA of code that set lock */ + POINTER lock_ptr; /* lock address */ + ushort next_stat_index; /* Used to link multiple locks that have the same hash table value */ +} lstat_directory_entry_t; + +/* + * A multi-dimensioned array used to contain counts for lock accesses. + * The array is 3-dimensional: + * - CPU number. Keep from thrashing cache lines between CPUs + * - Directory entry index. Identifies the lock + * - Action. Indicates what kind of contention occurred on an + * access to the lock. + * + * The index of an entry in the directory is the same as the 2nd index + * of the entry in the counts array. + */ +/* + * This table contains data for spin_locks, write locks, and read locks + * Not all data is used for all cases. In particular, the hold time + * information is not stored here for read locks since that is a global + * (e. g. cannot be separated out by return address) quantity. + * See the lstat_read_lock_counts_t structure for the global read lock + * hold time. + */ +typedef struct { + uint64_t cum_wait_ticks; /* sum of wait times */ + /* for write locks, sum of time a */ + /* writer is waiting for a reader */ + int64_t cum_hold_ticks; /* cumulative sum of holds */ + /* not used for read mode locks */ + /* must be signed. ............... */ + uint32_t max_wait_ticks; /* max waiting time */ + uint32_t max_hold_ticks; /* max holding time */ + uint64_t cum_wait_ww_ticks; /* sum times writer waits on writer*/ + uint32_t max_wait_ww_ticks; /* max wait time writer vs writer */ + /* prev 2 only used for write locks*/ + uint32_t acquire_time; /* time lock acquired this CPU */ + uint32_t count[LSTAT_ACT_MAX_VALUES]; +} lstat_lock_counts_t; + +typedef lstat_lock_counts_t lstat_cpu_counts_t[LSTAT_MAX_STAT_INDEX]; + +/* + * User request to: + * - turn statistic collection on/off, or to reset + */ +#define LSTAT_OFF 0 +#define LSTAT_ON 1 +#define LSTAT_RESET 2 +#define LSTAT_RELEASE 3 + +#define LSTAT_MAX_READ_LOCK_INDEX 1000 +typedef struct { + POINTER lock_ptr; /* address of lock for output stats */ + uint32_t read_lock_count; + int64_t cum_hold_ticks; /* sum of read lock hold times over */ + /* all callers. ....................*/ + uint32_t write_index; /* last write lock hash table index */ + uint32_t busy_periods; /* count of busy periods ended this */ + uint64_t start_busy; /* time this busy period started. ..*/ + uint64_t busy_ticks; /* sum of busy periods this lock. ..*/ + uint64_t max_busy; /* longest busy period for this lock*/ + uint32_t max_readers; /* maximum number of readers ...... */ +#ifdef USER_MODE_TESTING + rwlock_t entry_lock; /* lock for this read lock entry... */ + /* avoid having more than one rdr at*/ + /* needed for user space testing... */ + /* not needed for kernel 'cause it */ + /* is non-preemptive. ............. */ +#endif +} lstat_read_lock_counts_t; +typedef lstat_read_lock_counts_t lstat_read_lock_cpu_counts_t[LSTAT_MAX_READ_LOCK_INDEX]; + +#if defined(__KERNEL__) || defined(USER_MODE_TESTING) + +#ifndef USER_MODE_TESTING +#include +#else +#include "asm_newlockmeter.h" +#endif + +/* + * Size and mask for the hash table into the directory. + */ +#define LSTAT_HASH_TABLE_SIZE 4096 /* must be 2**N */ +#define LSTAT_HASH_TABLE_MASK (LSTAT_HASH_TABLE_SIZE-1) + +#define DIRHASH(ra) ((unsigned long)(ra)>>2 & LSTAT_HASH_TABLE_MASK) + +/* + * This version eliminates the per processor lock stack. What we do is to + * store the index of the lock hash structure in unused bits in the lock + * itself. 
Then on unlock we can find the statistics record without doing + * any additional hash or lock stack lookup. This works for spin_locks. + * Hold time reporting is now basically as cheap as wait time reporting + * so we ignore the difference between LSTAT_ON_HOLD and LSTAT_ON_WAIT + * as in version 1.1.* of lockmeter. + * + * For rw_locks, we store the index of a global reader stats structure in + * the lock and the writer index is stored in the latter structure. + * For read mode locks we hash at the time of the lock to find an entry + * in the directory for reader wait time and the like. + * At unlock time for read mode locks, we update just the global structure + * so we don't need to know the reader directory index value at unlock time. + * + */ + +/* + * Protocol to change lstat_control.state + * This is complicated because we don't want the cum_hold_time for + * a rw_lock to be decremented in _read_lock_ without making sure it + * is incremented in _read_lock_ and vice versa. So here is the + * way we change the state of lstat_control.state: + * I. To Turn Statistics On + * After allocating storage, set lstat_control.state non-zero. + * This works because we don't start updating statistics for in use + * locks until the reader lock count goes to zero. + * II. To Turn Statistics Off: + * (0) Disable interrupts on this CPU + * (1) Seize the lstat_control.directory_lock + * (2) Obtain the current value of lstat_control.next_free_read_lock_index + * (3) Store a zero in lstat_control.state. + * (4) Release the lstat_control.directory_lock + * (5) For each lock in the read lock list up to the saved value + * (well, -1) of the next_free_read_lock_index, do the following: + * (a) Check validity of the stored lock address + * by making sure that the word at the saved addr + * has an index that matches this entry. If not + * valid, then skip this entry. + * (b) If there is a write lock already set on this lock, + * skip to (d) below. + * (c) Set a non-metered write lock on the lock + * (d) set the cached INDEX in the lock to zero + * (e) Release the non-metered write lock. + * (6) Re-enable interrupts + * + * These rules ensure that a read lock will not have its statistics + * partially updated even though the global lock recording state has + * changed. See put_lockmeter_info() for implementation. + * + * The reason for (b) is that there may be write locks set on the + * syscall path to put_lockmeter_info() from user space. If we do + * not do this check, then we can deadlock. A similar problem would + * occur if the lock was read locked by the current CPU. At the + * moment this does not appear to happen. + */ + +/* + * Main control structure for lockstat. Used to turn statistics on/off + * and to maintain directory info. + */ +typedef struct { + int state; + spinlock_t control_lock; /* used to serialize turning statistics on/off */ + spinlock_t directory_lock; /* for serialize adding entries to directory */ + volatile int next_free_dir_index;/* next free entry in the directory */ + /* FIXME not all of these fields are used / needed .............. 
*/ + /* the following fields represent data since */ + /* first "lstat on" or most recent "lstat reset" */ + TIME_T first_started_time; /* time when measurement first enabled */ + TIME_T started_time; /* time when measurement last started */ + TIME_T ending_time; /* time when measurement last disabled */ + uint64_t started_cycles64; /* cycles when measurement last started */ + uint64_t ending_cycles64; /* cycles when measurement last disabled */ + uint64_t enabled_cycles64; /* total cycles with measurement enabled */ + int intervals; /* number of measurement intervals recorded */ + /* i. e. number of times did lstat on;lstat off */ + lstat_directory_entry_t *dir; /* directory */ + int dir_overflow; /* count of times ran out of space in directory */ + int rwlock_overflow; /* count of times we couldn't allocate a rw block*/ + ushort *hashtab; /* hash table for quick dir scans */ + lstat_cpu_counts_t *counts[NR_CPUS]; /* Array of pointers to per-cpu stats */ + int next_free_read_lock_index; /* next rwlock reader (global) stats block */ + lstat_read_lock_cpu_counts_t *read_lock_counts[NR_CPUS]; /* per cpu read lock stats */ +} lstat_control_t; + +#endif /* defined(__KERNEL__) || defined(USER_MODE_TESTING) */ + +typedef struct { + short lstat_version; /* version of the data */ + short state; /* the current state is returned */ + int maxcpus; /* Number of cpus present */ + int next_free_dir_index; /* index of the next free directory entry */ + TIME_T first_started_time; /* when measurement enabled for first time */ + TIME_T started_time; /* time in secs since 1969 when stats last turned on */ + TIME_T ending_time; /* time in secs since 1969 when stats last turned off */ + uint32_t cycleval; /* cycles per second */ +#ifdef notyet + void *kernel_magic_addr; /* address of kernel_magic */ + void *kernel_end_addr; /* contents of kernel magic (points to "end") */ +#endif + int next_free_read_lock_index; /* index of next (global) read lock stats struct */ + uint64_t started_cycles64; /* cycles when measurement last started */ + uint64_t ending_cycles64; /* cycles when stats last turned off */ + uint64_t enabled_cycles64; /* total cycles with measurement enabled */ + int intervals; /* number of measurement intervals recorded */ + /* i.e. number of times we did lstat on;lstat off*/ + int dir_overflow; /* number of times we wanted more space in directory */ + int rwlock_overflow; /* # of times we wanted more space in read_locks_count */ + struct new_utsname uts; /* info about machine where stats are measured */ + /* -T option of lockstat allows data to be */ + /* moved to another machine. ................. */ +} lstat_user_request_t; + +#endif /* _LINUX_LOCKMETER_H */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/mm.h 999-mjb/include/linux/mm.h --- 000-virgin/include/linux/mm.h 2003-10-01 11:48:26.000000000 -0700 +++ 999-mjb/include/linux/mm.h 2003-10-02 16:42:18.000000000 -0700 @@ -180,6 +180,7 @@ struct page { struct pte_chain *chain;/* Reverse pte mapping pointer. * protected by PG_chainlock */ pte_addr_t direct; + int mapcount; } pte; unsigned long private; /* mapping-private opaque data */ @@ -616,6 +617,39 @@ extern struct page * follow_page(struct extern int remap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long to, unsigned long size, pgprot_t prot); +/* + * Given a struct page, determine which node's memory it is from. + * TODO: There's probably a more efficient way to do this... 
+ */ +static inline int page_to_nid(struct page *page) +{ + return pfn_to_nid(page_to_pfn(page)); +} + +#ifdef CONFIG_NUMA +static inline void zero_rss(struct mm_struct *mm) +{ + mm->rss = 0; + memset(mm->pernode_rss, 0, MAX_NUMNODES * sizeof(*mm->pernode_rss)); +} + +static inline void inc_rss(struct mm_struct *mm, struct page *page) +{ + mm->rss++; + mm->pernode_rss[page_to_nid(page)]++; +} + +static inline void dec_rss(struct mm_struct *mm, struct page *page) +{ + mm->rss--; + mm->pernode_rss[page_to_nid(page)]--; +} +#else /* !CONFIG_NUMA */ +#define zero_rss(mm) ((mm)->rss = 0) +#define inc_rss(mm, page) ((mm)->rss++) +#define dec_rss(mm, page) ((mm)->rss--) +#endif /* CONFIG_NUMA */ + #ifndef CONFIG_DEBUG_PAGEALLOC static inline void kernel_map_pages(struct page *page, int numpages, int enable) diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/mmzone.h 999-mjb/include/linux/mmzone.h --- 000-virgin/include/linux/mmzone.h 2003-10-01 11:47:13.000000000 -0700 +++ 999-mjb/include/linux/mmzone.h 2003-10-02 16:42:48.000000000 -0700 @@ -307,6 +307,7 @@ extern struct pglist_data contig_page_da #define NODE_DATA(nid) (&contig_page_data) #define NODE_MEM_MAP(nid) mem_map #define MAX_NR_NODES 1 +#define pfn_to_nid(pfn) (0) #else /* CONFIG_DISCONTIGMEM */ #include @@ -369,6 +370,19 @@ static inline unsigned int num_online_me #define num_online_memblks() 1 #endif /* CONFIG_DISCONTIGMEM || CONFIG_NUMA */ + +static inline struct zonelist *get_node_zonelist(int nid, int gfp_mask) +{ + return NODE_DATA(nid)->node_zonelists + (gfp_mask & GFP_ZONEMASK); +} + +#define get_zonelist(gfp_mask) get_node_zonelist(numa_node_id(), gfp_mask) + +/* Structure to keep track of memory segment (VMA) bindings */ +struct binding { + struct zonelist zonelist; +}; + #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _LINUX_MMZONE_H */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/module.h 999-mjb/include/linux/module.h --- 000-virgin/include/linux/module.h 2003-07-28 15:33:25.000000000 -0700 +++ 999-mjb/include/linux/module.h 2003-10-02 16:43:03.000000000 -0700 @@ -257,6 +257,11 @@ struct module /* The command line arguments (may be mangled). People like keeping pointers to this stuff */ char *args; + +#ifdef CONFIG_GCOV_PROFILE + const char *ctors_start; /* Pointer to start of .ctors-section */ + const char *ctors_end; /* Pointer to end of .ctors-section */ +#endif }; /* FIXME: It'd be nice to isolate modules during init, too, so they diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/page-flags.h 999-mjb/include/linux/page-flags.h --- 000-virgin/include/linux/page-flags.h 2003-10-01 11:47:13.000000000 -0700 +++ 999-mjb/include/linux/page-flags.h 2003-10-02 16:39:41.000000000 -0700 @@ -75,6 +75,7 @@ #define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ #define PG_reclaim 18 /* To be reclaimed asap */ #define PG_compound 19 /* Part of a compound page */ +#define PG_anon 20 /* Anonymous page */ /* @@ -269,6 +270,10 @@ extern void get_full_page_state(struct p #define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) #define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) +#define PageAnon(page) test_bit(PG_anon, &(page)->flags) +#define SetPageAnon(page) set_bit(PG_anon, &(page)->flags) +#define ClearPageAnon(page) clear_bit(PG_anon, &(page)->flags) + /* * The PageSwapCache predicate doesn't use a PG_flag at this time, * but it may again do so one day. 
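The zero_rss()/inc_rss()/dec_rss() helpers in the mm.h hunk above route all RSS accounting through one place so that CONFIG_NUMA kernels can keep a per-node breakdown in mm->pernode_rss[] next to the global mm->rss, while !CONFIG_NUMA builds collapse to the old plain arithmetic. The patch converts existing callers to them (for example zero_rss() in dup_mmap() further down); the fragment below is only an illustrative sketch of the calling convention, not code from the patch:

/*
 * Illustrative only: how a converted page-table update path charges
 * and uncharges resident pages with the helpers above.
 */
static inline void example_account_mapped(struct mm_struct *mm, struct page *page)
{
	inc_rss(mm, page);	/* mm->rss++ and, on NUMA, pernode_rss[page_to_nid(page)]++ */
}

static inline void example_account_unmapped(struct mm_struct *mm, struct page *page)
{
	dec_rss(mm, page);	/* undo both counters */
}

Because the !CONFIG_NUMA variants are simple macros on mm->rss, callers need no #ifdef CONFIG_NUMA of their own.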
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/pagemap.h 999-mjb/include/linux/pagemap.h --- 000-virgin/include/linux/pagemap.h 2003-10-01 11:41:17.000000000 -0700 +++ 999-mjb/include/linux/pagemap.h 2003-10-02 16:41:14.000000000 -0700 @@ -50,14 +50,37 @@ static inline void mapping_set_gfp_mask( #define page_cache_release(page) put_page(page) void release_pages(struct page **pages, int nr, int cold); +#ifndef CONFIG_NUMA + +static inline struct page *__page_cache_alloc(struct address_space *x, int gfp_mask) +{ + return alloc_pages(gfp_mask, 0); +} + +#else /* CONFIG_NUMA */ + +static inline struct page *__page_cache_alloc(struct address_space *x, int gfp_mask) +{ + struct zonelist *zonelist; + + if (!x->binding) + zonelist = get_zonelist(gfp_mask); + else + zonelist = &x->binding->zonelist; + + return __alloc_pages(gfp_mask, 0, zonelist); +} + +#endif /* !CONFIG_NUMA */ + static inline struct page *page_cache_alloc(struct address_space *x) { - return alloc_pages(mapping_gfp_mask(x), 0); + return __page_cache_alloc(x, mapping_gfp_mask(x)); } static inline struct page *page_cache_alloc_cold(struct address_space *x) { - return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0); + return __page_cache_alloc(x, mapping_gfp_mask(x)|__GFP_COLD); } typedef int filler_t(void *, struct page *); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/pci.h 999-mjb/include/linux/pci.h --- 000-virgin/include/linux/pci.h 2003-10-01 11:41:17.000000000 -0700 +++ 999-mjb/include/linux/pci.h 2003-10-02 16:39:49.000000000 -0700 @@ -461,10 +461,10 @@ struct pci_bus { void *sysdata; /* hook for sys-specific extension */ struct proc_dir_entry *procdir; /* directory entry in /proc/bus/pci */ - unsigned char number; /* bus number */ - unsigned char primary; /* number of primary bridge */ - unsigned char secondary; /* number of secondary bridge */ - unsigned char subordinate; /* max number of subordinate buses */ + unsigned int number; /* bus number */ + unsigned int primary; /* number of primary bridge */ + unsigned int secondary; /* number of secondary bridge */ + unsigned int subordinate; /* max number of subordinate buses */ char name[48]; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/sched.h 999-mjb/include/linux/sched.h --- 000-virgin/include/linux/sched.h 2003-10-01 11:48:26.000000000 -0700 +++ 999-mjb/include/linux/sched.h 2003-10-02 16:42:18.000000000 -0700 @@ -71,7 +71,11 @@ struct exec_domain; * the EXP_n values would be 1981, 2034 and 2043 if still using only * 11 bit fractions. */ -extern unsigned long avenrun[]; /* Load averages */ +extern unsigned long avenrun[]; /* Load averages */ +extern unsigned long tasks_running[3]; /* Real load averages */ +DECLARE_PER_CPU(unsigned long[3],cpu_tasks_running); /* Real load averages per cpu */ + +extern unsigned long tasks_running[]; /* Real load averages */ #define FSHIFT 11 /* nr of bits of precision */ #define FIXED_1 (1<rss */ + spinlock_t page_table_lock; /* Protects task page tables and RSS data */ struct list_head mmlist; /* List of all active mm's. 
These are globally strung * together off init_mm.mmlist, and are protected @@ -202,7 +207,11 @@ struct mm_struct { unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; - unsigned long rss, total_vm, locked_vm; + unsigned long total_vm, locked_vm; + unsigned long rss; +#ifdef CONFIG_NUMA + unsigned long pernode_rss[MAX_NUMNODES]; +#endif unsigned long def_flags; cpumask_t cpu_vm_mask; unsigned long swap_address; @@ -510,7 +519,7 @@ static inline int set_cpus_allowed(task_ extern unsigned long long sched_clock(void); -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED extern void sched_balance_exec(void); extern void node_nr_running_init(void); #else diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/spinlock.h 999-mjb/include/linux/spinlock.h --- 000-virgin/include/linux/spinlock.h 2003-07-02 14:45:00.000000000 -0700 +++ 999-mjb/include/linux/spinlock.h 2003-10-02 16:39:44.000000000 -0700 @@ -184,6 +184,17 @@ typedef struct { #endif /* !SMP */ +#ifdef CONFIG_LOCKMETER +extern void _metered_spin_lock (spinlock_t *lock); +extern void _metered_spin_unlock (spinlock_t *lock); +extern int _metered_spin_trylock(spinlock_t *lock); +extern void _metered_read_lock (rwlock_t *lock); +extern void _metered_read_unlock (rwlock_t *lock); +extern void _metered_write_lock (rwlock_t *lock); +extern void _metered_write_unlock (rwlock_t *lock); +extern int _metered_write_trylock(rwlock_t *lock); +#endif + /* * Define the various spin_lock and rw_lock methods. Note we define these * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various @@ -389,6 +400,141 @@ do { \ _raw_spin_trylock(lock) ? 1 : \ ({preempt_enable(); local_bh_enable(); 0;});}) +#ifdef CONFIG_LOCKMETER +#undef spin_lock +#undef spin_trylock +#undef spin_unlock +#undef spin_lock_irqsave +#undef spin_lock_irq +#undef spin_lock_bh +#undef read_lock +#undef read_unlock +#undef write_lock +#undef write_unlock +#undef write_trylock +#undef spin_unlock_bh +#undef read_lock_irqsave +#undef read_lock_irq +#undef read_lock_bh +#undef read_unlock_bh +#undef write_lock_irqsave +#undef write_lock_irq +#undef write_lock_bh +#undef write_unlock_bh + +#define spin_lock(lock) \ +do { \ + preempt_disable(); \ + _metered_spin_lock(lock); \ +} while(0) + +#define spin_trylock(lock) ({preempt_disable(); _metered_spin_trylock(lock) ? \ + 1 : ({preempt_enable(); 0;});}) +#define spin_unlock(lock) \ +do { \ + _metered_spin_unlock(lock); \ + preempt_enable(); \ +} while (0) + +#define spin_lock_irqsave(lock, flags) \ +do { \ + local_irq_save(flags); \ + preempt_disable(); \ + _metered_spin_lock(lock); \ +} while (0) + +#define spin_lock_irq(lock) \ +do { \ + local_irq_disable(); \ + preempt_disable(); \ + _metered_spin_lock(lock); \ +} while (0) + +#define spin_lock_bh(lock) \ +do { \ + local_bh_disable(); \ + preempt_disable(); \ + _metered_spin_lock(lock); \ +} while (0) + +#define spin_unlock_bh(lock) \ +do { \ + _metered_spin_unlock(lock); \ + preempt_enable(); \ + local_bh_enable(); \ +} while (0) + + +#define read_lock(lock) ({preempt_disable(); _metered_read_lock(lock);}) +#define read_unlock(lock) ({_metered_read_unlock(lock); preempt_enable();}) +#define write_lock(lock) ({preempt_disable(); _metered_write_lock(lock);}) +#define write_unlock(lock) ({_metered_write_unlock(lock); preempt_enable();}) +#define write_trylock(lock) ({preempt_disable();_metered_write_trylock(lock) ? 
\ + 1 : ({preempt_enable(); 0;});}) +#define spin_unlock_no_resched(lock) \ +do { \ + _metered_spin_unlock(lock); \ + preempt_enable_no_resched(); \ +} while (0) + +#define read_lock_irqsave(lock, flags) \ +do { \ + local_irq_save(flags); \ + preempt_disable(); \ + _metered_read_lock(lock); \ +} while (0) + +#define read_lock_irq(lock) \ +do { \ + local_irq_disable(); \ + preempt_disable(); \ + _metered_read_lock(lock); \ +} while (0) + +#define read_lock_bh(lock) \ +do { \ + local_bh_disable(); \ + preempt_disable(); \ + _metered_read_lock(lock); \ +} while (0) + +#define read_unlock_bh(lock) \ +do { \ + _metered_read_unlock(lock); \ + preempt_enable(); \ + local_bh_enable(); \ +} while (0) + +#define write_lock_irqsave(lock, flags) \ +do { \ + local_irq_save(flags); \ + preempt_disable(); \ + _metered_write_lock(lock); \ +} while (0) + +#define write_lock_irq(lock) \ +do { \ + local_irq_disable(); \ + preempt_disable(); \ + _metered_write_lock(lock); \ +} while (0) + +#define write_lock_bh(lock) \ +do { \ + local_bh_disable(); \ + preempt_disable(); \ + _metered_write_lock(lock); \ +} while (0) + +#define write_unlock_bh(lock) \ +do { \ + _metered_write_unlock(lock); \ + preempt_enable(); \ + local_bh_enable(); \ +} while (0) + +#endif /* !CONFIG_LOCKMETER */ + /* "lock on reference count zero" */ #ifndef ATOMIC_DEC_AND_LOCK #include diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/swap.h 999-mjb/include/linux/swap.h --- 000-virgin/include/linux/swap.h 2003-10-01 11:48:26.000000000 -0700 +++ 999-mjb/include/linux/swap.h 2003-10-02 16:39:41.000000000 -0700 @@ -185,6 +185,8 @@ struct pte_chain *FASTCALL(page_add_rmap void FASTCALL(page_remove_rmap(struct page *, pte_t *)); int FASTCALL(try_to_unmap(struct page *)); +int page_convert_anon(struct page *); + /* linux/mm/shmem.c */ extern int shmem_unuse(swp_entry_t entry, struct page *page); #else diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/sysctl.h 999-mjb/include/linux/sysctl.h --- 000-virgin/include/linux/sysctl.h 2003-10-01 11:47:14.000000000 -0700 +++ 999-mjb/include/linux/sysctl.h 2003-10-02 16:39:40.000000000 -0700 @@ -61,7 +61,8 @@ enum CTL_DEV=7, /* Devices */ CTL_BUS=8, /* Busses */ CTL_ABI=9, /* Binary emulation */ - CTL_CPU=10 /* CPU stuff (speed scaling, etc) */ + CTL_CPU=10, /* CPU stuff (speed scaling, etc) */ + CTL_SCHED=11, /* scheduler tunables */ }; /* CTL_BUS names: */ @@ -156,6 +157,21 @@ enum VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */ }; +/* Tunable scheduler parameters in /proc/sys/sched/ */ +enum { + SCHED_MIN_TIMESLICE=1, /* minimum process timeslice */ + SCHED_MAX_TIMESLICE=2, /* maximum process timeslice */ + SCHED_CHILD_PENALTY=3, /* penalty on fork to child */ + SCHED_PARENT_PENALTY=4, /* penalty on fork to parent */ + SCHED_EXIT_WEIGHT=5, /* penalty to parent of CPU hog child */ + SCHED_PRIO_BONUS_RATIO=6, /* percent of max prio given as bonus */ + SCHED_INTERACTIVE_DELTA=7, /* delta used to scale interactivity */ + SCHED_MAX_SLEEP_AVG=8, /* maximum sleep avg attainable */ + SCHED_STARVATION_LIMIT=9, /* no re-active if expired is starved */ + SCHED_NODE_THRESHOLD=10, /* NUMA node rebalance threshold */ + SCHED_IDLE_NODE_REBALANCE_RATIO=11, /* how often to global balance */ + SCHED_BUSY_NODE_REBALANCE_RATIO=12, /* how often to global balance */ +}; /* CTL_NET names: */ enum diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/timex.h 999-mjb/include/linux/timex.h --- 000-virgin/include/linux/timex.h 2003-06-24 16:43:14.000000000 -0700 +++ 
999-mjb/include/linux/timex.h 2003-10-02 16:39:36.000000000 -0700 @@ -78,7 +78,7 @@ #elif HZ >= 768 && HZ < 1536 # define SHIFT_HZ 10 #else -# error You lose. +# error Please use a HZ value which is between 12 and 1536 #endif /* diff -purN -X /home/mbligh/.diff.exclude 000-virgin/init/main.c 999-mjb/init/main.c --- 000-virgin/init/main.c 2003-10-01 11:48:27.000000000 -0700 +++ 999-mjb/init/main.c 2003-10-02 16:43:03.000000000 -0700 @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -113,6 +114,10 @@ char *execute_command; /* Setup configured maximum number of CPUs to activate */ static unsigned int max_cpus = NR_CPUS; +#if defined(CONFIG_GCOV_PROFILE) && (defined(CONFIG_PPC32) || defined(CONFIG_PPC64)) +void __bb_fork_func (void) { } +#endif + /* * Setup routine for controlling SMP activation * @@ -387,6 +392,8 @@ asmlinkage void __init start_kernel(void */ lock_kernel(); printk(linux_banner); + setup_early_printk(); + setup_arch(&command_line); setup_per_zone_pages_min(); setup_per_cpu_areas(); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/ipc/shm.c 999-mjb/ipc/shm.c --- 000-virgin/ipc/shm.c 2003-10-01 11:47:15.000000000 -0700 +++ 999-mjb/ipc/shm.c 2003-10-02 16:53:55.000000000 -0700 @@ -380,9 +380,9 @@ static void shm_get_stat(unsigned long * if (is_file_hugepages(shp->shm_file)) { struct address_space *mapping = inode->i_mapping; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); *rss += (HPAGE_SIZE/PAGE_SIZE)*mapping->nrpages; - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); } else { struct shmem_inode_info *info = SHMEM_I(inode); spin_lock(&info->lock); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/Makefile 999-mjb/kernel/Makefile --- 000-virgin/kernel/Makefile 2003-10-01 11:48:27.000000000 -0700 +++ 999-mjb/kernel/Makefile 2003-10-02 16:43:03.000000000 -0700 @@ -8,9 +8,16 @@ obj-y = sched.o fork.o exec_domain.o signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o intermodule.o extable.o params.o posix-timers.o +ifdef CONFIG_GCOV_PROFILE +obj-y += gcov.o +export-objs += gcov.o +CFLAGS_gcov.o := -DGCOV_PATH='"$(TOPDIR)"' +endif + obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o +obj-$(CONFIG_LOCKMETER) += lockmeter.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += ksyms.o module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o @@ -19,6 +26,7 @@ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_IKCONFIG_PROC) += configs.o +obj-$(CONFIG_X86_EARLY_PRINTK) += early_printk.o ifneq ($(CONFIG_IA64),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/early_printk.c 999-mjb/kernel/early_printk.c --- 000-virgin/kernel/early_printk.c 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/kernel/early_printk.c 2003-10-02 16:39:35.000000000 -0700 @@ -0,0 +1,218 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +/* Simple VGA output */ + +#define MAX_YPOS 25 +#define MAX_XPOS 80 + +static int current_ypos = 1, current_xpos = 0; + +static void early_vga_write(struct console *con, const char *str, unsigned n) +{ + char c; + int i, k, j; + + while ((c = *str++) != '\0' && n-- > 0) { + if (current_ypos >= MAX_YPOS) { + /* scroll 1 line up */ + for(k = 1, j = 0; k < MAX_YPOS; k++, j++) { + for(i = 0; i < MAX_XPOS; i++) { + writew(readw(VGABASE + 2*(MAX_XPOS*k + i)), + VGABASE + 2*(MAX_XPOS*j + i)); + 
} + } + for(i = 0; i < MAX_XPOS; i++) { + writew(0x720, VGABASE + 2*(MAX_XPOS*j + i)); + } + current_ypos = MAX_YPOS-1; + } + if (c == '\n') { + current_xpos = 0; + current_ypos++; + } else if (c != '\r') { + writew(((0x7 << 8) | (unsigned short) c), + VGABASE + 2*(MAX_XPOS*current_ypos + current_xpos++)); + if (current_xpos >= MAX_XPOS) { + current_xpos = 0; + current_ypos++; + } + } + } +} + +static struct console early_vga_console = { + .name = "earlyvga", + .write = early_vga_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Serial functions losely based on a similar package from Klaus P. Gerlicher */ + +int early_serial_base; /* ttyS0 */ + +static int early_serial_putc(unsigned char ch) +{ + unsigned timeout = 0xffff; + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + rep_nop(); + outb(ch, early_serial_base + TXR); + return timeout ? 0 : -1; +} + +static void early_serial_write(struct console *con, const char *s, unsigned n) +{ + while (*s && n-- > 0) { + early_serial_putc(*s); + if (*s == '\n') + early_serial_putc('\r'); + s++; + } +} + +static __init void early_serial_init(char *opt) +{ + unsigned char c; + unsigned divisor, baud = DEFAULT_BAUD; + static int bases[] = SERIAL_BASES; + char *s, *e; + + early_serial_base = bases[0]; + + if (*opt == ',') + ++opt; + + s = strsep(&opt, ","); + if (s != NULL) { + unsigned port; + if (!strncmp(s,"0x",2)) + early_serial_base = simple_strtoul(s, &e, 16); + else { + if (!strncmp(s,"ttyS",4)) + s+=4; + port = simple_strtoul(s, &e, 10); + if (port > (SERIAL_BASES_LEN-1) || s == e) + port = 0; + early_serial_base = bases[port]; + } + } + + outb(0x3, early_serial_base + LCR); /* 8n1 */ + outb(0, early_serial_base + IER); /* no interrupt */ + outb(0, early_serial_base + FCR); /* no fifo */ + outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + + s = strsep(&opt, ","); + if (s != NULL) { + baud = simple_strtoul(s, &e, 0); + if (baud == 0 || s == e) + baud = DEFAULT_BAUD; + } + + divisor = 115200 / baud; + c = inb(early_serial_base + LCR); + outb(c | DLAB, early_serial_base + LCR); + outb(divisor & 0xff, early_serial_base + DLL); + outb((divisor >> 8) & 0xff, early_serial_base + DLH); + outb(c & ~DLAB, early_serial_base + LCR); +} + +static struct console early_serial_console = { + .name = "earlyser", + .write = early_serial_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Direct interface for emergencies */ +struct console *early_console = &early_vga_console; +static int early_console_initialized = 0; + +void early_printk(const char *fmt, ...) 
+{ + char buf[512]; + int n; + va_list ap; + va_start(ap,fmt); + n = vsnprintf(buf,512,fmt,ap); + early_console->write(early_console,buf,n); + va_end(ap); +} + +static int keep_early; + +int __init setup_early_printk(void) +{ + char *space, *s; + char buf[256]; + char cmd[COMMAND_LINE_SIZE]; + char *opt; + + /* Get our own copy of the cmd line */ + memcpy(cmd, COMMAND_LINE, COMMAND_LINE_SIZE); + cmd[COMMAND_LINE_SIZE-1] = '\0'; + opt = cmd; + + s = strstr(opt, "earlyprintk="); + if (s == NULL) + return -1; + opt = s+12; + + if (early_console_initialized) + return -1; + + strncpy(buf,opt,256); + buf[255] = 0; + space = strchr(buf, ' '); + if (space) + *space = 0; + + if (strstr(buf,"keep")) + keep_early = 1; + + if (!strncmp(buf, "serial", 6)) { + early_serial_init(buf + 6); + early_console = &early_serial_console; + } else if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf); + early_console = &early_serial_console; + } else if (!strncmp(buf, "vga", 3)) { + early_console = &early_vga_console; + } else { + early_console = NULL; + return -1; + } + early_console_initialized = 1; + register_console(early_console); + printk("early printk console registered\n"); + return 0; +} + +void __init disable_early_printk(void) +{ + if (!early_console_initialized || !early_console) + return; + if (!keep_early) { + printk("disabling early console...\n"); + unregister_console(early_console); + early_console_initialized = 0; + } else { + printk("keeping early console.\n"); + } +} + +/* syntax: earlyprintk=vga + earlyprintk=serial[,ttySn[,baudrate]] + Append ,keep to not disable it when the real console takes over. + Only vga or serial at a time, not both. + Currently only ttyS0 and ttyS1 are supported. + Interaction with the standard serial driver is not very good. + The VGA output is eventually overwritten by the real console. */ +__setup("earlyprintk=", setup_early_printk); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/fork.c 999-mjb/kernel/fork.c --- 000-virgin/kernel/fork.c 2003-10-01 11:48:27.000000000 -0700 +++ 999-mjb/kernel/fork.c 2003-10-02 16:42:18.000000000 -0700 @@ -232,7 +232,7 @@ static inline int dup_mmap(struct mm_str mm->mmap_cache = NULL; mm->free_area_cache = TASK_UNMAPPED_BASE; mm->map_count = 0; - mm->rss = 0; + zero_rss(mm); cpus_clear(mm->cpu_vm_mask); pprev = &mm->mmap; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/gcov.c 999-mjb/kernel/gcov.c --- 000-virgin/kernel/gcov.c 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/kernel/gcov.c 2003-10-02 16:43:03.000000000 -0700 @@ -0,0 +1,158 @@ +/* + * Coverage support under Linux + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (c) International Business Machines Corp., 2002 + * + * Author: Hubertus Franke + * Rajan Ravindran + * + * Modified by + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct bb +{ + long zero_word; + const char *filename; + long *counts; + long ncounts; + struct bb *next; + const unsigned long *addresses; + + /* Older GCC's did not emit these fields. */ + long nwords; + const char **functions; + const long *line_nums; + const char **filenames; + char *flags; +}; + +struct bb *bb_head; +struct module *bb_context_address; +void (*gcov_callback)(int cmd, struct bb *bbptr) = NULL; + +#ifdef GCOV_PATH +char *gcov_kernelpath = GCOV_PATH; +#else +char *gcov_kernelpath = __FILE__; +#endif + + +void +__bb_init_func (struct bb *blocks) +{ + if (blocks->zero_word) + return; + + /* Set up linked list. */ + blocks->zero_word = 1; + + /* Store the address of the module of which this object-file is a part + of (set in do_global_ctors). */ + blocks->addresses = (unsigned long *) bb_context_address; + + blocks->next = bb_head; + bb_head = blocks; + + if (gcov_callback && bb_context_address) + (*gcov_callback)(1,blocks); +} + +/* Call constructors for all kernel objects and dynamic modules. This function + * is called both during module initialization and when the gcov kernel + * module is insmod'ed. The list of constructors is compiled into the + * kernel at &__CTOR_LIST__ to &__DTOR_LIST__ (labels are defined in + * head.S). In the case of a dynamic module the list is located at + * ctors_start to ctors_end. + * + * The constructors in turn call __bb_init_func, reporting the respective + * struct bb for each object file. + */ + +void +do_global_ctors (char *ctors_start, char *ctors_end, struct module *addr, int mod_flag) +{ + extern char __CTOR_LIST__; + extern char __DTOR_LIST__; + typedef void (*func_ptr)(void) ; + func_ptr *constructor_ptr=NULL; + + if (!mod_flag) { + /* Set start and end ptr from global kernel constructor list. */ + ctors_start = &__CTOR_LIST__; + ctors_end = &__DTOR_LIST__; + bb_context_address = NULL; + } else { + /* Set context to current module address. */ + bb_context_address = addr; + } + + if (!ctors_start) + return; + + /* Call all constructor functions until either the end of the + list is reached or until a NULL is encountered. */ + for (constructor_ptr = (func_ptr *) ctors_start; + (constructor_ptr != (func_ptr *) ctors_end) && + (*constructor_ptr != NULL); + constructor_ptr++) { + (*constructor_ptr) (); + } +} + + +/* When a module is unloaded, this function is called to remove + * the respective bb entries from our list. context specifies + * the address of the module that is unloaded. 
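+ * Matching is done on bb->addresses, which __bb_init_func() stamped
+ * with the owning module's address (bb_context_address, set up by
+ * do_global_ctors) when the module's constructors ran; one pass over
+ * the singly linked bb_head list therefore unlinks every object file
+ * the module registered, notifying gcov_callback for each entry.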
*/ + +void +remove_bb_link (struct module *context) +{ + struct bb *bbptr; + struct bb *prev = NULL; + + /* search for all the module's bbptrs */ + for (bbptr = bb_head; bbptr ; bbptr = bbptr->next) { + if (bbptr->addresses == (unsigned long *) context) { + if (gcov_callback) + (*gcov_callback)(0,bbptr); + if (prev == NULL) + bb_head = bbptr->next; + else + prev->next = bbptr->next; + } + else + prev = bbptr; + } +} + +EXPORT_SYMBOL(bb_head); +EXPORT_SYMBOL(__bb_init_func); +EXPORT_SYMBOL(do_global_ctors); +EXPORT_SYMBOL(gcov_kernelpath); +EXPORT_SYMBOL(gcov_callback); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/ksyms.c 999-mjb/kernel/ksyms.c --- 000-virgin/kernel/ksyms.c 2003-10-01 11:48:27.000000000 -0700 +++ 999-mjb/kernel/ksyms.c 2003-10-02 16:39:44.000000000 -0700 @@ -607,6 +607,16 @@ EXPORT_SYMBOL(__per_cpu_offset); EXPORT_SYMBOL(set_fs_pwd); EXPORT_SYMBOL(set_fs_root); +#if defined(CONFIG_LOCKMETER) +EXPORT_SYMBOL(_metered_spin_lock); +EXPORT_SYMBOL(_metered_spin_unlock); +EXPORT_SYMBOL(_metered_spin_trylock); +EXPORT_SYMBOL(_metered_read_lock); +EXPORT_SYMBOL(_metered_read_unlock); +EXPORT_SYMBOL(_metered_write_lock); +EXPORT_SYMBOL(_metered_write_unlock); +#endif + /* debug */ EXPORT_SYMBOL(dump_stack); EXPORT_SYMBOL(ptrace_notify); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/lockmeter.c 999-mjb/kernel/lockmeter.c --- 000-virgin/kernel/lockmeter.c 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/kernel/lockmeter.c 2003-10-02 16:39:44.000000000 -0700 @@ -0,0 +1,1088 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.c by Jack Steiner (steiner@sgi.com) + * + * Modified by Ray Bryant (raybry@us.ibm.com) + * Changes Copyright (C) 2000 IBM, Inc. + * Added save of index in spinlock_t to improve efficiency + * of "hold" time reporting for spinlocks + * Added support for hold time statistics for read and write + * locks. + */ + +#ifdef __KERNEL__ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#else +#define __SMP__ +#include +#include +#include +#include "bitops.h" +#include "user_scaffold.h" +#include +#include +#include "newlockmeter.h" +#endif + +#ifdef __KERNEL__ +#define ASSERT(cond) +#define bzero(loc,size) memset(loc,0,size) +#endif + +/*<---------------------------------------------------*/ +/* lockmeter.c */ +/*>---------------------------------------------------*/ + +#ifdef __KERNEL__ +static lstat_control_t lstat_control __cacheline_aligned = {LSTAT_OFF, SPIN_LOCK_UNLOCKED, SPIN_LOCK_UNLOCKED, 19*0, NR_CPUS*0, 0, NR_CPUS*0}; +#else +lstat_control_t lstat_control = {LSTAT_OFF, SPIN_LOCK_UNLOCKED, SPIN_LOCK_UNLOCKED, 19*0, NR_CPUS*0, 0, NR_CPUS*0}; +#endif + +int smp_num_cpus=NR_CPUS; + +#undef BUG +#define BUG() + +static ushort lstat_make_dir_entry(void *, void *); + +/* + * lstat_lookup + * + * Given a RA, locate the directory entry for the lock. 
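+ *
+ * The caller's return address is hashed with DIRHASH() and the
+ * next_stat_index chain is walked until caller_ra matches; index 0
+ * terminates the chain, in which case lstat_make_dir_entry() allocates
+ * a new slot (or bumps dir_overflow when the directory is full).  If
+ * the same return address is later seen with a different lock address,
+ * lock_ptr is reset to LSTAT_MULTI_LOCK_ADDRESS (NULL), marking the
+ * entry as covering more than one lock.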
+ */ +static ushort +lstat_lookup( + void *lock_ptr, + void *caller_ra) +{ + ushort index; + lstat_directory_entry_t *dirp; + + dirp = lstat_control.dir; + + index = lstat_control.hashtab[DIRHASH(caller_ra)]; + while (dirp[index].caller_ra != caller_ra) { + if (index == 0) { + return(lstat_make_dir_entry(lock_ptr, caller_ra)); + } + index = dirp[index].next_stat_index; + } + + if (dirp[index].lock_ptr != NULL && + dirp[index].lock_ptr != lock_ptr) { + dirp[index].lock_ptr = NULL; + } + + return(index); +} + + +/* + * lstat_make_dir_entry + * Called to add a new lock to the lock directory. + */ +static ushort +lstat_make_dir_entry( + void *lock_ptr, + void *caller_ra) +{ + lstat_directory_entry_t *dirp; + ushort index, hindex; + unsigned long flags; + + /* lock the table without recursively reentering this metering code */ + do { local_irq_save(flags); + _raw_spin_lock(&lstat_control.directory_lock); } while(0); + + hindex = DIRHASH(caller_ra); + index = lstat_control.hashtab[hindex]; + dirp = lstat_control.dir; + while (index && dirp[index].caller_ra != caller_ra) + index = dirp[index].next_stat_index; + + if (index == 0) { + if(lstat_control.next_free_dir_index < LSTAT_MAX_STAT_INDEX) { + index = lstat_control.next_free_dir_index++; + lstat_control.dir[index].caller_ra = caller_ra; + lstat_control.dir[index].lock_ptr = lock_ptr; + lstat_control.dir[index].next_stat_index = lstat_control.hashtab[hindex]; + lstat_control.hashtab[hindex] = index; + } else { + lstat_control.dir_overflow++; + } + } + + do { _raw_spin_unlock(&lstat_control.directory_lock); + local_irq_restore(flags);} while(0); + return(index); +} + +int +lstat_update ( + void *lock_ptr, + void *caller_ra, + int action) +{ + int index; + int cpu; + + ASSERT(action < LSTAT_ACT_MAX_VALUES); + + if (lstat_control.state == LSTAT_OFF) { + return(0); + } + + index = lstat_lookup(lock_ptr, caller_ra); + cpu = THIS_CPU_NUMBER; + (*lstat_control.counts[cpu])[index].count[action]++; + (*lstat_control.counts[cpu])[index].acquire_time = get_cycles(); + + return(index); +} + +int +lstat_update_time ( + void *lock_ptr, + void *caller_ra, + int action, + uint32_t ticks) +{ + ushort index; + int cpu; + + ASSERT(action < LSTAT_ACT_MAX_VALUES); + + if (lstat_control.state == LSTAT_OFF) { + return(0); + } + + index = lstat_lookup(lock_ptr, caller_ra); + cpu = THIS_CPU_NUMBER; + (*lstat_control.counts[cpu])[index].count[action]++; + (*lstat_control.counts[cpu])[index].cum_wait_ticks += (uint64_t)ticks; + if ((*lstat_control.counts[cpu])[index].max_wait_ticks < ticks) + (*lstat_control.counts[cpu])[index].max_wait_ticks = ticks; + + (*lstat_control.counts[cpu])[index].acquire_time = get_cycles(); + + return(index); +} + +void _metered_spin_lock(spinlock_t *lock_ptr) +{ + if (lstat_control.state == LSTAT_OFF) { + _raw_spin_lock(lock_ptr); /* do the real lock */ + PUT_INDEX(lock_ptr,0); /* clean index in case lockmetering */ + /* gets turned on before unlock */ + } else { + void *this_pc = LSTAT_RA(LSTAT_RA_SPIN); + int index; + + if (_raw_spin_trylock(lock_ptr)) { + index = lstat_update(lock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + } else { + uint32_t start_cycles = get_cycles(); + _raw_spin_lock(lock_ptr); /* do the real lock */ + index = lstat_update_time(lock_ptr, this_pc, LSTAT_ACT_SPIN, + get_cycles() - start_cycles); + } + /* save the index in the lock itself for use in spin unlock */ + PUT_INDEX(lock_ptr,index); + } +} + +int _metered_spin_trylock(spinlock_t *lock_ptr) +{ + if (lstat_control.state == LSTAT_OFF) { + return 
_raw_spin_trylock(lock_ptr); + } else { + int retval; + void *this_pc = LSTAT_RA(LSTAT_RA_SPIN); + + if ((retval = _raw_spin_trylock(lock_ptr))) { + int index = lstat_update(lock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + /* save the index in the lock itself for use in spin unlock */ + PUT_INDEX(lock_ptr,index); + } else { + lstat_update(lock_ptr, this_pc, LSTAT_ACT_REJECT); + } + + return retval; + } +} + +void _metered_spin_unlock(spinlock_t *lock_ptr) +{ + int index=-1; + + if (lstat_control.state != LSTAT_OFF) { + index = GET_INDEX(lock_ptr); + /* + * If statistics were turned off when we set the lock, + * then the index can be zero. If that is the case, + * then collect no stats on this call. + */ + if (index > 0) { + uint32_t hold_time; + int cpu = THIS_CPU_NUMBER; + hold_time = get_cycles() - (*lstat_control.counts[cpu])[index].acquire_time; + (*lstat_control.counts[cpu])[index].cum_hold_ticks += (uint64_t)hold_time; + if ((*lstat_control.counts[cpu])[index].max_hold_ticks < hold_time) + (*lstat_control.counts[cpu])[index].max_hold_ticks = hold_time; + } + } + + /* make sure we don't have a stale index value saved */ + PUT_INDEX(lock_ptr,0); + _raw_spin_unlock(lock_ptr); /* do the real unlock */ +} + +/* + * allocate the next global read lock structure and store its index + * in the rwlock at "lock_ptr". + */ +uint32_t alloc_rwlock_struct(rwlock_t *rwlock_ptr) +{ + int index; + int flags; + int cpu=THIS_CPU_NUMBER; + + /* If we've already overflowed, then do a quick exit */ + if (lstat_control.next_free_read_lock_index > LSTAT_MAX_READ_LOCK_INDEX) { + lstat_control.rwlock_overflow++; + return(0); + } + + do { local_irq_save(flags); + _raw_spin_lock(&lstat_control.directory_lock); } while(0); + + /* It is possible this changed while we were waiting for the directory_lock */ + if (lstat_control.state == LSTAT_OFF) { + index=0; + goto unlock; + } + + /* It is possible someone else got here first and set the index */ + if ((index=GET_RWINDEX(rwlock_ptr)) == 0) { + + /* we can't turn on read stats for this lock while there are readers */ + /* (this would mess up the running hold time sum at unlock time) */ + if (RWLOCK_READERS(rwlock_ptr) != 0) { + index=0; + goto unlock; + } + + /* if stats are turned on after being off, we may need to return an old */ + /* index from when the statistics were on last time. ................... 
*/ + for(index=1;index= LSTAT_MAX_READ_LOCK_INDEX) { + lstat_control.rwlock_overflow++; + index = 0; + goto unlock; + } + index = lstat_control.next_free_read_lock_index++; + + /* initialize the global read stats data structure for each cpu */ + for(cpu=0; cpu < smp_num_cpus; cpu++) { + (*lstat_control.read_lock_counts[cpu])[index].lock_ptr = rwlock_ptr; + } +put_index_and_unlock: + /* store the index for the read lock structure into the lock */ + PUT_RWINDEX(rwlock_ptr,index); + } + +unlock: + do { _raw_spin_unlock(&lstat_control.directory_lock); + local_irq_restore(flags);} while(0); + + return(index); +} + +void +_metered_read_lock(rwlock_t *rwlock_ptr) +{ + void *this_pc; + uint32_t start_cycles; + int index; + int cpu; + int flags; + int readers_before, readers_after; + uint64_t cycles64; + + if (lstat_control.state == LSTAT_OFF) { + _raw_read_lock(rwlock_ptr); + /* clean index in case lockmetering turns on before an unlock */ + PUT_RWINDEX(rwlock_ptr, 0); + return; + } + + this_pc = LSTAT_RA(LSTAT_RA_READ); + cpu = THIS_CPU_NUMBER; + index = GET_RWINDEX(rwlock_ptr); + + /* allocate the global stats entry for this lock, if needed */ + if (index==0) { + index = alloc_rwlock_struct(rwlock_ptr); + } + + readers_before = RWLOCK_READERS(rwlock_ptr); + if (_raw_read_trylock(rwlock_ptr)) { + /* + * We have decremented the lock to count a new reader, + * and have confirmed that no writer has it locked. + */ + /* update statistics if enabled */ + if (index>0) { +#ifndef __KERNEL__ + _raw_spin_lock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do { local_irq_save(flags); } while(0); +#endif + lstat_update((void *)rwlock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + /* preserve value of TSC so cum_hold_ticks and start_busy use same value */ + cycles64 = get_cycles64(); + (*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks -= cycles64; + + /* record time and cpu of start of busy period */ + /* this is not perfect (some race conditions are possible) */ + if (readers_before==0) { + (*lstat_control.read_lock_counts[cpu])[index].start_busy = cycles64; + PUT_RW_CPU(rwlock_ptr, cpu); + } + readers_after=RWLOCK_READERS(rwlock_ptr); + if (readers_after > (*lstat_control.read_lock_counts[cpu])[index].max_readers) + (*lstat_control.read_lock_counts[cpu])[index].max_readers = readers_after; +#ifndef __KERNEL__ + _raw_spin_unlock((spinlock_t*)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do {local_irq_restore(flags);} while(0); +#endif + } + + return; + } + /* If we get here, then we could not quickly grab the read lock */ + + start_cycles = get_cycles(); /* start counting the wait time */ + + /* Now spin until read_lock is successful */ + _raw_read_lock(rwlock_ptr); + + lstat_update_time((void *)rwlock_ptr, this_pc, LSTAT_ACT_SPIN, + get_cycles() - start_cycles); + + /* update statistics if they are enabled for this lock */ + if (index>0) { +#ifndef __KERNEL__ + _raw_spin_lock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do { local_irq_save(flags); } while(0); +#endif + cycles64 = get_cycles64(); + (*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks -= cycles64; + + /* this is not perfect (some race conditions are possible) */ + if (readers_before==0) { + (*lstat_control.read_lock_counts[cpu])[index].start_busy = cycles64; + PUT_RW_CPU(rwlock_ptr, cpu); + } + readers_after=RWLOCK_READERS(rwlock_ptr); + if (readers_after > (*lstat_control.read_lock_counts[cpu])[index].max_readers) + 
(*lstat_control.read_lock_counts[cpu])[index].max_readers = readers_after; + +#ifndef __KERNEL__ + _raw_spin_unlock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do {local_irq_restore(flags);} while(0); +#endif + } +} + +void _metered_read_unlock(rwlock_t *rwlock_ptr) +{ + int index; + int cpu; + int flags; + uint64_t busy_length; + uint64_t cycles64; + + if (lstat_control.state == LSTAT_OFF) { + _raw_read_unlock(rwlock_ptr); + return; + } + + index = GET_RWINDEX(rwlock_ptr); + cpu = THIS_CPU_NUMBER; + + if (index>0) { +#ifndef __KERNEL__ + _raw_spin_lock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + /* updates below are non-atomic */ + do { local_irq_save(flags); } while(0); +#endif + /* preserve value of TSC so cum_hold_ticks and busy_ticks are consistent.. */ + cycles64 = get_cycles64(); + (*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks += cycles64; + (*lstat_control.read_lock_counts[cpu])[index].read_lock_count++; + + /* once again, this is not perfect (some race conditions are possible) */ + if (RWLOCK_READERS(rwlock_ptr) == 1) { + int cpu1 = GET_RW_CPU(rwlock_ptr); + uint64_t last_start_busy = (*lstat_control.read_lock_counts[cpu1])[index].start_busy; + (*lstat_control.read_lock_counts[cpu])[index].busy_periods++; + if (cycles64 > last_start_busy) { + busy_length = cycles64 - last_start_busy; + (*lstat_control.read_lock_counts[cpu])[index].busy_ticks += busy_length; + if (busy_length > (*lstat_control.read_lock_counts[cpu])[index].max_busy) + (*lstat_control.read_lock_counts[cpu])[index].max_busy = busy_length; + } + } +#ifndef __KERNEL__ + _raw_spin_unlock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do {local_irq_restore(flags);} while(0); +#endif + } + + /* unlock the lock */ + _raw_read_unlock(rwlock_ptr); +} + +void _metered_write_lock(rwlock_t *rwlock_ptr) +{ + uint32_t start_cycles; + void *this_pc; + uint32_t spin_ticks = 0; /* in anticipation of a potential wait */ + int index; + int write_index = 0; + int cpu; + enum {writer_writer_conflict, writer_reader_conflict} why_wait = writer_writer_conflict; + + if (lstat_control.state == LSTAT_OFF) { + _raw_write_lock(rwlock_ptr); + /* clean index in case lockmetering turns on before an unlock */ + PUT_RWINDEX(rwlock_ptr, 0); + return; + } + + this_pc = LSTAT_RA(LSTAT_RA_WRITE); + cpu = THIS_CPU_NUMBER; + index = GET_RWINDEX(rwlock_ptr); + + /* allocate the global stats entry for this lock, if needed */ + if (index == 0) { + index = alloc_rwlock_struct(rwlock_ptr); + } + + if (_raw_write_trylock(rwlock_ptr)) { + /* We acquired the lock on the first try */ + write_index = lstat_update((void *)rwlock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + /* save the write_index for use in unlock if stats enabled */ + if (index > 0) + (*lstat_control.read_lock_counts[cpu])[index].write_index = write_index; + return; + } + + /* If we get here, then we could not quickly grab the write lock */ + start_cycles = get_cycles(); /* start counting the wait time */ + + why_wait = RWLOCK_READERS(rwlock_ptr) ? 
writer_reader_conflict : writer_writer_conflict; + + /* Now set the lock and wait for conflicts to disappear */ + _raw_write_lock(rwlock_ptr); + + spin_ticks = get_cycles() - start_cycles; + + /* update stats -- if enabled */ + if (index > 0) + if (spin_ticks) { + if (why_wait == writer_reader_conflict) { + /* waited due to a reader holding the lock */ + write_index = lstat_update_time((void *)rwlock_ptr, this_pc, + LSTAT_ACT_SPIN, spin_ticks); + } else { + /* waited due to another writer holding the lock */ + write_index = lstat_update_time((void *)rwlock_ptr, this_pc, + LSTAT_ACT_WW_SPIN, spin_ticks); + (*lstat_control.counts[cpu])[write_index].cum_wait_ww_ticks += spin_ticks; + if (spin_ticks > + (*lstat_control.counts[cpu])[write_index].max_wait_ww_ticks) { + (*lstat_control.counts[cpu])[write_index].max_wait_ww_ticks = spin_ticks; + } + } + + /* save the directory index for use on write_unlock */ + (*lstat_control.read_lock_counts[cpu])[index].write_index = write_index; + } + +} + +void +_metered_write_unlock(rwlock_t *rwlock_ptr) +{ + int index; + int cpu; + int write_index; + uint32_t hold_time; + + if (lstat_control.state == LSTAT_OFF) { + _raw_write_unlock(rwlock_ptr); + return; + } + + cpu = THIS_CPU_NUMBER; + index = GET_RWINDEX(rwlock_ptr); + + /* update statistics if stats enabled for this lock */ + if (index>0) { + write_index = (*lstat_control.read_lock_counts[cpu])[index].write_index; + + hold_time = get_cycles() - (*lstat_control.counts[cpu])[write_index].acquire_time; + (*lstat_control.counts[cpu])[write_index].cum_hold_ticks += (uint64_t)hold_time; + if ((*lstat_control.counts[cpu])[write_index].max_hold_ticks < hold_time) + (*lstat_control.counts[cpu])[write_index].max_hold_ticks = hold_time; + } + _raw_write_unlock(rwlock_ptr); +} + +int _metered_write_trylock(rwlock_t *rwlock_ptr) +{ + int retval; + void *this_pc = LSTAT_RA(LSTAT_RA_WRITE); + + if ((retval = _raw_write_trylock(rwlock_ptr))) { + lstat_update(rwlock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + } else { + lstat_update(rwlock_ptr, this_pc, LSTAT_ACT_REJECT); + } + + return retval; +} + +#ifdef __KERNEL__ +static void +init_control_space(void) +{ + /* Set all control space pointers to null and indices to "empty" */ + int cpu; + + /* + * Access CPU_CYCLE_FREQUENCY at the outset, which in some + * architectures may trigger a runtime calculation that uses a + * spinlock. Let's do this before lockmetering is turned on. + */ + if (CPU_CYCLE_FREQUENCY == 0) + BUG(); + + lstat_control.hashtab = NULL; + lstat_control.dir = NULL; + for (cpu=0; cpu max_len) + return actual_ret_bcount; + + copy_to_user(buffer, (void *)&req, next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + return actual_ret_bcount; + } else { + /* measurement is off but valid data present */ + /* fetch time info from lstat_control */ + req.ending_time = lstat_control.ending_time; + req.ending_cycles64 = lstat_control.ending_cycles64; + req.enabled_cycles64 = lstat_control.enabled_cycles64; + } + } else { + /* this must be a read while data active--use current time, etc */ + do_gettimeofday(&tv); + req.ending_time = tv.tv_sec; + req.ending_cycles64 = get_cycles64(); + req.enabled_cycles64 = req.ending_cycles64-req.started_cycles64 + + lstat_control.enabled_cycles64; + } + + next_ret_bcount = sizeof(lstat_user_request_t); + if ((actual_ret_bcount + next_ret_bcount) > max_len) + return actual_ret_bcount; + + copy_to_user(buffer, (void *)&req, next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + + if (!lstat_control.counts[0]) /* not initialized? 
*/ + return actual_ret_bcount; + + next_ret_bcount = sizeof(lstat_cpu_counts_t); + for (cpu = 0; cpu < smp_num_cpus; cpu++) { + if ((actual_ret_bcount + next_ret_bcount) > max_len) + return actual_ret_bcount; /* leave early */ + copy_to_user(buffer + actual_ret_bcount, lstat_control.counts[cpu], + next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + } + + next_ret_bcount = LSTAT_MAX_STAT_INDEX * sizeof(lstat_directory_entry_t); + if ( ((actual_ret_bcount + next_ret_bcount) > max_len) + || !lstat_control.dir ) + return actual_ret_bcount; /* leave early */ + + copy_to_user(buffer + actual_ret_bcount, lstat_control.dir, + next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + + next_ret_bcount = sizeof(lstat_read_lock_cpu_counts_t); + for (cpu = 0; cpu < smp_num_cpus; cpu++) { + if (actual_ret_bcount + next_ret_bcount > max_len) + return actual_ret_bcount; + copy_to_user(buffer + actual_ret_bcount, lstat_control.read_lock_counts[cpu], + next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + } + + return actual_ret_bcount; +} + +/* + * Writing to the /proc lockmeter node enables or disables metering. + * based upon the first byte of the "written" data. + * The following values are defined: + * LSTAT_ON: 1st call: allocates storage, intializes and turns on measurement + * subsequent calls just turn on measurement + * LSTAT_OFF: turns off measurement + * LSTAT_RESET: resets statistics + * LSTAT_RELEASE: releases statistics storage + * + * This allows one to accumulate statistics over several lockstat runs: + * + * lockstat on + * lockstat off + * ...repeat above as desired... + * lockstat get + * ...now start a new set of measurements... + * lockstat reset + * lockstat on + * ... + * + */ +ssize_t put_lockmeter_info(const char *buffer, size_t len) +{ + int error = 0; + int dirsize, countsize, read_lock_countsize, hashsize; + int cpu; + char put_char; + int i, read_lock_blocks, flags; + rwlock_t *lock_ptr; + struct timeval tv; + + if (len <= 0) + return -EINVAL; + + _raw_spin_lock(&lstat_control.control_lock); + + get_user(put_char, buffer); + switch (put_char) { + + case LSTAT_OFF: + if (lstat_control.state != LSTAT_OFF) { + /* + * To avoid seeing read lock hold times in an inconsisent state, + * we have to follow this protocol to turn off statistics + */ + do { local_irq_save(flags); } while(0); + /* getting this lock will stop any read lock block allocations */ + _raw_spin_lock(&lstat_control.directory_lock); + /* keep any more read lock blocks from being allocated */ + lstat_control.state = LSTAT_OFF; + /* record how may read lock blocks there are */ + read_lock_blocks = lstat_control.next_free_read_lock_index; + _raw_spin_unlock(&lstat_control.directory_lock); + /* now go through the list of read locks */ + cpu = THIS_CPU_NUMBER; + for(i=1;ictors_start && mod->ctors_end) + remove_bb_link(mod); +#endif + /* Module unload stuff */ module_unload_free(mod); @@ -1575,6 +1585,13 @@ static struct module *load_module(void _ /* Module has been moved. */ mod = (void *)sechdrs[modindex].sh_addr; +#ifdef CONFIG_GCOV_PROFILE + modindex = find_sec(hdr, sechdrs, secstrings, ".ctors"); + mod->ctors_start = (char *)sechdrs[modindex].sh_addr; + mod->ctors_end = (char *)(mod->ctors_start + + sechdrs[modindex].sh_size); +#endif + /* Now we've moved module, initialize linked lists, etc. 
*/ module_unload_init(mod); @@ -1724,6 +1741,12 @@ sys_init_module(void __user *umod, /* Start the module */ ret = mod->init(); + +#ifdef CONFIG_GCOV_PROFILE + if (mod->ctors_start && mod->ctors_end) { + do_global_ctors(mod->ctors_start, mod->ctors_end, mod, 1); + } +#endif if (ret < 0) { /* Init routine failed: abort. Try to protect us from buggy refcounters. */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/sched.c 999-mjb/kernel/sched.c --- 000-virgin/kernel/sched.c 2003-10-01 11:48:28.000000000 -0700 +++ 999-mjb/kernel/sched.c 2003-10-02 16:41:02.000000000 -0700 @@ -37,7 +37,7 @@ #include #include -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) #else #define cpu_to_node_mask(cpu) (cpu_online_map) @@ -76,19 +76,28 @@ * maximum timeslice is 200 msecs. Timeslices get refilled after * they expire. */ -#define MIN_TIMESLICE ( 10 * HZ / 1000) -#define MAX_TIMESLICE (200 * HZ / 1000) + +int min_timeslice = (10 * HZ) / 1000; +#define MIN_TIMESLICE (min_timeslice) +int max_timeslice = (200 * HZ) / 1000; +#define MAX_TIMESLICE (max_timeslice) #define ON_RUNQUEUE_WEIGHT 30 -#define CHILD_PENALTY 95 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 +int child_penalty = 95; +#define CHILD_PENALTY (child_penalty) +int parent_penalty = 100; +#define PARENT_PENALTY (parent_penalty) +int exit_weight = 3; +#define EXIT_WEIGHT (exit_weight) +int prio_bonus_ratio = 25; +#define PRIO_BONUS_RATIO (prio_bonus_ratio) #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) -#define INTERACTIVE_DELTA 2 +int interactive_delta = 2; +#define INTERACTIVE_DELTA (interactive_delta) #define MAX_SLEEP_AVG (AVG_TIMESLICE * MAX_BONUS) #define STARVATION_LIMIT (MAX_SLEEP_AVG) #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) -#define NODE_THRESHOLD 125 +int node_threshold = 125; +#define NODE_THRESHOLD (node_threshold) #define CREDIT_LIMIT 100 /* @@ -203,7 +212,7 @@ struct runqueue { struct mm_struct *prev_mm; prio_array_t *active, *expired, arrays[2]; int prev_cpu_load[NR_CPUS]; -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED atomic_t *node_nr_running; int prev_node_load[MAX_NUMNODES]; #endif @@ -229,7 +238,7 @@ static DEFINE_PER_CPU(struct runqueue, r # define task_running(rq, p) ((rq)->curr == (p)) #endif -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED /* * Keep track of running tasks. @@ -266,13 +275,13 @@ __init void node_nr_running_init(void) } } -#else /* !CONFIG_NUMA */ +#else /* !CONFIG_NUMA_SCHED */ # define nr_running_init(rq) do { } while (0) # define nr_running_inc(rq) do { (rq)->nr_running++; } while (0) # define nr_running_dec(rq) do { (rq)->nr_running--; } while (0) -#endif /* CONFIG_NUMA */ +#endif /* CONFIG_NUMA_SCHED */ /* * task_rq_lock - lock the runqueue a given task resides on and disable @@ -822,6 +831,11 @@ unsigned long nr_running(void) return sum; } +unsigned long nr_running_cpu(int cpu) +{ + return cpu_rq(cpu)->nr_running; +} + unsigned long nr_uninterruptible(void) { unsigned long i, sum = 0; @@ -892,7 +906,7 @@ static inline void double_rq_unlock(runq spin_unlock(&rq2->lock); } -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED /* * If dest_cpu is allowed for this process, migrate the task to it. 
* This is accomplished by forcing the cpu_allowed mask to only @@ -919,36 +933,72 @@ static void sched_migrate_task(task_t *p */ static int sched_best_cpu(struct task_struct *p) { - int i, minload, load, best_cpu, node = 0; + int cpu, node, minload, load, best_cpu, best_node; + int this_cpu, this_node, this_node_load; cpumask_t cpumask; - best_cpu = task_cpu(p); - if (cpu_rq(best_cpu)->nr_running <= 2) - return best_cpu; + this_cpu = best_cpu = task_cpu(p); + if (cpu_rq(this_cpu)->nr_running <= 2) + return this_cpu; + this_node = best_node = cpu_to_node(this_cpu); + + /* + * First look for any node-local idle queue and use that. + * This improves performance under light loads (mbligh). + * In case this node turns out to be the lightest node, store the best + * cpu that we find, so we don't go sniffing the same runqueues again. + */ + minload = 10000000; + cpumask = node_to_cpumask(this_node); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (!cpu_isset(cpu, cpumask)) + continue; + load = cpu_rq(cpu)->nr_running; + if (load == 0) + return cpu; + if (load < minload) { + minload = load; + best_cpu = cpu; + } + } + /* + * Now find the lightest loaded node, and put it in best_node + * + * Node load is always divided by nr_cpus_node to normalise load + * values in case cpu count differs from node to node. We first + * multiply node_nr_running by 16 to get a little better resolution. + */ minload = 10000000; - for_each_node_with_cpus(i) { - /* - * Node load is always divided by nr_cpus_node to normalise - * load values in case cpu count differs from node to node. - * We first multiply node_nr_running by 10 to get a little - * better resolution. - */ - load = 10 * atomic_read(&node_nr_running[i]) / nr_cpus_node(i); + this_node_load = 16 * atomic_read(&node_nr_running[this_node]) + / nr_cpus_node(this_node); + for_each_node_with_cpus(node) { + if (node == this_node) + load = this_node_load; + else + load = 16 * atomic_read(&node_nr_running[node]) + / nr_cpus_node(node); if (load < minload) { minload = load; - node = i; + best_node = node; } } + /* If we chose this node, we already did the legwork earlier */ + if (best_node == this_node) + return best_cpu; + + /* Now find the lightest loaded cpu on best_node, and use that */ minload = 10000000; - cpumask = node_to_cpumask(node); - for (i = 0; i < NR_CPUS; ++i) { - if (!cpu_isset(i, cpumask)) + best_cpu = this_cpu; + cpumask = node_to_cpumask(best_node); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (!cpu_isset(cpu, cpumask)) continue; - if (cpu_rq(i)->nr_running < minload) { - best_cpu = i; - minload = cpu_rq(i)->nr_running; + load = cpu_rq(cpu)->nr_running; + if (load < minload) { + minload = load; + best_cpu = cpu; } } return best_cpu; @@ -999,7 +1049,10 @@ static int find_busiest_node(int this_no return node; } -#endif /* CONFIG_NUMA */ +#endif /* CONFIG_NUMA_SCHED */ + +int idle_node_rebalance_ratio = 10; +int busy_node_rebalance_ratio = 2; #ifdef CONFIG_SMP @@ -1247,10 +1300,10 @@ out: */ #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) #define BUSY_REBALANCE_TICK (HZ/5 ?: 1) -#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5) -#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2) +#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * idle_node_rebalance_ratio) +#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * busy_node_rebalance_ratio) -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED static void balance_node(runqueue_t *this_rq, int idle, int this_cpu) { int node = find_busiest_node(cpu_to_node(this_cpu)); @@ -1281,7 +1334,7 @@ static 
void rebalance_tick(runqueue_t *t * are not balanced.) */ if (idle) { -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED if (!(j % IDLE_NODE_REBALANCE_TICK)) balance_node(this_rq, idle, this_cpu); #endif @@ -1292,7 +1345,7 @@ static void rebalance_tick(runqueue_t *t } return; } -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED if (!(j % BUSY_NODE_REBALANCE_TICK)) balance_node(this_rq, idle, this_cpu); #endif diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/sys.c 999-mjb/kernel/sys.c --- 000-virgin/kernel/sys.c 2003-10-01 11:48:28.000000000 -0700 +++ 999-mjb/kernel/sys.c 2003-10-02 16:41:14.000000000 -0700 @@ -235,6 +235,7 @@ cond_syscall(sys_epoll_ctl) cond_syscall(sys_epoll_wait) cond_syscall(sys_pciconfig_read) cond_syscall(sys_pciconfig_write) +cond_syscall(sys_mbind) static int set_one_prio(struct task_struct *p, int niceval, int error) { diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/sysctl.c 999-mjb/kernel/sysctl.c --- 000-virgin/kernel/sysctl.c 2003-10-01 11:48:28.000000000 -0700 +++ 999-mjb/kernel/sysctl.c 2003-10-02 16:39:40.000000000 -0700 @@ -59,6 +59,16 @@ extern int cad_pid; extern int pid_max; extern int sysctl_lower_zone_protection; extern int min_free_kbytes; +extern int min_timeslice; +extern int max_timeslice; +extern int child_penalty; +extern int parent_penalty; +extern int exit_weight; +extern int prio_bonus_ratio; +extern int interactive_delta; +extern int node_threshold; +extern int idle_node_rebalance_ratio; +extern int busy_node_rebalance_ratio; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -121,6 +131,7 @@ static struct ctl_table_header root_tabl static ctl_table kern_table[]; static ctl_table vm_table[]; +static ctl_table sched_table[]; #ifdef CONFIG_NET extern ctl_table net_table[]; #endif @@ -200,6 +211,12 @@ static ctl_table root_table[] = { .mode = 0555, .child = dev_table, }, + { + .ctl_name = CTL_SCHED, + .procname = "sched", + .mode = 0555, + .child = sched_table, + }, { .ctl_name = 0 } }; @@ -587,6 +604,7 @@ static ctl_table kern_table[] = { /* Constants for minimum and maximum testing in vm_table. We use these as one-element integer vectors. 
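Stepping back to the sched_best_cpu() rework just above: node load is compared as 16 * node_nr_running / nr_cpus_node, so nodes with different CPU counts are judged on load per CPU rather than raw task count. A stand-alone toy of that comparison follows; the node figures are invented purely for illustration.

/* Toy model of the node-load normalisation used by sched_best_cpu():
 * load = 16 * tasks_running_on_node / cpus_on_node.
 * The node table below is illustrative, not from the patch. */
#include <stdio.h>

struct node_info {
	int nr_running;  /* tasks currently running on the node */
	int nr_cpus;     /* CPUs in the node */
};

static int node_load(const struct node_info *n)
{
	/* multiply first for a little extra resolution, as the patch does */
	return 16 * n->nr_running / n->nr_cpus;
}

int main(void)
{
	struct node_info nodes[] = {
		{ .nr_running = 3, .nr_cpus = 2 },  /* small node, busier per CPU */
		{ .nr_running = 8, .nr_cpus = 8 },  /* big node, one task per CPU */
	};
	int i, best = 0, minload = 10000000;

	for (i = 0; i < 2; i++) {
		int load = node_load(&nodes[i]);

		printf("node %d: load %d\n", i, load);
		if (load < minload) {
			minload = load;
			best = i;
		}
	}
	printf("lightest node: %d\n", best);
	return 0;
}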
*/ static int zero; +static int one = 1; static int one_hundred = 100; @@ -807,6 +825,42 @@ static ctl_table dev_table[] = { { .ctl_name = 0 } }; +static ctl_table sched_table[] = { + {SCHED_MAX_TIMESLICE, "max_timeslice", &max_timeslice, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_MIN_TIMESLICE, "min_timeslice", &min_timeslice, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_CHILD_PENALTY, "child_penalty", &child_penalty, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_PARENT_PENALTY, "parent_penalty", &parent_penalty, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_EXIT_WEIGHT, "exit_weight", &exit_weight, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_PRIO_BONUS_RATIO, "prio_bonus_ratio", &prio_bonus_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_INTERACTIVE_DELTA, "interactive_delta", &interactive_delta, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_NODE_THRESHOLD, "node_threshold", &node_threshold, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + sysctl_intvec, NULL, &one, NULL}, + {SCHED_IDLE_NODE_REBALANCE_RATIO, "idle_node_rebalance_ratio", + &idle_node_rebalance_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_BUSY_NODE_REBALANCE_RATIO, "busy_node_rebalance_ratio", + &busy_node_rebalance_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {0} +}; + extern void init_irq_proc (void); void __init sysctl_init(void) diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/timer.c 999-mjb/kernel/timer.c --- 000-virgin/kernel/timer.c 2003-10-01 11:47:15.000000000 -0700 +++ 999-mjb/kernel/timer.c 2003-10-02 16:41:02.000000000 -0700 @@ -750,6 +750,8 @@ static unsigned long count_active_tasks( * Requires xtime_lock to access. */ unsigned long avenrun[3]; +unsigned long tasks_running[3]; +DEFINE_PER_CPU(unsigned long[3],cpu_tasks_running); /* * calc_load - given tick count, update the avenrun load estimates. @@ -757,7 +759,7 @@ unsigned long avenrun[3]; */ static inline void calc_load(unsigned long ticks) { - unsigned long active_tasks; /* fixed-point */ + unsigned long active_tasks, running_tasks; /* fixed-point */ static int count = LOAD_FREQ; count -= ticks; @@ -767,9 +769,39 @@ static inline void calc_load(unsigned lo CALC_LOAD(avenrun[0], EXP_1, active_tasks); CALC_LOAD(avenrun[1], EXP_5, active_tasks); CALC_LOAD(avenrun[2], EXP_15, active_tasks); + running_tasks = nr_running() * FIXED_1; + CALC_LOAD(tasks_running[0], EXP_1, running_tasks); + CALC_LOAD(tasks_running[1], EXP_5, running_tasks); + CALC_LOAD(tasks_running[2], EXP_15, running_tasks); } } +/* + * This does the frequency calculation a little bit different from the + * global version above. It doesn't ever look at the kernel's concept + * of time, it just updates that stats every LOAD_FREQ times into the + * function. + * + * Using jiffies is more accurate, but there _are_ just statistics, so + * they're not worth messing with xtime_lock and company. If we miss + * an interrupt or two, big deal. 
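For reference, CALC_LOAD() used above is the kernel's fixed-point exponential moving average from include/linux/sched.h (not part of this patch); the new tasks_running[] and per-CPU cpu_tasks_running[] arrays are simply extra instances of it fed with nr_running() instead of count_active_tasks(). A stand-alone rendition of the arithmetic follows; the FSHIFT/EXP_1 constants are quoted from memory, so verify them against your tree.

/* Stand-alone rendition of the kernel's fixed-point load average.
 * FSHIFT/FIXED_1/EXP_1 follow the usual include/linux/sched.h values;
 * double-check against your tree before relying on them. */
#include <stdio.h>

#define FSHIFT   11                 /* bits of fractional precision */
#define FIXED_1  (1 << FSHIFT)      /* 1.0 in fixed point */
#define EXP_1    1884               /* 1/exp(5sec/1min) in fixed point */

/* avg = avg*e + n*(1-e), all in fixed point */
static unsigned long calc_load(unsigned long avg, unsigned long exp,
			       unsigned long n)
{
	avg *= exp;
	avg += n * (FIXED_1 - exp);
	return avg >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun = 0;
	int sample;

	/* pretend 4 tasks are runnable at every 5-second sample */
	for (sample = 0; sample < 24; sample++) {
		avenrun = calc_load(avenrun, EXP_1, 4 * FIXED_1);
		printf("sample %2d: load %lu.%02lu\n", sample,
		       avenrun >> FSHIFT,
		       (avenrun & (FIXED_1 - 1)) * 100 / FIXED_1);
	}
	return 0;
}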
+ */ +void calc_load_cpu(int cpu) +{ + unsigned long running_tasks; + static DEFINE_PER_CPU(int, count) = { LOAD_FREQ }; + + per_cpu(count, cpu)--; + if (per_cpu(count, cpu) != 0) + return; + + per_cpu(count, cpu) += LOAD_FREQ; + running_tasks = nr_running_cpu(cpu) * FIXED_1; + CALC_LOAD(per_cpu(cpu_tasks_running, cpu)[0], EXP_1, running_tasks); + CALC_LOAD(per_cpu(cpu_tasks_running, cpu)[1], EXP_5, running_tasks); + CALC_LOAD(per_cpu(cpu_tasks_running, cpu)[2], EXP_15, running_tasks); +} + /* jiffies at the most recent update of wall time */ unsigned long wall_jiffies = INITIAL_JIFFIES; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/Makefile 999-mjb/mm/Makefile --- 000-virgin/mm/Makefile 2003-10-01 11:47:15.000000000 -0700 +++ 999-mjb/mm/Makefile 2003-10-02 16:41:14.000000000 -0700 @@ -7,8 +7,10 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ shmem.o vmalloc.o -obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ +obj-y := bootmem.o fadvise.o filemap.o mempool.o oom_kill.o \ page_alloc.o page-writeback.o pdflush.o readahead.o \ slab.o swap.o truncate.o vmscan.o $(mmu-y) obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o + +obj-$(CONFIG_NUMA) += mbind.o diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/filemap.c 999-mjb/mm/filemap.c --- 000-virgin/mm/filemap.c 2003-10-01 11:48:28.000000000 -0700 +++ 999-mjb/mm/filemap.c 2003-10-02 16:53:55.000000000 -0700 @@ -70,6 +70,9 @@ * ->mmap_sem * ->i_sem (msync) * + * ->lock_page + * ->i_shared_sem (page_convert_anon) + * * ->inode_lock * ->sb_lock (fs/fs-writeback.c) * ->mapping->page_lock (__sync_single_inode) @@ -105,9 +108,9 @@ void remove_from_page_cache(struct page if (unlikely(!PageLocked(page))) PAGE_BUG(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); __remove_from_page_cache(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); } static inline int sync_page(struct page *page) @@ -139,9 +142,9 @@ static int __filemap_fdatawrite(struct a if (mapping->backing_dev_info->memory_backed) return 0; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); list_splice_init(&mapping->dirty_pages, &mapping->io_pages); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); ret = do_writepages(mapping, &wbc); return ret; } @@ -172,7 +175,7 @@ int filemap_fdatawait(struct address_spa restart: progress = 0; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); while (!list_empty(&mapping->locked_pages)) { struct page *page; @@ -186,7 +189,7 @@ restart: if (!PageWriteback(page)) { if (++progress > 32) { if (need_resched()) { - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); __cond_resched(); goto restart; } @@ -196,16 +199,16 @@ restart: progress = 0; page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); wait_on_page_writeback(page); if (PageError(page)) ret = -EIO; page_cache_release(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); /* Check for outstanding write errors */ if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) @@ -240,7 +243,7 @@ int add_to_page_cache(struct page *page, if (error == 0) { page_cache_get(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); if (!error) { 
SetPageLocked(page); @@ -248,7 +251,7 @@ int add_to_page_cache(struct page *page, } else { page_cache_release(page); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); radix_tree_preload_end(); } return error; @@ -377,11 +380,11 @@ struct page * find_get_page(struct addre * We scan the hash list read-only. Addition to and removal from * the hash-list needs a held write-lock. */ - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); page = radix_tree_lookup(&mapping->page_tree, offset); if (page) page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return page; } @@ -392,11 +395,11 @@ struct page *find_trylock_page(struct ad { struct page *page; - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); page = radix_tree_lookup(&mapping->page_tree, offset); if (page && TestSetPageLocked(page)) page = NULL; - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return page; } @@ -416,15 +419,15 @@ struct page *find_lock_page(struct addre { struct page *page; - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); repeat: page = radix_tree_lookup(&mapping->page_tree, offset); if (page) { page_cache_get(page); if (TestSetPageLocked(page)) { - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); lock_page(page); - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); /* Has the page been truncated while we slept? */ if (page->mapping != mapping || page->index != offset) { @@ -434,7 +437,7 @@ repeat: } } } - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return page; } @@ -504,12 +507,12 @@ unsigned int find_get_pages(struct addre unsigned int i; unsigned int ret; - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); ret = radix_tree_gang_lookup(&mapping->page_tree, (void **)pages, start, nr_pages); for (i = 0; i < ret; i++) page_cache_get(pages[i]); - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return ret; } diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/fremap.c 999-mjb/mm/fremap.c --- 000-virgin/mm/fremap.c 2003-10-01 11:48:28.000000000 -0700 +++ 999-mjb/mm/fremap.c 2003-10-02 16:42:18.000000000 -0700 @@ -38,7 +38,7 @@ static inline int zap_pte(struct mm_stru set_page_dirty(page); page_remove_rmap(page, ptep); page_cache_release(page); - mm->rss--; + dec_rss(mm, page); } } return 1; @@ -63,10 +63,26 @@ int install_page(struct mm_struct *mm, s pmd_t *pmd; pte_t pte_val; struct pte_chain *pte_chain; + unsigned long pgidx; pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) goto err; + + /* + * Convert this page to anon for objrmap if it's nonlinear + */ + pgidx = (addr - vma->vm_start) >> PAGE_SHIFT; + pgidx += vma->vm_pgoff; + pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; + if (!PageAnon(page) && (page->index != pgidx)) { + lock_page(page); + err = page_convert_anon(page); + unlock_page(page); + if (err < 0) + goto err_free; + } + pgd = pgd_offset(mm, addr); spin_lock(&mm->page_table_lock); @@ -80,7 +96,7 @@ int install_page(struct mm_struct *mm, s flush = zap_pte(mm, vma, addr, pte); - mm->rss++; + inc_rss(mm, page); flush_icache_page(vma, page); set_pte(pte, mk_pte(page, prot)); pte_chain = page_add_rmap(page, pte, pte_chain); @@ -89,12 +105,11 @@ int install_page(struct mm_struct *mm, s if (flush) flush_tlb_page(vma, addr); update_mmu_cache(vma, addr, pte_val); - spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); 
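The mm/filemap.c hunks above change mapping->page_lock from a plain spinlock into a reader/writer lock: pure lookups such as find_get_page() and find_get_pages() now take it for reading, while paths that add to or remove from the radix tree take it for writing. The mapping_rdlock()/mapping_wrlock() wrappers themselves are defined elsewhere in this patch. Purely as an analogy, the same read-mostly pattern in user-space terms:

/* User-space analogue of the page_lock read/write split: lookups share
 * the lock, insert/remove exclude.  An illustration of the pattern only,
 * not kernel code. */
#include <pthread.h>
#include <stdio.h>

#define TABLE_SIZE 64

static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;
static const char *table[TABLE_SIZE];

static const char *cache_lookup(unsigned int idx)
{
	const char *val;

	pthread_rwlock_rdlock(&table_lock);   /* like mapping_rdlock() */
	val = table[idx % TABLE_SIZE];
	pthread_rwlock_unlock(&table_lock);   /* like mapping_rdunlock() */
	return val;
}

static void cache_insert(unsigned int idx, const char *val)
{
	pthread_rwlock_wrlock(&table_lock);   /* like mapping_wrlock() */
	table[idx % TABLE_SIZE] = val;
	pthread_rwlock_unlock(&table_lock);   /* like mapping_wrunlock() */
}

int main(void)
{
	cache_insert(7, "page seven");
	printf("lookup(7) -> %s\n", cache_lookup(7));
	return 0;
}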
- return 0; + err = 0; err_unlock: spin_unlock(&mm->page_table_lock); +err_free: pte_chain_free(pte_chain); err: return err; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/mbind.c 999-mjb/mm/mbind.c --- 000-virgin/mm/mbind.c 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/mm/mbind.c 2003-10-02 16:41:14.000000000 -0700 @@ -0,0 +1,147 @@ +/* + * mm/mbind.c + * + * Written by: Matthew Dobson, IBM Corporation + * + * Copyright (C) 2003, IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to + */ +#include +#include +#include +#include +#include + +/* Translate a cpumask to a nodemask */ +static inline void cpumask_to_nodemask(unsigned long * cpumask, unsigned long * nodemask) +{ + int i; + + for (i = 0; i < NR_CPUS; i++) + if (test_bit(i, cpumask)) + set_bit(cpu_to_node(i), nodemask); +} + +/* + * Adds the zones belonging to @pgdat to @zonelist. Returns the next + * index in @zonelist. + */ +static inline int add_node(pg_data_t *pgdat, struct zonelist *zonelist, int zone_num) +{ + int i; + struct zone *zone; + + for (i = MAX_NR_ZONES-1; i >=0 ; i--) { + zone = pgdat->node_zones + i; + if (zone->present_pages) + zonelist->zones[zone_num++] = zone; + } + return zone_num; +} + +/* Builds a binding for a region of memory, based on a bitmask of nodes. */ +static inline int build_binding(unsigned long * nodemask, struct binding *binding) +{ + int node, zone_num; + + memset(binding, 0, sizeof(struct binding)); + + /* Build binding zonelist */ + for (node = 0, zone_num = 0; node < MAX_NUMNODES; node++) + if (test_bit(node, nodemask) && node_online(node)) + zone_num = add_node(NODE_DATA(node), + &binding->zonelist, zone_num); + binding->zonelist.zones[zone_num] = NULL; + + if (zone_num == 0) + /* No zones were added to the zonelist. Let the caller know. */ + return -EINVAL; + + return 0; +} + + +/* + * mbind - Bind a range of a process' VM space to a set of memory blocks according to + * a predefined policy. 
+ * @start: beginning address of memory region to bind + * @len: length of memory region to bind + * @mask_ptr: pointer to bitmask of cpus + * @mask_len: length of the bitmask + * @policy: flag specifying the policy to use for the segment + */ +asmlinkage unsigned long sys_mbind(unsigned long start, unsigned long len, + unsigned long *mask_ptr, unsigned int mask_len, unsigned long policy) +{ + DECLARE_BITMAP(cpu_mask, NR_CPUS); + DECLARE_BITMAP(node_mask, MAX_NUMNODES); + struct vm_area_struct *vma = NULL; + struct address_space *mapping; + int copy_len, error = 0; + + /* Deal with getting cpu_mask from userspace & translating to node_mask */ + CLEAR_BITMAP(cpu_mask, NR_CPUS); + CLEAR_BITMAP(node_mask, MAX_NUMNODES); + copy_len = min(mask_len, (unsigned int)NR_CPUS); + if (copy_from_user(cpu_mask, mask_ptr, (copy_len+7)/8)) { + error = -EFAULT; + goto out; + } + cpumask_to_nodemask(cpu_mask, node_mask); + + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, start); + up_read(¤t->mm->mmap_sem); + /* This is an ugly, gross hack. This is purely because I've hurt my + * brain trying to come up with a brilliant way of implementing this + * for VMA's in general. Shared Memory VMA's lend themselves to binding + * both because of how they're implemented, and their actual uses. + * If anyone has a great place to squirrel-away some data about the + * requested binding, and a way to easily force the allocator to respect + * these bindings, then send a patch, or let me know. Otherwise, this + * will have to wait for a stroke of insight. + */ + if (!(vma && vma->vm_file && vma->vm_ops && + vma->vm_ops->nopage == shmem_nopage)) { + /* This isn't a shm segment. For now, we bail. */ + error = -EINVAL; + goto out; + } + + mapping = vma->vm_file->f_dentry->d_inode->i_mapping; + if (mapping->binding) { + kfree(mapping->binding); + mapping->binding = NULL; + } + mapping->binding = kmalloc(sizeof(struct binding), GFP_KERNEL); + if (!mapping->binding) { + error = -ENOMEM; + goto out; + } + error = build_binding(node_mask, mapping->binding); + if (error) { + kfree(mapping->binding); + mapping->binding = NULL; + } + +out: + return error; +} diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/memory.c 999-mjb/mm/memory.c --- 000-virgin/mm/memory.c 2003-10-01 11:48:28.000000000 -0700 +++ 999-mjb/mm/memory.c 2003-10-02 16:42:18.000000000 -0700 @@ -102,8 +102,7 @@ static inline void free_one_pmd(struct m static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir) { - int j; - pmd_t * pmd; + pmd_t * pmd, * md, * emd; if (pgd_none(*dir)) return; @@ -114,8 +113,21 @@ static inline void free_one_pgd(struct m } pmd = pmd_offset(dir, 0); pgd_clear(dir); - for (j = 0; j < PTRS_PER_PMD ; j++) - free_one_pmd(tlb, pmd+j); + /* + * Beware if changing the loop below. It once used int j, + * for (j = 0; j < PTRS_PER_PMD; j++) + * free_one_pmd(pmd+j); + * but some older i386 compilers (e.g. egcs-2.91.66, gcc-2.95.3) + * terminated the loop with a _signed_ address comparison + * using "jle", when configured for HIGHMEM64GB (X86_PAE). + * If also configured for 3GB of kernel virtual address space, + * if page at physical 0x3ffff000 virtual 0x7ffff000 is used as + * a pmd, when that mm exits the loop goes on to free "entries" + * found at 0x80000000 onwards. The loop below compiles instead + * to be terminated by unsigned address comparison using "jb". 
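To make the intended use of the new call concrete: the target region must be a SysV shared-memory mapping (anything else gets -EINVAL, as the "ugly, gross hack" comment admits), and the mask handed in is a CPU mask that the kernel converts to a node mask. A hedged user-space sketch follows; the syscall number is not part of the hunks shown and must be supplied for your architecture, and the policy argument is passed as 0 only as a placeholder.

/* Sketch of calling the new sys_mbind() on a SysV shared-memory segment,
 * the only mapping type the code above accepts.  __NR_mbind is not part
 * of the hunks shown, so it has to be filled in; policy 0 is a placeholder. */
#include <stdio.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_mbind
#error "define __NR_mbind to match your kernel's syscall table"
#endif

#define MASK_BITS 32   /* size of the cpu mask we pass, illustrative */

int main(void)
{
	size_t len = 4UL << 20;                            /* 4MB segment */
	unsigned long cpu_mask = (1UL << 0) | (1UL << 1);  /* CPUs 0 and 1 */
	int shmid;
	void *addr;
	long err;

	shmid = shmget(IPC_PRIVATE, len, IPC_CREAT | 0600);
	if (shmid < 0) { perror("shmget"); return 1; }
	addr = shmat(shmid, NULL, 0);
	if (addr == (void *)-1) { perror("shmat"); return 1; }

	/* the kernel translates the CPU mask to a node mask and builds a
	 * zonelist restricted to those nodes for this mapping */
	err = syscall(__NR_mbind, (unsigned long)addr, len,
		      &cpu_mask, MASK_BITS, 0UL);
	if (err)
		perror("mbind");
	else
		memset(addr, 0, len);   /* touch pages so they get allocated */

	shmdt(addr);
	shmctl(shmid, IPC_RMID, NULL);
	return err ? 1 : 0;
}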
+ */ + for (md = pmd, emd = pmd + PTRS_PER_PMD; md < emd; md++) + free_one_pmd(tlb,md); pmd_free_tlb(tlb, pmd); } @@ -319,7 +331,7 @@ skip_copy_pte_range: pte = pte_mkclean(pte); pte = pte_mkold(pte); get_page(page); - dst->rss++; + inc_rss(dst, page); set_pte(dst_pte, pte); pte_chain = page_add_rmap(page, dst_pte, @@ -411,7 +423,14 @@ zap_pte_range(struct mmu_gather *tlb, pm if (page->mapping && pte_young(pte) && !PageSwapCache(page)) mark_page_accessed(page); - tlb->freed++; + /* + * While we have the page that is being + * freed handy, make sure we decrement + * the mm's RSS accordingly. This is + * only important for NUMA per-node + * RSS accounting. + */ + dec_rss(tlb->mm, page); page_remove_rmap(page, ptep); tlb_remove_page(tlb, page); } @@ -1041,9 +1060,10 @@ static int do_wp_page(struct mm_struct * page_table = pte_offset_map(pmd, address); if (pte_same(*page_table, pte)) { if (PageReserved(old_page)) - ++mm->rss; + inc_rss(mm, new_page); page_remove_rmap(old_page, page_table); break_cow(vma, new_page, address, page_table); + SetPageAnon(new_page); pte_chain = page_add_rmap(new_page, page_table, pte_chain); lru_cache_add_active(new_page); @@ -1275,7 +1295,7 @@ static int do_swap_page(struct mm_struct if (vm_swap_full()) remove_exclusive_swap_page(page); - mm->rss++; + inc_rss(mm, page); pte = mk_pte(page, vma->vm_page_prot); if (write_access && can_share_swap_page(page)) pte = pte_mkdirty(pte_mkwrite(pte)); @@ -1283,6 +1303,7 @@ static int do_swap_page(struct mm_struct flush_icache_page(vma, page); set_pte(page_table, pte); + SetPageAnon(page); pte_chain = page_add_rmap(page, page_table, pte_chain); /* No need to invalidate - it was non-present before */ @@ -1344,10 +1365,11 @@ do_anonymous_page(struct mm_struct *mm, ret = VM_FAULT_MINOR; goto out; } - mm->rss++; + inc_rss(mm, page); entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); lru_cache_add_active(page); mark_page_accessed(page); + SetPageAnon(page); } set_pte(page_table, entry); @@ -1415,6 +1437,10 @@ retry: if (!pte_chain) goto oom; + /* See if nopage returned an anon page */ + if (!new_page->mapping || PageSwapCache(new_page)) + SetPageAnon(new_page); + /* * Should we do an early C-O-W break? */ @@ -1427,6 +1453,7 @@ retry: copy_user_highpage(page, new_page, address); page_cache_release(new_page); lru_cache_add_active(page); + SetPageAnon(page); new_page = page; } @@ -1458,7 +1485,7 @@ retry: /* Only go through if we didn't race with anybody else... */ if (pte_none(*page_table)) { if (!PageReserved(new_page)) - ++mm->rss; + inc_rss(mm, new_page); flush_icache_page(vma, new_page); entry = mk_pte(new_page, vma->vm_page_prot); if (write_access) diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/mmap.c 999-mjb/mm/mmap.c --- 000-virgin/mm/mmap.c 2003-10-01 11:48:28.000000000 -0700 +++ 999-mjb/mm/mmap.c 2003-10-02 16:42:18.000000000 -0700 @@ -268,9 +268,7 @@ static void vma_link(struct mm_struct *m if (mapping) down(&mapping->i_shared_sem); - spin_lock(&mm->page_table_lock); __vma_link(mm, vma, prev, rb_link, rb_parent); - spin_unlock(&mm->page_table_lock); if (mapping) up(&mapping->i_shared_sem); @@ -299,6 +297,25 @@ static inline int is_mergeable_vma(struc return 1; } +static void move_vma_start(struct vm_area_struct *vma, unsigned long addr) +{ + struct inode *inode = NULL; + + if (vma->vm_file) { + inode = vma->vm_file->f_dentry->d_inode; + down(&inode->i_mapping->i_shared_sem); + } + if (inode) + __remove_shared_vm_struct(vma, inode); + /* If no vm_file, perhaps we should always keep vm_pgoff at 0?? 
*/ + vma->vm_pgoff += (long)(addr - vma->vm_start) >> PAGE_SHIFT; + vma->vm_start = addr; + if (inode) { + __vma_link_file(vma); + up(&inode->i_mapping->i_shared_sem); + } +} + /* * Return true if we can merge this (vm_flags,file,vm_pgoff,size) * in front of (at a lower virtual address and file offset than) the vma. @@ -351,8 +368,6 @@ static int vma_merge(struct mm_struct *m unsigned long end, unsigned long vm_flags, struct file *file, unsigned long pgoff) { - spinlock_t * lock = &mm->page_table_lock; - /* * We later require that vma->vm_flags == vm_flags, so this tests * vma->vm_flags & VM_SPECIAL, too. @@ -380,7 +395,6 @@ static int vma_merge(struct mm_struct *m down(&inode->i_mapping->i_shared_sem); need_up = 1; } - spin_lock(lock); prev->vm_end = end; /* @@ -393,7 +407,6 @@ static int vma_merge(struct mm_struct *m prev->vm_end = next->vm_end; __vma_unlink(mm, next, prev); __remove_shared_vm_struct(next, inode); - spin_unlock(lock); if (need_up) up(&inode->i_mapping->i_shared_sem); if (file) @@ -403,7 +416,6 @@ static int vma_merge(struct mm_struct *m kmem_cache_free(vm_area_cachep, next); return 1; } - spin_unlock(lock); if (need_up) up(&inode->i_mapping->i_shared_sem); return 1; @@ -419,10 +431,7 @@ static int vma_merge(struct mm_struct *m pgoff, (end - addr) >> PAGE_SHIFT)) return 0; if (end == prev->vm_start) { - spin_lock(lock); - prev->vm_start = addr; - prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT; - spin_unlock(lock); + move_vma_start(prev, addr); return 1; } } @@ -868,19 +877,16 @@ int expand_stack(struct vm_area_struct * */ address += 4 + PAGE_SIZE - 1; address &= PAGE_MASK; - spin_lock(&vma->vm_mm->page_table_lock); grow = (address - vma->vm_end) >> PAGE_SHIFT; /* Overcommit.. */ if (security_vm_enough_memory(grow)) { - spin_unlock(&vma->vm_mm->page_table_lock); return -ENOMEM; } if (address - vma->vm_start > current->rlim[RLIMIT_STACK].rlim_cur || ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur) { - spin_unlock(&vma->vm_mm->page_table_lock); vm_unacct_memory(grow); return -ENOMEM; } @@ -888,7 +894,6 @@ int expand_stack(struct vm_area_struct * vma->vm_mm->total_vm += grow; if (vma->vm_flags & VM_LOCKED) vma->vm_mm->locked_vm += grow; - spin_unlock(&vma->vm_mm->page_table_lock); return 0; } @@ -922,19 +927,16 @@ int expand_stack(struct vm_area_struct * * the spinlock only before relocating the vma range ourself. */ address &= PAGE_MASK; - spin_lock(&vma->vm_mm->page_table_lock); grow = (vma->vm_start - address) >> PAGE_SHIFT; /* Overcommit.. */ if (security_vm_enough_memory(grow)) { - spin_unlock(&vma->vm_mm->page_table_lock); return -ENOMEM; } if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur || ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur) { - spin_unlock(&vma->vm_mm->page_table_lock); vm_unacct_memory(grow); return -ENOMEM; } @@ -943,7 +945,6 @@ int expand_stack(struct vm_area_struct * vma->vm_mm->total_vm += grow; if (vma->vm_flags & VM_LOCKED) vma->vm_mm->locked_vm += grow; - spin_unlock(&vma->vm_mm->page_table_lock); return 0; } @@ -1106,8 +1107,6 @@ static void unmap_region(struct mm_struc /* * Create a list of vma's touched by the unmap, removing them from the mm's * vma list as we go.. - * - * Called with the page_table_lock held. 
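The new move_vma_start() helper above encodes one invariant worth spelling out: when the start address of a file-backed vma moves, vm_pgoff has to move by the same number of pages so that every remaining virtual address keeps mapping the same file page. A toy version of that arithmetic, with the structure reduced to the fields involved:

/* Toy illustration of the vm_pgoff adjustment done by move_vma_start(). */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

struct toy_vma {
	unsigned long vm_start;   /* first mapped virtual address */
	unsigned long vm_end;     /* one past the last mapped address */
	unsigned long vm_pgoff;   /* file offset of vm_start, in pages */
};

static void move_vma_start(struct toy_vma *vma, unsigned long addr)
{
	/* keep (virtual address -> file page) constant while moving the start */
	vma->vm_pgoff += (long)(addr - vma->vm_start) >> PAGE_SHIFT;
	vma->vm_start = addr;
}

static unsigned long file_page_of(const struct toy_vma *vma, unsigned long addr)
{
	return vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
}

int main(void)
{
	struct toy_vma vma = {
		.vm_start = 0x40000000UL,
		.vm_end   = 0x40010000UL,   /* 16 pages */
		.vm_pgoff = 100,            /* maps file pages 100..115 */
	};
	unsigned long probe = 0x40008000UL; /* 8 pages into the mapping */

	printf("before: addr %#lx -> file page %lu\n",
	       probe, file_page_of(&vma, probe));

	/* trim the first 4 pages off the front, as split_vma might */
	move_vma_start(&vma, vma.vm_start + 4 * PAGE_SIZE);

	printf("after:  addr %#lx -> file page %lu\n",
	       probe, file_page_of(&vma, probe));
	return 0;
}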
*/ static void detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, @@ -1151,8 +1150,7 @@ int split_vma(struct mm_struct * mm, str if (new_below) { new->vm_end = addr; - vma->vm_start = addr; - vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT); + move_vma_start(vma, addr); } else { vma->vm_end = addr; new->vm_start = addr; @@ -1231,8 +1229,8 @@ int do_munmap(struct mm_struct *mm, unsi /* * Remove the vma's, and unmap the actual pages */ - spin_lock(&mm->page_table_lock); detach_vmas_to_be_unmapped(mm, mpnt, prev, end); + spin_lock(&mm->page_table_lock); unmap_region(mm, mpnt, prev, start, end); spin_unlock(&mm->page_table_lock); @@ -1384,7 +1382,7 @@ void exit_mmap(struct mm_struct *mm) vma = mm->mmap; mm->mmap = mm->mmap_cache = NULL; mm->mm_rb = RB_ROOT; - mm->rss = 0; + zero_rss(mm); mm->total_vm = 0; mm->locked_vm = 0; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/page-writeback.c 999-mjb/mm/page-writeback.c --- 000-virgin/mm/page-writeback.c 2003-10-01 11:48:28.000000000 -0700 +++ 999-mjb/mm/page-writeback.c 2003-10-02 16:53:55.000000000 -0700 @@ -469,12 +469,12 @@ int write_one_page(struct page *page, in if (wait) wait_on_page_writeback(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); list_del(&page->list); if (test_clear_page_dirty(page)) { list_add(&page->list, &mapping->locked_pages); page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); ret = mapping->a_ops->writepage(page, &wbc); if (ret == 0 && wait) { wait_on_page_writeback(page); @@ -484,7 +484,7 @@ int write_one_page(struct page *page, in page_cache_release(page); } else { list_add(&page->list, &mapping->clean_pages); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); unlock_page(page); } return ret; @@ -512,7 +512,7 @@ int __set_page_dirty_nobuffers(struct pa struct address_space *mapping = page->mapping; if (mapping) { - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); if (page->mapping) { /* Race with truncate? */ BUG_ON(page->mapping != mapping); if (!mapping->backing_dev_info->memory_backed) @@ -520,7 +520,7 @@ int __set_page_dirty_nobuffers(struct pa list_del(&page->list); list_add(&page->list, &mapping->dirty_pages); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); if (!PageSwapCache(page)) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/page_alloc.c 999-mjb/mm/page_alloc.c --- 000-virgin/mm/page_alloc.c 2003-10-01 11:48:28.000000000 -0700 +++ 999-mjb/mm/page_alloc.c 2003-10-02 16:44:09.000000000 -0700 @@ -225,6 +225,8 @@ static inline void free_pages_check(cons bad_page(function, page); if (PageDirty(page)) ClearPageDirty(page); + if (PageAnon(page)) + ClearPageAnon(page); } /* @@ -562,6 +564,10 @@ __alloc_pages(unsigned int gfp_mask, uns struct zone *z = zones[i]; unsigned long local_low; + if ((__GFP_NODE_STRICT & gfp_mask) && + (pfn_to_nid(z->zone_start_pfn) != numa_node_id())) + continue; + /* * This is the fabled 'incremental min'. We let real-time tasks * dip their real-time paws a little deeper into reserves. diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/readahead.c 999-mjb/mm/readahead.c --- 000-virgin/mm/readahead.c 2003-10-01 11:35:37.000000000 -0700 +++ 999-mjb/mm/readahead.c 2003-10-02 16:53:55.000000000 -0700 @@ -222,7 +222,7 @@ __do_page_cache_readahead(struct address /* * Preallocate as many pages as we will need. 
*/ - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); for (page_idx = 0; page_idx < nr_to_read; page_idx++) { unsigned long page_offset = offset + page_idx; @@ -233,16 +233,16 @@ __do_page_cache_readahead(struct address if (page) continue; - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); page = page_cache_alloc_cold(mapping); - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); if (!page) break; page->index = page_offset; list_add(&page->list, &page_pool); ret++; } - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); /* * Now start the IO. We ignore I/O errors - if the page is not diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/rmap.c 999-mjb/mm/rmap.c --- 000-virgin/mm/rmap.c 2003-10-01 11:47:15.000000000 -0700 +++ 999-mjb/mm/rmap.c 2003-10-02 16:42:18.000000000 -0700 @@ -102,6 +102,136 @@ pte_chain_encode(struct pte_chain *pte_c **/ /** + * find_pte - Find a pte pointer given a vma and a struct page. + * @vma: the vma to search + * @page: the page to find + * + * Determine if this page is mapped in this vma. If it is, map and rethrn + * the pte pointer associated with it. Return null if the page is not + * mapped in this vma for any reason. + * + * This is strictly an internal helper function for the object-based rmap + * functions. + * + * It is the caller's responsibility to unmap the pte if it is returned. + */ +static inline pte_t * +find_pte(struct vm_area_struct *vma, struct page *page, unsigned long *addr) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + unsigned long loffset; + unsigned long address; + + loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)); + address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT); + if (address < vma->vm_start || address >= vma->vm_end) + goto out; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pmd = pmd_offset(pgd, address); + if (!pmd_present(*pmd)) + goto out; + + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) + goto out_unmap; + + if (page_to_pfn(page) != pte_pfn(*pte)) + goto out_unmap; + + if (addr) + *addr = address; + + return pte; + +out_unmap: + pte_unmap(pte); +out: + return NULL; +} + +/** + * page_referenced_obj_one - referenced check for object-based rmap + * @vma: the vma to look in. + * @page: the page we're working on. + * + * Find a pte entry for a page/vma pair, then check and clear the referenced + * bit. + * + * This is strictly a helper function for page_referenced_obj. + */ +static int +page_referenced_obj_one(struct vm_area_struct *vma, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + pte_t *pte; + int referenced = 0; + + if (!spin_trylock(&mm->page_table_lock)) + return 1; + + pte = find_pte(vma, page, NULL); + if (pte) { + if (ptep_test_and_clear_young(pte)) + referenced++; + pte_unmap(pte); + } + + spin_unlock(&mm->page_table_lock); + return referenced; +} + +/** + * page_referenced_obj_one - referenced check for object-based rmap + * @page: the page we're checking references on. + * + * For an object-based mapped page, find all the places it is mapped and + * check/clear the referenced flag. This is done by following the page->mapping + * pointer, then walking the chain of vmas it holds. It returns the number + * of references it found. + * + * This function is only called from page_referenced for object-based pages. + * + * The semaphore address_space->i_shared_sem is tried. 
If it can't be gotten, + * assume a reference count of 1. + */ +static int +page_referenced_obj(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct vm_area_struct *vma; + int referenced = 0; + + if (!page->pte.mapcount) + return 0; + + if (!mapping) + BUG(); + + if (PageSwapCache(page)) + BUG(); + + if (down_trylock(&mapping->i_shared_sem)) + return 1; + + list_for_each_entry(vma, &mapping->i_mmap, shared) + referenced += page_referenced_obj_one(vma, page); + + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) + referenced += page_referenced_obj_one(vma, page); + + up(&mapping->i_shared_sem); + + return referenced; +} + +/** * page_referenced - test if the page was referenced * @page: the page to test * @@ -120,6 +250,10 @@ int page_referenced(struct page * page) if (TestClearPageReferenced(page)) referenced++; + if (!PageAnon(page)) { + referenced += page_referenced_obj(page); + goto out; + } if (PageDirect(page)) { pte_t *pte = rmap_ptep_map(page->pte.direct); if (ptep_test_and_clear_young(pte)) @@ -153,6 +287,7 @@ int page_referenced(struct page * page) __pte_chain_free(pc); } } +out: return referenced; } @@ -175,6 +310,21 @@ page_add_rmap(struct page *page, pte_t * pte_chain_lock(page); + /* + * If this is an object-based page, just count it. We can + * find the mappings by walking the object vma chain for that object. + */ + if (!PageAnon(page)) { + if (!page->mapping) + BUG(); + if (PageSwapCache(page)) + BUG(); + if (!page->pte.mapcount) + inc_page_state(nr_mapped); + page->pte.mapcount++; + goto out; + } + if (page->pte.direct == 0) { page->pte.direct = pte_paddr; SetPageDirect(page); @@ -231,8 +381,25 @@ void page_remove_rmap(struct page *page, pte_chain_lock(page); if (!page_mapped(page)) - goto out_unlock; /* remap_page_range() from a driver? */ + goto out_unlock; + /* + * If this is an object-based page, just uncount it. We can + * find the mappings by walking the object vma chain for that object. + */ + if (!PageAnon(page)) { + if (!page->mapping) + BUG(); + if (PageSwapCache(page)) + BUG(); + if (!page->pte.mapcount) + BUG(); + page->pte.mapcount--; + if (!page->pte.mapcount) + dec_page_state(nr_mapped); + goto out_unlock; + } + if (PageDirect(page)) { if (page->pte.direct == pte_paddr) { page->pte.direct = 0; @@ -279,6 +446,102 @@ out_unlock: } /** + * try_to_unmap_obj - unmap a page using the object-based rmap method + * @page: the page to unmap + * + * Determine whether a page is mapped in a given vma and unmap it if it's found. + * + * This function is strictly a helper function for try_to_unmap_obj. 
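On the address computation inside find_pte() further up: for a file-backed vma, the page's index in the file together with the vma's vm_pgoff pins down the single virtual address (per vma) where that page can appear, which is what lets the object-based code walk i_mmap and i_mmap_shared instead of pte_chains. The same arithmetic as a toy, assuming PAGE_CACHE_SHIFT == PAGE_SHIFT so the loffset shift drops out:

/* Toy version of the virtual-address lookup used by find_pte(): given a
 * file page index and a vma, compute where that page would be mapped.
 * Assumes one page per page-cache page. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define BAD_ADDR   (~0UL)

struct toy_vma {
	unsigned long vm_start;
	unsigned long vm_end;
	unsigned long vm_pgoff;   /* file page index of vm_start */
};

static unsigned long page_address_in_vma(const struct toy_vma *vma,
					  unsigned long page_index)
{
	unsigned long address;

	address = vma->vm_start +
		  ((page_index - vma->vm_pgoff) << PAGE_SHIFT);
	/* the page may simply not be covered by this vma */
	if (address < vma->vm_start || address >= vma->vm_end)
		return BAD_ADDR;
	return address;
}

int main(void)
{
	struct toy_vma vma = {
		.vm_start = 0x60000000UL,
		.vm_end   = 0x60020000UL,  /* 32 pages */
		.vm_pgoff = 10,            /* maps file pages 10..41 */
	};
	unsigned long addr;

	printf("file page 14 -> %#lx\n", page_address_in_vma(&vma, 14));

	addr = page_address_in_vma(&vma, 3);
	if (addr == BAD_ADDR)
		printf("file page  3 -> not mapped in this vma\n");
	return 0;
}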
+ */ +static inline int +try_to_unmap_obj_one(struct vm_area_struct *vma, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pte_t *pte; + pte_t pteval; + int ret = SWAP_AGAIN; + + if (!spin_trylock(&mm->page_table_lock)) + return ret; + + pte = find_pte(vma, page, &address); + if (!pte) + goto out; + + if (vma->vm_flags & VM_LOCKED) { + ret = SWAP_FAIL; + goto out_unmap; + } + + flush_cache_page(vma, address); + pteval = ptep_get_and_clear(pte); + flush_tlb_page(vma, address); + + if (pte_dirty(pteval)) + set_page_dirty(page); + + if (!page->pte.mapcount) + BUG(); + + mm->rss--; + page->pte.mapcount--; + page_cache_release(page); + +out_unmap: + pte_unmap(pte); + +out: + spin_unlock(&mm->page_table_lock); + return ret; +} + +/** + * try_to_unmap_obj - unmap a page using the object-based rmap method + * @page: the page to unmap + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the address_space struct it points to. + * + * This function is only called from try_to_unmap for object-based pages. + * + * The semaphore address_space->i_shared_sem is tried. If it can't be gotten, + * return a temporary error. + */ +static int +try_to_unmap_obj(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct vm_area_struct *vma; + int ret = SWAP_AGAIN; + + if (!mapping) + BUG(); + + if (PageSwapCache(page)) + BUG(); + + if (down_trylock(&mapping->i_shared_sem)) + return ret; + + list_for_each_entry(vma, &mapping->i_mmap, shared) { + ret = try_to_unmap_obj_one(vma, page); + if (ret == SWAP_FAIL || !page->pte.mapcount) + goto out; + } + + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + ret = try_to_unmap_obj_one(vma, page); + if (ret == SWAP_FAIL || !page->pte.mapcount) + goto out; + } + +out: + up(&mapping->i_shared_sem); + return ret; +} + +/** * try_to_unmap_one - worker function for try_to_unmap * @page: page to unmap * @ptep: page table entry to unmap from page @@ -360,7 +623,7 @@ static int try_to_unmap_one(struct page if (pte_dirty(pte)) set_page_dirty(page); - mm->rss--; + dec_rss(mm, page); page_cache_release(page); ret = SWAP_SUCCESS; @@ -397,6 +660,15 @@ int try_to_unmap(struct page * page) if (!page->mapping) BUG(); + /* + * If it's an object-based page, use the object vma chain to find all + * the mappings. + */ + if (!PageAnon(page)) { + ret = try_to_unmap_obj(page); + goto out; + } + if (PageDirect(page)) { ret = try_to_unmap_one(page, page->pte.direct); if (ret == SWAP_SUCCESS) { @@ -452,12 +724,115 @@ int try_to_unmap(struct page * page) } } out: - if (!page_mapped(page)) + if (!page_mapped(page)) { dec_page_state(nr_mapped); + ret = SWAP_SUCCESS; + } return ret; } /** + * page_convert_anon - Convert an object-based mapped page to pte_chain-based. + * @page: the page to convert + * + * Find all the mappings for an object-based page and convert them + * to 'anonymous', ie create a pte_chain and store all the pte pointers there. + * + * This function takes the address_space->i_shared_sem, sets the PageAnon flag, + * then sets the mm->page_table_lock for each vma and calls page_add_rmap. This + * means there is a period when PageAnon is set, but still has some mappings + * with no pte_chain entry. This is in fact safe, since page_remove_rmap will + * simply not find it. try_to_unmap might erroneously return success, but it + * will never be called because the page_convert_anon() caller has locked the + * page. 
+ * + * page_referenced() may fail to scan all the appropriate pte's and may return + * an inaccurate result. This is so rare that it does not matter. + */ +int page_convert_anon(struct page *page) +{ + struct address_space *mapping; + struct vm_area_struct *vma; + struct pte_chain *pte_chain = NULL; + pte_t *pte; + int err = 0; + + mapping = page->mapping; + if (mapping == NULL) + goto out; /* truncate won the lock_page() race */ + + down(&mapping->i_shared_sem); + pte_chain_lock(page); + + /* + * Has someone else done it for us before we got the lock? + * If so, pte.direct or pte.chain has replaced pte.mapcount. + */ + if (PageAnon(page)) { + pte_chain_unlock(page); + goto out_unlock; + } + + SetPageAnon(page); + if (page->pte.mapcount == 0) { + pte_chain_unlock(page); + goto out_unlock; + } + /* This is gonna get incremented by page_add_rmap */ + dec_page_state(nr_mapped); + page->pte.mapcount = 0; + + /* + * Now that the page is marked as anon, unlock it. page_add_rmap will + * lock it as necessary. + */ + pte_chain_unlock(page); + + list_for_each_entry(vma, &mapping->i_mmap, shared) { + if (!pte_chain) { + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + err = -ENOMEM; + goto out_unlock; + } + } + spin_lock(&vma->vm_mm->page_table_lock); + pte = find_pte(vma, page, NULL); + if (pte) { + /* Make sure this isn't a duplicate */ + page_remove_rmap(page, pte); + pte_chain = page_add_rmap(page, pte, pte_chain); + pte_unmap(pte); + } + spin_unlock(&vma->vm_mm->page_table_lock); + } + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + if (!pte_chain) { + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + err = -ENOMEM; + goto out_unlock; + } + } + spin_lock(&vma->vm_mm->page_table_lock); + pte = find_pte(vma, page, NULL); + if (pte) { + /* Make sure this isn't a duplicate */ + page_remove_rmap(page, pte); + pte_chain = page_add_rmap(page, pte, pte_chain); + pte_unmap(pte); + } + spin_unlock(&vma->vm_mm->page_table_lock); + } + +out_unlock: + pte_chain_free(pte_chain); + up(&mapping->i_shared_sem); +out: + return err; +} + +/** ** No more VM stuff below this comment, only pte_chain helper ** functions. 
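A pattern the object-based paths above use throughout: reclaim never sleeps on i_shared_sem or a page_table_lock, it trylocks and, on contention, returns a deliberately conservative answer (a reference count of 1, or SWAP_AGAIN) so the caller simply tries again later. The same idea reduced to a user-space illustration:

/* Illustration of the "trylock or give a conservative answer" pattern
 * used by page_referenced_obj() and try_to_unmap_obj().  Not kernel code. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t stats_lock = PTHREAD_MUTEX_INITIALIZER;
static int reference_count;        /* protected by stats_lock */

/* Returns the reference count, or a safe over-estimate of 1 if the lock
 * is busy - never blocks the (reclaim-like) caller. */
static int references_or_guess(void)
{
	int refs;

	if (pthread_mutex_trylock(&stats_lock) != 0)
		return 1;          /* busy: assume "recently referenced" */

	refs = reference_count;
	pthread_mutex_unlock(&stats_lock);
	return refs;
}

int main(void)
{
	printf("uncontended: %d\n", references_or_guess());

	pthread_mutex_lock(&stats_lock);           /* simulate contention */
	printf("contended:   %d (conservative)\n", references_or_guess());
	pthread_mutex_unlock(&stats_lock);
	return 0;
}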
**/ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/swap_state.c 999-mjb/mm/swap_state.c --- 000-virgin/mm/swap_state.c 2003-10-01 11:35:37.000000000 -0700 +++ 999-mjb/mm/swap_state.c 2003-10-02 16:53:55.000000000 -0700 @@ -25,7 +25,7 @@ extern struct address_space_operations s struct address_space swapper_space = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC), - .page_lock = SPIN_LOCK_UNLOCKED, + .page_lock = MAPPING_RW_LOCK_UNLOCKED, .clean_pages = LIST_HEAD_INIT(swapper_space.clean_pages), .dirty_pages = LIST_HEAD_INIT(swapper_space.dirty_pages), .io_pages = LIST_HEAD_INIT(swapper_space.io_pages), @@ -182,9 +182,9 @@ void delete_from_swap_cache(struct page entry.val = page->index; - spin_lock(&swapper_space.page_lock); + mapping_wrlock(&swapper_space.page_lock); __delete_from_swap_cache(page); - spin_unlock(&swapper_space.page_lock); + mapping_wrunlock(&swapper_space.page_lock); swap_free(entry); page_cache_release(page); @@ -195,8 +195,8 @@ int move_to_swap_cache(struct page *page struct address_space *mapping = page->mapping; int err; - spin_lock(&swapper_space.page_lock); - spin_lock(&mapping->page_lock); + mapping_wrlock(&swapper_space.page_lock); + mapping_wrlock(&mapping->page_lock); err = radix_tree_insert(&swapper_space.page_tree, entry.val, page); if (!err) { @@ -204,8 +204,8 @@ int move_to_swap_cache(struct page *page ___add_to_page_cache(page, &swapper_space, entry.val); } - spin_unlock(&mapping->page_lock); - spin_unlock(&swapper_space.page_lock); + mapping_wrunlock(&mapping->page_lock); + mapping_wrunlock(&swapper_space.page_lock); if (!err) { if (!swap_duplicate(entry)) @@ -231,8 +231,8 @@ int move_from_swap_cache(struct page *pa entry.val = page->index; - spin_lock(&swapper_space.page_lock); - spin_lock(&mapping->page_lock); + mapping_wrlock(&swapper_space.page_lock); + mapping_wrlock(&mapping->page_lock); err = radix_tree_insert(&mapping->page_tree, index, page); if (!err) { @@ -240,8 +240,8 @@ int move_from_swap_cache(struct page *pa ___add_to_page_cache(page, mapping, index); } - spin_unlock(&mapping->page_lock); - spin_unlock(&swapper_space.page_lock); + mapping_wrunlock(&mapping->page_lock); + mapping_wrunlock(&swapper_space.page_lock); if (!err) { swap_free(entry); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/swapfile.c 999-mjb/mm/swapfile.c --- 000-virgin/mm/swapfile.c 2003-10-01 11:47:15.000000000 -0700 +++ 999-mjb/mm/swapfile.c 2003-10-02 16:53:56.000000000 -0700 @@ -253,10 +253,10 @@ static int exclusive_swap_page(struct pa /* Is the only swap cache user the cache itself? */ if (p->swap_map[swp_offset(entry)] == 1) { /* Recheck the page count with the pagecache lock held.. */ - spin_lock(&swapper_space.page_lock); + mapping_rdlock(&swapper_space.page_lock); if (page_count(page) - !!PagePrivate(page) == 2) retval = 1; - spin_unlock(&swapper_space.page_lock); + mapping_rdunlock(&swapper_space.page_lock); } swap_info_put(p); } @@ -324,13 +324,13 @@ int remove_exclusive_swap_page(struct pa retval = 0; if (p->swap_map[swp_offset(entry)] == 1) { /* Recheck the page count with the pagecache lock held.. 
*/ - spin_lock(&swapper_space.page_lock); + mapping_wrlock(&swapper_space.page_lock); if ((page_count(page) == 2) && !PageWriteback(page)) { __delete_from_swap_cache(page); SetPageDirty(page); retval = 1; } - spin_unlock(&swapper_space.page_lock); + mapping_wrunlock(&swapper_space.page_lock); } swap_info_put(p); @@ -387,9 +387,10 @@ static void unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir, swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) { - vma->vm_mm->rss++; + inc_rss(vma->vm_mm, page); get_page(page); set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); + SetPageAnon(page); *pte_chainp = page_add_rmap(page, dir, *pte_chainp); swap_free(entry); } @@ -498,6 +499,7 @@ static int unuse_process(struct mm_struc /* * Go through process' page directory. */ + down_read(&mm->mmap_sem); spin_lock(&mm->page_table_lock); for (vma = mm->mmap; vma; vma = vma->vm_next) { pgd_t * pgd = pgd_offset(mm, vma->vm_start); @@ -505,6 +507,7 @@ static int unuse_process(struct mm_struc break; } spin_unlock(&mm->page_table_lock); + up_read(&mm->mmap_sem); pte_chain_free(pte_chain); return 0; } diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/truncate.c 999-mjb/mm/truncate.c --- 000-virgin/mm/truncate.c 2003-06-05 14:56:45.000000000 -0700 +++ 999-mjb/mm/truncate.c 2003-10-02 16:53:56.000000000 -0700 @@ -73,13 +73,13 @@ invalidate_complete_page(struct address_ if (PagePrivate(page) && !try_to_release_page(page, 0)) return 0; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); if (PageDirty(page)) { - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); return 0; } __remove_from_page_cache(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); ClearPageUptodate(page); page_cache_release(page); /* pagecache ref */ return 1; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/vmscan.c 999-mjb/mm/vmscan.c --- 000-virgin/mm/vmscan.c 2003-10-01 11:47:15.000000000 -0700 +++ 999-mjb/mm/vmscan.c 2003-10-02 16:53:56.000000000 -0700 @@ -353,7 +353,7 @@ shrink_list(struct list_head *page_list, goto keep_locked; if (!may_write_to_queue(mapping->backing_dev_info)) goto keep_locked; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); if (test_clear_page_dirty(page)) { int res; struct writeback_control wbc = { @@ -364,7 +364,7 @@ shrink_list(struct list_head *page_list, }; list_move(&page->list, &mapping->locked_pages); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); SetPageReclaim(page); res = mapping->a_ops->writepage(page, &wbc); @@ -380,7 +380,7 @@ shrink_list(struct list_head *page_list, } goto keep; } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); } /* @@ -414,7 +414,7 @@ shrink_list(struct list_head *page_list, if (!mapping) goto keep_locked; /* truncate got there first */ - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); /* * The non-racy check for busy page. It is critical to check @@ -422,7 +422,7 @@ shrink_list(struct list_head *page_list, * not in use by anybody. 
(pagecache + us == 2) */ if (page_count(page) != 2 || PageDirty(page)) { - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); goto keep_locked; } @@ -430,7 +430,7 @@ shrink_list(struct list_head *page_list, if (PageSwapCache(page)) { swp_entry_t swap = { .val = page->index }; __delete_from_swap_cache(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); swap_free(swap); __put_page(page); /* The pagecache ref */ goto free_it; @@ -438,7 +438,7 @@ shrink_list(struct list_head *page_list, #endif /* CONFIG_SWAP */ __remove_from_page_cache(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); __put_page(page); free_it: diff -purN -X /home/mbligh/.diff.exclude 000-virgin/scripts/Makefile.build 999-mjb/scripts/Makefile.build --- 000-virgin/scripts/Makefile.build 2003-10-01 11:48:31.000000000 -0700 +++ 999-mjb/scripts/Makefile.build 2003-10-02 16:43:03.000000000 -0700 @@ -128,7 +128,16 @@ cmd_cc_i_c = $(CPP) $(c_flags) - quiet_cmd_cc_o_c = CC $(quiet_modtag) $@ ifndef CONFIG_MODVERSIONS -cmd_cc_o_c = $(CC) $(c_flags) -c -o $@ $< +new1_c_flags = $(c_flags:-I%=-I$(TOPDIR)/%) +new2_c_flags = $(new1_c_flags:-Wp%=) +PWD = $(TOPDIR) + +quiet_cmd_cc_o_c = CC $(quiet_modtag) $@ +cmd_cc_o_c = $(CC) $(c_flags) -E -o $@ $< \ + && cd $(dir $<) \ + && $(CC) $(new2_c_flags) -c -o $(notdir $@) $(notdir $<) \ + && cd $(TOPDIR) +#cmd_cc_o_c = $(CC) $(c_flags) -c -o $@ $< else # When module versioning is enabled the following steps are executed: @@ -143,12 +152,21 @@ else # replace the unresolved symbols __crc_exported_symbol with # the actual value of the checksum generated by genksyms -cmd_cc_o_c = $(CC) $(c_flags) -c -o $(@D)/.tmp_$(@F) $< +new1_c_flags = $(c_flags:-I%=-I$(TOPDIR)/%) +new2_c_flags = $(new1_c_flags:-Wp%=) +PWD = $(TOPDIR) + +quiet_cmd_cc_o_c = CC $(quiet_modtag) $@ +cmd_cc_o_c = $(CC) $(c_flags) -E -o $@ $< \ + && cd $(dir $<) \ + && $(CC) $(new2_c_flags) -c -o .tmp_$(@F) $(notdir $<) \ + && cd $(TOPDIR) +#cmd_cc_o_c = $(CC) $(c_flags) -c -o $(@D)/.tmp_$(@F) $< cmd_modversions = \ if ! $(OBJDUMP) -h $(@D)/.tmp_$(@F) | grep -q __ksymtab; then \ mv $(@D)/.tmp_$(@F) $@; \ else \ - $(CPP) -D__GENKSYMS__ $(c_flags) $< \ + $(CPP) -D__GENKSYMS__ $(new2_c_flags) $< \ | $(GENKSYMS) \ > $(@D)/.tmp_$(@F:.o=.ver); \ \