diff -Naurp linux-2.4.20-wolk4.0s/Documentation/Configure.help linux-2.4.20-wolk4.1-fullkernel/Documentation/Configure.help --- linux-2.4.20-wolk4.0s/Documentation/Configure.help 2003-05-15 21:52:18.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/Documentation/Configure.help 2003-05-19 13:03:55.000000000 +0200 @@ -1485,7 +1485,8 @@ CONFIG_HZ your desktop behaviour, for example sound output *may* stutter. So, if you have very unusual behaviour of your desktop with HZ=1000, set - this back to 100 and try again. + this back to 100 and try again. You can also try using HZ=200. This should + work ok. If unsure, leave the default 100. @@ -2252,10 +2253,11 @@ CONFIG_BLK_DEV_ELEVATOR_LOWLAT For the interested ones: ------------------------ - nr_requests: 4 + nr_requests: 32 read_passovers: 0 write_passovers: 0 - bdflush: 50, 500, 0, 0, 5*HZ, 30*HZ, 60, 20, 0 + max_bomb_segments: 1 + bdflush: 30/50, 500, 0, 0, 5*HZ, 30*HZ, 60, 20, 0 You can, for sure, lower the latency much more but you'll experience less write throughput. @@ -25511,7 +25513,7 @@ CONFIG_FILE_RESERVED root after the hard limit is reached. Don't lower this value unless you know what you are doing! - If unsure, use the default of 128. + If unsure, use the default of 256. Maximum amount of unix sockets CONFIG_UNIX_MAX_SOCKETS diff -Naurp linux-2.4.20-wolk4.0s/Documentation/filesystems/proc.txt linux-2.4.20-wolk4.1-fullkernel/Documentation/filesystems/proc.txt --- linux-2.4.20-wolk4.0s/Documentation/filesystems/proc.txt 2003-05-15 21:52:18.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/Documentation/filesystems/proc.txt 2003-05-16 13:36:57.000000000 +0200 @@ -1892,25 +1892,6 @@ values here give more preferance to runn of expired tasks. Lower values provide more fair scheduling behavior, at the expense of interactivity. The units are in milliseconds. -thead_penalty -------------- - -Limit sum of timeslices used by a threadgroup to 100/n timeslices. This -is used to prevent heavily thread applications from slowing down the system -when many threads are active. For this item, threads are defined as processes -sharing their mm and files. This implies that if this is set to 33 and six -processes from a given threadgroup are in runqueues each process will have its -timeslice reduced by 50%. Set to zero to disable. - -user_penalty ------------- - -Limit the sum of timeslices used by a user to 100/n timeslices. This prevents -one user from stealing the cpu by creating many active threads. For example, -if this is set to 25 and six processes are in runqueues the timeslice of each -process will be reduced by 33%. Set to zero to disable - root is always -excluded from this logic. - ------------------------------------------------------------------------------ Summary ------------------------------------------------------------------------------ diff -Naurp linux-2.4.20-wolk4.0s/INDEX linux-2.4.20-wolk4.1-fullkernel/INDEX --- linux-2.4.20-wolk4.0s/INDEX 2003-05-15 21:52:19.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/INDEX 1970-01-01 01:00:00.000000000 +0100 @@ -1,29 +0,0 @@ -Kernel patches against 2.4.20 + 2.4.20-dm-10 for EVMS 2.0.1. Apply in the -following order, with: - -cd /usr/src/linux-2.4.20/ -patch -p1 < filename.patch - -1-dm-base.patch: - Extra patch for the base device-mapper code to operate correctly with - the new EVMS engine. - -2-syncio.patch: - Patch to provide a synchronous I/O service to device-mapper targets. - -3-dm-bbr.patch: - Patch to provide a Bad-Block-Relocation target for device-mapper. 
- -4-dm-sparse.patch: - Patch to provide a Sparse-device target for device-mapper. - -5-md.c.patch: - Extra patch for the base software-RAID code to operate correctly with - the new EVMS engine. - -6-vsprintf.c.patch: - Extra patch for a kernel library to properly scan hex digits. - -7-vfs-lock.patch - Patch to add VFS-locking code. Required for using snapshots with - journalled filesystems. diff -Naurp linux-2.4.20-wolk4.0s/Makefile linux-2.4.20-wolk4.1-fullkernel/Makefile --- linux-2.4.20-wolk4.0s/Makefile 2003-05-15 21:52:19.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/Makefile 2003-05-16 14:21:50.000000000 +0200 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 4 SUBLEVEL = 20 -EXTRAVERSION = -wolk4.0s +EXTRAVERSION = -wolk4.1s KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) diff -Naurp linux-2.4.20-wolk4.0s/REPORTING-BUGS linux-2.4.20-wolk4.1-fullkernel/REPORTING-BUGS --- linux-2.4.20-wolk4.0s/REPORTING-BUGS 2003-05-15 21:52:19.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/REPORTING-BUGS 2003-05-16 14:21:51.000000000 +0200 @@ -1,6 +1,8 @@ WOLK SPECIFIC OR IN OTHER WORDS: THE RIGHT WAY TO SUBMIT BUGS REPORTS: ---------------------------------------------------------------------- + NOTE: IF YOU DON'T REPORT BUGS DON'T EXPECT ANYTHING GET FIXED!!!!!!! + 1. What _exactly_ is not working for you? Tell me as much as you can! 2. Steps to reproduce? @@ -9,28 +11,32 @@ WOLK SPECIFIC OR IN OTHER WORDS: THE RIG 4. What filesystem? / What mount options? - 4. Changed the fs from ext3 to ext2 for example and see a difference? + 5. Changed the fs from ext3 to ext2 for example and see a difference? - 5. What tweaks? + 6. What tweaks? - 6. You've read the CHANGELOG _carefully_ and noticed the things you can + 7. You've read the CHANGELOG _carefully_ and noticed the things you can change at, for example, mount time, sched_yield_scale in /proc? - 7. Your ".config" + 8. Your ".config" - 8. Output of: "dmesg" just after reboot or: "/var/log/dmesg" after some uptime + 9. Output of: "dmesg" just after reboot or: "/var/log/dmesg" after some uptime - 9. Output of: "lspci -vvv" +10. Output of: "lspci -vvv" -10. If you have an OOPS, ksymoops it and do _not_ send the OOPS only! +11. If you have an OOPS, ksymoops it and do _not_ send the OOPS only! + Also enable these options so the oops output is more helpfull: + - enable CONFIG_FRAME_POINTER + - enable CONFIG_KALLSYMS -11. If you have a deadlock, en-/disable the following, test again and +12. If you have a deadlock, en-/disable the following, test again and come back: - enable CONFIG_FRAME_POINTER - enable CONFIG_KALLSYMS - enable CONFIG_DEBUG_SLAB - enable CONFIG_DEBUG_IOVIRT - enable CONFIG_KDB (and read the docs in Documentation/kdb) + - disable CONFIG_ACPI *or* boot with "acpi=off" - disable CONFIG_DEBUG_STACKOVERFLOW - disable CONFIG_MEMORYPOOL - disable CONFIG_PREEMPT @@ -38,12 +44,12 @@ WOLK SPECIFIC OR IN OTHER WORDS: THE RIG - disable _all_ GRSECURITY stuff - less Documentation/nmi_watchdog.txt. Read it, do it, come back. -12. Disable all useless stuff for your machine you are actually testing! +13. Disable all useless stuff for your machine you are actually testing! -13. What compiler version? / What Distribution? +14. What compiler version? / What Distribution? Redhat is known to always fuck up their compiler! -14. Write those stuff to the list, _not_ as private mail! +15. Write those stuff to the list, _not_ as private mail! 16. 
Output of: "/proc/slabinfo" diff -Naurp linux-2.4.20-wolk4.0s/VERSION linux-2.4.20-wolk4.1-fullkernel/VERSION --- linux-2.4.20-wolk4.0s/VERSION 2003-05-15 21:52:19.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/VERSION 2003-05-16 14:17:06.000000000 +0200 @@ -1 +1 @@ -WOLK v4.0s "Server Edition" FINAL, based on 2.4.20 +WOLK v4.1s "Server Edition" FINAL, based on 2.4.20 diff -Naurp linux-2.4.20-wolk4.0s/WOLK-CHANGELOG linux-2.4.20-wolk4.1-fullkernel/WOLK-CHANGELOG --- linux-2.4.20-wolk4.0s/WOLK-CHANGELOG 2003-05-15 21:52:19.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/WOLK-CHANGELOG 2003-05-19 12:52:07.000000000 +0200 @@ -1,3 +1,26 @@ +Changelog from v4.0s -> v4.1s +----------------------------- +o fixed: hashing exploits in ipv4 routing, IP conntrack, and TCP synq +o fixed: IOPERM system call I/O port access vulnerability ++ fixed: cloop unresolved symbols vs. zlib +o fixed: NFS client stuck in D state +o fixed: mm corrupting SMP race between remove_inode_page and prune_icache +o fixed: Must wakeup with end_buffer_io_kiobuf as last thing to be sure + the wakeup will happen on a still allocated kiobuf +o fixed: Avoid spurious duplicate acks for very minor window updates, + that generates the double outgoing traffic with streaming + services that tends to fill the whole receive window to buffer +o fixed: fd leak +o fixed: vma merging issue with device driver supplied mappings +o fixed: tcp_tw_death_row corruption +o fixed: OOM killer braindamage (RMAP) +o fixed: VesaFB and highmem where screen stays black +o update: AIO-18 +o update: Super FreeS/WAN v1.99.7 Final (including Dead Pear Detection) +o removed: Network checksumming speed runtime detection + does not give any real advantage, causes bugs on recent Athlons + + Changelog from v4.0s-rc8 -> v4.0s-final --------------------------------------- o update: rmap VM v15i diff -Naurp linux-2.4.20-wolk4.0s/WOLK-README linux-2.4.20-wolk4.1-fullkernel/WOLK-README --- linux-2.4.20-wolk4.0s/WOLK-README 2003-05-15 21:52:19.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/WOLK-README 2003-05-19 12:52:51.000000000 +0200 @@ -1,4 +1,4 @@ -Kernel - patched - WOLK v4.0s - Base: Linux kernel 2.4.20 +Kernel - patched - WOLK v4.1s - Base: Linux kernel 2.4.20 located at http://sf.net/projects/wolk by Marc-Christian Petersen -------------------------------------------------------------------------- @@ -42,7 +42,7 @@ Some of the features: --------------------- O(1) Scheduler, RMAP VM, GRsecurity, Crypto, XFS, KDB, Preempt, Systrace, Super FreeS/WAN, Trustees, IPVS, i2c/lmsensors, TUX, EVMS, BadMEM, ftpfs, -HostAP ... and many more. +HostAP, all known security fixes, all known filesystem fixes, and many more. 
Credits go to all the people who created the patches, working hard on diff -Naurp linux-2.4.20-wolk4.0s/arch/all/Config-TWEAKS.in linux-2.4.20-wolk4.1-fullkernel/arch/all/Config-TWEAKS.in --- linux-2.4.20-wolk4.0s/arch/all/Config-TWEAKS.in 2003-05-15 21:52:19.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/arch/all/Config-TWEAKS.in 2003-05-19 13:05:09.000000000 +0200 @@ -16,7 +16,7 @@ fi # Low Latency / Low Latency Elevator comment 'Low Latency fixes are enabled' -bool 'Low Latency Elevator - block atomic Edition' CONFIG_BLK_DEV_ELEVATOR_LOWLAT +bool 'Low Latency Elevator - Read Latency v2 Edition' CONFIG_BLK_DEV_ELEVATOR_LOWLAT # Preempt diff -Naurp linux-2.4.20-wolk4.0s/arch/i386/kernel/ioport.c linux-2.4.20-wolk4.1-fullkernel/arch/i386/kernel/ioport.c --- linux-2.4.20-wolk4.0s/arch/i386/kernel/ioport.c 2003-05-15 21:52:19.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/arch/i386/kernel/ioport.c 2003-05-17 12:55:41.000000000 +0200 @@ -113,17 +113,18 @@ asmlinkage int sys_ioperm(unsigned long */ memset(t->io_bitmap,0xff,(IO_BITMAP_SIZE+1)*4); t->ioperm = 1; - /* - * this activates it in the TSS - */ - tss->bitmap = IO_BITMAP_OFFSET; } /* * do it in the per-thread copy and in the TSS ... */ set_bitmap(t->io_bitmap, from, num, !turn_on); - set_bitmap(tss->io_bitmap, from, num, !turn_on); + if (tss->bitmap == IO_BITMAP_OFFSET) { /* already active? */ + set_bitmap(tss->io_bitmap, from, num, !turn_on); + } else { + memcpy(tss->io_bitmap, t->io_bitmap, IO_BITMAP_SIZE); + tss->bitmap = IO_BITMAP_OFFSET; /* Activate it in the TSS */ + } preempt_enable(); return 0; diff -Naurp linux-2.4.20-wolk4.0s/arch/i386/lib/Makefile linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/Makefile --- linux-2.4.20-wolk4.0s/arch/i386/lib/Makefile 2003-05-15 21:52:20.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/Makefile 2003-05-19 08:37:47.000000000 +0200 @@ -7,16 +7,9 @@ L_TARGET = lib.a -obj-y = old-checksum.o delay.o \ +obj-y = checksum.o old-checksum.o delay.o \ usercopy.o getuser.o \ - memcpy.o strstr.o \ - bench_csum.o \ - csum.o \ - csum_basic.o \ - csum_naive.o \ - csum_3dnow.o \ - csum_ssemmxplus.o \ - csumcpy.o + memcpy.o strstr.o obj-$(CONFIG_X86_USE_3DNOW) += mmx.o obj-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o diff -Naurp linux-2.4.20-wolk4.0s/arch/i386/lib/bench_csum.c linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/bench_csum.c --- linux-2.4.20-wolk4.0s/arch/i386/lib/bench_csum.c 2003-05-15 21:52:20.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/bench_csum.c 1970-01-01 01:00:00.000000000 +0100 @@ -1,217 +0,0 @@ -#include // for get_pages -#include // for access_ok in asm/checksum.h -#include // for in6_addr in asm/checksum.h -#include // for ntoh in asm/checksum.h -#include // for X86_FEATURE_xx -#include // for ntohX in asm/checksum.h -#include // for NULL in asm/checksum.h -#include // for asmlinkage in asm/checksum.h -#include - -//#include "bench_csum.h" -#include -#include - -//#define dprintk(a...) printk(a) -#define dprintk(a...) ((void)0) - -/* Features usable for mem optimization: - Intel -X86_FEATURE_FPU Onboard FPU -X86_FEATURE_MMX Multimedia Extensions -X86_FEATURE_XMM Streaming SIMD Extensions -X86_FEATURE_XMM2 Streaming SIMD Extensions-2 - AMD -X86_FEATURE_3DNOW 3DNow! -X86_FEATURE_MMXEXT AMD MMX extensions -X86_FEATURE_3DNOWEXT AMD 3DNow! 
extensions - Cyrix -X86_FEATURE_CXMMX Cyrix MMX extensions -*/ - -typedef typeof(jiffies) jiffies_t; - -typedef void asm_helper(void); - -extern asm_helper csum_basic; -extern asm_helper csum_naive; -extern asm_helper csum_3dnow; -extern asm_helper csum_ssemmxplus; - -static struct candidate csum_runner[] = { - { "basic" , csum_basic , 1, { -1 } }, - { "simple" , csum_naive , 1, { -1 } }, - { "3Dnow!" , csum_3dnow , 1, { X86_FEATURE_3DNOW, -1 } }, - { "AMD MMX", csum_ssemmxplus, 1, { X86_FEATURE_MMXEXT, -1 } }, - { "SSE1+", csum_ssemmxplus, 1, { X86_FEATURE_XMM, -1 } }, -}; - -extern asm_helper csumcpy_basic; -extern asm_helper csumcpy_naive; -extern asm_helper csumcpy_ssemmxplus; -extern asm_helper csumcpy_sse; - -static struct candidate csumcpy_runner[] = { - { "basic" , csumcpy_basic , 2, { -1 } }, - { "simple" , csumcpy_naive , 2, { -1 } }, - /* higher weight: we prefer these for less cache pollution: */ - { "AMD MMX", csumcpy_ssemmxplus, 3, { X86_FEATURE_MMXEXT, -1 } }, - { "SSE1+", csumcpy_ssemmxplus, 3, { X86_FEATURE_XMM, -1 } }, - { "SSE1" , csumcpy_sse , 3, { X86_FEATURE_XMM, -1 } }, -}; - -//====== TODO: split here: above: arch, below:generic - -/* set this to value bigger than cache(s) */ -/* TODO: heuristic for buffer size */ -#define bufshift 20 /* 10=1kb, 20=1MB etc */ -/* typical size of a packet */ -#define chunksz (4*1024) - -#define bufsz (1<f); - - max = 0; - // In practice these are pretty repeatable - // so 3 runs is an overkill - for(i=0; i<3; i++) { - int count = 0; - jiffies_t limit; - wait_for_jiffy(); - limit = jiffies+duration; - while(time_before(jiffies, limit)) { - int i; - mb(); - // interleaved to avoid bias due to prefetch - for(i=0; imax) - max = count; - } - - if(report) { - int kb_sec = max * (((chunksz*chunkcnt)/1024) * HZ) / duration; - printk(" %-10s:%6d.%03d MB/sec\n", cand->name, - kb_sec / 1000, kb_sec % 1000); - } - - return max; -} - -static int -bench_csumcpy(struct candidate *cand, char *buf) -{ - int err; - int i, max; - best_csumcpy = (asm_helper*)(cand->f); - - max = 0; - for(i=0; i<3; i++) { - int count = 0; - jiffies_t limit; - wait_for_jiffy(); - limit = jiffies+duration; - while(time_before(jiffies, limit)) { - int i; - mb(); - // interleaved to avoid bias due to prefetch - for(i=0; imax) - max = count; - } - - if(report) { - int kb_sec = max * (((chunksz*chunkcnt)/1024) * HZ) / duration; - printk(" %-10s:%6d.%03d MB/sec\n", cand->name, - kb_sec / 1000, kb_sec % 1000); - } - - return max; -} - -static int -find_best_csum(void) -{ - struct candidate *best; - char *buffer = (char *) __get_free_pages(GFP_KERNEL, - (bufshift-PAGE_SHIFT)); - - printk(KERN_INFO "Measuring network checksumming speed\n"); - if(!buffer) { - printk("csum: cannot allocate %i pages\n", - 1<<(bufshift-PAGE_SHIFT) - ); - return -ENOMEM; - } - dprintk("allocated %i pages\n",1<<(bufshift-PAGE_SHIFT)); - - // find # of jiffies suitable for reliable results - // (at least %5 accuracy) - while(bench_csumcpy(&csumcpy_runner[0], buffer)<20) { - duration<<=1; - } - dprintk("test run will last %i ticks\n", duration); - report = 1; - - best = find_best(bench_csum, buffer, csum_runner, - VECTOR_SZ(csum_runner)); - printk("csum: using csum function: %s\n", best->name); - best_csum = (asm_helper*)(best->f); - - best = find_best(bench_csumcpy, buffer, csumcpy_runner, - VECTOR_SZ(csumcpy_runner)); - printk("csum: using csum_copy function: %s\n", best->name); - best_csumcpy = (asm_helper*)(best->f); - - free_pages((unsigned long)buffer, (bufshift-PAGE_SHIFT)); - dprintk("freed %i 
pages\n",1<<(bufshift-PAGE_SHIFT)); - return 0; -} - -MODULE_LICENSE("GPL"); - -module_init(find_best_csum); diff -Naurp linux-2.4.20-wolk4.0s/arch/i386/lib/checksum.S linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/checksum.S --- linux-2.4.20-wolk4.0s/arch/i386/lib/checksum.S 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/checksum.S 2003-05-19 08:37:47.000000000 +0200 @@ -0,0 +1,496 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IP/TCP/UDP checksumming routines + * + * Authors: Jorge Cwik, + * Arnt Gulbrandsen, + * Tom May, + * Pentium Pro/II routines: + * Alexander Kjeldaas + * Finn Arne Gangstad + * Lots of code moved from tcp.c and ip.c; see those files + * for more names. + * + * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception + * handling. + * Andi Kleen, add zeroing on error + * converted to pure assembler + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include + +/* + * computes a partial checksum, e.g. for TCP/UDP fragments + */ + +/* +unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) + */ + +.text +.align 4 +.globl csum_partial + +#ifndef CONFIG_X86_USE_PPRO_CHECKSUM + + /* + * Experiments with Ethernet and SLIP connections show that buff + * is aligned on either a 2-byte or 4-byte boundary. We get at + * least a twofold speedup on 486 and Pentium if it is 4-byte aligned. + * Fortunately, it is easy to convert 2-byte alignment to 4-byte + * alignment for the unrolled loop. + */ +csum_partial: + pushl %esi + pushl %ebx + movl 20(%esp),%eax # Function arg: unsigned int sum + movl 16(%esp),%ecx # Function arg: int len + movl 12(%esp),%esi # Function arg: unsigned char *buff + testl $3, %esi # Check alignment. + jz 2f # Jump if alignment is ok. + testl $1, %esi # Check alignment. + jz 10f # Jump if alignment is boundary of 2bytes. + + # buf is odd + dec %ecx + jl 8f + movzbl (%esi), %ebx + adcl %ebx, %eax + roll $8, %eax + inc %esi + testl $2, %esi + jz 2f +10: + subl $2, %ecx # Alignment uses up two bytes. + jae 1f # Jump if we had at least two bytes. + addl $2, %ecx # ecx was < 2. Deal with it. 
+ jmp 4f +1: movw (%esi), %bx + addl $2, %esi + addw %bx, %ax + adcl $0, %eax +2: + movl %ecx, %edx + shrl $5, %ecx + jz 2f + testl %esi, %esi +1: movl (%esi), %ebx + adcl %ebx, %eax + movl 4(%esi), %ebx + adcl %ebx, %eax + movl 8(%esi), %ebx + adcl %ebx, %eax + movl 12(%esi), %ebx + adcl %ebx, %eax + movl 16(%esi), %ebx + adcl %ebx, %eax + movl 20(%esi), %ebx + adcl %ebx, %eax + movl 24(%esi), %ebx + adcl %ebx, %eax + movl 28(%esi), %ebx + adcl %ebx, %eax + lea 32(%esi), %esi + dec %ecx + jne 1b + adcl $0, %eax +2: movl %edx, %ecx + andl $0x1c, %edx + je 4f + shrl $2, %edx # This clears CF +3: adcl (%esi), %eax + lea 4(%esi), %esi + dec %edx + jne 3b + adcl $0, %eax +4: andl $3, %ecx + jz 7f + cmpl $2, %ecx + jb 5f + movw (%esi),%cx + leal 2(%esi),%esi + je 6f + shll $16,%ecx +5: movb (%esi),%cl +6: addl %ecx,%eax + adcl $0, %eax +7: + testl $1, 12(%esp) + jz 8f + roll $8, %eax +8: + popl %ebx + popl %esi + ret + +#else + +/* Version for PentiumII/PPro */ + +csum_partial: + pushl %esi + pushl %ebx + movl 20(%esp),%eax # Function arg: unsigned int sum + movl 16(%esp),%ecx # Function arg: int len + movl 12(%esp),%esi # Function arg: const unsigned char *buf + + testl $3, %esi + jnz 25f +10: + movl %ecx, %edx + movl %ecx, %ebx + andl $0x7c, %ebx + shrl $7, %ecx + addl %ebx,%esi + shrl $2, %ebx + negl %ebx + lea 45f(%ebx,%ebx,2), %ebx + testl %esi, %esi + jmp *%ebx + + # Handle 2-byte-aligned regions +20: addw (%esi), %ax + lea 2(%esi), %esi + adcl $0, %eax + jmp 10b +25: + testl $1, %esi + jz 30f + # buf is odd + dec %ecx + jl 90f + movzbl (%esi), %ebx + addl %ebx, %eax + adcl $0, %eax + roll $8, %eax + inc %esi + testl $2, %esi + jz 10b + +30: subl $2, %ecx + ja 20b + je 32f + addl $2, %ecx + jz 80f + movzbl (%esi),%ebx # csumming 1 byte, 2-aligned + addl %ebx, %eax + adcl $0, %eax + jmp 80f +32: + addw (%esi), %ax # csumming 2 bytes, 2-aligned + adcl $0, %eax + jmp 80f + +40: + addl -128(%esi), %eax + adcl -124(%esi), %eax + adcl -120(%esi), %eax + adcl -116(%esi), %eax + adcl -112(%esi), %eax + adcl -108(%esi), %eax + adcl -104(%esi), %eax + adcl -100(%esi), %eax + adcl -96(%esi), %eax + adcl -92(%esi), %eax + adcl -88(%esi), %eax + adcl -84(%esi), %eax + adcl -80(%esi), %eax + adcl -76(%esi), %eax + adcl -72(%esi), %eax + adcl -68(%esi), %eax + adcl -64(%esi), %eax + adcl -60(%esi), %eax + adcl -56(%esi), %eax + adcl -52(%esi), %eax + adcl -48(%esi), %eax + adcl -44(%esi), %eax + adcl -40(%esi), %eax + adcl -36(%esi), %eax + adcl -32(%esi), %eax + adcl -28(%esi), %eax + adcl -24(%esi), %eax + adcl -20(%esi), %eax + adcl -16(%esi), %eax + adcl -12(%esi), %eax + adcl -8(%esi), %eax + adcl -4(%esi), %eax +45: + lea 128(%esi), %esi + adcl $0, %eax + dec %ecx + jge 40b + movl %edx, %ecx +50: andl $3, %ecx + jz 80f + + # Handle the last 1-3 bytes without jumping + notl %ecx # 1->2, 2->1, 3->0, higher bits are masked + movl $0xffffff,%ebx # by the shll and shrl instructions + shll $3,%ecx + shrl %cl,%ebx + andl -128(%esi),%ebx # esi is 4-aligned so should be ok + addl %ebx,%eax + adcl $0,%eax +80: + testl $1, 12(%esp) + jz 90f + roll $8, %eax +90: + popl %ebx + popl %esi + ret + +#endif + +/* +unsigned int csum_partial_copy_generic (const char *src, char *dst, + int len, int sum, int *src_err_ptr, int *dst_err_ptr) + */ + +/* + * Copy from ds while checksumming, otherwise like csum_partial + * + * The macros SRC and DST specify the type of access for the instruction. + * thus we can call a custom exception handler for all access types. 
+ * + * FIXME: could someone double-check whether I haven't mixed up some SRC and + * DST definitions? It's damn hard to trigger all cases. I hope I got + * them all but there's no guarantee. + */ + +#define SRC(y...) \ + 9999: y; \ + .section __ex_table, "a"; \ + .long 9999b, 6001f ; \ + .previous + +#define DST(y...) \ + 9999: y; \ + .section __ex_table, "a"; \ + .long 9999b, 6002f ; \ + .previous + +.align 4 +.globl csum_partial_copy_generic + +#ifndef CONFIG_X86_USE_PPRO_CHECKSUM + +#define ARGBASE 16 +#define FP 12 + +csum_partial_copy_generic: + subl $4,%esp + pushl %edi + pushl %esi + pushl %ebx + movl ARGBASE+16(%esp),%eax # sum + movl ARGBASE+12(%esp),%ecx # len + movl ARGBASE+4(%esp),%esi # src + movl ARGBASE+8(%esp),%edi # dst + + testl $2, %edi # Check alignment. + jz 2f # Jump if alignment is ok. + subl $2, %ecx # Alignment uses up two bytes. + jae 1f # Jump if we had at least two bytes. + addl $2, %ecx # ecx was < 2. Deal with it. + jmp 4f +SRC(1: movw (%esi), %bx ) + addl $2, %esi +DST( movw %bx, (%edi) ) + addl $2, %edi + addw %bx, %ax + adcl $0, %eax +2: + movl %ecx, FP(%esp) + shrl $5, %ecx + jz 2f + testl %esi, %esi +SRC(1: movl (%esi), %ebx ) +SRC( movl 4(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, (%edi) ) + adcl %edx, %eax +DST( movl %edx, 4(%edi) ) + +SRC( movl 8(%esi), %ebx ) +SRC( movl 12(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, 8(%edi) ) + adcl %edx, %eax +DST( movl %edx, 12(%edi) ) + +SRC( movl 16(%esi), %ebx ) +SRC( movl 20(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, 16(%edi) ) + adcl %edx, %eax +DST( movl %edx, 20(%edi) ) + +SRC( movl 24(%esi), %ebx ) +SRC( movl 28(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, 24(%edi) ) + adcl %edx, %eax +DST( movl %edx, 28(%edi) ) + + lea 32(%esi), %esi + lea 32(%edi), %edi + dec %ecx + jne 1b + adcl $0, %eax +2: movl FP(%esp), %edx + movl %edx, %ecx + andl $0x1c, %edx + je 4f + shrl $2, %edx # This clears CF +SRC(3: movl (%esi), %ebx ) + adcl %ebx, %eax +DST( movl %ebx, (%edi) ) + lea 4(%esi), %esi + lea 4(%edi), %edi + dec %edx + jne 3b + adcl $0, %eax +4: andl $3, %ecx + jz 7f + cmpl $2, %ecx + jb 5f +SRC( movw (%esi), %cx ) + leal 2(%esi), %esi +DST( movw %cx, (%edi) ) + leal 2(%edi), %edi + je 6f + shll $16,%ecx +SRC(5: movb (%esi), %cl ) +DST( movb %cl, (%edi) ) +6: addl %ecx, %eax + adcl $0, %eax +7: +5000: + +# Exception handler: +.section .fixup, "ax" + +6001: + movl ARGBASE+20(%esp), %ebx # src_err_ptr + movl $-EFAULT, (%ebx) + + # zero the complete destination - computing the rest + # is too much work + movl ARGBASE+8(%esp), %edi # dst + movl ARGBASE+12(%esp), %ecx # len + xorl %eax,%eax + rep ; stosb + + jmp 5000b + +6002: + movl ARGBASE+24(%esp), %ebx # dst_err_ptr + movl $-EFAULT,(%ebx) + jmp 5000b + +.previous + + popl %ebx + popl %esi + popl %edi + popl %ecx # equivalent to addl $4,%esp + ret + +#else + +/* Version for PentiumII/PPro */ + +#define ROUND1(x) \ + SRC(movl x(%esi), %ebx ) ; \ + addl %ebx, %eax ; \ + DST(movl %ebx, x(%edi) ) ; + +#define ROUND(x) \ + SRC(movl x(%esi), %ebx ) ; \ + adcl %ebx, %eax ; \ + DST(movl %ebx, x(%edi) ) ; + +#define ARGBASE 12 + +csum_partial_copy_generic: + pushl %ebx + pushl %edi + pushl %esi + movl ARGBASE+4(%esp),%esi #src + movl ARGBASE+8(%esp),%edi #dst + movl ARGBASE+12(%esp),%ecx #len + movl ARGBASE+16(%esp),%eax #sum +# movl %ecx, %edx + movl %ecx, %ebx + movl %esi, %edx + shrl $6, %ecx + andl $0x3c, %ebx + negl %ebx + subl %ebx, %esi + subl %ebx, %edi + lea -1(%esi),%edx + andl $-32,%edx + lea 3f(%ebx,%ebx), %ebx + testl %esi, %esi + jmp *%ebx 
+1: addl $64,%esi + addl $64,%edi + SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl) + ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52) + ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36) + ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20) + ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4) +3: adcl $0,%eax + addl $64, %edx + dec %ecx + jge 1b +4: movl ARGBASE+12(%esp),%edx #len + andl $3, %edx + jz 7f + cmpl $2, %edx + jb 5f +SRC( movw (%esi), %dx ) + leal 2(%esi), %esi +DST( movw %dx, (%edi) ) + leal 2(%edi), %edi + je 6f + shll $16,%edx +5: +SRC( movb (%esi), %dl ) +DST( movb %dl, (%edi) ) +6: addl %edx, %eax + adcl $0, %eax +7: +.section .fixup, "ax" +6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr + movl $-EFAULT, (%ebx) + # zero the complete destination (computing the rest is too much work) + movl ARGBASE+8(%esp),%edi # dst + movl ARGBASE+12(%esp),%ecx # len + xorl %eax,%eax + rep; stosb + jmp 7b +6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr + movl $-EFAULT, (%ebx) + jmp 7b +.previous + + popl %esi + popl %edi + popl %ebx + ret + +#undef ROUND +#undef ROUND1 + +#endif diff -Naurp linux-2.4.20-wolk4.0s/arch/i386/lib/csum.S linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/csum.S --- linux-2.4.20-wolk4.0s/arch/i386/lib/csum.S 2003-05-15 21:52:20.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/csum.S 1970-01-01 01:00:00.000000000 +0100 @@ -1,97 +0,0 @@ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * IP/TCP/UDP checksumming routines - * - * Authors: Jorge Cwik, - * Arnt Gulbrandsen, - * Tom May, - * Pentium Pro/II routines: - * Alexander Kjeldaas - * Finn Arne Gangstad - * Lots of code moved from tcp.c and ip.c; see those files - * for more names. - * - * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception - * handling. - * Andi Kleen, add zeroing on error converted to pure assembler - * 2002-10-30 Denis Vlasenko - * boot-time benchmarking, 3Dnow/MMX+/SSE versions - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -/* -** computes a partial checksum, e.g. for TCP/UDP fragments -** -** unsigned int csum_partial(const unsigned char * buff, -** int len, unsigned int sum) -*/ - -.text -.align 4 -.globl csum_partial - -csum_partial: - pushl %esi - pushl %ebx - movl 20(%esp), %eax # arg: sum - movl 16(%esp), %ecx # arg: len - movl 12(%esp), %esi # arg: buf - - testl $3, %esi - jz 40f -20: - # not 4-aligned: analyze and align... - testl $1, %esi - jz 30f - - # unaligned start addr - decl %ecx - js 90f # sz==0, exit - movzbl (%esi), %ebx # eat one byte... - addl %ebx, %eax - adcl $0, %eax - roll $8, %eax # NB: need to be undone at exit! 
- incl %esi - testl $2, %esi - jz 40f -30: - # This is 2-aligned, but not 4-aligned - cmpl $3, %ecx - jbe 60f - addw (%esi), %ax # eat 2 bytes - leal 2(%esi), %esi - adcl $0, %eax - subl $2, %ecx -40: - # esi is 4-aligned here, call block routine - movl $csum_basic, %ebx # known ok even for ecx==0 etc - cmpl $128, %ecx # use optimized routine - jb 50f # only for large blocks - movl best_csum, %ebx -50: call *%ebx -60: - # handle the last 0-3 bytes without much jumping - jecxz 80f - notl %ecx # 0->3, 1->2, 2->1, 3->0, higher bits are masked - movl $0xffffff, %ebx # by the shll and shrl instructions - shll $3, %ecx - shrl %cl, %ebx - andl (%esi), %ebx # esi is 4-aligned so should be ok - addl %ebx, %eax - adcl $0, %eax -80: - # undo csum rotation if start addr was odd - testl $1, 12(%esp) - jz 90f - roll $8, %eax -90: - popl %ebx - popl %esi - ret diff -Naurp linux-2.4.20-wolk4.0s/arch/i386/lib/csum_3dnow.S linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/csum_3dnow.S --- linux-2.4.20-wolk4.0s/arch/i386/lib/csum_3dnow.S 2003-05-15 21:52:20.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/csum_3dnow.S 1970-01-01 01:00:00.000000000 +0100 @@ -1,4 +0,0 @@ -#define PREFETCH(a) prefetch a -#define NAME csum_3dnow - -#include "csum_pf.inc" diff -Naurp linux-2.4.20-wolk4.0s/arch/i386/lib/csum_basic.S linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/csum_basic.S --- linux-2.4.20-wolk4.0s/arch/i386/lib/csum_basic.S 2003-05-15 21:52:20.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/csum_basic.S 1970-01-01 01:00:00.000000000 +0100 @@ -1,63 +0,0 @@ -.text -.align 4 -.globl csum_basic - -/* Experiments with Ethernet and SLIP connections show that buff -** is aligned on either a 2-byte or 4-byte boundary. We get at -** least a twofold speedup on 486 and Pentium if it is 4-byte aligned. -** Fortunately, it is easy to convert 2-byte alignment to 4-byte -** alignment for the unrolled loop. 
-*/ -csum_basic: - movl %ecx, %ebx - movl %ecx, %edx - shrl $7, %ecx - andl $0x7c, %ebx - addl %ebx, %esi - shrl $2, %ebx - negl %ebx - leal 50f(%ebx,%ebx,2), %ebx - clc - jmp *%ebx -40: - leal 128(%esi), %esi - adcl -128(%esi), %eax - adcl -124(%esi), %eax - adcl -120(%esi), %eax - adcl -116(%esi), %eax - adcl -112(%esi), %eax - adcl -108(%esi), %eax - adcl -104(%esi), %eax - adcl -100(%esi), %eax - adcl -96(%esi), %eax - adcl -92(%esi), %eax - adcl -88(%esi), %eax - adcl -84(%esi), %eax - adcl -80(%esi), %eax - adcl -76(%esi), %eax - adcl -72(%esi), %eax - adcl -68(%esi), %eax - adcl -64(%esi), %eax - adcl -60(%esi), %eax - adcl -56(%esi), %eax - adcl -52(%esi), %eax - adcl -48(%esi), %eax - adcl -44(%esi), %eax - adcl -40(%esi), %eax - adcl -36(%esi), %eax - adcl -32(%esi), %eax - adcl -28(%esi), %eax - adcl -24(%esi), %eax - adcl -20(%esi), %eax - adcl -16(%esi), %eax - adcl -12(%esi), %eax - adcl -8(%esi), %eax - adcl -4(%esi), %eax -50: - decl %ecx - jge 40b - - adcl $0, %eax - movl %edx, %ecx - andl $3, %ecx - ret diff -Naurp linux-2.4.20-wolk4.0s/arch/i386/lib/csum_naive.S linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/csum_naive.S --- linux-2.4.20-wolk4.0s/arch/i386/lib/csum_naive.S 2003-05-15 21:52:20.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/csum_naive.S 1970-01-01 01:00:00.000000000 +0100 @@ -1,17 +0,0 @@ -.text -.align 4 -.globl csum_naive - -csum_naive: - mov %ecx, %edx - shrl $2, %ecx - clc -1: - adcl (%esi), %eax - leal 4(%esi), %esi - loop 1b - - adcl $0, %eax - mov %edx, %ecx - andl $3, %ecx - ret diff -Naurp linux-2.4.20-wolk4.0s/arch/i386/lib/csum_pf.inc linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/csum_pf.inc --- linux-2.4.20-wolk4.0s/arch/i386/lib/csum_pf.inc 2003-05-15 21:52:20.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/csum_pf.inc 1970-01-01 01:00:00.000000000 +0100 @@ -1,95 +0,0 @@ -//#define PREFETCH(a) prefetchnta a -//#define PREFETCH(a) prefetch a -//#define PREFETCH(a) - -// How much unrolling do you want? -//vda: 5 is best on Duron 650 -#define ITER_BITS 5 // ...5,6,7 - ...32,64,128 bytes - // NB: tweak unrolled loop too... -/* -** computes a partial checksum, e.g. for TCP/UDP fragments -** int csum_partial(const char *buff, int len, int sum) -*/ - -#define ITER_SZ (1<=16 -10: - PREFETCH((%esi)) # Prefetch _each_ cacheline - PREFETCH(32(%esi)) # Note! Athlons have 64 bytes long ones, but - PREFETCH(64(%esi)) # PIIIs only 32! This gives ~20% speedup - PREFETCH(64+32(%esi)) # for PIII - PREFETCH(128(%esi)) - PREFETCH(128+32(%esi)) - PREFETCH(192(%esi)) - PREFETCH(192+32(%esi)) - movl %ecx, %ebx - movl %ecx, %edx - andl $ITER_MSK, %ebx # = bytes to handle in first (partial) iteration - shrl $ITER_BITS, %ecx # = iterations to make - addl %ebx, %esi # => 1st byte to handle in 2nd complete iteration - shrl $2, %ebx # = dwords to handle - negl %ebx - lea 50f(%ebx,%ebx,2), %ebx # = 45f - 3*dwords_to_handle - clc - jmp *%ebx # here we go! - -40: - PREFETCH(256(%esi)) -41: - lea ITER_SZ(%esi), %esi # does NOT change CF! 
-/* - addl -128(%esi), %eax - adcl -124(%esi), %eax - adcl -120(%esi), %eax - adcl -116(%esi), %eax - adcl -112(%esi), %eax - adcl -108(%esi), %eax - adcl -104(%esi), %eax - adcl -100(%esi), %eax - adcl -96(%esi), %eax - adcl -92(%esi), %eax - adcl -88(%esi), %eax - adcl -84(%esi), %eax - adcl -80(%esi), %eax - adcl -76(%esi), %eax - adcl -72(%esi), %eax - adcl -68(%esi), %eax - adcl -64(%esi), %eax - adcl -60(%esi), %eax - adcl -56(%esi), %eax - adcl -52(%esi), %eax - adcl -48(%esi), %eax - adcl -44(%esi), %eax - adcl -40(%esi), %eax - adcl -36(%esi), %eax -*/ - addl -32(%esi), %eax - adcl -28(%esi), %eax - adcl -24(%esi), %eax - adcl -20(%esi), %eax - adcl -16(%esi), %eax - adcl -12(%esi), %eax - adcl -8(%esi), %eax - adcl -4(%esi), %eax -50: - adcl $0, %eax - dec %ecx # does NOT change CF! - # We can do just "jge 40b" here, but we can be a bit clever... - # This little twist gives surprisingly noticeable benefits! - # Seen 11% increase on random 1K blocks on Duron 650 - js 60f - cmp $256/ITER_SZ, %ecx - jae 40b # need prefetch - jmp 41b # do not need it -60: - movl %edx, %ecx - andl $3, %ecx - ret diff -Naurp linux-2.4.20-wolk4.0s/arch/i386/lib/csum_ssemmxplus.S linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/csum_ssemmxplus.S --- linux-2.4.20-wolk4.0s/arch/i386/lib/csum_ssemmxplus.S 2003-05-15 21:52:20.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/csum_ssemmxplus.S 1970-01-01 01:00:00.000000000 +0100 @@ -1,4 +0,0 @@ -#define PREFETCH(a) prefetchnta a -#define NAME csum_ssemmxplus - -#include "csum_pf.inc" diff -Naurp linux-2.4.20-wolk4.0s/arch/i386/lib/csumcpy.S linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/csumcpy.S --- linux-2.4.20-wolk4.0s/arch/i386/lib/csumcpy.S 2003-05-15 21:52:20.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/csumcpy.S 1970-01-01 01:00:00.000000000 +0100 @@ -1,178 +0,0 @@ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * IP/TCP/UDP checksumming routines - * - * Authors: Jorge Cwik, - * Arnt Gulbrandsen, - * Tom May, - * Pentium Pro/II routines: - * Alexander Kjeldaas - * Finn Arne Gangstad - * Lots of code moved from tcp.c and ip.c; see those files - * for more names. - * - * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception - * handling. - * Andi Kleen, add zeroing on error converted to pure assembler - * 2002-10-30 Denis Vlasenko - * boot-time benchmarking, 3Dnow/MMX+/SSE versions - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include - -/* -** computes a partial checksum, e.g. for TCP/UDP fragments -** -** unsigned int csum_partial(const unsigned char * buff, -** int len, unsigned int sum) -*/ - -#ifdef __KERNEL__ -#define K(a...) a -#else -#define K(a...) -#endif - -#define SRC(y...) \ -9999: y ;\ - .section __ex_table, "a";\ - .long 9999b, 6001f ;\ - .previous - -#define DST(y...) 
\ -9999: y ;\ - .section __ex_table, "a";\ - .long 9999b, 6002f ;\ - .previous - -#define KERNEL_FPU_BEGIN \ - call kernel_fpu_begin - -#define KERNEL_FPU_END(r) \ -K( movl %cr0, r ;)\ -K( orl $8, r ;)\ -K( movl r, %cr0 ;) - -.text - -#include "csumcpy_naive.inc" -#include "csumcpy_basic.inc" -#include "csumcpy_ssemmxplus.inc" -#include "csumcpy_sse.inc" - -.align 4 -.globl csum_partial_copy_generic - -csum_partial_copy_generic: - pushl %ebx - pushl %edi - pushl %esi - pushl %ebp - movl %esp, %ebp - -#define STK_DERR 40(%ebp) -#define STK_SERR 36(%ebp) -#define STK_SUM 32(%ebp) -#define STK_LEN 28(%ebp) -#define STK_DST 24(%ebp) -#define STK_SRC 20(%ebp) -#define STK_EIP 16(%ebp) -#define STK_EBX 12(%ebp) -#define STK_EDI 8(%ebp) -#define STK_ESI 4(%ebp) -#define STK_EBP (%ebp) - - movl STK_SRC, %esi #src - movl STK_DST, %edi #dst - movl STK_LEN, %ecx #len - movl STK_SUM, %eax #sum - - testl $3, %edi # Check dst alignment - jz 40f - - # not 4-aligned: analyze and align... - testl $1, %edi - jz 30f - - # unaligned start addr - decl %ecx - js 90f # sz==0, exit - movzbl (%esi), %ebx # eat one byte... - movb %bl, (%edi) - addl %ebx, %eax - adcl $0, %eax - roll $8, %eax # NB: need to be undone at exit! - incl %esi - incl %edi - testl $2, %edi - jz 40f -30: - # This is 2-aligned, but not 4-aligned - cmpl $3, %ecx - jbe 60f - movw (%esi), %bx # eat 2 bytes - addw %bx, %ax - movw %bx, (%edi) - adcl $0, %eax - leal 2(%esi), %esi - leal 2(%edi), %edi - subl $2, %ecx -40: - # edi is 4-aligned now: call block routine - movl $csumcpy_basic, %ebx # 'default', known good for ecx==0 etc - cmpl $128, %ecx # use optimized routine - jb 50f # only for large blocks - movl best_csumcpy, %ebx -50: call *%ebx -60: - # handle last 0-3 bytes - jecxz 80f - cmpl $2, %ecx - jb 70f -SRC( movw (%esi), %cx ) - leal 2(%esi), %esi -DST( movw %cx, (%edi) ) - leal 2(%edi), %edi - je 75f - shll $16, %ecx -70: -SRC( movb (%esi), %cl ) -DST( movb %cl, (%edi) ) -75: addl %ecx, %eax - adcl $0, %eax -80: - # undo csum rotation if dst was unaligned - testl $1, STK_DST - jz 90f - roll $8, %eax -90: - movl %esp, %ebp - popl %ebp - popl %esi - popl %edi - popl %ebx - ret - - -.section .fixup, "ax" -6001: movl STK_SERR, %ebx # src_err_ptr - movl $-EFAULT, (%ebx) - # zero the complete destination (computing the rest is too much work) - movl STK_DST, %edi # dst - movl STK_LEN, %ecx # len - xorl %eax, %eax - cld - rep; stosb - jmp 90b -6002: movl STK_DERR, %ebx # dst_err_ptr - movl $-EFAULT, (%ebx) - jmp 90b -.previous diff -Naurp linux-2.4.20-wolk4.0s/arch/i386/lib/csumcpy_basic.inc linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/csumcpy_basic.inc --- linux-2.4.20-wolk4.0s/arch/i386/lib/csumcpy_basic.inc 2003-05-15 21:52:20.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/csumcpy_basic.inc 1970-01-01 01:00:00.000000000 +0100 @@ -1,40 +0,0 @@ -// Please somebody experiment with unroll length -// on a PII. Do _not_ optimize for PIII/Athlons/etc, -// they won't typically use this... 
- -.align 4 -.globl csumcpy_basic - -csumcpy_basic: - movl %ecx, %ebx - movl %ecx, %edx - shrl $6, %ecx - andl $0x3c, %ebx - negl %ebx - subl %ebx, %esi - subl %ebx, %edi - leal 50f(%ebx,%ebx), %ebx - clc - jmp *%ebx -40: - leal 64(%esi), %esi - leal 64(%edi), %edi - -#undef ROUND -#define ROUND(x) \ -SRC( movl x(%esi), %ebx ); \ - adcl %ebx, %eax ; \ -DST( movl %ebx, x(%edi) ); - - ROUND(-64) ROUND(-60) ROUND(-56) ROUND(-52) - ROUND(-48) ROUND(-44) ROUND(-40) ROUND(-36) - ROUND(-32) ROUND(-28) ROUND(-24) ROUND(-20) - ROUND(-16) ROUND(-12) ROUND(-8) ROUND(-4) -50: - decl %ecx - jge 40b - - adcl $0, %eax - movl %edx, %ecx - andl $3, %ecx - ret diff -Naurp linux-2.4.20-wolk4.0s/arch/i386/lib/csumcpy_naive.inc linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/csumcpy_naive.inc --- linux-2.4.20-wolk4.0s/arch/i386/lib/csumcpy_naive.inc 2003-05-15 21:52:20.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/csumcpy_naive.inc 1970-01-01 01:00:00.000000000 +0100 @@ -1,21 +0,0 @@ -// Heh... at least it's small ;) - -.align 4 -.globl csumcpy_naive - -csumcpy_naive: - mov %ecx, %edx - shrl $2, %ecx - clc -1: -SRC( movl (%esi), %ebx ) -DST( movl %ebx, (%edi) ) - adcl %ebx, %eax - leal 4(%esi), %esi - leal 4(%edi), %edi - loop 1b - - adcl $0, %eax - mov %edx, %ecx - and $3, %ecx - ret diff -Naurp linux-2.4.20-wolk4.0s/arch/i386/lib/csumcpy_sse.inc linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/csumcpy_sse.inc --- linux-2.4.20-wolk4.0s/arch/i386/lib/csumcpy_sse.inc 2003-05-15 21:52:20.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/csumcpy_sse.inc 1970-01-01 01:00:00.000000000 +0100 @@ -1,147 +0,0 @@ -// Huge routine, I don't like it's size and number -// of fixups... think of that when you want -// to unroll loop more -// TODO: benchmark and reduce size -// I won't stand 1K behemot just for 5% speedup - -#undef PREFETCH -#define PREFETCH(a) prefetchnta a - -// How much unrolling do you want? -// vda: celeron 1200: 5 with movaps, 4 with movups -#undef ITER_BITS -#define ITER_BITS 6 // ...4,5,6,7 - ...16,32,64,128 bytes - // NB: tweak unrolled loop too... 
- -#undef ITER_SZ -#undef ITER_MSK -#define ITER_SZ (1<6 don't mix - adcl $0, %eax - jmp 19f -15: # esi is 16-aligned - PREFETCH(256(%esi)) - ROUND0(a,%xmm0) - ROUND(a,16,%xmm1) - PREFETCH(256+32(%esi)) - ROUND(a,32,%xmm0) - ROUND(a,48,%xmm1) - lea ITER_SZ(%esi), %esi - lea ITER_SZ(%edi), %edi - //dec %ecx - //jnz 15b - loop 15b // Beware: loop and ITER_BITS>6 don't mix - adcl $0, %eax -19: - sfence # clean up XMM - //KERNEL_FPU_END(%ebx) - movups (%esp), %xmm0 - movups 16(%esp), %xmm1 - addl $32, %esp -K( movl %ebx, %cr0 ) - -20: - # loop for dwords - movl %edx, %ecx - andl $ITER_MSK, %edx - jz 40f - shrl $2, %edx # this also clears CF -30: -SRC( movl (%esi), %ebx ) - adcl %ebx, %eax -DST( movl %ebx, (%edi) ) - lea 4(%esi), %esi - lea 4(%edi), %edi - dec %edx - jnz 30b - adcl $0, %eax -40: - # last 1, 2 or 3 bytes: handled by caller - andl $3, %ecx - ret - - -# This is 16-align edi and get back -5500: cmp $ITER_SZ, %ecx # edi is 4-aligned here - mov %ecx, %edx # edx needed at 20: - jb 20b # not worthy: too short - -5520: test $0xe, %edi # loop until we are 16-aligned - jz 1b -SRC( movl (%esi), %ebx ) - addl $4, %esi -DST( movl %ebx, (%edi) ) - addl $4, %edi - addl %ebx, %eax - adcl $0, %eax - subl $4, %ecx - jmp 5520b diff -Naurp linux-2.4.20-wolk4.0s/arch/i386/lib/csumcpy_ssemmxplus.inc linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/csumcpy_ssemmxplus.inc --- linux-2.4.20-wolk4.0s/arch/i386/lib/csumcpy_ssemmxplus.inc 2003-05-15 21:52:20.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/arch/i386/lib/csumcpy_ssemmxplus.inc 1970-01-01 01:00:00.000000000 +0100 @@ -1,103 +0,0 @@ -#undef PREFETCH -#define PREFETCH(a) prefetchnta a - -// How much unrolling do you want? -#undef ITER_BITS -#define ITER_BITS 5 // ...5,6,7 - ...32,64,128 bytes - // NB: tweak unrolled loop too... - -#undef ITER_SZ -#undef ITER_MSK -#define ITER_SZ (1<5 don't mix - adcl $0, %eax - - sfence - //KERNEL_FPU_END(%ebx) - frstor (%esp) - addl $108, %esp -K( movl %ebx, %cr0 ) - -20: - # loop for dwords - movl %edx, %ecx - andl $ITER_MSK, %edx - jz 40f - shrl $2, %edx # this also clears CF -30: -SRC( movl (%esi), %ebx ) - adcl %ebx, %eax -DST( movl %ebx, (%edi) ) - lea 4(%esi), %esi - lea 4(%edi), %edi - dec %edx - jnz 30b - adcl $0, %eax - -40: andl $3, %ecx - ret diff -Naurp linux-2.4.20-wolk4.0s/arch/i386/vmlinux.lds.S linux-2.4.20-wolk4.1-fullkernel/arch/i386/vmlinux.lds.S --- linux-2.4.20-wolk4.0s/arch/i386/vmlinux.lds.S 2003-05-15 21:52:20.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/arch/i386/vmlinux.lds.S 2003-05-16 13:45:27.000000000 +0200 @@ -18,15 +18,6 @@ SECTIONS SHORT(__KERNEL_CS) } - . = ALIGN(16); /* Exception table */ - __start___ex_table = .; - __ex_table : { *(__ex_table) } - __stop___ex_table = .; - - __start___ksymtab = .; /* Kernel symbol table */ - __ksymtab : { *(__ksymtab) } - __stop___ksymtab = .; - __start___kallsyms = .; /* All kernel symbols */ __kallsyms : { *(__kallsyms) } __stop___kallsyms = .; @@ -39,13 +30,13 @@ SECTIONS . = ALIGN(32); .data.cacheline_aligned : { *(.data.cacheline_aligned) } + . = ALIGN(8192); .data.init_task : { - . = ALIGN(8192); *(.data.init_task) } + . = ALIGN(4096); .data.page_aligned : { - . = ALIGN(4096); *(.data.swapper_pg_dir) *(.data.pg0) *(.data.pg1) @@ -84,23 +75,18 @@ SECTIONS . = ALIGN(4*1024*1024) - 1; BYTE(0) } - . += __KERNEL_TEXT_OFFSET; -#else - .text.init : { *(.text.init) } - . = ALIGN(4096); -#endif - - __init_end = .; - + __init_end = . 
+ __KERNEL_TEXT_OFFSET; -#ifdef CONFIG_GRKERNSEC_PAX_KERNEXEC /* * PaX: this must be kept in synch with the KERNEL_CS base * in the GDTs in arch/i386/kernel/head.S */ - _text = . - __KERNEL_TEXT_OFFSET; /* Text and read-only data */ - .text (. - __KERNEL_TEXT_OFFSET) : AT (_text + __KERNEL_TEXT_OFFSET) { + _text = .; /* Text and read-only data */ + .text : AT (. + __KERNEL_TEXT_OFFSET) { #else + .text.init : { *(.text.init) } + . = ALIGN(4096); + __init_end = .; _text = .; /* Text and read-only data */ .text : { #endif @@ -117,13 +103,23 @@ SECTIONS . += __KERNEL_TEXT_OFFSET; #endif + .rodata.page_aligned : { *(.data.idt) } .rodata : { *(.rodata) *(.rodata.*) } - .rodata.page_aligned : { - . = ALIGN(4096); - *(.data.idt) - } .kstrtab : { *(.kstrtab) } + . = ALIGN(16); /* Exception table */ + __start___ex_table = .; + __ex_table : { *(__ex_table) } + __stop___ex_table = .; + + __start___ksymtab = .; /* Kernel symbol table */ + __ksymtab : { *(__ksymtab) } + __stop___ksymtab = .; + + __start___kallsyms = .; /* All kernel symbols */ + __kallsyms : { *(__kallsyms) } + __stop___kallsyms = .; + #ifdef CONFIG_GRKERNSEC_PAX_KERNEXEC _end = ALIGN(4*1024*1024); #else diff -Naurp linux-2.4.20-wolk4.0s/drivers/block/cloop.c linux-2.4.20-wolk4.1-fullkernel/drivers/block/cloop.c --- linux-2.4.20-wolk4.0s/drivers/block/cloop.c 2003-05-15 21:52:23.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/drivers/block/cloop.c 2003-05-16 13:54:58.000000000 +0200 @@ -73,6 +73,7 @@ #include #include +#include #include #include #include @@ -130,6 +131,7 @@ MODULE_LICENSE("GPL"); static char *file=NULL; MODULE_PARM(file, "s"); +#define HAS_BUILTIN_ZLIB CONFIG_ZLIB_INFLATE || CONFIG_ZLIB_INFLATE_MODULE struct cloop_device { /* Copied straight from the file */ @@ -144,7 +146,7 @@ struct cloop_device void *buffer; void *compressed_buffer; -#ifndef CONFIG_ZLIB_INFLATE +#ifndef HAS_BUILTIN_ZLIB /* Preallocated scratch area for zlib, saves a lot of kmalloc()s */ void *zlib_used; void *zlib_scratch; @@ -170,7 +172,7 @@ static const int max_cloop = 1; static devfs_handle_t devfs_handle; /* For the directory */ #endif -#ifndef CONFIG_ZLIB_INFLATE +#ifndef HAS_BUILTIN_ZLIB /* Use zlib uncompress */ extern int uncompress(char *dest, unsigned long *destLen, const char *source, unsigned long sourceLen); @@ -180,8 +182,8 @@ void free(void *p) /* Memory pointer is reset after uncompress(); */ } -void *calloc(size_t nmemb, size_t size) -{ +void *calloc(size_t nmemb, size_t size) { + /* Rusty was right, preallocating gives better performance. */ /* return(kmalloc(nmemb*size, GFP_KERNEL)); */ void *ret = cloop_dev.zlib_used; @@ -194,6 +196,7 @@ void *calloc(size_t nmemb, size_t size) } return ret; } + #else /* Use zlib_inflate from lib/zlib_inflate */ #include @@ -409,7 +412,7 @@ static int load_buffer(int blocknum) /* Do decompression into real buffer. 
*/ buflen = ntohl(cloop_dev.head.block_size); -#ifndef CONFIG_ZLIB_INFLATE +#ifndef HAS_BUILTIN_ZLIB /* Reset zlib usage pool */ cloop_dev.zlib_used = cloop_dev.zlib_scratch; #endif @@ -736,7 +739,7 @@ static int init_loopback(void) cloop_name, largest_block); goto error_release_free_buffer; } -#ifndef CONFIG_ZLIB_INFLATE +#ifndef HAS_BUILTIN_ZLIB /* largest_block / 10 + 48000 seems to be sufficient for zlib working area */ cloop_dev.zlib_size=largest_block/10+ZLIB_NEEDS; cloop_dev.zlib_scratch = kmalloc(cloop_dev.zlib_size, GFP_KERNEL); @@ -764,7 +767,7 @@ static int init_loopback(void) cloop_name, ntohl(cloop_dev.offsets[ntohl(cloop_dev.head.num_blocks)]), (unsigned long)inode->i_size); -#ifndef CONFIG_ZLIB_INFLATE +#ifndef HAS_BUILTIN_ZLIB kfree(cloop_dev.zlib_scratch); #else vfree(zstream.workspace); zstream.workspace=NULL; @@ -908,7 +911,7 @@ void cleanup_module(void) kfree(cloop_dev.offsets); kfree(cloop_dev.buffer); kfree(cloop_dev.compressed_buffer); -#ifndef CONFIG_ZLIB_INFLATE +#ifndef HAS_BUILTIN_ZLIB kfree(cloop_dev.zlib_scratch); #ifdef DEBUGMEM printk("%s: Maximum zlib_scratch usage was %lu bytes.\n", diff -Naurp linux-2.4.20-wolk4.0s/drivers/block/elevator.c linux-2.4.20-wolk4.1-fullkernel/drivers/block/elevator.c --- linux-2.4.20-wolk4.0s/drivers/block/elevator.c 2003-05-15 21:52:23.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/drivers/block/elevator.c 2003-05-16 14:38:42.000000000 +0200 @@ -87,8 +87,8 @@ static int rq_mergeable(struct request * return 0; if (req->nr_sectors + count > max_sectors) return 0; - if (bh_elv_seq(bh) != bh_elv_seq(req->bh)) - return 0; +// if (bh_elv_seq(bh) != bh_elv_seq(req->bh)) +// return 0; return 1; } @@ -98,16 +98,15 @@ int elevator_linus_merge(request_queue_t struct buffer_head *bh, int rw, int max_sectors) { - struct list_head *entry, *real_head; - unsigned int count = bh->b_size >> 9, ret = ELEVATOR_NO_MERGE; + struct list_head *entry; + unsigned int count = bh->b_size >> 9; + unsigned int ret = ELEVATOR_NO_MERGE; + const int max_bomb_segments = q->elevator.max_bomb_segments; struct request *__rq; int backmerge_only = 0; + int passed_a_read = 0; - if (!bh_elv_seq(bh)) - entry = &q->queue_head; - else - entry = &q->atomic_head; - real_head = entry; + entry = &q->queue_head; /* * check last merge hint @@ -131,32 +130,27 @@ int elevator_linus_merge(request_queue_t while (!backmerge_only && (entry = entry->prev) != head) { __rq = blkdev_entry_to_request(entry); - /* - * we can't insert beyond a zero sequence point - */ - if (__rq->elevator_sequence <= 0 && !bh_elv_seq(bh)) + if (__rq->elevator_sequence-- <= 0) { /* * OK, we've exceeded someone's latency limit. * But we still continue to look for merges, * because they're so much better than seeks. */ backmerge_only = 1; + } if (__rq->waiting) continue; if (__rq->rq_dev != bh->b_rdev) continue; - if (!*req && bh_rq_in_between(bh, __rq, real_head) && !backmerge_only) + if (!*req && bh_rq_in_between(bh, __rq, &q->queue_head) && !backmerge_only) *req = __rq; + if (__rq->cmd != WRITE) + passed_a_read = 1; if (__rq->cmd != rw) continue; if (__rq->nr_sectors + count > max_sectors) continue; - /* - * possibly move this inside the merge path and make it a break - */ - if (bh_elv_seq(bh) != bh_elv_seq(__rq->bh)) - continue; if (__rq->sector + __rq->nr_sectors == bh->b_rsector) { /* * Really here we could re-increase the elevator_latency of __rq, @@ -184,12 +178,63 @@ out: int scan_cost = ret ? 
1 : ELV_LINUS_SEEK_COST; struct list_head *entry = &(*req)->queue; - while ((entry = entry->next) != real_head) { + while ((entry = entry->next) != &q->queue_head) { __rq = blkdev_entry_to_request(entry); __rq->elevator_sequence -= scan_cost; } } + /* + * If we failed to merge a read anywhere in the request + * queue, we really don't want to place it at the end + * of the list, behind lots of writes. So place it near + * the front. + * + * We don't want to place it in front of _all_ writes: that + * would create lots of seeking, and isn't tunable. + * We try to avoid promoting this read in front of existing + * reads. + * + * max_bomb_segments becomes the maximum number of write + * requests which we allow to remain in place in front of + * a newly introduced read. We weight things a little bit, + * so large writes are more expensive than small ones, but it's + * requests which count, not sectors. + */ + if (max_bomb_segments && rw == READ && !passed_a_read && + ret == ELEVATOR_NO_MERGE) { + int cur_latency = 0; + struct request * const cur_request = *req; + + entry = head->next; + while (entry != &q->queue_head) { + struct request *__rq; + + if (entry == &q->queue_head) + BUG(); + if (entry == q->queue_head.next && + q->head_active && !q->plugged) + BUG(); + __rq = blkdev_entry_to_request(entry); + + if (__rq == cur_request) { + /* + * This is where the old algorithm placed it. + * There's no point pushing it further back, + * so leave it here, in sorted order. + */ + break; + } + if (__rq->cmd == WRITE) { + cur_latency += 1 + __rq->nr_sectors / 64; + if (cur_latency >= max_bomb_segments) { + *req = __rq; + break; + } + } + entry = entry->next; + } + } return ret; } @@ -209,18 +254,13 @@ int elevator_noop_merge(request_queue_t struct buffer_head *bh, int rw, int max_sectors) { - struct list_head *entry, *real_head; + struct list_head *entry; unsigned int count = bh->b_size >> 9; - if (!bh_elv_seq(bh)) - entry = &q->queue_head; - else - entry = &q->atomic_head; - real_head = entry; - - if (list_empty(real_head)) + if (list_empty(&q->queue_head)) return ELEVATOR_NO_MERGE; + entry = &q->queue_head; while ((entry = entry->prev) != head) { struct request *__rq = blkdev_entry_to_request(entry); @@ -232,11 +272,6 @@ int elevator_noop_merge(request_queue_t continue; if (__rq->waiting) continue; - /* - * possibly move this inside the merge path and make it a break - */ - if (bh_elv_seq(bh) != bh_elv_seq(__rq->bh)) - continue; if (__rq->sector + __rq->nr_sectors == bh->b_rsector) { *req = __rq; return ELEVATOR_BACK_MERGE; @@ -246,7 +281,7 @@ int elevator_noop_merge(request_queue_t } } - *req = blkdev_entry_to_request(real_head->prev); + *req = blkdev_entry_to_request(q->queue_head.prev); return ELEVATOR_NO_MERGE; } @@ -259,7 +294,7 @@ int blkelvget_ioctl(elevator_t * elevato output.queue_ID = elevator->queue_ID; output.read_latency = elevator->read_latency; output.write_latency = elevator->write_latency; - output.max_bomb_segments = 0; + output.max_bomb_segments = elevator->max_bomb_segments; if (copy_to_user(arg, &output, sizeof(blkelv_ioctl_arg_t))) return -EFAULT; @@ -278,9 +313,12 @@ int blkelvset_ioctl(elevator_t * elevato return -EINVAL; if (input.write_latency < 0) return -EINVAL; + if (input.max_bomb_segments < 0) + return -EINVAL; elevator->read_latency = input.read_latency; elevator->write_latency = input.write_latency; + elevator->max_bomb_segments = input.max_bomb_segments; return 0; } diff -Naurp linux-2.4.20-wolk4.0s/drivers/block/ll_rw_blk.c 
linux-2.4.20-wolk4.1-fullkernel/drivers/block/ll_rw_blk.c --- linux-2.4.20-wolk4.0s/drivers/block/ll_rw_blk.c 2003-05-15 21:52:23.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/drivers/block/ll_rw_blk.c 2003-05-17 23:30:17.000000000 +0200 @@ -51,8 +51,6 @@ static kmem_cache_t *request_cachep; */ DECLARE_TASK_QUEUE(tq_disk); -LIST_HEAD(blk_atomic_head); - /* * Protect the request list against multiple users.. * @@ -127,63 +125,9 @@ int * max_sectors[MAX_BLKDEV]; */ char * blkdev_varyio[MAX_BLKDEV]; -/* - * only allow merging of buffer_heads with identical sequence, for transparent - * support for writing atomic blocks larger than what a single bh can hold - */ -static unsigned int blk_atomic_seq; -static spinlock_cacheline_t blk_atomic_lock_cacheline = {SPIN_LOCK_UNLOCKED}; -static spinlock_cacheline_t blk_atomic_queue_lock_cacheline = {SPIN_LOCK_UNLOCKED}; - -#ifdef CONFIG_SMP -struct blk_atomic_cpu { - unsigned int seq; - unsigned int left; -} ____cacheline_aligned_in_smp; - -struct blk_atomic_cpu __cacheline_aligned_in_smp blk_atomic_cpu[NR_CPUS]; - -#define BLK_ATOMIC_SEQ_GRAB 1024 -#endif - unsigned long blk_max_low_pfn, blk_max_pfn; int blk_nohighio = 0; -unsigned int blk_get_atomic_seq(void) -{ - unsigned int ret; - -#ifdef CONFIG_SMP - { - struct blk_atomic_cpu *bcpu = &blk_atomic_cpu[smp_processor_id()]; - -restart: - if (unlikely(!bcpu->left)) { - spin_lock_irq(&blk_atomic_lock); - bcpu->seq = blk_atomic_seq; - blk_atomic_seq += BLK_ATOMIC_SEQ_GRAB; - spin_unlock_irq(&blk_atomic_lock); - bcpu->left = BLK_ATOMIC_SEQ_GRAB; - } - bcpu->seq++; - bcpu->left--; - if (unlikely(!bcpu->seq)) - goto restart; - - ret = bcpu->seq; - } -#else - spin_lock_irq(&blk_atomic_lock); - ret = ++blk_atomic_seq; - if (unlikely(!ret)) { - ret = 1; - ++blk_atomic_seq; - } - spin_unlock_irq(&blk_atomic_lock); -#endif - return ret; -} - static inline int get_max_sectors(kdev_t dev) { if (!max_sectors[MAJOR(dev)]) @@ -446,91 +390,6 @@ void generic_unplug_device(void *data) spin_unlock_irqrestore(q->queue_lock, flags); } -static void blk_atomic_add(request_queue_t *q) -{ - spin_lock_irq(&blk_atomic_queue_lock); - /* it's empty only when it's out of the blk_atomic_head queue */ - if (list_empty(&q->atomic_entry)) - list_add_tail(&q->atomic_entry, &blk_atomic_head); - spin_unlock_irq(&blk_atomic_queue_lock); -} - -static struct list_head *blk_find_insert_point(request_queue_t *q, - struct request *rq) -{ - struct list_head *head = &q->queue_head, *insert = q->queue_head.prev; - struct buffer_head *bh; - int elv_seq; - struct request *dummy; - - if (list_empty(head)) - goto done; - else if (q->head_active && !q->plugged) - head = head->next; - - dummy = NULL; - bh = rq->bh; - - elv_seq = bh_elv_seq(bh); - bh_elv_seq(bh) = 0; - - q->elevator.elevator_merge_fn(q, &dummy, head, bh, - -1 /* non cmd -> no merge */, - 0 /* too small max_sectors -> no merge */); - - bh_elv_seq(bh) = elv_seq; - - if (dummy) - insert = &dummy->queue; - -done: - return insert; -} - -void blk_refile_atomic_queue(int sequence) -{ - request_queue_t *q; - struct request * rq; - unsigned long flags; - struct list_head * q_entry, * rq_entry; - int __sequence; - - spin_lock_irqsave(&blk_atomic_queue_lock, flags); - - q_entry = blk_atomic_head.next; - while (q_entry != &blk_atomic_head) { - q = list_entry(q_entry, request_queue_t, atomic_entry); - q_entry = q_entry->next; - - spin_lock(q->queue_lock); - rq_entry = q->atomic_head.next; - while (rq_entry != &q->atomic_head) { - rq = list_entry(rq_entry, struct request, queue); - rq_entry = 
rq_entry->next; - - BUG_ON(!rq->q); - BUG_ON(!rq->bh); - __sequence = bh_elv_seq(rq->bh); - BUG_ON(!__sequence); - if (__sequence == sequence) { - struct list_head *ipoint; - - list_del(&rq->queue); - if (list_empty(&q->queue_head)) - q->plug_device_fn(q, rq->bh->b_rdev); - - ipoint = blk_find_insert_point(q, rq); - list_add(&rq->queue, ipoint); - } - } - if (list_empty(&q->atomic_head)) - list_del_init(&q->atomic_entry); - spin_unlock(q->queue_lock); - } - - spin_unlock_irqrestore(&blk_atomic_queue_lock, flags); -} - /** blk_grow_request_list * @q: The &request_queue_t * @nr_requests: how many requests are desired @@ -589,11 +448,13 @@ static void blk_init_free_list(request_q si_meminfo(&si); megs = si.totalram >> (20 - PAGE_SHIFT); #ifndef CONFIG_BLK_DEV_ELEVATOR_LOWLAT - nr_requests = 128; - if (megs < 32) - nr_requests /= 2; + nr_requests = (megs * 2) & ~15; /* One per half-megabyte */ + if (nr_requests < 32) + nr_requests = 32; + if (nr_requests > 1024) + nr_requests = 1024; #else - nr_requests = 4; + nr_requests = 32; #endif blk_grow_request_list(q, nr_requests); @@ -639,8 +500,6 @@ static int __make_request(request_queue_ void blk_init_queue(request_queue_t * q, request_fn_proc * rfn) { INIT_LIST_HEAD(&q->queue_head); - INIT_LIST_HEAD(&q->atomic_head); - INIT_LIST_HEAD(&q->atomic_entry); elevator_init(&q->elevator, ELEVATOR_LINUS); q->queue_lock = &io_request_lock; blk_init_free_list(q); @@ -982,6 +841,11 @@ static inline void add_request(request_q { drive_stat_acct(req->rq_dev, req->cmd, req->nr_sectors, 1); + if (!q->plugged && q->head_active && insert_here == &q->queue_head) { + spin_unlock_irq(q->queue_lock); + BUG(); + } + /* * elevator indicated where it wants this request to be * inserted at elevator_merge time @@ -1035,8 +899,6 @@ static void attempt_merge(request_queue_ || req->nr_sectors + next->nr_sectors > max_sectors || next->waiting) return; - if (bh_elv_seq(req->bh) != bh_elv_seq(next->bh)) - return; /* * If we are not allowed to merge these requests, then * return. If we are allowed to merge, then the count @@ -1060,12 +922,11 @@ static void attempt_merge(request_queue_ } static inline void attempt_back_merge(request_queue_t * q, - struct list_head * head, struct request *req, int max_sectors, int max_segments) { - if (&req->queue == head->prev) + if (&req->queue == q->queue_head.prev) return; attempt_merge(q, req, max_sectors, max_segments); } @@ -1091,10 +952,9 @@ static int __make_request(request_queue_ int max_segments = MAX_SEGMENTS; struct request * req, *freereq = NULL; int rw_ahead, max_sectors, el_ret; - struct list_head *head, *real_head, *insert_here; + struct list_head *head, *insert_here; int latency; elevator_t *elevator = &q->elevator; - int atomic = bh_elv_seq(bh), atomic_add = 0; count = bh->b_size >> 9; sector = bh->b_rsector; @@ -1136,7 +996,7 @@ static int __make_request(request_queue_ max_sectors = get_max_sectors(bh->b_rdev); req = NULL; - real_head = head = !atomic ? 
&q->queue_head : &q->atomic_head; + head = &q->queue_head; /* * Now we acquire the request spinlock, we have to be mega careful * not to schedule or do something nonatomic @@ -1145,14 +1005,11 @@ static int __make_request(request_queue_ again: insert_here = head->prev; - if (!atomic) { - if (list_empty(head)) { - q->plug_device_fn(q, bh->b_rdev); /* is atomic */ - goto get_rq; - } else if (q->head_active && !q->plugged) - head = head->next; - } else if (list_empty(head)) + if (list_empty(head)) { + q->plug_device_fn(q, bh->b_rdev); /* is atomic */ goto get_rq; + } else if (q->head_active && !q->plugged) + head = head->next; el_ret = elevator->elevator_merge_fn(q, &req, head, bh, rw,max_sectors); switch (el_ret) { @@ -1168,7 +1025,7 @@ again: blk_started_io(count); drive_stat_acct(req->rq_dev, req->cmd, count, 0); req_new_io(req, 1, count); - attempt_back_merge(q, real_head, req, max_sectors, max_segments); + attempt_back_merge(q, req, max_sectors, max_segments); goto out; case ELEVATOR_FRONT_MERGE: @@ -1231,10 +1088,9 @@ get_rq: req = get_request(q, rw); if (req == NULL) { spin_unlock_irq(q->queue_lock); - if (atomic) - blk_refile_atomic_queue(atomic); freereq = __get_request_wait(q, rw); - head = real_head; + req = NULL; + head = &q->queue_head; spin_lock_irq(q->queue_lock); get_request_wait_wakeup(q, rw); goto again; @@ -1260,13 +1116,10 @@ get_rq: req_new_io(req, 0, count); blk_started_io(count); add_request(q, req, insert_here); - atomic_add = atomic; out: if (freereq) blkdev_release_request(freereq); spin_unlock_irq(q->queue_lock); - if (atomic_add) - blk_atomic_add(q); return 0; end_io: bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state)); @@ -1315,8 +1168,6 @@ void generic_make_request (int rw, struc if (!bh->b_end_io) BUG(); - if (!buffer_atomic(bh)) - bh->b_elv_sequence = 0; /* Test device size, when known. 
*/ if (blk_size[major]) @@ -1608,10 +1459,6 @@ int __init blk_dev_init(void) memset(max_readahead, 0, sizeof(max_readahead)); memset(max_sectors, 0, sizeof(max_sectors)); -#ifdef CONFIG_SMP - memset(blk_atomic_cpu, 0, sizeof(blk_atomic_cpu)); -#endif - blk_max_low_pfn = max_low_pfn - 1; blk_max_pfn = max_pfn - 1; @@ -1737,5 +1584,3 @@ EXPORT_SYMBOL(blk_max_low_pfn); EXPORT_SYMBOL(blk_max_pfn); EXPORT_SYMBOL(blk_seg_merge_ok); EXPORT_SYMBOL(blk_nohighio); -EXPORT_SYMBOL(blk_get_atomic_seq); -EXPORT_SYMBOL(blk_refile_atomic_queue); diff -Naurp linux-2.4.20-wolk4.0s/drivers/char/wdt977.c linux-2.4.20-wolk4.1-fullkernel/drivers/char/wdt977.c --- linux-2.4.20-wolk4.0s/drivers/char/wdt977.c 2002-12-18 01:03:53.000000000 +0100 +++ linux-2.4.20-wolk4.1-fullkernel/drivers/char/wdt977.c 2003-05-16 14:00:41.000000000 +0200 @@ -27,6 +27,7 @@ #include #include #include +#include #define WATCHDOG_MINOR 130 diff -Naurp linux-2.4.20-wolk4.0s/drivers/md/raid1.c linux-2.4.20-wolk4.1-fullkernel/drivers/md/raid1.c --- linux-2.4.20-wolk4.0s/drivers/md/raid1.c 2003-05-15 21:52:27.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/drivers/md/raid1.c 2002-12-18 01:03:54.000000000 +0100 @@ -686,7 +686,6 @@ static int raid1_make_request (mddev_t * mbh->b_list = BUF_LOCKED; mbh->b_end_io = raid1_end_request; mbh->b_private = r1_bh; - mbh->b_elv_sequence = bh->b_elv_sequence; mbh->b_next = r1_bh->mirror_bh_list; r1_bh->mirror_bh_list = mbh; @@ -1457,7 +1456,6 @@ static int raid1_sync_request (mddev_t * bh->b_private = r1_bh; bh->b_blocknr = sector_nr; bh->b_rsector = sector_nr; - bh->b_elv_sequence = 0; init_waitqueue_head(&bh->b_wait); generic_make_request(READ, bh); diff -Naurp linux-2.4.20-wolk4.0s/drivers/md/raid5.c linux-2.4.20-wolk4.1-fullkernel/drivers/md/raid5.c --- linux-2.4.20-wolk4.0s/drivers/md/raid5.c 2003-05-15 21:52:27.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/drivers/md/raid5.c 2003-05-16 13:56:12.000000000 +0200 @@ -151,7 +151,7 @@ static void shrink_buffers(struct stripe return; sh->bh_cache[i] = NULL; free_page((unsigned long) bh->b_data); - kmem_cache_free(bh_cachep, bh); + kfree(bh); } } @@ -162,7 +162,7 @@ static int grow_buffers(struct stripe_he for (i=0; ib_data = page_address(page); else { - kmem_cache_free(bh_cachep, bh); + kfree(bh); return 1; } atomic_set(&bh->b_count, 0); @@ -474,7 +474,6 @@ static struct buffer_head *raid5_build_b bh->b_state = (1 << BH_Req) | (1 << BH_Mapped); bh->b_size = sh->size; bh->b_list = BUF_LOCKED; - bh->b_elv_sequence = 0; return bh; } @@ -942,6 +941,7 @@ static void handle_stripe(struct stripe_ /* if already written requests can't be returned as successful fail them */ if (failed > 1 && written) { + printk(KERN_CRIT "DEBUG: RAID5: already written requests can't be returned as successfull so fail them!\n"); for (i=disks; i--; ) { if (sh->bh_written[i]) written--; while ((bh = sh->bh_written[i])) { diff -Naurp linux-2.4.20-wolk4.0s/drivers/scsi/aic7xxx/aic79xx_osm.c linux-2.4.20-wolk4.1-fullkernel/drivers/scsi/aic7xxx/aic79xx_osm.c --- linux-2.4.20-wolk4.0s/drivers/scsi/aic7xxx/aic79xx_osm.c 2003-05-15 21:52:32.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/drivers/scsi/aic7xxx/aic79xx_osm.c 2003-05-17 12:27:20.000000000 +0200 @@ -761,8 +761,9 @@ ahd_linux_map_seg(struct ahd_softc *ahd, * Due to DAC restrictions, we can't * cross a 4GB boundary. 
*/ - if ((addr ^ (addr + len - 1)) & ~0xFFFFFFFF) { + if ((addr ^ (addr + len - 1)) & 0xFFFFFFFF00000000ULL) { struct ahd_dma_seg *next_sg; + uint32_t first_len; uint32_t next_len; printf("Crossed Seg\n"); @@ -773,12 +774,14 @@ ahd_linux_map_seg(struct ahd_softc *ahd, consumed++; next_sg = sg + 1; next_sg->addr = 0; - next_len = 0x100000000 - (addr & 0xFFFFFFFF); - len -= next_len; - next_len |= ((addr >> 8) + 0x1000000) & 0x7F000000; + first_len = 0x100000000ULL - (addr & 0xFFFFFFFF); + next_len = len - first_len; + len = next_len; + next_len |= + ((addr >> 8) + 0x1000000) & AHD_SG_HIGH_ADDR_MASK; next_sg->len = ahd_htole32(next_len); } - len |= (addr >> 8) & 0x7F000000; + len |= (addr >> 8) & AHD_SG_HIGH_ADDR_MASK; } sg->len = ahd_htole32(len); return (consumed); diff -Naurp linux-2.4.20-wolk4.0s/drivers/scsi/aic7xxx/aic7xxx_osm.c linux-2.4.20-wolk4.1-fullkernel/drivers/scsi/aic7xxx/aic7xxx_osm.c --- linux-2.4.20-wolk4.0s/drivers/scsi/aic7xxx/aic7xxx_osm.c 2003-05-15 21:52:32.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/drivers/scsi/aic7xxx/aic7xxx_osm.c 2003-05-17 12:27:20.000000000 +0200 @@ -751,12 +751,14 @@ ahc_linux_map_seg(struct ahc_softc *ahc, scb->platform_data->xfer_len += len; if (sizeof(bus_addr_t) > 4 && (ahc->flags & AHC_39BIT_ADDRESSING) != 0) { + /* * Due to DAC restrictions, we can't * cross a 4GB boundary. */ - if ((addr ^ (addr + len - 1)) & ~0xFFFFFFFF) { + if ((addr ^ (addr + len - 1)) & 0xFFFFFFFF00000000ULL) { struct ahc_dma_seg *next_sg; + uint32_t first_len; uint32_t next_len; printf("Crossed Seg\n"); @@ -767,12 +769,14 @@ ahc_linux_map_seg(struct ahc_softc *ahc, consumed++; next_sg = sg + 1; next_sg->addr = 0; - next_len = 0x100000000 - (addr & 0xFFFFFFFF); - len -= next_len; - next_len |= ((addr >> 8) + 0x1000000) & 0x7F000000; + first_len = 0x100000000ULL - (addr & 0xFFFFFFFF); + next_len = len - first_len; + len = first_len; + next_len |= + ((addr >> 8) + 0x1000000) & AHC_SG_HIGH_ADDR_MASK; next_sg->len = ahc_htole32(next_len); } - len |= (addr >> 8) & 0x7F000000; + len |= (addr >> 8) & AHC_SG_HIGH_ADDR_MASK; } sg->len = ahc_htole32(len); return (consumed); diff -Naurp linux-2.4.20-wolk4.0s/drivers/video/vesafb.c linux-2.4.20-wolk4.1-fullkernel/drivers/video/vesafb.c --- linux-2.4.20-wolk4.0s/drivers/video/vesafb.c 2003-05-15 21:52:38.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/drivers/video/vesafb.c 2003-05-16 14:00:42.000000000 +0200 @@ -637,7 +637,7 @@ int __init vesafb_init(void) video_width = screen_info.lfb_width; video_height = screen_info.lfb_height; video_linelength = screen_info.lfb_linelength; - video_size = screen_info.lfb_size * 65536; + video_size = screen_info.lfb_width * screen_info.lfb_height * video_bpp / 8; video_visual = (video_bpp == 8) ? 
FB_VISUAL_PSEUDOCOLOR : FB_VISUAL_TRUECOLOR; diff -Naurp linux-2.4.20-wolk4.0s/fs/Config.in linux-2.4.20-wolk4.1-fullkernel/fs/Config.in --- linux-2.4.20-wolk4.0s/fs/Config.in 2003-05-15 21:52:38.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/fs/Config.in 2003-05-19 12:51:33.000000000 +0200 @@ -98,7 +98,7 @@ mainmenu_option next_comment comment 'VFS settings' int 'Soft limit of filedescriptors' CONFIG_FILE_SOFT 8192 int 'Hard limit of filedescriptors' CONFIG_FILE_HARD 65536 - int 'Reserved for root' CONFIG_FILE_RESERVED 128 + int 'Reserved for root' CONFIG_FILE_RESERVED 256 endmenu # Journalling File Systems diff -Naurp linux-2.4.20-wolk4.0s/fs/aio.c linux-2.4.20-wolk4.1-fullkernel/fs/aio.c --- linux-2.4.20-wolk4.0s/fs/aio.c 2003-05-15 21:52:38.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/fs/aio.c 2003-05-16 13:34:28.000000000 +0200 @@ -228,6 +228,7 @@ static struct kioctx *ioctx_alloc(unsign { struct kioctx *ctx; unsigned i; + int ret = 0; /* Prevent overflows */ if ((nr_reqs > (0x10000000U / sizeof(struct io_event))) || @@ -256,7 +257,8 @@ static struct kioctx *ioctx_alloc(unsign INIT_LIST_HEAD(&ctx->free_reqs); INIT_LIST_HEAD(&ctx->active_reqs); - if (aio_setup_ring(ctx) < 0) + ret = aio_setup_ring(ctx); + if (unlikely(ret < 0)) goto out_freectx; /* Allocate nr_reqs iocbs for io. Free iocbs are on the @@ -298,7 +300,7 @@ out_freering: ioctx_free_reqs(ctx); out_freectx: kmem_cache_free(kioctx_cachep, ctx); - ctx = ERR_PTR(-ENOMEM); + ctx = ERR_PTR(ret); dprintk("aio: error allocating ioctx %p\n", ctx); return ctx; @@ -761,6 +763,7 @@ static int read_events(struct kioctx *ct ret = -EFAULT; if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) goto out; + ret = 0; init_timeout(&to); set_timeout(start_jiffies, &to, &ts); @@ -1209,61 +1212,9 @@ ssize_t generic_file_aio_read(struct fil return generic_aio_rw(READ, file, req, iocb, iocb->aio_nbytes); } -ssize_t generic_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb, size_t min_size) -{ - return generic_aio_rw(WRITE, file, req, iocb, 1); -#if 0 - unsigned long buf = iocb.aio_buf; - size_t size = iocb.aio_nbytes; - loff_t pos = iocb.aio_offset; - ssize_t nr_written = 0; - kvec_cb_t cb; - long res; -#if 0 - if (likely(NULL != file->f_op->new_write)) { - nr_written = file->f_op->new_write(file, (void *)buf, size, - &pos, F_ATOMIC); - pr_debug("generic_aio_write: new_write: %ld\n", (long)nr_written); - if (-EAGAIN == nr_written) - nr_written = 0; - if ((nr_written >= min_size) || (nr_written < 0)) - return nr_written; - } -#endif - - req->nr_transferred = nr_written; - size -= nr_written; - if (size > aio_max_size) - size = aio_max_size; - req->this_size = size; - buf += nr_written; - cb.vec = map_user_kvec(WRITE, buf, size); - cb.fn = generic_aio_complete_write; - cb.data = req; - - if (IS_ERR(cb.vec)) { - pr_debug("generic_aio_write: map_user_kvec: %ld\n", PTR_ERR(cb.vec)); - return nr_written ? 
nr_written : PTR_ERR(cb.vec); - } - - res = file->f_op->kvec_write(file, cb, size, iocb.aio_offset); - pr_debug("generic_aio_write: kvec_write: %ld\n", res); - if (unlikely(res != 0)) { - unmap_kvec(cb.vec, 0); - free_kvec(cb.vec); - if (nr_written) { - if (res < 0) - res = 0; - res += nr_written; - } - } - return res; -#endif -} - ssize_t generic_file_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb) { - return generic_aio_write(file, req, iocb, iocb->aio_nbytes); + return generic_aio_rw(WRITE, file, req, iocb, 1); } /* lookup_kiocb diff -Naurp linux-2.4.20-wolk4.0s/fs/buffer.c linux-2.4.20-wolk4.1-fullkernel/fs/buffer.c --- linux-2.4.20-wolk4.0s/fs/buffer.c 2003-05-15 21:52:38.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/fs/buffer.c 2003-05-16 14:17:00.000000000 +0200 @@ -142,7 +142,6 @@ void unlock_buffer(struct buffer_head *b { clear_bit(BH_Wait_IO, &bh->b_state); clear_bit(BH_Launder, &bh->b_state); - clear_bit(BH_Atomic, &bh->b_state); /* * When a locked buffer is visible to the I/O layer BH_Launder * is set. This means before unlocking we must clear BH_Launder, @@ -1680,10 +1679,6 @@ static int __block_write_full_page(struc /* Stage 3: submit the IO */ do { struct buffer_head *next = bh->b_this_page; - /* - * Stick it on BUF_LOCKED so shrink_buffer_cache() can nail it. - */ - refile_buffer(bh); submit_bh(WRITE, bh); bh = next; } while (bh != head); @@ -2373,8 +2368,8 @@ static void end_buffer_io_kiobuf(struct mark_buffer_uptodate(bh, uptodate); kiobuf = bh->b_private; - unlock_buffer(bh); end_kio_request(kiobuf, uptodate); + unlock_buffer(bh); } /* @@ -2437,7 +2432,6 @@ int brw_kiovec(int rw, int nr, struct ki struct page * map; struct buffer_head *tmp, **bhs = NULL; int iosize = size; - unsigned int atomic_seq = 0; if (!nr) return 0; @@ -2454,9 +2448,6 @@ int brw_kiovec(int rw, int nr, struct ki panic("brw_kiovec: iobuf not initialised"); } - if (rw == WRITE) - atomic_seq = blk_get_atomic_seq(); - /* * OK to walk down the iovec doing page IO on each page we find. */ @@ -2514,8 +2505,6 @@ int brw_kiovec(int rw, int nr, struct ki tmp->b_dev = dev; tmp->b_blocknr = blocknr; tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req); - bh_elv_seq(tmp) = atomic_seq; - set_bit(BH_Atomic, &tmp->b_state); if (rw == WRITE) { set_bit(BH_Uptodate, &tmp->b_state); @@ -2533,15 +2522,12 @@ int brw_kiovec(int rw, int nr, struct ki * Wait for IO if we have got too much */ if (bhind >= KIO_MAX_SECTORS) { - blk_refile_atomic_queue(atomic_seq); kiobuf_wait_for_io(iobuf); /* wake-one */ err = wait_kio(rw, bhind, bhs, size); if (err >= 0) transferred += err; else goto finished; - if (rw == WRITE) - atomic_seq = blk_get_atomic_seq(); bhind = 0; } @@ -2560,11 +2546,12 @@ int brw_kiovec(int rw, int nr, struct ki /* Is there any IO still left to submit? */ if (bhind) { - blk_refile_atomic_queue(atomic_seq); kiobuf_wait_for_io(iobuf); /* wake-one */ err = wait_kio(rw, bhind, bhs, size); if (err >= 0) transferred += err; + else + goto finished; } finished: @@ -2830,25 +2817,6 @@ static void sync_page_buffers(struct buf int try_to_free_buffers(struct page * page, unsigned int gfp_mask) { struct buffer_head * tmp, * bh = page->buffers; - int was_uptodate = 1; - - if (!PageLocked(page)) - BUG(); - - if (!bh) - return 1; - /* - * Quick check for freeable buffers before we go take three - * global locks. 
- */ - if (!(gfp_mask & __GFP_IO)) { - tmp = bh; - do { - if (buffer_busy(tmp)) - return 0; - tmp = tmp->b_this_page; - } while (tmp != bh); - } spin_lock(&lru_list_lock); write_lock(&hash_table_lock); @@ -2870,8 +2838,7 @@ int try_to_free_buffers(struct page * pa tmp = tmp->b_this_page; if (p->b_dev == B_FREE) BUG(); - if (!buffer_uptodate(p)) - was_uptodate = 0; + remove_inode_queue(p); __remove_from_queues(p); __put_unused_buffer_head(p); @@ -2879,15 +2846,7 @@ int try_to_free_buffers(struct page * pa spin_unlock(&unused_list_lock); /* Wake up anyone waiting for buffer heads */ - smp_mb(); - if (waitqueue_active(&buffer_wait)) - wake_up(&buffer_wait); - - /* - * Make sure we don't read buffers again when they are reattached - */ - if (was_uptodate) - SetPageUptodate(page); + wake_up(&buffer_wait); /* And free the page */ page->buffers = NULL; @@ -2910,62 +2869,6 @@ busy_buffer_page: } EXPORT_SYMBOL(try_to_free_buffers); -/* - * Returns the number of pages which might have become freeable - */ -int shrink_buffer_cache(void) -{ - struct buffer_head *bh; - int nr_todo; - int nr_shrunk = 0; - - /* - * Move any clean unlocked buffers from BUF_LOCKED onto BUF_CLEAN - */ - spin_lock(&lru_list_lock); - for ( ; ; ) { - bh = lru_list[BUF_LOCKED]; - if (!bh || buffer_locked(bh)) - break; - __refile_buffer(bh); - } - - /* - * Now start liberating buffers - */ - nr_todo = nr_buffers_type[BUF_CLEAN]; - while (nr_todo--) { - struct page *page; - - bh = lru_list[BUF_CLEAN]; - if (!bh) - break; - - /* - * Park the buffer on BUF_LOCKED so we don't revisit it on - * this pass. - */ - __remove_from_lru_list(bh); - bh->b_list = BUF_LOCKED; - __insert_into_lru_list(bh, BUF_LOCKED); - page = bh->b_page; - if (TryLockPage(page)) - continue; - - page_cache_get(page); - spin_unlock(&lru_list_lock); - if (try_to_release_page(page, GFP_NOIO)) - nr_shrunk++; - unlock_page(page); - page_cache_release(page); - spin_lock(&lru_list_lock); - } - spin_unlock(&lru_list_lock); -// printk("%s: liberated %d page's worth of buffer_heads\n", -// __FUNCTION__, nr_shrunk); - return (nr_shrunk * sizeof(struct buffer_head)) / PAGE_CACHE_SIZE; -} - /* ================== Debugging =================== */ void show_buffers(void) @@ -3339,7 +3242,6 @@ int kupdate(void *startup) printk(KERN_DEBUG "kupdate() activated...\n"); #endif do_io_postprocessing(); -// shrink_buffer_cache(); sync_old_buffers(); run_task_queue(&tq_disk); } @@ -3539,6 +3441,8 @@ error: return err; } +EXPORT_SYMBOL(brw_kvec_async); + #if 0 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], kdev_t dev, int nr_blocks, unsigned long b[], int sector_size) diff -Naurp linux-2.4.20-wolk4.0s/fs/jfs/jfs_logmgr.c linux-2.4.20-wolk4.1-fullkernel/fs/jfs/jfs_logmgr.c --- linux-2.4.20-wolk4.0s/fs/jfs/jfs_logmgr.c 2003-05-15 21:52:41.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/fs/jfs/jfs_logmgr.c 2003-05-16 14:35:35.000000000 +0200 @@ -1834,7 +1834,6 @@ static inline void lbmRedrive(struct lbu static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) { struct lbuf *bp; - unsigned long flags; /* * allocate a log buffer @@ -1842,8 +1841,6 @@ static int lbmRead(struct jfs_log * log, *bpp = bp = lbmAllocate(log, pn); jfs_info("lbmRead: bp:0x%p pn:0x%x", bp, pn); - LCACHE_LOCK(flags); /* disable+lock */ - bp->l_flag |= lbmREAD; bp->l_bh.b_reqnext = NULL; clear_bit(BH_Uptodate, &bp->l_bh.b_state); @@ -1852,13 +1849,10 @@ static int lbmRead(struct jfs_log * log, set_bit(BH_Req, &bp->l_bh.b_state); bp->l_bh.b_rdev = bp->l_bh.b_dev; bp->l_bh.b_rsector = bp->l_blkno 
<< (log->l2bsize - 9); - bh_elv_seq(&bp->l_bh) = 0; generic_make_request(READ, &bp->l_bh); run_task_queue(&tq_disk); - LCACHE_SLEEP_COND(bp->l_ioevent, (bp->l_flag != lbmREAD), flags); - - LCACHE_UNLOCK(flags); /* unlock+enable */ + wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD)); return 0; } @@ -1991,7 +1985,6 @@ static void lbmStartIO(struct lbuf * bp) set_bit(BH_Req, &bp->l_bh.b_state); bp->l_bh.b_rdev = bp->l_bh.b_dev; bp->l_bh.b_rsector = bp->l_blkno << (bp->l_log->l2bsize - 9); - bh_elv_seq(&bp->l_bh) = 0; generic_make_request(WRITE, &bp->l_bh); INCREMENT(lmStat.submitted); diff -Naurp linux-2.4.20-wolk4.0s/fs/nfs/inode.c linux-2.4.20-wolk4.1-fullkernel/fs/nfs/inode.c --- linux-2.4.20-wolk4.0s/fs/nfs/inode.c 2003-05-15 21:52:41.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/fs/nfs/inode.c 2003-05-16 13:55:00.000000000 +0200 @@ -849,19 +849,18 @@ printk("nfs_notify_change: revalidate fa goto out; } while (flusher && NFS_I(inode)->npages); + /* Truncate now in order to avoid races on the client side */ + if (attr->ia_valid & ATTR_SIZE) + vmtruncate(inode, attr->ia_size); + error = NFS_PROTO(inode)->setattr(inode, &fattr, attr); if (error) goto out; - /* - * If we changed the size or mtime, update the inode - * now to avoid invalidating the page cache. - */ - if (attr->ia_valid & ATTR_SIZE) { + + if (attr->ia_valid & ATTR_SIZE) if (attr->ia_size != fattr.size) printk("nfs_notify_change: attr=%Ld, fattr=%Ld??\n", (long long) attr->ia_size, (long long)fattr.size); - vmtruncate(inode, attr->ia_size); - } /* * If we changed the size or mtime, update the inode diff -Naurp linux-2.4.20-wolk4.0s/include/asm-ia64/bitops.h linux-2.4.20-wolk4.1-fullkernel/include/asm-ia64/bitops.h --- linux-2.4.20-wolk4.0s/include/asm-ia64/bitops.h 2003-05-15 21:52:45.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/include/asm-ia64/bitops.h 2003-05-16 13:35:52.000000000 +0200 @@ -449,8 +449,6 @@ find_next_bit (void *addr, unsigned long #ifdef __KERNEL__ -#define __clear_bit(nr, addr) clear_bit(nr, addr) - #define ext2_set_bit test_and_set_bit #define ext2_clear_bit test_and_clear_bit #define ext2_test_bit test_bit diff -Naurp linux-2.4.20-wolk4.0s/include/linux/bench_func.h linux-2.4.20-wolk4.1-fullkernel/include/linux/bench_func.h --- linux-2.4.20-wolk4.0s/include/linux/bench_func.h 2003-05-15 21:52:46.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/include/linux/bench_func.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,16 +0,0 @@ -#ifndef _LINUX_BENCH_FUNC_H -#define _LINUX_BENCH_FUNC_H - -struct candidate { - const char *name; - void *f; // pointer to func - int weight; - int cpu_caps_needed[4]; -}; - -typedef int bench_func(struct candidate *cand, char *opaque); - -struct candidate* find_best(bench_func *bench, char *opaque, - struct candidate runner[], int count); - -#endif diff -Naurp linux-2.4.20-wolk4.0s/include/linux/blkdev.h linux-2.4.20-wolk4.1-fullkernel/include/linux/blkdev.h --- linux-2.4.20-wolk4.0s/include/linux/blkdev.h 2003-05-15 21:52:46.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/include/linux/blkdev.h 2003-05-19 08:44:09.000000000 +0200 @@ -96,7 +96,6 @@ struct request_queue * Together with queue_head for cacheline sharing */ struct list_head queue_head; - struct list_head atomic_head; elevator_t elevator; struct request *last_merge; @@ -116,7 +115,6 @@ struct request_queue * This is used to remove the plug when tq_disk runs. */ struct tq_struct plug_tq; - struct list_head atomic_entry; /* * Boolean that indicates whether this queue is plugged or not. 
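The max_bomb_segments knob that the elevator.c hunks above restore is also reachable from userspace through the existing BLKELVGET/BLKELVSET ioctls served by blkelvget_ioctl()/blkelvset_ioctl(), so the value compiled into the elevator.h defaults further below can be changed per queue at runtime. What follows is only a minimal elvtune-style sketch, assuming the stock 2.4 ioctl numbers from <linux/fs.h> and the four-int layout those handlers copy in and out; the local struct name is illustrative, not the kernel's.

/* Sketch only: the struct mirrors the field order used by the ioctl
 * handlers above; BLKELVGET/BLKELVSET are assumed to be the stock 2.4
 * definitions from <linux/fs.h>. */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>           /* BLKELVGET, BLKELVSET */

typedef struct {
	int queue_ID;
	int read_latency;
	int write_latency;
	int max_bomb_segments;
} blkelv_arg_t;

int main(int argc, char **argv)
{
	blkelv_arg_t arg;
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <blockdev> [max_bomb_segments]\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || ioctl(fd, BLKELVGET, &arg) < 0) {
		perror(argv[1]);
		return 1;
	}
	printf("read_latency=%d write_latency=%d max_bomb_segments=%d\n",
	       arg.read_latency, arg.write_latency, arg.max_bomb_segments);

	if (argc > 2) {             /* optionally change the write-bomb limit */
		arg.max_bomb_segments = atoi(argv[2]);
		if (ioctl(fd, BLKELVSET, &arg) < 0)
			perror("BLKELVSET");
	}
	close(fd);
	return 0;
}

On a kernel built with the lowlat elevator this allows experimenting with values other than the built-in 1 (or the 2/6 desktop/server defaults in the elevator.h hunks below) without recompiling.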
@@ -148,14 +146,6 @@ extern unsigned long blk_max_low_pfn, bl #define BLK_BOUNCE_HIGH (blk_max_low_pfn << PAGE_SHIFT) #define BLK_BOUNCE_ANY (blk_max_pfn << PAGE_SHIFT) -/* - * max guaranteed atomic I/O size while dealing with bounce buffers. - * highmemio capable devices (pci64 in particular) can go well beyond - * this limit. Must be a multiple of 512bytes obviously. - */ -#define BLK_ATOMIC_BOUNCE_SIZE 32768 -#define BLK_ATOMIC_BOUNCE_ENTRIES (BLK_ATOMIC_BOUNCE_SIZE >> 9) - extern void blk_queue_bounce_limit(request_queue_t *, u64); #ifdef CONFIG_HIGHMEM @@ -213,13 +203,6 @@ extern void generic_make_request(int rw, extern inline request_queue_t *blk_get_queue(kdev_t dev); extern void blkdev_release_request(struct request *); -extern spinlock_cacheline_t blk_atomic_lock_cacheline; -#define blk_atomic_lock (blk_atomic_lock_cacheline.lock) -extern unsigned int blk_get_atomic_seq(void); -extern spinlock_cacheline_t blk_atomic_queue_lock_cacheline; -#define blk_atomic_queue_lock (blk_atomic_queue_lock_cacheline.lock) -extern void FASTCALL(blk_refile_atomic_queue(int sequence)); - /* * Access functions for manipulating queue properties */ diff -Naurp linux-2.4.20-wolk4.0s/include/linux/elevator.h linux-2.4.20-wolk4.1-fullkernel/include/linux/elevator.h --- linux-2.4.20-wolk4.0s/include/linux/elevator.h 2003-05-15 21:52:46.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/include/linux/elevator.h 2003-05-19 13:02:42.000000000 +0200 @@ -1,12 +1,9 @@ #ifndef _LINUX_ELEVATOR_H #define _LINUX_ELEVATOR_H -typedef void (elevator_fn) (struct request *, elevator_t *, - struct list_head *, - struct list_head *, int); - -typedef int (elevator_merge_fn) (request_queue_t *, struct request **, struct list_head *, - struct buffer_head *, int, int); +typedef int (elevator_merge_fn)(request_queue_t *, struct request **, + struct list_head *, struct buffer_head *bh, + int rw, int max_sectors); typedef void (elevator_merge_cleanup_fn) (request_queue_t *, struct request *, int); @@ -16,6 +13,7 @@ struct elevator_s { int read_latency; int write_latency; + int max_bomb_segments; elevator_merge_fn *elevator_merge_fn; elevator_merge_req_fn *elevator_merge_req_fn; @@ -23,13 +21,13 @@ struct elevator_s unsigned int queue_ID; }; -int elevator_noop_merge(request_queue_t *, struct request **, struct list_head *, struct buffer_head *, int, int); -void elevator_noop_merge_cleanup(request_queue_t *, struct request *, int); -void elevator_noop_merge_req(struct request *, struct request *); - -int elevator_linus_merge(request_queue_t *, struct request **, struct list_head *, struct buffer_head *, int, int); -void elevator_linus_merge_cleanup(request_queue_t *, struct request *, int); -void elevator_linus_merge_req(struct request *, struct request *); +elevator_merge_fn elevator_noop_merge; +elevator_merge_cleanup_fn elevator_noop_merge_cleanup; +elevator_merge_req_fn elevator_noop_merge_req; + +elevator_merge_fn elevator_linus_merge; +elevator_merge_cleanup_fn elevator_linus_merge_cleanup; +elevator_merge_req_fn elevator_linus_merge_req; typedef struct blkelv_ioctl_arg_s { int queue_ID; @@ -53,22 +51,6 @@ extern void elevator_init(elevator_t *, #define ELEVATOR_FRONT_MERGE 1 #define ELEVATOR_BACK_MERGE 2 -/* - * This is used in the elevator algorithm. We don't prioritise reads - * over writes any more --- although reads are more time-critical than - * writes, by treating them equally we increase filesystem throughput. - * This turns out to give better overall performance. 
-- sct - */ -#define IN_ORDER(s1,s2) \ - ((((s1)->rq_dev == (s2)->rq_dev && \ - (s1)->sector < (s2)->sector)) || \ - (s1)->rq_dev < (s2)->rq_dev) - -#define BHRQ_IN_ORDER(bh, rq) \ - ((((bh)->b_rdev == (rq)->rq_dev && \ - (bh)->b_rsector < (rq)->sector)) || \ - (bh)->b_rdev < (rq)->rq_dev) - static inline int elevator_request_latency(elevator_t * elevator, int rw) { int latency; @@ -86,7 +68,7 @@ static inline int elevator_request_laten ((elevator_t) { \ 0, /* read_latency */ \ 0, /* write_latency */ \ - \ + 0, /* max_bomb_segments */ \ elevator_noop_merge, /* elevator_merge_fn */ \ elevator_noop_merge_req, /* elevator_merge_req_fn */ \ }) @@ -94,12 +76,12 @@ static inline int elevator_request_laten #if (!defined (CONFIG_BLK_DEV_ELEVATOR_LOWLAT) && defined (CONFIG_SCHED_SERVER)) || (!defined (CONFIG_BLK_DEV_ELEVATOR_LOWLAT) && !defined (CONFIG_SCHED_SERVER) && !defined (CONFIG_SCHED_DESKTOP)) #define ELEVATOR_READ_LATENCY 2048 #define ELEVATOR_WRITE_LATENCY 8192 -#define ELEVATOR_MAX_BOMB_SEGMENTS 0 +#define ELEVATOR_MAX_BOMB_SEGMENTS 6 #define ELEVATOR_LINUS \ ((elevator_t) { \ 2048, /* read passovers */ \ 8192, /* write passovers */ \ - \ + 6, /* max_bomb_segments */ \ elevator_linus_merge, /* elevator_merge_fn */ \ elevator_linus_merge_req, /* elevator_merge_req_fn */ \ }) @@ -107,12 +89,12 @@ static inline int elevator_request_laten #elif (!defined (CONFIG_BLK_DEV_ELEVATOR_LOWLAT) && defined (CONFIG_SCHED_DESKTOP)) #define ELEVATOR_READ_LATENCY 512 #define ELEVATOR_WRITE_LATENCY 8192 -#define ELEVATOR_MAX_BOMB_SEGMENTS 0 +#define ELEVATOR_MAX_BOMB_SEGMENTS 2 #define ELEVATOR_LINUS \ ((elevator_t) { \ 512, /* read passovers */ \ 8192, /* write passovers */ \ - \ + 2, /* max_bomb_segments */ \ elevator_linus_merge, /* elevator_merge_fn */ \ elevator_linus_merge_req, /* elevator_merge_req_fn */ \ }) @@ -125,7 +107,7 @@ static inline int elevator_request_laten ((elevator_t) { \ 0, /* read passovers */ \ 0, /* write passovers */ \ - \ + 1, /* max_bomb_segments */ \ elevator_linus_merge, /* elevator_merge_fn */ \ elevator_linus_merge_req, /* elevator_merge_req_fn */ \ }) diff -Naurp linux-2.4.20-wolk4.0s/include/linux/fs.h linux-2.4.20-wolk4.1-fullkernel/include/linux/fs.h --- linux-2.4.20-wolk4.0s/include/linux/fs.h 2003-05-15 21:52:46.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/include/linux/fs.h 2003-05-19 12:51:33.000000000 +0200 @@ -242,7 +242,6 @@ enum bh_state_bits { BH_Attached, /* 1 if b_inode_buffers is linked into a list */ BH_JBD, /* 1 if it has an attached journal_head */ BH_Delay, /* 1 if the buffer is delayed allocate */ - BH_Atomic, /* 1 if b_elv_sequence is valid */ BH_PrivateStart,/* not a state bit, but the first bit available * for private allocation by other entities @@ -287,7 +286,6 @@ struct buffer_head { void *b_private; /* reserved for b_end_io */ void *b_journal_head; /* FS journal_heads */ unsigned long b_rsector; /* Real buffer location on disk */ - int b_elv_sequence; /* for atomic blocks */ wait_queue_head_t b_wait; struct list_head b_inode_buffers; /* doubly linked list of inode dirty buffers */ @@ -308,7 +306,6 @@ void init_buffer(struct buffer_head *, b #define buffer_async(bh) __buffer_state(bh,Async) #define buffer_launder(bh) __buffer_state(bh,Launder) #define buffer_delay(bh) __buffer_state(bh,Delay) -#define buffer_atomic(bh) __buffer_state(bh,Atomic) #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) @@ -316,7 +313,6 @@ extern void set_bh_page(struct buffer_he #define touch_buffer(bh) mark_page_accessed(bh->b_page) -#define 
bh_elv_seq(bh) (bh)->b_elv_sequence #include #include @@ -433,6 +429,7 @@ struct address_space_operations { int (*releasepage) (struct page *, int); #define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */ int (*direct_IO)(int, struct file *, struct kiobuf *, unsigned long, int); + void (*removepage)(struct page *); /* called when page gets removed from the inode */ }; struct address_space { @@ -1286,7 +1283,6 @@ extern void refile_buffer(struct buffer_ extern void create_empty_buffers(struct page *, kdev_t, unsigned long); extern void end_buffer_io_sync(struct buffer_head *bh, int uptodate); extern void end_buffer_io_async(struct buffer_head *bh, int uptodate); -extern int shrink_buffer_cache(void); /* reiserfs_writepage needs this */ extern void set_buffer_async_io(struct buffer_head *bh) ; diff -Naurp linux-2.4.20-wolk4.0s/include/linux/jhash.h linux-2.4.20-wolk4.1-fullkernel/include/linux/jhash.h --- linux-2.4.20-wolk4.0s/include/linux/jhash.h 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.4.20-wolk4.1-fullkernel/include/linux/jhash.h 2003-05-16 14:00:34.000000000 +0200 @@ -0,0 +1,143 @@ +#ifndef _LINUX_JHASH_H +#define _LINUX_JHASH_H + +/* jhash.h: Jenkins hash support. + * + * Copyright (C) 1996 Bob Jenkins (bob_jenkins@burtleburtle.net) + * + * http://burtleburtle.net/bob/hash/ + * + * These are the credits from Bob's sources: + * + * lookup2.c, by Bob Jenkins, December 1996, Public Domain. + * hash(), hash2(), hash3, and mix() are externally useful functions. + * Routines to test the hash are included if SELF_TEST is defined. + * You can use this free for any purpose. It has no warranty. + * + * Copyright (C) 2003 David S. Miller (davem@redhat.com) + * + * I've modified Bob's hash to be useful in the Linux kernel, and + * any bugs present are surely my fault. -DaveM + */ + +/* NOTE: Arguments are modified. */ +#define __jhash_mix(a, b, c) \ +{ \ + a -= b; a -= c; a ^= (c>>13); \ + b -= c; b -= a; b ^= (a<<8); \ + c -= a; c -= b; c ^= (b>>13); \ + a -= b; a -= c; a ^= (c>>12); \ + b -= c; b -= a; b ^= (a<<16); \ + c -= a; c -= b; c ^= (b>>5); \ + a -= b; a -= c; a ^= (c>>3); \ + b -= c; b -= a; b ^= (a<<10); \ + c -= a; c -= b; c ^= (b>>15); \ +} + +/* The golden ration: an arbitrary value */ +#define JHASH_GOLDEN_RATIO 0x9e3779b9 + +/* The most generic version, hashes an arbitrary sequence + * of bytes. No alignment or length assumptions are made about + * the input key. + */ +static inline u32 jhash(void *key, u32 length, u32 initval) +{ + u32 a, b, c, len; + u8 *k = key; + + len = length; + a = b = JHASH_GOLDEN_RATIO; + c = initval; + + while (len >= 12) { + a += (k[0] +((u32)k[1]<<8) +((u32)k[2]<<16) +((u32)k[3]<<24)); + b += (k[4] +((u32)k[5]<<8) +((u32)k[6]<<16) +((u32)k[7]<<24)); + c += (k[8] +((u32)k[9]<<8) +((u32)k[10]<<16)+((u32)k[11]<<24)); + + __jhash_mix(a,b,c); + + k += 12; + len -= 12; + } + + c += length; + switch (len) { + case 11: c += ((u32)k[10]<<24); + case 10: c += ((u32)k[9]<<16); + case 9 : c += ((u32)k[8]<<8); + case 8 : b += ((u32)k[7]<<24); + case 7 : b += ((u32)k[6]<<16); + case 6 : b += ((u32)k[5]<<8); + case 5 : b += k[4]; + case 4 : a += ((u32)k[3]<<24); + case 3 : a += ((u32)k[2]<<16); + case 2 : a += ((u32)k[1]<<8); + case 1 : a += k[0]; + }; + + __jhash_mix(a,b,c); + + return c; +} + +/* A special optimized version that handles 1 or more of u32s. + * The length parameter here is the number of u32s in the key. 
+ */ +static inline u32 jhash2(u32 *k, u32 length, u32 initval) +{ + u32 a, b, c, len; + + a = b = JHASH_GOLDEN_RATIO; + c = initval; + len = length; + + while (len >= 3) { + a += k[0]; + b += k[1]; + c += k[2]; + __jhash_mix(a, b, c); + k += 3; len -= 3; + } + + c += length * 4; + + switch (len) { + case 2 : b += k[1]; + case 1 : a += k[0]; + }; + + __jhash_mix(a,b,c); + + return c; +} + + +/* A special ultra-optimized versions that knows they are hashing exactly + * 3, 2 or 1 word(s). + * + * NOTE: In partilar the "c += length; __jhash_mix(a,b,c);" normally + * done at the end is not done here. + */ +static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval) +{ + a += JHASH_GOLDEN_RATIO; + b += JHASH_GOLDEN_RATIO; + c += initval; + + __jhash_mix(a, b, c); + + return c; +} + +static inline u32 jhash_2words(u32 a, u32 b, u32 initval) +{ + return jhash_3words(a, b, 0, initval); +} + +static inline u32 jhash_1word(u32 a, u32 initval) +{ + return jhash_3words(a, 0, 0, initval); +} + +#endif /* _LINUX_JHASH_H */ diff -Naurp linux-2.4.20-wolk4.0s/include/linux/kmap_types.h linux-2.4.20-wolk4.1-fullkernel/include/linux/kmap_types.h --- linux-2.4.20-wolk4.0s/include/linux/kmap_types.h 2003-05-15 21:52:46.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/include/linux/kmap_types.h 2003-05-16 13:55:58.000000000 +0200 @@ -14,8 +14,8 @@ enum km_type { KM_PTE0, KM_PTE1, KM_PTE2, - KM_NETDUMP, KM_KDB, + KM_NETDUMP, KM_TYPE_NR }; diff -Naurp linux-2.4.20-wolk4.0s/include/linux/mm.h linux-2.4.20-wolk4.1-fullkernel/include/linux/mm.h --- linux-2.4.20-wolk4.0s/include/linux/mm.h 2003-05-15 21:52:47.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/include/linux/mm.h 2003-05-18 18:33:16.000000000 +0200 @@ -761,6 +761,8 @@ static inline void __vma_unlink(struct m mm->mmap_cache = prev; } +#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED) + #define can_vma_merge(vma, vm_flags) __can_vma_merge(vma, vm_flags, NULL, 0, 0) /* * We don't check here for the merged mmap wrapping around the end of pagecache @@ -776,7 +778,9 @@ static inline int __can_vma_merge(struct else #endif - if (vma->vm_file == file && vma->vm_flags == vm_flags) { + if (vma->vm_file == file && vma->vm_flags == vm_flags && + likely((!vma->vm_ops || !vma->vm_ops->close) && !vma->vm_private_data && + !(vm_flags & VM_SPECIAL))) { if (file) { if (vma->vm_pgoff == vm_pgoff + offset) return 1; diff -Naurp linux-2.4.20-wolk4.0s/include/linux/pagemap.h linux-2.4.20-wolk4.1-fullkernel/include/linux/pagemap.h --- linux-2.4.20-wolk4.0s/include/linux/pagemap.h 2003-05-15 21:52:47.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/include/linux/pagemap.h 2003-05-18 18:33:28.000000000 +0200 @@ -86,9 +86,14 @@ extern void add_to_page_cache_locked(str extern int add_to_page_cache_unique(struct page * page, struct address_space *mapping, unsigned long index, struct page **hash); extern wait_queue_head_t *FASTCALL(page_waitqueue(struct page *page)); +extern void ___wait_on_page(struct page *); extern int wait_on_page_timeout(struct page *page, int timeout); -extern void wait_on_page(struct page *); +static inline void wait_on_page(struct page * page) +{ + if (PageLocked(page)) + ___wait_on_page(page); +} extern void FASTCALL(wakeup_page_waiters(struct page * page)); diff -Naurp linux-2.4.20-wolk4.0s/include/linux/sched.h linux-2.4.20-wolk4.1-fullkernel/include/linux/sched.h --- linux-2.4.20-wolk4.0s/include/linux/sched.h 2003-05-15 21:52:47.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/include/linux/sched.h 2003-05-18 
18:33:16.000000000 +0200 @@ -241,13 +241,6 @@ struct files_struct { extern int max_map_count; struct kioctx; - -struct ptg_struct { /* pseudo thread groups */ - atomic_t active; /* number of tasks in run queues */ - atomic_t count; /* number of refs */ -}; - - struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ rb_root_t mm_rb; @@ -327,7 +320,6 @@ struct signal_struct { struct user_struct { atomic_t __count; /* reference count */ atomic_t processes; /* How many processes does this user have? */ - atomic_t active; /* How many active processes does this user have? */ atomic_t files; /* How many open files does this user have? */ /* Hash table maintenance information */ @@ -439,7 +431,6 @@ struct task_struct { task_t *next_task, *prev_task; struct mm_struct *mm, *active_mm; - struct ptg_struct * ptgroup; /* pseudo thread group for this task */ #ifdef CONFIG_SYSTRACE /* back pointer to systrace */ diff -Naurp linux-2.4.20-wolk4.0s/include/linux/sunrpc/sched.h linux-2.4.20-wolk4.1-fullkernel/include/linux/sunrpc/sched.h --- linux-2.4.20-wolk4.0s/include/linux/sunrpc/sched.h 2002-12-29 18:46:57.000000000 +0100 +++ linux-2.4.20-wolk4.1-fullkernel/include/linux/sunrpc/sched.h 2003-05-18 18:33:28.000000000 +0200 @@ -128,7 +128,12 @@ typedef void (*rpc_action)(struct rpc_ #define RPC_IS_RUNNING(t) (test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)) #define rpc_set_running(t) (set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)) -#define rpc_clear_running(t) (clear_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)) +#define rpc_clear_running(t) \ + do { \ + smp_mb__before_clear_bit(); \ + clear_bit(RPC_TASK_RUNNING, &(t)->tk_runstate); \ + smp_mb__after_clear_bit(); \ + } while(0) #define rpc_set_sleeping(t) (set_bit(RPC_TASK_SLEEPING, &(t)->tk_runstate)) diff -Naurp linux-2.4.20-wolk4.0s/include/linux/sunrpc/xprt.h linux-2.4.20-wolk4.1-fullkernel/include/linux/sunrpc/xprt.h --- linux-2.4.20-wolk4.0s/include/linux/sunrpc/xprt.h 2003-05-15 21:52:48.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/include/linux/sunrpc/xprt.h 2003-05-18 18:33:28.000000000 +0200 @@ -186,7 +186,12 @@ void xprt_sock_setbufsize(struct rpc_x #define xprt_connected(xp) (!(xp)->stream || test_bit(XPRT_CONNECT, &(xp)->sockstate)) #define xprt_set_connected(xp) (set_bit(XPRT_CONNECT, &(xp)->sockstate)) #define xprt_test_and_set_connected(xp) (test_and_set_bit(XPRT_CONNECT, &(xp)->sockstate)) -#define xprt_clear_connected(xp) (clear_bit(XPRT_CONNECT, &(xp)->sockstate)) +#define xprt_clear_connected(xp) \ + do { \ + smp_mb__before_clear_bit(); \ + clear_bit(XPRT_CONNECT, &(xp)->sockstate); \ + smp_mb__after_clear_bit(); \ + } while(0) #endif /* __KERNEL__*/ diff -Naurp linux-2.4.20-wolk4.0s/include/linux/sysctl.h linux-2.4.20-wolk4.1-fullkernel/include/linux/sysctl.h --- linux-2.4.20-wolk4.0s/include/linux/sysctl.h 2003-05-15 21:52:48.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/include/linux/sysctl.h 2003-05-18 18:33:20.000000000 +0200 @@ -338,7 +338,8 @@ enum { NET_IPV4_ROUTE_GC_ELASTICITY=14, NET_IPV4_ROUTE_MTU_EXPIRES=15, NET_IPV4_ROUTE_MIN_PMTU=16, - NET_IPV4_ROUTE_MIN_ADVMSS=17 + NET_IPV4_ROUTE_MIN_ADVMSS=17, + NET_IPV4_ROUTE_SECRET_INTERVAL=18, }; enum diff -Naurp linux-2.4.20-wolk4.0s/include/net/tcp.h linux-2.4.20-wolk4.1-fullkernel/include/net/tcp.h --- linux-2.4.20-wolk4.0s/include/net/tcp.h 2003-05-15 21:52:48.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/include/net/tcp.h 2003-05-18 18:37:14.000000000 +0200 @@ -1634,6 +1634,7 @@ struct tcp_listen_opt int qlen; int qlen_young; int clock_hand; + u32 
hash_rnd; struct open_request *syn_table[TCP_SYNQ_HSIZE]; }; diff -Naurp linux-2.4.20-wolk4.0s/kernel/Makefile linux-2.4.20-wolk4.1-fullkernel/kernel/Makefile --- linux-2.4.20-wolk4.0s/kernel/Makefile 2003-05-15 21:52:48.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/kernel/Makefile 2003-05-19 08:37:47.000000000 +0200 @@ -17,10 +17,6 @@ obj-y = sched.o dma.o fork.o exec_do sysctl.o acct.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o context.o rcupdate.o futex.o -ifeq ($(CONFIG_X86),y) - obj-y += bench_func.o -endif - obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += ksyms.o obj-$(CONFIG_PM) += pm.o diff -Naurp linux-2.4.20-wolk4.0s/kernel/bench_func.c linux-2.4.20-wolk4.1-fullkernel/kernel/bench_func.c --- linux-2.4.20-wolk4.0s/kernel/bench_func.c 2003-05-15 21:52:48.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/kernel/bench_func.c 1970-01-01 01:00:00.000000000 +0100 @@ -1,51 +0,0 @@ -#include // for KERN_DEBUG - -#include // for test_bit -#include // cpu caps -#include // cpu features constants -#include - -//#define dprintk(a...) printk(a) -#define dprintk(a...) ((void)0) - -// 2.4 only, already in 2.5 -extern inline int boot_cpu_has_cap(int cap) -{ - return test_bit(cap, boot_cpu_data.x86_capability); -} - -extern inline int cpu_supports(int *cap) -{ - while(*cap != -1) { - if(!boot_cpu_has_cap(*cap)) { - dprintk("unsupported caps: %i\n", *cap); - return 0; - } - cap++; - } - return 1; -} - -/* -** Call all the candidates which can be run on this CPU, -** find the best -*/ -struct candidate* -find_best(bench_func *bench, char *opaque, struct candidate runner[], int count) -{ - int score, max = 0; - struct candidate *best = 0; - while(count--) { - if(!cpu_supports(runner->cpu_caps_needed)) { - printk("func %s skipped: not supported by CPU\n", runner->name); - } else { - score = bench(runner,opaque) * runner->weight; - if(max < score) { - max = score; - best = runner; - } - } - runner++; - } - return best; -} diff -Naurp linux-2.4.20-wolk4.0s/kernel/exit.c linux-2.4.20-wolk4.1-fullkernel/kernel/exit.c --- linux-2.4.20-wolk4.0s/kernel/exit.c 2003-05-15 21:52:48.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/kernel/exit.c 2003-05-16 14:21:51.000000000 +0200 @@ -13,12 +13,11 @@ #include #include #include -#include -#include - #ifdef CONFIG_BSD_PROCESS_ACCT #include #endif +#include +#include #ifdef CONFIG_SYSTRACE #include @@ -46,12 +45,6 @@ static void release_task(struct task_str { if (p == current) BUG(); - - if (p->ptgroup && atomic_sub_and_test(1,&p->ptgroup->count)) { - kfree(p->ptgroup); - p->ptgroup = NULL; - } - #ifdef CONFIG_SMP wait_task_inactive(p); #endif diff -Naurp linux-2.4.20-wolk4.0s/kernel/fork.c linux-2.4.20-wolk4.1-fullkernel/kernel/fork.c --- linux-2.4.20-wolk4.0s/kernel/fork.c 2003-05-15 21:52:48.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/kernel/fork.c 2003-05-16 14:21:51.000000000 +0200 @@ -449,7 +449,6 @@ static int copy_mm(unsigned long clone_f tsk->mm = NULL; tsk->active_mm = NULL; - tsk->ptgroup = NULL; /* * Are we cloning a kernel thread? 
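The hash_rnd field added to struct tcp_listen_opt above, the new <linux/jhash.h> helpers, and the NET_IPV4_ROUTE_SECRET_INTERVAL sysctl all point the same way: hash-table bucket selection gets perturbed by a random seed so a remote peer cannot aim collisions at a single chain. The in-kernel hash functions themselves are not part of the hunks shown here; the following is only a userspace sketch of the idea, with TCP_SYNQ_HSIZE and the exact field mix assumed for illustration.

#include <stdio.h>
#include <stdint.h>

typedef uint32_t u32;

/* __jhash_mix() and jhash_3words() copied from the new <linux/jhash.h>
 * above so the sketch is self-contained. */
#define JHASH_GOLDEN_RATIO 0x9e3779b9
#define __jhash_mix(a, b, c) \
{ \
	a -= b; a -= c; a ^= (c>>13); \
	b -= c; b -= a; b ^= (a<<8);  \
	c -= a; c -= b; c ^= (b>>13); \
	a -= b; a -= c; a ^= (c>>12); \
	b -= c; b -= a; b ^= (a<<16); \
	c -= a; c -= b; c ^= (b>>5);  \
	a -= b; a -= c; a ^= (c>>3);  \
	b -= c; b -= a; b ^= (a<<10); \
	c -= a; c -= b; c ^= (b>>15); \
}

static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval)
{
	a += JHASH_GOLDEN_RATIO;
	b += JHASH_GOLDEN_RATIO;
	c += initval;
	__jhash_mix(a, b, c);
	return c;
}

#define TCP_SYNQ_HSIZE 512      /* assumed; the real value lives in net/tcp.h */

/* Hypothetical bucket selection: mix the peer address and port with the
 * per-listener hash_rnd so the chain a forged SYN lands in is unpredictable. */
static unsigned int synq_hash(u32 raddr, u32 rport, u32 hash_rnd)
{
	return jhash_3words(raddr, rport, 0, hash_rnd) & (TCP_SYNQ_HSIZE - 1);
}

int main(void)
{
	printf("bucket = %u\n", synq_hash(0xc0a80001, 80, 0x12345678));
	return 0;
}

Because the seed differs per listener, and the route hash secret is rotated on the interval the new sysctl controls, an attacker cannot precompute which bucket a given source address will hit.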
@@ -900,21 +899,6 @@ int do_fork(unsigned long clone_flags, u goto bad_fork_cleanup_namespace; gr_copy_label(p); p->semundo = NULL; - - /* detect a 'thread' and link to the ptg block for group */ - if ( ((clone_flags & CLONE_VM) && (clone_flags & CLONE_FILES)) || - (clone_flags & CLONE_THREAD)) { - if (current->ptgroup) - atomic_inc(¤t->ptgroup->count); - else { - current->ptgroup = kmalloc(sizeof(struct ptg_struct), GFP_ATOMIC); - if (current->ptgroup) { - atomic_set(¤t->ptgroup->count,2); - atomic_set(¤t->ptgroup->active,1); - } - } - p->ptgroup = current->ptgroup; - } /* Our parent execution domain becomes current domain These must match for thread signalling to apply */ diff -Naurp linux-2.4.20-wolk4.0s/kernel/ksyms.c linux-2.4.20-wolk4.1-fullkernel/kernel/ksyms.c --- linux-2.4.20-wolk4.0s/kernel/ksyms.c 2003-05-15 21:52:48.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/kernel/ksyms.c 2003-05-19 12:51:33.000000000 +0200 @@ -265,6 +265,7 @@ EXPORT_SYMBOL(ll_rw_block); EXPORT_SYMBOL(__submit_bh); EXPORT_SYMBOL(unlock_buffer); EXPORT_SYMBOL(__wait_on_buffer); +EXPORT_SYMBOL(___wait_on_page); EXPORT_SYMBOL(generic_direct_IO); EXPORT_SYMBOL(discard_bh_page); EXPORT_SYMBOL(block_write_full_page); diff -Naurp linux-2.4.20-wolk4.0s/kernel/sched.c linux-2.4.20-wolk4.1-fullkernel/kernel/sched.c --- linux-2.4.20-wolk4.0s/kernel/sched.c 2003-05-15 21:52:48.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/kernel/sched.c 2003-05-18 18:49:41.000000000 +0200 @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -64,8 +63,6 @@ #define MAX_TIMESLICE ( 10 * HZ / 1000) #define CHILD_PENALTY 95 #define PARENT_PENALTY 100 -#define THREAD_PENALTY 50 /* allow threads groups 2 full timeslices */ -#define USER_PENALTY 10 /* allow user 10 full timeslices */ #define PRIO_BONUS_RATIO 25 #define INTERACTIVE_DELTA 2 #define MAX_SLEEP_AVG (2*HZ) @@ -75,11 +72,9 @@ #warning INFO: Server Scheduler Tweaks will be used. #define MIN_TIMESLICE ( 10 * HZ / 1000) -#define MAX_TIMESLICE (300 * HZ / 1000) +#define MAX_TIMESLICE (200 * HZ / 1000) #define CHILD_PENALTY 50 #define PARENT_PENALTY 100 -#define THREAD_PENALTY 50 /* allow threads groups 2 full timeslices */ -#define USER_PENALTY 10 /* allow user 10 full timeslices */ #define PRIO_BONUS_RATIO 25 #define INTERACTIVE_DELTA 2 #define MAX_SLEEP_AVG (2*HZ) @@ -141,20 +136,7 @@ static inline unsigned int task_timeslice(task_t *p) { - int work, slice, weight = 100; - if (p->ptgroup) { - work = atomic_read(&p->ptgroup->active) * THREAD_PENALTY; - if (work > weight) - weight = work; - } - if (p->user->uid) { - work = atomic_read(&p->user->active) * USER_PENALTY; - if (work > weight) - weight = work; - } - slice = 100 * BASE_TIMESLICE(p) / weight; - return slice > MIN_TIMESLICE ? 
slice : MIN_TIMESLICE; -// return BASE_TIMESLICE(p); + return BASE_TIMESLICE(p); } runqueue_t runqueues[NR_CPUS] __cacheline_aligned; @@ -273,16 +255,10 @@ static inline void __activate_task(task_ } __enqueue_task(p, array, parent); rq->nr_running++; - if (p->ptgroup) - atomic_inc(&p->ptgroup->active); - atomic_inc(&p->user->active); } static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) { - atomic_dec(&p->user->active); - if (p->ptgroup) - atomic_dec(&p->ptgroup->active); rq->nr_running--; if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; diff -Naurp linux-2.4.20-wolk4.0s/kernel/softirq.c linux-2.4.20-wolk4.1-fullkernel/kernel/softirq.c --- linux-2.4.20-wolk4.0s/kernel/softirq.c 2003-05-15 21:52:48.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/kernel/softirq.c 2003-05-16 14:21:51.000000000 +0200 @@ -377,7 +377,7 @@ static int ksoftirqd(void * __bind_cpu) if (cpu() != cpu) BUG(); - sprintf(current->comm, "ksoftirqd_CPU%d", bind_cpu); + sprintf(current->comm, "ksoftirqd/%d", bind_cpu); __set_current_state(TASK_INTERRUPTIBLE); mb(); diff -Naurp linux-2.4.20-wolk4.0s/kernel/user.c linux-2.4.20-wolk4.1-fullkernel/kernel/user.c --- linux-2.4.20-wolk4.0s/kernel/user.c 2003-05-15 21:52:48.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/kernel/user.c 2003-05-16 13:54:44.000000000 +0200 @@ -49,7 +49,6 @@ static spinlock_t uidhash_lock = SPIN_LO struct user_struct root_user = { __count: ATOMIC_INIT(1), processes: ATOMIC_INIT(1), - active: ATOMIC_INIT(1), files: ATOMIC_INIT(0) }; @@ -140,7 +139,6 @@ struct user_struct * alloc_uid(uid_t uid #endif /* CONFIG_SCONTEXTS */ atomic_set(&new->__count, 1); atomic_set(&new->processes, 0); - atomic_set(&new->active, 0); atomic_set(&new->files, 0); /* diff -Naurp linux-2.4.20-wolk4.0s/mm/filemap.c linux-2.4.20-wolk4.1-fullkernel/mm/filemap.c --- linux-2.4.20-wolk4.0s/mm/filemap.c 2003-05-15 21:52:48.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/mm/filemap.c 2003-05-16 13:54:46.000000000 +0200 @@ -97,11 +97,15 @@ static inline void remove_page_from_inod { struct address_space * mapping = page->mapping; - mapping->nrpages--; + if (mapping->a_ops->removepage) + mapping->a_ops->removepage(page); + list_del(&page->list); if (!mapping->nrpages) refile_inode(mapping->host); page->mapping = NULL; + wmb(); + mapping->nrpages--; } static inline void remove_page_from_hash_queue(struct page * page) @@ -614,7 +618,7 @@ int filemap_fdatawait(struct address_spa page_cache_get(page); spin_unlock(&pagecache_lock); - wait_on_page(page); + ___wait_on_page(page); if (PageError(page)) ret = -EIO; @@ -828,18 +832,12 @@ void wakeup_page_waiters(struct page * p wake_up(head); } -static void kill_buffers(struct page *page) -{ - if (!PageLocked(page)) - BUG(); - if (page->buffers) - try_to_release_page(page, GFP_NOIO); -} - /* - * Wait for a page to come unlocked. Then try to ditch its buffer_heads. + * Wait for a page to get unlocked. * - * FIXME: Make the ditching dependent on CONFIG_MONSTER_BOX or something. + * This must be called with the caller "holding" the page, + * ie with increased "page->count" so that the page won't + * go away during the wait.. * * The waiting strategy is to get on a waitqueue determined * by hashing. Waiters will then collide, and the newly woken @@ -856,17 +854,27 @@ static void kill_buffers(struct page *pa * be very rare due to the few pages that are actually being * waited on at any given time and the quality of the hash function. 
*/ -void wait_on_page(struct page *page) +void ___wait_on_page(struct page *page) { - lock_page(page); - kill_buffers(page); - unlock_page(page); + wait_queue_head_t *waitqueue = page_waitqueue(page); + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + add_wait_queue(waitqueue, &wait); + do { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!PageLocked(page)) + break; + sync_page(page); + schedule(); + } while (PageLocked(page)); + __set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(waitqueue, &wait); } -EXPORT_SYMBOL(wait_on_page); /* * unlock_page() is the other half of the story just above - * lock_page().. Here a couple of quick checks are done + * __wait_on_page(). Here a couple of quick checks are done * and a couple of flags are set on the page, and then all * of the waiters for all of the pages in the appropriate * wait queue are woken. @@ -1569,11 +1577,6 @@ found_page: } goto page_not_up_to_date; } - if (page->buffers) { - lock_page(page); - kill_buffers(page); - unlock_page(page); - } if (!nonblock) generic_file_readahead(reada_ok, filp, inode, page); page_ok: @@ -1648,7 +1651,6 @@ page_not_up_to_date: /* Did somebody else fill it already? */ if (Page_Uptodate(page)) { - kill_buffers(page); UnlockPage(page); goto page_ok; } @@ -2172,11 +2174,6 @@ retry_find: */ if (!Page_Uptodate(page)) goto page_not_uptodate; - if (page->buffers) { - lock_page(page); - kill_buffers(page); - unlock_page(page); - } success: /* @@ -2235,7 +2232,6 @@ page_not_uptodate: /* Did somebody else get it up-to-date? */ if (Page_Uptodate(page)) { - kill_buffers(page); UnlockPage(page); goto success; } @@ -2263,7 +2259,6 @@ page_not_uptodate: /* Somebody else successfully read it in? */ if (Page_Uptodate(page)) { - kill_buffers(page); UnlockPage(page); goto success; } @@ -3088,7 +3083,6 @@ retry: goto retry; } if (Page_Uptodate(page)) { - kill_buffers(page); UnlockPage(page); goto out; } diff -Naurp linux-2.4.20-wolk4.0s/mm/highmem.c linux-2.4.20-wolk4.1-fullkernel/mm/highmem.c --- linux-2.4.20-wolk4.0s/mm/highmem.c 2003-05-15 21:52:48.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/mm/highmem.c 2003-05-16 13:54:47.000000000 +0200 @@ -21,7 +21,6 @@ #include #include #include -#include /* * Virtual_count is not a pure "count". @@ -212,14 +211,6 @@ static LIST_HEAD(emergency_pages); int nr_emergency_bhs; static LIST_HEAD(emergency_bhs); -int nr_atomic_emergency_pages; -static LIST_HEAD(atomic_emergency_pages); - -int nr_atomic_emergency_bhs; -static LIST_HEAD(atomic_emergency_bhs); - -int atomic_emergency_owner; - /* * Simple bounce buffer support for highmem pages. * This will be moved to the block layer in 2.5. 
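The mm/highmem.c hunks below fold the separate "atomic" bounce reserve back into the single emergency pool: allocation first tries the normal page allocator, then a small spinlock-protected free list, and bounce_end_io() refills that list up to POOL_SIZE before really freeing. The following is a userspace analogue of that pattern, purely as a sketch with illustrative names and sizes rather than the kernel's.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define POOL_SIZE 32            /* illustrative reserve size */

struct pool_item { struct pool_item *next; };

static struct pool_item *reserve;
static int nr_reserve;
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

static void *pool_alloc(size_t size)
{
	void *p = malloc(size);     /* normal allocation first */

	if (p)
		return p;
	pthread_mutex_lock(&pool_lock);
	if (reserve) {              /* fall back to the emergency reserve */
		p = reserve;
		reserve = reserve->next;
		nr_reserve--;
	}
	pthread_mutex_unlock(&pool_lock);
	return p;                   /* caller must wait and retry if still NULL */
}

static void pool_free(void *p, size_t size)
{
	pthread_mutex_lock(&pool_lock);
	if (nr_reserve < POOL_SIZE && size >= sizeof(struct pool_item)) {
		struct pool_item *it = p;   /* refill the reserve instead of freeing */
		it->next = reserve;
		reserve = it;
		nr_reserve++;
		p = NULL;
	}
	pthread_mutex_unlock(&pool_lock);
	free(p);                    /* free(NULL) is a no-op */
}

int main(void)
{
	void *buf = pool_alloc(4096);

	pool_free(buf, 4096);
	printf("reserve now holds %d buffer(s)\n", nr_reserve);
	return 0;
}

The kernel version additionally kicks tq_disk and retries when both sources are empty, since a completing bounce I/O is what puts buffers back on the reserve list.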
@@ -259,66 +250,35 @@ static inline void bounce_end_io (struct struct page *page; struct buffer_head *bh_orig = (struct buffer_head *)(bh->b_private); unsigned long flags; - int atomic = bh_elv_seq(bh); bh_orig->b_end_io(bh_orig, uptodate); page = bh->b_page; spin_lock_irqsave(&emergency_lock, flags); - if (!atomic) { - if (nr_emergency_pages >= POOL_SIZE) - __free_page(page); - else { - /* - * We are abusing page->list to manage - * the highmem emergency pool: - */ - list_add(&page->list, &emergency_pages); - nr_emergency_pages++; - } - - if (nr_emergency_bhs >= POOL_SIZE) { + if (nr_emergency_pages >= POOL_SIZE) + __free_page(page); + else { + /* + * We are abusing page->list to manage + * the highmem emergency pool: + */ + list_add(&page->list, &emergency_pages); + nr_emergency_pages++; + } + + if (nr_emergency_bhs >= POOL_SIZE) { #ifdef HIGHMEM_DEBUG - /* Don't clobber the constructed slab cache */ - init_waitqueue_head(&bh->b_wait); + /* Don't clobber the constructed slab cache */ + init_waitqueue_head(&bh->b_wait); #endif - kmem_cache_free(bh_cachep, bh); - } else { - /* - * Ditto in the bh case, here we abuse b_inode_buffers: - */ - list_add(&bh->b_inode_buffers, &emergency_bhs); - nr_emergency_bhs++; - } + kmem_cache_free(bh_cachep, bh); } else { - if (nr_atomic_emergency_pages >= BLK_ATOMIC_BOUNCE_ENTRIES) - __free_page(page); - else { - /* - * We are abusing page->list to manage - * the highmem emergency pool: - */ - list_add(&page->list, &atomic_emergency_pages); - nr_atomic_emergency_pages++; - } - - if (nr_atomic_emergency_bhs >= BLK_ATOMIC_BOUNCE_ENTRIES) { -#ifdef HIGHMEM_DEBUG - /* Don't clobber the constructed slab cache */ - init_waitqueue_head(&bh->b_wait); -#endif - kmem_cache_free(bh_cachep, bh); - } else { - /* - * Ditto in the bh case, here we abuse b_inode_buffers: - */ - list_add(&bh->b_inode_buffers, &atomic_emergency_bhs); - nr_atomic_emergency_bhs++; - } - BUG_ON(nr_atomic_emergency_pages != nr_atomic_emergency_bhs); - if (nr_atomic_emergency_pages >= BLK_ATOMIC_BOUNCE_ENTRIES) - atomic_emergency_owner = 0; + /* + * Ditto in the bh case, here we abuse b_inode_buffers: + */ + list_add(&bh->b_inode_buffers, &emergency_bhs); + nr_emergency_bhs++; } spin_unlock_irqrestore(&emergency_lock, flags); } @@ -351,24 +311,6 @@ static __init int init_emergency_pool(vo list_add(&bh->b_inode_buffers, &emergency_bhs); nr_emergency_bhs++; } - while (nr_atomic_emergency_pages < BLK_ATOMIC_BOUNCE_ENTRIES) { - struct page * page = alloc_page(GFP_ATOMIC); - if (!page) { - printk("couldn't refill highmem emergency pages"); - break; - } - list_add(&page->list, &atomic_emergency_pages); - nr_atomic_emergency_pages++; - } - while (nr_atomic_emergency_bhs < BLK_ATOMIC_BOUNCE_ENTRIES) { - struct buffer_head * bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC); - if (!bh) { - printk("couldn't refill highmem emergency bhs"); - break; - } - list_add(&bh->b_inode_buffers, &atomic_emergency_bhs); - nr_atomic_emergency_bhs++; - } spin_unlock_irq(&emergency_lock); printk("allocated %d pages and %d bhs reserved for the highmem bounces\n", nr_emergency_pages, nr_emergency_bhs); @@ -392,7 +334,7 @@ static void bounce_end_io_read (struct b bounce_end_io(bh, uptodate); } -struct page *alloc_bounce_page (int atomic) +struct page *alloc_bounce_page (void) { struct list_head *tmp; struct page *page = NULL; @@ -424,30 +366,17 @@ repeat_alloc: /* * Try to allocate from the emergency pool. 
*/ + tmp = &emergency_pages; spin_lock_irq(&emergency_lock); - if (!atomic) { - tmp = &emergency_pages; - if (!list_empty(tmp)) { - page = list_entry(tmp->next, struct page, list); - list_del(tmp->next); - nr_emergency_pages--; - } - } else { - tmp = &atomic_emergency_pages; - if ((!atomic_emergency_owner || atomic_emergency_owner == atomic) && - !list_empty(tmp)) { - page = list_entry(tmp->next, struct page, list); - list_del(tmp->next); - nr_atomic_emergency_pages--; - atomic_emergency_owner = atomic; - } + if (!list_empty(tmp)) { + page = list_entry(tmp->next, struct page, list); + list_del(tmp->next); + nr_emergency_pages--; } spin_unlock_irq(&emergency_lock); if (page) return page; - if (atomic) - blk_refile_atomic_queue(atomic); /* we need to wait I/O completion */ run_task_queue(&tq_disk); @@ -456,7 +385,7 @@ repeat_alloc: goto repeat_alloc; } -struct buffer_head *alloc_bounce_bh (int atomic) +struct buffer_head *alloc_bounce_bh (void) { struct list_head *tmp; struct buffer_head *bh = NULL; @@ -488,31 +417,17 @@ repeat_alloc: /* * Try to allocate from the emergency pool. */ + tmp = &emergency_bhs; spin_lock_irq(&emergency_lock); - if (!atomic) { - tmp = &emergency_bhs; - if (!list_empty(tmp)) { - bh = list_entry(tmp->next, struct buffer_head, b_inode_buffers); - list_del(tmp->next); - nr_emergency_bhs--; - } - } else { - tmp = &atomic_emergency_bhs; - if ((!atomic_emergency_owner || atomic_emergency_owner == atomic) && - !list_empty(tmp)) { - bh = list_entry(tmp->next, struct buffer_head, b_inode_buffers); - list_del(tmp->next); - nr_atomic_emergency_bhs--; - atomic_emergency_owner = atomic; - } - + if (!list_empty(tmp)) { + bh = list_entry(tmp->next, struct buffer_head, b_inode_buffers); + list_del(tmp->next); + nr_emergency_bhs--; } spin_unlock_irq(&emergency_lock); if (bh) return bh; - if (atomic) - blk_refile_atomic_queue(atomic); /* we need to wait I/O completion */ run_task_queue(&tq_disk); @@ -529,14 +444,14 @@ struct buffer_head * create_bounce(int r if (!PageHighMem(bh_orig->b_page)) return bh_orig; - bh = alloc_bounce_bh(bh_elv_seq(bh_orig)); + bh = alloc_bounce_bh(); /* * This is wasteful for 1k buffers, but this is a stopgap measure * and we are being ineffective anyway. This approach simplifies * things immensly. On boxes with more than 4GB RAM this should * not be an issue anyway. */ - page = alloc_bounce_page(bh_elv_seq(bh_orig)); + page = alloc_bounce_page(); set_bh_page(bh, page, 0); @@ -564,7 +479,6 @@ struct buffer_head * create_bounce(int r bh->b_end_io = bounce_end_io_read; bh->b_private = (void *)bh_orig; bh->b_rsector = bh_orig->b_rsector; - bh_elv_seq(bh) = bh_elv_seq(bh_orig); #ifdef HIGHMEM_DEBUG memset(&bh->b_wait, -1, sizeof(bh->b_wait)); #endif diff -Naurp linux-2.4.20-wolk4.0s/mm/mmap.c linux-2.4.20-wolk4.1-fullkernel/mm/mmap.c --- linux-2.4.20-wolk4.0s/mm/mmap.c 2003-05-15 21:52:48.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/mm/mmap.c 2003-05-16 14:21:51.000000000 +0200 @@ -390,6 +390,8 @@ static int vma_merge(struct mm_struct * spin_unlock(lock); if (need_unlock) unlock_vma_mappings(next); + if (file) + fput(file); mm->map_count--; kmem_cache_free(vm_area_cachep, next); diff -Naurp linux-2.4.20-wolk4.0s/mm/page_alloc.c linux-2.4.20-wolk4.1-fullkernel/mm/page_alloc.c --- linux-2.4.20-wolk4.0s/mm/page_alloc.c 2003-05-15 21:52:48.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/mm/page_alloc.c 2003-05-16 14:21:51.000000000 +0200 @@ -531,7 +531,7 @@ try_again: /* * Oh well, we didn't succeed. 
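alloc_bounce_page() and alloc_bounce_bh() now share the matching allocation side: try a normal allocation first, fall back to the emergency pool, and if both fail start queued disk I/O and retry until something frees up. A compilable sketch of that strategy, with pool_get() and kick_io() as illustrative stand-ins for the pool lookup and for run_task_queue(&tq_disk):

#include <stdlib.h>

struct bounce_buf { char payload[4096]; };

/* Stubs standing in for the emergency pool and the disk task queue. */
static struct bounce_buf *pool_get(void) { return NULL; }
static void kick_io(void) { }

static struct bounce_buf *alloc_bounce_sketch(void)
{
        for (;;) {
                struct bounce_buf *b = malloc(sizeof(*b));  /* normal allocation first */
                if (b)
                        return b;
                b = pool_get();                             /* then the reserved pool */
                if (b)
                        return b;
                kick_io();                                  /* start pending I/O, then retry */
        }
}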
*/ - if (!(current->flags & PF_MEMALLOC)) { + if (!(current->flags & (PF_MEMALLOC|PF_MEMDIE))) { /* * Are we dealing with a higher order allocation? * diff -Naurp linux-2.4.20-wolk4.0s/mm/vmscan.c linux-2.4.20-wolk4.1-fullkernel/mm/vmscan.c --- linux-2.4.20-wolk4.0s/mm/vmscan.c 2003-05-15 21:52:48.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/mm/vmscan.c 2003-05-16 13:43:23.000000000 +0200 @@ -556,9 +556,6 @@ int refill_inactive_zone(struct zone_str continue; } - if (page->buffers) - try_to_release_page(page, 0); - /* * Do aging on the pages. */ @@ -903,16 +900,6 @@ static int do_try_to_free_pages(unsigned */ ret += kmem_cache_reap(gfp_mask); - if ((gfp_mask & __GFP_WAIT) && (shrink_buffer_cache() > 16)) - ret += kmem_cache_reap(gfp_mask); - - /* - * Hmm.. Cache shrink failed - time to kill something? - * Mhwahahhaha! This is the part I really like. Giggle. - */ - if (!ret && free_low(ANY_ZONE) && (gfp_mask&__GFP_WAIT)) - out_of_memory(); - return ret; } @@ -937,7 +924,7 @@ static int do_try_to_free_pages_kswapd(u for_each_zone(zone) { int worktodo = max(free_low(zone), BATCH_WORK_AMOUNT); if (need_rebalance_laundry(zone)) - rebalance_laundry_zone(zone, worktodo, 0); + ret += rebalance_laundry_zone(zone, worktodo, 0); if (need_rebalance_dirty(zone)) rebalance_dirty_zone(zone, 4 * worktodo, gfp_mask); @@ -963,9 +950,11 @@ static int do_try_to_free_pages_kswapd(u refill_freelist(); - /* Start IO when needed. */ - if (free_plenty(ALL_ZONES) > 0 || free_low(ANY_ZONE) > 0) - run_task_queue(&tq_disk); + /* + * Mhwahahhaha! This is the part I really like. Giggle. + */ + if (!ret && free_min(ANY_ZONE)) + out_of_memory(); return ret; } diff -Naurp linux-2.4.20-wolk4.0s/net/ipsec/Makefile.inc linux-2.4.20-wolk4.1-fullkernel/net/ipsec/Makefile.inc --- linux-2.4.20-wolk4.0s/net/ipsec/Makefile.inc 2003-05-15 21:52:49.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/net/ipsec/Makefile.inc 2003-05-16 13:36:25.000000000 +0200 @@ -11,7 +11,7 @@ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. # -# RCSID $Id: Makefile.inc,v 1.2 2002/11/05 03:00:10 ken Exp $ +# RCSID $Id: Makefile.inc,v 1.3 2003/04/26 02:27:17 ken Exp $ @@ -42,7 +42,7 @@ SHELL=/bin/sh # reside rather than where install puts them, are exempt from this.) # The prefixing is done in this file, so as to have central control over # it; DESTDIR itself should never appear in any other Makefile. -DESTDIR= +DESTDIR?= # "local" part of tree, used in building other pathnames INC_USRLOCAL=/usr/local @@ -92,7 +92,7 @@ RCDIR=$(DESTDIR)$(FINALRCDIR) ### kernel pathnames # Kernel location: where patches are inserted, where kernel builds are done. 
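Two related vmscan.c changes above: rebalance_laundry_zone()'s return value is now accumulated into ret, and the out_of_memory() call moves behind a "!ret && free_min(ANY_ZONE)" test, so the killer only fires when a full pass made no progress and free memory is below the minimum watermark. The sketch below (zone and work types are simplified stand-ins, not the kernel's) shows why accumulating the per-zone progress matters for that test:

struct zone_stub { int id; };

/* Pretend half the requested work got done in this zone. */
static int rebalance_laundry_zone_stub(struct zone_stub *z, int work)
{
        (void)z;
        return work / 2;
}

static int scan_zones(struct zone_stub *zones, int nzones, int work)
{
        int progress = 0;

        for (int i = 0; i < nzones; i++)
                progress += rebalance_laundry_zone_stub(&zones[i], work);

        return progress;        /* the caller only considers OOM when this stays 0 */
}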
-KERNELSRC=/usr/src/linux +KERNELSRC?=/usr/src/linux # things whose existence indicates what kernel version we have DIRIN22=$(KERNELSRC)/net/netlink diff -Naurp linux-2.4.20-wolk4.0s/net/ipsec/Makefile.ver linux-2.4.20-wolk4.1-fullkernel/net/ipsec/Makefile.ver --- linux-2.4.20-wolk4.0s/net/ipsec/Makefile.ver 2003-05-15 21:52:49.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/net/ipsec/Makefile.ver 2003-05-16 13:36:25.000000000 +0200 @@ -1 +1 @@ -IPSECVERSION=super-freeswan-1.99.6.2 +IPSECVERSION=super-freeswan-1.99.7 diff -Naurp linux-2.4.20-wolk4.0s/net/ipsec/alg/ipsec_alg_blowfish.c linux-2.4.20-wolk4.1-fullkernel/net/ipsec/alg/ipsec_alg_blowfish.c --- linux-2.4.20-wolk4.0s/net/ipsec/alg/ipsec_alg_blowfish.c 2003-05-15 21:52:49.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/net/ipsec/alg/ipsec_alg_blowfish.c 2003-05-16 13:36:25.000000000 +0200 @@ -3,7 +3,7 @@ * * Author: JuanJo Ciarlante * - * $Id: ipsec_alg_blowfish.c,v 1.4 2003/02/07 13:14:25 ken Exp $ + * $Id: ipsec_alg_blowfish.c,v 1.5 2003/05/08 13:48:39 jjo Exp $ * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -49,6 +49,7 @@ #define ESP_BLOWFISH_KEY_SZ_MIN 12 /* 96 bit secret key min */ #define ESP_BLOWFISH_KEY_SZ 16 /* 128 bit secret key */ +#define ESP_BLOWFISH_KEY_SZ_MAX 56 /* 448 bit secret key */ #define ESP_BLOWFISH_CBC_BLK_LEN 8 /* block size */ MODULE_AUTHOR("JuanJo Ciarlante "); @@ -94,7 +95,7 @@ static struct ipsec_alg_enc ipsec_alg_BL ixt_name: "blowfish", ixt_blocksize: ESP_BLOWFISH_CBC_BLK_LEN, ixt_keyminbits: ESP_BLOWFISH_KEY_SZ_MIN*8, - ixt_keymaxbits: ESP_BLOWFISH_KEY_SZ*8, + ixt_keymaxbits: ESP_BLOWFISH_KEY_SZ_MAX*8, ixt_e_keylen: ESP_BLOWFISH_KEY_SZ, ixt_e_ctx_size: sizeof(blowfish_context), ixt_e_set_key: _blowfish_set_key, diff -Naurp linux-2.4.20-wolk4.0s/net/ipv4/netfilter/ip_conntrack_core.c linux-2.4.20-wolk4.1-fullkernel/net/ipv4/netfilter/ip_conntrack_core.c --- linux-2.4.20-wolk4.0s/net/ipv4/netfilter/ip_conntrack_core.c 2003-05-15 21:52:51.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/net/ipv4/netfilter/ip_conntrack_core.c 2003-05-16 14:00:34.000000000 +0200 @@ -31,6 +31,8 @@ #include #include #include +#include +#include /* For ERR_PTR(). Yeah, I know... --RR */ #include @@ -47,9 +49,6 @@ #define IP_CONNTRACK_VERSION "2.1" -/* Multiplier used to avoid hash clashes - should be a prime, and better not 2. */ -#define HASH_CONNTRACK_SRC_MULTIPLIER 7 - #if 0 #define DEBUGP printk #else @@ -110,21 +109,19 @@ ip_conntrack_put(struct ip_conntrack *ct nf_conntrack_put(&ct->infos[0]); } -static inline u_int32_t +static int ip_conntrack_hash_rnd_initted; +static unsigned int ip_conntrack_hash_rnd; + +static u_int32_t hash_conntrack(const struct ip_conntrack_tuple *tuple) { #if 0 dump_tuple(tuple); #endif - /* ntohl because more differences in low bits. */ - /* ports must be outside ntohl or else they will add to high bits. */ - /* To ensure that halves of the same connection don't hash - clash, we use a multiplier for the src port. 
*/ - return (ntohl(tuple->src.ip + tuple->dst.ip) - + HASH_CONNTRACK_SRC_MULTIPLIER * ntohs (tuple->src.u.all) - + ntohs (tuple->dst.u.all) - + tuple->dst.protonum) - % ip_conntrack_htable_size; + return (jhash_3words(tuple->src.ip, + (tuple->dst.ip ^ tuple->dst.protonum), + (tuple->src.u.all | (tuple->dst.u.all << 16)), + ip_conntrack_hash_rnd) % ip_conntrack_htable_size); } inline int @@ -643,11 +640,16 @@ init_conntrack(const struct ip_conntrack { struct ip_conntrack *conntrack; struct ip_conntrack_tuple repl_tuple; - size_t hash, repl_hash; + size_t hash; struct ip_conntrack_expect *expected; int i; static unsigned int drop_next = 0; + if (!ip_conntrack_hash_rnd_initted) { + get_random_bytes(&ip_conntrack_hash_rnd, 4); + ip_conntrack_hash_rnd_initted = 1; + } + hash = hash_conntrack(tuple); if (ip_conntrack_max && @@ -671,7 +673,6 @@ init_conntrack(const struct ip_conntrack DEBUGP("Can't invert tuple.\n"); return NULL; } - repl_hash = hash_conntrack(&repl_tuple); conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); if (!conntrack) { @@ -1415,7 +1416,7 @@ int __init ip_conntrack_init(void) ip_conntrack_max = 8 * ip_conntrack_htable_size; printk("ip_conntrack version %s (%u buckets, %d max)" - " - %d bytes per conntrack\n", IP_CONNTRACK_VERSION, + " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION, ip_conntrack_htable_size, ip_conntrack_max, sizeof(struct ip_conntrack)); diff -Naurp linux-2.4.20-wolk4.0s/net/ipv4/route.c linux-2.4.20-wolk4.1-fullkernel/net/ipv4/route.c --- linux-2.4.20-wolk4.0s/net/ipv4/route.c 2003-05-15 21:52:52.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/net/ipv4/route.c 2003-05-16 14:00:34.000000000 +0200 @@ -85,6 +85,7 @@ #include #include #include +#include #include #include #include @@ -117,13 +118,14 @@ int ip_rt_gc_elasticity = 8; int ip_rt_mtu_expires = 10 * 60 * HZ; int ip_rt_min_pmtu = 512 + 20 + 20; int ip_rt_min_advmss = 256; - +int ip_rt_secret_interval = 10 * 60 * HZ; static unsigned long rt_deadline; #define RTprint(a...) printk(KERN_DEBUG a) static struct timer_list rt_flush_timer; static struct timer_list rt_periodic_timer; +static struct timer_list rt_secret_timer; /* * Interface to generic destination cache. @@ -194,19 +196,17 @@ struct rt_hash_bucket { static struct rt_hash_bucket *rt_hash_table; static unsigned rt_hash_mask; static int rt_hash_log; +static unsigned int rt_hash_rnd; struct rt_cache_stat rt_cache_stat[NR_CPUS]; static int rt_intern_hash(unsigned hash, struct rtable *rth, struct rtable **res); -static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos) +static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos) { - unsigned hash = ((daddr & 0xF0F0F0F0) >> 4) | - ((daddr & 0x0F0F0F0F) << 4); - hash ^= saddr ^ tos; - hash ^= (hash >> 16); - return (hash ^ (hash >> 8)) & rt_hash_mask; + return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd) + & rt_hash_mask); } static int rt_cache_get_info(char *buffer, char **start, off_t offset, @@ -479,6 +479,15 @@ void rt_cache_flush(int delay) spin_unlock_bh(&rt_flush_lock); } +static void rt_secret_rebuild(unsigned long dummy) +{ + unsigned long now = jiffies; + + get_random_bytes(&rt_hash_rnd, 4); + rt_cache_flush(0); + mod_timer(&rt_secret_timer, now + ip_rt_secret_interval); +} + /* Short description of GC goals. 
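The hunks above replace predictable arithmetic hash functions (conntrack tuples, the routing cache, and further down the SYN queues) with jhash keyed by a random value picked at boot or first use, and route.c additionally rekeys and flushes on a timer (rt_secret_rebuild) every ip_rt_secret_interval jiffies. The point is that a remote sender can no longer compute which bucket its packets land in, so it cannot force pathologically long hash chains. A small userspace illustration of the keyed-bucket idea; mix3() is only a simplified stand-in for jhash_3words(), and the addresses and ports in main() are arbitrary:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define HTABLE_SIZE 8192

static uint32_t hash_rnd;       /* seeded once at startup, like ip_conntrack_hash_rnd */

/* Simplified stand-in for the kernel's jhash_3words(); not the real function. */
static uint32_t mix3(uint32_t a, uint32_t b, uint32_t c, uint32_t seed)
{
        c += seed;
        a -= b; a -= c; a ^= c >> 13;
        b -= c; b -= a; b ^= a << 8;
        c -= a; c -= b; c ^= b >> 13;
        return c;
}

/* Same shape as the new hash_conntrack(): tuple words mixed with a secret. */
static unsigned int bucket(uint32_t saddr, uint32_t daddr,
                           uint16_t sport, uint16_t dport, uint8_t proto)
{
        return mix3(saddr, daddr ^ proto,
                    (uint32_t)sport | ((uint32_t)dport << 16),
                    hash_rnd) % HTABLE_SIZE;
}

int main(void)
{
        srand((unsigned int)time(NULL));        /* cheap stand-in for get_random_bytes() */
        hash_rnd = ((uint32_t)rand() << 16) ^ (uint32_t)rand();

        printf("bucket: %u\n", bucket(0x0a000001, 0x0a000002, 12345, 80, 6));
        return 0;
}

Without the secret seed, two connections differing only in the low address bits collide predictably; with it, the bucket distribution depends on a value an attacker cannot observe.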
@@ -2454,6 +2463,15 @@ ctl_table ipv4_route_table[] = { mode: 0644, proc_handler: &proc_dointvec, }, + { + ctl_name: NET_IPV4_ROUTE_SECRET_INTERVAL, + procname: "secret_interval", + data: &ip_rt_secret_interval, + maxlen: sizeof(int), + mode: 0644, + proc_handler: &proc_dointvec_jiffies, + strategy: &sysctl_jiffies, + }, { 0 } }; #endif @@ -2464,7 +2482,7 @@ struct ip_rt_acct *ip_rt_acct; /* This code sucks. But you should have seen it before! --RR */ /* IP route accounting ptr for this logical cpu number. */ -#define IP_RT_ACCT_CPU(i) ((u8*)ip_rt_acct + cpu_logical_map(i) * 256) +#define IP_RT_ACCT_CPU(i) (ip_rt_acct + cpu_logical_map(i) * 256) static int ip_rt_acct_read(char *buffer, char **start, off_t offset, int length, int *eof, void *data) @@ -2484,22 +2502,27 @@ static int ip_rt_acct_read(char *buffer, *eof = 1; } - *start = buffer; + offset /= sizeof(u32); + + if (length > 0) { + u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset; + u32 *dst = (u32 *) buffer; - if (length > 0) - { /* Copy first cpu. */ - memcpy(buffer, IP_RT_ACCT_CPU(0) + offset, length); + *start = buffer; + memcpy(dst, src, length); /* Add the other cpus in, one int at a time */ for (i = 1; i < smp_num_cpus; i++) { unsigned int j; + + src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset; + for (j = 0; j < length/4; j++) - ((u32*)buffer)[j] += ((u32*)(IP_RT_ACCT_CPU(i) + offset))[j]; + dst[j] += src[j]; } - return length; } - return 0; + return length; } #endif @@ -2507,6 +2530,9 @@ void __init ip_rt_init(void) { int i, order, goal; + rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^ + (jiffies ^ (jiffies >> 7))); + #ifdef CONFIG_NET_CLS_ROUTE for (order = 0; (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++) @@ -2563,6 +2589,7 @@ void __init ip_rt_init(void) rt_flush_timer.function = rt_run_flush; rt_periodic_timer.function = rt_check_expire; + rt_secret_timer.function = rt_secret_rebuild; /* All the timers, started at system startup tend to synchronize. Perturb it a bit. @@ -2571,6 +2598,10 @@ void __init ip_rt_init(void) ip_rt_gc_interval; add_timer(&rt_periodic_timer); + rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval + + ip_rt_secret_interval; + add_timer(&rt_secret_timer); + proc_net_create ("rt_cache", 0, rt_cache_get_info); proc_net_create ("rt_cache_stat", 0, rt_cache_stat_get_info); #ifdef CONFIG_NET_CLS_ROUTE diff -Naurp linux-2.4.20-wolk4.0s/net/ipv4/tcp.c linux-2.4.20-wolk4.1-fullkernel/net/ipv4/tcp.c --- linux-2.4.20-wolk4.0s/net/ipv4/tcp.c 2003-05-15 21:52:52.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/net/ipv4/tcp.c 2003-05-16 14:00:24.000000000 +0200 @@ -252,6 +252,7 @@ #include #include #include +#include #include #include @@ -553,6 +554,7 @@ int tcp_listen_start(struct sock *sk) for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++) if ((1<max_qlen_log) >= sysctl_max_syn_backlog) break; + get_random_bytes(&lopt->hash_rnd, 4); write_lock_bh(&tp->syn_wait_lock); tp->listen_opt = lopt; @@ -1333,7 +1335,7 @@ void cleanup_rbuf(struct sock *sk, int c __u32 rcv_window_now = tcp_receive_window(tp); /* Optimize, __tcp_select_window() is not cheap. */ - if (2*rcv_window_now <= tp->window_clamp) { + if (2*rcv_window_now < tp->window_clamp) { __u32 new_window = __tcp_select_window(sk); /* Send ACK now, if this read freed lots of space @@ -1341,7 +1343,7 @@ void cleanup_rbuf(struct sock *sk, int c * We can advertise it now, if it is not less than current one. * "Lots" means "at least twice" here. 
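The ip_rt_acct_read() rewrite above switches from byte-offset arithmetic on a u8 pointer to word-indexed access: copy CPU 0's slice of the accounting array into the output buffer, then add every other CPU's slice in one u32 at a time, and return the requested length. A self-contained sketch of that per-CPU aggregation (NCPUS, WORDS and the array are illustrative, and the caller is assumed to keep off + nwords within bounds):

#include <stdint.h>
#include <string.h>

#define NCPUS 4
#define WORDS 256

static uint32_t per_cpu_acct[NCPUS][WORDS];

static void acct_sum(uint32_t *dst, size_t off, size_t nwords)
{
        /* Copy the first CPU's slice... */
        memcpy(dst, &per_cpu_acct[0][off], nwords * sizeof(uint32_t));

        /* ...then add the other CPUs in, one word at a time. */
        for (int cpu = 1; cpu < NCPUS; cpu++)
                for (size_t j = 0; j < nwords; j++)
                        dst[j] += per_cpu_acct[cpu][off + j];
}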
*/ - if(new_window && new_window >= 2*rcv_window_now) + if(new_window && new_window > 2*rcv_window_now) time_to_ack = 1; } } diff -Naurp linux-2.4.20-wolk4.0s/net/ipv4/tcp_ipv4.c linux-2.4.20-wolk4.1-fullkernel/net/ipv4/tcp_ipv4.c --- linux-2.4.20-wolk4.0s/net/ipv4/tcp_ipv4.c 2003-05-15 21:52:52.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/net/ipv4/tcp_ipv4.c 2003-05-16 14:00:34.000000000 +0200 @@ -52,6 +52,7 @@ #include #include #include +#include #include #include @@ -1006,12 +1007,9 @@ static __inline__ int tcp_v4_iif(struct return ((struct rtable*)skb->dst)->rt_iif; } -static __inline__ unsigned tcp_v4_synq_hash(u32 raddr, u16 rport) +static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd) { - unsigned h = raddr ^ rport; - h ^= h>>16; - h ^= h>>8; - return h&(TCP_SYNQ_HSIZE-1); + return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1)); } static struct open_request *tcp_v4_search_req(struct tcp_opt *tp, @@ -1022,7 +1020,7 @@ static struct open_request *tcp_v4_searc struct tcp_listen_opt *lopt = tp->listen_opt; struct open_request *req, **prev; - for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport)]; + for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)]; (req = *prev) != NULL; prev = &req->dl_next) { if (req->rmt_port == rport && @@ -1042,7 +1040,7 @@ static void tcp_v4_synq_add(struct sock { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; struct tcp_listen_opt *lopt = tp->listen_opt; - unsigned h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port); + u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd); req->expires = jiffies + TCP_TIMEOUT_INIT; req->retrans = 0; diff -Naurp linux-2.4.20-wolk4.0s/net/ipv4/tcp_minisocks.c linux-2.4.20-wolk4.1-fullkernel/net/ipv4/tcp_minisocks.c --- linux-2.4.20-wolk4.0s/net/ipv4/tcp_minisocks.c 2003-05-15 21:52:52.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/net/ipv4/tcp_minisocks.c 2003-05-16 13:54:44.000000000 +0200 @@ -452,6 +452,8 @@ static void SMP_TIMER_NAME(tcp_twkill)(u while((tw = tcp_tw_death_row[tcp_tw_death_row_slot]) != NULL) { tcp_tw_death_row[tcp_tw_death_row_slot] = tw->next_death; + if(tw->next_death) + tw->next_death->pprev_death = tw->pprev_death; tw->pprev_death = NULL; spin_unlock(&tw_death_lock); diff -Naurp linux-2.4.20-wolk4.0s/net/ipv6/tcp_ipv6.c linux-2.4.20-wolk4.1-fullkernel/net/ipv6/tcp_ipv6.c --- linux-2.4.20-wolk4.0s/net/ipv6/tcp_ipv6.c 2003-05-15 21:52:52.000000000 +0200 +++ linux-2.4.20-wolk4.1-fullkernel/net/ipv6/tcp_ipv6.c 2003-05-16 14:00:34.000000000 +0200 @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -357,12 +358,11 @@ __inline__ struct sock *tcp_v6_lookup(st * Open request hash tables. 
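The tcp_twkill() fix above addresses a classic "pprev"-style list bug: when the head of a death-row chain is removed, the element that becomes the new head must have its back-pointer updated to point at the list slot again, otherwise a later unlink through pprev_death writes through a stale pointer. A minimal sketch of the corrected unlink, modelling only the two link fields involved:

#include <stddef.h>

struct tw_stub {
        struct tw_stub *next_death;
        struct tw_stub **pprev_death;   /* points at whatever points at us */
};

static void unlink_head(struct tw_stub **slot)
{
        struct tw_stub *tw = *slot;

        if (!tw)
                return;
        *slot = tw->next_death;
        if (tw->next_death)             /* the line the patch adds */
                tw->next_death->pprev_death = tw->pprev_death;
        tw->pprev_death = NULL;
        tw->next_death = NULL;
}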
*/ -static __inline__ unsigned tcp_v6_synq_hash(struct in6_addr *raddr, u16 rport) +static u32 tcp_v6_synq_hash(struct in6_addr *raddr, u16 rport, u32 rnd) { - unsigned h = raddr->s6_addr32[3] ^ rport; - h ^= h>>16; - h ^= h>>8; - return h&(TCP_SYNQ_HSIZE-1); + return (jhash_3words(raddr->s6_addr32[0] ^ raddr->s6_addr32[1], + raddr->s6_addr32[2] ^ raddr->s6_addr32[3], + (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1)); } static struct open_request *tcp_v6_search_req(struct tcp_opt *tp, @@ -375,7 +375,7 @@ static struct open_request *tcp_v6_searc struct tcp_listen_opt *lopt = tp->listen_opt; struct open_request *req, **prev; - for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport)]; + for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)]; (req = *prev) != NULL; prev = &req->dl_next) { if (req->rmt_port == rport && @@ -1121,7 +1121,7 @@ static void tcp_v6_synq_add(struct sock { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; struct tcp_listen_opt *lopt = tp->listen_opt; - unsigned h = tcp_v6_synq_hash(&req->af.v6_req.rmt_addr, req->rmt_port); + u32 h = tcp_v6_synq_hash(&req->af.v6_req.rmt_addr, req->rmt_port, lopt->hash_rnd); req->sk = NULL; req->expires = jiffies + TCP_TIMEOUT_INIT; diff -Naurp linux-2.4.20-wolk4.0s/userspace-programs/cap/cap.c linux-2.4.20-wolk4.1-fullkernel/userspace-programs/cap/cap.c --- linux-2.4.20-wolk4.0s/userspace-programs/cap/cap.c 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.4.20-wolk4.1-fullkernel/userspace-programs/cap/cap.c 2003-05-16 14:17:02.000000000 +0200 @@ -0,0 +1,84 @@ +/* $Id: cap.in,v 1.3 2002/07/08 11:14:33 karol Exp $ + * cap - interface for CPU patch; setting CPU limit + * + * Copyright (c) 2002 Karol 'Broege' Golab, TLS-Technologies + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include +#include +#include +#include +#include + +/* ugly hack - syscall number is not constant so it will be set by make */ +#ifndef __NR_setcpucap +#define __NR_setcpucap 260 +#endif + +_syscall3(int, setcpucap, int, which, int, who, int, cap); + +void show_help(void) +{ + printf("Usage: cap \n"); + + exit(0); +} + + +void show_version(void) +{ + printf("This is cap - CPU patch utility, $version: $\n\n"); + printf("Written by Karol 'Broege' Golab\n"); + printf("Copyright (c) 2002 TLS-Technologies\n\n"); + printf("This is free software; see the source for copying conditions.\n"); + printf("There is NO warranty; not even for MERCHANTABILITY\n"); + printf("or FITNESS FOR A PARTICULAR PURPOSE.\n"); + + exit(0); +} + + +void error(const char *msg) +{ + fprintf(stderr, "%s\n", msg); + + exit(1); +} + + +int main(int argc, const char *argv[]) +{ + int pid, perc; + int ret; + char *bad; + + if (argc<3) show_help(); + if (! strcmp(argv[1], "--version")) show_version(); + if (argc!=3) show_help(); + + pid=strtol(argv[1], &bad, 10); + if (! 
argv[1][0] || bad[0]) error("Cannot parse argument 1"); + perc=strtol(argv[2], &bad, 10); + if (! argv[2][0] || bad[0]) error("Cannot parse argument 2"); + + ret=setcpucap(PRIO_PROCESS, pid, perc); + if (ret!=0) error("Priority was not changed"); + return 0; +} + +
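cap is a tiny front end for the setcpucap() syscall added by the CPU-cap patch: it parses a pid and a percentage and issues the call with PRIO_PROCESS, e.g. "cap 1234 50" to cap pid 1234 at roughly 50% CPU. The _syscall3() stub uses 260 only as a fallback number; the build substitutes the real one. On toolchains where _syscall3() is unavailable, an equivalent wrapper can go through syscall(2) instead; this is an illustrative sketch, not part of the patch, and it assumes the patched kernel is running:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/resource.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_setcpucap
#define __NR_setcpucap 260      /* placeholder default, as in cap.c itself */
#endif

/* Same call as the _syscall3() stub, issued via the generic syscall(2) entry. */
static int setcpucap_compat(int which, int who, int cap)
{
        return (int)syscall(__NR_setcpucap, which, who, cap);
}

int main(void)
{
        /* e.g. limit PID 1234 to 50% CPU; fails unless the CPU-cap patch is present */
        if (setcpucap_compat(PRIO_PROCESS, 1234, 50) != 0)
                perror("setcpucap");
        return 0;
}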