diff -urN 2.2.18/CREDITS 2.2.18aa1/CREDITS --- 2.2.18/CREDITS Mon Dec 11 16:57:43 2000 +++ 2.2.18aa1/CREDITS Mon Dec 11 17:20:48 2000 @@ -1441,6 +1441,13 @@ S: 2300 Copenhagen S S: Denmark +N: Heinz Mauelshagen +E: mge@EZ-Darmstadt.Telekom.de +D: Logical Volume Manager +S: Bartningstr. 12 +S: 64289 Darmstadt +S: Germany + N: Mike McLagan E: mike.mclagan@linux.org W: http://www.invlogic.com/~mmclagan diff -urN 2.2.18/Documentation/Configure.help 2.2.18aa1/Documentation/Configure.help --- 2.2.18/Documentation/Configure.help Mon Dec 11 16:57:43 2000 +++ 2.2.18aa1/Documentation/Configure.help Mon Dec 11 17:20:53 2000 @@ -166,6 +166,11 @@ on the Alpha. The only time you would ever not say Y is to say M in order to debug the code. Say Y unless you know what you are doing. +Big memory support +CONFIG_BIGMEM + This option is required if you want to utilize physical memory which + is not covered by the kernel virtual address space (> 1GB). + Normal PC floppy disk support CONFIG_BLK_DEV_FD If you want to use the floppy disk drive(s) of your PC under Linux, @@ -997,6 +1002,30 @@ called on26.o. You must also have a high-level driver for the type of device that you want to support. +Logical Volume Manager (LVM) support +CONFIG_BLK_DEV_LVM + This driver lets you combine several hard disks, hard disk partitions, + multiple devices or even loop devices (for evaluation purposes) into + a volume group. Imagine a volume group as a kind of virtual disk. + Logical volumes, which can be thought of as virtual partitions, + can be created in the volume group. You can resize volume groups and + logical volumes after creation, as your capacity needs change. + Logical volumes are accessed as block devices named + /dev/VolumeGroupName/LogicalVolumeName. + + For details see /usr/src/linux/Documentation/LVM-HOWTO. + + To get the newest software see . + +Logical Volume Manager proc filesystem information +CONFIG_LVM_PROC_FS + If you say Y here, you are able to access overall Logical Volume Manager, + Volume Group, Logical and Physical Volume information in /proc/lvm. + + To use this option, make sure that "proc filesystem support" + (CONFIG_PROC_FS) is enabled too. + + Multiple devices driver support CONFIG_BLK_DEV_MD This driver lets you combine several hard disk partitions into one @@ -1015,6 +1044,13 @@ If unsure, say N. +Autodetect RAID partitions +CONFIG_AUTODETECT_RAID + This feature lets the kernel detect RAID partitions on bootup. + An autodetect RAID partition is a normal partition with partition + type 0xfd. Use this if you want to boot RAID devices, or want to + run them automatically. + Linear (append) mode CONFIG_MD_LINEAR If you say Y here, then your multiple devices driver will be able to @@ -1094,6 +1130,21 @@ If unsure, say Y. +Translucent Block Device Support (EXPERIMENTAL) +CONFIG_MD_TRANSLUCENT + DO NOT USE THIS STUFF YET! + + Currently there is only a placeholder, as the implementation + is not yet usable. + +Hierarchical Storage Management support (EXPERIMENTAL) +CONFIG_MD_HSM + DO NOT USE THIS STUFF YET! + + I have released this so people can comment on the architecture, + but the user-space tools are still unusable, so there is not much + you can do with this yet. + Boot support (linear, striped) CONFIG_MD_BOOT To boot with an initial linear or striped md device you have to @@ -1768,6 +1819,8 @@ K6-3D. - "PPro" for the Cyrix/IBM/National Semiconductor 6x86MX, MII and Intel Pentium II/Pentium Pro. + - "K7" for the AMD Athlon (K7) CPUs with 64-byte L1 cache lines. + - "P4" for the Intel Pentium 4 CPUs with 128-byte L1 cache lines. If you don't know what to do, choose "386". @@ -10459,6 +10512,20 @@ If you think you have a use for such a device (such as periodic data sampling), then say Y here, and read Documentation/rtc.txt for details. + For DEC Alpha users it is highly recommended to say Y here; if you + don't need all the features, you can choose the lightweight version + afterwards. + +Use only lightweight version (no interrupts) +CONFIG_RTC_LIGHT + This option turns off extended features of the RTC driver that deal + with interrupts (periodic signals and alarm). If you only need this + driver to read and set your system hardware clock, say Y here. + If you are on DEC Alpha, enabling this option will allow the kernel + to receive system clock interrupts in the standard, traditional + manner (that is, from the RTC device). The fully featured RTC driver + would move the clock signal source to the PIT (Programmable + Interrupt Timer), like on a PC. AGP/GART support CONFIG_AGP
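The CONFIG_BLK_DEV_LVM text above says LVs appear as ordinary block devices under /dev/VolumeGroupName/LogicalVolumeName. As a minimal userspace sketch of that (not part of this patch, and assuming an LV /dev/test_vg/test_lv like the one created in the LVM-HOWTO below), the standard BLKGETSIZE ioctl reports its size:

    /* Minimal sketch: query the size of an LV block device.
     * Assumes /dev/test_vg/test_lv exists; BLKGETSIZE returns
     * the device size in 512-byte sectors. */
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>

    int main(void)
    {
            unsigned long sectors;
            int fd = open("/dev/test_vg/test_lv", O_RDONLY);

            if (fd < 0 || ioctl(fd, BLKGETSIZE, &sectors) < 0) {
                    perror("/dev/test_vg/test_lv");
                    return 1;
            }
            printf("LV size: %lu MB\n", sectors >> 11); /* 2048 sectors per MB */
            close(fd);
            return 0;
    }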
+ - "P4" for the Intel Pentium4 CPUs with 128 bytes large L1 cachelines. If you don't know what to do, choose "386". @@ -10459,6 +10512,20 @@ If you think you have a use for such a device (such as periodic data sampling), then say Y here, and read Documentation/rtc.txt for details. + For DEC Alpha users it is highly recommended to say Y here; if you + don't need all the features, you can choose the lightweight version + afterwards. + +Use only lightweight version (no interrupts) +CONFIG_RTC_LIGHT + This option turns off extended features of the RTC driver that deal + with interrupts (periodic signals and alarm). If you only need this + driver to read and set your system hardware clock, say Y here. + If you are on DEC Alpha, enabling this option will allow the kernel + to receive system clock interrupts in the standard, traditional + manner (that is, from the RTC device). Fully featured RTC driver + would move the clock signal source to the PIT (Programmable + Interrupt Timer), like on a PC. AGP/GART support CONFIG_AGP diff -urN 2.2.18/Documentation/LVM-HOWTO 2.2.18aa1/Documentation/LVM-HOWTO --- 2.2.18/Documentation/LVM-HOWTO Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/Documentation/LVM-HOWTO Mon Dec 11 17:20:48 2000 @@ -0,0 +1,118 @@ +Heinz Mauelshagen's LVM (Logical Volume Manager) howto. 01/28/1999 + + +Abstract: +--------- +The LVM adds a kind of virtual disks and virtual partitions functionality +to the Linux operating system + +It achieves this by adding an additional layer between the physical peripherals +and the i/o interface in the kernel. + +This allows the concatenation of several disk partitions or total disks +(so-called physical volumes or PVs) or even multiple devices +to form a storage pool (so-called Volume Group or VG) with +allocation units called physical extents (called PE). +You can think of the volume group as a virtual disk. +Please see scenario below. + +Some or all PEs of this VG then can be allocated to so-called Logical Volumes +or LVs in units called logical extents or LEs. +Each LE is mapped to a corresponding PE. +LEs and PEs are equal in size. +Logical volumes are a kind of virtual partitions. + + +The LVs can be used through device special files similar to the known +/dev/sd[a-z]* or /dev/hd[a-z]* named /dev/VolumeGroupName/LogicalVolumeName. + +But going beyond this, you are able to extend or reduce +VGs _AND_ LVs at runtime! + +So... +If for example the capacity of a LV gets too small and your VG containing +this LV is full, you could add another PV to that VG and simply extend +the LV afterwards. +If you reduce or delete a LV you can use the freed capacity for different +LVs in the same VG. + + +The above scenario looks like this: + + /------------------------------------------\ + | /--PV2---\ VG 1 /--PVn---\ | + | |-VGDA---| |-VGDA-- | | + | |PE1PE2..| |PE1PE2..| | + | | | ...... | | | + | | | | | | + | | /-----------------------\ | | + | | \-------LV 1------------/ | | + | | ..PEn| | ..PEn| | + | \--------/ \--------/ | + \------------------------------------------/ + +PV 1 could be /dev/sdc1 sized 3GB +PV n could be /dev/sde1 sized 4GB +VG 1 could be test_vg +LV 1 could be /dev/test_vg/test_lv +VGDA is the volume group descriptor area holding the LVM metadata +PE1 up to PEn is the number of physical extents on each disk(partition) + + + +Installation steps see INSTALL and insmod(1)/modprobe(1), kmod/kerneld(8) +to load the logical volume manager module if you did not bind it +into the kernel. + + +Configuration steps for getting the above scenario: + +1. 
+Configuration steps for getting the above scenario: + +1. Set the partition system id to 0xFE on /dev/sdc1 and /dev/sde1. + +2. do a "pvcreate /dev/sd[ce]1" + For testing purposes you can use more than one partition on a disk. + In real use you should not, because in the case of a striped LV + you'll see a performance breakdown. + +3. do a "vgcreate test_vg /dev/sd[ce]1" to create the new VG named "test_vg" + which has the total capacity of both partitions. + vgcreate also activates the new volume group (it transfers the metadata + into the LVM driver in the kernel) so that LVs can be created in the + next step. + +4. do a "lvcreate -L1500 -ntest_lv test_vg" to get a 1500MB linear LV named + "test_lv" and its block device special file "/dev/test_vg/test_lv". + + Or do a "lvcreate -i2 -I4 -l100 -nanother_test_lv test_vg" to get a 100 LE + large logical volume with 2 stripes and a stripe size of 4 KB. + +5. For example, generate a filesystem in one LV with + "mke2fs /dev/test_vg/test_lv" and mount it. + +6. extend /dev/test_vg/test_lv to 1600MB with relative size by + "lvextend -L+100 /dev/test_vg/test_lv" + or with absolute size by + "lvextend -L1600 /dev/test_vg/test_lv" + +7. reduce /dev/test_vg/test_lv to 900 logical extents with relative extents by + "lvreduce -l-700 /dev/test_vg/test_lv" + or with absolute extents by + "lvreduce -l900 /dev/test_vg/test_lv" + +8. rename a VG by deactivating it with + "vgchange -an test_vg" # only VGs with _no_ open LVs can be deactivated! + "vgrename test_vg whatever" + and reactivate it again by + "vgchange -ay whatever" + +9. rename an LV after closing it by + "lvchange -an /dev/whatever/test_lv" # only closed LVs can be deactivated + "lvrename /dev/whatever/test_lv /dev/whatever/whatvolume" + or by + "lvrename whatever test_lv whatvolume" + and reactivate it again by + "lvchange -ay /dev/whatever/whatvolume" + +10. If you have the resize2fs program from e2fsprogs 1.19 or later and/or the + GNU ext2resize tools, you are able to resize the ext2 type filesystems + contained in logical volumes without destroying the data by + "e2fsadm -L+100 /dev/test_vg/another_test_lv"
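Note the difference between -L (size in MB) and -l (size in logical extents) in steps 4, 6 and 7: a -L size is rounded up to a whole number of extents. A back-of-the-envelope sketch of that rounding, assuming the default 4MB PE size (this is not lvcreate's actual source):

    /* Sketch: how an lvcreate/lvextend -L size maps to extents. */
    #include <stdio.h>

    int main(void)
    {
            unsigned long pe_mb = 4;        /* assumed PE size in MB */
            unsigned long size_mb = 1500;   /* e.g. lvcreate -L1500 */
            unsigned long extents = (size_mb + pe_mb - 1) / pe_mb;

            printf("-L%lu -> %lu extents (%luMB allocated)\n",
                   size_mb, extents, extents * pe_mb);
            return 0;
    }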
diff -urN 2.2.18/MAINTAINERS 2.2.18aa1/MAINTAINERS --- 2.2.18/MAINTAINERS Mon Dec 11 16:57:43 2000 +++ 2.2.18aa1/MAINTAINERS Mon Dec 11 17:20:48 2000 @@ -602,6 +602,13 @@ W: http://people.redhat.com/zab/maestro/ S: Supported +LOGICAL VOLUME MANAGER +P: Heinz Mauelshagen +M: linux-LVM@EZ-Darmstadt.Telekom.de +L: linux-LVM@sistina.com +W: http://www.sistina.com/lvm +S: Maintained + M68K P: Jes Sorensen M: Jes.Sorensen@cern.ch diff -urN 2.2.18/Makefile 2.2.18aa1/Makefile --- 2.2.18/Makefile Mon Dec 11 16:57:43 2000 +++ 2.2.18aa1/Makefile Mon Dec 11 17:20:47 2000 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 2 SUBLEVEL = 18 -EXTRAVERSION = +EXTRAVERSION = aa1 ARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ -e s/arm.*/arm/ -e s/sa110/arm/) diff -urN 2.2.18/arch/alpha/config.in 2.2.18aa1/arch/alpha/config.in --- 2.2.18/arch/alpha/config.in Mon Dec 11 16:57:43 2000 +++ 2.2.18aa1/arch/alpha/config.in Mon Dec 11 17:20:47 2000 @@ -21,6 +21,7 @@ mainmenu_option next_comment comment 'General setup' +bool 'BIGMEM support' CONFIG_BIGMEM choice 'Alpha system type' \ "Generic CONFIG_ALPHA_GENERIC \ Alcor/Alpha-XLT CONFIG_ALPHA_ALCOR \ diff -urN 2.2.18/arch/alpha/defconfig 2.2.18aa1/arch/alpha/defconfig --- 2.2.18/arch/alpha/defconfig Sun Apr 2 21:07:48 2000 +++ 2.2.18aa1/arch/alpha/defconfig Mon Dec 11 17:20:49 2000 @@ -255,7 +255,8 @@ # CONFIG_QIC02_TAPE is not set # CONFIG_WATCHDOG is not set # CONFIG_NVRAM is not set -# CONFIG_RTC is not set +CONFIG_RTC=y +CONFIG_RTC_LIGHT=y # # Video For Linux diff -urN 2.2.18/arch/alpha/kernel/alpha_ksyms.c 2.2.18aa1/arch/alpha/kernel/alpha_ksyms.c --- 2.2.18/arch/alpha/kernel/alpha_ksyms.c Mon Dec 11 16:57:43 2000 +++ 2.2.18aa1/arch/alpha/kernel/alpha_ksyms.c Mon Dec 11 17:20:54 2000 @@ -50,6 +50,7 @@ extern void __remqu (void); EXPORT_SYMBOL(init_mm); +EXPORT_SYMBOL(get_new_mmu_context); EXPORT_SYMBOL(alpha_mv); EXPORT_SYMBOL(enable_irq); @@ -169,6 +170,7 @@ EXPORT_SYMBOL(global_bh_lock); EXPORT_SYMBOL(global_bh_count); EXPORT_SYMBOL(synchronize_bh); +EXPORT_SYMBOL(alpha_bh_lock); EXPORT_SYMBOL(global_irq_holder); EXPORT_SYMBOL(__global_cli); EXPORT_SYMBOL(__global_sti); diff -urN 2.2.18/arch/alpha/kernel/irq.c 2.2.18aa1/arch/alpha/kernel/irq.c --- 2.2.18/arch/alpha/kernel/irq.c Tue Jun 13 03:48:12 2000 +++ 2.2.18aa1/arch/alpha/kernel/irq.c Mon Dec 11 17:20:45 2000 @@ -385,6 +385,7 @@ /* This protects BH software state (masks, things like that). 
*/ atomic_t global_bh_lock = ATOMIC_INIT(0); atomic_t global_bh_count = ATOMIC_INIT(0); +spinlock_t alpha_bh_lock = SPIN_LOCK_UNLOCKED; static void *previous_irqholder = NULL; @@ -650,6 +651,7 @@ void synchronize_bh(void) { + mb(); if (atomic_read(&global_bh_count) && !in_interrupt()) wait_on_bh(); } diff -urN 2.2.18/arch/alpha/kernel/irq.h 2.2.18aa1/arch/alpha/kernel/irq.h --- 2.2.18/arch/alpha/kernel/irq.h Sun Apr 2 21:07:48 2000 +++ 2.2.18aa1/arch/alpha/kernel/irq.h Mon Dec 11 19:29:34 2000 @@ -44,7 +44,7 @@ } #define RTC_IRQ 8 -#ifdef CONFIG_RTC +#if defined(CONFIG_RTC) && !defined(CONFIG_RTC_LIGHT) #define TIMER_IRQ 0 /* timer is the pit */ #else #define TIMER_IRQ RTC_IRQ /* timer is the rtc */ diff -urN 2.2.18/arch/alpha/kernel/osf_sys.c 2.2.18aa1/arch/alpha/kernel/osf_sys.c --- 2.2.18/arch/alpha/kernel/osf_sys.c Thu May 4 13:00:36 2000 +++ 2.2.18aa1/arch/alpha/kernel/osf_sys.c Mon Dec 11 17:20:49 2000 @@ -108,7 +108,7 @@ int error; }; -static int osf_filldir(void *__buf, const char *name, int namlen, off_t offset, ino_t ino) +static int osf_filldir(void *__buf, const char *name, int namlen, off_t offset, ino_t ino, unsigned int d_type) { struct osf_dirent *dirent; struct osf_dirent_callback *buf = (struct osf_dirent_callback *) __buf; diff -urN 2.2.18/arch/alpha/kernel/process.c 2.2.18aa1/arch/alpha/kernel/process.c --- 2.2.18/arch/alpha/kernel/process.c Thu May 4 13:00:36 2000 +++ 2.2.18aa1/arch/alpha/kernel/process.c Mon Dec 11 17:20:49 2000 @@ -30,7 +30,7 @@ #include #include -#ifdef CONFIG_RTC +#if defined(CONFIG_RTC) && !defined(CONFIG_RTC_LIGHT) #include #endif @@ -150,7 +150,7 @@ } #endif /* __SMP__ */ -#ifdef CONFIG_RTC +#if defined(CONFIG_RTC) && !defined(CONFIG_RTC_LIGHT) /* Reset rtc to defaults. */ { unsigned char control; @@ -249,8 +249,11 @@ alpha_mv.kill_arch(LINUX_REBOOT_CMD_POWER_OFF, NULL); } -void show_regs(struct pt_regs * regs) +void __show_regs(struct pt_regs * regs) { + extern void dik_show_trace(unsigned long *); + + printk("\nCPU: %d", smp_processor_id()); printk("\nps: %04lx pc: [<%016lx>]\n", regs->ps, regs->pc); printk("rp: [<%016lx>] sp: %p\n", regs->r26, regs+1); printk(" r0: %016lx r1: %016lx r2: %016lx r3: %016lx\n", @@ -265,6 +268,15 @@ regs->r23, regs->r24, regs->r25, regs->r26); printk("r27: %016lx r28: %016lx r29: %016lx hae: %016lx\n", regs->r27, regs->r28, regs->gp, regs->hae); + dik_show_trace(regs+1); +} + +void show_regs(struct pt_regs * regs) +{ + __show_regs(regs); +#ifdef CONFIG_SMP + smp_show_regs(); +#endif } /* diff -urN 2.2.18/arch/alpha/kernel/setup.c 2.2.18aa1/arch/alpha/kernel/setup.c --- 2.2.18/arch/alpha/kernel/setup.c Mon Dec 11 16:57:43 2000 +++ 2.2.18aa1/arch/alpha/kernel/setup.c Mon Dec 11 17:20:52 2000 @@ -25,8 +25,9 @@ #include #include #include +#include -#ifdef CONFIG_RTC +#if defined(CONFIG_RTC) && !defined(CONFIG_RTC_LIGHT) #include #endif #ifdef CONFIG_BLK_DEV_INITRD @@ -65,7 +66,7 @@ #define N(a) (sizeof(a)/sizeof(a[0])) -static unsigned long find_end_memory(void); +static unsigned long find_end_memory(unsigned long); static unsigned long get_memory_end_override(char *); static struct alpha_machine_vector *get_sysvec(long, long, long); static struct alpha_machine_vector *get_sysvec_byname(const char *); @@ -279,26 +280,40 @@ /* Find our memory. 
*/ *memory_start_p = (unsigned long)kernel_end; - *memory_end_p = find_end_memory(); - if (memory_end_override && memory_end_override < *memory_end_p) { - printk("Overriding memory size from %luMB to %luMB\n", - __pa(*memory_end_p) >> 20, - __pa(memory_end_override) >> 20); - *memory_end_p = memory_end_override; - } + *memory_end_p = find_end_memory(memory_end_override); #ifdef CONFIG_BLK_DEV_INITRD initrd_start = INITRD_START; if (initrd_start) { - initrd_end = initrd_start+INITRD_SIZE; + unsigned long initrd_size = INITRD_SIZE; + initrd_end = initrd_start+initrd_size; printk("Initial ramdisk at: 0x%p (%lu bytes)\n", - (void *) initrd_start, INITRD_SIZE); + (void *) initrd_start, initrd_size); if (initrd_end > *memory_end_p) { printk("initrd extends beyond end of memory " "(0x%08lx > 0x%08lx)\ndisabling initrd\n", - initrd_end, (unsigned long) memory_end_p); + initrd_end, *memory_end_p); initrd_start = initrd_end = 0; + } else { + /* move initrd from the middle of the RAM to the + start of the RAM so we won't risk overwriting + initrd while allocating the memory at boot time */ + unsigned long memory_start; + + memory_start = *memory_start_p; + /* + * Alloc initrd in a page aligned region, + * the memory between memory_start and + * the end of the page would be wasted anyway... + */ + memory_start = PAGE_ALIGN(memory_start); + memmove((char *) memory_start, + (char *) initrd_start, initrd_size); + initrd_start = memory_start; + initrd_end = initrd_start + initrd_size; + *memory_start_p = PAGE_ALIGN(initrd_end); + initrd_below_start_ok = 1; } } #endif @@ -312,7 +327,7 @@ /* ??? There is some circumstantial evidence that this needs to be done now rather than later in time_init, which would be more natural. Someone please explain or refute. */ -#if defined(CONFIG_RTC) +#if defined(CONFIG_RTC) && !defined(CONFIG_RTC_LIGHT) rtc_init_pit(); #else alpha_mv.init_pit(); @@ -354,7 +369,7 @@ } static unsigned long __init -find_end_memory(void) +find_end_memory(unsigned long memory_end_override) { int i; unsigned long high = 0; @@ -372,17 +387,49 @@ high = tmp; } - /* Round it up to an even number of pages. */ - high = (high + PAGE_SIZE) & (PAGE_MASK*2); +#ifndef CONFIG_BIGMEM +#define MAX_MEMORY 0x80000000UL +#else +#define LOW_MEMORY 0x80000000UL +#define MAX_MEMORY (VMALLOC_START-PAGE_OFFSET) +#endif /* Enforce maximum of 2GB even if there is more, * but only if the platform (support) cannot handle it. 
*/ - if (high > 0x80000000UL) { - printk("Cropping memory from %luMB to 2048MB\n", high >> 20); - high = 0x80000000UL; + if (high > MAX_MEMORY) { + printk("Cropping memory from %luMB to %luMB\n", + high>>20, MAX_MEMORY>>20); + high = MAX_MEMORY; } + if (memory_end_override && memory_end_override < high) { + printk("Overriding memory size from %luMB to %luMB\n", + high >> 20, memory_end_override >> 20); + high = memory_end_override; + } + +#ifdef CONFIG_BIGMEM + bigmem_start = bigmem_end = high; + if (high > LOW_MEMORY) + { + high = bigmem_start = LOW_MEMORY; + printk(KERN_NOTICE "%luMB BIGMEM available\n", + (bigmem_end-bigmem_start)>>20); + } +#ifdef BIGMEM_DEBUG + else + { + high -= high/4; + bigmem_start = high; + printk(KERN_NOTICE "emulating %luMB BIGMEM\n", + (bigmem_end-bigmem_start)>>20); + } +#endif + bigmem_start += PAGE_OFFSET; + bigmem_end += PAGE_OFFSET; +#endif + return (unsigned long) __va(high); } @@ -403,7 +450,7 @@ end = end << 30; from++; } - return (unsigned long) __va(end); + return end; } diff -urN 2.2.18/arch/alpha/kernel/smp.c 2.2.18aa1/arch/alpha/kernel/smp.c --- 2.2.18/arch/alpha/kernel/smp.c Mon Dec 11 16:57:43 2000 +++ 2.2.18aa1/arch/alpha/kernel/smp.c Mon Dec 11 17:20:45 2000 @@ -95,8 +95,7 @@ smp_store_cpu_info(int cpuid) { cpu_data[cpuid].loops_per_jiffy = loops_per_jiffy; - cpu_data[cpuid].last_asn - = (cpuid << WIDTH_HARDWARE_ASN) + ASN_FIRST_VERSION; + cpu_data[cpuid].last_asn = ASN_FIRST_VERSION; cpu_data[cpuid].irq_count = 0; cpu_data[cpuid].bh_count = 0; @@ -739,7 +738,14 @@ /* At this point the structure may be gone unless wait is true. */ - (*func)(info); + { + static void ipi_show_regs(void *); + + if (func != ipi_show_regs) + (*func)(info); + else + (*func)((void *) regs); + } /* Notify the sending CPU that the task is done. 
*/ mb(); @@ -856,6 +862,21 @@ } static void +ipi_show_regs(void * param) +{ + struct pt_regs *regs = (struct pt_regs *) param; + + __show_regs(regs); +} + +void +smp_show_regs(void) +{ + if (smp_call_function(ipi_show_regs, NULL, 1, 1)) + printk(KERN_CRIT "smp_show_regs: timed out\n"); +} + +static void ipi_flush_tlb_all(void *ignored) { tbia(); @@ -879,6 +900,8 @@ struct mm_struct *mm = (struct mm_struct *) x; if (mm == current->mm) flush_tlb_current(mm); + else + flush_tlb_other(mm); } void @@ -886,10 +909,17 @@ { if (mm == current->mm) { flush_tlb_current(mm); - if (atomic_read(&mm->count) == 1) + if (atomic_read(&mm->count) == 1) { + int i, cpu, this_cpu = smp_processor_id(); + for (i = 0; i < smp_num_cpus; i++) { + cpu = cpu_logical_map(i); + if (cpu == this_cpu) + continue; + mm->context[cpu] = 0; + } return; - } else - flush_tlb_other(mm); + } + } if (smp_call_function(ipi_flush_tlb_mm, mm, 1, 1)) { printk(KERN_CRIT "flush_tlb_mm: timed out\n"); @@ -906,8 +936,12 @@ ipi_flush_tlb_page(void *x) { struct flush_tlb_page_struct *data = (struct flush_tlb_page_struct *)x; - if (data->mm == current->mm) - flush_tlb_current_page(data->mm, data->vma, data->addr); + struct mm_struct * mm = data->mm; + + if (mm == current->mm) + flush_tlb_current_page(mm, data->vma, data->addr); + else + flush_tlb_other(mm); } void @@ -918,10 +952,17 @@ if (mm == current->mm) { flush_tlb_current_page(mm, vma, addr); - if (atomic_read(&current->mm->count) == 1) + if (atomic_read(&current->mm->count) == 1) { + int i, cpu, this_cpu = smp_processor_id(); + for (i = 0; i < smp_num_cpus; i++) { + cpu = cpu_logical_map(i); + if (cpu == this_cpu) + continue; + mm->context[cpu] = 0; + } return; - } else - flush_tlb_other(mm); + } + } data.vma = vma; data.mm = mm; diff -urN 2.2.18/arch/alpha/kernel/sys_nautilus.c 2.2.18aa1/arch/alpha/kernel/sys_nautilus.c --- 2.2.18/arch/alpha/kernel/sys_nautilus.c Tue Jun 13 03:48:12 2000 +++ 2.2.18aa1/arch/alpha/kernel/sys_nautilus.c Mon Dec 11 17:20:49 2000 @@ -89,7 +89,7 @@ nautilus_kill_arch (int mode, char *restart_cmd) { -#ifdef CONFIG_RTC +#if defined(CONFIG_RTC) && !defined(CONFIG_RTC_LIGHT) /* Reset rtc to defaults. */ { unsigned char control; diff -urN 2.2.18/arch/alpha/kernel/time.c 2.2.18aa1/arch/alpha/kernel/time.c --- 2.2.18/arch/alpha/kernel/time.c Mon Dec 11 16:57:43 2000 +++ 2.2.18aa1/arch/alpha/kernel/time.c Mon Dec 11 17:20:49 2000 @@ -174,7 +174,7 @@ * drivers depend on them being initialized (e.g., joystick driver). */ -#ifdef CONFIG_RTC +#if defined(CONFIG_RTC) && !defined(CONFIG_RTC_LIGHT) void rtc_init_pit (void) { @@ -236,7 +236,7 @@ time_init(void) { void (*irq_handler)(int, void *, struct pt_regs *); - unsigned int year, mon, day, hour, min, sec, cc1, cc2; + unsigned int year, mon, day, hour, min, sec, cc1, cc2, epoch; unsigned long cycle_freq, ppm_error; long diff; @@ -311,16 +311,21 @@ BCD_TO_BIN(mon); BCD_TO_BIN(year); } -#ifdef ALPHA_PRE_V1_2_SRM_CONSOLE - /* - * The meaning of life, the universe, and everything. Plus - * this makes the year come out right on SRM consoles earlier - * than v1.2. 
- */ - year -= 42; -#endif - if ((year += 1900) < 1970) + + /* PC-like is standard; used for year < 20 || year >= 70 */ + epoch = 1900; + if (year >= 20 && year < 48) + /* NT epoch */ + epoch = 1980; + else if (year >= 48 && year < 70) + /* Digital UNIX epoch */ + epoch = 1952; + + printk(KERN_INFO "Using epoch = %d\n", epoch); + + if ((year += epoch) < 1970) year += 100; + xtime.tv_sec = mktime(year, mon, day, hour, min, sec); xtime.tv_usec = 0; @@ -339,6 +344,20 @@ irq_handler = timer_interrupt; if (request_irq(TIMER_IRQ, irq_handler, 0, "timer", NULL)) panic("Could not allocate timer IRQ!"); + do_get_fast_time = do_gettimeofday; +} + +static inline void +timeval_normalize(struct timeval * tv) +{ + time_t __sec; + + __sec = tv->tv_usec / 1000000; + if (__sec) + { + tv->tv_usec %= 1000000; + tv->tv_sec += __sec; + } } /* @@ -389,13 +408,11 @@ #endif usec += delta_usec; - if (usec >= 1000000) { - sec += 1; - usec -= 1000000; - } tv->tv_sec = sec; tv->tv_usec = usec; + + timeval_normalize(tv); } void
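The epoch heuristic introduced in the time.c hunk above is self-contained and can be restated in userspace for checking (same ranges as the patch; the helper name is made up):

    /* The patch's RTC epoch heuristic: two-digit years 00-19 and
     * 70-99 use the PC epoch (1900), 20-47 the NT epoch (1980),
     * 48-69 the Digital UNIX epoch (1952). */
    #include <stdio.h>

    static int rtc_full_year(unsigned int year)
    {
            unsigned int epoch = 1900;      /* PC-like is standard */

            if (year >= 20 && year < 48)
                    epoch = 1980;           /* NT epoch */
            else if (year >= 48 && year < 70)
                    epoch = 1952;           /* Digital UNIX epoch */

            year += epoch;
            if (year < 1970)                /* same wrap as the patch */
                    year += 100;
            return year;
    }

    int main(void)
    {
            unsigned int y[] = { 0, 19, 20, 47, 48, 69, 70, 99 };
            int i;

            for (i = 0; i < 8; i++)
                    printf("RTC %02u -> %d\n", y[i], rtc_full_year(y[i]));
            return 0;
    }

So an RTC reading of 00 yields the year 2000 whichever epoch applies, which is the point of the change.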
diff -urN 2.2.18/arch/alpha/kernel/traps.c 2.2.18aa1/arch/alpha/kernel/traps.c --- 2.2.18/arch/alpha/kernel/traps.c Tue Jun 13 03:48:12 2000 +++ 2.2.18aa1/arch/alpha/kernel/traps.c Mon Dec 11 17:20:45 2000 @@ -269,7 +269,7 @@ } } -static void +void dik_show_trace(unsigned long *sp) { int i = 1; diff -urN 2.2.18/arch/alpha/mm/fault.c 2.2.18aa1/arch/alpha/mm/fault.c --- 2.2.18/arch/alpha/mm/fault.c Tue Sep 5 02:28:38 2000 +++ 2.2.18aa1/arch/alpha/mm/fault.c Mon Dec 11 17:20:46 2000 @@ -41,7 +41,7 @@ get_new_mmu_context(struct task_struct *p, struct mm_struct *mm) { unsigned long new = __get_new_mmu_context(); - mm->context = new; + mm->context[smp_processor_id()] = new; p->tss.asn = new & HARDWARE_ASN_MASK; } @@ -102,7 +102,7 @@ goto good_area; if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; - if (expand_stack(vma, address)) + if (expand_stack(vma, address, NULL)) goto bad_area; /* * Ok, we have a good vm_area for this memory access, so diff -urN 2.2.18/arch/alpha/mm/init.c 2.2.18aa1/arch/alpha/mm/init.c --- 2.2.18/arch/alpha/mm/init.c Tue Jun 13 03:48:12 2000 +++ 2.2.18aa1/arch/alpha/mm/init.c Mon Dec 11 17:20:48 2000 @@ -18,6 +18,7 @@ #ifdef CONFIG_BLK_DEV_INITRD #include #endif +#include #include #include @@ -30,6 +31,11 @@ extern void die_if_kernel(char *,struct pt_regs *,long); extern void show_net_buffers(void); +static unsigned long totalram_pages, totalbig_pages; + +#ifdef CONFIG_BIGMEM +unsigned long bigmem_start, bigmem_end; +#endif struct thread_struct original_pcb; #ifndef __SMP__ @@ -232,7 +238,11 @@ struct memdesc_struct * memdesc; /* initialize mem_map[] */ +#ifndef CONFIG_BIGMEM start_mem = free_area_init(start_mem, end_mem); +#else + start_mem = free_area_init(start_mem, bigmem_end); +#endif /* find free clusters, update mem_map[] accordingly */ memdesc = (struct memdesc_struct *) @@ -247,11 +257,20 @@ if (cluster->usage & 3) continue; pfn = cluster->start_pfn; +#ifndef CONFIG_BIGMEM if (pfn >= MAP_NR(end_mem)) /* if we overrode mem size */ +#else + if (pfn >= MAP_NR(bigmem_end)) +#endif continue; nr = cluster->numpages; +#ifndef CONFIG_BIGMEM if ((pfn + nr) > MAP_NR(end_mem)) /* if override in cluster */ nr = MAP_NR(end_mem) - pfn; +#else + if ((pfn + nr) > MAP_NR(bigmem_end)) /* if override in cluster */ + nr = MAP_NR(bigmem_end) - pfn; +#endif while (nr--) clear_bit(PG_reserved, &mem_map[pfn++].flags); @@ -306,9 +325,20 @@ mem_init(unsigned long start_mem, unsigned long end_mem) { unsigned long tmp; + unsigned long reservedpages = 0; +#ifdef CONFIG_BIGMEM + bigmem_start = PAGE_ALIGN(bigmem_start); + bigmem_end &= PAGE_MASK; +#endif end_mem &= PAGE_MASK; +#ifndef CONFIG_BIGMEM max_mapnr = num_physpages = MAP_NR(end_mem); +#else + max_mapnr = num_physpages = MAP_NR(bigmem_end); + /* cache the bigmem_mapnr */ + bigmem_mapnr = MAP_NR(bigmem_start); +#endif high_memory = (void *) end_mem; start_mem = PAGE_ALIGN(start_mem); @@ -321,11 +351,24 @@ tmp += PAGE_SIZE; } +#ifndef CONFIG_BIGMEM for (tmp = PAGE_OFFSET ; tmp < end_mem ; tmp += PAGE_SIZE) { +#else + for (tmp = PAGE_OFFSET ; tmp < bigmem_end; tmp += PAGE_SIZE) { +#endif if (tmp >= MAX_DMA_ADDRESS) clear_bit(PG_DMA, &mem_map[MAP_NR(tmp)].flags); if (PageReserved(mem_map+MAP_NR(tmp))) + { + reservedpages++; continue; + } +#ifdef CONFIG_BIGMEM + if (tmp >= bigmem_start) { + set_bit(PG_BIGMEM, &mem_map[MAP_NR(tmp)].flags); + totalbig_pages++; + } +#endif atomic_set(&mem_map[MAP_NR(tmp)].count, 1); #ifdef CONFIG_BLK_DEV_INITRD if (initrd_start && tmp >= initrd_start && tmp < initrd_end) @@ -334,8 +377,10 @@ kill_page(tmp); free_page(tmp); } - tmp = nr_free_pages << (PAGE_SHIFT - 10); + tmp = (unsigned long) nr_free_pages << (PAGE_SHIFT - 10); printk("Memory: %luk available\n", tmp); + + totalram_pages = max_mapnr - reservedpages; return; } @@ -359,22 +404,11 @@ void si_meminfo(struct sysinfo *val) { - int i; - - i = max_mapnr; - val->totalram = 0; + val->totalram = totalram_pages << PAGE_SHIFT; val->sharedram = 0; val->freeram = ((unsigned long)nr_free_pages) << PAGE_SHIFT; val->bufferram = buffermem; - while (i-- > 0) { - if (PageReserved(mem_map+i)) - continue; - val->totalram++; - if (!atomic_read(&mem_map[i].count)) - continue; - val->sharedram += atomic_read(&mem_map[i].count) - 1; - } - val->totalram <<= PAGE_SHIFT; - val->sharedram <<= PAGE_SHIFT; + val->totalbig = totalbig_pages << PAGE_SHIFT; + val->freebig = (unsigned long) nr_free_bigpages << PAGE_SHIFT; return; } diff -urN 2.2.18/arch/arm/mm/init.c 2.2.18aa1/arch/arm/mm/init.c --- 2.2.18/arch/arm/mm/init.c Mon Dec 11 16:57:44 2000 +++ 2.2.18aa1/arch/arm/mm/init.c Mon Dec 11 17:20:48 2000 @@ -277,5 +277,7 @@ } val->totalram <<= PAGE_SHIFT; val->sharedram <<= PAGE_SHIFT; + val->totalbig = 0; + val->freebig = 0; } diff -urN 2.2.18/arch/i386/Makefile 2.2.18aa1/arch/i386/Makefile --- 2.2.18/arch/i386/Makefile Wed Aug 2 19:24:47 2000 +++ 2.2.18aa1/arch/i386/Makefile Mon Dec 11 17:20:43 2000 @@ -43,6 +43,14 @@ CFLAGS := $(CFLAGS) -m486 -malign-loops=2 -malign-jumps=2 -malign-functions=2 -DCPU=686 endif +ifdef CONFIG_M686_L1_64 +CFLAGS := $(CFLAGS) -m486 -malign-loops=2 -malign-jumps=2 -malign-functions=2 -DCPU=686 +endif + +ifdef CONFIG_M686_L1_128 +CFLAGS := $(CFLAGS) -m486 -malign-loops=2 -malign-jumps=2 -malign-functions=2 -DCPU=686 +endif + HEAD := arch/i386/kernel/head.o arch/i386/kernel/init_task.o SUBDIRS := $(SUBDIRS) arch/i386/kernel arch/i386/mm arch/i386/lib diff -urN 2.2.18/arch/i386/config.in 2.2.18aa1/arch/i386/config.in --- 2.2.18/arch/i386/config.in Mon Dec 11 16:57:44 2000 +++ 2.2.18aa1/arch/i386/config.in Mon Dec 11 17:20:48 2000 @@ -16,7 +16,9 @@ 486/Cx486 CONFIG_M486 \ 586/K5/5x86/6x86 CONFIG_M586 \ Pentium/K6/TSC CONFIG_M586TSC \ - PPro/6x86MX CONFIG_M686" PPro + PPro/6x86MX CONFIG_M686 \ + K7 CONFIG_M686_L1_64 \ + P4 CONFIG_M686_L1_128" PPro # # Define implied options from the CPU selection here # @@ -26,10 +28,10 @@ define_bool CONFIG_X86_BSWAP y define_bool CONFIG_X86_POPAD_OK y fi -if [ "$CONFIG_M686" = "y" -o "$CONFIG_M586TSC" = "y" ]; then +if [ "$CONFIG_M686" = "y" -o "$CONFIG_M586TSC" = "y" -o "$CONFIG_M686_L1_64" = "y" -o
"$CONFIG_M686_L1_128" = "y" ]; then define_bool CONFIG_X86_TSC y fi -if [ "$CONFIG_M686" = "y" ]; then +if [ "$CONFIG_M686" = "y" -o "$CONFIG_M686_L1_64" = "y" -o "$CONFIG_M686_L1_128" = "y" ]; then define_bool CONFIG_X86_GOOD_APIC y fi @@ -58,6 +60,7 @@ mainmenu_option next_comment comment 'General setup' +bool 'BIGMEM support' CONFIG_BIGMEM bool 'Networking support' CONFIG_NET bool 'PCI support' CONFIG_PCI if [ "$CONFIG_PCI" = "y" ]; then diff -urN 2.2.18/arch/i386/defconfig 2.2.18aa1/arch/i386/defconfig --- 2.2.18/arch/i386/defconfig Mon Dec 11 16:57:44 2000 +++ 2.2.18aa1/arch/i386/defconfig Mon Dec 11 17:20:54 2000 @@ -93,7 +93,15 @@ # # CONFIG_BLK_DEV_LOOP is not set # CONFIG_BLK_DEV_NBD is not set -# CONFIG_BLK_DEV_MD is not set +CONFIG_BLK_DEV_MD=y +CONFIG_AUTODETECT_RAID=y +CONFIG_MD_TRANSLUCENT=y +CONFIG_MD_LINEAR=y +CONFIG_MD_STRIPED=y +CONFIG_MD_MIRRORING=y +CONFIG_MD_RAID5=y +CONFIG_MD_BOOT=y +CONFIG_BLK_DEV_HSM=y # CONFIG_BLK_DEV_RAM is not set # CONFIG_BLK_DEV_XD is not set # CONFIG_BLK_DEV_DAC960 is not set diff -urN 2.2.18/arch/i386/kernel/Makefile 2.2.18aa1/arch/i386/kernel/Makefile --- 2.2.18/arch/i386/kernel/Makefile Mon Dec 11 16:57:44 2000 +++ 2.2.18aa1/arch/i386/kernel/Makefile Mon Dec 11 17:20:44 2000 @@ -15,7 +15,7 @@ O_TARGET := kernel.o O_OBJS := process.o signal.o entry.o traps.o irq.o vm86.o \ ptrace.o ioport.o ldt.o setup.o time.o sys_i386.o \ - bluesmoke.o dmi_scan.o + bluesmoke.o dmi_scan.o i387.o OX_OBJS := i386_ksyms.o MX_OBJS := diff -urN 2.2.18/arch/i386/kernel/entry.S 2.2.18aa1/arch/i386/kernel/entry.S --- 2.2.18/arch/i386/kernel/entry.S Tue Sep 5 02:28:38 2000 +++ 2.2.18aa1/arch/i386/kernel/entry.S Mon Dec 11 17:20:52 2000 @@ -285,6 +285,11 @@ pushl $ SYMBOL_NAME(do_coprocessor_error) jmp error_code +ENTRY(simd_coprocessor_error) + pushl $0 + pushl $ SYMBOL_NAME(do_simd_coprocessor_error) + jmp error_code + ENTRY(device_not_available) pushl $-1 # mark this as an int SAVE_ALL @@ -304,9 +309,14 @@ jmp error_code ENTRY(nmi) + pushl %eax + SAVE_ALL + movl %esp,%edx pushl $0 - pushl $ SYMBOL_NAME(do_nmi) - jmp error_code + pushl %edx + call SYMBOL_NAME(do_nmi) + addl $8,%esp + RESTORE_ALL ENTRY(int3) pushl $0 @@ -569,6 +579,18 @@ .long SYMBOL_NAME(sys_ni_syscall) /* streams1 */ .long SYMBOL_NAME(sys_ni_syscall) /* streams2 */ .long SYMBOL_NAME(sys_vfork) /* 190 */ + .long SYMBOL_NAME(sys_ni_syscall) /* getrlimit */ + .long SYMBOL_NAME(sys_mmap2) /* 192 */ + .long SYMBOL_NAME(sys_truncate64) /* 193 */ + .long SYMBOL_NAME(sys_ftruncate64) /* 194 */ + .long SYMBOL_NAME(sys_stat64) /* 195 */ + .long SYMBOL_NAME(sys_lstat64) /* 196 */ + .long SYMBOL_NAME(sys_fstat64) /* 197 */ + .rept 22 + .long SYMBOL_NAME(sys_ni_syscall) + .endr + .long SYMBOL_NAME(sys_getdents64) /* 220 */ + .long SYMBOL_NAME(sys_fcntl64) /* 221 */ /* * NOTE!! This doesn't have to be exact - we just have @@ -576,6 +598,6 @@ * entries. Don't panic if you notice that this hasn't * been shrunk every time we add a new system call. */ - .rept NR_syscalls-190 + .rept NR_syscalls-221 .long SYMBOL_NAME(sys_ni_syscall) .endr diff -urN 2.2.18/arch/i386/kernel/head.S 2.2.18aa1/arch/i386/kernel/head.S --- 2.2.18/arch/i386/kernel/head.S Mon Jan 17 16:44:33 2000 +++ 2.2.18aa1/arch/i386/kernel/head.S Mon Dec 11 17:20:44 2000 @@ -59,10 +59,13 @@ * NOTE! We have to correct for the fact that we're * not yet offset PAGE_OFFSET.. 
*/ -#define cr4_bits mmu_cr4_features-__PAGE_OFFSET +#define cr4_bits x86_cr4-__PAGE_OFFSET movl %cr4,%eax # Turn on 4Mb pages orl cr4_bits,%eax movl %eax,%cr4 + movl %cr3,%eax # Intel specification clarification says + movl %eax,%cr3 # to do this. Maybe it makes a difference. + # Who knows ? #endif /* * Setup paging (the tables are already set up, just switch them on) @@ -210,21 +213,6 @@ orl $2,%eax # set MP 2: movl %eax,%cr0 call check_x87 -#ifdef __SMP__ - movb ready,%al # First CPU if 0 - orb %al,%al - jz 4f # First CPU skip this stuff - movl %cr4,%eax # Turn on 4Mb pages - orl $16,%eax - movl %eax,%cr4 - movl %cr3,%eax # Intel specification clarification says - movl %eax,%cr3 # to do this. Maybe it makes a difference. - # Who knows ? -#endif -4: -#ifdef __SMP__ - incb ready -#endif lgdt gdt_descr lidt idt_descr ljmp $(__KERNEL_CS),$1f @@ -246,10 +234,6 @@ L6: jmp L6 # main should never return here, but # just in case, we know what happens. - -#ifdef __SMP__ -ready: .byte 0 -#endif /* * We depend on ET to be correct. This checks for 287/387. diff -urN 2.2.18/arch/i386/kernel/i386_ksyms.c 2.2.18aa1/arch/i386/kernel/i386_ksyms.c --- 2.2.18/arch/i386/kernel/i386_ksyms.c Mon Dec 11 16:57:44 2000 +++ 2.2.18aa1/arch/i386/kernel/i386_ksyms.c Mon Dec 11 17:20:54 2000 @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -19,7 +20,6 @@ #include extern void dump_thread(struct pt_regs *, struct user *); -extern int dump_fpu(elf_fpregset_t *); extern spinlock_t rtc_lock; #if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE) @@ -34,6 +34,7 @@ EXPORT_SYMBOL(__verify_write); EXPORT_SYMBOL(dump_thread); EXPORT_SYMBOL(dump_fpu); +EXPORT_SYMBOL(dump_extended_fpu); EXPORT_SYMBOL(__ioremap); EXPORT_SYMBOL(iounmap); EXPORT_SYMBOL(local_bh_count); @@ -97,6 +98,7 @@ EXPORT_SYMBOL(__global_save_flags); EXPORT_SYMBOL(__global_restore_flags); EXPORT_SYMBOL(smp_call_function); +EXPORT_SYMBOL(smp_flush_tlb); #endif #ifdef CONFIG_MCA diff -urN 2.2.18/arch/i386/kernel/i387.c 2.2.18aa1/arch/i386/kernel/i387.c --- 2.2.18/arch/i386/kernel/i387.c Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/arch/i386/kernel/i387.c Mon Dec 11 17:20:44 2000 @@ -0,0 +1,510 @@ +/* + * linux/arch/i386/kernel/i387.c + * + * Copyright (C) 1994 Linus Torvalds + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes <gareth@valinux.com>, May 2000 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define HAVE_FXSR (x86_cr4 & X86_CR4_OSFXSR) +#define HAVE_XMM (x86_cr4 & X86_CR4_OSXMMEXCPT) + +#ifdef CONFIG_MATH_EMULATION +#define HAVE_HWFP (boot_cpu_data.hard_math) +#else +#define HAVE_HWFP 1 +#endif + +/* + * The _current_ task is using the FPU for the first time, + * so initialize it and set the mxcsr to its default + * value at reset if we support XMM instructions, and then + * remember that the current task has used the FPU. + */ +void init_fpu(void) +{ + __asm__("fninit"); + if ( HAVE_XMM ) + load_mxcsr(0x1f80); + + current->used_math = 1; +}
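The 0x1f80 loaded into MXCSR above is the architectural reset default: all six SSE exception classes masked, round-to-nearest. Decoding it in plain C (bit positions per the IA-32 manuals):

    /* MXCSR 0x1f80: bits 7-12 are the exception mask bits
     * (invalid, denormal, zero-divide, overflow, underflow,
     * precision); bits 13-14 are the rounding control. */
    #include <stdio.h>

    int main(void)
    {
            unsigned int mxcsr = 0x1f80;
            const char *mask[] = { "IM", "DM", "ZM", "OM", "UM", "PM" };
            int i;

            for (i = 0; i < 6; i++)
                    printf("%s=%u ", mask[i], (mxcsr >> (7 + i)) & 1);
            printf("RC=%u\n", (mxcsr >> 13) & 3);   /* 0 = round to nearest */
            return 0;
    }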
+ +/* + * FPU lazy state save handling. + */ + +void save_init_fpu( struct task_struct *tsk ) +{ + if ( HAVE_FXSR ) { + asm volatile( "fxsave %0 ; fnclex" + : "=m" (tsk->tss.i387.fxsave) ); + } else { + asm volatile( "fnsave %0 ; fwait" + : "=m" (tsk->tss.i387.fsave) ); + } + tsk->flags &= ~PF_USEDFPU; + stts(); +} + +void restore_fpu( struct task_struct *tsk ) +{ + if ( HAVE_FXSR ) { + asm volatile( "fxrstor %0" + : : "m" (tsk->tss.i387.fxsave) ); + } else { + asm volatile( "frstor %0" + : : "m" (tsk->tss.i387.fsave) ); + } +} + +/* + * FPU tag word conversions. + */ + +static inline unsigned short twd_i387_to_fxsr( unsigned short twd ) +{ + unsigned int tmp; /* to avoid 16 bit prefixes in the code */ + + /* Transform each pair of bits into 01 (valid) or 00 (empty) */ + tmp = ~twd; + tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ + /* and move the valid bits to the lower byte. */ + tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ + tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ + tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ + return tmp; +} + +static inline unsigned long twd_fxsr_to_i387( struct i387_fxsave_struct *fxsave ) +{ + struct _fpxreg *st = NULL; + unsigned long twd = (unsigned long) fxsave->twd; + unsigned long tag; + unsigned long ret = 0xffff0000; + int i; + +#define FPREG_ADDR(f, n) ((char *)&(f)->st_space + (n) * 16); + + for ( i = 0 ; i < 8 ; i++ ) { + if ( twd & 0x1 ) { + st = (struct _fpxreg *) FPREG_ADDR( fxsave, i ); + + switch ( st->exponent & 0x7fff ) { + case 0x7fff: + tag = 2; /* Special */ + break; + case 0x0000: + if ( !st->significand[0] && + !st->significand[1] && + !st->significand[2] && + !st->significand[3] ) { + tag = 1; /* Zero */ + } else { + tag = 2; /* Special */ + } + break; + default: + if ( st->significand[3] & 0x8000 ) { + tag = 0; /* Valid */ + } else { + tag = 2; /* Special */ + } + break; + } + } else { + tag = 3; /* Empty */ + } + ret |= (tag << (2 * i)); + twd = twd >> 1; + } + return ret; +} + +/* + * FPU state interaction. + */ + +unsigned short get_fpu_cwd( struct task_struct *tsk ) +{ + if ( HAVE_FXSR ) { + return tsk->tss.i387.fxsave.cwd; + } else { + return (unsigned short)tsk->tss.i387.fsave.cwd; + } +} + +unsigned short get_fpu_swd( struct task_struct *tsk ) +{ + if ( HAVE_FXSR ) { + return tsk->tss.i387.fxsave.swd; + } else { + return (unsigned short)tsk->tss.i387.fsave.swd; + } +} + +unsigned short get_fpu_twd( struct task_struct *tsk ) +{ + if ( HAVE_FXSR ) { + return tsk->tss.i387.fxsave.twd; + } else { + return (unsigned short)tsk->tss.i387.fsave.twd; + } +} + +unsigned short get_fpu_mxcsr( struct task_struct *tsk ) +{ + if ( HAVE_XMM ) { + return tsk->tss.i387.fxsave.mxcsr; + } else { + return 0x1f80; + } +} + +void set_fpu_cwd( struct task_struct *tsk, unsigned short cwd ) +{ + if ( HAVE_FXSR ) { + tsk->tss.i387.fxsave.cwd = cwd; + } else { + tsk->tss.i387.fsave.cwd = ((long)cwd | 0xffff0000); + } +} + +void set_fpu_swd( struct task_struct *tsk, unsigned short swd ) +{ + if ( HAVE_FXSR ) { + tsk->tss.i387.fxsave.swd = swd; + } else { + tsk->tss.i387.fsave.swd = ((long)swd | 0xffff0000); + } +} + +void set_fpu_twd( struct task_struct *tsk, unsigned short twd ) +{ + if ( HAVE_FXSR ) { + tsk->tss.i387.fxsave.twd = twd_i387_to_fxsr(twd); + } else { + tsk->tss.i387.fsave.twd = ((long)twd | 0xffff0000); + } +} + +void set_fpu_mxcsr( struct task_struct *tsk, unsigned short mxcsr ) +{ + if ( HAVE_XMM ) { + tsk->tss.i387.fxsave.mxcsr = mxcsr; + } +} + +/* + * FXSR floating point environment conversions. 
+ */ + +static inline int convert_fxsr_to_user( struct _fpstate *buf, + struct i387_fxsave_struct *fxsave ) +{ + unsigned long env[7]; + struct _fpreg *to; + struct _fpxreg *from; + int i; + + env[0] = (unsigned long)fxsave->cwd | 0xffff0000; + env[1] = (unsigned long)fxsave->swd | 0xffff0000; + env[2] = twd_fxsr_to_i387(fxsave); + env[3] = fxsave->fip; + env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16); + env[5] = fxsave->foo; + env[6] = fxsave->fos; + + if ( __copy_to_user( buf, env, 7 * sizeof(unsigned long) ) ) + return 1; + + to = &buf->_st[0]; + from = (struct _fpxreg *) &fxsave->st_space[0]; + for ( i = 0 ; i < 8 ; i++, to++, from++ ) { + if ( __copy_to_user( to, from, sizeof(*to) ) ) + return 1; + } + return 0; +} + +static inline int convert_fxsr_from_user( struct i387_fxsave_struct *fxsave, + struct _fpstate *buf ) +{ + unsigned long env[7]; + struct _fpxreg *to; + struct _fpreg *from; + int i; + + if ( __copy_from_user( env, buf, 7 * sizeof(long) ) ) + return 1; + + fxsave->cwd = (unsigned short)(env[0] & 0xffff); + fxsave->swd = (unsigned short)(env[1] & 0xffff); + fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff)); + fxsave->fip = env[3]; + fxsave->fop = (unsigned short)((env[4] & 0xffff0000) >> 16); + fxsave->fcs = (env[4] & 0xffff); + fxsave->foo = env[5]; + fxsave->fos = env[6]; + + to = (struct _fpxreg *) &fxsave->st_space[0]; + from = &buf->_st[0]; + for ( i = 0 ; i < 8 ; i++, to++, from++ ) { + if ( __copy_from_user( to, from, sizeof(*from) ) ) + return 1; + } + return 0; +} + +/* + * Signal frame handlers. + */ + +static inline int save_i387_fsave( struct _fpstate *buf ) +{ + struct task_struct *tsk = current; + + unlazy_fpu( tsk ); + tsk->tss.i387.fsave.status = tsk->tss.i387.fsave.swd; + if ( __copy_to_user( buf, &tsk->tss.i387.fsave, + sizeof(struct i387_fsave_struct) ) ) + return -1; + return 1; +} + +static inline int save_i387_fxsave( struct _fpstate *buf ) +{ + struct task_struct *tsk = current; + int err = 0; + + unlazy_fpu( tsk ); + + if ( convert_fxsr_to_user( buf, &tsk->tss.i387.fxsave ) ) + return -1; + + err |= __put_user( tsk->tss.i387.fxsave.swd, &buf->status ); + err |= __put_user( X86_FXSR_MAGIC, &buf->magic ); + if ( err ) + return -1; + + if ( __copy_to_user( &buf->_fxsr_env[0], &tsk->tss.i387.fxsave, + sizeof(struct i387_fxsave_struct) ) ) + return -1; + return 1; +} + +int save_i387( struct _fpstate *buf ) +{ + if ( !current->used_math ) + return 0; + + /* This will cause a "finit" to be triggered by the next + * attempted FPU operation by the 'current' process. 
+ */ + current->used_math = 0; + + if ( HAVE_HWFP ) { + if ( HAVE_FXSR ) { + return save_i387_fxsave( buf ); + } else { + return save_i387_fsave( buf ); + } + } else { + return save_i387_soft( &current->tss.i387.soft, buf ); + } +} + +static inline int restore_i387_fsave( struct _fpstate *buf ) +{ + struct task_struct *tsk = current; + clear_fpu( tsk ); + return __copy_from_user( &tsk->tss.i387.fsave, buf, + sizeof(struct i387_fsave_struct) ); +} + +static inline int restore_i387_fxsave( struct _fpstate *buf ) +{ + struct task_struct *tsk = current; + clear_fpu( tsk ); + if ( __copy_from_user( &tsk->tss.i387.fxsave, &buf->_fxsr_env[0], + sizeof(struct i387_fxsave_struct) ) ) + return 1; + return convert_fxsr_from_user( &tsk->tss.i387.fxsave, buf ); +} + +int restore_i387( struct _fpstate *buf ) +{ + int err; + + if ( HAVE_HWFP ) { + if ( HAVE_FXSR ) { + err = restore_i387_fxsave( buf ); + } else { + err = restore_i387_fsave( buf ); + } + } else { + err = restore_i387_soft( &current->tss.i387.soft, buf ); + } + current->used_math = 1; + return err; +} + +/* + * ptrace request handlers. + */ + +static inline int get_fpregs_fsave( struct user_i387_struct *buf, + struct task_struct *tsk ) +{ + return __copy_to_user( buf, &tsk->tss.i387.fsave, + sizeof(struct user_i387_struct) ); +} + +static inline int get_fpregs_fxsave( struct user_i387_struct *buf, + struct task_struct *tsk ) +{ + return convert_fxsr_to_user( (struct _fpstate *)buf, + &tsk->tss.i387.fxsave ); +} + +int get_fpregs( struct user_i387_struct *buf, struct task_struct *tsk ) +{ + if ( HAVE_HWFP ) { + if ( HAVE_FXSR ) { + return get_fpregs_fxsave( buf, tsk ); + } else { + return get_fpregs_fsave( buf, tsk ); + } + } else { + return save_i387_soft( &tsk->tss.i387.soft, + (struct _fpstate *)buf ); + } +} + +static inline int set_fpregs_fsave( struct task_struct *tsk, + struct user_i387_struct *buf ) +{ + return __copy_from_user( &tsk->tss.i387.fsave, buf, + sizeof(struct user_i387_struct) ); +} + +static inline int set_fpregs_fxsave( struct task_struct *tsk, + struct user_i387_struct *buf ) +{ + return convert_fxsr_from_user( &tsk->tss.i387.fxsave, + (struct _fpstate *)buf ); +} + +int set_fpregs( struct task_struct *tsk, struct user_i387_struct *buf ) +{ + if ( HAVE_HWFP ) { + if ( HAVE_FXSR ) { + return set_fpregs_fxsave( tsk, buf ); + } else { + return set_fpregs_fsave( tsk, buf ); + } + } else { + return restore_i387_soft( &tsk->tss.i387.soft, + (struct _fpstate *)buf ); + } +} + +int get_fpxregs( struct user_fxsr_struct *buf, struct task_struct *tsk ) +{ + if ( HAVE_FXSR ) { + if (__copy_to_user( (void *)buf, &tsk->tss.i387.fxsave, + sizeof(struct user_fxsr_struct) )) + return -EFAULT; + return 0; + } else { + return -EIO; + } +} + +int set_fpxregs( struct task_struct *tsk, struct user_fxsr_struct *buf ) +{ + if ( HAVE_FXSR ) { + int error; + + error = __copy_from_user(&tsk->tss.i387.fxsave, (void *)buf, + sizeof(struct user_fxsr_struct)); + /* bit 6 and 31-16 must be zero for security reasons */ + tsk->tss.i387.fxsave.mxcsr &= 0xffbf; + + return error ? -EFAULT : 0; + } else { + return -EIO; + } +} + +/* + * FPU state for core dumps. 
+ */ + +static inline void copy_fpu_fsave( struct task_struct *tsk, + struct user_i387_struct *fpu ) +{ + memcpy( fpu, &tsk->tss.i387.fsave, + sizeof(struct user_i387_struct) ); +} + +static inline void copy_fpu_fxsave( struct task_struct *tsk, + struct user_i387_struct *fpu ) +{ + unsigned short *to; + unsigned short *from; + int i; + + memcpy( fpu, &tsk->tss.i387.fxsave, 7 * sizeof(long) ); + + to = (unsigned short *)&fpu->st_space[0]; + from = (unsigned short *)&tsk->tss.i387.fxsave.st_space[0]; + for ( i = 0 ; i < 8 ; i++, to += 5, from += 8 ) { + memcpy( to, from, 5 * sizeof(unsigned short) ); + } +} + +int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu ) +{ + int fpvalid; + struct task_struct *tsk = current; + + fpvalid = tsk->used_math; + if ( fpvalid ) { + unlazy_fpu( tsk ); + if ( HAVE_FXSR ) { + copy_fpu_fxsave( tsk, fpu ); + } else { + copy_fpu_fsave( tsk, fpu ); + } + } + + return fpvalid; +} + +int dump_extended_fpu( struct pt_regs *regs, struct user_fxsr_struct *fpu ) +{ + int fpvalid; + struct task_struct *tsk = current; + + fpvalid = tsk->used_math && HAVE_FXSR; + if ( fpvalid ) { + unlazy_fpu( tsk ); + memcpy( fpu, &tsk->tss.i387.fxsave, + sizeof(struct user_fxsr_struct) ); + } + + return fpvalid; +} diff -urN 2.2.18/arch/i386/kernel/io_apic.c 2.2.18aa1/arch/i386/kernel/io_apic.c --- 2.2.18/arch/i386/kernel/io_apic.c Tue Sep 5 02:28:38 2000 +++ 2.2.18aa1/arch/i386/kernel/io_apic.c Mon Dec 11 17:20:52 2000 @@ -21,6 +21,9 @@ */ #define IO_APIC_BASE(idx) ((volatile int *)__fix_to_virt(FIX_IO_APIC_BASE_0 + idx)) +static int nmi_pin __initdata = -1; +int nmi_irq = -1; + /* * The structure of the IO-APIC: */ @@ -638,6 +641,16 @@ if (!apic && !IO_APIC_IRQ(irq)) continue; + if (irq == nmi_irq) { + entry.delivery_mode = 4; /* broadcast NMI */ + make_8259A_irq(irq); + /* + * Remember which register has the NMI IRQ entry, + * so we can turn it off in case there is some + * screwup + */ + nmi_pin = pin; + } entry.vector = assign_irq_vector(irq); @@ -1196,6 +1209,8 @@ * 0x80, because int 0x80 is hm, kind of importantish. 
;) */ for (i = 0; i < NR_IRQS ; i++) { + if (i == nmi_irq) + continue; if (IO_APIC_VECTOR(i) > 0) { if (IO_APIC_irq_trigger(i)) irq_desc[i].handler = &ioapic_level_irq_type; @@ -1237,6 +1252,8 @@ { int pin1, pin2; + if (nmi_irq != -1) + printk("NMI Watchdog activated on source IRQ %d\n", nmi_irq); pin1 = find_timer_pin(mp_INT); pin2 = find_timer_pin(mp_ExtINT); enable_IO_APIC_irq(0); @@ -1274,6 +1291,8 @@ } } printk(" works.\n"); + if ((nmi_pin != -1) && (nmi_irq == 0)) + printk("NMI Watchdog disabled (source IRQ was 0)!\n"); } } diff -urN 2.2.18/arch/i386/kernel/irq.c 2.2.18aa1/arch/i386/kernel/irq.c --- 2.2.18/arch/i386/kernel/irq.c Tue Jun 13 03:48:12 2000 +++ 2.2.18aa1/arch/i386/kernel/irq.c Mon Dec 11 17:20:44 2000 @@ -381,10 +381,11 @@ static void math_error_irq(int cpl, void *dev_id, struct pt_regs *regs) { + extern void math_error(void *); outb(0,0xF0); if (ignore_irq13 || !boot_cpu_data.hard_math) return; - math_error(); + math_error((void *)regs->eip); } static struct irqaction irq13 = { math_error_irq, 0, 0, "fpu", NULL, NULL }; diff -urN 2.2.18/arch/i386/kernel/irq.h 2.2.18aa1/arch/i386/kernel/irq.h --- 2.2.18/arch/i386/kernel/irq.h Tue Nov 14 03:39:08 2000 +++ 2.2.18aa1/arch/i386/kernel/irq.h Mon Dec 11 17:45:32 2000 @@ -40,7 +40,9 @@ struct hw_interrupt_type *handler; /* handle/enable/disable functions */ struct irqaction *action; /* IRQ action list */ unsigned int depth; /* Disable depth for nested irq disables */ - unsigned int unused[4]; +#ifdef CONFIG_SMP + unsigned int unused[L1_CACHE_BYTES-16]; +#endif } irq_desc_t; /* diff -urN 2.2.18/arch/i386/kernel/mtrr.c 2.2.18aa1/arch/i386/kernel/mtrr.c --- 2.2.18/arch/i386/kernel/mtrr.c Mon Dec 11 16:57:44 2000 +++ 2.2.18aa1/arch/i386/kernel/mtrr.c Mon Dec 11 17:20:48 2000 @@ -484,9 +484,9 @@ static void intel_get_mtrr (unsigned int reg, unsigned long *base, unsigned long *size, mtrr_type *type) { - unsigned long dummy, mask_lo, base_lo; + unsigned long mask_lo, mask_hi, base_lo, base_hi; - rdmsr (MTRRphysMask_MSR(reg), mask_lo, dummy); + rdmsr (MTRRphysMask_MSR(reg), mask_lo, mask_hi); if ((mask_lo & 0x800) == 0) { /* Invalid (i.e. free) range. */ *base = 0; @@ -495,20 +495,17 @@ return; } - rdmsr(MTRRphysBase_MSR(reg), base_lo, dummy); + rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); - /* We ignore the extra address bits (32-35). If someone wants to - run x86 Linux on a machine with >4GB memory, this will be the - least of their problems. */ + /* Work out the shifted address mask. */ + mask_lo = 0xff000000 | mask_hi << (32 - PAGE_SHIFT) + | mask_lo >> PAGE_SHIFT; - /* Clean up mask_lo so it gives the real address mask. */ - mask_lo = (mask_lo & 0xfffff000UL); /* This works correctly if size is a power of two, i.e. a contiguous range. */ - *size = ~(mask_lo - 1); - - *base = (base_lo & 0xfffff000UL); - *type = (base_lo & 0xff); + *size = -mask_lo; + *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; + *type = base_lo & 0xff; } /* End Function intel_get_mtrr */ static void cyrix_get_arr (unsigned int reg, unsigned long *base, @@ -533,13 +530,13 @@ /* Enable interrupts if it was enabled previously */ __restore_flags (flags); shift = ((unsigned char *) base)[1] & 0x0f; - *base &= 0xfffff000UL; + *base >>= PAGE_SHIFT; /* Power of two, at least 4K on ARR0-ARR6, 256K on ARR7 * Note: shift==0xf means 4G, this is unsupported. */ if (shift) - *size = (reg < 7 ? 0x800UL : 0x20000UL) << shift; + *size = (reg < 7 ? 
0x1UL : 0x40UL) << (shift - 1); else *size = 0; @@ -572,7 +569,7 @@ /* Upper dword is region 1, lower is region 0 */ if (reg == 1) low = high; /* The base masks off on the right alignment */ - *base = low & 0xFFFE0000; + *base = (low & 0xFFFE0000) >> PAGE_SHIFT; *type = 0; if (low & 1) *type = MTRR_TYPE_UNCACHABLE; if (low & 2) *type = MTRR_TYPE_WRCOMB; @@ -597,7 +594,7 @@ * *128K ... */ low = (~low) & 0x1FFFC; - *size = (low + 4) << 15; + *size = (low + 4) << (15 - PAGE_SHIFT); return; } /* End Function amd_get_mtrr */ @@ -616,8 +613,8 @@ unsigned i; u32 tb; tb = centaur_ctx->mcr[reg].low & 0xfff; - *base = centaur_ctx->mcr[reg].high & 0xfffff000; - *size = (~(centaur_ctx->mcr[reg].low & 0xfffff000))+1; + *base = centaur_ctx->mcr[reg].high >> PAGE_SHIFT; + *size = -(centaur_ctx->mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; if (*size) { for( i=0; itype_bits[i]==tb) { @@ -654,8 +651,10 @@ } else { - wrmsr (MTRRphysBase_MSR (reg), base | type, 0); - wrmsr (MTRRphysMask_MSR (reg), ~(size - 1) | 0x800, 0); + wrmsr (MTRRphysBase_MSR (reg), base << PAGE_SHIFT | type, + (base & 0xf00000) >> (32 - PAGE_SHIFT)); + wrmsr (MTRRphysMask_MSR (reg), -size << PAGE_SHIFT | 0x800, + (-size & 0xf00000) >> (32 - PAGE_SHIFT)); } if (do_safe) set_mtrr_done (&ctxt); } /* End Function intel_set_mtrr_up */ @@ -669,7 +668,9 @@ arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ /* count down from 32M (ARR0-ARR6) or from 2G (ARR7) */ - size >>= (reg < 7 ? 12 : 18); + if (reg >= 7) + size >>= 6; + size &= 0x7fff; /* make sure arr_size <= 14 */ for(arr_size = 0; size; arr_size++, size >>= 1); @@ -690,6 +691,7 @@ } if (do_safe) set_mtrr_prepare (&ctxt); + base <<= PAGE_SHIFT; setCx86(arr, ((unsigned char *) &base)[3]); setCx86(arr+1, ((unsigned char *) &base)[2]); setCx86(arr+2, (((unsigned char *) &base)[1]) | arr_size); @@ -709,34 +711,36 @@ [RETURNS] Nothing. */ { - u32 low, high; + u32 regs[2]; struct set_mtrr_context ctxt; if (do_safe) set_mtrr_prepare (&ctxt); /* * Low is MTRR0 , High MTRR 1 */ - rdmsr (0xC0000085, low, high); + rdmsr (0xC0000085, regs[0], regs[1]); /* * Blank to disable */ if (size == 0) - *(reg ? &high : &low) = 0; + regs[reg] = 0; else - /* Set the register to the base (already shifted for us), the - type (off by one) and an inverted bitmask of the size - The size is the only odd bit. We are fed say 512K - We invert this and we get 111 1111 1111 1011 but - if you subtract one and invert you get the desired - 111 1111 1111 1100 mask - */ - *(reg ? &high : &low)=(((~(size-1))>>15)&0x0001FFFC)|base|(type+1); + /* Set the register to the base, the type (off by one) and an + inverted bitmask of the size The size is the only odd + bit. We are fed say 512K We invert this and we get 111 1111 + 1111 1011 but if you subtract one and invert you get the + desired 111 1111 1111 1100 mask + + But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! 
*/ + regs[reg] = (-size>>(15-PAGE_SHIFT) & 0x0001FFFC) + | (base<type_bits[type]); + high = base << PAGE_SHIFT; + low = -size << PAGE_SHIFT | centaur_ctx->type_bits[type]; } centaur_ctx->mcr[reg].high = high; centaur_ctx->mcr[reg].low = low; @@ -1058,7 +1062,7 @@ for (i = 0; i < max; ++i) { (*get_mtrr) (i, &lbase, &lsize, &ltype); - if (lsize < 1) return i; + if (lsize == 0) return i; } return -ENOSPC; } /* End Function generic_get_free_region */ @@ -1075,7 +1079,7 @@ unsigned long lbase, lsize; /* If we are to set up a region >32M then look at ARR7 immediately */ - if (size > 0x2000000UL) { + if (size > 0x2000UL) { cyrix_get_arr (7, &lbase, &lsize, &ltype); if (lsize < 1) return 7; /* else try ARR0-ARR6 first */ @@ -1083,11 +1087,11 @@ for (i = 0; i < 7; i++) { cyrix_get_arr (i, &lbase, &lsize, &ltype); - if (lsize < 1) return i; + if (lsize == 0) return i; } /* ARR0-ARR6 isn't free, try ARR7 but its size must be at least 256K */ cyrix_get_arr (i, &lbase, &lsize, &ltype); - if ((lsize < 1) && (size >= 0x40000)) return i; + if ((lsize == 0) && (size >= 0x40)) return i; } return -ENOSPC; } /* End Function cyrix_get_free_region */ @@ -1146,7 +1150,7 @@ /* Fall through */ case X86_VENDOR_CYRIX: case X86_VENDOR_CENTAUR: - if ( (base & 0xfff) || (size & 0xfff) ) + if ( (base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1)) ) { printk ("mtrr: size and base must be multiples of 4 kiB\n"); printk ("mtrr: size: %lx base: %lx\n", size, base); @@ -1159,7 +1163,7 @@ return -EINVAL; } } - else if (base + size < 0x100000) /* Cyrix */ + else if (base + size < 0x100000) /* Not Centaur */ { printk ("mtrr: cannot set region below 1 MiB (0x%lx,0x%lx)\n", base, size); @@ -1181,6 +1185,12 @@ return -EINVAL; /*break;*/ } + + /* For all CPU types, the checks above should have ensured that + base and size are page aligned */ + base >>= PAGE_SHIFT; + size >>= PAGE_SHIFT; + /* If the type is WC, check that this processor supports it */ if ( (type == MTRR_TYPE_WRCOMB) && !have_wrcomb () ) { @@ -1200,7 +1210,8 @@ if ( (base < lbase) || (base + size > lbase + lsize) ) { spin_unlock (&main_lock); - printk ("mtrr: 0x%lx,0x%lx overlaps existing 0x%lx,0x%lx\n", + printk ("mtrr: 0x%lx000,0x%lx000 overlaps existing" + " 0x%lx000,0x%lx000\n", base, size, lbase, lsize); return -EINVAL; } @@ -1210,7 +1221,7 @@ if ((boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR) && (type == MTRR_TYPE_UNCACHABLE)) continue; spin_unlock (&main_lock); - printk ( "mtrr: type mismatch for %lx,%lx old: %s new: %s\n", + printk ( "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", base, size, attrib_to_str (ltype), attrib_to_str (type) ); return -EINVAL; } @@ -1258,7 +1269,8 @@ for (i = 0; i < max; ++i) { (*get_mtrr) (i, &lbase, &lsize, &ltype); - if ( (lbase == base) && (lsize == size) ) + if (lbase < 0x100000 && lbase << PAGE_SHIFT == base && lsize < 0x100000 && lsize << PAGE_SHIFT == size) { reg = i; break; @@ -1267,7 +1279,7 @@ if (reg < 0) { spin_unlock (&main_lock); - printk ("mtrr: no MTRR for %lx,%lx found\n", base, size); + printk ("mtrr: no MTRR for %lx000,%lx000 found\n", base, size); return -EINVAL; } } @@ -1448,7 +1460,16 @@ return -EFAULT; if ( gentry.regnum >= get_num_var_ranges () ) return -EINVAL; (*get_mtrr) (gentry.regnum, &gentry.base, &gentry.size, &type); - gentry.type = type; + + /* Hide entries that go above 4GB */ + if (gentry.base + gentry.size > 0x100000 || gentry.size == 0x100000) + gentry.base = gentry.size = gentry.type = 0; + else { + gentry.base <<= PAGE_SHIFT; + gentry.size <<= PAGE_SHIFT; + gentry.type = type; + } + if (
copy_to_user ( (void *) arg, &gentry, sizeof gentry) ) return -EFAULT; break; @@ -1540,24 +1561,24 @@ for (i = 0; i < max; i++) { (*get_mtrr) (i, &base, &size, &type); - if (size < 1) usage_table[i] = 0; + if (size == 0) usage_table[i] = 0; else { - if (size < 0x100000) + if (size < 0x100000 >> PAGE_SHIFT) { - /* 1MB */ + /* less than 1MB */ factor = 'k'; - size >>= 10; + size <<= PAGE_SHIFT - 10; } else { factor = 'M'; - size >>= 20; + size >>= 20 - PAGE_SHIFT; } sprintf (ascii_buffer + ascii_buf_bytes, - "reg%02i: base=0x%08lx (%4liMB), size=%4li%cB: %s, count=%d\n", - i, base, base>>20, size, factor, + "reg%02i: base=0x%05lx000 (%4liMB), size=%4li%cB: %s, count=%d\n", + i, base, base >> (20 - PAGE_SHIFT), size, factor, attrib_to_str (type), usage_table[i]); ascii_buf_bytes += strlen (ascii_buffer + ascii_buf_bytes); } diff -urN 2.2.18/arch/i386/kernel/process.c 2.2.18aa1/arch/i386/kernel/process.c --- 2.2.18/arch/i386/kernel/process.c Mon Dec 11 16:57:44 2000 +++ 2.2.18aa1/arch/i386/kernel/process.c Mon Dec 11 17:20:46 2000 @@ -2,6 +2,9 @@ * linux/arch/i386/kernel/process.c * * Copyright (C) 1995 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 */ /* @@ -40,6 +43,7 @@ #include #include #include +#include #ifdef CONFIG_MATH_EMULATION #include #endif @@ -384,23 +388,15 @@ void show_regs(struct pt_regs * regs) { + extern void show_registers(struct pt_regs *); long cr0 = 0L, cr2 = 0L, cr3 = 0L; printk("\n"); - printk("EIP: %04x:[<%08lx>]",0xffff & regs->xcs,regs->eip); - if (regs->xcs & 3) - printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); - printk(" EFLAGS: %08lx\n",regs->eflags); - printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", - regs->eax,regs->ebx,regs->ecx,regs->edx); - printk("ESI: %08lx EDI: %08lx EBP: %08lx", - regs->esi, regs->edi, regs->ebp); - printk(" DS: %04x ES: %04x\n", - 0xffff & regs->xds,0xffff & regs->xes); __asm__("movl %%cr0, %0": "=r" (cr0)); __asm__("movl %%cr2, %0": "=r" (cr2)); __asm__("movl %%cr3, %0": "=r" (cr3)); printk("CR0: %08lx CR2: %08lx CR3: %08lx\n", cr0, cr2, cr3); + show_registers(regs); } /* @@ -611,23 +607,6 @@ p->tss.i387 = current->tss.i387; return 0; -} - -/* - * fill in the FPU structure for a core dump. - */ -int dump_fpu (struct pt_regs * regs, struct user_i387_struct* fpu) -{ - int fpvalid; - struct task_struct *tsk = current; - - fpvalid = tsk->used_math; - if (fpvalid) { - unlazy_fpu(tsk); - memcpy(fpu,&tsk->tss.i387.hard,sizeof(*fpu)); - } - - return fpvalid; } /* diff -urN 2.2.18/arch/i386/kernel/ptrace.c 2.2.18aa1/arch/i386/kernel/ptrace.c --- 2.2.18/arch/i386/kernel/ptrace.c Thu May 4 13:00:36 2000 +++ 2.2.18aa1/arch/i386/kernel/ptrace.c Mon Dec 11 17:20:48 2000 @@ -1,8 +1,11 @@ /* ptrace.c */ /* By Ross Biro 1/23/92 */ /* edited by Linus Torvalds */ +/* + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + */ -#include /* for CONFIG_MATH_EMULATION */ #include #include #include @@ -12,12 +15,14 @@ #include #include #include +#include #include #include #include #include #include +#include /* * does not yet catch signals sent when the child dies. 
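A note on the two ptrace.c hunks below: with CONFIG_BIGMEM the child's page can sit above the kernel's permanent direct mapping, so it can no longer be dereferenced directly and is instead mapped into a fixmap slot for the duration of the single access. A minimal sketch of the read side, assuming only the kmap()/kunmap() interface this patch introduces (which takes and returns a kernel virtual address); read_child_word() is a made-up name for illustration:

static unsigned long read_child_word(unsigned long page, unsigned long addr)
{
	unsigned long vaddr, val;

	/* map the (possibly BIGMEM) page just for this access */
	vaddr = kmap(page + (addr & ~PAGE_MASK), KM_READ);
	val = *(unsigned long *) vaddr;
	/* tear the temporary mapping down again */
	kunmap(vaddr, KM_READ);
	return val;
}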
@@ -81,6 +86,7 @@ pmd_t * pgmiddle; pte_t * pgtable; unsigned long page; + unsigned long retval; int fault; repeat: @@ -126,7 +132,10 @@ if (MAP_NR(page) >= max_mapnr) return 0; page += addr & ~PAGE_MASK; - return *(unsigned long *) page; + page = kmap(page, KM_READ); + retval = *(unsigned long *) page; + kunmap(page, KM_READ); + return retval; } /* @@ -196,7 +205,13 @@ } /* this is a hack for non-kernel-mapped video buffers and similar */ if (MAP_NR(page) < max_mapnr) - *(unsigned long *) (page + (addr & ~PAGE_MASK)) = data; + { + unsigned long vaddr; + + vaddr = kmap(page, KM_WRITE); + *(unsigned long *) (vaddr + (addr & ~PAGE_MASK)) = data; + kunmap(vaddr, KM_WRITE); + } /* we're bypassing pagetables, so we have to set the dirty bit ourselves */ /* this should also re-instate whatever read-only mode there was before */ set_pte(pgtable, pte_mkdirty(mk_pte(page, vma->vm_page_prot))); @@ -658,21 +673,11 @@ ret = 0; if ( !child->used_math ) { /* Simulate an empty FPU. */ - child->tss.i387.hard.cwd = 0xffff037f; - child->tss.i387.hard.swd = 0xffff0000; - child->tss.i387.hard.twd = 0xffffffff; + set_fpu_cwd(child, 0x037f); + set_fpu_swd(child, 0x0000); + set_fpu_twd(child, 0xffff); } -#ifdef CONFIG_MATH_EMULATION - if ( boot_cpu_data.hard_math ) { -#endif - __copy_to_user((void *)data, &child->tss.i387.hard, - sizeof(struct user_i387_struct)); -#ifdef CONFIG_MATH_EMULATION - } else { - save_i387_soft(&child->tss.i387.soft, - (struct _fpstate *)data); - } -#endif + get_fpregs((struct user_i387_struct *)data, child); goto out; }; @@ -684,20 +689,38 @@ goto out; } child->used_math = 1; -#ifdef CONFIG_MATH_EMULATION - if ( boot_cpu_data.hard_math ) { -#endif - __copy_from_user(&child->tss.i387.hard, (void *)data, - sizeof(struct user_i387_struct)); -#ifdef CONFIG_MATH_EMULATION - } else { - restore_i387_soft(&child->tss.i387.soft, - (struct _fpstate *)data); - } -#endif + set_fpregs(child, (struct user_i387_struct *)data); ret = 0; goto out; }; + + case PTRACE_GETFPXREGS: { /* Get the child extended FPU state. */ + if (!access_ok(VERIFY_WRITE, (unsigned *)data, + sizeof(struct user_fxsr_struct))) { + ret = -EIO; + goto out; + } + if ( !child->used_math ) { + /* Simulate an empty FPU. */ + set_fpu_cwd(child, 0x037f); + set_fpu_swd(child, 0x0000); + set_fpu_twd(child, 0xffff); + set_fpu_mxcsr(child, 0x1f80); + } + ret = get_fpxregs((struct user_fxsr_struct *)data, child); + goto out; + }; + + case PTRACE_SETFPXREGS: { /* Set the child extended FPU state. */ + if (!access_ok(VERIFY_READ, (unsigned *)data, + sizeof(struct user_fxsr_struct))) { + ret = -EIO; + goto out; + } + child->used_math = 1; + ret = set_fpxregs(child, (struct user_fxsr_struct *)data); + goto out; + }; default: ret = -EIO; diff -urN 2.2.18/arch/i386/kernel/setup.c 2.2.18aa1/arch/i386/kernel/setup.c --- 2.2.18/arch/i386/kernel/setup.c Mon Dec 11 16:57:44 2000 +++ 2.2.18aa1/arch/i386/kernel/setup.c Mon Dec 11 17:20:48 2000 @@ -30,6 +30,8 @@ * * Transmeta CPU detection. H. Peter Anvin , May 2000 * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * * Cleaned up get_model_name(), AMD_model(), added display_cacheinfo(). * Dave Jones , September 2000 * @@ -64,6 +66,7 @@ #ifdef CONFIG_BLK_DEV_RAM #include #endif +#include #include #include #include @@ -81,6 +84,8 @@ char ignore_irq13 = 0; /* set if exception 16 works */ struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; +unsigned long x86_cr4; + /* * Bus types .. 
*/ @@ -116,6 +121,9 @@ extern int _etext, _edata, _end; extern unsigned long cpu_khz; +static int disable_x86_serial_nr __initdata = 1; +int disable_x86_fxsr __initdata = 0; + /* * This is set up by the setup-routine at boot-time */ @@ -393,12 +401,31 @@ #define VMALLOC_RESERVE (64 << 20) /* 64MB for vmalloc */ #define MAXMEM ((unsigned long)(-PAGE_OFFSET-VMALLOC_RESERVE)) +#ifdef CONFIG_BIGMEM + bigmem_start = bigmem_end = memory_end; +#endif if (memory_end > MAXMEM) { +#ifdef CONFIG_BIGMEM +#define MAXBIGMEM ((unsigned long)(~(VMALLOC_RESERVE-1))) + bigmem_start = MAXMEM; + bigmem_end = (memory_end < MAXBIGMEM) ? memory_end : MAXBIGMEM; +#endif memory_end = MAXMEM; +#ifdef CONFIG_BIGMEM + printk(KERN_NOTICE "%ldMB BIGMEM available.\n", + (bigmem_end-bigmem_start)>>20); +#else printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20); +#endif } +#if defined(CONFIG_BIGMEM) && defined(BIGMEM_DEBUG) + else { + memory_end -= memory_end/4; + bigmem_start = memory_end; + } +#endif memory_end += PAGE_OFFSET; *memory_start_p = memory_start; @@ -928,10 +955,10 @@ c->cpuid_level < 0) return; - /* It should be possible for the user to override this. */ - if(c->cpuid_level > 0 && + if(disable_x86_serial_nr && + c->cpuid_level > 0 && (c->x86_vendor == X86_VENDOR_INTEL || c->x86_vendor == X86_VENDOR_TRANSMETA) && - c->x86_capability&(1<<18)) { + c->x86_capability & X86_FEATURE_PN) { /* Disable processor serial number */ unsigned long lo,hi; rdmsr(0x119,lo,hi); @@ -1103,7 +1130,24 @@ } cyrix_model(&boot_cpu_data); } - +/* + * Setup function for serial number stuff + */ + +int __init x86_serial_nr_setup(char * s) +{ + disable_x86_serial_nr = 0; + return 1; +} +__setup("serialnumber", x86_serial_nr_setup); + +int __init x86_fxsr_setup(char * s) +{ + disable_x86_fxsr = 1; + return 1; +} +__setup("nofxsr", x86_fxsr_setup); static char *cpu_vendor_names[] __initdata = { diff -urN 2.2.18/arch/i386/kernel/signal.c 2.2.18aa1/arch/i386/kernel/signal.c --- 2.2.18/arch/i386/kernel/signal.c Sun Apr 2 21:07:48 2000 +++ 2.2.18aa1/arch/i386/kernel/signal.c Mon Dec 11 17:20:44 2000 @@ -4,6 +4,7 @@ * Copyright (C) 1991, 1992 Linus Torvalds * * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson + * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes */ #include @@ -21,6 +22,7 @@ #include #include #include +#include #define DEBUG_SIG 0 @@ -150,29 +152,6 @@ char retcode[8]; }; - -static inline int restore_i387_hard(struct _fpstate *buf) -{ - struct task_struct *tsk = current; - clear_fpu(tsk); - return __copy_from_user(&tsk->tss.i387.hard, buf, sizeof(*buf)); -} - -static inline int restore_i387(struct _fpstate *buf) -{ - int err; -#ifndef CONFIG_MATH_EMULATION - err = restore_i387_hard(buf); -#else - if (boot_cpu_data.hard_math) - err = restore_i387_hard(buf); - else - err = restore_i387_soft(&current->tss.i387.soft, buf); -#endif - current->used_math = 1; - return err; -} - static int restore_sigcontext(struct pt_regs *regs, struct sigcontext *sc, int *peax) { @@ -298,39 +277,6 @@ force_sig(SIGSEGV, current); return 0; } - -/* - * Set up a signal frame. 
- */ - -static inline int save_i387_hard(struct _fpstate * buf) -{ - struct task_struct *tsk = current; - - unlazy_fpu(tsk); - tsk->tss.i387.hard.status = tsk->tss.i387.hard.swd; - if (__copy_to_user(buf, &tsk->tss.i387.hard, sizeof(*buf))) - return -1; - return 1; -} - -static int save_i387(struct _fpstate *buf) -{ - if (!current->used_math) - return 0; - - /* This will cause a "finit" to be triggered by the next - attempted FPU operation by the 'current' process. - */ - current->used_math = 0; - -#ifndef CONFIG_MATH_EMULATION - return save_i387_hard(buf); -#else - return boot_cpu_data.hard_math ? save_i387_hard(buf) - : save_i387_soft(&current->tss.i387.soft, buf); -#endif -} static int setup_sigcontext(struct sigcontext *sc, struct _fpstate *fpstate, diff -urN 2.2.18/arch/i386/kernel/smp.c 2.2.18aa1/arch/i386/kernel/smp.c --- 2.2.18/arch/i386/kernel/smp.c Mon Dec 11 16:57:44 2000 +++ 2.2.18aa1/arch/i386/kernel/smp.c Mon Dec 11 17:20:52 2000 @@ -112,7 +112,7 @@ static volatile unsigned long cpu_callout_map[NR_CPUS] = {0,}; /* We always use 0 the rest is ready for parallel delivery */ volatile unsigned long smp_invalidate_needed; /* Used for the invalidate map that's also checked in the spinlock */ volatile unsigned long kstack_ptr; /* Stack vector for booting CPUs */ -struct cpuinfo_x86 cpu_data[NR_CPUS]; /* Per CPU bogomips and other parameters */ +struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned = { { 0, }, }; /* Per CPU bogomips and other parameters */ static unsigned int num_processors = 1; /* Internal processor count */ unsigned char boot_cpu_id = 0; /* Processor that is doing the boot up */ static int smp_activated = 0; /* Tripped once we need to start cross invalidating */ @@ -708,6 +708,15 @@ c->x86_mask >= 1 && c->x86_mask <= 4 && c->x86_model <= 3) smp_b_stepping=1; /* Remember we have B step Pentia with bugs */ + + { + extern int disable_x86_fxsr; + /* Check that all CPUs support FXSR */ + if (!disable_x86_fxsr && + (x86_cr4 & (X86_CR4_OSFXSR|X86_CR4_OSXMMEXCPT)) && + !(c->x86_capability & (X86_FEATURE_FXSR|X86_FEATURE_XMM))) + panic("To boot use the `nofxsr' kernel parameter"); + } } /* @@ -810,7 +819,6 @@ return memory_start; } -#ifdef CONFIG_X86_TSC /* * TSC synchronization. * @@ -1010,8 +1018,6 @@ } #undef NR_LOOPS -#endif - extern void calibrate_delay(void); void __init smp_callin(void) @@ -1098,12 +1104,11 @@ */ set_bit(cpuid, (unsigned long *)&cpu_callin_map[0]); -#ifdef CONFIG_X86_TSC /* * Synchronize the TSC with the BP */ - synchronize_tsc_ap (); -#endif + if (boot_cpu_data.x86_capability & X86_FEATURE_TSC) + synchronize_tsc_ap (); } int cpucount = 0; @@ -1640,13 +1645,11 @@ smp_done: -#ifdef CONFIG_X86_TSC /* * Synchronize the TSC with the AP */ - if (cpucount) + if (boot_cpu_data.x86_capability & X86_FEATURE_TSC && cpucount) synchronize_tsc_bp(); -#endif } /* @@ -2068,6 +2071,8 @@ */ } +unsigned int apic_timer_irqs[NR_CPUS]; + /* * Local APIC timer interrupt. This is the most natural way for doing * local interrupts, but local timer interrupts can be emulated by @@ -2078,6 +2083,13 @@ */ void smp_apic_timer_interrupt(struct pt_regs * regs) { + extern int nmi_irq; + /* + * the only thing that can lock an NMI is an unACK-ed APIC ... + */ + if (nmi_irq >= 0) + apic_timer_irqs[smp_processor_id()]++; + /* * NOTE! 
We'd better ACK the irq immediately, * because timer handling can be slow, and we diff -urN 2.2.18/arch/i386/kernel/sys_i386.c 2.2.18aa1/arch/i386/kernel/sys_i386.c --- 2.2.18/arch/i386/kernel/sys_i386.c Mon Jan 17 16:44:33 2000 +++ 2.2.18aa1/arch/i386/kernel/sys_i386.c Mon Dec 11 17:20:49 2000 @@ -41,6 +41,42 @@ return error; } +/* common code for old and new mmaps */ +static inline long do_mmap2( + unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff) +{ + int error = -EBADF; + struct file * file = NULL; + + down(&current->mm->mmap_sem); + lock_kernel(); + + flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); + if (!(flags & MAP_ANONYMOUS)) { + file = fget(fd); + if (!file) + goto out; + } + + error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); + + if (file) + fput(file); +out: + unlock_kernel(); + up(&current->mm->mmap_sem); + return error; +} + +asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff) +{ + return do_mmap2(addr, len, prot, flags, fd, pgoff); +} + /* * Perform the select(nd, in, out, ex, tv) and mmap() system * calls. Linux/i386 didn't use to be able to handle more than @@ -59,30 +95,19 @@ asmlinkage int old_mmap(struct mmap_arg_struct *arg) { - int error = -EFAULT; - struct file * file = NULL; struct mmap_arg_struct a; + int err = -EFAULT; if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; + goto out; - down(&current->mm->mmap_sem); - lock_kernel(); - if (!(a.flags & MAP_ANONYMOUS)) { - error = -EBADF; - file = fget(a.fd); - if (!file) - goto out; - } - a.flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); + err = -EINVAL; + if (a.offset & ~PAGE_MASK) + goto out; - error = do_mmap(file, a.addr, a.len, a.prot, a.flags, a.offset); - if (file) - fput(file); + err = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); out: - unlock_kernel(); - up(&current->mm->mmap_sem); - return error; + return err; } extern asmlinkage int sys_select(int, fd_set *, fd_set *, fd_set *, struct timeval *); diff -urN 2.2.18/arch/i386/kernel/time.c 2.2.18aa1/arch/i386/kernel/time.c --- 2.2.18/arch/i386/kernel/time.c Mon Dec 11 16:57:44 2000 +++ 2.2.18aa1/arch/i386/kernel/time.c Mon Dec 11 17:20:46 2000 @@ -239,6 +239,20 @@ #endif +/* FIXME: should be inline but gcc is buggy and breaks */ +static void +timeval_normalize(struct timeval * tv) +{ + time_t __sec; + + __sec = tv->tv_usec / 1000000; + if (__sec) + { + tv->tv_usec %= 1000000; + tv->tv_sec += __sec; + } +} + /* * This version of gettimeofday has microsecond resolution * and better than microsecond precision on fast x86 machines with TSC. 
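A worked example of the helper just added, with made-up values: timeval_normalize() folds whole seconds out of the microsecond field, which is what the gettimeofday hunk below relies on in place of the old open-coded while loop.

	struct timeval tv;

	tv.tv_sec = 10;
	tv.tv_usec = 2300456;	/* 2 seconds and 300456 microseconds */
	timeval_normalize(&tv);
	/* now tv.tv_sec == 12 and tv.tv_usec == 300456 */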
@@ -259,13 +273,10 @@ usec += xtime.tv_usec; read_unlock_irqrestore(&xtime_lock, flags); - while (usec >= 1000000) { - usec -= 1000000; - sec++; - } - tv->tv_sec = sec; tv->tv_usec = usec; + + timeval_normalize(tv); } void do_settimeofday(struct timeval *tv) diff -urN 2.2.18/arch/i386/kernel/traps.c 2.2.18aa1/arch/i386/kernel/traps.c --- 2.2.18/arch/i386/kernel/traps.c Mon Dec 11 16:57:44 2000 +++ 2.2.18aa1/arch/i386/kernel/traps.c Mon Dec 11 17:20:52 2000 @@ -2,6 +2,11 @@ * linux/arch/i386/traps.c * * Copyright (C) 1991, 1992 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + * + * 1998, Ingo Molnar, added NMI-Watchdog driver */ /* @@ -33,6 +38,7 @@ #include #include #include +#include #include @@ -106,6 +112,7 @@ asmlinkage void general_protection(void); asmlinkage void page_fault(void); asmlinkage void coprocessor_error(void); +asmlinkage void simd_coprocessor_error(void); asmlinkage void reserved(void); asmlinkage void alignment_check(void); asmlinkage void spurious_interrupt_bug(void); @@ -121,7 +128,7 @@ #define VMALLOC_OFFSET (8*1024*1024) #define MODULE_RANGE (8*1024*1024) -static void show_registers(struct pt_regs *regs) +void show_registers(struct pt_regs *regs) { int i; int in_kernel = 1; @@ -156,7 +163,7 @@ printk("\nStack: "); stack = (unsigned long *) esp; for(i=0; i < kstack_depth_to_print; i++) { - if (((long) stack & 4095) == 0) + if (((long) stack & 8191) == 0) break; if (i && ((i % 8) == 0)) printk("\n "); @@ -171,7 +178,7 @@ module_start = PAGE_OFFSET + (max_mapnr << PAGE_SHIFT); module_start = ((module_start + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1)); module_end = module_start + MODULE_RANGE; - while (((long) stack & 4095) != 0) { + while (((long) stack & 8191) != 0) { addr = *stack++; /* * If the address is either in the text segment of the @@ -331,12 +338,90 @@ printk("Do you have a strange power saving mode enabled?\n"); } +#ifdef CONFIG_SMP +static int __init setup_nmi_irq(char *str) +{ + extern int nmi_irq; + int ints[11]; + + get_options(str, ints); + if (ints[0] == 1) + nmi_irq = ints[1]; + return 1; +} + +__setup("nmi_irq=", setup_nmi_irq); + +static void nmi_watchdog(struct pt_regs * regs) +{ + /* + * the best way to detect whether a CPU has a 'hard lockup' problem + * is to check its local APIC timer IRQ counts. If they are not + * changing then that CPU has some problem. + * + * as these watchdog NMI IRQs are broadcast to every CPU, here + * we only have to check the current processor. + * + * since NMIs don't listen to _any_ locks, we have to be extremely + * careful not to rely on unsafe variables. The printk might lock + * up though, so we have to break up console_lock first ... + * [when there will be more tty-related locks, break them up + * here too!] + */ + extern spinlock_t console_lock; + extern unsigned int apic_timer_irqs[NR_CPUS]; + static unsigned int last_irq_sums[NR_CPUS], alert_counter[NR_CPUS]; + static spinlock_t nmi_print_lock = SPIN_LOCK_UNLOCKED; + + /* + * Since current-> is always on the stack, and we always switch + * the stack NMI-atomically, it's safe to use smp_processor_id(). + */ + int sum, cpu = smp_processor_id(); + + sum = apic_timer_irqs[cpu]; + + if (last_irq_sums[cpu] == sum) { + /* + * Ayiee, looks like this CPU is stuck ... + * wait a few IRQs (5 seconds) before doing the oops ... + */ + alert_counter[cpu]++; + if (alert_counter[cpu] == 5*HZ) { + spin_lock(&nmi_print_lock); + /* + * We are in trouble anyway, let's at least try + * to get a message out. 
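+ * The (void) spin_trylock()/spin_unlock() pair that follows does + * exactly that: it forces console_lock into the released state even + * if the wedged CPU died holding it, so the printk() of the lockup + * report cannot deadlock on it. 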
+ */ + (void) spin_trylock(&console_lock); + spin_unlock(&console_lock); + printk("NMI Watchdog detected LOCKUP on CPU%d, registers:\n", cpu); + show_registers(regs); + spin_unlock(&nmi_print_lock); + do_exit(SIGSEGV); + } + } else { + last_irq_sums[cpu] = sum; + alert_counter[cpu] = 0; + } +} +#endif + asmlinkage void do_nmi(struct pt_regs * regs, long error_code) { unsigned char reason = inb(0x61); extern atomic_t nmi_counter; +#ifdef CONFIG_SMP + extern int nmi_irq; +#endif atomic_inc(&nmi_counter); +#ifdef CONFIG_SMP + if (nmi_irq >= 0) { + nmi_watchdog(regs); + return; + } +#endif if (reason & 0x80) mem_parity_error(reason, regs); if (reason & 0x40) @@ -425,25 +510,138 @@ * the correct behaviour even in the presence of the asynchronous * IRQ13 behaviour */ -void math_error(void) +void math_error(void *eip) { struct task_struct * task; + siginfo_t info; + unsigned short cwd, swd; /* * Save the info for the exception handler * (this will also clear the error) */ task = current; - save_fpu(task); + save_init_fpu(task); task->tss.trap_no = 16; task->tss.error_code = 0; - force_sig(SIGFPE, task); + info.si_signo = SIGFPE; + info.si_errno = 0; + info.si_code = __SI_FAULT; + info.si_addr = eip; + /* + * (~cwd & swd) will mask out exceptions that are not set to unmasked + * status. 0x3f is the exception bits in these regs, 0x200 is the + * C1 reg you need in case of a stack fault, 0x040 is the stack + * fault bit. We should only be taking one exception at a time, + * so if this combination doesn't produce any single exception, + * then we have a bad program that isn't synchronizing its FPU usage + * and it will suffer the consequences since we won't be able to + * fully reproduce the context of the exception + */ + cwd = get_fpu_cwd(task); + swd = get_fpu_swd(task); + switch (((~cwd) & swd & 0x3f) | (swd & 0x240)) { + case 0x000: + default: + break; + case 0x001: /* Invalid Op */ + case 0x040: /* Stack Fault */ + case 0x240: /* Stack Fault | Direction */ + info.si_code = FPE_FLTINV; + break; + case 0x002: /* Denormalize */ + case 0x010: /* Underflow */ + info.si_code = FPE_FLTUND; + break; + case 0x004: /* Zero Divide */ + info.si_code = FPE_FLTDIV; + break; + case 0x008: /* Overflow */ + info.si_code = FPE_FLTOVF; + break; + case 0x020: /* Precision */ + info.si_code = FPE_FLTRES; + break; + } + force_sig_info(SIGFPE, &info, task); } asmlinkage void do_coprocessor_error(struct pt_regs * regs, long error_code) { ignore_irq13 = 1; - math_error(); + math_error((void *)regs->eip); } + +void simd_math_error(void *eip) +{ + struct task_struct * task; + siginfo_t info; + unsigned short mxcsr; + + /* + * Save the info for the exception handler and clear the error. + */ + task = current; + save_init_fpu(task); + task->tss.trap_no = 19; + task->tss.error_code = 0; + info.si_signo = SIGFPE; + info.si_errno = 0; + info.si_code = __SI_FAULT; + info.si_addr = eip; + /* + * The SIMD FPU exceptions are handled a little differently, as there + * is only a single status/control register. Thus, to determine which + * unmasked exception was caught we must mask the exception mask bits + * at 0x1f80, and then use these to mask the exception bits at 0x3f. 
+ */ + mxcsr = get_fpu_mxcsr(task); + switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { + case 0x000: + default: + break; + case 0x001: /* Invalid Op */ + info.si_code = FPE_FLTINV; + break; + case 0x002: /* Denormalize */ + case 0x010: /* Underflow */ + info.si_code = FPE_FLTUND; + break; + case 0x004: /* Zero Divide */ + info.si_code = FPE_FLTDIV; + break; + case 0x008: /* Overflow */ + info.si_code = FPE_FLTOVF; + break; + case 0x020: /* Precision */ + info.si_code = FPE_FLTRES; + break; + } + force_sig_info(SIGFPE, &info, task); +} + +asmlinkage void do_simd_coprocessor_error(struct pt_regs * regs, + long error_code) +{ + if (cpu_has_xmm) { + /* Handle SIMD FPU exceptions on PIII+ processors. */ + ignore_irq13 = 1; + simd_math_error((void *)regs->eip); + } else { + /* + * Handle strange cache flush from user space exception + * in all other cases. This is undocumented behaviour. + */ + if (regs->eflags & VM_MASK) { + handle_vm86_fault((struct kernel_vm86_regs *)regs, + error_code); + return; + } + die_if_kernel("cache flush denied", regs, error_code); + current->tss.trap_no = 19; + current->tss.error_code = error_code; + force_sig(SIGSEGV, current); + } } asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs, @@ -465,17 +663,13 @@ asmlinkage void math_state_restore(struct pt_regs regs) { __asm__ __volatile__("clts"); /* Allow maths ops (or we recurse) */ - if(current->used_math) - __asm__("frstor %0": :"m" (current->tss.i387)); - else - { - /* - * Our first FPU usage, clean the chip. - */ - __asm__("fninit"); - current->used_math = 1; + + if (current->used_math) { + restore_fpu(current); + } else { + init_fpu(); } - current->flags|=PF_USEDFPU; /* So we fnsave on switch_to() */ + current->flags |= PF_USEDFPU; /* So we fnsave on switch_to() */ } #ifndef CONFIG_MATH_EMULATION @@ -705,6 +899,7 @@ set_trap_gate(16,&coprocessor_error); set_trap_gate(17,&alignment_check); set_trap_gate(18,&machine_check); + set_trap_gate(19,&simd_coprocessor_error); set_system_gate(SYSCALL_VECTOR,&system_call); /* set up GDT task & ldt entries */ diff -urN 2.2.18/arch/i386/lib/usercopy.c 2.2.18aa1/arch/i386/lib/usercopy.c --- 2.2.18/arch/i386/lib/usercopy.c Mon Jan 17 16:44:33 2000 +++ 2.2.18aa1/arch/i386/lib/usercopy.c Mon Dec 11 17:20:52 2000 @@ -31,6 +31,8 @@ #define __do_strncpy_from_user(dst,src,count,res) \ do { \ int __d0, __d1, __d2; \ + DECLARE_LOCAL_LOCK_DEPTH(lock_depth); \ + release_kernel_lock_save(lock_depth); \ __asm__ __volatile__( \ " testl %1,%1\n" \ " jz 2f\n" \ @@ -54,6 +56,8 @@ "=&D" (__d2) \ : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \ : "memory"); \ + conditional_schedule(); \ + reacquire_kernel_lock_restore(lock_depth); \ } while (0) long @@ -81,6 +85,8 @@ #define __do_clear_user(addr,size) \ do { \ int __d0; \ + DECLARE_LOCAL_LOCK_DEPTH(lock_depth); \ + release_kernel_lock_save(lock_depth); \ __asm__ __volatile__( \ "0: rep; stosl\n" \ " movl %2,%0\n" \ @@ -97,6 +103,8 @@ ".previous" \ : "=&c"(size), "=&D" (__d0) \ : "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0)); \ + conditional_schedule(); \ + reacquire_kernel_lock_restore(lock_depth); \ } while (0) unsigned long @@ -124,7 +132,9 @@ { unsigned long mask = -__addr_ok(s); unsigned long res, tmp; + DECLARE_LOCAL_LOCK_DEPTH(lock_depth); + release_kernel_lock_save(lock_depth); __asm__ __volatile__( " andl %0,%%ecx\n" "0: repne; scasb\n" @@ -143,5 +153,7 @@ :"=r" (n), "=D" (s), "=a" (res), "=c" (tmp) :"0" (n), "1" (s), "2" (0), "3" (mask) :"cc"); + conditional_schedule(); + 
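/* Same low-latency discipline as the strncpy_from_user and clear_user paths above: the big kernel lock is dropped around the long user-space scan, a voluntary reschedule point is offered, and only then is the lock retaken at its saved depth. */ 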
reacquire_kernel_lock_restore(lock_depth); return res & mask; } diff -urN 2.2.18/arch/i386/mm/Makefile 2.2.18aa1/arch/i386/mm/Makefile --- 2.2.18/arch/i386/mm/Makefile Mon Jan 18 02:28:56 1999 +++ 2.2.18aa1/arch/i386/mm/Makefile Mon Dec 11 17:20:48 2000 @@ -10,4 +10,8 @@ O_TARGET := mm.o O_OBJS := init.o fault.o ioremap.o extable.o +ifeq ($(CONFIG_BIGMEM),y) +O_OBJS += bigmem.o +endif + include $(TOPDIR)/Rules.make diff -urN 2.2.18/arch/i386/mm/bigmem.c 2.2.18aa1/arch/i386/mm/bigmem.c --- 2.2.18/arch/i386/mm/bigmem.c Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/arch/i386/mm/bigmem.c Mon Dec 11 17:20:48 2000 @@ -0,0 +1,35 @@ +/* + * BIGMEM IA32 code and variables. + * + * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de + * Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de + */ + +#include +#include + +unsigned long bigmem_start, bigmem_end; + +/* NOTE: fixmap_init allocates all the fixmap pagetables contiguously in + physical space, so we can cache the place of the first one and move + around without checking the pgd every time. */ +pte_t *kmap_pte; +pgprot_t kmap_prot; + +#define kmap_get_fixmap_pte(vaddr) \ + pte_offset(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)) + +void __init kmap_init(void) +{ + unsigned long kmap_vstart; + + /* cache the first kmap pte */ + kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); + kmap_pte = kmap_get_fixmap_pte(kmap_vstart); + + kmap_prot = PAGE_KERNEL; +#if 0 + if (boot_cpu_data.x86_capability & X86_FEATURE_PGE) + pgprot_val(kmap_prot) |= _PAGE_GLOBAL; +#endif +} diff -urN 2.2.18/arch/i386/mm/fault.c 2.2.18aa1/arch/i386/mm/fault.c --- 2.2.18/arch/i386/mm/fault.c Thu May 4 13:00:36 2000 +++ 2.2.18aa1/arch/i386/mm/fault.c Mon Dec 11 17:20:46 2000 @@ -29,13 +29,13 @@ */ int __verify_write(const void * addr, unsigned long size) { - struct vm_area_struct * vma; + struct vm_area_struct * vma, * prev_vma; unsigned long start = (unsigned long) addr; if (!size) return 1; - vma = find_vma(current->mm, start); + vma = find_vma_prev(current->mm, start, &prev_vma); if (!vma) goto bad_area; if (vma->vm_start > start) @@ -75,7 +75,7 @@ check_stack: if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; - if (expand_stack(vma, start) == 0) + if (expand_stack(vma, start, prev_vma) == 0) goto good_area; bad_area: @@ -112,7 +112,7 @@ { struct task_struct *tsk; struct mm_struct *mm; - struct vm_area_struct * vma; + struct vm_area_struct * vma, * prev_vma; unsigned long address; unsigned long page; unsigned long fixup; @@ -133,7 +133,7 @@ down(&mm->mmap_sem); - vma = find_vma(mm, address); + vma = find_vma_prev(mm, address, &prev_vma); if (!vma) goto bad_area; if (vma->vm_start <= address) goto good_area; if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; @@ -150,7 +150,7 @@ if (address + 32 < regs->esp) goto bad_area; } - if (expand_stack(vma, address)) + if (expand_stack(vma, address, prev_vma)) goto bad_area; /* * Ok, we have a good vm_area for this memory access, so diff -urN 2.2.18/arch/i386/mm/init.c 2.2.18aa1/arch/i386/mm/init.c --- 2.2.18/arch/i386/mm/init.c Mon Dec 11 16:57:44 2000 +++ 2.2.18aa1/arch/i386/mm/init.c Mon Dec 11 17:20:52 2000 @@ -2,6 +2,8 @@ * linux/arch/i386/mm/init.c * * Copyright (C) 1995 Linus Torvalds + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 */ #include @@ -20,6 +22,7 @@ #ifdef CONFIG_BLK_DEV_INITRD #include #endif +#include #include #include @@ -28,6 +31,8 @@ #include #include +static int totalram_pages, totalbig_pages; + extern void show_net_buffers(void); extern unsigned long init_smp_mappings(unsigned long); @@ -148,6 +153,7 @@ { int i,free = 0,total = 0,reserved = 0; int 
shared = 0, cached = 0; + int bigmem = 0; printk("Mem-info:\n"); show_free_areas(); @@ -155,6 +161,8 @@ i = max_mapnr; while (i-- > 0) { total++; + if (PageBIGMEM(mem_map+i)) + bigmem++; if (PageReserved(mem_map+i)) reserved++; else if (PageSwapCache(mem_map+i)) @@ -165,6 +173,7 @@ shared += atomic_read(&mem_map[i].count) - 1; } printk("%d pages of RAM\n",total); + printk("%d pages of BIGMEM\n",bigmem); printk("%d reserved pages\n",reserved); printk("%d pages shared\n",shared); printk("%d pages swap cached\n",cached); @@ -184,34 +193,6 @@ extern char _text, _etext, _edata, __bss_start, _end; extern char __init_begin, __init_end; -#define X86_CR4_VME 0x0001 /* enable vm86 extensions */ -#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */ -#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ -#define X86_CR4_DE 0x0008 /* enable debugging extensions */ -#define X86_CR4_PSE 0x0010 /* enable page size extensions */ -#define X86_CR4_PAE 0x0020 /* enable physical address extensions */ -#define X86_CR4_MCE 0x0040 /* Machine check enable */ -#define X86_CR4_PGE 0x0080 /* enable global pages */ -#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ - -/* - * Save the cr4 feature set we're using (ie - * Pentium 4MB enable and PPro Global page - * enable), so that any CPU's that boot up - * after us can get the correct flags. - */ -unsigned long mmu_cr4_features __initdata = 0; - -static inline void set_in_cr4(unsigned long mask) -{ - mmu_cr4_features |= mask; - __asm__("movl %%cr4,%%eax\n\t" - "orl %0,%%eax\n\t" - "movl %%eax,%%cr4\n" - : : "irg" (mask) - :"ax"); -} - /* * allocate page table(s) for compile-time fixed mappings */ @@ -260,6 +241,24 @@ set_pte_phys (address,phys); } +static void __init relocate_initrd(unsigned long mem_start, + unsigned long end_mem) +{ +#ifdef CONFIG_BLK_DEV_INITRD + unsigned long initrd_size, relocate; + + if (!initrd_start || mem_start > initrd_start) + return; + initrd_size = initrd_end - initrd_start; + relocate = (end_mem - initrd_size) & PAGE_MASK; + if (initrd_start < relocate) { + memmove((char *) relocate, (char *) initrd_start, initrd_size); + initrd_start = relocate; + initrd_end = initrd_start + initrd_size; + } +#endif +} + /* * paging_init() sets up the page tables - note that the first 4MB are * already mapped by head.S. @@ -344,7 +343,15 @@ #endif local_flush_tlb(); + /* relocate initrd as soon as we have the paging working */ + relocate_initrd(start_mem, end_mem); + +#ifndef CONFIG_BIGMEM return free_area_init(start_mem, end_mem); +#else + kmap_init(); /* run after fixmap_init */ + return free_area_init(start_mem, bigmem_end + PAGE_OFFSET); +#endif } /* @@ -396,8 +403,18 @@ unsigned long tmp; end_mem &= PAGE_MASK; +#ifdef CONFIG_BIGMEM + bigmem_start = PAGE_ALIGN(bigmem_start); + bigmem_end &= PAGE_MASK; +#endif high_memory = (void *) end_mem; +#ifndef CONFIG_BIGMEM max_mapnr = num_physpages = MAP_NR(end_mem); +#else + max_mapnr = num_physpages = PHYSMAP_NR(bigmem_end); + /* cache the bigmem_mapnr */ + bigmem_mapnr = PHYSMAP_NR(bigmem_start); +#endif /* clear the zero-page */ memset(empty_zero_page, 0, PAGE_SIZE); @@ -452,16 +469,39 @@ #endif free_page(tmp); } - printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init)\n", +#ifdef CONFIG_BIGMEM + for (tmp = bigmem_start; tmp < bigmem_end; tmp += PAGE_SIZE) { + /* + RMQUEUE_ORDER in page_alloc.c returns PAGE_OFFSET + tmp + which cannot be allowed to be 0 since the callers of + __get_free_pages treat 0 as an allocation failure. 
To + avoid this possibility, do not allow allocation of the + BIGMEM page which would map to 0. + + Leonard N. Zubkoff, 30 October 1999 + */ + if (tmp + PAGE_OFFSET != 0) { + clear_bit(PG_reserved, &mem_map[PHYSMAP_NR(tmp)].flags); + set_bit(PG_BIGMEM, &mem_map[PHYSMAP_NR(tmp)].flags); + atomic_set(&mem_map[PHYSMAP_NR(tmp)].count, 1); + free_page(tmp + PAGE_OFFSET); + totalbig_pages++; + } + } +#endif + printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %dk bigmem)\n", (unsigned long) nr_free_pages << (PAGE_SHIFT-10), max_mapnr << (PAGE_SHIFT-10), codepages << (PAGE_SHIFT-10), reservedpages << (PAGE_SHIFT-10), datapages << (PAGE_SHIFT-10), - initpages << (PAGE_SHIFT-10)); + initpages << (PAGE_SHIFT-10), + totalbig_pages << (PAGE_SHIFT-10)); if (boot_cpu_data.wp_works_ok < 0) test_wp_bit(); + + totalram_pages = max_mapnr - reservedpages; } void free_initmem(void) @@ -479,22 +519,11 @@ void si_meminfo(struct sysinfo *val) { - int i; - - i = max_mapnr; - val->totalram = 0; + val->totalram = totalram_pages << PAGE_SHIFT; val->sharedram = 0; val->freeram = nr_free_pages << PAGE_SHIFT; val->bufferram = buffermem; - while (i-- > 0) { - if (PageReserved(mem_map+i)) - continue; - val->totalram++; - if (!atomic_read(&mem_map[i].count)) - continue; - val->sharedram += atomic_read(&mem_map[i].count) - 1; - } - val->totalram <<= PAGE_SHIFT; - val->sharedram <<= PAGE_SHIFT; + val->totalbig = totalbig_pages << PAGE_SHIFT; + val->freebig = nr_free_bigpages << PAGE_SHIFT; return; } diff -urN 2.2.18/arch/i386/vmlinux.lds.S 2.2.18aa1/arch/i386/vmlinux.lds.S --- 2.2.18/arch/i386/vmlinux.lds.S Mon Dec 11 16:57:45 2000 +++ 2.2.18aa1/arch/i386/vmlinux.lds.S Mon Dec 11 17:20:44 2000 @@ -1,6 +1,7 @@ /* ld script to make i386 Linux kernel * Written by Martin Mares ; */ +#include OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") OUTPUT_ARCH(i386) ENTRY(_start) @@ -53,7 +54,7 @@ __init_end = .; - . = ALIGN(32); + . = ALIGN(L1_CACHE_BYTES); .data.cacheline_aligned : { *(.data.cacheline_aligned) } . 
= ALIGN(4096); diff -urN 2.2.18/arch/m68k/mm/init.c 2.2.18aa1/arch/m68k/mm/init.c --- 2.2.18/arch/m68k/mm/init.c Mon Dec 11 16:57:45 2000 +++ 2.2.18aa1/arch/m68k/mm/init.c Mon Dec 11 17:20:48 2000 @@ -495,5 +495,7 @@ } val->totalram <<= PAGE_SHIFT; val->sharedram <<= PAGE_SHIFT; + val->totalbig = 0; + val->freebig = 0; return; } diff -urN 2.2.18/arch/mips/kernel/sysirix.c 2.2.18aa1/arch/mips/kernel/sysirix.c --- 2.2.18/arch/mips/kernel/sysirix.c Mon Jan 17 16:44:34 2000 +++ 2.2.18aa1/arch/mips/kernel/sysirix.c Mon Dec 11 17:20:49 2000 @@ -1984,7 +1984,7 @@ #define ROUND_UP32(x) (((x)+sizeof(u32)-1) & ~(sizeof(u32)-1)) static int irix_filldir32(void *__buf, const char *name, int namlen, - off_t offset, ino_t ino) + off_t offset, ino_t ino, unsigned int d_type) { struct irix_dirent32 *dirent; struct irix_dirent32_callback *buf = @@ -2097,7 +2097,7 @@ #define ROUND_UP64(x) (((x)+sizeof(u64)-1) & ~(sizeof(u64)-1)) static int irix_filldir64(void * __buf, const char * name, int namlen, - off_t offset, ino_t ino) + off_t offset, ino_t ino, unsigned int d_type) { struct irix_dirent64 *dirent; struct irix_dirent64_callback * buf = diff -urN 2.2.18/arch/mips/mm/init.c 2.2.18aa1/arch/mips/mm/init.c --- 2.2.18/arch/mips/mm/init.c Mon Jan 17 16:44:34 2000 +++ 2.2.18aa1/arch/mips/mm/init.c Mon Dec 11 17:20:48 2000 @@ -380,6 +380,8 @@ } val->totalram <<= PAGE_SHIFT; val->sharedram <<= PAGE_SHIFT; + val->totalbig = 0; + val->freebig = 0; return; } diff -urN 2.2.18/arch/ppc/kernel/misc.S 2.2.18aa1/arch/ppc/kernel/misc.S --- 2.2.18/arch/ppc/kernel/misc.S Mon Dec 11 16:57:45 2000 +++ 2.2.18aa1/arch/ppc/kernel/misc.S Mon Dec 11 17:20:49 2000 @@ -950,7 +950,7 @@ .long sys_swapon .long sys_reboot .long old_readdir - .long sys_mmap /* 90 */ + .long old_mmap /* 90 */ .long sys_munmap .long sys_truncate .long sys_ftruncate @@ -1050,18 +1050,20 @@ .long sys_ni_syscall /* streams1 */ .long sys_ni_syscall /* streams2 */ .long sys_vfork - .long sys_ni_syscall /* 190 */ /* MacOnLinux - old */ + .long sys_ni_syscall /* 190 getrlimit */ .long sys_ni_syscall /* 191 */ /* Unused */ - .long sys_ni_syscall /* 192 - reserved - mmap2 */ - .long sys_ni_syscall /* 193 - reserved - truncate64 */ - .long sys_ni_syscall /* 194 - reserved - ftruncate64 */ - .long sys_ni_syscall /* 195 - reserved - stat64 */ - .long sys_ni_syscall /* 196 - reserved - lstat64 */ - .long sys_ni_syscall /* 197 - reserved - fstat64 */ + .long sys_mmap2 /* 192 */ + .long sys_truncate64 /* 193 */ + .long sys_ftruncate64 /* 194 */ + .long sys_stat64 /* 195 */ + .long sys_lstat64 /* 196 */ + .long sys_fstat64 /* 197 */ .long sys_pciconfig_read /* 198 */ .long sys_pciconfig_write /* 199 */ .long sys_pciconfig_iobase /* 200 */ .long sys_ni_syscall /* 201 - reserved - MacOnLinux - new */ - .rept NR_syscalls-201 + .long sys_getdents64 /* 202 */ + .long sys_fcntl64 /* 203 */ + .rept NR_syscalls-203 .long sys_ni_syscall .endr diff -urN 2.2.18/arch/ppc/kernel/syscalls.c 2.2.18aa1/arch/ppc/kernel/syscalls.c --- 2.2.18/arch/ppc/kernel/syscalls.c Mon Dec 11 16:57:46 2000 +++ 2.2.18aa1/arch/ppc/kernel/syscalls.c Mon Dec 11 17:20:49 2000 @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -192,25 +193,55 @@ return error; } -asmlinkage unsigned long sys_mmap(unsigned long addr, size_t len, - unsigned long prot, unsigned long flags, - unsigned long fd, off_t offset) +/* common code for old and new mmaps */ +static inline long do_mmap2( + unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned 
long pgoff) { struct file * file = NULL; int ret = -EBADF; down(&current->mm->mmap_sem); - lock_kernel(); + if (!(flags & MAP_ANONYMOUS)) { - if (fd >= NR_OPEN || !(file = current->files->fd[fd])) + file = fget(fd); + if (!file) goto out; } - + flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - ret = do_mmap(file, addr, len, prot, flags, offset); -out: + ret = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); + + if (file) + fput(file); + out: unlock_kernel(); up(&current->mm->mmap_sem); return ret; + +} + +asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff) +{ + return do_mmap2(addr, len, prot, flags, fd, pgoff); +} + +asmlinkage unsigned long old_mmap(unsigned long addr, size_t len, + unsigned long prot, unsigned long flags, + unsigned long fd, off_t offset) +{ + int ret; + + ret = -EINVAL; + if (offset & ~PAGE_MASK) + goto out; + + ret = do_mmap2(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); + + out: return ret; } diff -urN 2.2.18/arch/ppc/kernel/time.c 2.2.18aa1/arch/ppc/kernel/time.c --- 2.2.18/arch/ppc/kernel/time.c Mon Dec 11 16:57:46 2000 +++ 2.2.18aa1/arch/ppc/kernel/time.c Mon Dec 11 17:20:46 2000 @@ -147,6 +147,19 @@ hardirq_exit(cpu); } +static inline void +timeval_normalize(struct timeval * tv) +{ + time_t __sec; + + __sec = tv->tv_usec / 1000000; + if (__sec) + { + tv->tv_usec %= 1000000; + tv->tv_sec += __sec; + } +} + /* * This version of gettimeofday has microsecond resolution. */ @@ -161,10 +174,7 @@ #ifndef __SMP__ tv->tv_usec += (decrementer_count - get_dec()) * count_period_num / count_period_den; - if (tv->tv_usec >= 1000000) { - tv->tv_usec -= 1000000; - tv->tv_sec++; - } + timeval_normalize(tv); #endif restore_flags(flags); } diff -urN 2.2.18/arch/ppc/mm/fault.c 2.2.18aa1/arch/ppc/mm/fault.c --- 2.2.18/arch/ppc/mm/fault.c Tue Sep 5 02:28:38 2000 +++ 2.2.18aa1/arch/ppc/mm/fault.c Mon Dec 11 17:20:46 2000 @@ -58,7 +58,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address, unsigned long error_code) { - struct vm_area_struct * vma; + struct vm_area_struct * vma, * prev_vma; struct mm_struct *mm = current->mm; int fault; @@ -92,14 +92,14 @@ } down(&mm->mmap_sem); - vma = find_vma(mm, address); + vma = find_vma_prev(mm, address, &prev_vma); if (!vma) goto bad_area; if (vma->vm_start <= address) goto good_area; if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; - if (expand_stack(vma, address)) + if (expand_stack(vma, address, prev_vma)) goto bad_area; good_area: diff -urN 2.2.18/arch/ppc/mm/init.c 2.2.18aa1/arch/ppc/mm/init.c --- 2.2.18/arch/ppc/mm/init.c Mon Dec 11 16:57:46 2000 +++ 2.2.18aa1/arch/ppc/mm/init.c Mon Dec 11 17:20:48 2000 @@ -329,6 +329,8 @@ } val->totalram <<= PAGE_SHIFT; val->sharedram <<= PAGE_SHIFT; + val->totalbig = 0; + val->freebig = 0; return; } diff -urN 2.2.18/arch/s390/mm/fault.c 2.2.18aa1/arch/s390/mm/fault.c --- 2.2.18/arch/s390/mm/fault.c Mon Dec 11 16:57:46 2000 +++ 2.2.18aa1/arch/s390/mm/fault.c Mon Dec 11 17:20:46 2000 @@ -124,7 +124,7 @@ goto good_area; if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; - if (expand_stack(vma, address)) + if (expand_stack(vma, address, NULL)) goto bad_area; /* * Ok, we have a good vm_area for this memory access, so diff -urN 2.2.18/arch/s390/mm/init.c 2.2.18aa1/arch/s390/mm/init.c --- 2.2.18/arch/s390/mm/init.c Mon Dec 11 16:57:46 2000 +++ 2.2.18aa1/arch/s390/mm/init.c Mon Dec 11 17:20:48 2000 @@ -430,5 +430,7 @@ } val->totalram <<= PAGE_SHIFT; val->sharedram <<= PAGE_SHIFT; + val->totalbig = 0; + 
val->freebig = 0; return; } diff -urN 2.2.18/arch/sparc/config.in 2.2.18aa1/arch/sparc/config.in --- 2.2.18/arch/sparc/config.in Mon Dec 11 16:57:46 2000 +++ 2.2.18aa1/arch/sparc/config.in Mon Dec 11 17:20:54 2000 @@ -88,10 +88,16 @@ bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then + bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID tristate ' Linear (append) mode' CONFIG_MD_LINEAR tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5 + tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT + tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM +fi +if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then + bool ' Boot support (linear, striped)' CONFIG_MD_BOOT fi tristate 'RAM disk support' CONFIG_BLK_DEV_RAM diff -urN 2.2.18/arch/sparc/defconfig 2.2.18aa1/arch/sparc/defconfig --- 2.2.18/arch/sparc/defconfig Mon Dec 11 16:57:46 2000 +++ 2.2.18aa1/arch/sparc/defconfig Mon Dec 11 17:20:54 2000 @@ -89,10 +89,13 @@ # CONFIG_BLK_DEV_FD=y CONFIG_BLK_DEV_MD=y +# CONFIG_AUTODETECT_RAID is not set CONFIG_MD_LINEAR=m CONFIG_MD_STRIPED=m CONFIG_MD_MIRRORING=m CONFIG_MD_RAID5=m +# CONFIG_MD_TRANSLUCENT is not set +# CONFIG_MD_HSM is not set CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=4096 CONFIG_BLK_DEV_INITRD=y diff -urN 2.2.18/arch/sparc/kernel/sys_sparc.c 2.2.18aa1/arch/sparc/kernel/sys_sparc.c --- 2.2.18/arch/sparc/kernel/sys_sparc.c Thu May 4 13:00:36 2000 +++ 2.2.18aa1/arch/sparc/kernel/sys_sparc.c Mon Dec 11 17:20:49 2000 @@ -176,9 +176,9 @@ } /* Linux version of mmap */ -asmlinkage unsigned long sys_mmap(unsigned long addr, unsigned long len, +asmlinkage unsigned long do_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, - unsigned long off) + unsigned long pgoff) { struct file * file = NULL; unsigned long retval = -EBADF; @@ -211,7 +211,7 @@ goto out_putf; flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - retval = do_mmap(file, addr, len, prot, flags, off); + retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); out_putf: if (file) @@ -220,6 +220,22 @@ unlock_kernel(); up(&current->mm->mmap_sem); return retval; +} + +asmlinkage unsigned long sys_mmap2(unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, unsigned long fd, + unsigned long pgoff) +{ + /* Make sure the shift for mmap2 is constant (12), no matter what PAGE_SIZE + we have. 
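For example, with 8 kB pages (PAGE_SHIFT == 13) a pgoff supplied by userland in 4 kB units is halved, pgoff >> (13 - 12), while with 4 kB pages the shift is zero and pgoff passes through unchanged. 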
*/ + return do_mmap2(addr, len, prot, flags, fd, pgoff >> (PAGE_SHIFT - 12)); +} + +asmlinkage unsigned long sys_mmap(unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, unsigned long fd, + unsigned long off) +{ + return do_mmap2(addr, len, prot, flags, fd, off >> PAGE_SHIFT); } /* we come to here via sys_nis_syscall so it can setup the regs argument */ diff -urN 2.2.18/arch/sparc/kernel/sys_sunos.c 2.2.18aa1/arch/sparc/kernel/sys_sunos.c --- 2.2.18/arch/sparc/kernel/sys_sunos.c Mon Dec 11 16:57:46 2000 +++ 2.2.18aa1/arch/sparc/kernel/sys_sunos.c Mon Dec 11 17:20:50 2000 @@ -409,7 +409,7 @@ #define ROUND_UP(x) (((x)+sizeof(long)-1) & ~(sizeof(long)-1)) static int sunos_filldir(void * __buf, const char * name, int namlen, - off_t offset, ino_t ino) + off_t offset, ino_t ino, unsigned int d_type) { struct sunos_dirent * dirent; struct sunos_dirent_callback * buf = (struct sunos_dirent_callback *) __buf; @@ -500,7 +500,7 @@ }; static int sunos_filldirentry(void * __buf, const char * name, int namlen, - off_t offset, ino_t ino) + off_t offset, ino_t ino, unsigned int d_type) { struct sunos_direntry * dirent; struct sunos_direntry_callback * buf = (struct sunos_direntry_callback *) __buf; diff -urN 2.2.18/arch/sparc/kernel/systbls.S 2.2.18aa1/arch/sparc/kernel/systbls.S --- 2.2.18/arch/sparc/kernel/systbls.S Thu May 4 13:00:36 2000 +++ 2.2.18aa1/arch/sparc/kernel/systbls.S Mon Dec 11 17:20:50 2000 @@ -29,12 +29,12 @@ /*40*/ .long sys_newlstat, sys_dup, sys_pipe, sys_times, sys_nis_syscall /*45*/ .long sys_umount, sys_setgid, sys_getgid, sys_signal, sys_geteuid /*50*/ .long sys_getegid, sys_acct, sys_nis_syscall, sys_nis_syscall, sys_ioctl -/*55*/ .long sys_reboot, sys_lfs_syscall, sys_symlink, sys_readlink, sys_execve -/*60*/ .long sys_umask, sys_chroot, sys_newfstat, sys_lfs_syscall, sys_getpagesize +/*55*/ .long sys_reboot, sys_mmap2, sys_symlink, sys_readlink, sys_execve +/*60*/ .long sys_umask, sys_chroot, sys_newfstat, sys_fstat64, sys_getpagesize /*65*/ .long sys_msync, sys_vfork, sys_pread, sys_pwrite, sys_nis_syscall /*70*/ .long sys_nis_syscall, sys_mmap, sys_nis_syscall, sys_munmap, sys_mprotect -/*75*/ .long sys_nis_syscall, sys_vhangup, sys_lfs_syscall, sys_nis_syscall, sys_getgroups -/*80*/ .long sys_setgroups, sys_getpgrp, sys_nis_syscall, sys_setitimer, sys_lfs_syscall +/*75*/ .long sys_nis_syscall, sys_vhangup, sys_truncate64, sys_nis_syscall, sys_getgroups +/*80*/ .long sys_setgroups, sys_getpgrp, sys_nis_syscall, sys_setitimer, sys_ftruncate64 /*85*/ .long sys_swapon, sys_getitimer, sys_nis_syscall, sys_sethostname, sys_nis_syscall /*90*/ .long sys_dup2, sys_nis_syscall, sys_fcntl, sys_select, sys_nis_syscall /*95*/ .long sys_fsync, sys_setpriority, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall @@ -44,12 +44,12 @@ /*115*/ .long sys_nis_syscall, sys_gettimeofday, sys_getrusage, sys_nis_syscall, sys_getcwd /*120*/ .long sys_readv, sys_writev, sys_settimeofday, sys_fchown, sys_fchmod /*125*/ .long sys_nis_syscall, sys_setreuid, sys_setregid, sys_rename, sys_truncate -/*130*/ .long sys_ftruncate, sys_flock, sys_lfs_syscall, sys_nis_syscall, sys_nis_syscall -/*135*/ .long sys_nis_syscall, sys_mkdir, sys_rmdir, sys_utimes, sys_lfs_syscall -/*140*/ .long sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_getrlimit +/*130*/ .long sys_ftruncate, sys_flock, sys_lstat64, sys_nis_syscall, sys_nis_syscall +/*135*/ .long sys_nis_syscall, sys_mkdir, sys_rmdir, sys_utimes, sys_stat64 +/*140*/ .long sys_nis_syscall, sys_nis_syscall, 
sys_nis_syscall, sys_fcntl64, sys_getrlimit /*145*/ .long sys_setrlimit, sys_nis_syscall, sys_prctl, sys_pciconfig_read, sys_pciconfig_write -/*150*/ .long sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_nis_syscall -/*155*/ .long sys_nis_syscall, sys_nis_syscall, sys_statfs, sys_fstatfs, sys_oldumount +/*150*/ .long sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_getdents64 +/*155*/ .long sys_fcntl64, sys_nis_syscall, sys_statfs, sys_fstatfs, sys_oldumount /*160*/ .long sys_nis_syscall, sys_nis_syscall, sys_getdomainname, sys_setdomainname, sys_nis_syscall /*165*/ .long sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_nis_syscall /*170*/ .long sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_getdents diff -urN 2.2.18/arch/sparc/mm/fault.c 2.2.18aa1/arch/sparc/mm/fault.c --- 2.2.18/arch/sparc/mm/fault.c Sun Apr 2 21:07:48 2000 +++ 2.2.18aa1/arch/sparc/mm/fault.c Mon Dec 11 17:20:46 2000 @@ -222,7 +222,7 @@ goto good_area; if(!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; - if(expand_stack(vma, address)) + if(expand_stack(vma, address, NULL)) goto bad_area; /* * Ok, we have a good vm_area for this memory access, so @@ -414,7 +414,7 @@ goto good_area; if(!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; - if(expand_stack(vma, address)) + if(expand_stack(vma, address, NULL)) goto bad_area; good_area: if(write) { diff -urN 2.2.18/arch/sparc/mm/init.c 2.2.18aa1/arch/sparc/mm/init.c --- 2.2.18/arch/sparc/mm/init.c Mon Jan 17 16:44:36 2000 +++ 2.2.18aa1/arch/sparc/mm/init.c Mon Dec 11 17:20:48 2000 @@ -380,4 +380,6 @@ } val->totalram <<= PAGE_SHIFT; val->sharedram <<= PAGE_SHIFT; + val->totalbig = 0; + val->freebig = 0; } diff -urN 2.2.18/arch/sparc64/config.in 2.2.18aa1/arch/sparc64/config.in --- 2.2.18/arch/sparc64/config.in Mon Dec 11 16:57:46 2000 +++ 2.2.18aa1/arch/sparc64/config.in Mon Dec 11 17:20:54 2000 @@ -102,10 +102,16 @@ bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then + bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID tristate ' Linear (append) mode' CONFIG_MD_LINEAR tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5 + tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT + tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM +fi +if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then + bool ' Boot support (linear, striped)' CONFIG_MD_BOOT fi tristate 'RAM disk support' CONFIG_BLK_DEV_RAM diff -urN 2.2.18/arch/sparc64/defconfig 2.2.18aa1/arch/sparc64/defconfig --- 2.2.18/arch/sparc64/defconfig Mon Dec 11 16:57:47 2000 +++ 2.2.18aa1/arch/sparc64/defconfig Mon Dec 11 17:20:54 2000 @@ -106,10 +106,13 @@ # CONFIG_BLK_DEV_FD=y CONFIG_BLK_DEV_MD=y +# CONFIG_AUTODETECT_RAID is not set CONFIG_MD_LINEAR=m CONFIG_MD_STRIPED=m CONFIG_MD_MIRRORING=m CONFIG_MD_RAID5=m +# CONFIG_MD_TRANSLUCENT is not set +# CONFIG_MD_HSM is not set CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=4096 CONFIG_BLK_DEV_INITRD=y diff -urN 2.2.18/arch/sparc64/kernel/ioctl32.c 2.2.18aa1/arch/sparc64/kernel/ioctl32.c --- 2.2.18/arch/sparc64/kernel/ioctl32.c Mon Dec 11 16:57:47 2000 +++ 2.2.18aa1/arch/sparc64/kernel/ioctl32.c Mon Dec 11 17:20:54 2000 @@ -3094,6 +3094,42 @@ case AUTOFS_IOC_PROTOVER: case AUTOFS_IOC_EXPIRE: + /* add in the ioctls for Pool, GFS, and gnbd. + * This seems weird, but it looks like it is the + * way they are doing things. 
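+ * ('p' is 0x70, so _POOLC_(0x11), for instance, expands to + * (0x70 << 8) | 0x11 == 0x7011.) 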
+ */ +#define _POOLC_(x) (('p' << 8) | x) +#define _GFSC_(x) (('p' << 8) | x) + case _GFSC_(0x01): + case _GFSC_(0x02): + case _GFSC_(0x03): + case _GFSC_(0x04): + case _GFSC_(0x05): + case _GFSC_(0x06): + case _GFSC_(0x07): + case _GFSC_(0x08): + case _GFSC_(0x09): + case _GFSC_(0x0a): + case _POOLC_(0x11): + case _POOLC_(0x12): + case _POOLC_(0x13): + case _POOLC_(0x14): + case _POOLC_(0x15): + case _POOLC_(0x16): + case _POOLC_(0x17): + case _POOLC_(0x21): + case _POOLC_(0x41): + case _POOLC_(0x42): + case _POOLC_(0x43): + case _POOLC_(0x44): + case _POOLC_(0x80): + case _POOLC_(0x81): + case _POOLC_(0x90): + case _IO( 0xeb, 1): + case _IO( 0xeb, 2): + case _IO( 0xeb, 3): + case _IO( 0xeb, 9): + /* Raw devices */ case _IO(0xac, 0): /* RAW_SETBIND */ case _IO(0xac, 1): /* RAW_GETBIND */ diff -urN 2.2.18/arch/sparc64/kernel/sparc64_ksyms.c 2.2.18aa1/arch/sparc64/kernel/sparc64_ksyms.c --- 2.2.18/arch/sparc64/kernel/sparc64_ksyms.c Mon Dec 11 16:57:47 2000 +++ 2.2.18aa1/arch/sparc64/kernel/sparc64_ksyms.c Mon Dec 11 17:20:50 2000 @@ -84,6 +84,7 @@ extern int sys32_ioctl(unsigned int fd, unsigned int cmd, u32 arg); extern int (*handle_mathemu)(struct pt_regs *, struct fpustate *); extern void VISenter(void); +extern long sparc32_open(const char * filename, int flags, int mode); extern int io_remap_page_range(unsigned long from, unsigned long offset, unsigned long size, pgprot_t prot, int space); extern void bcopy (const char *, char *, int); @@ -278,6 +279,7 @@ EXPORT_SYMBOL(prom_cpu_nodes); EXPORT_SYMBOL(sys_ioctl); EXPORT_SYMBOL(sys32_ioctl); +EXPORT_SYMBOL(sparc32_open); EXPORT_SYMBOL(move_addr_to_kernel); EXPORT_SYMBOL(move_addr_to_user); #endif diff -urN 2.2.18/arch/sparc64/kernel/sys32.S 2.2.18aa1/arch/sparc64/kernel/sys32.S --- 2.2.18/arch/sparc64/kernel/sys32.S Thu May 4 13:00:36 2000 +++ 2.2.18aa1/arch/sparc64/kernel/sys32.S Mon Dec 11 17:20:50 2000 @@ -60,3 +60,12 @@ sethi %hi(sys_bdflush), %g1 jmpl %g1 + %lo(sys_bdflush), %g0 sra %o1, 0, %o1 + + .align 32 + .globl sys32_mmap2 +sys32_mmap2: + srl %o4, 0, %o4 + sethi %hi(sys_mmap), %g1 + srl %o5, 0, %o5 + jmpl %g1 + %lo(sys_mmap), %g0 + sllx %o5, 12, %o5 diff -urN 2.2.18/arch/sparc64/kernel/sys_sparc32.c 2.2.18aa1/arch/sparc64/kernel/sys_sparc32.c --- 2.2.18/arch/sparc64/kernel/sys_sparc32.c Mon Dec 11 16:57:47 2000 +++ 2.2.18aa1/arch/sparc64/kernel/sys_sparc32.c Mon Dec 11 17:20:50 2000 @@ -600,15 +600,27 @@ old_fs = get_fs(); set_fs (KERNEL_DS); ret = sys_fcntl(fd, cmd, (unsigned long)&f); set_fs (old_fs); + if (ret) return ret; + if (f.l_start >= 0x7fffffffUL || + f.l_len >= 0x7fffffffUL || + f.l_start + f.l_len >= 0x7fffffffUL) + return -EOVERFLOW; if(put_flock(&f, (struct flock32 *)arg)) return -EFAULT; - return ret; + return 0; } default: return sys_fcntl(fd, cmd, (unsigned long)arg); } } +asmlinkage long sys32_fcntl64(unsigned int fd, unsigned int cmd, unsigned long arg) +{ + if (cmd >= F_GETLK64 && cmd <= F_SETLKW64) + return sys_fcntl(fd, cmd + F_GETLK - F_GETLK64, arg); + return sys32_fcntl(fd, cmd, arg); +} + struct dqblk32 { __u32 dqb_bhardlimit; __u32 dqb_bsoftlimit; @@ -720,6 +732,25 @@ return ret; } +extern asmlinkage long sys_truncate(const char * path, unsigned long length); +extern asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length); + +asmlinkage int sys32_truncate64(const char * path, unsigned long high, unsigned long low) +{ + if ((int)high < 0) + return -EINVAL; + else + return sys_truncate(path, (high << 32) | low); +} + +asmlinkage int sys32_ftruncate64(unsigned int fd, unsigned long high, 
unsigned long low) +{ + if ((int)high < 0) + return -EINVAL; + else + return sys_ftruncate(fd, (high << 32) | low); +} + extern asmlinkage int sys_utime(char * filename, struct utimbuf * times); struct utimbuf32 { @@ -917,7 +948,7 @@ }; static int fillonedir(void * __buf, const char * name, int namlen, - off_t offset, ino_t ino) + off_t offset, ino_t ino, unsigned int d_type) { struct readdir_callback32 * buf = (struct readdir_callback32 *) __buf; struct old_linux_dirent32 * dirent; @@ -982,7 +1013,8 @@ int error; }; -static int filldir(void * __buf, const char * name, int namlen, off_t offset, ino_t ino) +static int filldir(void * __buf, const char * name, int namlen, off_t offset, ino_t ino, + unsigned int d_type) { struct linux_dirent32 * dirent; struct getdents_callback32 * buf = (struct getdents_callback32 *) __buf; @@ -4007,6 +4039,39 @@ return -EFAULT; return ret; +} + +/* This is just a version for 32-bit applications which does + * not force O_LARGEFILE on. + */ + +asmlinkage long sparc32_open(const char * filename, int flags, int mode) +{ + char * tmp; + int fd, error; + + tmp = getname(filename); + fd = PTR_ERR(tmp); + if (!IS_ERR(tmp)) { + lock_kernel(); + fd = get_unused_fd(); + if (fd >= 0) { + struct file * f = filp_open(tmp, flags, mode); + error = PTR_ERR(f); + if (IS_ERR(f)) + goto out_error; + fd_install(fd, f); + } +out: + unlock_kernel(); + putname(tmp); + } + return fd; + +out_error: + put_unused_fd(fd); + fd = error; + goto out; } /* Handle adjtimex compatibility. */ diff -urN 2.2.18/arch/sparc64/kernel/sys_sunos32.c 2.2.18aa1/arch/sparc64/kernel/sys_sunos32.c --- 2.2.18/arch/sparc64/kernel/sys_sunos32.c Mon Dec 11 16:57:47 2000 +++ 2.2.18aa1/arch/sparc64/kernel/sys_sunos32.c Mon Dec 11 17:20:50 2000 @@ -366,7 +366,7 @@ #define ROUND_UP(x) (((x)+sizeof(s32)-1) & ~(sizeof(s32)-1)) static int sunos_filldir(void * __buf, const char * name, int namlen, - off_t offset, ino_t ino) + off_t offset, ino_t ino, unsigned int d_type) { struct sunos_dirent * dirent; struct sunos_dirent_callback * buf = (struct sunos_dirent_callback *) __buf; @@ -458,7 +458,7 @@ }; static int sunos_filldirentry(void * __buf, const char * name, int namlen, - off_t offset, ino_t ino) + off_t offset, ino_t ino, unsigned int d_type) { struct sunos_direntry * dirent; struct sunos_direntry_callback * buf = (struct sunos_direntry_callback *) __buf; @@ -1296,13 +1296,15 @@ return rval; } +extern asmlinkage long sparc32_open(const char * filename, int flags, int mode); + asmlinkage int sunos_open(u32 filename, int flags, int mode) { int ret; lock_kernel(); current->personality |= PER_BSD; - ret = sys_open ((char *)A(filename), flags, mode); + ret = sparc32_open ((char *)A(filename), flags, mode); unlock_kernel(); return ret; } diff -urN 2.2.18/arch/sparc64/kernel/systbls.S 2.2.18aa1/arch/sparc64/kernel/systbls.S --- 2.2.18/arch/sparc64/kernel/systbls.S Mon Dec 11 16:57:47 2000 +++ 2.2.18aa1/arch/sparc64/kernel/systbls.S Mon Dec 11 17:20:50 2000 @@ -20,7 +20,7 @@ .globl sys_call_table32 sys_call_table32: /*0*/ .word sys_nis_syscall, sparc_exit, sys_fork, sys_read, sys_write -/*5*/ .word sys_open, sys_close, sys32_wait4, sys_creat, sys_link +/*5*/ .word sparc32_open, sys_close, sys32_wait4, sys_creat, sys_link /*10*/ .word sys_unlink, sunos_execv, sys_chdir, sys32_chown16, sys32_mknod /*15*/ .word sys32_chmod, sys32_lchown16, sparc_brk, sys_perfctr, sys32_lseek /*20*/ .word sys_getpid, sys_capget, sys_capset, sys_setuid, sys_getuid @@ -30,12 +30,12 @@ /*40*/ .word sys32_newlstat, sys_dup, sys_pipe, 
sys32_times, sys_nis_syscall .word sys_umount, sys_setgid, sys_getgid, sys_signal, sys_geteuid /*50*/ .word sys_getegid, sys_acct, sys_nis_syscall, sys_nis_syscall, sys32_ioctl - .word sys_reboot, sys_lfs_syscall, sys_symlink, sys_readlink, sys32_execve -/*60*/ .word sys_umask, sys_chroot, sys32_newfstat, sys_lfs_syscall, sys_getpagesize + .word sys_reboot, sys32_mmap2, sys_symlink, sys_readlink, sys32_execve +/*60*/ .word sys_umask, sys_chroot, sys32_newfstat, sys_fstat64, sys_getpagesize .word sys_msync, sys_vfork, sys32_pread, sys32_pwrite, sys_nis_syscall /*70*/ .word sys_nis_syscall, sys32_mmap, sys_nis_syscall, sys_munmap, sys_mprotect - .word sys_nis_syscall, sys_vhangup, sys_lfs_syscall, sys_nis_syscall, sys32_getgroups -/*80*/ .word sys32_setgroups, sys_getpgrp, sys_nis_syscall, sys32_setitimer, sys_lfs_syscall + .word sys_nis_syscall, sys_vhangup, sys32_truncate64, sys_nis_syscall, sys32_getgroups +/*80*/ .word sys32_setgroups, sys_getpgrp, sys_nis_syscall, sys32_setitimer, sys32_ftruncate64 .word sys_swapon, sys32_getitimer, sys_nis_syscall, sys_sethostname, sys_nis_syscall /*90*/ .word sys_dup2, sys_nis_syscall, sys32_fcntl, sys32_select, sys_nis_syscall .word sys_fsync, sys_setpriority, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall @@ -45,12 +45,12 @@ .word sys_nis_syscall, sys32_gettimeofday, sys32_getrusage, sys_nis_syscall, sys_getcwd /*120*/ .word sys32_readv, sys32_writev, sys32_settimeofday, sys32_fchown16, sys_fchmod .word sys_nis_syscall, sys32_setreuid, sys32_setregid, sys_rename, sys_truncate -/*130*/ .word sys_ftruncate, sys_flock, sys_lfs_syscall, sys_nis_syscall, sys_nis_syscall - .word sys_nis_syscall, sys_mkdir, sys_rmdir, sys32_utimes, sys_lfs_syscall -/*140*/ .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys32_getrlimit +/*130*/ .word sys_ftruncate, sys_flock, sys_lstat64, sys_nis_syscall, sys_nis_syscall + .word sys_nis_syscall, sys_mkdir, sys_rmdir, sys32_utimes, sys_stat64 +/*140*/ .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys32_fcntl64, sys32_getrlimit .word sys32_setrlimit, sys_nis_syscall, sys32_prctl, sys32_pciconfig_read, sys32_pciconfig_write -/*150*/ .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_nis_syscall - .word sys_nis_syscall, sys_nis_syscall, sys32_statfs, sys32_fstatfs, sys_oldumount +/*150*/ .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_getdents64 + .word sys32_fcntl64, sys_nis_syscall, sys32_statfs, sys32_fstatfs, sys_oldumount /*160*/ .word sys_nis_syscall, sys_nis_syscall, sys_getdomainname, sys_setdomainname, sys_nis_syscall .word sys32_quotactl, sys_nis_syscall, sys32_mount, sys_ustat, sys_nis_syscall /*170*/ .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys32_getdents @@ -108,7 +108,7 @@ .word sys_socketpair, sys_mkdir, sys_rmdir, sys_utimes, sys_nis_syscall /*140*/ .word sys_nis_syscall, sys_getpeername, sys_nis_syscall, sys_nis_syscall, sys_getrlimit .word sys_setrlimit, sys_nis_syscall, sys_prctl, sys_pciconfig_read, sys_pciconfig_write -/*150*/ .word sys_getsockname, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_nis_syscall +/*150*/ .word sys_getsockname, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_getdents64 .word sys_nis_syscall, sys_nis_syscall, sys_statfs, sys_fstatfs, sys_oldumount /*160*/ .word sys_nis_syscall, sys_nis_syscall, sys_getdomainname, sys_setdomainname, sys_utrap_install .word sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_nis_syscall diff -urN 2.2.18/arch/sparc64/mm/fault.c 
2.2.18aa1/arch/sparc64/mm/fault.c --- 2.2.18/arch/sparc64/mm/fault.c Sun Apr 2 21:07:48 2000 +++ 2.2.18aa1/arch/sparc64/mm/fault.c Mon Dec 11 17:20:46 2000 @@ -194,7 +194,7 @@ goto good_area; if(!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; - if(expand_stack(vma, address)) + if(expand_stack(vma, address, NULL)) goto bad_area; /* * Ok, we have a good vm_area for this memory access, so diff -urN 2.2.18/arch/sparc64/mm/init.c 2.2.18aa1/arch/sparc64/mm/init.c --- 2.2.18/arch/sparc64/mm/init.c Thu May 4 13:00:37 2000 +++ 2.2.18aa1/arch/sparc64/mm/init.c Mon Dec 11 17:20:48 2000 @@ -1756,4 +1756,6 @@ } val->totalram <<= PAGE_SHIFT; val->sharedram <<= PAGE_SHIFT; + val->totalbig = 0; + val->freebig = 0; } diff -urN 2.2.18/arch/sparc64/solaris/fs.c 2.2.18aa1/arch/sparc64/solaris/fs.c --- 2.2.18/arch/sparc64/solaris/fs.c Mon Jan 17 16:44:36 2000 +++ 2.2.18aa1/arch/sparc64/solaris/fs.c Mon Dec 11 17:20:50 2000 @@ -572,20 +572,20 @@ return error; } +extern asmlinkage long sparc32_open(const char * filename, int flags, int mode); + asmlinkage int solaris_open(u32 filename, int flags, u32 mode) { - int (*sys_open)(const char *,int,int) = - (int (*)(const char *,int,int))SYS(open); int fl = flags & 0xf; -/* if (flags & 0x2000) - allow LFS */ + if (flags & 0x2000) fl |= O_LARGEFILE; if (flags & 0x8050) fl |= O_SYNC; if (flags & 0x80) fl |= O_NONBLOCK; if (flags & 0x100) fl |= O_CREAT; if (flags & 0x200) fl |= O_TRUNC; if (flags & 0x400) fl |= O_EXCL; if (flags & 0x800) fl |= O_NOCTTY; - return sys_open((const char *)A(filename), fl, mode); + return sparc32_open((const char *)A(filename), fl, mode); } #define SOL_F_SETLK 6 diff -urN 2.2.18/arch/sparc64/solaris/timod.c 2.2.18aa1/arch/sparc64/solaris/timod.c --- 2.2.18/arch/sparc64/solaris/timod.c Mon Jan 17 16:44:36 2000 +++ 2.2.18aa1/arch/sparc64/solaris/timod.c Mon Dec 11 17:20:44 2000 @@ -154,7 +154,7 @@ sock = &current->files->fd[fd]->f_dentry->d_inode->u.socket_i; wake_up_interruptible(&sock->wait); if (sock->fasync_list && !(sock->flags & SO_WAITDATA)) - kill_fasync(sock->fasync_list, SIGIO); + kill_fasync(sock->fasync_list, SIGIO, POLL_IN); SOLD("done"); } diff -urN 2.2.18/drivers/block/Config.in 2.2.18aa1/drivers/block/Config.in --- 2.2.18/drivers/block/Config.in Mon Dec 11 16:57:47 2000 +++ 2.2.18aa1/drivers/block/Config.in Mon Dec 11 17:20:54 2000 @@ -97,16 +97,23 @@ comment 'Additional Block Devices' +tristate 'Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM N +if [ "$CONFIG_BLK_DEV_LVM" != "n" ]; then + bool ' LVM information in proc filesystem' CONFIG_LVM_PROC_FS Y +fi tristate 'Loopback device support' CONFIG_BLK_DEV_LOOP if [ "$CONFIG_NET" = "y" ]; then tristate 'Network block device support' CONFIG_BLK_DEV_NBD fi bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then + bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID tristate ' Linear (append) mode' CONFIG_MD_LINEAR tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5 + tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT + tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM fi if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then bool ' Boot support (linear, striped)' CONFIG_MD_BOOT diff -urN 2.2.18/drivers/block/Makefile 2.2.18aa1/drivers/block/Makefile --- 2.2.18/drivers/block/Makefile Mon Dec 11 16:57:47 2000 +++ 2.2.18aa1/drivers/block/Makefile Mon Dec 11 17:20:54 2000 @@ -266,6 +266,14 @@ endif endif
+ifeq ($(CONFIG_BLK_DEV_LVM),y) +L_OBJS += lvm.o lvm-snap.o +else + ifeq ($(CONFIG_BLK_DEV_LVM),m) + M_OBJS += lvm-mod.o + endif +endif + ifeq ($(CONFIG_BLK_DEV_MD),y) LX_OBJS += md.o @@ -294,13 +302,31 @@ endif ifeq ($(CONFIG_MD_RAID5),y) +LX_OBJS += xor.o L_OBJS += raid5.o else ifeq ($(CONFIG_MD_RAID5),m) + LX_OBJS += xor.o M_OBJS += raid5.o endif endif +ifeq ($(CONFIG_MD_TRANSLUCENT),y) +L_OBJS += translucent.o +else + ifeq ($(CONFIG_MD_TRANSLUCENT),m) + M_OBJS += translucent.o + endif +endif + +ifeq ($(CONFIG_MD_HSM),y) +L_OBJS += hsm.o +else + ifeq ($(CONFIG_MD_HSM),m) + M_OBJS += hsm.o + endif +endif + endif ifeq ($(CONFIG_BLK_DEV_NBD),y) @@ -321,6 +347,9 @@ endif include $(TOPDIR)/Rules.make + +lvm-mod.o: lvm.o lvm-snap.o + $(LD) -r -o $@ lvm.o lvm-snap.o ide-mod.o: ide.o $(IDE_OBJS) $(LD) $(LD_RFLAG) -r -o $@ ide.o $(IDE_OBJS) diff -urN 2.2.18/drivers/block/README.lvm 2.2.18aa1/drivers/block/README.lvm --- 2.2.18/drivers/block/README.lvm Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/drivers/block/README.lvm Mon Dec 11 17:20:49 2000 @@ -0,0 +1,8 @@ + +This is the Logical Volume Manager driver for Linux. + +Tools and a library that manage logical volumes can be found +at . + +There you can obtain current driver versions too. + diff -urN 2.2.18/drivers/block/genhd.c 2.2.18aa1/drivers/block/genhd.c --- 2.2.18/drivers/block/genhd.c Mon Dec 11 16:57:48 2000 +++ 2.2.18aa1/drivers/block/genhd.c Mon Dec 11 17:20:54 2000 @@ -28,6 +28,7 @@ #include #include #include +#include #ifdef CONFIG_ARCH_S390 #include @@ -54,6 +55,11 @@ le32_to_cpu(__a); \ }) +#if defined CONFIG_BLK_DEV_LVM || defined CONFIG_BLK_DEV_LVM_MODULE +#include +void ( *lvm_hd_name_ptr) ( char *, int) = NULL; +#endif + struct gendisk *gendisk_head = NULL; static int current_minor = 0; @@ -113,6 +119,13 @@ } #endif /* + * If the programmer has chosen to specify a name for the + * block device, then use that; otherwise use the defaults. + */ + if(hd->device_names){ + return hd->device_names[minor]; + } + /* * IDE devices use multiple major numbers, but the drives * are named as: {hda,hdb}, {hdc,hdd}, {hde,hdf}, {hdg,hdh}.. * This requires special handling here. @@ -135,6 +148,14 @@ break; case MD_MAJOR: unit -= 'a'-'0'; + break; +#if defined CONFIG_BLK_DEV_LVM || defined CONFIG_BLK_DEV_LVM_MODULE + case LVM_BLK_MAJOR: + *buf = 0; + if ( lvm_hd_name_ptr != NULL) + ( lvm_hd_name_ptr) ( buf, minor); + return buf; +#endif } part = minor & ((1 << hd->minor_shift) - 1); if (hd->major >= SCSI_DISK1_MAJOR && hd->major <= SCSI_DISK7_MAJOR) { @@ -1729,6 +1750,9 @@ else #endif rd_load(); +#endif +#ifdef CONFIG_BLK_DEV_MD + autodetect_raid(); #endif #ifdef CONFIG_MD_BOOT md_setup_drive(); diff -urN 2.2.18/drivers/block/hsm.c 2.2.18aa1/drivers/block/hsm.c --- 2.2.18/drivers/block/hsm.c Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/drivers/block/hsm.c Mon Dec 11 17:20:54 2000 @@ -0,0 +1,840 @@ +/* + hsm.c : HSM RAID driver for Linux + Copyright (C) 1998 Ingo Molnar + + HSM mode management functions. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/ + +#include + +#include +#include + +#include +#include + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER +#define MD_PERSONALITY + + +#define DEBUG_HSM 1 + +#if DEBUG_HSM +#define dprintk(x,y...) printk(x,##y) +#else +#define dprintk(x,y...) do { } while (0) +#endif + +void print_bh(struct buffer_head *bh) +{ + dprintk("bh %p: %lx %lx %x %x %lx %p %lx %p %x %p %x %lx\n", bh, + bh->b_blocknr, bh->b_size, bh->b_dev, bh->b_rdev, + bh->b_rsector, bh->b_this_page, bh->b_state, + bh->b_next_free, bh->b_count, bh->b_data, + bh->b_list, bh->b_flushtime + ); +} + +static int check_bg (pv_t *pv, pv_block_group_t * bg) +{ + int i, free = 0; + + dprintk("checking bg ...\n"); + + for (i = 0; i < pv->pv_sb->pv_bg_size-1; i++) { + if (pv_pptr_free(bg->blocks + i)) { + free++; + if (test_bit(i, bg->used_bitmap)) { + printk("hm, bit %d set?\n", i); + } + } else { + if (!test_bit(i, bg->used_bitmap)) { + printk("hm, bit %d not set?\n", i); + } + } + } + dprintk("%d free blocks in bg ...\n", free); + return free; +} + +static void get_bg (pv_t *pv, pv_bg_desc_t *desc, int nr) +{ + unsigned int bg_pos = nr * pv->pv_sb->pv_bg_size + 2; + struct buffer_head *bh; + + dprintk("... getting BG at %u ...\n", bg_pos); + + bh = bread (pv->dev, bg_pos, HSM_BLOCKSIZE); + if (!bh) { + MD_BUG(); + return; + } + desc->bg = (pv_block_group_t *) bh->b_data; + desc->free_blocks = check_bg(pv, desc->bg); +} + +static int find_free_block (lv_t *lv, pv_t *pv, pv_bg_desc_t *desc, int nr, + unsigned int lblock, lv_lptr_t * index) +{ + int i; + + for (i = 0; i < pv->pv_sb->pv_bg_size-1; i++) { + pv_pptr_t * bptr = desc->bg->blocks + i; + if (pv_pptr_free(bptr)) { + unsigned int bg_pos = nr * pv->pv_sb->pv_bg_size + 2; + + if (test_bit(i, desc->bg->used_bitmap)) { + MD_BUG(); + continue; + } + bptr->u.used.owner.log_id = lv->log_id; + bptr->u.used.owner.log_index = lblock; + index->data.phys_nr = pv->phys_nr; + index->data.phys_block = bg_pos + i + 1; + set_bit(i, desc->bg->used_bitmap); + desc->free_blocks--; + dprintk(".....free blocks left in bg %p: %d\n", + desc->bg, desc->free_blocks); + return 0; + } + } + return -ENOSPC; +} + +static int __get_free_block (lv_t *lv, pv_t *pv, + unsigned int lblock, lv_lptr_t * index) +{ + int i; + + dprintk("trying to get free block for lblock %d ...\n", lblock); + + for (i = 0; i < pv->pv_sb->pv_block_groups; i++) { + pv_bg_desc_t *desc = pv->bg_array + i; + + dprintk("looking at desc #%d (%p)...\n", i, desc->bg); + if (!desc->bg) + get_bg(pv, desc, i); + + if (desc->bg && desc->free_blocks) + return find_free_block(lv, pv, desc, i, + lblock, index); + } + dprintk("hsm: pv %s full!\n", partition_name(pv->dev)); + return -ENOSPC; +} + +static int get_free_block (lv_t *lv, unsigned int lblock, lv_lptr_t * index) +{ + int err; + + if (!lv->free_indices) + return -ENOSPC; + + /* fix me */ + err = __get_free_block(lv, lv->vg->pv_array + 0, lblock, index); + + if (err || !index->data.phys_block) { + MD_BUG(); + return -ENOSPC; + } + + lv->free_indices--; + + return 0; +} + +/* + * fix me: wordsize assumptions ... 
+ */ +#define INDEX_BITS 8 +#define INDEX_DEPTH (32/INDEX_BITS) +#define INDEX_MASK ((1<<INDEX_BITS) - 1) + +static void print_index_list (lv_t *lv, lv_lptr_t *index) +{ + lv_lptr_t *tmp; + int i; + + dprintk("index group block <%u,%u,%x> [.", index->data.phys_nr, + index->data.phys_block, index->cpu_addr); + + tmp = index_child(index); + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) { + if (index_block(lv, tmp)) + dprintk("(%d->%d)", i, index_block(lv, tmp)); + tmp++; + } + dprintk(".]\n"); +} + +static int read_index_group (lv_t *lv, lv_lptr_t *index) +{ + lv_lptr_t *index_group, *tmp; + struct buffer_head *bh; + int i; + + dprintk("reading index group <%s:%d>\n", + partition_name(index_dev(lv, index)), index_block(lv, index)); + + bh = bread(index_dev(lv, index), index_block(lv, index), HSM_BLOCKSIZE); + if (!bh) { + MD_BUG(); + return -EIO; + } + if (!buffer_uptodate(bh)) + MD_BUG(); + + index_group = (lv_lptr_t *) bh->b_data; + tmp = index_group; + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) { + if (index_block(lv, tmp)) { + dprintk("index group has BLOCK %d, non-present.\n", i); + tmp->cpu_addr = 0; + } + tmp++; + } + index->cpu_addr = ptr_to_cpuaddr(index_group); + + dprintk("have read index group %p at block %d.\n", + index_group, index_block(lv, index)); + print_index_list(lv, index); + + return 0; +} + +static int alloc_index_group (lv_t *lv, unsigned int lblock, lv_lptr_t * index) +{ + struct buffer_head *bh; + lv_lptr_t * index_group; + + if (get_free_block(lv, lblock, index)) + return -ENOSPC; + + dprintk("creating block for index group <%s:%d>\n", + partition_name(index_dev(lv, index)), index_block(lv, index)); + + bh = getblk(index_dev(lv, index), + index_block(lv, index), HSM_BLOCKSIZE); + + index_group = (lv_lptr_t *) bh->b_data; + md_clear_page(index_group); + mark_buffer_uptodate(bh, 1); + + index->cpu_addr = ptr_to_cpuaddr(index_group); + + dprintk("allocated index group %p at block %d.\n", + index_group, index_block(lv, index)); + return 0; +} + +static lv_lptr_t * alloc_fixed_index (lv_t *lv, unsigned int lblock) +{ + lv_lptr_t * index = index_child(&lv->root_index); + int idx, l; + + for (l = INDEX_DEPTH-1; l >= 0; l--) { + idx = (lblock >> (INDEX_BITS*l)) & INDEX_MASK; + index += idx; + if (!l) + break; + if (!index_present(index)) { + dprintk("no group, level %u, pos %u\n", l, idx); + if (alloc_index_group(lv, lblock, index)) + return NULL; + } + index = index_child(index); + } + if (!index_block(lv,index)) { + dprintk("no data, pos %u\n", idx); + if (get_free_block(lv, lblock, index)) + return NULL; + return index; + } + MD_BUG(); + return index; +} + +static lv_lptr_t * find_index (lv_t *lv, unsigned int lblock) +{ + lv_lptr_t * index = index_child(&lv->root_index); + int idx, l; + + for (l = INDEX_DEPTH-1; l >= 0; l--) { + idx = (lblock >> (INDEX_BITS*l)) & INDEX_MASK; + index += idx; + if (!l) + break; + if (index_free(index)) + return NULL; + if (!index_present(index)) + read_index_group(lv, index); + if (!index_present(index)) { + MD_BUG(); + return NULL; + } + index = index_child(index); + } + if (!index_block(lv,index)) + return NULL; + return index; +} + +static int read_root_index(lv_t *lv) +{ + int err; + lv_lptr_t *index = &lv->root_index; + + if (!index_block(lv, index)) { + printk("LV has no root index yet, creating.\n"); + + err = alloc_index_group (lv, 0, index); + if (err) { + printk("could not create index group, err:%d\n", err); + return err; + } + lv->vg->vg_sb->lv_array[lv->log_id].lv_root_idx = + lv->root_index.data; + } else { + printk("LV already has a root index.\n"); + printk("...
at <%s:%d>.\n", + partition_name(index_dev(lv, index)), + index_block(lv, index)); + + read_index_group(lv, index); + } + return 0; +} + +static int init_pv(pv_t *pv) +{ + struct buffer_head *bh; + pv_sb_t *pv_sb; + + bh = bread (pv->dev, 0, HSM_BLOCKSIZE); + if (!bh) { + MD_BUG(); + return -1; + } + + pv_sb = (pv_sb_t *) bh->b_data; + pv->pv_sb = pv_sb; + + if (pv_sb->pv_magic != HSM_PV_SB_MAGIC) { + printk("%s is not a PV, has magic %x instead of %x!\n", + partition_name(pv->dev), pv_sb->pv_magic, + HSM_PV_SB_MAGIC); + return -1; + } + printk("%s detected as a valid PV (#%d).\n", partition_name(pv->dev), + pv->phys_nr); + printk("... created under HSM version %d.%d.%d, at %x.\n", + pv_sb->pv_major, pv_sb->pv_minor, pv_sb->pv_patch, pv_sb->pv_ctime); + printk("... total # of blocks: %d (%d left unallocated).\n", + pv_sb->pv_total_size, pv_sb->pv_blocks_left); + + printk("... block size: %d bytes.\n", pv_sb->pv_block_size); + printk("... block descriptor size: %d bytes.\n", pv_sb->pv_pptr_size); + printk("... block group size: %d blocks.\n", pv_sb->pv_bg_size); + printk("... # of block groups: %d.\n", pv_sb->pv_block_groups); + + if (pv_sb->pv_block_groups*sizeof(pv_bg_desc_t) > PAGE_SIZE) { + MD_BUG(); + return 1; + } + pv->bg_array = (pv_bg_desc_t *)__get_free_page(GFP_KERNEL); + if (!pv->bg_array) { + MD_BUG(); + return 1; + } + memset(pv->bg_array, 0, PAGE_SIZE); + + return 0; +} + +static int free_pv(pv_t *pv) +{ + struct buffer_head *bh; + + dprintk("freeing PV %d ...\n", pv->phys_nr); + + if (pv->bg_array) { + int i; + + dprintk(".... freeing BGs ...\n"); + for (i = 0; i < pv->pv_sb->pv_block_groups; i++) { + unsigned int bg_pos = i * pv->pv_sb->pv_bg_size + 2; + pv_bg_desc_t *desc = pv->bg_array + i; + + if (desc->bg) { + dprintk(".... freeing BG %d ...\n", i); + bh = getblk (pv->dev, bg_pos, HSM_BLOCKSIZE); + mark_buffer_dirty(bh, 1); + brelse(bh); + brelse(bh); + } + } + free_page((unsigned long)pv->bg_array); + } else + MD_BUG(); + + bh = getblk (pv->dev, 0, HSM_BLOCKSIZE); + if (!bh) { + MD_BUG(); + return -1; + } + mark_buffer_dirty(bh, 1); + brelse(bh); + brelse(bh); + + return 0; +} + +struct semaphore hsm_sem = MUTEX; + +#define HSM_SECTORS (HSM_BLOCKSIZE/512) + +static int hsm_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev, + unsigned long *rsector, unsigned long bsectors) +{ + lv_t *lv = kdev_to_lv(dev); + lv_lptr_t *index; + unsigned int lblock = *rsector / HSM_SECTORS; + unsigned int offset = *rsector % HSM_SECTORS; + int err = -EIO; + + if (!lv) { + printk("HSM: md%d not a Logical Volume!\n", mdidx(mddev)); + goto out; + } + if (offset + bsectors > HSM_SECTORS) { + MD_BUG(); + goto out; + } + down(&hsm_sem); + index = find_index(lv, lblock); + if (!index) { + printk("no block %u yet ... 
allocating\n", lblock); + index = alloc_fixed_index(lv, lblock); + } + + err = 0; + + printk(" %u <%s : %ld(%ld)> -> ", lblock, + partition_name(*rdev), *rsector, bsectors); + + *rdev = index_dev(lv, index); + *rsector = index_block(lv, index) * HSM_SECTORS + offset; + + printk(" <%s : %ld> %u\n", + partition_name(*rdev), *rsector, index_block(lv, index)); + + up(&hsm_sem); +out: + return err; +} + +static void free_index (lv_t *lv, lv_lptr_t * index) +{ + struct buffer_head *bh; + + printk("trying to get cached block for index group <%s:%d>\n", + partition_name(index_dev(lv, index)), index_block(lv, index)); + + bh = getblk(index_dev(lv, index), index_block(lv, index),HSM_BLOCKSIZE); + + printk("....FREEING "); + print_index_list(lv, index); + + if (bh) { + if (!buffer_uptodate(bh)) + MD_BUG(); + if ((lv_lptr_t *)bh->b_data != index_child(index)) { + printk("huh? b_data is %p, index content is %p.\n", + bh->b_data, index_child(index)); + } else + printk("good, b_data == index content == %p.\n", + index_child(index)); + printk("b_count == %d, writing.\n", bh->b_count); + mark_buffer_dirty(bh, 1); + brelse(bh); + brelse(bh); + printk("done.\n"); + } else { + printk("FAILED!\n"); + } + print_index_list(lv, index); + index_child(index) = NULL; +} + +static void free_index_group (lv_t *lv, int level, lv_lptr_t * index_0) +{ + char dots [3*8]; + lv_lptr_t * index; + int i, nr_dots; + + nr_dots = (INDEX_DEPTH-level)*3; + memcpy(dots,"...............",nr_dots); + dots[nr_dots] = 0; + + dprintk("%s level %d index group block:\n", dots, level); + + + index = index_0; + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) { + if (index->data.phys_block) { + dprintk("%s block <%u,%u,%x>\n", dots, + index->data.phys_nr, + index->data.phys_block, + index->cpu_addr); + if (level && index_present(index)) { + dprintk("%s==> deeper one level\n", dots); + free_index_group(lv, level-1, + index_child(index)); + dprintk("%s freeing index group block %p ...", + dots, index_child(index)); + free_index(lv, index); + } + } + index++; + } + dprintk("%s DONE: level %d index group block.\n", dots, level); +} + +static void free_lv_indextree (lv_t *lv) +{ + dprintk("freeing LV %d ...\n", lv->log_id); + dprintk("..root index: %p\n", index_child(&lv->root_index)); + dprintk("..INDEX TREE:\n"); + free_index_group(lv, INDEX_DEPTH-1, index_child(&lv->root_index)); + dprintk("..freeing root index %p ...", index_child(&lv->root_index)); + dprintk("root block <%u,%u,%x>\n", lv->root_index.data.phys_nr, + lv->root_index.data.phys_block, lv->root_index.cpu_addr); + free_index(lv, &lv->root_index); + dprintk("..INDEX TREE done.\n"); + fsync_dev(lv->vg->pv_array[0].dev); /* fix me */ + lv->vg->vg_sb->lv_array[lv->log_id].lv_free_indices = lv->free_indices; +} + +static void print_index_group (lv_t *lv, int level, lv_lptr_t * index_0) +{ + char dots [3*5]; + lv_lptr_t * index; + int i, nr_dots; + + nr_dots = (INDEX_DEPTH-level)*3; + memcpy(dots,"...............",nr_dots); + dots[nr_dots] = 0; + + dprintk("%s level %d index group block:\n", dots, level); + + + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) { + index = index_0 + i; + if (index->data.phys_block) { + dprintk("%s block <%u,%u,%x>\n", dots, + index->data.phys_nr, + index->data.phys_block, + index->cpu_addr); + if (level && index_present(index)) { + dprintk("%s==> deeper one level\n", dots); + print_index_group(lv, level-1, + index_child(index)); + } + } + } + dprintk("%s DONE: level %d index group block.\n", dots, level); +}
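/*
 * Illustrative aside (editor's sketch, not part of the 2.2.18aa1 patch):
 * the index tree walked by alloc_fixed_index()/find_index() above is a
 * radix tree with INDEX_BITS-wide levels; with INDEX_BITS == 8 and
 * INDEX_DEPTH == 4 a 32-bit logical block number decomposes into four
 * 8-bit slot indices, most significant byte first:
 */
#include <stdio.h>

#define INDEX_BITS  8
#define INDEX_DEPTH (32 / INDEX_BITS)
#define INDEX_MASK  ((1 << INDEX_BITS) - 1)

int main(void)
{
	unsigned int lblock = 0x012a45ffu;	/* example logical block */
	int l;

	/* the same per-level slot computation as the hsm.c tree walk */
	for (l = INDEX_DEPTH - 1; l >= 0; l--)
		printf("level %d -> slot %u\n",
		       l, (lblock >> (INDEX_BITS * l)) & INDEX_MASK);
	return 0;
}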
+ +static void print_lv (lv_t *lv) +{ + dprintk("printing LV %d ...\n", lv->log_id); + dprintk("..root index: %p\n", index_child(&lv->root_index)); + dprintk("..INDEX TREE:\n"); + print_index_group(lv, INDEX_DEPTH-1, index_child(&lv->root_index)); + dprintk("..INDEX TREE done.\n"); +} + +static int map_lv (lv_t *lv) +{ + kdev_t dev = lv->dev; + unsigned int nr = MINOR(dev); + mddev_t *mddev = lv->vg->mddev; + + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return -1; + } + if (kdev_to_mddev(dev)) { + MD_BUG(); + return -1; + } + md_hd_struct[nr].start_sect = 0; + md_hd_struct[nr].nr_sects = md_size[mdidx(mddev)] << 1; + md_size[nr] = md_size[mdidx(mddev)]; + add_mddev_mapping(mddev, dev, lv); + + return 0; +} + +static int unmap_lv (lv_t *lv) +{ + kdev_t dev = lv->dev; + unsigned int nr = MINOR(dev); + + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return -1; + } + md_hd_struct[nr].start_sect = 0; + md_hd_struct[nr].nr_sects = 0; + md_size[nr] = 0; + del_mddev_mapping(lv->vg->mddev, dev); + + return 0; +} + +static int init_vg (vg_t *vg) +{ + int i; + lv_t *lv; + kdev_t dev; + vg_sb_t *vg_sb; + struct buffer_head *bh; + lv_descriptor_t *lv_desc; + + /* + * fix me: read all PVs and compare the SB + */ + dev = vg->pv_array[0].dev; + bh = bread (dev, 1, HSM_BLOCKSIZE); + if (!bh) { + MD_BUG(); + return -1; + } + + vg_sb = (vg_sb_t *) bh->b_data; + vg->vg_sb = vg_sb; + + if (vg_sb->vg_magic != HSM_VG_SB_MAGIC) { + printk("%s is not a valid VG, has magic %x instead of %x!\n", + partition_name(dev), vg_sb->vg_magic, + HSM_VG_SB_MAGIC); + return -1; + } + + vg->nr_lv = 0; + for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) { + unsigned int id; + lv_desc = vg->vg_sb->lv_array + i; + + id = lv_desc->lv_id; + if (!id) { + printk("... LV desc %d empty\n", i); + continue; + } + if (id >= HSM_MAX_LVS_PER_VG) { + MD_BUG(); + continue; + } + + lv = vg->lv_array + id; + if (lv->vg) { + MD_BUG(); + continue; + } + lv->log_id = id; + lv->vg = vg; + lv->max_indices = lv_desc->lv_max_indices; + lv->free_indices = lv_desc->lv_free_indices; + lv->root_index.data = lv_desc->lv_root_idx; + lv->dev = MKDEV(MD_MAJOR, lv_desc->md_id); + + vg->nr_lv++; + + map_lv(lv); + if (read_root_index(lv)) { + vg->nr_lv--; + unmap_lv(lv); + memset(lv, 0, sizeof(*lv)); + } + } + if (vg->nr_lv != vg_sb->nr_lvs) + MD_BUG(); + + return 0; +} + +static int hsm_run (mddev_t *mddev) +{ + int i; + vg_t *vg; + mdk_rdev_t *rdev; + + MOD_INC_USE_COUNT; + + vg = kmalloc (sizeof (*vg), GFP_KERNEL); + if (!vg) + goto out; + memset(vg, 0, sizeof(*vg)); + mddev->private = vg; + vg->mddev = mddev; + + if (md_check_ordering(mddev)) { + printk("hsm: disks are not ordered, aborting!\n"); + goto out; + } + + set_blocksize (mddev_to_kdev(mddev), HSM_BLOCKSIZE); + + vg->nr_pv = mddev->nb_dev; + ITERATE_RDEV_ORDERED(mddev,rdev,i) { + pv_t *pv = vg->pv_array + i; + + pv->dev = rdev->dev; + fsync_dev (pv->dev); + set_blocksize (pv->dev, HSM_BLOCKSIZE); + pv->phys_nr = i; + if (init_pv(pv)) + goto out; + } + + init_vg(vg); + + return 0; + +out: + if (vg) { + kfree(vg); + mddev->private = NULL; + } + MOD_DEC_USE_COUNT; + + return 1; +} + +static int hsm_stop (mddev_t *mddev) +{ + lv_t *lv; + vg_t *vg; + int i; + + vg = mddev_to_vg(mddev); + + for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) { + lv = vg->lv_array + i; + if (!lv->log_id) + continue; + print_lv(lv); + free_lv_indextree(lv); + unmap_lv(lv); + } + for (i = 0; i < vg->nr_pv; i++) + free_pv(vg->pv_array + i); + + kfree(vg); + + MOD_DEC_USE_COUNT; + + return 0; +} + + +static int hsm_status (char *page, mddev_t *mddev) +{ + int sz = 0, i; + lv_t *lv; + vg_t *vg; + + vg =
mddev_to_vg(mddev); + + for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) { + lv = vg->lv_array + i; + if (!lv->log_id) + continue; + sz += sprintf(page+sz, " <lv%d: %d/%d>", lv->log_id, + lv->max_indices - lv->free_indices, lv->max_indices); + } + return sz; +} + + +static mdk_personality_t hsm_personality= +{ + "hsm", + hsm_map, + NULL, + NULL, + hsm_run, + hsm_stop, + hsm_status, + NULL, + 0, + NULL, + NULL, + NULL, + NULL +}; + +#ifndef MODULE + +md__initfunc(void hsm_init (void)) +{ + register_md_personality (HSM, &hsm_personality); +} + +#else + +int init_module (void) +{ + return (register_md_personality (HSM, &hsm_personality)); +} + +void cleanup_module (void) +{ + unregister_md_personality (HSM); +} + +#endif + +/* + * This Linus-trick catches bugs via the linker. + */ + +extern void __BUG__in__hsm_dot_c_1(void); +extern void __BUG__in__hsm_dot_c_2(void); +extern void __BUG__in__hsm_dot_c_3(void); +extern void __BUG__in__hsm_dot_c_4(void); +extern void __BUG__in__hsm_dot_c_5(void); +extern void __BUG__in__hsm_dot_c_6(void); +extern void __BUG__in__hsm_dot_c_7(void); + +void bugcatcher (void) +{ + if (sizeof(pv_block_group_t) != HSM_BLOCKSIZE) + __BUG__in__hsm_dot_c_1(); + if (sizeof(lv_index_block_t) != HSM_BLOCKSIZE) + __BUG__in__hsm_dot_c_2(); + + if (sizeof(pv_sb_t) != HSM_BLOCKSIZE) + __BUG__in__hsm_dot_c_4(); + if (sizeof(lv_sb_t) != HSM_BLOCKSIZE) + __BUG__in__hsm_dot_c_3(); + if (sizeof(vg_sb_t) != HSM_BLOCKSIZE) + __BUG__in__hsm_dot_c_6(); + + if (sizeof(lv_lptr_t) != 16) + __BUG__in__hsm_dot_c_5(); + if (sizeof(pv_pptr_t) != 16) + __BUG__in__hsm_dot_c_6(); +} + diff -urN 2.2.18/drivers/block/ide-probe.c 2.2.18aa1/drivers/block/ide-probe.c --- 2.2.18/drivers/block/ide-probe.c Mon Dec 11 16:57:48 2000 +++ 2.2.18aa1/drivers/block/ide-probe.c Mon Dec 11 17:20:54 2000 @@ -393,6 +393,7 @@ extern struct drive_info_struct drive_info; byte cmos_disks, *BIOS = (byte *) &drive_info; int unit; + extern spinlock_t rtc_lock; #ifdef CONFIG_BLK_DEV_PDC4030 if (hwif->chipset == ide_pdc4030 && hwif->channel != 0) @@ -699,6 +700,7 @@ gd->init = &ide_geninit; /* initialization function */ gd->real_devices= hwif; /* ptr to internal data */ gd->next = NULL; /* linked list of major devs */ + gd->device_names = NULL; for (gdp = &gendisk_head; *gdp; gdp = &((*gdp)->next)) ; hwif->gd = *gdp = gd; /* link onto tail of list */ diff -urN 2.2.18/drivers/block/linear.c 2.2.18aa1/drivers/block/linear.c --- 2.2.18/drivers/block/linear.c Mon Jan 17 16:44:36 2000 +++ 2.2.18aa1/drivers/block/linear.c Mon Dec 11 17:20:54 2000 @@ -1,4 +1,3 @@ - /* linear.c : Multiple Devices driver for Linux Copyright (C) 1994-96 Marc ZYNGIER @@ -19,186 +18,207 @@ #include -#include +#include #include -#include -#include "linear.h" +#include #define MAJOR_NR MD_MAJOR #define MD_DRIVER #define MD_PERSONALITY -static int linear_run (int minor, struct md_dev *mddev) +static int linear_run (mddev_t *mddev) { - int cur=0, i, size, dev0_size, nb_zone; - struct linear_data *data; - - MOD_INC_USE_COUNT; - - mddev->private=kmalloc (sizeof (struct linear_data), GFP_KERNEL); - data=(struct linear_data *) mddev->private; - - /* - Find out the smallest device. This was previously done - at registry time, but since it violates modularity, - I moved it here... Any comment ? ;-) - */ - - data->smallest=mddev->devices; - for (i=1; i<mddev->nb_dev; i++) - if (data->smallest->size > mddev->devices[i].size) - data->smallest=mddev->devices+i; - - nb_zone=data->nr_zones= - md_size[minor]/data->smallest->size + - (md_size[minor]%data->smallest->size ?
1 : 0); - data->hash_table=kmalloc (sizeof (struct linear_hash)*nb_zone, GFP_KERNEL); - size=mddev->devices[cur].size; + linear_conf_t *conf; + struct linear_hash *table; + mdk_rdev_t *rdev; + int size, i, j, nb_zone; + unsigned int curr_offset; + + MOD_INC_USE_COUNT; + + conf = kmalloc (sizeof (*conf), GFP_KERNEL); + if (!conf) + goto out; + mddev->private = conf; + + if (md_check_ordering(mddev)) { + printk("linear: disks are not ordered, aborting!\n"); + goto out; + } + /* + * Find the smallest device. + */ + + conf->smallest = NULL; + curr_offset = 0; + ITERATE_RDEV_ORDERED(mddev,rdev,j) { + dev_info_t *disk = conf->disks + j; + + disk->dev = rdev->dev; + disk->size = rdev->size; + disk->offset = curr_offset; + + curr_offset += disk->size; + + if (!conf->smallest || (disk->size < conf->smallest->size)) + conf->smallest = disk; + } + + nb_zone = conf->nr_zones = + md_size[mdidx(mddev)] / conf->smallest->size + + ((md_size[mdidx(mddev)] % conf->smallest->size) ? 1 : 0); + + conf->hash_table = kmalloc (sizeof (struct linear_hash) * nb_zone, + GFP_KERNEL); + if (!conf->hash_table) + goto out; + + /* + * Here we generate the linear hash table + */ + table = conf->hash_table; + i = 0; + size = 0; + for (j = 0; j < mddev->nb_dev; j++) { + dev_info_t *disk = conf->disks + j; + + if (size < 0) { + table->dev1 = disk; + table++; + } + size += disk->size; + + while (size) { + table->dev0 = disk; + size -= conf->smallest->size; + if (size < 0) + break; + table->dev1 = NULL; + table++; + } + } + table->dev1 = NULL; + + return 0; + +out: + if (conf) + kfree(conf); + MOD_DEC_USE_COUNT; + return 1; +} + +static int linear_stop (mddev_t *mddev) +{ + linear_conf_t *conf = mddev_to_conf(mddev); + + kfree(conf->hash_table); + kfree(conf); - i=0; - while (cur<mddev->nb_dev) - { - data->hash_table[i].dev0=mddev->devices+cur; + MOD_DEC_USE_COUNT; - if (size>=data->smallest->size) /* If we completely fill the slot */ - { - data->hash_table[i++].dev1=NULL; - size-=data->smallest->size; - - if (!size) - { - if (++cur==mddev->nb_dev) continue; - size=mddev->devices[cur].size; - } - - continue; - } - - if (++cur==mddev->nb_dev) /* Last dev, set dev1 as NULL */ - { - data->hash_table[i].dev1=NULL; - continue; - } - - dev0_size=size; /* Here, we use a 2nd dev to fill the slot */ - size=mddev->devices[cur].size; - data->hash_table[i++].dev1=mddev->devices+cur; - size-=(data->smallest->size - dev0_size); - } - - return 0; -} - -static int linear_stop (int minor, struct md_dev *mddev) -{ - struct linear_data *data=(struct linear_data *) mddev->private; - - kfree (data->hash_table); - kfree (data); - - MOD_DEC_USE_COUNT; - - return 0; + return 0; }
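/*
 * Illustrative aside (editor's sketch, not part of the 2.2.18aa1 patch):
 * linear_run() above sizes every hash zone to the smallest member disk,
 * so linear_map() can find the right disk with one division.  With two
 * hypothetical disks of 250 and 100 blocks the zone size is 100, and
 * zone 2 (blocks 200-299) spills from the first disk onto the second:
 */
#include <stdio.h>

struct disk { long size, offset; };		/* sizes in blocks */
struct zone { struct disk *dev0, *dev1; };

int main(void)
{
	struct disk d0 = { 250, 0 }, d1 = { 100, 250 };
	struct zone zones[4] = {
		{ &d0, NULL }, { &d0, NULL },	/* zones 0-1: all on d0 */
		{ &d0, &d1 }, { &d1, NULL },	/* zone 2 spans d0 and d1 */
	};
	long smallest = 100, block = 260;
	struct zone *h = &zones[block / smallest];	/* O(1) lookup */
	struct disk *t = (block >= h->dev0->size + h->dev0->offset)
			 ? h->dev1 : h->dev0;		/* same test as the driver */

	printf("block %ld -> disk at %ld, relative block %ld\n",
	       block, t->offset, block - t->offset);
	return 0;
}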
-static int linear_map (struct md_dev *mddev, kdev_t *rdev, +static int linear_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev, unsigned long *rsector, unsigned long size) { - struct linear_data *data=(struct linear_data *) mddev->private; - struct linear_hash *hash; - struct real_dev *tmp_dev; - long block; - - block=*rsector >> 1; - hash=data->hash_table+(block/data->smallest->size); - - if (block >= (hash->dev0->size + hash->dev0->offset)) - { - if (!hash->dev1) - { - printk ("linear_map : hash->dev1==NULL for block %ld\n", block); - return (-1); - } - - tmp_dev=hash->dev1; - } - else - tmp_dev=hash->dev0; + linear_conf_t *conf = mddev_to_conf(mddev); + struct linear_hash *hash; + dev_info_t *tmp_dev; + long block; + + block = *rsector >> 1; + hash = conf->hash_table + (block / conf->smallest->size); + + if (block >= (hash->dev0->size + hash->dev0->offset)) + { + if (!hash->dev1) + { + printk ("linear_map : hash->dev1==NULL for block %ld\n", + block); + return -1; + } + tmp_dev = hash->dev1; + } else + tmp_dev = hash->dev0; - if (block >= (tmp_dev->size + tmp_dev->offset) || block < tmp_dev->offset) - printk ("Block %ld out of bounds on dev %s size %d offset %d\n", - block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset); + if (block >= (tmp_dev->size + tmp_dev->offset) + || block < tmp_dev->offset) + printk ("Block %ld out of bounds on dev %s size %d offset %d\n", + block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset); - *rdev=tmp_dev->dev; - *rsector=(block-(tmp_dev->offset)) << 1; + *rdev = tmp_dev->dev; + *rsector = (block - tmp_dev->offset) << 1; - return (0); + return 0; } -static int linear_status (char *page, int minor, struct md_dev *mddev) +static int linear_status (char *page, mddev_t *mddev) { - int sz=0; + int sz=0; #undef MD_DEBUG #ifdef MD_DEBUG - int j; - struct linear_data *data=(struct linear_data *) mddev->private; + int j; + linear_conf_t *conf = mddev_to_conf(mddev); - sz+=sprintf (page+sz, " "); - for (j=0; j<data->nr_zones; j++) - { - sz+=sprintf (page+sz, "[%s", - partition_name (data->hash_table[j].dev0->dev)); - - if (data->hash_table[j].dev1) - sz+=sprintf (page+sz, "/%s] ", - partition_name(data->hash_table[j].dev1->dev)); - else - sz+=sprintf (page+sz, "] "); - } - - sz+=sprintf (page+sz, "\n"); + sz += sprintf(page+sz, " "); + for (j = 0; j < conf->nr_zones; j++) + { + sz += sprintf(page+sz, "[%s", + partition_name(conf->hash_table[j].dev0->dev)); + + if (conf->hash_table[j].dev1) + sz += sprintf(page+sz, "/%s] ", + partition_name(conf->hash_table[j].dev1->dev)); + else + sz += sprintf(page+sz, "] "); + } + sz += sprintf(page+sz, "\n"); #endif - sz+=sprintf (page+sz, " %dk rounding", 1<<FACTOR_SHIFT(FACTOR(mddev))); - - return sz; + sz += sprintf(page+sz, " %dk rounding", mddev->param.chunk_size/1024); + return sz; } -static struct md_personality linear_personality= +static mdk_personality_t linear_personality= { - "linear", - linear_map, - NULL, - NULL, - linear_run, - linear_stop, - linear_status, - NULL, /* no ioctls */ - 0 + "linear", + linear_map, + NULL, + NULL, + linear_run, + linear_stop, + linear_status, + NULL, + 0, + NULL, + NULL, + NULL, + NULL }; - #ifndef MODULE -__initfunc(void linear_init (void)) +md__initfunc(void linear_init (void)) { - register_md_personality (LINEAR, &linear_personality); + register_md_personality (LINEAR, &linear_personality); } #else int init_module (void) { - return (register_md_personality (LINEAR, &linear_personality)); + return (register_md_personality (LINEAR, &linear_personality)); } void cleanup_module (void) { - unregister_md_personality (LINEAR); + unregister_md_personality (LINEAR); } #endif + diff -urN 2.2.18/drivers/block/linear.h 2.2.18aa1/drivers/block/linear.h --- 2.2.18/drivers/block/linear.h Mon Jan 17 16:44:36 2000 +++ 2.2.18aa1/drivers/block/linear.h Thu Jan 1 01:00:00 1970 @@ -1,16 +0,0 @@ -#ifndef _LINEAR_H -#define _LINEAR_H - -struct linear_hash -{ - struct real_dev *dev0, *dev1; -}; - -struct linear_data -{ - struct linear_hash *hash_table; /* Dynamically allocated */ - struct real_dev *smallest; - int nr_zones; -}; - -#endif diff -urN 2.2.18/drivers/block/ll_rw_blk.c 2.2.18aa1/drivers/block/ll_rw_blk.c --- 2.2.18/drivers/block/ll_rw_blk.c Mon Dec 11 16:57:48 2000 +++ 2.2.18aa1/drivers/block/ll_rw_blk.c Mon Dec 11 17:20:55 2000 @@ -26,6 +26,7 @@ #include + /* * The request-struct contains all necessary data * to load a nr of sectors into memory @@ -53,6 +54,11 @@ spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED; /* + * per-major idle-IO
detection + */ +unsigned long io_events[MAX_BLKDEV] = {0, }; + +/* * used to wait on when there are no free requests */ struct wait_queue * wait_for_request; @@ -415,7 +421,8 @@ case COMPAQ_CISS_MAJOR+4: \ case COMPAQ_CISS_MAJOR+5: \ case COMPAQ_CISS_MAJOR+6: \ - case COMPAQ_CISS_MAJOR+7: + case COMPAQ_CISS_MAJOR+7: \ + case LOOP_MAJOR: #define elevator_starve_rest_of_queue(req) \ do { \ @@ -570,10 +577,9 @@ void make_request(int major, int rw, struct buffer_head * bh) { unsigned int sector, count; - struct request * req, * prev; + struct request * req, * prev, * freereq = NULL; int rw_ahead, max_req, max_sectors, max_segments; unsigned long flags; - int back, front; count = bh->b_size >> 9; sector = bh->b_rsector; @@ -583,6 +589,8 @@ return; /* Maybe the above fixes it, and maybe it doesn't boot. Life is interesting */ lock_buffer(bh); + if (!buffer_lowprio(bh)) + io_events[major]++; if (blk_size[major]) { unsigned long maxsector = (blk_size[major][MINOR(bh->b_rdev)] << 1) + 1; @@ -650,6 +658,7 @@ max_sectors = get_max_sectors(bh->b_rdev); max_segments = get_max_segments(bh->b_rdev); + again: /* * Now we acquire the request spinlock, we have to be mega careful * not to schedule or do something nonatomic @@ -659,6 +668,9 @@ if (!req) { /* MD and loop can't handle plugging without deadlocking */ if (major != MD_MAJOR && major != LOOP_MAJOR && +#if defined CONFIG_BLK_DEV_LVM || defined CONFIG_BLK_DEV_LVM_MODULE + major != LVM_BLK_MAJOR && +#endif major != DDV_MAJOR && major != NBD_MAJOR) plug_device(blk_dev + major); /* is atomic */ } else switch (major) { @@ -680,29 +692,23 @@ req = seek_to_not_starving_chunk(req); prev = NULL; - back = front = 0; do { if (req->cmd != rw) continue; if (req->rq_dev != bh->b_rdev) continue; - if (req->sector + req->nr_sectors == sector) - back = 1; - else if (req->sector - count == sector) - front = 1; - if (req->nr_sectors + count > max_sectors) continue; if (req->sem) continue; /* Can we add it to the end of this request? */ - if (back) { + if (req->sector + req->nr_sectors == sector) { if (req->bhtail->b_data + req->bhtail->b_size != bh->b_data) { if (req->nr_segments < max_segments) req->nr_segments++; - else break; + else continue; } req->bhtail->b_reqnext = bh; req->bhtail = bh; @@ -713,19 +719,19 @@ /* Can we now merge this req with the next? */ attempt_merge(req, max_sectors, max_segments); /* or to the beginning? */ - } else if (front) { + } else if (req->sector - count == sector) { /* * Check that we didn't seek on a starving request, * that could happen only at the first pass, thus * do that only if prev is NULL. */ if (!prev && ((req->cmd != READ && req->cmd != WRITE) || !req->elevator_latency)) - break; + continue; if (bh->b_data + bh->b_size != req->bh->b_data) { if (req->nr_segments < max_segments) req->nr_segments++; - else break; + else continue; } bh->b_reqnext = req->bh; req->bh = bh; @@ -745,15 +751,22 @@ continue; mark_buffer_clean(bh); + if (freereq) { + freereq->rq_status = RQ_INACTIVE; + wake_up(&wait_for_request); + } spin_unlock_irqrestore(&io_request_lock,flags); return; - } while (prev = req, - !front && !back && (req = req->next) != NULL); + } while (prev = req, (req = req->next) != NULL); } /* find an unused request. 
*/ - req = get_request(max_req, bh->b_rdev); + if (freereq) { + req = freereq; + freereq = NULL; + } else + req = get_request(max_req, bh->b_rdev); spin_unlock_irqrestore(&io_request_lock,flags); @@ -761,7 +774,8 @@ if (!req) { if (rw_ahead) goto end_io; - req = __get_request_wait(max_req, bh->b_rdev); + freereq = __get_request_wait(max_req, bh->b_rdev); + goto again; } /* fill up the request-info, and add it to the queue */ @@ -790,7 +804,7 @@ { unsigned int major; int correct_size; - struct blk_dev_struct * dev; + struct blk_dev_struct * dev, * tdev = NULL; int i; /* Make sure that the first block contains something reasonable */ @@ -803,7 +817,7 @@ dev = NULL; if ((major = MAJOR(bh[0]->b_dev)) < MAX_BLKDEV) dev = blk_dev + major; - if (!dev || !dev->request_fn) { + if (!dev || (!dev->request_fn && !dev->makerq_fn && !dev->map_fn)) { printk(KERN_ERR "ll_rw_block: Trying to read nonexistent block-device %s (%ld)\n", kdevname(bh[0]->b_dev), bh[0]->b_blocknr); @@ -819,7 +833,7 @@ /* Verify requested block sizes. */ for (i = 0; i < nr; i++) { - if (bh[i] && bh[i]->b_size != correct_size) { + if (bh[i]->b_size != correct_size) { printk(KERN_NOTICE "ll_rw_block: device %s: " "only %d-char blocks implemented (%lu)\n", kdevname(bh[0]->b_dev), @@ -827,18 +841,30 @@ goto sorry; } - /* Md remaps blocks now */ + /* LVM and MD remap blocks now */ bh[i]->b_rdev = bh[i]->b_dev; bh[i]->b_rsector=bh[i]->b_blocknr*(bh[i]->b_size >> 9); -#ifdef CONFIG_BLK_DEV_MD - if (major==MD_MAJOR && - md_map (MINOR(bh[i]->b_dev), &bh[i]->b_rdev, - &bh[i]->b_rsector, bh[i]->b_size >> 9)) { - printk (KERN_ERR - "Bad md_map in ll_rw_block\n"); - goto sorry; + + tdev = dev; + while (tdev->map_fn) { + kdev_t __rdev = bh[i]->b_rdev; + unsigned long __rsector = bh[i]->b_rsector; + + if (tdev->map_fn (bh[i]->b_rdev, &bh[i]->b_rdev, + &bh[i]->b_rsector, + bh[i]->b_size >> 9, rw)) { + printk (KERN_ERR "Bad map in ll_rw_block\n"); + goto sorry; + } + if (__rdev == bh[i]->b_rdev && + __rsector == bh[i]->b_rsector) + /* + * Break the loop if map_fn is a noop + * as it happens with raid1. 
+ */ + break; + tdev = blk_dev + MAJOR(bh[i]->b_rdev); } -#endif } if ((rw == WRITE || rw == WRITEA) && is_read_only(bh[0]->b_dev)) { @@ -848,16 +874,12 @@ } for (i = 0; i < nr; i++) { - if (bh[i]) { - set_bit(BH_Req, &bh[i]->b_state); -#ifdef CONFIG_BLK_DEV_MD - if (MAJOR(bh[i]->b_dev) == MD_MAJOR) { - md_make_request(MINOR (bh[i]->b_dev), rw, bh[i]); - continue; - } -#endif - make_request(MAJOR(bh[i]->b_rdev), rw, bh[i]); + set_bit(BH_Req, &bh[i]->b_state); + if (tdev->makerq_fn) { + tdev->makerq_fn(bh[i], rw); + continue; } + make_request(MAJOR(bh[i]->b_rdev), rw, bh[i]); } return; @@ -938,6 +960,8 @@ dev->request_fn = NULL; dev->queue = NULL; dev->current_request = NULL; + dev->map_fn = NULL; + dev->makerq_fn = NULL; dev->plug.rq_status = RQ_INACTIVE; dev->plug.cmd = -1; dev->plug.next = NULL; @@ -1037,6 +1061,9 @@ #ifdef CONFIG_SJCD sjcd_init(); #endif CONFIG_SJCD +#ifdef CONFIG_BLK_DEV_LVM + lvm_init(); +#endif #ifdef CONFIG_BLK_DEV_MD md_init(); #endif CONFIG_BLK_DEV_MD diff -urN 2.2.18/drivers/block/loop.c 2.2.18aa1/drivers/block/loop.c --- 2.2.18/drivers/block/loop.c Sun Apr 2 21:07:48 2000 +++ 2.2.18aa1/drivers/block/loop.c Mon Dec 11 17:20:50 2000 @@ -143,12 +143,12 @@ int size; if (S_ISREG(lo->lo_dentry->d_inode->i_mode)) - size = (lo->lo_dentry->d_inode->i_size - lo->lo_offset) / BLOCK_SIZE; + size = (lo->lo_dentry->d_inode->i_size - lo->lo_offset) >> BLOCK_SIZE_BITS; else { kdev_t lodev = lo->lo_device; if (blk_size[MAJOR(lodev)]) size = blk_size[MAJOR(lodev)][MINOR(lodev)] - - lo->lo_offset / BLOCK_SIZE; + (lo->lo_offset >> BLOCK_SIZE_BITS); else size = MAX_DISK_SIZE; } @@ -274,6 +274,8 @@ block++; } spin_lock_irq(&io_request_lock); + current_request->sector += current_request->current_nr_sectors; + current_request->nr_sectors -= current_request->current_nr_sectors; current_request->next=CURRENT; CURRENT=current_request; end_request(1); diff -urN 2.2.18/drivers/block/lvm-snap.c 2.2.18aa1/drivers/block/lvm-snap.c --- 2.2.18/drivers/block/lvm-snap.c Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/drivers/block/lvm-snap.c Mon Dec 11 17:20:49 2000 @@ -0,0 +1,607 @@ +/* + * kernel/lvm-snap.c + * + * Copyright (C) 2000 Andrea Arcangeli SuSE + * Heinz Mauelshagen, Sistina Software (persistent snapshots) + * + * LVM snapshot driver is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * LVM snapshot driver is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include + +#if LINUX_VERSION_CODE < KERNEL_VERSION ( 2, 3 ,0) +#include +#endif + + +static char *lvm_snap_version __attribute__ ((unused)) = "LVM 0.9 snapshot code (13/11/2000)\n"; + +#ifndef LockPage +#define LockPage(map) set_bit(PG_locked, &(map)->flags) +#endif + +extern const char *const lvm_name; +extern int lvm_blocksizes[]; + +void lvm_snapshot_release(lv_t *); + +uint lvm_pv_get_number(vg_t * vg, kdev_t rdev) +{ + uint p; + + for ( p = 0; p < vg->pv_max; p++) + { + if ( vg->pv[p] == NULL) continue; + if ( vg->pv[p]->pv_dev == rdev) break; + } + + return vg->pv[p]->pv_number; +} + + +#define hashfn(dev,block,mask,chunk_size) \ + ((HASHDEV(dev)^((block)/(chunk_size))) & (mask)) + +static inline lv_block_exception_t * +lvm_find_exception_table(kdev_t org_dev, unsigned long org_start, lv_t * lv) +{ + struct list_head * hash_table = lv->lv_snapshot_hash_table, * next; + unsigned long mask = lv->lv_snapshot_hash_mask; + int chunk_size = lv->lv_chunk_size; + lv_block_exception_t * ret; + int i = 0; + + hash_table = &hash_table[hashfn(org_dev, org_start, mask, chunk_size)]; + ret = NULL; + for (next = hash_table->next; next != hash_table; next = next->next) + { + lv_block_exception_t * exception; + + exception = list_entry(next, lv_block_exception_t, hash); + if (exception->rsector_org == org_start && + exception->rdev_org == org_dev) + { + if (i) + { + /* fun, isn't it? :) */ + list_del(next); + list_add(next, hash_table); + } + ret = exception; + break; + } + i++; + } + return ret; +} + +inline void lvm_hash_link(lv_block_exception_t * exception, + kdev_t org_dev, unsigned long org_start, + lv_t * lv) +{ + struct list_head * hash_table = lv->lv_snapshot_hash_table; + unsigned long mask = lv->lv_snapshot_hash_mask; + int chunk_size = lv->lv_chunk_size; + + hash_table = &hash_table[hashfn(org_dev, org_start, mask, chunk_size)]; + list_add(&exception->hash, hash_table); +} + +int lvm_snapshot_remap_block(kdev_t * org_dev, unsigned long * org_sector, + unsigned long pe_start, lv_t * lv) +{ + int ret; + unsigned long pe_off, pe_adjustment, __org_start; + kdev_t __org_dev; + int chunk_size = lv->lv_chunk_size; + lv_block_exception_t * exception; + + pe_off = pe_start % chunk_size; + pe_adjustment = (*org_sector-pe_off) % chunk_size; + __org_start = *org_sector - pe_adjustment; + __org_dev = *org_dev; + ret = 0; + exception = lvm_find_exception_table(__org_dev, __org_start, lv); + if (exception) + { + *org_dev = exception->rdev_new; + *org_sector = exception->rsector_new + pe_adjustment; + ret = 1; + } + return ret; +} + +void lvm_drop_snapshot(lv_t * lv_snap, const char * reason) +{ + kdev_t last_dev; + int i; + + /* no exception storage space available for this snapshot + or error on this snapshot --> release it */ + invalidate_buffers(lv_snap->lv_dev); + + for (i = last_dev = 0; i < lv_snap->lv_remap_ptr; i++) { + if ( lv_snap->lv_block_exception[i].rdev_new != last_dev) { + last_dev = lv_snap->lv_block_exception[i].rdev_new; + invalidate_buffers(last_dev); + } + } + + lvm_snapshot_release(lv_snap); + + printk(KERN_INFO + "%s -- giving up snapshotting %s on %s due to %s\n", + lvm_name, lv_snap->lv_snapshot_org->lv_name, lv_snap->lv_name, + reason); +}
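/*
 * Illustrative aside (editor's sketch, not part of the 2.2.18aa1 patch):
 * lvm_snapshot_remap_block() above redirects reads at chunk granularity:
 * the faulting sector is rounded down to its chunk boundary (the driver
 * additionally corrects for the physical-extent start offset, omitted
 * here) and that chunk start keys the exception table.  The same idea
 * with a trivial linear-scan "table":
 */
#include <stdio.h>

struct exception { unsigned long org_start, new_start; };	/* sectors */

int main(void)
{
	struct exception table[] = { { 1024, 90112 }, { 2048, 90176 } };
	unsigned long chunk = 64, sector = 1060, i;
	unsigned long start = sector - (sector % chunk);	/* 1024 */

	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
		if (table[i].org_start == start) {
			printf("sector %lu -> %lu (copied chunk)\n",
			       sector, table[i].new_start + (sector - start));
			return 0;
		}
	printf("sector %lu not copied yet; read the origin\n", sector);
	return 0;
}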
+static inline void lvm_snapshot_prepare_blocks(unsigned long * blocks, + unsigned long start, + int nr_sectors, + int blocksize) +{ + int i, sectors_per_block, nr_blocks; + + sectors_per_block = blocksize >> 9; + nr_blocks = nr_sectors / sectors_per_block; + start /= sectors_per_block; + + for (i = 0; i < nr_blocks; i++) + blocks[i] = start++; +} + +inline int lvm_get_blksize(kdev_t dev) +{ + int correct_size = BLOCK_SIZE, i, major; + + major = MAJOR(dev); + if (blksize_size[major]) + { + i = blksize_size[major][MINOR(dev)]; + if (i) + correct_size = i; + } + return correct_size; +} + +#ifdef DEBUG_SNAPSHOT +static inline void invalidate_snap_cache(unsigned long start, unsigned long nr, + kdev_t dev) +{ + struct buffer_head * bh; + int sectors_per_block, i, blksize, minor; + + minor = MINOR(dev); + blksize = lvm_blocksizes[minor]; + sectors_per_block = blksize >> 9; + nr /= sectors_per_block; + start /= sectors_per_block; + + for (i = 0; i < nr; i++) + { + bh = get_hash_table(dev, start++, blksize); + if (bh) + bforget(bh); + } +} +#endif + + +void lvm_snapshot_fill_COW_page(vg_t * vg, lv_t * lv_snap) +{ + int id = 0, is = lv_snap->lv_remap_ptr; + ulong blksize_snap; + lv_COW_table_disk_t * lv_COW_table = + ( lv_COW_table_disk_t *) page_address(lv_snap->lv_COW_table_page); + + if (is == 0) return; + is--; + blksize_snap = lvm_get_blksize(lv_snap->lv_block_exception[is].rdev_new); + is -= is % (blksize_snap / sizeof(lv_COW_table_disk_t)); + + memset(lv_COW_table, 0, blksize_snap); + for ( ; is < lv_snap->lv_remap_ptr; is++, id++) { + /* store new COW_table entry */ + lv_COW_table[id].pv_org_number = LVM_TO_DISK64(lvm_pv_get_number(vg, lv_snap->lv_block_exception[is].rdev_org)); + lv_COW_table[id].pv_org_rsector = LVM_TO_DISK64(lv_snap->lv_block_exception[is].rsector_org); + lv_COW_table[id].pv_snap_number = LVM_TO_DISK64(lvm_pv_get_number(vg, lv_snap->lv_block_exception[is].rdev_new)); + lv_COW_table[id].pv_snap_rsector = LVM_TO_DISK64(lv_snap->lv_block_exception[is].rsector_new); + } +} + + +/* + * writes a COW exception table sector to disk (HM) + * + */ + +int lvm_write_COW_table_block(vg_t * vg, + lv_t * lv_snap) +{ + int blksize_snap; + int end_of_table; + int idx = lv_snap->lv_remap_ptr, idx_COW_table; + int nr_pages_tmp; + int length_tmp; + ulong snap_pe_start, COW_table_sector_offset, + COW_entries_per_pe, COW_chunks_per_pe, COW_entries_per_block; + ulong blocks[1]; + const char * reason; + kdev_t snap_phys_dev; + struct kiobuf * iobuf = lv_snap->lv_iobuf; + struct page * page_tmp; + lv_COW_table_disk_t * lv_COW_table = + ( lv_COW_table_disk_t *) page_address(lv_snap->lv_COW_table_page); + + idx--; + + COW_chunks_per_pe = LVM_GET_COW_TABLE_CHUNKS_PER_PE(vg, lv_snap); + COW_entries_per_pe = LVM_GET_COW_TABLE_ENTRIES_PER_PE(vg, lv_snap); + + /* get physical address of destination chunk */ + snap_phys_dev = lv_snap->lv_block_exception[idx].rdev_new; + snap_pe_start = lv_snap->lv_block_exception[idx - (idx % COW_entries_per_pe)].rsector_new - lv_snap->lv_chunk_size; + + blksize_snap = lvm_get_blksize(snap_phys_dev); + + COW_entries_per_block = blksize_snap / sizeof(lv_COW_table_disk_t); + idx_COW_table = idx % COW_entries_per_pe % COW_entries_per_block; + + if ( idx_COW_table == 0) memset(lv_COW_table, 0, blksize_snap); + + /* sector offset into the on disk COW table */ + COW_table_sector_offset = (idx % COW_entries_per_pe) / (SECTOR_SIZE / sizeof(lv_COW_table_disk_t)); + + /* COW table block to write next */ + blocks[0] = (snap_pe_start + COW_table_sector_offset) >> (blksize_snap >> 10); + + /* store new COW_table entry */ + lv_COW_table[idx_COW_table].pv_org_number = LVM_TO_DISK64(lvm_pv_get_number(vg, lv_snap->lv_block_exception[idx].rdev_org)); + lv_COW_table[idx_COW_table].pv_org_rsector =
LVM_TO_DISK64(lv_snap->lv_block_exception[idx].rsector_org); + lv_COW_table[idx_COW_table].pv_snap_number = LVM_TO_DISK64(lvm_pv_get_number(vg, snap_phys_dev)); + lv_COW_table[idx_COW_table].pv_snap_rsector = LVM_TO_DISK64(lv_snap->lv_block_exception[idx].rsector_new); + + length_tmp = iobuf->length; + iobuf->length = blksize_snap; + page_tmp = iobuf->maplist[0]; + iobuf->maplist[0] = lv_snap->lv_COW_table_page; + nr_pages_tmp = iobuf->nr_pages; + iobuf->nr_pages = 1; + + if (brw_kiovec(WRITE, 1, &iobuf, snap_phys_dev, + blocks, blksize_snap) != blksize_snap) + goto fail_raw_write; + + + /* initialization of next COW exception table block with zeroes */ + end_of_table = idx % COW_entries_per_pe == COW_entries_per_pe - 1; + if (idx_COW_table % COW_entries_per_block == COW_entries_per_block - 1 || end_of_table) + { + /* don't go beyond the end */ + if (idx + 1 >= lv_snap->lv_remap_end) goto good_out; + + memset(lv_COW_table, 0, blksize_snap); + + if (end_of_table) + { + idx++; + snap_phys_dev = lv_snap->lv_block_exception[idx].rdev_new; + snap_pe_start = lv_snap->lv_block_exception[idx - (idx % COW_entries_per_pe)].rsector_new - lv_snap->lv_chunk_size; + blksize_snap = lvm_get_blksize(snap_phys_dev); + blocks[0] = snap_pe_start >> (blksize_snap >> 10); + } else blocks[0]++; + + if (brw_kiovec(WRITE, 1, &iobuf, snap_phys_dev, + blocks, blksize_snap) != blksize_snap) + goto fail_raw_write; + } + + + good_out: + iobuf->length = length_tmp; + iobuf->maplist[0] = page_tmp; + iobuf->nr_pages = nr_pages_tmp; + return 0; + + /* slow path */ + out: + lvm_drop_snapshot(lv_snap, reason); + return 1; + + fail_raw_write: + reason = "write error"; + goto out; +} + +/* + * copy on write handler for one snapshot logical volume + * + * read the original blocks and store it/them on the new one(s). + * if there is no exception storage space free any longer --> release snapshot. + * + * this routine gets called for each _first_ write to a physical chunk. 
+ */ +int lvm_snapshot_COW(kdev_t org_phys_dev, + unsigned long org_phys_sector, + unsigned long org_pe_start, + unsigned long org_virt_sector, + lv_t * lv_snap) +{ + const char * reason; + unsigned long org_start, snap_start, snap_phys_dev, virt_start, pe_off; + int idx = lv_snap->lv_remap_ptr, chunk_size = lv_snap->lv_chunk_size; + struct kiobuf * iobuf; + unsigned long blocks[KIO_MAX_SECTORS]; + int blksize_snap, blksize_org, min_blksize, max_blksize; + int max_sectors, nr_sectors; + + /* check if we are out of snapshot space */ + if (idx >= lv_snap->lv_remap_end) + goto fail_out_of_space; + + /* calculate physical boundaries of source chunk */ + pe_off = org_pe_start % chunk_size; + org_start = org_phys_sector - ((org_phys_sector-pe_off) % chunk_size); + virt_start = org_virt_sector - (org_phys_sector - org_start); + + /* calculate physical boundaries of destination chunk */ + snap_phys_dev = lv_snap->lv_block_exception[idx].rdev_new; + snap_start = lv_snap->lv_block_exception[idx].rsector_new; + +#ifdef DEBUG_SNAPSHOT + printk(KERN_INFO + "%s -- COW: " + "org %02d:%02d faulting %lu start %lu, " + "snap %02d:%02d start %lu, " + "size %d, pe_start %lu pe_off %lu, virt_sec %lu\n", + lvm_name, + MAJOR(org_phys_dev), MINOR(org_phys_dev), org_phys_sector, + org_start, + MAJOR(snap_phys_dev), MINOR(snap_phys_dev), snap_start, + chunk_size, + org_pe_start, pe_off, + org_virt_sector); +#endif + + iobuf = lv_snap->lv_iobuf; + + blksize_org = lvm_get_blksize(org_phys_dev); + blksize_snap = lvm_get_blksize(snap_phys_dev); + max_blksize = max(blksize_org, blksize_snap); + min_blksize = min(blksize_org, blksize_snap); + max_sectors = KIO_MAX_SECTORS * (min_blksize>>9); + + if (chunk_size % (max_blksize>>9)) + goto fail_blksize; + + while (chunk_size) + { + nr_sectors = min(chunk_size, max_sectors); + chunk_size -= nr_sectors; + + iobuf->length = nr_sectors << 9; + + lvm_snapshot_prepare_blocks(blocks, org_start, + nr_sectors, blksize_org); + if (brw_kiovec(READ, 1, &iobuf, org_phys_dev, + blocks, blksize_org) != (nr_sectors<<9)) + goto fail_raw_read; + + lvm_snapshot_prepare_blocks(blocks, snap_start, + nr_sectors, blksize_snap); + if (brw_kiovec(WRITE, 1, &iobuf, snap_phys_dev, + blocks, blksize_snap) != (nr_sectors<<9)) + goto fail_raw_write; + } + +#ifdef DEBUG_SNAPSHOT + /* invalidate the logical snapshot buffer cache */ + invalidate_snap_cache(virt_start, lv_snap->lv_chunk_size, + lv_snap->lv_dev); +#endif + + /* the original chunk is now stored on the snapshot volume + so update the exception table */ + lv_snap->lv_block_exception[idx].rdev_org = org_phys_dev; + lv_snap->lv_block_exception[idx].rsector_org = org_start; + + lvm_hash_link(lv_snap->lv_block_exception + idx, + org_phys_dev, org_start, lv_snap); + lv_snap->lv_remap_ptr = idx + 1; + if (lv_snap->lv_snapshot_use_rate > 0) { + if (lv_snap->lv_remap_ptr * 100 / lv_snap->lv_remap_end >= lv_snap->lv_snapshot_use_rate) + wake_up_interruptible(&lv_snap->lv_snapshot_wait); + } + return 0; + + /* slow path */ + out: + lvm_drop_snapshot(lv_snap, reason); + return 1; + + fail_out_of_space: + reason = "out of space"; + goto out; + fail_raw_read: + reason = "read error"; + goto out; + fail_raw_write: + reason = "write error"; + goto out; + fail_blksize: + reason = "blocksize error"; + goto out; +}
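/*
 * Illustrative aside (editor's sketch, not part of the 2.2.18aa1 patch):
 * the copy loop in lvm_snapshot_COW() above moves one chunk in batches
 * of at most max_sectors (bounded by KIO_MAX_SECTORS and the smaller
 * blocksize).  The batching in isolation, with made-up sizes:
 */
#include <stdio.h>

int main(void)
{
	long chunk_size = 256, max_sectors = 100;	/* sectors */

	while (chunk_size) {
		/* same clamp as the driver: min(chunk_size, max_sectors) */
		long nr = chunk_size < max_sectors ? chunk_size : max_sectors;
		chunk_size -= nr;
		printf("copy %ld sectors, %ld left\n", nr, chunk_size);
	}
	return 0;
}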
err = -ENOMEM; + iobuf->locked = 0; + iobuf->nr_pages = 0; + for (i = 0; i < nr_pages; i++) + { + struct page * page; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,27) + page = alloc_page(GFP_KERNEL); + if (!page) + goto out; +#else + { + unsigned long addr = __get_free_page(GFP_USER); + if (!addr) + goto out; + iobuf->pagelist[i] = addr; + page = mem_map + MAP_NR(addr); + } +#endif + + iobuf->maplist[i] = page; + iobuf->nr_pages++; + } + iobuf->offset = 0; + + err = 0; + out: + return err; +} + +static int calc_max_buckets(void) +{ + unsigned long mem; + + mem = num_physpages << PAGE_SHIFT; + mem /= 100; + mem *= 2; + mem /= sizeof(struct list_head); + + return mem; +} + +int lvm_snapshot_alloc_hash_table(lv_t * lv) +{ + int err; + unsigned long buckets, max_buckets, size; + struct list_head * hash; + + buckets = lv->lv_remap_end; + max_buckets = calc_max_buckets(); + buckets = min(buckets, max_buckets); + while (buckets & (buckets-1)) + buckets &= (buckets-1); + + size = buckets * sizeof(struct list_head); + + err = -ENOMEM; + hash = vmalloc(size); + lv->lv_snapshot_hash_table = hash; + + if (!hash) + goto out; + lv->lv_snapshot_hash_table_size = size; + + lv->lv_snapshot_hash_mask = buckets-1; + while (buckets--) + INIT_LIST_HEAD(hash+buckets); + err = 0; + out: + return err; +} + +int lvm_snapshot_alloc(lv_t * lv_snap) +{ + int err, blocksize, max_sectors; + + err = alloc_kiovec(1, &lv_snap->lv_iobuf); + if (err) + goto out; + + blocksize = lvm_blocksizes[MINOR(lv_snap->lv_dev)]; + max_sectors = KIO_MAX_SECTORS << (PAGE_SHIFT-9); + + err = lvm_snapshot_alloc_iobuf_pages(lv_snap->lv_iobuf, max_sectors); + if (err) + goto out_free_kiovec; + + err = lvm_snapshot_alloc_hash_table(lv_snap); + if (err) + goto out_free_kiovec; + + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,27) + lv_snap->lv_COW_table_page = alloc_page(GFP_KERNEL); +#else + { + unsigned long addr = __get_free_page(GFP_USER); + if (!addr) + goto out_free_kiovec; + lv_snap->lv_COW_table_page = mem_map + MAP_NR(addr); + } +#endif + if (!lv_snap->lv_COW_table_page) + goto out_free_kiovec; + + out: + return err; + + out_free_kiovec: + unmap_kiobuf(lv_snap->lv_iobuf); + free_kiovec(1, &lv_snap->lv_iobuf); + vfree(lv_snap->lv_snapshot_hash_table); + lv_snap->lv_snapshot_hash_table = NULL; + goto out; +} + +void lvm_snapshot_release(lv_t * lv) +{ + if (lv->lv_block_exception) + { + vfree(lv->lv_block_exception); + lv->lv_block_exception = NULL; + } + if (lv->lv_snapshot_hash_table) + { + vfree(lv->lv_snapshot_hash_table); + lv->lv_snapshot_hash_table = NULL; + lv->lv_snapshot_hash_table_size = 0; + } + if (lv->lv_iobuf) + { + unmap_kiobuf(lv->lv_iobuf); + free_kiovec(1, &lv->lv_iobuf); + lv->lv_iobuf = NULL; + } + if (lv->lv_COW_table_page) + { + free_page((ulong)lv->lv_COW_table_page); + lv->lv_COW_table_page = NULL; + } +} diff -urN 2.2.18/drivers/block/lvm.c 2.2.18aa1/drivers/block/lvm.c --- 2.2.18/drivers/block/lvm.c Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/drivers/block/lvm.c Mon Dec 11 17:20:55 2000 @@ -0,0 +1,3271 @@ +/* + * kernel/lvm.c + * + * Copyright (C) 1997 - 2000 Heinz Mauelshagen, Sistina Software + * + * February-November 1997 + * April-May,July-August,November 1998 + * January-March,May,July,September,October 1999 + * January,February,July,September-November 2000 + * + * + * LVM driver is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. 
+ * + * LVM driver is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + */ + +/* + * Changelog + * + * 09/11/1997 - added chr ioctls VG_STATUS_GET_COUNT + * and VG_STATUS_GET_NAMELIST + * 18/01/1998 - change lvm_chr_open/close lock handling + * 30/04/1998 - changed LV_STATUS ioctl to LV_STATUS_BYNAME and + * - added LV_STATUS_BYINDEX ioctl + * - used lvm_status_byname_req_t and + * lvm_status_byindex_req_t vars + * 04/05/1998 - added multiple device support + * 08/05/1998 - added support to set/clear extendable flag in volume group + * 09/05/1998 - changed output of lvm_proc_get_global_info() because of + * support for free (eg. longer) logical volume names + * 12/05/1998 - added spin_locks (thanks to Pascal van Dam + * ) + * 25/05/1998 - fixed handling of locked PEs in lvm_map() and lvm_chr_ioctl() + * 26/05/1998 - reactivated verify_area by access_ok + * 07/06/1998 - used vmalloc/vfree instead of kmalloc/kfree to go + * beyond 128/256 KB max allocation limit per call + * - #ifdef blocked spin_lock calls to avoid compile errors + * with 2.0.x + * 11/06/1998 - another enhancement to spinlock code in lvm_chr_open() + * and use of LVM_VERSION_CODE instead of my own macros + * (thanks to Michael Marxmeier ) + * 07/07/1998 - added statistics in lvm_map() + * 08/07/1998 - saved statistics in lvm_do_lv_extend_reduce() + * 25/07/1998 - used __initfunc macro + * 02/08/1998 - changes for official char/block major numbers + * 07/08/1998 - avoided init_module() and cleanup_module() to be static + * 30/08/1998 - changed VG lv_open counter from sum of LV lv_open counters + * to sum of LVs open (no matter how often each is) + * 01/09/1998 - fixed lvm_gendisk.part[] index error + * 07/09/1998 - added copying of lv_current_pe-array + * in LV_STATUS_BYINDEX ioctl + * 17/11/1998 - added KERN_* levels to printk + * 13/01/1999 - fixed LV index bug in lvm_do_lv_create() which hit lvrename + * 07/02/1999 - fixed spinlock handling bug in case of LVM_RESET + * by moving spinlock code from lvm_chr_open() + * to lvm_chr_ioctl() + * - added LVM_LOCK_LVM ioctl to lvm_chr_ioctl() + * - allowed LVM_RESET and retrieval commands to go ahead; + * only other update ioctls are blocked now + * - fixed pv->pe to NULL for pv_status + * - using lv_req structure in lvm_chr_ioctl() now + * - fixed NULL ptr reference bug in lvm_do_lv_extend_reduce() + * caused by uncontiguous PV array in lvm_chr_ioctl(VG_REDUCE) + * 09/02/1999 - changed BLKRASET and BLKRAGET in lvm_chr_ioctl() to + * handle logical volume private read ahead sector + * - implemented LV read_ahead handling with lvm_blk_read() + * and lvm_blk_write() + * 10/02/1999 - implemented 2.[12].* support function lvm_hd_name() + * to be used in drivers/block/genhd.c by disk_name() + * 12/02/1999 - fixed index bug in lvm_blk_ioctl(), HDIO_GETGEO + * - enhanced gendisk insert/remove handling + * 16/02/1999 - changed to dynamic block minor number allocation to + * have as many as 99 volume groups with 256 logical volumes + * as the grand total; this allows having 1 volume group with + * up to 256 logical volumes in it + * 21/02/1999 - added LV open count information to
proc filesystem + * - substituted redundant LVM_RESET code by calls + * to lvm_do_vg_remove() + * 22/02/1999 - used schedule_timeout() to be more responsive + * in case of lvm_do_vg_remove() with lots of logical volumes + * 19/03/1999 - fixed NULL pointer bug in module_init/lvm_init + * 17/05/1999 - used DECLARE_WAIT_QUEUE_HEAD macro (>2.3.0) + * - enhanced lvm_hd_name support + * 03/07/1999 - avoided use of KERNEL_VERSION macro based ifdefs and + * memcpy_tofs/memcpy_fromfs macro redefinitions + * 06/07/1999 - corrected reads/writes statistic counter copy in case + * of striped logical volume + * 28/07/1999 - implemented snapshot logical volumes + * - lvm_chr_ioctl + * - LV_STATUS_BYINDEX + * - LV_STATUS_BYNAME + * - lvm_do_lv_create + * - lvm_do_lv_remove + * - lvm_map + * - new lvm_snapshot_remap_block + * - new lvm_snapshot_remap_new_block + * 08/10/1999 - implemented support for multiple snapshots per + * original logical volume + * 12/10/1999 - support for 2.3.19 + * 11/11/1999 - support for 2.3.28 + * 21/11/1999 - changed lvm_map() interface to buffer_head based + * 19/12/1999 - support for 2.3.33 + * 01/01/2000 - changed locking concept in lvm_map(), + * lvm_do_vg_create() and lvm_do_lv_remove() + * 15/01/2000 - fixed PV_FLUSH bug in lvm_chr_ioctl() + * 24/01/2000 - ported to 2.3.40 including Alan Cox's pointer changes etc. + * 29/01/2000 - used kmalloc/kfree again for all small structures + * 20/01/2000 - cleaned up lvm_chr_ioctl by moving code + * to separated functions + * - avoided "/dev/" in proc filesystem output + * - avoided inline string functions lvm_strlen etc. + * 14/02/2000 - support for 2.3.43 + * - integrated Andrea Arcangeli's snapshot code + * 25/06/2000 - james (chip) , IKKHAYD! roffl + * 26/06/2000 - enhanced lv_extend_reduce for snapshot logical volume support + * 06/09/2000 - added devfs support + * 07/09/2000 - changed IOP version to 9 + * - started to add new char ioctl LV_STATUS_BYDEV_T to support + * getting an lv_t based on the dev_t of the Logical Volume + * 14/09/2000 - enhanced lvm_do_lv_create to upcall VFS functions + * to sync and lock, activate snapshot and unlock the FS + * (to support journaled filesystems) + * 18/09/2000 - hardsector size support + * 27/09/2000 - implemented lvm_do_lv_rename() and lvm_do_vg_rename() + * 30/10/2000 - added Andi Kleen's LV_BMAP ioctl to support LILO + * 01/11/2000 - added memory information on hash tables to + * lvm_proc_get_global_info() + * 02/11/2000 - implemented /proc/lvm/ hierarchy + * + */ + + +static char *lvm_version = "LVM version 0.9 by Heinz Mauelshagen (13/11/2000)\n"; +static char *lvm_short_version = "version 0.9 (13/11/2000)"; + +#define MAJOR_NR LVM_BLK_MAJOR +#define DEVICE_OFF(device) + +/* lvm_do_lv_create calls fsync_dev_lockfs()/unlockfs() */ +/* #define LVM_VFS_ENHANCEMENT */ + +#include +#include + +#ifdef MODVERSIONS +#undef MODULE +#define MODULE +#include +#endif + +#include + +#include +#include +#if LINUX_VERSION_CODE > KERNEL_VERSION ( 2, 3, 39) +# include +#endif +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_KERNELD +#include +#endif + +#include +#if LINUX_VERSION_CODE > KERNEL_VERSION ( 2, 3, 0) +#include +#endif + +#include +#include + +#define LVM_CORRECT_READ_AHEAD( a) \ + if ( a < LVM_MIN_READ_AHEAD || \ + a > LVM_MAX_READ_AHEAD) a = LVM_MAX_READ_AHEAD; + +#ifndef WRITEA +# define WRITEA WRITE +#endif + +/* + * External function prototypes + */ +#ifdef MODULE +int init_module(void);
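/* Editorial note: the driver spans several kernel generations by wrapping
 * every API that changed between 2.2 and 2.3 in LINUX_VERSION_CODE guards,
 * as the prototypes around this point show. The idiom, with a made-up
 * function name:
 *
 *	#if LINUX_VERSION_CODE > KERNEL_VERSION(2,3,30)
 *	static void example_request(request_queue_t *q);   (2.3.31+ API)
 *	#else
 *	static void example_request(void);   (one implicit queue per major)
 *	#endif
 *
 * KERNEL_VERSION(a,b,c) packs a version into the same integer format as
 * LINUX_VERSION_CODE, so ordinary integer comparisons work. */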
+void cleanup_module(void); +#else +extern int lvm_init(void); +#endif + +#if LINUX_VERSION_CODE > KERNEL_VERSION ( 2, 3, 30) +static void lvm_dummy_device_request(request_queue_t *); +#else +static void lvm_dummy_device_request(void); +#endif +#define DEVICE_REQUEST lvm_dummy_device_request + +#if LINUX_VERSION_CODE > KERNEL_VERSION ( 2, 3, 42) +#if LINUX_VERSION_CODE < KERNEL_VERSION ( 2, 3, 49) +static int lvm_make_request_fn(int, struct buffer_head*); +#else +static int lvm_make_request_fn(request_queue_t*, int, struct buffer_head*); +#endif +#endif + +static int lvm_blk_ioctl(struct inode *, struct file *, uint, ulong); +static int lvm_blk_open(struct inode *, struct file *); + +static int lvm_chr_open(struct inode *, struct file *); + +static int lvm_chr_close(struct inode *, struct file *); +static int lvm_blk_close(struct inode *, struct file *); +static int lvm_user_bmap(struct inode *, struct lv_bmap *); + +static int lvm_chr_ioctl(struct inode *, struct file *, uint, ulong); + +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS +int lvm_proc_read_vg_info(char *, char **, off_t, int, int *, void *); +int lvm_proc_read_lv_info(char *, char **, off_t, int, int *, void *); +int lvm_proc_read_pv_info(char *, char **, off_t, int, int *, void *); +static int lvm_proc_get_global_info(char *, char **, off_t, int, int *, void *); +void lvm_do_create_proc_entry_of_vg ( vg_t *); +inline void lvm_do_remove_proc_entry_of_vg ( vg_t *); +inline void lvm_do_create_proc_entry_of_lv ( vg_t *, lv_t *); +inline void lvm_do_remove_proc_entry_of_lv ( vg_t *, lv_t *); +inline void lvm_do_create_proc_entry_of_pv ( vg_t *, pv_t *); +inline void lvm_do_remove_proc_entry_of_pv ( vg_t *, pv_t *); +#endif + +#ifdef LVM_HD_NAME +void lvm_hd_name(char *, int); +#endif +/* End external function prototypes */ + + +/* + * Internal function prototypes + */ +static void lvm_init_vars(void); + +/* external snapshot calls */ +extern inline int lvm_get_blksize(kdev_t); +extern int lvm_snapshot_alloc(lv_t *); +extern void lvm_snapshot_fill_COW_page(vg_t *, lv_t *); +extern int lvm_snapshot_COW(kdev_t, ulong, ulong, ulong, lv_t *); +extern int lvm_snapshot_remap_block(kdev_t *, ulong *, ulong, lv_t *); +extern void lvm_snapshot_release(lv_t *); +extern int lvm_write_COW_table_block(vg_t *, lv_t *); +extern inline void lvm_hash_link(lv_block_exception_t *, kdev_t, ulong, lv_t *); +extern int lvm_snapshot_alloc_hash_table(lv_t *); +extern void lvm_drop_snapshot(lv_t *, char *); + +#ifdef LVM_HD_NAME +extern void (*lvm_hd_name_ptr) (char *, int); +#endif +static int lvm_map(kdev_t, kdev_t *, unsigned long *, unsigned long, int); +static int lvm_do_lock_lvm(void); +static int lvm_do_le_remap(vg_t *, void *); + +static int lvm_do_pv_create(pv_t *, vg_t *, ulong); +static int lvm_do_pv_remove(vg_t *, ulong); +static int lvm_do_lv_create(int, char *, lv_t *); +static int lvm_do_lv_extend_reduce(int, char *, lv_t *); +static int lvm_do_lv_remove(int, char *, int); +static int lvm_do_lv_rename(vg_t *, lv_req_t *, lv_t *); +static int lvm_do_lv_status_byname(vg_t *r, void *); +static int lvm_do_lv_status_byindex(vg_t *, void *); +static int lvm_do_lv_status_bydev(vg_t *, void *); + +static int lvm_do_pe_lock_unlock(vg_t *r, void *); + +static int lvm_do_pv_change(vg_t*, void*); +static int lvm_do_pv_status(vg_t *, void *); + +static int lvm_do_vg_create(int, void *); +static int lvm_do_vg_extend(vg_t *, void *); +static int lvm_do_vg_reduce(vg_t *, void *); +static int lvm_do_vg_rename(vg_t *, void *); +static int 
lvm_do_vg_remove(int); +static void lvm_geninit(struct gendisk *); +#ifdef LVM_GET_INODE +static struct inode *lvm_get_inode(int); +void lvm_clear_inode(struct inode *); +#endif +/* END Internal function prototypes */ + + +/* volume group descriptor area pointers */ +static vg_t *vg[ABS_MAX_VG]; + +#ifdef CONFIG_DEVFS_FS +static devfs_handle_t lvm_devfs_handle; +static devfs_handle_t vg_devfs_handle[MAX_VG]; +static devfs_handle_t ch_devfs_handle[MAX_VG]; +static devfs_handle_t lv_devfs_handle[MAX_LV]; +#endif + +static pv_t *pvp = NULL; +static lv_t *lvp = NULL; +static pe_t *pep = NULL; +static pe_t *pep1 = NULL; +static char *basename = NULL; + + +/* map from block minor number to VG and LV numbers */ +typedef struct { + int vg_number; + int lv_number; +} vg_lv_map_t; +static vg_lv_map_t vg_lv_map[ABS_MAX_LV]; + + +/* Request structures (lvm_chr_ioctl()) */ +static pv_change_req_t pv_change_req; +static pv_flush_req_t pv_flush_req; +static pv_status_req_t pv_status_req; +static pe_lock_req_t pe_lock_req; +static le_remap_req_t le_remap_req; +static lv_req_t lv_req; + +#ifdef LVM_TOTAL_RESET +static int lvm_reset_spindown = 0; +#endif + +static char pv_name[NAME_LEN]; +/* static char rootvg[NAME_LEN] = { 0, }; */ +const char *const lvm_name = LVM_NAME; +static int lock = 0; +static int loadtime = 0; +static uint vg_count = 0; +static long lvm_chr_open_count = 0; +static ushort lvm_iop_version = LVM_DRIVER_IOP_VERSION; +#if LINUX_VERSION_CODE > KERNEL_VERSION ( 2, 3, 0) +static DECLARE_WAIT_QUEUE_HEAD(lvm_snapshot_wait); +static DECLARE_WAIT_QUEUE_HEAD(lvm_wait); +static DECLARE_WAIT_QUEUE_HEAD(lvm_map_wait); +#else +struct wait_queue *lvm_snapshot_wait = NULL; +struct wait_queue *lvm_wait = NULL; +struct wait_queue *lvm_map_wait = NULL; +#endif + +static spinlock_t lvm_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t lvm_snapshot_lock = SPIN_LOCK_UNLOCKED; + +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS +static struct proc_dir_entry *lvm_proc_dir = NULL; +static struct proc_dir_entry *lvm_proc_vg_subdir = NULL; +struct proc_dir_entry *pde = NULL; +#endif + +static struct file_operations lvm_chr_fops = +{ + open: lvm_chr_open, + release: lvm_chr_close, + ioctl: lvm_chr_ioctl, +}; + +#if LINUX_VERSION_CODE < KERNEL_VERSION ( 2, 3, 38) +static struct file_operations lvm_blk_fops = +{ + open: lvm_blk_open, + read: block_read, + write: block_write, + release: lvm_blk_close, + ioctl: lvm_blk_ioctl, + fsync: block_fsync, +}; +#endif + +#if LINUX_VERSION_CODE > KERNEL_VERSION ( 2, 3, 37) +#define BLOCK_DEVICE_OPERATIONS +/* block device operations structure needed for 2.3.38? 
and above */ +static struct block_device_operations lvm_blk_dops = +{ + open: lvm_blk_open, + release: lvm_blk_close, + ioctl: lvm_blk_ioctl, +}; +#endif + + +/* gendisk structures */ +static struct hd_struct lvm_hd_struct[MAX_LV]; +static int lvm_blocksizes[MAX_LV] = +{0,}; +static int lvm_size[MAX_LV] = +{0,}; +static struct gendisk lvm_gendisk = +{ + MAJOR_NR, /* major # */ + LVM_NAME, /* name of major */ + 0, /* number of times minor is shifted + to get real minor */ + 1, /* maximum partitions per device */ +#if LINUX_VERSION_CODE < KERNEL_VERSION ( 2, 3, 40) + MAX_LV, /* maximum number of real devices */ + lvm_geninit, /* initialization called before we + do other things */ +#endif + lvm_hd_struct, /* partition table */ + lvm_size, /* device size in blocks, copied + to block_size[] */ + MAX_LV, /* number of real devices */ + NULL, /* internal */ + NULL, /* pointer to next gendisk struct (internal) */ +}; + + +#ifdef MODULE +/* + * Module initialization... + */ +int init_module(void) +#else +/* + * Driver initialization... + */ +#ifdef __initfunc +__initfunc(int lvm_init(void)) +#else +int __init lvm_init(void) +#endif +#endif /* #ifdef MODULE */ +{ + struct gendisk *gendisk_ptr = NULL; + + if (register_chrdev(LVM_CHAR_MAJOR, lvm_name, &lvm_chr_fops) < 0) { + printk(KERN_ERR "%s -- register_chrdev failed\n", lvm_name); + return -EIO; + } +#ifdef BLOCK_DEVICE_OPERATIONS + if (register_blkdev(MAJOR_NR, lvm_name, &lvm_blk_dops) < 0) +#else + if (register_blkdev(MAJOR_NR, lvm_name, &lvm_blk_fops) < 0) +#endif + { + printk(KERN_ERR "%s -- register_blkdev failed\n", lvm_name); + if (unregister_chrdev(LVM_CHAR_MAJOR, lvm_name) < 0) + printk(KERN_ERR "%s -- unregister_chrdev failed\n", lvm_name); + return -EIO; + } + +#ifdef CONFIG_DEVFS_FS + lvm_devfs_handle = devfs_register( + 0 , "lvm", 0, 0, LVM_CHAR_MAJOR, + S_IFCHR | S_IRUSR | S_IWUSR | S_IRGRP, + &lvm_chr_fops, NULL); +#endif + +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS + lvm_proc_dir = create_proc_entry (LVM_DIR, S_IFDIR, &proc_root); + if (lvm_proc_dir != NULL) { + lvm_proc_vg_subdir = create_proc_entry (LVM_VG_SUBDIR, S_IFDIR, lvm_proc_dir); + pde = create_proc_entry(LVM_GLOBAL, S_IFREG, lvm_proc_dir); + if ( pde != NULL) pde->read_proc = &lvm_proc_get_global_info; + } +#endif + + lvm_init_vars(); + lvm_geninit(&lvm_gendisk); + + /* insert our gendisk at the corresponding major */ + if (gendisk_head != NULL) { + gendisk_ptr = gendisk_head; + while (gendisk_ptr->next != NULL && + gendisk_ptr->major > lvm_gendisk.major) { + gendisk_ptr = gendisk_ptr->next; + } + lvm_gendisk.next = gendisk_ptr->next; + gendisk_ptr->next = &lvm_gendisk; + } else { + gendisk_head = &lvm_gendisk; + lvm_gendisk.next = NULL; + } + +#ifdef LVM_HD_NAME + /* reference from drivers/block/genhd.c */ + lvm_hd_name_ptr = lvm_hd_name; +#endif + +#if LINUX_VERSION_CODE > KERNEL_VERSION ( 2, 3, 30) + blk_init_queue(BLK_DEFAULT_QUEUE(MAJOR_NR), DEVICE_REQUEST); +#if LINUX_VERSION_CODE > KERNEL_VERSION ( 2, 3, 42) + blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), lvm_make_request_fn); +#endif +#else + blk_dev[MAJOR_NR].request_fn = DEVICE_REQUEST; + blk_dev[MAJOR_NR].current_request = NULL; + blk_dev[MAJOR_NR].makerq_fn = NULL; + blk_dev[MAJOR_NR].map_fn = lvm_map; +#endif + + /* optional read root VGDA */ +/* + if ( *rootvg != 0) vg_read_with_pv_and_lv ( rootvg, &vg); +*/ + + printk(KERN_INFO + "%s%s -- " +#ifdef MODULE + "Module" +#else + "Driver" +#endif + " successfully initialized\n", + lvm_version, lvm_name); + + return 0; +} /* init_module() / lvm_init() */
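The dynamic block minor allocation mentioned in the changelog is driven by the vg_lv_map[] table declared above: each minor carries the VG and LV index it belongs to. An editorial sketch of the lookup (this assumes the VG_BLK()/LV_BLK() macros resolve through this table, which the initialization in lvm_init_vars() below suggests; not part of the patch):

static lv_t *minor_to_lv(int minor)
{
	int v = vg_lv_map[minor].vg_number;
	int l = vg_lv_map[minor].lv_number;

	/* unmapped minors carry vg_number == ABS_MAX_VG, lv_number == -1 */
	if (v >= ABS_MAX_VG || l < 0 || vg[v] == NULL)
		return NULL;
	return vg[v]->lv[l];
}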
+ + +#ifdef MODULE +/* + * Module cleanup... + */ +void cleanup_module(void) +{ + struct gendisk *gendisk_ptr = NULL, *gendisk_ptr_prev = NULL; + +#ifdef CONFIG_DEVFS_FS + devfs_unregister (lvm_devfs_handle); +#endif + + if (unregister_chrdev(LVM_CHAR_MAJOR, lvm_name) < 0) { + printk(KERN_ERR "%s -- unregister_chrdev failed\n", lvm_name); + } + if (unregister_blkdev(MAJOR_NR, lvm_name) < 0) { + printk(KERN_ERR "%s -- unregister_blkdev failed\n", lvm_name); + } +#if LINUX_VERSION_CODE > KERNEL_VERSION ( 2, 3, 30) + blk_cleanup_queue(BLK_DEFAULT_QUEUE(MAJOR_NR)); +#else + blk_dev[MAJOR_NR].request_fn = NULL; + blk_dev[MAJOR_NR].current_request = NULL; + blk_dev[MAJOR_NR].makerq_fn = NULL; + blk_dev[MAJOR_NR].map_fn = NULL; +#endif + + gendisk_ptr = gendisk_ptr_prev = gendisk_head; + while (gendisk_ptr != NULL) { + if (gendisk_ptr == &lvm_gendisk) + break; + gendisk_ptr_prev = gendisk_ptr; + gendisk_ptr = gendisk_ptr->next; + } + /* delete our gendisk from chain */ + if (gendisk_ptr == &lvm_gendisk) + gendisk_ptr_prev->next = gendisk_ptr->next; + + blk_size[MAJOR_NR] = NULL; + blksize_size[MAJOR_NR] = NULL; + hardsect_size[MAJOR_NR] = NULL; + +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS + remove_proc_entry(LVM_GLOBAL, lvm_proc_dir); + remove_proc_entry(LVM_VG_SUBDIR, lvm_proc_dir); + remove_proc_entry(LVM_DIR, &proc_root); +#endif + +#ifdef LVM_HD_NAME + /* reference from linux/drivers/block/genhd.c */ + lvm_hd_name_ptr = NULL; +#endif + + printk(KERN_INFO "%s -- Module successfully deactivated\n", lvm_name); + + return; +} /* void cleanup_module() */ +#endif /* #ifdef MODULE */ + + +/* + * support function to initialize lvm variables + */ +#ifdef __initfunc +__initfunc(void lvm_init_vars(void)) +#else +void __init lvm_init_vars(void) +#endif +{ + int v; + + loadtime = CURRENT_TIME; + + lvm_lock = lvm_snapshot_lock = SPIN_LOCK_UNLOCKED; + + pe_lock_req.lock = UNLOCK_PE; + pe_lock_req.data.lv_dev = \ + pe_lock_req.data.pv_dev = \ + pe_lock_req.data.pv_offset = 0; + + /* Initialize VG pointers */ + for (v = 0; v < ABS_MAX_VG; v++) vg[v] = NULL; + + /* Initialize LV -> VG association */ + for (v = 0; v < ABS_MAX_LV; v++) { + /* index ABS_MAX_VG never used for real VG */ + vg_lv_map[v].vg_number = ABS_MAX_VG; + vg_lv_map[v].lv_number = -1; + } + + return; +} /* lvm_init_vars() */ + + +/******************************************************************** + * + * Character device functions + * + ********************************************************************/ + +/* + * character device open routine + */ +static int lvm_chr_open(struct inode *inode, + struct file *file) +{ + int minor = MINOR(inode->i_rdev); + +#ifdef DEBUG + printk(KERN_DEBUG + "%s -- lvm_chr_open MINOR: %d VG#: %d mode: 0x%X lock: %d\n", + lvm_name, minor, VG_CHR(minor), file->f_mode, lock); +#endif + + /* super user validation */ + if (!capable(CAP_SYS_ADMIN)) return -EACCES; + + /* Group special file open */ + if (VG_CHR(minor) > MAX_VG) return -ENXIO; + + lvm_chr_open_count++; + + MOD_INC_USE_COUNT; + + return 0; +} /* lvm_chr_open() */ + +
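Changing ioctls are serialized by a pid-valued lock word plus a wait queue rather than by holding a spinlock across the whole operation. An editorial sketch, condensed from lvm_do_lock_lvm() further down (the LVM_TOTAL_RESET path is trimmed; not part of the patch):

static int example_lock_lvm(void)
{
	for (;;) {
		spin_lock(&lvm_lock);
		if (lock == 0 || lock == current->pid) {
			lock = current->pid;	/* we own the LVM now */
			spin_unlock(&lvm_lock);
			return 0;
		}
		spin_unlock(&lvm_lock);
		interruptible_sleep_on(&lvm_wait);	/* unlock wakes us */
		if (current->sigpending != 0)
			return -EINTR;
	}
}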
+/* + * character device i/o-control routine + * + * Only one process at a time may run a changing ioctl; + * the others will block until it has finished. + */ +static int lvm_chr_ioctl(struct inode *inode, struct file *file, + uint command, ulong a) +{ + int minor = MINOR(inode->i_rdev); + uint extendable, l, v; + void *arg = (void *) a; + lv_t lv; + vg_t* vg_ptr = vg[VG_CHR(minor)]; + + /* otherwise cc will complain about unused variables */ + (void) lvm_lock; + + +#ifdef DEBUG_IOCTL + printk(KERN_DEBUG + "%s -- lvm_chr_ioctl: command: 0x%X MINOR: %d " + "VG#: %d mode: 0x%X\n", + lvm_name, command, minor, VG_CHR(minor), file->f_mode); +#endif + +#ifdef LVM_TOTAL_RESET + if (lvm_reset_spindown > 0) return -EACCES; +#endif + + /* Main command switch */ + switch (command) { + case LVM_LOCK_LVM: + /* lock the LVM */ + return lvm_do_lock_lvm(); + + case LVM_GET_IOP_VERSION: + /* check lvm version to ensure driver/tools+lib + interoperability */ + if (copy_to_user(arg, &lvm_iop_version, sizeof(ushort)) != 0) + return -EFAULT; + return 0; + +#ifdef LVM_TOTAL_RESET + case LVM_RESET: + /* lock reset function */ + lvm_reset_spindown = 1; + for (v = 0; v < ABS_MAX_VG; v++) { + if (vg[v] != NULL) lvm_do_vg_remove(v); + } + +#ifdef MODULE + while (GET_USE_COUNT(&__this_module) < 1) + MOD_INC_USE_COUNT; + while (GET_USE_COUNT(&__this_module) > 1) + MOD_DEC_USE_COUNT; +#endif /* MODULE */ + lock = 0; /* release lock */ + wake_up_interruptible(&lvm_wait); + return 0; +#endif /* LVM_TOTAL_RESET */ + + + case LE_REMAP: + /* remap a logical extent (after moving the physical extent) */ + return lvm_do_le_remap(vg_ptr,arg); + + case PE_LOCK_UNLOCK: + /* lock/unlock i/o to a physical extent to move it to another + physical volume (the move is done in user space by pvmove) */ + return lvm_do_pe_lock_unlock(vg_ptr,arg); + + case VG_CREATE: + /* create a VGDA */ + return lvm_do_vg_create(minor, arg); + + case VG_EXTEND: + /* extend a volume group */ + return lvm_do_vg_extend(vg_ptr, arg); + + case VG_REDUCE: + /* reduce a volume group */ + return lvm_do_vg_reduce(vg_ptr, arg); + + case VG_RENAME: + /* rename a volume group */ + return lvm_do_vg_rename(vg_ptr, arg); + + case VG_REMOVE: + /* remove an inactive VGDA */ + return lvm_do_vg_remove(minor); + + + case VG_SET_EXTENDABLE: + /* set/clear extendability flag of volume group */ + if (vg_ptr == NULL) return -ENXIO; + if (copy_from_user(&extendable, arg, sizeof(extendable)) != 0) + return -EFAULT; + + if (extendable == VG_EXTENDABLE || + extendable == ~VG_EXTENDABLE) { + if (extendable == VG_EXTENDABLE) + vg_ptr->vg_status |= VG_EXTENDABLE; + else + vg_ptr->vg_status &= ~VG_EXTENDABLE; + } else return -EINVAL; + return 0; + + + case VG_STATUS: + /* get volume group data (only the vg_t struct) */ + if (vg_ptr == NULL) return -ENXIO; + if (copy_to_user(arg, vg_ptr, sizeof(vg_t)) != 0) + return -EFAULT; + return 0; + + + case VG_STATUS_GET_COUNT: + /* get volume group count */ + if (copy_to_user(arg, &vg_count, sizeof(vg_count)) != 0) + return -EFAULT; + return 0; + + + case VG_STATUS_GET_NAMELIST: + /* get volume group name list */ + for (l = v = 0; v < ABS_MAX_VG; v++) { + if (vg[v] != NULL) { + if (copy_to_user(arg + l * NAME_LEN, + vg[v]->vg_name, + NAME_LEN) != 0) + return -EFAULT; + l++; + } + } + return 0; + + + case LV_CREATE: + case LV_EXTEND: + case LV_REDUCE: + case LV_REMOVE: + case LV_RENAME: + /* create, extend, reduce, remove or rename a logical volume */ + if (vg_ptr == NULL) return -ENXIO; + if (copy_from_user(&lv_req, arg, sizeof(lv_req)) != 0) + return -EFAULT; + + if (command != LV_REMOVE) { + if (copy_from_user(&lv, lv_req.lv, sizeof(lv_t)) != 0) + return -EFAULT; + } + switch (command) { + case
LV_CREATE: + return lvm_do_lv_create(minor, lv_req.lv_name, &lv); + + case LV_EXTEND: + case LV_REDUCE: + return lvm_do_lv_extend_reduce(minor, lv_req.lv_name, &lv); + case LV_REMOVE: + return lvm_do_lv_remove(minor, lv_req.lv_name, -1); + + case LV_RENAME: + return lvm_do_lv_rename(vg_ptr, &lv_req, &lv); + } + + + + + case LV_STATUS_BYNAME: + /* get status of a logical volume by name */ + return lvm_do_lv_status_byname(vg_ptr, arg); + + + case LV_STATUS_BYINDEX: + /* get status of a logical volume by index */ + return lvm_do_lv_status_byindex(vg_ptr, arg); + + + case LV_STATUS_BYDEV: + return lvm_do_lv_status_bydev(vg_ptr, arg); + + + case PV_CHANGE: + /* change a physical volume */ + return lvm_do_pv_change(vg_ptr,arg); + + + case PV_STATUS: + /* get physical volume data (pv_t structure only) */ + return lvm_do_pv_status(vg_ptr,arg); + + + case PV_FLUSH: + /* physical volume buffer flush/invalidate */ + if (copy_from_user(&pv_flush_req, arg, + sizeof(pv_flush_req)) != 0) + return -EFAULT; + + fsync_dev(pv_flush_req.pv_dev); + invalidate_buffers(pv_flush_req.pv_dev); + return 0; + + + default: + printk(KERN_WARNING + "%s -- lvm_chr_ioctl: unknown command %x\n", + lvm_name, command); + return -EINVAL; + } + + return 0; +} /* lvm_chr_ioctl */ + + +/* + * character device close routine + */ +static int lvm_chr_close(struct inode *inode, struct file *file) +{ +#ifdef DEBUG + int minor = MINOR(inode->i_rdev); + printk(KERN_DEBUG + "%s -- lvm_chr_close VG#: %d\n", lvm_name, VG_CHR(minor)); +#endif + +#ifdef LVM_TOTAL_RESET + if (lvm_reset_spindown > 0) { + lvm_reset_spindown = 0; + lvm_chr_open_count = 0; + } +#endif + + if (lvm_chr_open_count > 0) lvm_chr_open_count--; + if (lock == current->pid) { + lock = 0; /* release lock */ + wake_up_interruptible(&lvm_wait); + } + + MOD_DEC_USE_COUNT; + + return 0; +} /* lvm_chr_close() */ + + + +/******************************************************************** + * + * Block device functions + * + ********************************************************************/ + +/* + * block device open routine + */ +static int lvm_blk_open(struct inode *inode, struct file *file) +{ + int minor = MINOR(inode->i_rdev); + lv_t *lv_ptr; + vg_t *vg_ptr = vg[VG_BLK(minor)]; + +#ifdef DEBUG_LVM_BLK_OPEN + printk(KERN_DEBUG + "%s -- lvm_blk_open MINOR: %d VG#: %d LV#: %d mode: 0x%X\n", + lvm_name, minor, VG_BLK(minor), LV_BLK(minor), file->f_mode); +#endif + +#ifdef LVM_TOTAL_RESET + if (lvm_reset_spindown > 0) + return -EPERM; +#endif + + if (vg_ptr != NULL && + (vg_ptr->vg_status & VG_ACTIVE) && + (lv_ptr = vg_ptr->lv[LV_BLK(minor)]) != NULL && + LV_BLK(minor) >= 0 && + LV_BLK(minor) < vg_ptr->lv_max) { + + /* Check parallel LV spindown (LV remove) */ + if (lv_ptr->lv_status & LV_SPINDOWN) return -EPERM; + + /* Check inactive LV and open for read/write */ + if (file->f_mode & O_RDWR) { + if (!(lv_ptr->lv_status & LV_ACTIVE)) return -EPERM; + if (!(lv_ptr->lv_access & LV_WRITE)) return -EACCES; + } + +#ifndef BLOCK_DEVICE_OPERATIONS + file->f_op = &lvm_blk_fops; +#endif + + /* be sure to increment VG counter */ + if (lv_ptr->lv_open == 0) vg_ptr->lv_open++; + lv_ptr->lv_open++; + + MOD_INC_USE_COUNT; + +#ifdef DEBUG_LVM_BLK_OPEN + printk(KERN_DEBUG + "%s -- lvm_blk_open MINOR: %d VG#: %d LV#: %d size: %d\n", + lvm_name, minor, VG_BLK(minor), LV_BLK(minor), + lv_ptr->lv_size); +#endif + + return 0; + } + return -ENXIO; +} /* lvm_blk_open() */ + + +/* + * block device i/o-control routine + */ +static int lvm_blk_ioctl(struct inode *inode, struct file *file, + uint 
command, ulong a) +{ + int minor = MINOR(inode->i_rdev); + vg_t *vg_ptr = vg[VG_BLK(minor)]; + lv_t *lv_ptr = vg_ptr->lv[LV_BLK(minor)]; + void *arg = (void *) a; + struct hd_geometry *hd = (struct hd_geometry *) a; + +#ifdef DEBUG_IOCTL + printk(KERN_DEBUG + "%s -- lvm_blk_ioctl MINOR: %d command: 0x%X arg: %X " + "VG#: %d LV#: %d\n", + lvm_name, minor, command, (ulong) arg, + VG_BLK(minor), LV_BLK(minor)); +#endif + + switch (command) { + case BLKGETSIZE: + /* return device size */ +#ifdef DEBUG_IOCTL + printk(KERN_DEBUG + "%s -- lvm_blk_ioctl -- BLKGETSIZE: %u\n", + lvm_name, lv_ptr->lv_size); +#endif + if (put_user(lv_ptr->lv_size, (long *)arg)) + return -EFAULT; + break; + + + case BLKFLSBUF: + /* flush buffer cache */ + if (!capable(CAP_SYS_ADMIN)) return -EACCES; + +#ifdef DEBUG_IOCTL + printk(KERN_DEBUG + "%s -- lvm_blk_ioctl -- BLKFLSBUF\n", lvm_name); +#endif + fsync_dev(inode->i_rdev); + break; + + + case BLKRASET: + /* set read ahead for block device */ + if (!capable(CAP_SYS_ADMIN)) return -EACCES; + +#ifdef DEBUG_IOCTL + printk(KERN_DEBUG + "%s -- lvm_blk_ioctl -- BLKRASET: %d sectors for %02X:%02X\n", + lvm_name, (long) arg, MAJOR(inode->i_rdev), minor); +#endif + if ((long) arg < LVM_MIN_READ_AHEAD || + (long) arg > LVM_MAX_READ_AHEAD) + return -EINVAL; + lv_ptr->lv_read_ahead = (long) arg; + break; + + + case BLKRAGET: + /* get current read ahead setting */ +#ifdef DEBUG_IOCTL + printk(KERN_DEBUG + "%s -- lvm_blk_ioctl -- BLKRAGET\n", lvm_name); +#endif + if (put_user(lv_ptr->lv_read_ahead, (long *)arg)) + return -EFAULT; + break; + + + case HDIO_GETGEO: + /* get disk geometry */ +#ifdef DEBUG_IOCTL + printk(KERN_DEBUG + "%s -- lvm_blk_ioctl -- HDIO_GETGEO\n", lvm_name); +#endif + if (hd == NULL) + return -EINVAL; + { + unsigned char heads = 64; + unsigned char sectors = 32; + long start = 0; + short cylinders = lv_ptr->lv_size / heads / sectors; + + if (copy_to_user((char *) &hd->heads, &heads, + sizeof(heads)) != 0 || + copy_to_user((char *) &hd->sectors, &sectors, + sizeof(sectors)) != 0 || + copy_to_user((short *) &hd->cylinders, + &cylinders, sizeof(cylinders)) != 0 || + copy_to_user((long *) &hd->start, &start, + sizeof(start)) != 0) + return -EFAULT; + } + +#ifdef DEBUG_IOCTL + printk(KERN_DEBUG + "%s -- lvm_blk_ioctl -- cylinders: %d\n", + lvm_name, lv_ptr->lv_size / heads / sectors); +#endif + break; + + + case LV_SET_ACCESS: + /* set access flags of a logical volume */ + if (!capable(CAP_SYS_ADMIN)) return -EACCES; + lv_ptr->lv_access = (ulong) arg; + if ( lv_ptr->lv_access & LV_WRITE) + set_device_ro(lv_ptr->lv_dev, 0); + else + set_device_ro(lv_ptr->lv_dev, 1); + break; + + + case LV_SET_STATUS: + /* set status flags of a logical volume */ + if (!capable(CAP_SYS_ADMIN)) return -EACCES; + if (!((ulong) arg & LV_ACTIVE) && lv_ptr->lv_open > 1) + return -EPERM; + lv_ptr->lv_status = (ulong) arg; + break; +
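/* Editorial note: LV_BMAP exists so that a boot loader such as LILO can
 * translate an LV-relative block into (device, block) on the underlying
 * physical volume without understanding any LVM metadata. From user
 * space the call looks roughly like this (sketch only; the header name
 * and error handling are illustrative):
 *
 *	struct lv_bmap lbm;
 *	int fd = open("/dev/test_vg/test_lv", O_RDONLY);
 *
 *	lbm.lv_block = 0;			(first block of the LV)
 *	if (fd >= 0 && ioctl(fd, LV_BMAP, &lbm) == 0)
 *		printf("dev %08lx block %lu\n",
 *		       (unsigned long) lbm.lv_dev,
 *		       (unsigned long) lbm.lv_block);
 *
 * lv_dev and lv_block are exactly the fields lvm_user_bmap() fills in
 * below. */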
+ case LV_BMAP: + /* turn logical block into (dev_t, block). non privileged. */ + return lvm_user_bmap(inode, (struct lv_bmap *) arg); + + case LV_SET_ALLOCATION: + /* set allocation flags of a logical volume */ + if (!capable(CAP_SYS_ADMIN)) return -EACCES; + lv_ptr->lv_allocation = (ulong) arg; + break; + + case LV_SNAPSHOT_USE_RATE: + if (!(lv_ptr->lv_access & LV_SNAPSHOT)) return -EPERM; + { + lv_snapshot_use_rate_req_t lv_snapshot_use_rate_req; + + if (copy_from_user(&lv_snapshot_use_rate_req, arg, + sizeof(lv_snapshot_use_rate_req_t))) + return -EFAULT; + if (lv_snapshot_use_rate_req.rate < 0 || + lv_snapshot_use_rate_req.rate > 100) return -EINVAL; + + switch (lv_snapshot_use_rate_req.block) + { + case 0: + lv_ptr->lv_snapshot_use_rate = lv_snapshot_use_rate_req.rate; + if (lv_ptr->lv_remap_ptr * 100 / lv_ptr->lv_remap_end < lv_ptr->lv_snapshot_use_rate) + interruptible_sleep_on (&lv_ptr->lv_snapshot_wait); + break; + + case O_NONBLOCK: + break; + + default: + return -EINVAL; + } + lv_snapshot_use_rate_req.rate = lv_ptr->lv_remap_ptr * 100 / lv_ptr->lv_remap_end; + if (copy_to_user(arg, &lv_snapshot_use_rate_req, + sizeof(lv_snapshot_use_rate_req_t))) + return -EFAULT; + } + break; + + default: + printk(KERN_WARNING + "%s -- lvm_blk_ioctl: unknown command %d\n", + lvm_name, command); + return -EINVAL; + } + + return 0; +} /* lvm_blk_ioctl() */ + + +/* + * block device close routine + */ +static int lvm_blk_close(struct inode *inode, struct file *file) +{ + int minor = MINOR(inode->i_rdev); + vg_t *vg_ptr = vg[VG_BLK(minor)]; + lv_t *lv_ptr = vg_ptr->lv[LV_BLK(minor)]; + +#ifdef DEBUG + printk(KERN_DEBUG + "%s -- lvm_blk_close MINOR: %d VG#: %d LV#: %d\n", + lvm_name, minor, VG_BLK(minor), LV_BLK(minor)); +#endif + + sync_dev(inode->i_rdev); + if (lv_ptr->lv_open == 1) vg_ptr->lv_open--; + lv_ptr->lv_open--; + + MOD_DEC_USE_COUNT; + + return 0; +} /* lvm_blk_close() */ + + +static int lvm_user_bmap(struct inode *inode, struct lv_bmap *user_result) +{ + struct buffer_head bh; + unsigned long block; + int err; + + if (get_user(block, &user_result->lv_block)) + return -EFAULT; + + memset(&bh,0,sizeof bh); + bh.b_rsector = block; + bh.b_dev = bh.b_rdev = inode->i_dev; + bh.b_size = lvm_get_blksize(bh.b_dev); + if ((err=lvm_map(bh.b_rdev, &bh.b_rdev, &bh.b_rsector, bh.b_size >> 9, READ)) < 0) { + printk(KERN_ERR "lvm map failed: %d\n", err); + return -EINVAL; + } + + return put_user( kdev_t_to_nr(bh.b_rdev), &user_result->lv_dev) || + put_user(bh.b_rsector, &user_result->lv_block) ?
-EFAULT : 0; +} + + +/* + * provide VG info for proc filesystem use (global) + */ +int lvm_vg_info(vg_t *vg_ptr, char *buf) { + int sz = 0; + char inactive_flag = ' '; + + if (!(vg_ptr->vg_status & VG_ACTIVE)) inactive_flag = 'I'; + sz = sprintf(buf, + "\nVG: %c%s [%d PV, %d LV/%d open] " + " PE Size: %d KB\n" + " Usage [KB/PE]: %d /%d total " + "%d /%d used %d /%d free", + inactive_flag, + vg_ptr->vg_name, + vg_ptr->pv_cur, + vg_ptr->lv_cur, + vg_ptr->lv_open, + vg_ptr->pe_size >> 1, + vg_ptr->pe_size * vg_ptr->pe_total >> 1, + vg_ptr->pe_total, + vg_ptr->pe_allocated * vg_ptr->pe_size >> 1, + vg_ptr->pe_allocated, + (vg_ptr->pe_total - vg_ptr->pe_allocated) * + vg_ptr->pe_size >> 1, + vg_ptr->pe_total - vg_ptr->pe_allocated); + return sz; +} + + +/* + * provide LV info for proc filesystem use (global) + */ +int lvm_lv_info(vg_t *vg_ptr, lv_t *lv_ptr, char *buf) { + int sz = 0; + char inactive_flag = 'A', allocation_flag = ' ', + stripes_flag = ' ', rw_flag = ' '; + + if (!(lv_ptr->lv_status & LV_ACTIVE)) + inactive_flag = 'I'; + rw_flag = 'R'; + if (lv_ptr->lv_access & LV_WRITE) + rw_flag = 'W'; + allocation_flag = 'D'; + if (lv_ptr->lv_allocation & LV_CONTIGUOUS) + allocation_flag = 'C'; + stripes_flag = 'L'; + if (lv_ptr->lv_stripes > 1) + stripes_flag = 'S'; + sz += sprintf(buf+sz, + "[%c%c%c%c", + inactive_flag, + rw_flag, + allocation_flag, + stripes_flag); + if (lv_ptr->lv_stripes > 1) + sz += sprintf(buf+sz, "%-2d", + lv_ptr->lv_stripes); + else + sz += sprintf(buf+sz, " "); + basename = strrchr(lv_ptr->lv_name, '/'); + if ( basename == 0) basename = lv_ptr->lv_name; + else basename++; + sz += sprintf(buf+sz, "] %-25s", basename); + if (strlen(basename) > 25) + sz += sprintf(buf+sz, + "\n "); + sz += sprintf(buf+sz, "%9d /%-6d ", + lv_ptr->lv_size >> 1, + lv_ptr->lv_size / vg_ptr->pe_size); + + if (lv_ptr->lv_open == 0) + sz += sprintf(buf+sz, "close"); + else + sz += sprintf(buf+sz, "%dx open", + lv_ptr->lv_open); + + return sz; +} + + +/* + * provide PV info for proc filesystem use (global) + */ +int lvm_pv_info(pv_t *pv_ptr, char *buf) { + int sz = 0; + char inactive_flag = 'A', allocation_flag = ' '; + char *pv_name = NULL; + + if (!(pv_ptr->pv_status & PV_ACTIVE)) + inactive_flag = 'I'; + allocation_flag = 'A'; + if (!(pv_ptr->pv_allocatable & PV_ALLOCATABLE)) + allocation_flag = 'N'; + pv_name = strrchr(pv_ptr->pv_name+1,'/'); + if ( pv_name == 0) pv_name = pv_ptr->pv_name; + else pv_name++; + sz = sprintf(buf, + "[%c%c] %-21s %8d /%-6d " + "%8d /%-6d %8d /%-6d", + inactive_flag, + allocation_flag, + pv_name, + pv_ptr->pe_total * + pv_ptr->pe_size >> 1, + pv_ptr->pe_total, + pv_ptr->pe_allocated * + pv_ptr->pe_size >> 1, + pv_ptr->pe_allocated, + (pv_ptr->pe_total - + pv_ptr->pe_allocated) * + pv_ptr->pe_size >> 1, + pv_ptr->pe_total - + pv_ptr->pe_allocated); + return sz; +} + + +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS +/* + * Support functions /proc-Filesystem + */ + +#define LVM_PROC_BUF ( i == 0 ? 
dummy_buf : &buf[sz]) + +/* + * provide global LVM information + */ +static int lvm_proc_get_global_info(char *page, char **start, off_t pos, int count, int *eof, void *data) +{ + int c, i, l, p, v, vg_counter, pv_counter, lv_counter, lv_open_counter, + lv_open_total, pe_t_bytes, hash_table_bytes, lv_block_exception_t_bytes, seconds; + static off_t sz; + off_t sz_last; + static char *buf = NULL; + static char dummy_buf[160]; /* sized for 2 lines */ + vg_t *vg_ptr; + lv_t *lv_ptr; + pv_t *pv_ptr; + + +#ifdef DEBUG_LVM_PROC_GET_INFO + printk(KERN_DEBUG + "%s - lvm_proc_get_global_info CALLED pos: %lu count: %d whence: %d\n", + lvm_name, pos, count, whence); +#endif + + MOD_INC_USE_COUNT; + + if (pos == 0 || buf == NULL) { + sz_last = vg_counter = pv_counter = lv_counter = lv_open_counter = \ + lv_open_total = pe_t_bytes = hash_table_bytes = \ + lv_block_exception_t_bytes = 0; + + /* search for activity */ + for (v = 0; v < ABS_MAX_VG; v++) { + if ((vg_ptr = vg[v]) != NULL) { + vg_counter++; + pv_counter += vg_ptr->pv_cur; + lv_counter += vg_ptr->lv_cur; + if (vg_ptr->lv_cur > 0) { + for (l = 0; l < vg[v]->lv_max; l++) { + if ((lv_ptr = vg_ptr->lv[l]) != NULL) { + pe_t_bytes += lv_ptr->lv_allocated_le; + hash_table_bytes += lv_ptr->lv_snapshot_hash_table_size; + if (lv_ptr->lv_block_exception != NULL) + lv_block_exception_t_bytes += lv_ptr->lv_remap_end; + if (lv_ptr->lv_open > 0) { + lv_open_counter++; + lv_open_total += lv_ptr->lv_open; + } + } + } + } + } + } + pe_t_bytes *= sizeof(pe_t); + lv_block_exception_t_bytes *= sizeof(lv_block_exception_t); + + if (buf != NULL) { +#ifdef DEBUG_KFREE + printk(KERN_DEBUG + "%s -- vfree %d\n", lvm_name, __LINE__); +#endif + lock_kernel(); + vfree(buf); + unlock_kernel(); + buf = NULL; + } + /* 2 times: first to get size to allocate buffer, + 2nd to fill the malloced buffer */ + for (i = 0; i < 2; i++) { + sz = 0; + sz += sprintf(LVM_PROC_BUF, + "LVM " +#ifdef MODULE + "module" +#else + "driver" +#endif + " %s\n\n" + "Total: %d VG%s %d PV%s %d LV%s ", + lvm_short_version, + vg_counter, vg_counter == 1 ? "" : "s", + pv_counter, pv_counter == 1 ? "" : "s", + lv_counter, lv_counter == 1 ? "" : "s"); + sz += sprintf(LVM_PROC_BUF, + "(%d LV%s open", + lv_open_counter, + lv_open_counter == 1 ? "" : "s"); + if (lv_open_total > 0) + sz += sprintf(LVM_PROC_BUF, + " %d times)\n", + lv_open_total); + else + sz += sprintf(LVM_PROC_BUF, ")"); + sz += sprintf(LVM_PROC_BUF, + "\nGlobal: %lu bytes malloced IOP version: %d ", + vg_counter * sizeof(vg_t) + + pv_counter * sizeof(pv_t) + + lv_counter * sizeof(lv_t) + + pe_t_bytes + hash_table_bytes + lv_block_exception_t_bytes + sz_last, + lvm_iop_version); + + seconds = CURRENT_TIME - loadtime; + if (seconds < 0) + loadtime = CURRENT_TIME + seconds; + if (seconds / 86400 > 0) { + sz += sprintf(LVM_PROC_BUF, "%d day%s ", + seconds / 86400, + seconds / 86400 == 0 || + seconds / 86400 > 1 ? "s" : ""); + } + sz += sprintf(LVM_PROC_BUF, "%d:%02d:%02d active\n", + (seconds % 86400) / 3600, + (seconds % 3600) / 60, + seconds % 60); + + if (vg_counter > 0) { + for (v = 0; v < ABS_MAX_VG; v++) { + /* volume group */ + if ((vg_ptr = vg[v]) != NULL) { + sz += lvm_vg_info(vg_ptr, LVM_PROC_BUF); + + /* physical volumes */ + sz += sprintf(LVM_PROC_BUF, + "\n PV%s ", + vg_ptr->pv_cur == 1 ? 
": " : "s:"); + c = 0; + for (p = 0; p < vg_ptr->pv_max; p++) { + if ((pv_ptr = vg_ptr->pv[p]) != NULL) { + sz += lvm_pv_info(pv_ptr, LVM_PROC_BUF); + + c++; + if (c < vg_ptr->pv_cur) + sz += sprintf(LVM_PROC_BUF, + "\n "); + } + } + + /* logical volumes */ + sz += sprintf(LVM_PROC_BUF, + "\n LV%s ", + vg_ptr->lv_cur == 1 ? ": " : "s:"); + c = 0; + for (l = 0; l < vg_ptr->lv_max; l++) { + if ((lv_ptr = vg_ptr->lv[l]) != NULL) { + sz += lvm_lv_info(vg_ptr, lv_ptr, LVM_PROC_BUF); + c++; + if (c < vg_ptr->lv_cur) + sz += sprintf(LVM_PROC_BUF, + "\n "); + } + } + if (vg_ptr->lv_cur == 0) sz += sprintf(LVM_PROC_BUF, "none"); + sz += sprintf(LVM_PROC_BUF, "\n"); + } + } + } + if (buf == NULL) { + lock_kernel(); + buf = vmalloc(sz); + unlock_kernel(); + if (buf == NULL) { + sz = 0; + MOD_DEC_USE_COUNT; + return sprintf(page, "%s - vmalloc error at line %d\n", + lvm_name, __LINE__); + } + } + sz_last = sz; + } + } + MOD_DEC_USE_COUNT; + if (pos > sz - 1) { + lock_kernel(); + vfree(buf); + unlock_kernel(); + buf = NULL; + return 0; + } + *start = &buf[pos]; + if (sz - pos < count) + return sz - pos; + else + return count; +} /* lvm_proc_get_global_info() */ +#endif /* #if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS */ + + +/* + * provide VG information + */ +int lvm_proc_read_vg_info(char *page, char **start, off_t off, + int count, int *eof, void *data) { + int sz = 0; + vg_t *vg = data; + + sz += sprintf ( page+sz, "name: %s\n", vg->vg_name); + sz += sprintf ( page+sz, "size: %u\n", + vg->pe_total * vg->pe_size / 2); + sz += sprintf ( page+sz, "access: %u\n", vg->vg_access); + sz += sprintf ( page+sz, "status: %u\n", vg->vg_status); + sz += sprintf ( page+sz, "number: %u\n", vg->vg_number); + sz += sprintf ( page+sz, "LV max: %u\n", vg->lv_max); + sz += sprintf ( page+sz, "LV current: %u\n", vg->lv_cur); + sz += sprintf ( page+sz, "LV open: %u\n", vg->lv_open); + sz += sprintf ( page+sz, "PV max: %u\n", vg->pv_max); + sz += sprintf ( page+sz, "PV current: %u\n", vg->pv_cur); + sz += sprintf ( page+sz, "PV active: %u\n", vg->pv_act); + sz += sprintf ( page+sz, "PE size: %u\n", vg->pe_size / 2); + sz += sprintf ( page+sz, "PE total: %u\n", vg->pe_total); + sz += sprintf ( page+sz, "PE allocated: %u\n", vg->pe_allocated); + sz += sprintf ( page+sz, "uuid: %s\n", vg->vg_uuid); + + return sz; +} + + +/* + * provide LV information + */ +int lvm_proc_read_lv_info(char *page, char **start, off_t off, + int count, int *eof, void *data) { + int sz = 0; + lv_t *lv = data; + + sz += sprintf ( page+sz, "name: %s\n", lv->lv_name); + sz += sprintf ( page+sz, "size: %u\n", lv->lv_size); + sz += sprintf ( page+sz, "access: %u\n", lv->lv_access); + sz += sprintf ( page+sz, "status: %u\n", lv->lv_status); + sz += sprintf ( page+sz, "number: %u\n", lv->lv_number); + sz += sprintf ( page+sz, "open: %u\n", lv->lv_open); + sz += sprintf ( page+sz, "allocation: %u\n", lv->lv_allocation); + sz += sprintf ( page+sz, "device: %02u:%02u\n", + MAJOR(lv->lv_dev), MINOR(lv->lv_dev)); + + return sz; +} + + +/* + * provide PV information + */ +int lvm_proc_read_pv_info(char *page, char **start, off_t off, + int count, int *eof, void *data) { + int sz = 0; + pv_t *pv = data; + + sz += sprintf ( page+sz, "name: %s\n", pv->pv_name); + sz += sprintf ( page+sz, "size: %u\n", pv->pv_size); + sz += sprintf ( page+sz, "status: %u\n", pv->pv_status); + sz += sprintf ( page+sz, "number: %u\n", pv->pv_number); + sz += sprintf ( page+sz, "allocatable: %u\n", pv->pv_allocatable); + sz += sprintf ( page+sz, "LV current: %u\n", 
pv->lv_cur); + sz += sprintf ( page+sz, "PE size: %u\n", pv->pe_size / 2); + sz += sprintf ( page+sz, "PE total: %u\n", pv->pe_total); + sz += sprintf ( page+sz, "PE allocated: %u\n", pv->pe_allocated); + sz += sprintf ( page+sz, "device: %02u:%02u\n", + MAJOR(pv->pv_dev), MINOR(pv->pv_dev)); + sz += sprintf ( page+sz, "uuid: %s\n", pv->pv_uuid); + + + return sz; +} + + +/* + * block device support function for /usr/src/linux/drivers/block/ll_rw_blk.c + * (see init_module/lvm_init) + */ +static int lvm_map(kdev_t rdev_tmp, kdev_t * rdev_out, + unsigned long * rsector_out, unsigned long size, int rw) +{ + int minor = MINOR (rdev_tmp); + int ret = 0; + ulong index; + ulong pe_start; + ulong rsector_tmp = *rsector_out; + ulong rsector_sav; + kdev_t rdev_sav; + vg_t *vg_this = vg[VG_BLK(minor)]; + lv_t *lv = vg_this->lv[LV_BLK(minor)]; + + + if (!(lv->lv_status & LV_ACTIVE)) { + printk(KERN_ALERT + "%s - lvm_map: ll_rw_blk for inactive LV %s\n", + lvm_name, lv->lv_name); + return -1; + } + + if ((rw == WRITE || rw == WRITEA) && + !(lv->lv_access & LV_WRITE)) { + printk(KERN_CRIT + "%s - lvm_map: ll_rw_blk write for readonly LV %s\n", + lvm_name, lv->lv_name); + return -1; + } +#ifdef DEBUG_MAP + printk(KERN_DEBUG + "%s - lvm_map minor:%d *rdev: %02d:%02d *rsector: %lu " + "size:%lu\n", + lvm_name, minor, + MAJOR(rdev_tmp), + MINOR(rdev_tmp), + rsector_tmp, size); +#endif + + if (rsector_tmp + size > lv->lv_size) { + printk(KERN_ALERT + "%s - lvm_map access beyond end of device; *rsector: " + "%lu or size: %lu wrong for minor: %2d\n", + lvm_name, rsector_tmp, size, minor); + return -1; + } + rsector_sav = rsector_tmp; + rdev_sav = rdev_tmp; + +lvm_second_remap: + /* linear mapping */ + if (lv->lv_stripes < 2) { + /* get the index */ + index = rsector_tmp / vg_this->pe_size; + pe_start = lv->lv_current_pe[index].pe; + rsector_tmp = lv->lv_current_pe[index].pe + + (rsector_tmp % vg_this->pe_size); + rdev_tmp = lv->lv_current_pe[index].dev; + +#ifdef DEBUG_MAP + printk(KERN_DEBUG + "lv_current_pe[%ld].pe: %ld rdev: %02d:%02d rsector:%ld\n", + index, + lv->lv_current_pe[index].pe, + MAJOR(rdev_tmp), + MINOR(rdev_tmp), + rsector_tmp); +#endif + + /* striped mapping */ + } else { + ulong stripe_index; + ulong stripe_length; + + stripe_length = vg_this->pe_size * lv->lv_stripes; + stripe_index = (rsector_tmp % stripe_length) / lv->lv_stripesize; + index = rsector_tmp / stripe_length + + (stripe_index % lv->lv_stripes) * + (lv->lv_allocated_le / lv->lv_stripes); + pe_start = lv->lv_current_pe[index].pe; + rsector_tmp = lv->lv_current_pe[index].pe + + (rsector_tmp % stripe_length) - + (stripe_index % lv->lv_stripes) * lv->lv_stripesize - + stripe_index / lv->lv_stripes * + (lv->lv_stripes - 1) * lv->lv_stripesize; + rdev_tmp = lv->lv_current_pe[index].dev; + } + +#ifdef DEBUG_MAP + printk(KERN_DEBUG + "lv_current_pe[%ld].pe: %ld rdev: %02d:%02d rsector:%ld\n" + "stripe_length: %ld stripe_index: %ld\n", + index, + lv->lv_current_pe[index].pe, + MAJOR(rdev_tmp), + MINOR(rdev_tmp), + rsector_tmp, + stripe_length, + stripe_index); +#endif + + /* handle physical extents on the move */ + if (pe_lock_req.lock == LOCK_PE) { + if (rdev_tmp == pe_lock_req.data.pv_dev && + rsector_tmp >= pe_lock_req.data.pv_offset && + rsector_tmp < (pe_lock_req.data.pv_offset + + vg_this->pe_size)) { + sleep_on(&lvm_map_wait); + rsector_tmp = rsector_sav; + rdev_tmp = rdev_sav; + goto lvm_second_remap; + } + } + /* statistic */ + if (rw == WRITE || rw == WRITEA) + lv->lv_current_pe[index].writes++; + else + 
lv->lv_current_pe[index].reads++; + + /* snapshot volume exception handling, based on physical device addresses */ + if (lv->lv_access & (LV_SNAPSHOT|LV_SNAPSHOT_ORG)) { + /* original logical volume */ + if (lv->lv_access & LV_SNAPSHOT_ORG) { + if (rw == WRITE || rw == WRITEA) + { + lv_t *lv_ptr; + + /* start with first snapshot and loop through all of them */ + for (lv_ptr = lv->lv_snapshot_next; + lv_ptr != NULL; + lv_ptr = lv_ptr->lv_snapshot_next) { + /* Check for inactive snapshot */ + if (!(lv_ptr->lv_status & LV_ACTIVE)) continue; + down(&lv->lv_snapshot_org->lv_snapshot_sem); + /* is there still free exception storage for this snapshot? */ + if (lv_ptr->lv_block_exception != NULL) { + rdev_sav = rdev_tmp; + rsector_sav = rsector_tmp; + if (!lvm_snapshot_remap_block(&rdev_tmp, + &rsector_tmp, + pe_start, + lv_ptr)) { + /* create a new mapping */ + if (!(ret = lvm_snapshot_COW(rdev_tmp, + rsector_tmp, + pe_start, + rsector_sav, + lv_ptr))) + ret = lvm_write_COW_table_block(vg_this, + lv_ptr); + } + rdev_tmp = rdev_sav; + rsector_tmp = rsector_sav; + } + up(&lv->lv_snapshot_org->lv_snapshot_sem); + } + } + } else { + /* remap snapshot logical volume */ + down(&lv->lv_snapshot_sem); + if (lv->lv_block_exception != NULL) + lvm_snapshot_remap_block(&rdev_tmp, &rsector_tmp, pe_start, lv); + up(&lv->lv_snapshot_sem); + } + } + *rdev_out = rdev_tmp; + *rsector_out = rsector_tmp; + + return ret; +} /* lvm_map() */ + + +/* + * internal support functions + */ + +#ifdef LVM_HD_NAME +/* + * generate "hard disk" name + */ +void lvm_hd_name(char *buf, int minor) +{ + int len = 0; + lv_t *lv_ptr; + + if (vg[VG_BLK(minor)] == NULL || + (lv_ptr = vg[VG_BLK(minor)]->lv[LV_BLK(minor)]) == NULL) + return; + len = strlen(lv_ptr->lv_name) - 5; + memcpy(buf, &lv_ptr->lv_name[5], len); + buf[len] = 0; + return; +} +#endif + +
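For striped LVs, lvm_map() above interleaves logical extents across the stripe set. The index computation is compact enough to deserve a standalone restatement; an editorial sketch (all sizes in 512-byte sectors; it mirrors the else-branch of lvm_map() and is not part of the patch):

/* Which lv_current_pe[] slot a logical sector falls into for a striped
 * LV: the LEs of stripe n occupy the n-th slice of the array. */
static unsigned long stripe_le_index(unsigned long rsector,
				     unsigned long pe_size,
				     unsigned long stripes,
				     unsigned long stripesize,
				     unsigned long allocated_le)
{
	unsigned long stripe_length = pe_size * stripes;
	unsigned long stripe_index = (rsector % stripe_length) / stripesize;

	return rsector / stripe_length +
	       (stripe_index % stripes) * (allocated_le / stripes);
}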
+/* + * this one should never be called... + */ +#if LINUX_VERSION_CODE > KERNEL_VERSION ( 2, 3, 30) +static void lvm_dummy_device_request(request_queue_t * t) +#else +static void lvm_dummy_device_request(void) +#endif +{ + printk(KERN_EMERG + "%s -- oops, got lvm request for %02d:%02d [sector: %lu]\n", + lvm_name, + MAJOR(CURRENT->rq_dev), + MINOR(CURRENT->rq_dev), + CURRENT->sector); + return; +} + + +/* + * make request function + */ +#if LINUX_VERSION_CODE > KERNEL_VERSION ( 2, 3, 42) +static int lvm_make_request_fn( +#if LINUX_VERSION_CODE > KERNEL_VERSION ( 2, 3, 48) + request_queue_t *q, +#endif + int rw, + struct buffer_head *bh) +{ + lvm_map(bh, rw); + return 1; +} +#endif + + +/******************************************************************** + * + * Character device support functions + * + ********************************************************************/ +/* + * character device support function logical volume manager lock + */ +static int lvm_do_lock_lvm(void) +{ +lock_try_again: + spin_lock(&lvm_lock); + if (lock != 0 && lock != current->pid) { +#ifdef DEBUG_IOCTL + printk(KERN_INFO "lvm_do_lock_lvm: %s is locked by pid %d ...\n", + lvm_name, lock); +#endif + spin_unlock(&lvm_lock); + interruptible_sleep_on(&lvm_wait); + if (current->sigpending != 0) + return -EINTR; +#ifdef LVM_TOTAL_RESET + if (lvm_reset_spindown > 0) + return -EACCES; +#endif + goto lock_try_again; + } + lock = current->pid; + spin_unlock(&lvm_lock); + return 0; +} /* lvm_do_lock_lvm */ + + +/* + * character device support function lock/unlock physical extent + */ +static int lvm_do_pe_lock_unlock(vg_t *vg_ptr, void *arg) +{ + uint p; + + if (vg_ptr == NULL) return -ENXIO; + if (copy_from_user(&pe_lock_req, arg, + sizeof(pe_lock_req_t)) != 0) return -EFAULT; + + switch (pe_lock_req.lock) { + case LOCK_PE: + for (p = 0; p < vg_ptr->pv_max; p++) { + if (vg_ptr->pv[p] != NULL && + pe_lock_req.data.pv_dev == + vg_ptr->pv[p]->pv_dev) + break; + } + if (p == vg_ptr->pv_max) return -ENXIO; + + pe_lock_req.lock = UNLOCK_PE; + fsync_dev(pe_lock_req.data.lv_dev); + pe_lock_req.lock = LOCK_PE; + break; + + case UNLOCK_PE: + pe_lock_req.lock = UNLOCK_PE; + pe_lock_req.data.lv_dev = \ + pe_lock_req.data.pv_dev = \ + pe_lock_req.data.pv_offset = 0; + wake_up(&lvm_map_wait); + break; + + default: + return -EINVAL; + } + return 0; +} + + +/* + * character device support function logical extent remap + */ +static int lvm_do_le_remap(vg_t *vg_ptr, void *arg) +{ + uint l, le; + lv_t *lv_ptr; + + if (vg_ptr == NULL) return -ENXIO; + if (copy_from_user(&le_remap_req, arg, + sizeof(le_remap_req_t)) != 0) + return -EFAULT; + + for (l = 0; l < vg_ptr->lv_max; l++) { + lv_ptr = vg_ptr->lv[l]; + if (lv_ptr != NULL && + strcmp(lv_ptr->lv_name, + le_remap_req.lv_name) == 0) { + for (le = 0; le < lv_ptr->lv_allocated_le; le++) { + if (lv_ptr->lv_current_pe[le].dev == + le_remap_req.old_dev && + lv_ptr->lv_current_pe[le].pe == + le_remap_req.old_pe) { + lv_ptr->lv_current_pe[le].dev = + le_remap_req.new_dev; + lv_ptr->lv_current_pe[le].pe = + le_remap_req.new_pe; + return 0; + } + } + return -EINVAL; + } + } + return -ENXIO; +} /* lvm_do_le_remap() */ + + +/* + * character device support function VGDA create + */ +int lvm_do_vg_create(int minor, void *arg) +{ + int ret = 0; + ulong l, ls = 0, p, size; + lv_t lv; + vg_t *vg_ptr; + lv_t **snap_lv_ptr; + + if (vg[VG_CHR(minor)] != NULL) return -EPERM; + + if ((vg_ptr = kmalloc(sizeof(vg_t),GFP_KERNEL)) == NULL) { + printk(KERN_CRIT + "%s -- VG_CREATE: kmalloc error VG at line %d\n", + lvm_name, __LINE__); + return -ENOMEM; +
} + /* get the volume group structure */ + if (copy_from_user(vg_ptr, arg, sizeof(vg_t)) != 0) { + kfree(vg_ptr); + return -EFAULT; + } + + /* we are not that active so far... */ + vg_ptr->vg_status &= ~VG_ACTIVE; + vg[VG_CHR(minor)] = vg_ptr; + vg[VG_CHR(minor)]->pe_allocated = 0; + + if (vg_ptr->pv_max > ABS_MAX_PV) { + printk(KERN_WARNING + "%s -- Can't activate VG: ABS_MAX_PV too small\n", + lvm_name); + kfree(vg_ptr); + vg[VG_CHR(minor)] = NULL; + return -EPERM; + } + if (vg_ptr->lv_max > ABS_MAX_LV) { + printk(KERN_WARNING + "%s -- Can't activate VG: ABS_MAX_LV too small for %u\n", + lvm_name, vg_ptr->lv_max); + kfree(vg_ptr); + vg_ptr = NULL; + return -EPERM; + } + + /* get the physical volume structures */ + vg_ptr->pv_act = vg_ptr->pv_cur = 0; + for (p = 0; p < vg_ptr->pv_max; p++) { + /* user space address */ + if ((pvp = vg_ptr->pv[p]) != NULL) { + ret = lvm_do_pv_create(pvp, vg_ptr, p); + if ( ret != 0) { + lvm_do_vg_remove(minor); + return ret; + } + } + } + + size = vg_ptr->lv_max * sizeof(lv_t *); + if ((snap_lv_ptr = vmalloc ( size)) == NULL) { + printk(KERN_CRIT + "%s -- VG_CREATE: vmalloc error snapshot LVs at line %d\n", + lvm_name, __LINE__); + lvm_do_vg_remove(minor); + return -EFAULT; + } + memset(snap_lv_ptr, 0, size); + + /* get the logical volume structures */ + vg_ptr->lv_cur = 0; + for (l = 0; l < vg_ptr->lv_max; l++) { + /* user space address */ + if ((lvp = vg_ptr->lv[l]) != NULL) { + if (copy_from_user(&lv, lvp, sizeof(lv_t)) != 0) { + lvm_do_vg_remove(minor); + return -EFAULT; + } + if ( lv.lv_access & LV_SNAPSHOT) { + snap_lv_ptr[ls] = lvp; + vg_ptr->lv[l] = NULL; + ls++; + continue; + } + vg_ptr->lv[l] = NULL; + /* only create original logical volumes for now */ + if (lvm_do_lv_create(minor, lv.lv_name, &lv) != 0) { + lvm_do_vg_remove(minor); + return -EFAULT; + } + } + } + + /* Second pass to create the snapshot logical volumes, which were not + in place during the first pass above */ + for (l = 0; l < ls; l++) { + lvp = snap_lv_ptr[l]; + if (copy_from_user(&lv, lvp, sizeof(lv_t)) != 0) { + lvm_do_vg_remove(minor); + return -EFAULT; + } + if (lvm_do_lv_create(minor, lv.lv_name, &lv) != 0) { + lvm_do_vg_remove(minor); + return -EFAULT; + } + } + +#ifdef CONFIG_DEVFS_FS + vg_devfs_handle[vg_ptr->vg_number] = devfs_mk_dir(0, vg_ptr->vg_name, NULL); + ch_devfs_handle[vg_ptr->vg_number] = devfs_register( + vg_devfs_handle[vg_ptr->vg_number] , "group", + DEVFS_FL_DEFAULT, LVM_CHAR_MAJOR, vg_ptr->vg_number, + S_IFCHR | S_IRUSR | S_IWUSR | S_IRGRP, + &lvm_chr_fops, NULL); +#endif + +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS + lvm_do_create_proc_entry_of_vg ( vg_ptr); +#endif + + vfree(snap_lv_ptr); + + vg_count++; + + + MOD_INC_USE_COUNT; + + /* let's go active */ + vg_ptr->vg_status |= VG_ACTIVE; + + return 0; +} /* lvm_do_vg_create() */ + + +/* + * character device support function VGDA extend + */ +static int lvm_do_vg_extend(vg_t *vg_ptr, void *arg) +{ + int ret = 0; + uint p; + pv_t *pv_ptr; + + if (vg_ptr == NULL) return -ENXIO; + if (vg_ptr->pv_cur < vg_ptr->pv_max) { + for (p = 0; p < vg_ptr->pv_max; p++) { + if ( ( pv_ptr = vg_ptr->pv[p]) == NULL) { + ret = lvm_do_pv_create(arg, vg_ptr, p); + if ( ret != 0) return ret; + pv_ptr = vg_ptr->pv[p]; + lvm_do_create_proc_entry_of_pv ( vg_ptr, pv_ptr); + + /* We don't need the PE list + in kernel space like LVs pe_t list */ + pv_ptr->pe = NULL; + vg_ptr->pe_total += + pv_ptr->pe_total; +#ifdef LVM_GET_INODE + /* insert a dummy inode for fs_may_mount */ + pv_ptr->inode = lvm_get_inode(pv_ptr->pv_dev); +#endif + return 0; + } + } + } +return -EPERM; +} /* lvm_do_vg_extend() */
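lvm_do_vg_reduce() below keeps the PV pointer array contiguous by shifting all later entries down one slot after a removal. As a standalone editorial sketch (illustrative names, not part of the patch):

/* Close the gap at index "removed" and clear the freed last slot so
 * that pv[0..n-1] stays contiguous for the lookup loops. */
static void compact_pv_array(pv_t **pv, int removed, int pv_max)
{
	int p;

	for (p = removed; p < pv_max - 1; p++)
		pv[p] = pv[p + 1];
	pv[pv_max - 1] = NULL;
}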
+
+
+/*
+ * character device support function VGDA extend
+ */
+static int lvm_do_vg_extend(vg_t *vg_ptr, void *arg)
+{
+	int ret = 0;
+	uint p;
+	pv_t *pv_ptr;
+
+	if (vg_ptr == NULL) return -ENXIO;
+	if (vg_ptr->pv_cur < vg_ptr->pv_max) {
+		for (p = 0; p < vg_ptr->pv_max; p++) {
+			if ( ( pv_ptr = vg_ptr->pv[p]) == NULL) {
+				ret = lvm_do_pv_create(arg, vg_ptr, p);
+				if ( ret != 0) return ret;
+				pv_ptr = vg_ptr->pv[p];
+				lvm_do_create_proc_entry_of_pv ( vg_ptr, pv_ptr);
+
+				/* We don't need the PE list
+				   in kernel space like LVs pe_t list */
+				pv_ptr->pe = NULL;
+				vg_ptr->pv_cur++;
+				vg_ptr->pv_act++;
+				vg_ptr->pe_total +=
+				    pv_ptr->pe_total;
+#ifdef LVM_GET_INODE
+				/* insert a dummy inode for fs_may_mount */
+				pv_ptr->inode = lvm_get_inode(pv_ptr->pv_dev);
+#endif
+				return 0;
+			}
+		}
+	}
+	return -EPERM;
+} /* lvm_do_vg_extend() */
+
+
+/*
+ * character device support function VGDA reduce
+ */
+static int lvm_do_vg_reduce(vg_t *vg_ptr, void *arg) {
+	uint p;
+	pv_t *pv_ptr;
+
+	if (vg_ptr == NULL) return -ENXIO;
+	if (copy_from_user(pv_name, arg, sizeof(pv_name)) != 0)
+		return -EFAULT;
+
+	for (p = 0; p < vg_ptr->pv_max; p++) {
+		pv_ptr = vg_ptr->pv[p];
+		if (pv_ptr != NULL &&
+		    strcmp(pv_ptr->pv_name,
+			   pv_name) == 0) {
+			if (pv_ptr->lv_cur > 0) return -EPERM;
+			vg_ptr->pe_total -=
+			    pv_ptr->pe_total;
+			vg_ptr->pv_cur--;
+			vg_ptr->pv_act--;
+			lvm_do_pv_remove(vg_ptr, p);
+			/* Make PV pointer array contiguous */
+			for (; p < vg_ptr->pv_max - 1; p++)
+				vg_ptr->pv[p] = vg_ptr->pv[p + 1];
+			vg_ptr->pv[p] = NULL;
+			return 0;
+		}
+	}
+	return -ENXIO;
+} /* lvm_do_vg_reduce */
+
+
+/*
+ * character device support function VG rename
+ */
+static int lvm_do_vg_rename(vg_t *vg_ptr, void *arg)
+{
+	int l = 0, p = 0, len = 0;
+	char vg_name[NAME_LEN] = { 0,};
+	char lv_name[NAME_LEN] = { 0,};
+	char *ptr = NULL;
+	lv_t *lv_ptr = NULL;
+	pv_t *pv_ptr = NULL;
+
+	if (copy_from_user(vg_name, arg, sizeof(vg_name)) != 0)
+		return -EFAULT;
+
+#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS
+	lvm_do_remove_proc_entry_of_vg ( vg_ptr);
+#endif
+
+	strncpy ( vg_ptr->vg_name, vg_name, sizeof ( vg_name)-1);
+	for ( l = 0; l < vg_ptr->lv_max; l++)
+	{
+		if ((lv_ptr = vg_ptr->lv[l]) == NULL) continue;
+		strncpy(lv_ptr->vg_name, vg_name, sizeof ( vg_name));
+		ptr = strrchr(lv_ptr->lv_name, '/');
+		if (ptr == NULL) ptr = lv_ptr->lv_name;
+		strncpy(lv_name, ptr, sizeof ( lv_name));
+		len = sizeof(LVM_DIR_PREFIX);
+		strcpy(lv_ptr->lv_name, LVM_DIR_PREFIX);
+		strncat(lv_ptr->lv_name, vg_name, NAME_LEN - len);
+		len += strlen ( vg_name);
+		strncat(lv_ptr->lv_name, lv_name, NAME_LEN - len);
+	}
+	for ( p = 0; p < vg_ptr->pv_max; p++)
+	{
+		if ( (pv_ptr = vg_ptr->pv[p]) == NULL) continue;
+		strncpy(pv_ptr->vg_name, vg_name, NAME_LEN);
+	}
+
+#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS
+	lvm_do_create_proc_entry_of_vg ( vg_ptr);
+#endif
+
+	return 0;
+} /* lvm_do_vg_rename */
+
+
+/*
+ * character device support function VGDA remove
+ */
+static int lvm_do_vg_remove(int minor)
+{
+	int i;
+	vg_t *vg_ptr = vg[VG_CHR(minor)];
+	pv_t *pv_ptr;
+
+	if (vg_ptr == NULL) return -ENXIO;
+
+#ifdef LVM_TOTAL_RESET
+	if (vg_ptr->lv_open > 0 && lvm_reset_spindown == 0)
+#else
+	if (vg_ptr->lv_open > 0)
+#endif
+		return -EPERM;
+
+	/* let's go inactive */
+	vg_ptr->vg_status &= ~VG_ACTIVE;
+
+	/* free LVs */
+	/* first free snapshot logical volumes */
+	for (i = 0; i < vg_ptr->lv_max; i++) {
+		if (vg_ptr->lv[i] != NULL &&
+		    vg_ptr->lv[i]->lv_access & LV_SNAPSHOT) {
+			lvm_do_lv_remove(minor, NULL, i);
+			current->state = TASK_UNINTERRUPTIBLE;
+			schedule_timeout(1);
+		}
+	}
+	/* then free the rest of the LVs */
+	for (i = 0; i < vg_ptr->lv_max; i++) {
+		if (vg_ptr->lv[i] != NULL) {
+			lvm_do_lv_remove(minor, NULL, i);
+			current->state = TASK_UNINTERRUPTIBLE;
+			schedule_timeout(1);
+		}
+	}
+
+	/* free PVs */
+	for (i = 0; i < vg_ptr->pv_max; i++) {
+		if ((pv_ptr = vg_ptr->pv[i]) != NULL) {
+#ifdef DEBUG_KFREE
+			printk(KERN_DEBUG
+			       "%s -- kfree %d\n", lvm_name, __LINE__);
+#endif
+			lvm_do_pv_remove(vg_ptr, i);
+		}
+	}
+
+#ifdef CONFIG_DEVFS_FS
+	devfs_unregister (ch_devfs_handle[vg_ptr->vg_number]);
+	devfs_unregister (vg_devfs_handle[vg_ptr->vg_number]);
+#endif
+
+#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS
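+	/*
+	 * Tear down this VG's /proc/lvm subtree: the "group" info file
+	 * plus one entry per LV and per PV, as built by
+	 * lvm_do_create_proc_entry_of_vg() below.
+	 */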
+	lvm_do_remove_proc_entry_of_vg ( vg_ptr);
+#endif
+
+#ifdef DEBUG_KFREE
+	printk(KERN_DEBUG "%s -- kfree %d\n", lvm_name, __LINE__);
+#endif
+
+	kfree(vg_ptr);
+	vg[VG_CHR(minor)] = NULL;
+
+	vg_count--;
+
+	MOD_DEC_USE_COUNT;
+
+	return 0;
+} /* lvm_do_vg_remove() */
+
+
+/*
+ * character device support function physical volume create
+ */
+static int lvm_do_pv_create(pv_t *pvp, vg_t *vg_ptr, ulong p) {
+	pv_t *pv_ptr = NULL;
+
+	pv_ptr = vg_ptr->pv[p] = kmalloc(sizeof(pv_t),GFP_KERNEL);
+	if (pv_ptr == NULL) {
+		printk(KERN_CRIT
+		       "%s -- VG_CREATE: kmalloc error PV at line %d\n",
+		       lvm_name, __LINE__);
+		return -ENOMEM;
+	}
+	if (copy_from_user(pv_ptr, pvp, sizeof(pv_t)) != 0) {
+		return -EFAULT;
+	}
+	/* We don't need the PE list
+	   in kernel space as with LVs pe_t list (see below) */
+	pv_ptr->pe = NULL;
+	pv_ptr->pe_allocated = 0;
+	pv_ptr->pv_status = PV_ACTIVE;
+	vg_ptr->pv_act++;
+	vg_ptr->pv_cur++;
+
+#ifdef LVM_GET_INODE
+	/* insert a dummy inode for fs_may_mount */
+	pv_ptr->inode = lvm_get_inode(pv_ptr->pv_dev);
+#endif
+
+	return 0;
+} /* lvm_do_pv_create() */
+
+
+/*
+ * character device support function physical volume remove
+ */
+static int lvm_do_pv_remove(vg_t *vg_ptr, ulong p) {
+	pv_t *pv_ptr = vg_ptr->pv[p];
+
+#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS
+	lvm_do_remove_proc_entry_of_pv ( vg_ptr, pv_ptr);
+#endif
+	vg_ptr->pe_total -=
+	    pv_ptr->pe_total;
+	vg_ptr->pv_cur--;
+	vg_ptr->pv_act--;
+#ifdef LVM_GET_INODE
+	lvm_clear_inode(pv_ptr->inode);
+#endif
+	kfree(pv_ptr);
+	vg_ptr->pv[p] = NULL;
+
+	return 0;
+}
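+
+
+/*
+ * The LE -> PE mapping used below is a flat array: lv_current_pe[le]
+ * holds the PV device and physical extent backing logical extent le,
+ * plus per-extent read/write counters.  A sketch (values made up):
+ *
+ *	lv_current_pe[0] = { dev: kdev of /dev/sdc1, pe: 17, reads/writes: 0 }
+ *
+ * lvm_do_le_remap() above rewrites one such entry in place.
+ */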
+
+
+/*
+ * character device support function logical volume create
+ */
+static int lvm_do_lv_create(int minor, char *lv_name, lv_t *lv)
+{
+	int e, ret, l, le, l_new, p, size;
+	ulong lv_status_save;
+	lv_block_exception_t *lvbe = lv->lv_block_exception;
+	vg_t *vg_ptr = vg[VG_CHR(minor)];
+	lv_t *lv_ptr = NULL;
+
+	if ((pep = lv->lv_current_pe) == NULL) return -EINVAL;
+	if (lv->lv_chunk_size > LVM_SNAPSHOT_MAX_CHUNK)
+		return -EINVAL;
+
+	for (l = 0; l < vg_ptr->lv_max; l++) {
+		if (vg_ptr->lv[l] != NULL &&
+		    strcmp(vg_ptr->lv[l]->lv_name, lv_name) == 0)
+			return -EEXIST;
+	}
+
+	/* in case of lv_remove(), lv_create() pair */
+	l_new = -1;
+	if (vg_ptr->lv[lv->lv_number] == NULL)
+		l_new = lv->lv_number;
+	else {
+		for (l = 0; l < vg_ptr->lv_max; l++) {
+			if (vg_ptr->lv[l] == NULL)
+				if (l_new == -1) l_new = l;
+		}
+	}
+	if (l_new == -1) return -EPERM;
+	else l = l_new;
+
+	if ((lv_ptr = kmalloc(sizeof(lv_t),GFP_KERNEL)) == NULL) {
+		printk(KERN_CRIT "%s -- LV_CREATE: kmalloc error LV at line %d\n",
+		       lvm_name, __LINE__);
+		return -ENOMEM;
+	}
+	/* copy preloaded LV */
+	memcpy((char *) lv_ptr, (char *) lv, sizeof(lv_t));
+
+	lv_status_save = lv_ptr->lv_status;
+	lv_ptr->lv_status &= ~LV_ACTIVE;
+	lv_ptr->lv_snapshot_org = \
+	lv_ptr->lv_snapshot_prev = \
+	lv_ptr->lv_snapshot_next = NULL;
+	lv_ptr->lv_block_exception = NULL;
+	lv_ptr->lv_iobuf = NULL;
+	lv_ptr->lv_snapshot_hash_table = NULL;
+	lv_ptr->lv_snapshot_hash_table_size = 0;
+	lv_ptr->lv_snapshot_hash_mask = 0;
+	lv_ptr->lv_COW_table_page = NULL;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 3, 4)
+	lv_ptr->lv_snapshot_sem = MUTEX;
+#else
+	init_MUTEX(&lv_ptr->lv_snapshot_sem);
+#endif
+	lv_ptr->lv_snapshot_use_rate = 0;
+	vg_ptr->lv[l] = lv_ptr;
+
+	/* get the PE structures from user space if this
+	   is not a snapshot logical volume */
+	if (!(lv_ptr->lv_access & LV_SNAPSHOT)) {
+		size = lv_ptr->lv_allocated_le * sizeof(pe_t);
+		if ((lv_ptr->lv_current_pe = vmalloc(size)) == NULL) {
+			printk(KERN_CRIT
+			       "%s -- LV_CREATE: vmalloc error LV_CURRENT_PE of %d Byte "
+			       "at line %d\n",
+			       lvm_name, size, __LINE__);
+#ifdef DEBUG_KFREE
+			printk(KERN_DEBUG "%s -- kfree %d\n", lvm_name, __LINE__);
+#endif
+			kfree(lv_ptr);
+			vg[VG_CHR(minor)]->lv[l] = NULL;
+			return -ENOMEM;
+		}
+		if (copy_from_user(lv_ptr->lv_current_pe, pep, size)) {
+			vfree(lv_ptr->lv_current_pe);
+			kfree(lv_ptr);
+			vg_ptr->lv[l] = NULL;
+			return -EFAULT;
+		}
+		/* correct the PE count in PVs */
+		for (le = 0; le < lv_ptr->lv_allocated_le; le++) {
+			vg_ptr->pe_allocated++;
+			for (p = 0; p < vg_ptr->pv_cur; p++) {
+				if (vg_ptr->pv[p]->pv_dev ==
+				    lv_ptr->lv_current_pe[le].dev)
+					vg_ptr->pv[p]->pe_allocated++;
+			}
+		}
+	} else {
+		/* Get snapshot exception data and block list */
+		if (lvbe != NULL) {
+			lv_ptr->lv_snapshot_org =
+			    vg_ptr->lv[LV_BLK(lv_ptr->lv_snapshot_minor)];
+			if (lv_ptr->lv_snapshot_org != NULL) {
+				size = lv_ptr->lv_remap_end * sizeof(lv_block_exception_t);
+				if ((lv_ptr->lv_block_exception = vmalloc(size)) == NULL) {
+					printk(KERN_CRIT
+					       "%s -- lvm_do_lv_create: vmalloc error LV_BLOCK_EXCEPTION "
+					       "of %d byte at line %d\n",
+					       lvm_name, size, __LINE__);
+#ifdef DEBUG_KFREE
+					printk(KERN_DEBUG "%s -- kfree %d\n", lvm_name, __LINE__);
+#endif
+					kfree(lv_ptr);
+					vg_ptr->lv[l] = NULL;
+					return -ENOMEM;
+				}
+				if (copy_from_user(lv_ptr->lv_block_exception, lvbe, size)) {
+					vfree(lv_ptr->lv_block_exception);
+					kfree(lv_ptr);
+					vg[VG_CHR(minor)]->lv[l] = NULL;
+					return -EFAULT;
+				}
+				/* point to the original logical volume */
+				lv_ptr = lv_ptr->lv_snapshot_org;
+
+				lv_ptr->lv_snapshot_minor = 0;
+				lv_ptr->lv_snapshot_org = lv_ptr;
+				lv_ptr->lv_snapshot_prev = NULL;
+				/* walk through the snapshot list */
+				while (lv_ptr->lv_snapshot_next != NULL)
+					lv_ptr = lv_ptr->lv_snapshot_next;
+				/* now lv_ptr points to the last existing snapshot in the chain */
+				vg_ptr->lv[l]->lv_snapshot_prev = lv_ptr;
+				/* our new one now back points to the previous last in the chain
+				   which can be the original logical volume */
+				lv_ptr = vg_ptr->lv[l];
+				/* now lv_ptr points to our new last snapshot logical volume */
+				lv_ptr->lv_snapshot_org = lv_ptr->lv_snapshot_prev->lv_snapshot_org;
+				lv_ptr->lv_snapshot_next = NULL;
+				lv_ptr->lv_current_pe = lv_ptr->lv_snapshot_org->lv_current_pe;
+				lv_ptr->lv_allocated_le = lv_ptr->lv_snapshot_org->lv_allocated_le;
+				lv_ptr->lv_current_le = lv_ptr->lv_snapshot_org->lv_current_le;
+				lv_ptr->lv_size = lv_ptr->lv_snapshot_org->lv_size;
+				lv_ptr->lv_stripes = lv_ptr->lv_snapshot_org->lv_stripes;
+				lv_ptr->lv_stripesize = lv_ptr->lv_snapshot_org->lv_stripesize;
+				if ((ret = lvm_snapshot_alloc(lv_ptr)) != 0)
+				{
+					vfree(lv_ptr->lv_block_exception);
+					kfree(lv_ptr);
+					vg[VG_CHR(minor)]->lv[l] = NULL;
+					return ret;
+				}
+				for ( e = 0; e < lv_ptr->lv_remap_ptr; e++)
+					lvm_hash_link (lv_ptr->lv_block_exception + e, lv_ptr->lv_block_exception[e].rdev_org, lv_ptr->lv_block_exception[e].rsector_org, lv_ptr);
+				/* need to fill the COW exception table data
+				   into the page for disk i/o */
+				lvm_snapshot_fill_COW_page(vg_ptr, lv_ptr);
+#if LINUX_VERSION_CODE > KERNEL_VERSION ( 2, 3, 0)
+				init_waitqueue_head(&lv_ptr->lv_snapshot_wait);
+#else
+				lv_ptr->lv_snapshot_wait = NULL;
+#endif
+			} else {
+				vfree(lv_ptr->lv_block_exception);
+				kfree(lv_ptr);
+				vg_ptr->lv[l] = NULL;
+				return -EFAULT;
+			}
+		} else {
+			kfree(vg_ptr->lv[l]);
+			vg_ptr->lv[l] = NULL;
+			return -EINVAL;
+		}
+	} /* if ( vg[VG_CHR(minor)]->lv[l]->lv_access & LV_SNAPSHOT) */
+
+	lv_ptr = vg_ptr->lv[l];
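+
+	/*
+	 * Make the new LV visible to the block layer below: lv_size is
+	 * kept in 512-byte sectors, while lvm_size[] (exported as
+	 * blk_size[MAJOR_NR] in lvm_geninit(), see below) is kept in
+	 * 1 KiB units, hence the shift by one.
+	 */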
+	lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].start_sect = 0;
+	lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].nr_sects = lv_ptr->lv_size;
+	lvm_size[MINOR(lv_ptr->lv_dev)] = lv_ptr->lv_size >> 1;
+	vg_lv_map[MINOR(lv_ptr->lv_dev)].vg_number = vg_ptr->vg_number;
+	vg_lv_map[MINOR(lv_ptr->lv_dev)].lv_number = lv_ptr->lv_number;
+	LVM_CORRECT_READ_AHEAD(lv_ptr->lv_read_ahead);
+	vg_ptr->lv_cur++;
+	lv_ptr->lv_status = lv_status_save;
+
+#ifdef CONFIG_DEVFS_FS
+	{
+		char *lv_tmp, *lv_buf;
+
+		strtok(lv->lv_name, "/");	/* /dev */
+		while((lv_tmp = strtok(NULL, "/")) != NULL)
+			lv_buf = lv_tmp;
+
+		lv_devfs_handle[lv->lv_number] = devfs_register(
+			vg_devfs_handle[vg_ptr->vg_number], lv_buf,
+			DEVFS_FL_DEFAULT, LVM_BLK_MAJOR, lv->lv_number,
+			S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP,
+			&lvm_blk_dops, NULL);
+	}
+#endif
+
+#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS
+	lvm_do_create_proc_entry_of_lv ( vg_ptr, lv_ptr);
+#endif
+
+	/* optionally add our new snapshot LV */
+	if (lv_ptr->lv_access & LV_SNAPSHOT) {
+		/* sync the original logical volume */
+		fsync_dev(lv_ptr->lv_snapshot_org->lv_dev);
+#ifdef LVM_VFS_ENHANCEMENT
+		/* VFS function call to sync and lock the filesystem */
+		fsync_dev_lockfs(lv_ptr->lv_snapshot_org->lv_dev);
+#endif
+		lv_ptr->lv_snapshot_org->lv_access |= LV_SNAPSHOT_ORG;
+		lv_ptr->lv_access &= ~LV_SNAPSHOT_ORG;
+		/* put ourselves into the chain */
+		lv_ptr->lv_snapshot_prev->lv_snapshot_next = lv_ptr;
+	}
+
+	/* activate the logical volume */
+	lv_ptr->lv_status |= LV_ACTIVE;
+	if ( lv_ptr->lv_access & LV_WRITE)
+		set_device_ro(lv_ptr->lv_dev, 0);
+	else
+		set_device_ro(lv_ptr->lv_dev, 1);
+
+#ifdef LVM_VFS_ENHANCEMENT
+	/* VFS function call to unlock the filesystem */
+	if (lv_ptr->lv_access & LV_SNAPSHOT) {
+		unlockfs(lv_ptr->lv_snapshot_org->lv_dev);
+	}
+#endif
+
+	lv_ptr->vg = vg_ptr;
+
+	return 0;
+} /* lvm_do_lv_create() */
+
+
+/*
+ * character device support function logical volume remove
+ */
+static int lvm_do_lv_remove(int minor, char *lv_name, int l)
+{
+	uint le, p;
+	vg_t *vg_ptr = vg[VG_CHR(minor)];
+	lv_t *lv_ptr;
+
+	if (l == -1) {
+		for (l = 0; l < vg_ptr->lv_max; l++) {
+			if (vg_ptr->lv[l] != NULL &&
+			    strcmp(vg_ptr->lv[l]->lv_name, lv_name) == 0) {
+				break;
+			}
+		}
+	}
+	if (l == vg_ptr->lv_max) return -ENXIO;
+
+	lv_ptr = vg_ptr->lv[l];
+#ifdef LVM_TOTAL_RESET
+	if (lv_ptr->lv_open > 0 && lvm_reset_spindown == 0)
+#else
+	if (lv_ptr->lv_open > 0)
+#endif
+		return -EBUSY;
+
+	/* check for deletion of snapshot source while
+	   snapshot volume still exists */
+	if ((lv_ptr->lv_access & LV_SNAPSHOT_ORG) &&
+	    lv_ptr->lv_snapshot_next != NULL)
+		return -EPERM;
+
+	lv_ptr->lv_status |= LV_SPINDOWN;
+
+	/* sync the buffers */
+	fsync_dev(lv_ptr->lv_dev);
+
+	lv_ptr->lv_status &= ~LV_ACTIVE;
+
+	/* invalidate the buffers */
+	invalidate_buffers(lv_ptr->lv_dev);
+
+	/* reset generic hd */
+	lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].start_sect = -1;
+	lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].nr_sects = 0;
+	lvm_size[MINOR(lv_ptr->lv_dev)] = 0;
+
+	/* reset VG/LV mapping */
+	vg_lv_map[MINOR(lv_ptr->lv_dev)].vg_number = ABS_MAX_VG;
+	vg_lv_map[MINOR(lv_ptr->lv_dev)].lv_number = -1;
+
+	/* correct the PE count in PVs if this is not a snapshot
+	   logical volume */
+	if (!(lv_ptr->lv_access & LV_SNAPSHOT)) {
+		/* only if this is not a snapshot logical volume, because
+		   we share the lv_current_pe[] structs with the
+		   original logical volume */
+		for (le = 0; le < lv_ptr->lv_allocated_le; le++) {
+			vg_ptr->pe_allocated--;
+			for (p = 0; p < vg_ptr->pv_cur; p++) {
+				if 
(vg_ptr->pv[p]->pv_dev == + lv_ptr->lv_current_pe[le].dev) + vg_ptr->pv[p]->pe_allocated--; + } + } + vfree(lv_ptr->lv_current_pe); + /* LV_SNAPSHOT */ + } else { + /* remove this snapshot logical volume from the chain */ + lv_ptr->lv_snapshot_prev->lv_snapshot_next = lv_ptr->lv_snapshot_next; + if (lv_ptr->lv_snapshot_next != NULL) { + lv_ptr->lv_snapshot_next->lv_snapshot_prev = + lv_ptr->lv_snapshot_prev; + } + /* no more snapshots? */ + if (lv_ptr->lv_snapshot_org->lv_snapshot_next == NULL) + lv_ptr->lv_snapshot_org->lv_access &= ~LV_SNAPSHOT_ORG; + lvm_snapshot_release(lv_ptr); + } + +#ifdef CONFIG_DEVFS_FS + devfs_unregister(lv_devfs_handle[lv_ptr->lv_number]); +#endif + +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS + lvm_do_remove_proc_entry_of_lv ( vg_ptr, lv_ptr); +#endif + +#ifdef DEBUG_KFREE + printk(KERN_DEBUG "%s -- kfree %d\n", lvm_name, __LINE__); +#endif + kfree(lv_ptr); + vg_ptr->lv[l] = NULL; + vg_ptr->lv_cur--; + return 0; +} /* lvm_do_lv_remove() */ + + +/* + * character device support function logical volume extend / reduce + */ +static int lvm_do_lv_extend_reduce(int minor, char *lv_name, lv_t *lv) +{ + ulong end, l, le, p, size, old_allocated_le; + vg_t *vg_ptr = vg[VG_CHR(minor)]; + lv_t *lv_ptr; + pe_t *pe; + + if ((pep = lv->lv_current_pe) == NULL) return -EINVAL; + + for (l = 0; l < vg_ptr->lv_max; l++) { + if (vg_ptr->lv[l] != NULL && + strcmp(vg_ptr->lv[l]->lv_name, lv_name) == 0) + break; + } + if (l == vg_ptr->lv_max) return -ENXIO; + lv_ptr = vg_ptr->lv[l]; + + /* check for active snapshot */ + if (lv->lv_access & LV_SNAPSHOT) + { + ulong e; + lv_block_exception_t *lvbe, *lvbe_old; + struct list_head * lvs_hash_table_old; + + if (lv->lv_block_exception == NULL) return -ENXIO; + size = lv->lv_remap_end * sizeof ( lv_block_exception_t); + if ((lvbe = vmalloc(size)) == NULL) + { + printk(KERN_CRIT + "%s -- lvm_do_lv_extend_reduce: vmalloc error LV_BLOCK_EXCEPTION " + "of %lu Byte at line %d\n", + lvm_name, size, __LINE__); + return -ENOMEM; + } + if (lv->lv_remap_end > lv_ptr->lv_remap_end) + { + if (copy_from_user(lvbe, lv->lv_block_exception, size)) + { + vfree(lvbe); + return -EFAULT; + } + } + + lvbe_old = lv_ptr->lv_block_exception; + lvs_hash_table_old = lv_ptr->lv_snapshot_hash_table; + + /* we need to play on the safe side here... */ + down(&lv_ptr->lv_snapshot_org->lv_snapshot_sem); + if (lv_ptr->lv_block_exception == NULL || + lv_ptr->lv_remap_ptr > lv_ptr->lv_remap_end) + { + up(&lv_ptr->lv_snapshot_org->lv_snapshot_sem); + vfree(lvbe); + return -EPERM; + } + memcpy(lvbe, + lv_ptr->lv_block_exception, + (lv->lv_remap_end > lv_ptr->lv_remap_end ? 
lv_ptr->lv_remap_ptr : lv->lv_remap_end) * sizeof(lv_block_exception_t));
+
+		lv_ptr->lv_block_exception = lvbe;
+		lv_ptr->lv_remap_end = lv->lv_remap_end;
+		if (lvm_snapshot_alloc_hash_table(lv_ptr) != 0)
+		{
+			lvm_drop_snapshot(lv_ptr, "hash_alloc");
+			up(&lv_ptr->lv_snapshot_org->lv_snapshot_sem);
+			vfree(lvbe_old);
+			vfree(lvs_hash_table_old);
+			return 1;
+		}
+
+		for (e = 0; e < lv_ptr->lv_remap_ptr; e++)
+			lvm_hash_link (lv_ptr->lv_block_exception + e, lv_ptr->lv_block_exception[e].rdev_org, lv_ptr->lv_block_exception[e].rsector_org, lv_ptr);
+
+		up(&lv_ptr->lv_snapshot_org->lv_snapshot_sem);
+
+		vfree(lvbe_old);
+		vfree(lvs_hash_table_old);
+
+		return 0;
+	}
+
+
+	/* we drop in here in case it is an original logical volume */
+	if ((pe = vmalloc(size = lv->lv_current_le * sizeof(pe_t))) == NULL) {
+		printk(KERN_CRIT
+		       "%s -- lvm_do_lv_extend_reduce: vmalloc error LV_CURRENT_PE "
+		       "of %lu Byte at line %d\n",
+		       lvm_name, size, __LINE__);
+		return -ENOMEM;
+	}
+	/* get the PE structures from user space */
+	if (copy_from_user(pe, pep, size)) {
+		vfree(pe);
+		return -EFAULT;
+	}
+
+#ifdef DEBUG
+	printk(KERN_DEBUG
+	       "%s -- fsync_dev and "
+	       "invalidate_buffers for %s [%s] in %s\n",
+	       lvm_name, lv_ptr->lv_name,
+	       kdevname(lv_ptr->lv_dev),
+	       vg_ptr->vg_name);
+#endif
+
+	/* reduce allocation counters on PV(s) */
+	for (le = 0; le < lv_ptr->lv_allocated_le; le++) {
+		vg_ptr->pe_allocated--;
+		for (p = 0; p < vg_ptr->pv_cur; p++) {
+			if (vg_ptr->pv[p]->pv_dev ==
+			    lv_ptr->lv_current_pe[le].dev) {
+				vg_ptr->pv[p]->pe_allocated--;
+				break;
+			}
+		}
+	}
+
+
+	/* save pointer to "old" lv/pe pointer array */
+	pep1 = lv_ptr->lv_current_pe;
+	end = lv_ptr->lv_current_le;
+
+	/* save open counter... */
+	lv->lv_open = lv_ptr->lv_open;
+	lv->lv_snapshot_prev = lv_ptr->lv_snapshot_prev;
+	lv->lv_snapshot_next = lv_ptr->lv_snapshot_next;
+	lv->lv_snapshot_org = lv_ptr->lv_snapshot_org;
+
+	lv->lv_current_pe = pe;
+
+	/* save # of old allocated logical extents */
+	old_allocated_le = lv_ptr->lv_allocated_le;
+
+	/* in case of shrinking -> let's flush */
+	if ( end > lv->lv_current_le) fsync_dev(lv_ptr->lv_dev);
+
+	/* copy preloaded LV */
+	memcpy((char *) lv_ptr, (char *) lv, sizeof(lv_t));
+
+	lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].start_sect = 0;
+	lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].nr_sects = lv_ptr->lv_size;
+	lvm_size[MINOR(lv_ptr->lv_dev)] = lv_ptr->lv_size >> 1;
+	/* vg_lv_map array doesn't have to be changed here */
+
+	LVM_CORRECT_READ_AHEAD(lv_ptr->lv_read_ahead);
+
+	/* save available i/o statistics */
+	/* linear logical volume */
+	if (lv_ptr->lv_stripes < 2) {
+		/* Check what last LE shall be used */
+		if (end > lv_ptr->lv_current_le) end = lv_ptr->lv_current_le;
+		for (le = 0; le < end; le++) {
+			lv_ptr->lv_current_pe[le].reads += pep1[le].reads;
+			lv_ptr->lv_current_pe[le].writes += pep1[le].writes;
+		}
+		/* striped logical volume */
+	} else {
+		uint i, j, source, dest, end, old_stripe_size, new_stripe_size;
+
+		old_stripe_size = old_allocated_le / lv_ptr->lv_stripes;
+		new_stripe_size = lv_ptr->lv_allocated_le / lv_ptr->lv_stripes;
+		end = old_stripe_size;
+		if (end > new_stripe_size) end = new_stripe_size;
+		for (i = source = dest = 0;
+		     i < lv_ptr->lv_stripes; i++) {
+			for (j = 0; j < end; j++) {
+				lv_ptr->lv_current_pe[dest + j].reads +=
+				    pep1[source + j].reads;
+				lv_ptr->lv_current_pe[dest + j].writes +=
+				    pep1[source + j].writes;
+			}
+			source += old_stripe_size;
+			dest += new_stripe_size;
+		}
+	}
+
+	/* extend the PE count in PVs */
+	for (le = 0; le < 
lv_ptr->lv_allocated_le; le++) { + vg_ptr->pe_allocated++; + for (p = 0; p < vg_ptr->pv_cur; p++) { + if (vg_ptr->pv[p]->pv_dev == + lv_ptr->lv_current_pe[le].dev) { + vg_ptr->pv[p]->pe_allocated++; + break; + } + } + } + + vfree ( pep1); + pep1 = NULL; + + if (lv->lv_access & LV_SNAPSHOT_ORG) + { + /* Correct the snapshot size information */ + while ((lv_ptr = lv_ptr->lv_snapshot_next) != NULL) + { + lv_ptr->lv_current_pe = lv_ptr->lv_snapshot_org->lv_current_pe; + lv_ptr->lv_allocated_le = lv_ptr->lv_snapshot_org->lv_allocated_le; + lv_ptr->lv_current_le = lv_ptr->lv_snapshot_org->lv_current_le; + lv_ptr->lv_size = lv_ptr->lv_snapshot_org->lv_size; + lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].nr_sects = lv_ptr->lv_size; + lvm_size[MINOR(lv_ptr->lv_dev)] = lv_ptr->lv_size >> 1; + } + } + + return 0; +} /* lvm_do_lv_extend_reduce() */ + + +/* + * character device support function logical volume status by name + */ +static int lvm_do_lv_status_byname(vg_t *vg_ptr, void *arg) +{ + uint l; + ulong size; + lv_t lv; + lv_t *lv_ptr; + lv_status_byname_req_t lv_status_byname_req; + + if (vg_ptr == NULL) return -ENXIO; + if (copy_from_user(&lv_status_byname_req, arg, + sizeof(lv_status_byname_req_t)) != 0) + return -EFAULT; + + if (lv_status_byname_req.lv == NULL) return -EINVAL; + if (copy_from_user(&lv, lv_status_byname_req.lv, + sizeof(lv_t)) != 0) + return -EFAULT; + + for (l = 0; l < vg_ptr->lv_max; l++) { + lv_ptr = vg_ptr->lv[l]; + if (lv_ptr != NULL && + strcmp(lv_ptr->lv_name, + lv_status_byname_req.lv_name) == 0) { + if (copy_to_user(lv_status_byname_req.lv, + lv_ptr, + sizeof(lv_t)) != 0) + return -EFAULT; + + if (lv.lv_current_pe != NULL) { + size = lv_ptr->lv_allocated_le * + sizeof(pe_t); + if (copy_to_user(lv.lv_current_pe, + lv_ptr->lv_current_pe, + size) != 0) + return -EFAULT; + } + return 0; + } + } + return -ENXIO; +} /* lvm_do_lv_status_byname() */ + + +/* + * character device support function logical volume status by index + */ +static int lvm_do_lv_status_byindex(vg_t *vg_ptr,void *arg) +{ + ulong size; + lv_t lv; + lv_t *lv_ptr; + lv_status_byindex_req_t lv_status_byindex_req; + + if (vg_ptr == NULL) return -ENXIO; + if (copy_from_user(&lv_status_byindex_req, arg, + sizeof(lv_status_byindex_req)) != 0) + return -EFAULT; + + if ((lvp = lv_status_byindex_req.lv) == NULL) + return -EINVAL; + if ( ( lv_ptr = vg_ptr->lv[lv_status_byindex_req.lv_index]) == NULL) + return -ENXIO; + + if (copy_from_user(&lv, lvp, sizeof(lv_t)) != 0) + return -EFAULT; + + if (copy_to_user(lvp, lv_ptr, sizeof(lv_t)) != 0) + return -EFAULT; + + if (lv.lv_current_pe != NULL) { + size = lv_ptr->lv_allocated_le * sizeof(pe_t); + if (copy_to_user(lv.lv_current_pe, + lv_ptr->lv_current_pe, + size) != 0) + return -EFAULT; + } + return 0; +} /* lvm_do_lv_status_byindex() */ + + +/* + * character device support function logical volume status by device number + */ +static int lvm_do_lv_status_bydev(vg_t * vg_ptr, void * arg) { + int l; + lv_status_bydev_req_t lv_status_bydev_req; + + if (vg_ptr == NULL) return -ENXIO; + if (copy_from_user(&lv_status_bydev_req, arg, + sizeof(lv_status_bydev_req)) != 0) + return -EFAULT; + + for ( l = 0; l < vg_ptr->lv_max; l++) { + if ( vg_ptr->lv[l] == NULL) continue; + if ( vg_ptr->lv[l]->lv_dev == lv_status_bydev_req.dev) break; + } + + if ( l == vg_ptr->lv_max) return -ENXIO; + + if (copy_to_user(lv_status_bydev_req.lv, + vg_ptr->lv[l], sizeof(lv_t)) != 0) + return -EFAULT; + + return 0; +} /* lvm_do_lv_status_bydev() */ + + +/* + * character device support function rename a 
logical volume + */ +static int lvm_do_lv_rename(vg_t *vg_ptr, lv_req_t *lv_req, lv_t *lv) +{ + int l = 0; + int ret = 0; + lv_t *lv_ptr = NULL; + + for (l = 0; l < vg_ptr->lv_max; l++) + { + if ( (lv_ptr = vg_ptr->lv[l]) == NULL) continue; + if (lv_ptr->lv_dev == lv->lv_dev) + { +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS + lvm_do_remove_proc_entry_of_lv ( vg_ptr, lv_ptr); +#endif + strncpy(lv_ptr->lv_name, + lv_req->lv_name, + NAME_LEN); +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS + lvm_do_create_proc_entry_of_lv ( vg_ptr, lv_ptr); +#endif + break; + } + } + if (l == vg_ptr->lv_max) ret = -ENODEV; + + return ret; +} /* lvm_do_lv_rename */ + + +/* + * character device support function physical volume change + */ +static int lvm_do_pv_change(vg_t *vg_ptr, void *arg) +{ + uint p; + pv_t *pv_ptr; +#ifdef LVM_GET_INODE + struct inode *inode_sav; +#endif + + if (vg_ptr == NULL) return -ENXIO; + if (copy_from_user(&pv_change_req, arg, + sizeof(pv_change_req)) != 0) + return -EFAULT; + + for (p = 0; p < vg_ptr->pv_max; p++) { + pv_ptr = vg_ptr->pv[p]; + if (pv_ptr != NULL && + strcmp(pv_ptr->pv_name, + pv_change_req.pv_name) == 0) { +#ifdef LVM_GET_INODE + inode_sav = pv_ptr->inode; +#endif + if (copy_from_user(pv_ptr, + pv_change_req.pv, + sizeof(pv_t)) != 0) + return -EFAULT; + + /* We don't need the PE list + in kernel space as with LVs pe_t list */ + pv_ptr->pe = NULL; +#ifdef LVM_GET_INODE + pv_ptr->inode = inode_sav; +#endif + return 0; + } + } + return -ENXIO; +} /* lvm_do_pv_change() */ + +/* + * character device support function get physical volume status + */ +static int lvm_do_pv_status(vg_t *vg_ptr, void *arg) +{ + uint p; + pv_t *pv_ptr; + + if (vg_ptr == NULL) return -ENXIO; + if (copy_from_user(&pv_status_req, arg, + sizeof(pv_status_req)) != 0) + return -EFAULT; + + for (p = 0; p < vg_ptr->pv_max; p++) { + pv_ptr = vg_ptr->pv[p]; + if (pv_ptr != NULL && + strcmp(pv_ptr->pv_name, + pv_status_req.pv_name) == 0) { + if (copy_to_user(pv_status_req.pv, + pv_ptr, + sizeof(pv_t)) != 0) + return -EFAULT; + return 0; + } + } + return -ENXIO; +} /* lvm_do_pv_status() */ + + + +/* + * create a /proc entry for a logical volume + */ +inline void lvm_do_create_proc_entry_of_lv ( vg_t *vg_ptr, lv_t *lv_ptr) { + char *basename; + + if ( vg_ptr->lv_subdir_pde != NULL) { + basename = strrchr(lv_ptr->lv_name, '/'); + if (basename == NULL) basename = lv_ptr->lv_name; + else basename++; + pde = create_proc_entry(basename, S_IFREG, + vg_ptr->lv_subdir_pde); + if ( pde != NULL) { + pde->read_proc = lvm_proc_read_lv_info; + pde->data = lv_ptr; + } + } +} + + +/* + * remove a /proc entry for a logical volume + */ +inline void lvm_do_remove_proc_entry_of_lv ( vg_t *vg_ptr, lv_t *lv_ptr) { + char *basename; + + if ( vg_ptr->lv_subdir_pde != NULL) { + basename = strrchr(lv_ptr->lv_name, '/'); + if (basename == NULL) basename = lv_ptr->lv_name; + else basename++; + remove_proc_entry(basename, vg_ptr->lv_subdir_pde); + } +} + + +/* + * create a /proc entry for a physical volume + */ +inline void lvm_do_create_proc_entry_of_pv ( vg_t *vg_ptr, pv_t *pv_ptr) { + char *basename; + + basename = strrchr(pv_ptr->pv_name, '/'); + if (basename == NULL) basename = pv_ptr->pv_name; + else basename++; + pde = create_proc_entry(basename, S_IFREG, vg_ptr->pv_subdir_pde); + if ( pde != NULL) { + pde->read_proc = lvm_proc_read_pv_info; + pde->data = pv_ptr; + } +} + + +/* + * remove a /proc entry for a physical volume + */ +inline void lvm_do_remove_proc_entry_of_pv ( vg_t *vg_ptr, pv_t *pv_ptr) { 
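+	/* strip the directory part of the PV name (e.g. "/dev/sdc1" ->
+	   "sdc1"); this must match the basename that
+	   lvm_do_create_proc_entry_of_pv() above registered */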
+	char *basename;
+
+	if ( vg_ptr->pv_subdir_pde != NULL) {
+		basename = strrchr(pv_ptr->pv_name, '/');
+		if (basename == NULL) basename = pv_ptr->pv_name;
+		else basename++;
+		remove_proc_entry(basename, vg_ptr->pv_subdir_pde);
+	}
+}
+
+
+/*
+ * create a /proc entry for a volume group
+ */
+#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS
+void lvm_do_create_proc_entry_of_vg ( vg_t *vg_ptr) {
+	int l, p;
+	pv_t *pv_ptr;
+	lv_t *lv_ptr;
+
+	pde = create_proc_entry(vg_ptr->vg_name, S_IFDIR,
+				lvm_proc_vg_subdir);
+	if ( pde != NULL) {
+		vg_ptr->vg_dir_pde = pde;
+		pde = create_proc_entry("group", S_IFREG,
+					vg_ptr->vg_dir_pde);
+		if ( pde != NULL) {
+			pde->read_proc = lvm_proc_read_vg_info;
+			pde->data = vg_ptr;
+		}
+		vg_ptr->lv_subdir_pde =
+		    create_proc_entry(LVM_LV_SUBDIR, S_IFDIR,
+				      vg_ptr->vg_dir_pde);
+		vg_ptr->pv_subdir_pde =
+		    create_proc_entry(LVM_PV_SUBDIR, S_IFDIR,
+				      vg_ptr->vg_dir_pde);
+	}
+
+	if ( vg_ptr->pv_subdir_pde != NULL) {
+		for ( l = 0; l < vg_ptr->lv_max; l++) {
+			if ( ( lv_ptr = vg_ptr->lv[l]) == NULL) continue;
+			lvm_do_create_proc_entry_of_lv ( vg_ptr, lv_ptr);
+		}
+		for ( p = 0; p < vg_ptr->pv_max; p++) {
+			if ( ( pv_ptr = vg_ptr->pv[p]) == NULL) continue;
+			lvm_do_create_proc_entry_of_pv ( vg_ptr, pv_ptr);
+		}
+	}
+}
+
+/*
+ * remove a /proc entry for a volume group
+ */
+void lvm_do_remove_proc_entry_of_vg ( vg_t *vg_ptr) {
+	int l, p;
+	lv_t *lv_ptr;
+	pv_t *pv_ptr;
+
+	for ( l = 0; l < vg_ptr->lv_max; l++) {
+		if ( ( lv_ptr = vg_ptr->lv[l]) == NULL) continue;
+		lvm_do_remove_proc_entry_of_lv ( vg_ptr, vg_ptr->lv[l]);
+	}
+	for ( p = 0; p < vg_ptr->pv_max; p++) {
+		if ( ( pv_ptr = vg_ptr->pv[p]) == NULL) continue;
+		lvm_do_remove_proc_entry_of_pv ( vg_ptr, vg_ptr->pv[p]);
+	}
+	if ( vg_ptr->vg_dir_pde != NULL) {
+		remove_proc_entry(LVM_LV_SUBDIR, vg_ptr->vg_dir_pde);
+		remove_proc_entry(LVM_PV_SUBDIR, vg_ptr->vg_dir_pde);
+		remove_proc_entry("group", vg_ptr->vg_dir_pde);
+		remove_proc_entry(vg_ptr->vg_name, lvm_proc_vg_subdir);
+	}
+}
+#endif
+
+
+/*
+ * support function initialize gendisk variables
+ */
+#ifdef __initfunc
+__initfunc(void lvm_geninit(struct gendisk *lvm_gdisk))
+#else
+void __init
+ lvm_geninit(struct gendisk *lvm_gdisk)
+#endif
+{
+	int i = 0;
+
+#ifdef DEBUG_GENDISK
+	printk(KERN_DEBUG "%s -- lvm_gendisk\n", lvm_name);
+#endif
+
+	for (i = 0; i < MAX_LV; i++) {
+		lvm_gendisk.part[i].start_sect = -1;	/* avoid partition check */
+		lvm_size[i] = lvm_gendisk.part[i].nr_sects = 0;
+		lvm_blocksizes[i] = BLOCK_SIZE;
+	}
+
+	blk_size[MAJOR_NR] = lvm_size;
+	blksize_size[MAJOR_NR] = lvm_blocksizes;
+	hardsect_size[MAJOR_NR] = lvm_blocksizes;
+
+	return;
+} /* lvm_geninit() */
+
+
+#ifdef LVM_GET_INODE
+/*
+ * support function to get an empty inode
+ *
+ * Gets an empty inode to be inserted into the inode hash,
+ * so that a physical volume can't be mounted.
+ * This is analogous to drivers/block/md.c
+ *
+ * Is this the real thing?
+ *
+ */
+struct inode *lvm_get_inode(int dev)
+{
+	struct inode *inode_this = NULL;
+
+	/* Lock the device by inserting a dummy inode.
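+	   The inode is never read or written; it merely sits in the
+	   inode hash with i_dev set to the PV's device so that
+	   fs_may_mount() treats the device as busy.  lvm_clear_inode()
+	   below drops it again when the PV leaves the VG.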
*/ + inode_this = get_empty_inode(); + inode_this->i_dev = dev; + insert_inode_hash(inode_this); + return inode_this; +} + + +/* + * support function to clear an inode + * + */ +void lvm_clear_inode(struct inode *inode) +{ +#ifdef I_FREEING + inode->i_state |= I_FREEING; +#endif + clear_inode(inode); + return; +} +#endif /* #ifdef LVM_GET_INODE */ diff -urN 2.2.18/drivers/block/md.c 2.2.18aa1/drivers/block/md.c --- 2.2.18/drivers/block/md.c Tue Sep 5 02:28:40 2000 +++ 2.2.18aa1/drivers/block/md.c Mon Dec 11 17:20:55 2000 @@ -1,21 +1,17 @@ - /* md.c : Multiple Devices driver for Linux - Copyright (C) 1994-96 Marc ZYNGIER - or - + Copyright (C) 1998, 1999 Ingo Molnar - A lot of inspiration came from hd.c ... + completely rewritten, based on the MD driver code from Marc Zyngier - kerneld support by Boris Tobotras - boot support for linear and striped mode by Harald Hoyer + Changes: - RAID-1/RAID-5 extensions by: - Ingo Molnar, Miguel de Icaza, Gadi Oxman + - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar + - boot support for linear and striped mode by Harald Hoyer + - kerneld support by Boris Tobotras + - kmod support by: Cyrus Durgin + - RAID0 bugfixes: Mark Anthony Lisher - Changes for kmod by: - Cyrus Durgin - This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) @@ -26,807 +22,3006 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -/* - * Current RAID-1,4,5 parallel reconstruction speed limit is 1024 KB/sec, so - * the extra system load does not show up that much. Increase it if your - * system can take more. - */ -#define SPEED_LIMIT 1024 +#include +#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #ifdef CONFIG_KMOD #include #endif -#include -#include #define __KERNEL_SYSCALLS__ #include +#include + +extern asmlinkage int sys_sched_yield(void); +extern asmlinkage int sys_setsid(void); + +extern unsigned long io_events[MAX_BLKDEV]; + #define MAJOR_NR MD_MAJOR #define MD_DRIVER #include -#include -#include -#include #ifdef CONFIG_MD_BOOT -extern kdev_t name_to_kdev_t(char *line) __init; +extern kdev_t name_to_kdev_t(char *line) md__init; #endif -static struct hd_struct md_hd_struct[MAX_MD_DEV]; -static int md_blocksizes[MAX_MD_DEV]; -int md_maxreadahead[MAX_MD_DEV]; -#if SUPPORT_RECONSTRUCTION -static struct md_thread *md_sync_thread = NULL; -#endif /* SUPPORT_RECONSTRUCTION */ +static mdk_personality_t *pers[MAX_PERSONALITY] = {NULL, }; + +/* + * these have to be allocated separately because external + * subsystems want to have a pre-defined structure + */ +struct hd_struct md_hd_struct[MAX_MD_DEVS]; +static int md_blocksizes[MAX_MD_DEVS]; +static int md_maxreadahead[MAX_MD_DEVS]; +static mdk_thread_t *md_recovery_thread = NULL; -int md_size[MAX_MD_DEV]={0, }; +int md_size[MAX_MD_DEVS] = {0, }; static void md_geninit (struct gendisk *); static struct gendisk md_gendisk= { - MD_MAJOR, - "md", - 0, - 1, - MAX_MD_DEV, - md_geninit, - md_hd_struct, - md_size, - MAX_MD_DEV, - NULL, - NULL + MD_MAJOR, + "md", + 0, + 1, + MAX_MD_DEVS, + md_geninit, + md_hd_struct, + md_size, + MAX_MD_DEVS, + NULL, + NULL }; -static struct md_personality *pers[MAX_PERSONALITY]={NULL, }; -struct md_dev md_dev[MAX_MD_DEV]; - -int md_thread(void * arg); +/* + * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' + * is 
100 KB/sec, so the extra system load does not show up that much.
+ * Increase it if you want to have more _guaranteed_ speed. Note that
+ * the RAID driver will use the maximum available bandwidth if the IO
+ * subsystem is idle.
+ *
+ * you can change it via /proc/sys/dev/speed-limit
+ */
 
-static struct gendisk *find_gendisk (kdev_t dev)
-{
-	struct gendisk *tmp=gendisk_head;
+static int sysctl_speed_limit = 100;
 
-	while (tmp != NULL)
-	{
-		if (tmp->major==MAJOR(dev))
-			return (tmp);
-
-		tmp=tmp->next;
-	}
+static struct ctl_table_header *md_table_header;
 
-	return (NULL);
-}
+static ctl_table md_table[] = {
+	{DEV_MD_SPEED_LIMIT, "speed-limit",
+	 &sysctl_speed_limit, sizeof(int), 0644, NULL, &proc_dointvec},
+	{0}
+};
 
-char *partition_name (kdev_t dev)
-{
-	static char name[40];		/* This should be long
-					   enough for a device name ! */
-	struct gendisk *hd = find_gendisk (dev);
+static ctl_table md_dir_table[] = {
+	{DEV_MD, "md", NULL, 0, 0555, md_table},
+	{0}
+};
 
-	if (!hd)
-	{
-		sprintf (name, "[dev %s]", kdevname(dev));
-		return (name);
-	}
+static ctl_table md_root_table[] = {
+	{CTL_DEV, "dev", NULL, 0, 0555, md_dir_table},
+	{0}
+};
 
-	return disk_name (hd, MINOR(dev), name); /* routine in genhd.c */
+static void md_register_sysctl(void)
+{
+	md_table_header = register_sysctl_table(md_root_table, 1);
 }
 
-static int legacy_raid_sb (int minor, int pnum)
+void md_unregister_sysctl(void)
 {
-	int i, factor;
+	unregister_sysctl_table(md_table_header);
+}
+
+/*
+ * The mapping between kdev and mddev is not necessarily a simple
+ * one! Eg. HSM uses several sub-devices to implement Logical
+ * Volumes. All these sub-devices map to the same mddev.
+ */
+dev_mapping_t mddev_map [MAX_MD_DEVS] = { {NULL, 0}, };
 
-	factor = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));
+void add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data)
+{
+	unsigned int minor = MINOR(dev);
 
-	/*****
-	 * do size and offset calculations.
- */ - for (i=0; i> PERSONALITY_SHIFT) - md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * md_dev[minor].nb_dev; - return 0; + if (mddev_map[minor].mddev != NULL) { + MD_BUG(); + return; + } + mddev_map[minor].mddev = mddev; + mddev_map[minor].data = data; } -static void free_sb (struct md_dev *mddev) +void del_mddev_mapping (mddev_t * mddev, kdev_t dev) { - int i; - struct real_dev *realdev; + unsigned int minor = MINOR(dev); - if (mddev->sb) { - free_page((unsigned long) mddev->sb); - mddev->sb = NULL; + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return; } - for (i = 0; i nb_dev; i++) { - realdev = mddev->devices + i; - if (realdev->sb) { - free_page((unsigned long) realdev->sb); - realdev->sb = NULL; - } + if (mddev_map[minor].mddev != mddev) { + MD_BUG(); + return; } + mddev_map[minor].mddev = NULL; + mddev_map[minor].data = NULL; } /* - * Check one RAID superblock for generic plausibility + * Enables to iterate over all existing md arrays */ +static MD_LIST_HEAD(all_mddevs); -#define BAD_MAGIC KERN_ERR \ -"md: %s: invalid raid superblock magic (%x) on block %u\n" +static mddev_t * alloc_mddev (kdev_t dev) +{ + mddev_t * mddev; -#define OUT_OF_MEM KERN_ALERT \ -"md: out of memory.\n" + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return 0; + } + mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL); + if (!mddev) + return NULL; + + memset(mddev, 0, sizeof(*mddev)); -#define NO_DEVICE KERN_ERR \ -"md: disabled device %s\n" + mddev->__minor = MINOR(dev); + mddev->reconfig_sem = MUTEX; + mddev->recovery_sem = MUTEX; + mddev->resync_sem = MUTEX; + MD_INIT_LIST_HEAD(&mddev->disks); + /* + * The 'base' mddev is the one with data NULL. + * personalities can create additional mddevs + * if necessary. + */ + add_mddev_mapping(mddev, dev, 0); + md_list_add(&mddev->all_mddevs, &all_mddevs); -#define SUCCESS 0 -#define FAILURE -1 + return mddev; +} -static int analyze_one_sb (struct real_dev * rdev) +static void free_mddev (mddev_t *mddev) { - int ret = FAILURE; - struct buffer_head *bh; - kdev_t dev = rdev->dev; - md_superblock_t *sb; + if (!mddev) { + MD_BUG(); + return; + } /* - * Read the superblock, it's at the end of the disk + * Make sure nobody else is using this mddev + * (careful, we rely on the global kernel lock here) */ - rdev->sb_offset = MD_NEW_SIZE_BLOCKS (blk_size[MAJOR(dev)][MINOR(dev)]); - set_blocksize (dev, MD_SB_BYTES); - bh = bread (dev, rdev->sb_offset / MD_SB_BLOCKS, MD_SB_BYTES); - - if (bh) { - sb = (md_superblock_t *) bh->b_data; - if (sb->md_magic != MD_SB_MAGIC) { - printk (BAD_MAGIC, kdevname(dev), - sb->md_magic, rdev->sb_offset); - goto abort; - } - rdev->sb = (md_superblock_t *) __get_free_page(GFP_KERNEL); - if (!rdev->sb) { - printk (OUT_OF_MEM); - goto abort; - } - memcpy (rdev->sb, bh->b_data, MD_SB_BYTES); + while (md_atomic_read(&mddev->resync_sem.count) != 1) + schedule(); + while (md_atomic_read(&mddev->recovery_sem.count) != 1) + schedule(); - rdev->size = sb->size; - } else - printk (NO_DEVICE,kdevname(rdev->dev)); - ret = SUCCESS; -abort: - if (bh) - brelse (bh); - return ret; + del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev))); + md_list_del(&mddev->all_mddevs); + MD_INIT_LIST_HEAD(&mddev->all_mddevs); + kfree(mddev); } -#undef SUCCESS -#undef FAILURE - -#undef BAD_MAGIC -#undef OUT_OF_MEM -#undef NO_DEVICE -/* - * Check a full RAID array for plausibility - */ +struct gendisk * find_gendisk (kdev_t dev) +{ + struct gendisk *tmp = gendisk_head; -#define INCONSISTENT KERN_ERR \ -"md: superblock inconsistency -- run ckraid\n" + while (tmp != 
NULL) {
+		if (tmp->major == MAJOR(dev))
+			return (tmp);
+		tmp = tmp->next;
+	}
+	return (NULL);
+}
 
-#define OUT_OF_DATE KERN_ERR \
-"md: superblock update time inconsistenty -- using the most recent one\n"
+mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
+{
+	mdk_rdev_t * rdev;
+	struct md_list_head *tmp;
 
-#define OLD_VERSION KERN_ALERT \
-"md: %s: unsupported raid array version %d.%d.%d\n"
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->desc_nr == nr)
+			return rdev;
+	}
+	return NULL;
+}
 
-#define NOT_CLEAN KERN_ERR \
-"md: %s: raid array is not clean -- run ckraid\n"
+mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev)
+{
+	struct md_list_head *tmp;
+	mdk_rdev_t *rdev;
 
-#define NOT_CLEAN_IGNORE KERN_ERR \
-"md: %s: raid array is not clean -- reconstructing parity\n"
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->dev == dev)
+			return rdev;
+	}
+	return NULL;
+}
 
-#define UNKNOWN_LEVEL KERN_ERR \
-"md: %s: unsupported raid level %d\n"
+static MD_LIST_HEAD(device_names);
 
-static int analyze_sbs (int minor, int pnum)
+char * partition_name (kdev_t dev)
 {
-	struct md_dev *mddev = md_dev + minor;
-	int i, N = mddev->nb_dev, out_of_date = 0;
-	struct real_dev * disks = mddev->devices;
-	md_superblock_t *sb, *freshest = NULL;
+	struct gendisk *hd;
+	static char nomem [] = "<nomem>";
+	dev_name_t *dname;
+	struct md_list_head *tmp = device_names.next;
 
-	/*
-	 * RAID-0 and linear don't use a RAID superblock
-	 */
-	if (pnum == RAID0 >> PERSONALITY_SHIFT ||
-	    pnum == LINEAR >> PERSONALITY_SHIFT)
-		return legacy_raid_sb (minor, pnum);
+	while (tmp != &device_names) {
+		dname = md_list_entry(tmp, dev_name_t, list);
+		if (dname->dev == dev)
+			return dname->name;
+		tmp = tmp->next;
+	}
+
+	dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
+	if (!dname)
+		return nomem;
 	/*
-	 * Verify the RAID superblock on each real device
+	 * ok, add this new device name to the list
 	 */
-	for (i = 0; i < N; i++)
-		if (analyze_one_sb(disks+i))
-			goto abort;
+	hd = find_gendisk (dev);
+
+	if (!hd)
+		sprintf (dname->name, "[dev %s]", kdevname(dev));
+	else
+		disk_name (hd, MINOR(dev), dname->name);
+
+	dname->dev = dev;
+	md_list_add(&dname->list, &device_names);
+
+	return dname->name;
+}
+
+static unsigned int calc_dev_sboffset (kdev_t dev, mddev_t *mddev,
+						int persistent)
+{
+	unsigned int size = 0;
+
+	if (blk_size[MAJOR(dev)])
+		size = blk_size[MAJOR(dev)][MINOR(dev)];
+	if (persistent)
+		size = MD_NEW_SIZE_BLOCKS(size);
+	return size;
+}
+
+static unsigned int calc_dev_size (kdev_t dev, mddev_t *mddev, int persistent)
+{
+	unsigned int size;
+
+	size = calc_dev_sboffset(dev, mddev, persistent);
+	if (!mddev->sb) {
+		MD_BUG();
+		return size;
+	}
+	if (mddev->sb->chunk_size)
+		size &= ~(mddev->sb->chunk_size/1024 - 1);
+	return size;
+}
+
+/*
+ * We check whether all devices are numbered from 0 to nb_dev-1. The
+ * order is guaranteed even after device name changes.
+ *
+ * Some personalities (raid0, linear) use this. Personalities that
+ * provide data have to be able to deal with loss of individual
+ * disks, so they do their checking themselves.
+ */
+int md_check_ordering (mddev_t *mddev)
+{
+	int i, c;
+	mdk_rdev_t *rdev;
+	struct md_list_head *tmp;
 
 	/*
+ * First, all devices must be fully functional */ - sb = NULL; - for (i = 0; i < N; i++) { - if (!disks[i].sb) - continue; - if (!sb) { - sb = disks[i].sb; - continue; - } - if (memcmp(sb, - disks[i].sb, MD_SB_GENERIC_CONSTANT_WORDS * 4)) { - printk (INCONSISTENT); + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + printk("md: md%d's device %s faulty, aborting.\n", + mdidx(mddev), partition_name(rdev->dev)); goto abort; } } - /* - * OK, we have all disks and the array is ready to run. Let's - * find the freshest superblock, that one will be the superblock - * that represents the whole array. - */ - if ((sb = mddev->sb = (md_superblock_t *) __get_free_page (GFP_KERNEL)) == NULL) + c = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + c++; + } + if (c != mddev->nb_dev) { + MD_BUG(); goto abort; - freshest = NULL; - for (i = 0; i < N; i++) { - if (!disks[i].sb) - continue; - if (!freshest) { - freshest = disks[i].sb; - continue; - } - /* - * Find the newest superblock version - */ - if (disks[i].sb->utime != freshest->utime) { - out_of_date = 1; - if (disks[i].sb->utime > freshest->utime) - freshest = disks[i].sb; - } } - if (out_of_date) - printk(OUT_OF_DATE); - memcpy (sb, freshest, sizeof(*freshest)); - - /* - * Check if we can support this RAID array - */ - if (sb->major_version != MD_MAJOR_VERSION || - sb->minor_version > MD_MINOR_VERSION) { - - printk (OLD_VERSION, kdevname(MKDEV(MD_MAJOR, minor)), - sb->major_version, sb->minor_version, - sb->patch_version); + if (mddev->nb_dev != mddev->sb->raid_disks) { + printk("md: md%d, array needs %d disks, has %d, aborting.\n", + mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev); goto abort; } - /* - * We need to add this as a superblock option. + * Now the numbering check */ -#if SUPPORT_RECONSTRUCTION - if (sb->state != (1 << MD_SB_CLEAN)) { - if (sb->level == 1) { - printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor))); + for (i = 0; i < mddev->nb_dev; i++) { + c = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == i) + c++; + } + if (c == 0) { + printk("md: md%d, missing disk #%d, aborting.\n", + mdidx(mddev), i); goto abort; - } else - printk (NOT_CLEAN_IGNORE, kdevname(MKDEV(MD_MAJOR, minor))); - } -#else - if (sb->state != (1 << MD_SB_CLEAN)) { - printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor))); - goto abort; - } -#endif /* SUPPORT_RECONSTRUCTION */ - - switch (sb->level) { - case 1: - md_size[minor] = sb->size; - md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD; - break; - case 4: - case 5: - md_size[minor] = sb->size * (sb->raid_disks - 1); - md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * (sb->raid_disks - 1); - break; - default: - printk (UNKNOWN_LEVEL, kdevname(MKDEV(MD_MAJOR, minor)), - sb->level); + } + if (c > 1) { + printk("md: md%d, too many disks #%d, aborting.\n", + mdidx(mddev), i); goto abort; + } } return 0; abort: - free_sb(mddev); return 1; } -#undef INCONSISTENT -#undef OUT_OF_DATE -#undef OLD_VERSION -#undef NOT_CLEAN -#undef OLD_LEVEL - -int md_update_sb(int minor) +static unsigned int zoned_raid_size (mddev_t *mddev) { - struct md_dev *mddev = md_dev + minor; - struct buffer_head *bh; - md_superblock_t *sb = mddev->sb; - struct real_dev *realdev; - kdev_t dev; - int i; - u32 sb_offset; + unsigned int mask; + mdk_rdev_t * rdev; + struct md_list_head *tmp; - sb->utime = CURRENT_TIME; - for (i = 0; i < mddev->nb_dev; i++) { - realdev = mddev->devices + i; - if (!realdev->sb) - continue; - dev = realdev->dev; - sb_offset = realdev->sb_offset; - set_blocksize(dev, MD_SB_BYTES); - printk("md: updating raid 
superblock on device %s, sb_offset == %u\n", kdevname(dev), sb_offset); - bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES); - if (bh) { - sb = (md_superblock_t *) bh->b_data; - memcpy(sb, mddev->sb, MD_SB_BYTES); - memcpy(&sb->descriptor, sb->disks + realdev->sb->descriptor.number, MD_SB_DESCRIPTOR_WORDS * 4); - mark_buffer_uptodate(bh, 1); - mark_buffer_dirty(bh, 1); - ll_rw_block(WRITE, 1, &bh); - wait_on_buffer(bh); - bforget(bh); - fsync_dev(dev); - invalidate_buffers(dev); - } else - printk(KERN_ERR "md: getblk failed for device %s\n", kdevname(dev)); + if (!mddev->sb) { + MD_BUG(); + return -EINVAL; + } + /* + * do size and offset calculations. + */ + mask = ~(mddev->sb->chunk_size/1024 - 1); +printk("mask %08x\n", mask); + + ITERATE_RDEV(mddev,rdev,tmp) { +printk(" rdev->size: %d\n", rdev->size); + rdev->size &= mask; +printk(" masked rdev->size: %d\n", rdev->size); + md_size[mdidx(mddev)] += rdev->size; +printk(" new md_size: %d\n", md_size[mdidx(mddev)]); } return 0; } -static int do_md_run (int minor, int repart) +static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb) { - int pnum, i, min, factor, err; + if (disk_active(disk)) { + sb->working_disks--; + } else { + if (disk_spare(disk)) { + sb->spare_disks--; + sb->working_disks--; + } else { + sb->failed_disks--; + } + } + sb->nr_disks--; + disk->major = 0; + disk->minor = 0; + mark_disk_removed(disk); +} - if (!md_dev[minor].nb_dev) - return -EINVAL; - - if (md_dev[minor].pers) - return -EBUSY; +#define BAD_MAGIC KERN_ERR \ +"md: invalid raid superblock magic on %s\n" - md_dev[minor].repartition=repart; - - if ((pnum=PERSONALITY(&md_dev[minor]) >> (PERSONALITY_SHIFT)) - >= MAX_PERSONALITY) - return -EINVAL; - - /* Only RAID-1 and RAID-5 can have MD devices as underlying devices */ - if (pnum != (RAID1 >> PERSONALITY_SHIFT) && pnum != (RAID5 >> PERSONALITY_SHIFT)){ - for (i = 0; i < md_dev [minor].nb_dev; i++) - if (MAJOR (md_dev [minor].devices [i].dev) == MD_MAJOR) - return -EINVAL; - } - if (!pers[pnum]) - { -#ifdef CONFIG_KMOD - char module_name[80]; - sprintf (module_name, "md-personality-%d", pnum); - request_module (module_name); - if (!pers[pnum]) -#endif - return -EINVAL; - } - - factor = min = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor))); - - for (i=0; irun (minor, md_dev+minor))) - { - md_dev[minor].pers=NULL; - free_sb(md_dev + minor); - return (err); - } - - if (pnum != RAID0 >> PERSONALITY_SHIFT && pnum != LINEAR >> PERSONALITY_SHIFT) - { - md_dev[minor].sb->state &= ~(1 << MD_SB_CLEAN); - md_update_sb(minor); - } - - /* FIXME : We assume here we have blocks - that are twice as large as sectors. - THIS MAY NOT BE TRUE !!! */ - md_hd_struct[minor].start_sect=0; - md_hd_struct[minor].nr_sects=md_size[minor]<<1; - - read_ahead[MD_MAJOR] = 128; - return (0); -} +#define OUT_OF_MEM KERN_ALERT \ +"md: out of memory.\n" + +#define NO_SB KERN_ERR \ +"md: disabled device %s, could not read superblock.\n" -static int do_md_stop (int minor, struct inode *inode) +#define BAD_CSUM KERN_WARNING \ +"md: invalid superblock checksum on %s\n" + +static int alloc_array_sb (mddev_t * mddev) { - int i; - - if (inode->i_count>1 || md_dev[minor].busy>1) { - /* - * ioctl : one open channel - */ - printk ("STOP_MD md%x failed : i_count=%d, busy=%d\n", - minor, inode->i_count, md_dev[minor].busy); - return -EBUSY; - } - - if (md_dev[minor].pers) { - /* - * It is safe to call stop here, it only frees private - * data. Also, it tells us if a device is unstoppable - * (eg. 
resyncing is in progress) - */ - if (md_dev[minor].pers->stop (minor, md_dev+minor)) - return -EBUSY; - /* - * The device won't exist anymore -> flush it now - */ - fsync_dev (inode->i_rdev); - invalidate_buffers (inode->i_rdev); - if (md_dev[minor].sb) { - md_dev[minor].sb->state |= 1 << MD_SB_CLEAN; - md_update_sb(minor); - } + if (mddev->sb) { + MD_BUG(); + return 0; } - - /* Remove locks. */ - if (md_dev[minor].sb) - free_sb(md_dev + minor); - for (i=0; isb = (mdp_super_t *) __get_free_page (GFP_KERNEL); + if (!mddev->sb) + return -ENOMEM; + md_clear_page((unsigned long)mddev->sb); + return 0; } -static int do_md_add (int minor, kdev_t dev) +static int alloc_disk_sb (mdk_rdev_t * rdev) { - int i; - int hot_add=0; - struct real_dev *realdev; + if (rdev->sb) + MD_BUG(); - if (md_dev[minor].nb_dev==MAX_REAL) + rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL); + if (!rdev->sb) { + printk (OUT_OF_MEM); return -EINVAL; + } + md_clear_page((unsigned long)rdev->sb); - if (!fs_may_mount (dev)) - return -EBUSY; + return 0; +} - if (blk_size[MAJOR(dev)] == NULL || blk_size[MAJOR(dev)][MINOR(dev)] == 0) { - printk("md_add(): zero device size, huh, bailing out.\n"); - return -EINVAL; +static void free_disk_sb (mdk_rdev_t * rdev) +{ + if (rdev->sb) { + free_page((unsigned long) rdev->sb); + rdev->sb = NULL; + rdev->sb_offset = 0; + rdev->size = 0; + } else { + if (!rdev->faulty) + MD_BUG(); } +} - if (md_dev[minor].pers) { - /* - * The array is already running, hot-add the drive, or - * bail out: - */ - if (!md_dev[minor].pers->hot_add_disk) - return -EBUSY; - else - hot_add=1; +static void mark_rdev_faulty (mdk_rdev_t * rdev) +{ + unsigned long flags; + + if (!rdev) { + MD_BUG(); + return; } + save_flags(flags); + cli(); + free_disk_sb(rdev); + rdev->faulty = 1; + restore_flags(flags); +} + +static int read_disk_sb (mdk_rdev_t * rdev) +{ + int ret = -EINVAL; + struct buffer_head *bh = NULL; + kdev_t dev = rdev->dev; + mdp_super_t *sb; + u32 sb_offset; + if (!rdev->sb) { + MD_BUG(); + goto abort; + } + /* - * Careful. We cannot increase nb_dev for a running array. + * Calculate the position of the superblock, + * it's at the end of the disk */ - i=md_dev[minor].nb_dev; - realdev = &md_dev[minor].devices[i]; - realdev->dev=dev; - - /* Lock the device by inserting a dummy inode. This doesn't - smell very good, but I need to be consistent with the - mount stuff, specially with fs_may_mount. If someone have - a better idea, please help ! 
*/ - - realdev->inode=get_empty_inode (); - realdev->inode->i_dev=dev; /* don't care about other fields */ - insert_inode_hash (realdev->inode); - - /* Sizes are now rounded at run time */ - -/* md_dev[minor].devices[i].size=gen_real->sizes[MINOR(dev)]; HACKHACK*/ - - realdev->size=blk_size[MAJOR(dev)][MINOR(dev)]; + sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1); + rdev->sb_offset = sb_offset; + printk("(read) %s's sb offset: %d", partition_name(dev), + sb_offset); + fsync_dev(dev); + set_blocksize (dev, MD_SB_BYTES); + bh = bread (dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES); - if (hot_add) { + if (bh) { + sb = (mdp_super_t *) bh->b_data; + memcpy (rdev->sb, sb, MD_SB_BYTES); + } else { + printk (NO_SB,partition_name(rdev->dev)); + goto abort; + } + printk(" [events: %08lx]\n", (unsigned long)get_unaligned(&rdev->sb->events)); + ret = 0; +abort: + if (bh) + brelse (bh); + return ret; +} + +static unsigned int calc_sb_csum (mdp_super_t * sb) +{ + unsigned int disk_csum, csum; + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + csum = csum_partial((void *)sb, MD_SB_BYTES, 0); + sb->sb_csum = disk_csum; + return csum; +} + +/* + * Check one RAID superblock for generic plausibility + */ + +static int check_disk_sb (mdk_rdev_t * rdev) +{ + mdp_super_t *sb; + int ret = -EINVAL; + + sb = rdev->sb; + if (!sb) { + MD_BUG(); + goto abort; + } + + if (sb->md_magic != MD_SB_MAGIC) { + printk (BAD_MAGIC, partition_name(rdev->dev)); + goto abort; + } + + if (sb->md_minor >= MAX_MD_DEVS) { + printk (BAD_MINOR, partition_name(rdev->dev), + sb->md_minor); + goto abort; + } + + if (calc_sb_csum(sb) != sb->sb_csum) + printk(BAD_CSUM, partition_name(rdev->dev)); + ret = 0; +abort: + return ret; +} + +static kdev_t dev_unit(kdev_t dev) +{ + unsigned int mask; + struct gendisk *hd = find_gendisk(dev); + + if (!hd) + return 0; + mask = ~((1 << hd->minor_shift) - 1); + + return MKDEV(MAJOR(dev), MINOR(dev) & mask); +} + +static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) + if (dev_unit(rdev->dev) == dev_unit(dev)) + return rdev; + + return NULL; +} + +static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev1,rdev,tmp) + if (match_dev_unit(mddev2, rdev->dev)) + return 1; + + return 0; +} + +static MD_LIST_HEAD(all_raid_disks); +static MD_LIST_HEAD(pending_raid_disks); + +static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev) +{ + mdk_rdev_t *same_pdev; + + if (rdev->mddev) { + MD_BUG(); + return; + } + same_pdev = match_dev_unit(mddev, rdev->dev); + if (same_pdev) + printk( KERN_WARNING +"md%d: WARNING: %s appears to be on the same physical disk as %s. 
True\n" +" protection against single-disk failure might be compromised.\n", + mdidx(mddev), partition_name(rdev->dev), + partition_name(same_pdev->dev)); + + md_list_add(&rdev->same_set, &mddev->disks); + rdev->mddev = mddev; + mddev->nb_dev++; + printk("bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev); +} + +static void unbind_rdev_from_array (mdk_rdev_t * rdev) +{ + if (!rdev->mddev) { + MD_BUG(); + return; + } + md_list_del(&rdev->same_set); + MD_INIT_LIST_HEAD(&rdev->same_set); + rdev->mddev->nb_dev--; + printk("unbind<%s,%d>\n", partition_name(rdev->dev), + rdev->mddev->nb_dev); + rdev->mddev = NULL; +} + +/* + * prevent the device from being mounted, repartitioned or + * otherwise reused by a RAID array (or any other kernel + * subsystem), by opening the device. [simply getting an + * inode is not enough, the SCSI module usage code needs + * an explicit open() on the device] + */ +static int lock_rdev (mdk_rdev_t *rdev) +{ + int err = 0; + + /* + * First insert a dummy inode. + */ + if (rdev->inode) + MD_BUG(); + rdev->inode = get_empty_inode(); + /* + * we dont care about any other fields + */ + rdev->inode->i_dev = rdev->inode->i_rdev = rdev->dev; + insert_inode_hash(rdev->inode); + + memset(&rdev->filp, 0, sizeof(rdev->filp)); + rdev->filp.f_mode = 3; /* read write */ + err = blkdev_open(rdev->inode, &rdev->filp); + if (err) { + printk("blkdev_open() failed: %d\n", err); + clear_inode(rdev->inode); + rdev->inode = NULL; + } + return err; +} + +static void unlock_rdev (mdk_rdev_t *rdev) +{ + blkdev_release(rdev->inode); + if (!rdev->inode) + MD_BUG(); + clear_inode(rdev->inode); + rdev->inode = NULL; +} + +static void export_rdev (mdk_rdev_t * rdev) +{ + printk("export_rdev(%s)\n",partition_name(rdev->dev)); + if (rdev->mddev) + MD_BUG(); + unlock_rdev(rdev); + free_disk_sb(rdev); + md_list_del(&rdev->all); + MD_INIT_LIST_HEAD(&rdev->all); + if (rdev->pending.next != &rdev->pending) { + printk("(%s was pending)\n",partition_name(rdev->dev)); + md_list_del(&rdev->pending); + MD_INIT_LIST_HEAD(&rdev->pending); + } + rdev->dev = 0; + rdev->faulty = 0; + kfree(rdev); +} + +static void kick_rdev_from_array (mdk_rdev_t * rdev) +{ + unbind_rdev_from_array(rdev); + export_rdev(rdev); +} + +static void export_array (mddev_t *mddev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + mdp_super_t *sb = mddev->sb; + + if (mddev->sb) { + mddev->sb = NULL; + free_page((unsigned long) sb); + } + + ITERATE_RDEV(mddev,rdev,tmp) { + if (!rdev->mddev) { + MD_BUG(); + continue; + } + kick_rdev_from_array(rdev); + } + if (mddev->nb_dev) + MD_BUG(); +} + +#undef BAD_CSUM +#undef BAD_MAGIC +#undef OUT_OF_MEM +#undef NO_SB + +static void print_desc(mdp_disk_t *desc) +{ + printk(" DISK\n", desc->number, + partition_name(MKDEV(desc->major,desc->minor)), + desc->major,desc->minor,desc->raid_disk,desc->state); +} + +static void print_sb(mdp_super_t *sb) +{ + int i; + + printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", + sb->major_version, sb->minor_version, sb->patch_version, + sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, + sb->ctime); + printk(" L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level, + sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor, + sb->layout, sb->chunk_size); + printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n", + sb->utime, sb->state, sb->active_disks, sb->working_disks, + sb->failed_disks, sb->spare_disks, + sb->sb_csum, (unsigned long)get_unaligned(&sb->events)); + + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + + 
+		desc = sb->disks + i;
+		printk(" D %2d: ", i);
+		print_desc(desc);
+	}
+	printk(" THIS: ");
+	print_desc(&sb->this_disk);
+
+}
+
+static void print_rdev(mdk_rdev_t *rdev)
+{
+	printk(" rdev %s: O:%s, SZ:%08d F:%d DN:%d ",
+		partition_name(rdev->dev), partition_name(rdev->old_dev),
+		rdev->size, rdev->faulty, rdev->desc_nr);
+	if (rdev->sb) {
+		printk("rdev superblock:\n");
+		print_sb(rdev->sb);
+	} else
+		printk("no rdev superblock!\n");
+}
+
+void md_print_devices (void)
+{
+	struct md_list_head *tmp, *tmp2;
+	mdk_rdev_t *rdev;
+	mddev_t *mddev;
+
+	printk("\n");
+	printk(" **********************************\n");
+	printk(" * <COMPLETE RAID STATE PRINTOUT> *\n");
+	printk(" **********************************\n");
+	ITERATE_MDDEV(mddev,tmp) {
+		printk("md%d: ", mdidx(mddev));
+
+		ITERATE_RDEV(mddev,rdev,tmp2)
+			printk("<%s>", partition_name(rdev->dev));
+
+		if (mddev->sb) {
+			printk(" array superblock:\n");
+			print_sb(mddev->sb);
+		} else
+			printk(" no array superblock.\n");
+
+		ITERATE_RDEV(mddev,rdev,tmp2)
+			print_rdev(rdev);
+	}
+	printk(" **********************************\n");
+	printk("\n");
+}
+
+static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2)
+{
+	int ret;
+	mdp_super_t *tmp1, *tmp2;
+
+	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
+	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
+
+	if (!tmp1 || !tmp2) {
+		ret = 0;
+		goto abort;
+	}
+
+	*tmp1 = *sb1;
+	*tmp2 = *sb2;
+
+	/*
+	 * nr_disks is not constant
+	 */
+	tmp1->nr_disks = 0;
+	tmp2->nr_disks = 0;
+
+	if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
+		ret = 0;
+	else
+		ret = 1;
+
+abort:
+	if (tmp1)
+		kfree(tmp1);
+	if (tmp2)
+		kfree(tmp2);
+
+	return ret;
+}
+
+static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
+{
+	if (	(rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
+		(rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
+		(rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
+		(rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
+
+		return 1;
+
+	return 0;
+}
+
+static mdk_rdev_t * find_rdev_all (kdev_t dev)
+{
+	struct md_list_head *tmp;
+	mdk_rdev_t *rdev;
+
+	tmp = all_raid_disks.next;
+	while (tmp != &all_raid_disks) {
+		rdev = md_list_entry(tmp, mdk_rdev_t, all);
+		if (rdev->dev == dev)
+			return rdev;
+		tmp = tmp->next;
+	}
+	return NULL;
+}
+
+#define GETBLK_FAILED KERN_ERR \
+"md: getblk failed for device %s\n"
+
+static int write_disk_sb(mdk_rdev_t * rdev)
+{
+	struct buffer_head *bh;
+	kdev_t dev;
+	u32 sb_offset, size;
+	mdp_super_t *sb;
+
+	if (!rdev->sb) {
+		MD_BUG();
+		return -1;
+	}
+	if (rdev->faulty) {
+		MD_BUG();
+		return -1;
+	}
+	if (rdev->sb->md_magic != MD_SB_MAGIC) {
+		MD_BUG();
+		return -1;
+	}
+
+	dev = rdev->dev;
+	sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
+	if (rdev->sb_offset != sb_offset) {
+		printk("%s's sb offset has changed from %d to %d, skipping\n", partition_name(dev), rdev->sb_offset, sb_offset);
+		goto skip;
+	}
+	/*
+	 * If the disk went offline meanwhile and it's just a spare, then
+	 * its size has changed to zero silently, and the MD code does
+	 * not yet know that it's faulty.
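+	 *
+	 * (For reference: the persistent superblock lives in a 64K
+	 *  reservation at the end of the device, so calc_dev_sboffset()
+	 *  effectively computes, per MD_NEW_SIZE_BLOCKS() in md_p.h,
+	 *  shown here only as a sketch:
+	 *
+	 *	sb_offset = (size & ~(MD_RESERVED_BLOCKS - 1))
+	 *			- MD_RESERVED_BLOCKS;
+	 *
+	 *  a device that silently shrank will thus report a different
+	 *  offset and is skipped here rather than corrupted.)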
+	 */
+	size = calc_dev_size(dev, rdev->mddev, 1);
+	if (size != rdev->size) {
+		printk("%s's size has changed from %d to %d since import, skipping\n", partition_name(dev), rdev->size, size);
+		goto skip;
+	}
+
+	printk("(write) %s's sb offset: %d\n", partition_name(dev), sb_offset);
+	fsync_dev(dev);
+	set_blocksize(dev, MD_SB_BYTES);
+	bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
+	if (!bh) {
+		printk(GETBLK_FAILED, partition_name(dev));
+		return 1;
+	}
+	memset(bh->b_data,0,bh->b_size);
+	sb = (mdp_super_t *) bh->b_data;
+	memcpy(sb, rdev->sb, MD_SB_BYTES);
+
+	mark_buffer_uptodate(bh, 1);
+	mark_buffer_dirty(bh, 1);
+	ll_rw_block(WRITE, 1, &bh);
+	wait_on_buffer(bh);
+	brelse(bh);
+	fsync_dev(dev);
+skip:
+	return 0;
+}
+#undef GETBLK_FAILED
+
+static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	int i, ok = 0;
+	mdp_disk_t *desc;
+
+	for (i = 0; i < MD_SB_DISKS; i++) {
+		desc = mddev->sb->disks + i;
+#if 0
+		if (disk_faulty(desc)) {
+			if (MKDEV(desc->major,desc->minor) == rdev->dev)
+				ok = 1;
+			continue;
+		}
+#endif
+		if (MKDEV(desc->major,desc->minor) == rdev->dev) {
+			rdev->sb->this_disk = *desc;
+			rdev->desc_nr = desc->number;
+			ok = 1;
+			break;
+		}
+	}
+
+	if (!ok) {
+		MD_BUG();
+	}
+}
+
+static int sync_sbs(mddev_t * mddev)
+{
+	mdk_rdev_t *rdev;
+	mdp_super_t *sb;
+	struct md_list_head *tmp;
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->faulty)
+			continue;
+		sb = rdev->sb;
+		*sb = *mddev->sb;
+		set_this_disk(mddev, rdev);
+		sb->sb_csum = calc_sb_csum(sb);
+	}
+	return 0;
+}
+
+int md_update_sb(mddev_t * mddev)
+{
+	int first, err, count = 100;
+	struct md_list_head *tmp;
+	mdk_rdev_t *rdev;
+	__u64 ev;
+
+repeat:
+	mddev->sb->utime = CURRENT_TIME;
+	ev = get_unaligned(&mddev->sb->events);
+	++ev;
+	put_unaligned(ev,&mddev->sb->events);
+	if (ev == (__u64)0) {
+		/*
+		 * oops, this 64-bit counter should never wrap.
+		 * Either we are in around ~1 trillion A.D., assuming
+		 * 1 reboot per second, or we have a bug:
+		 */
+		MD_BUG();
+		--ev;
+		put_unaligned(ev,&mddev->sb->events);
+	}
+	sync_sbs(mddev);
+
+	/*
+	 * do not write anything to disk if using
+	 * nonpersistent superblocks
+	 */
+	if (mddev->sb->not_persistent)
+		return 0;
+
+	printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
+					mdidx(mddev));
+
+	first = 1;
+	err = 0;
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (!first)
+			printk(", ");
+		first = 0;
+		if (rdev->faulty)
+			printk("(skipping faulty ");
+		printk("%s ", partition_name(rdev->dev));
+		if (!rdev->faulty) {
+			printk("[events: %08lx]",
+				(unsigned long)get_unaligned(&rdev->sb->events));
+			err += write_disk_sb(rdev);
+		} else
+			printk(")\n");
+	}
+	printk(".\n");
+	if (err) {
+		printk("errors occurred during superblock update, repeating\n");
+		if (--count)
+			goto repeat;
+		printk("excessive errors occurred during superblock update, exiting\n");
+	}
+	return 0;
+}
+
+/*
+ * Import a device. If 'on_disk', then sanity check the superblock
+ *
+ * mark the device faulty if:
+ *
+ *   - the device is nonexistent (zero size)
+ *   - the device has no valid superblock
+ *
+ * a faulty rdev _never_ has rdev->sb set.
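+ *
+ * A typical call sequence, sketched from the autostart path further
+ * down (error handling trimmed):
+ *
+ *	if (md_import_device(dev, 1))
+ *		printk("md: could not import %s!\n", partition_name(dev));
+ *	else {
+ *		mdk_rdev_t *rdev = find_rdev_all(dev);
+ *		md_list_add(&rdev->pending, &pending_raid_disks);
+ *	}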
+ */ +static int md_import_device (kdev_t newdev, int on_disk) +{ + int err; + mdk_rdev_t *rdev; + unsigned int size; + + if (find_rdev_all(newdev)) + return -EEXIST; + + rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); + if (!rdev) { + printk("could not alloc mem for %s!\n", partition_name(newdev)); + return -ENOMEM; + } + memset(rdev, 0, sizeof(*rdev)); + + if (!fs_may_mount(newdev)) { + printk("md: can not import %s, has active inodes!\n", + partition_name(newdev)); + err = -EBUSY; + goto abort_free; + } + + if ((err = alloc_disk_sb(rdev))) + goto abort_free; + + rdev->dev = newdev; + if (lock_rdev(rdev)) { + printk("md: could not lock %s, zero-size? Marking faulty.\n", + partition_name(newdev)); + err = -EINVAL; + goto abort_free; + } + rdev->desc_nr = -1; + rdev->faulty = 0; + + size = 0; + if (blk_size[MAJOR(newdev)]) + size = blk_size[MAJOR(newdev)][MINOR(newdev)]; + if (!size) { + printk("md: %s has zero size, marking faulty!\n", + partition_name(newdev)); + err = -EINVAL; + goto abort_free; + } + + if (on_disk) { + if ((err = read_disk_sb(rdev))) { + printk("md: could not read %s's sb, not importing!\n", + partition_name(newdev)); + goto abort_free; + } + if ((err = check_disk_sb(rdev))) { + printk("md: %s has invalid sb, not importing!\n", + partition_name(newdev)); + goto abort_free; + } + + rdev->old_dev = MKDEV(rdev->sb->this_disk.major, + rdev->sb->this_disk.minor); + rdev->desc_nr = rdev->sb->this_disk.number; + } + md_list_add(&rdev->all, &all_raid_disks); + MD_INIT_LIST_HEAD(&rdev->pending); + + if (rdev->faulty && rdev->sb) + free_disk_sb(rdev); + return 0; + +abort_free: + if (rdev->sb) { + if (rdev->inode) + unlock_rdev(rdev); + free_disk_sb(rdev); + } + kfree(rdev); + return err; +} + +/* + * Check a full RAID array for plausibility + */ + +#define INCONSISTENT KERN_ERR \ +"md: fatal superblock inconsistency in %s -- removing from array\n" + +#define OUT_OF_DATE KERN_ERR \ +"md: superblock update time inconsistency -- using the most recent one\n" + +#define OLD_VERSION KERN_ALERT \ +"md: md%d: unsupported raid array version %d.%d.%d\n" + +#define NOT_CLEAN_IGNORE KERN_ERR \ +"md: md%d: raid array is not clean -- starting background reconstruction\n" + +#define UNKNOWN_LEVEL KERN_ERR \ +"md: md%d: unsupported raid level %d\n" + +static int analyze_sbs (mddev_t * mddev) +{ + int out_of_date = 0, i; + struct md_list_head *tmp, *tmp2; + mdk_rdev_t *rdev, *rdev2, *freshest; + mdp_super_t *sb; + + /* + * Verify the RAID superblock on each real device + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + MD_BUG(); + goto abort; + } + if (!rdev->sb) { + MD_BUG(); + goto abort; + } + if (check_disk_sb(rdev)) + goto abort; + } + + /* + * The superblock constant part has to be the same + * for all disks in the array. + */ + sb = NULL; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (!sb) { + sb = rdev->sb; + continue; + } + if (!sb_equal(sb, rdev->sb)) { + printk (INCONSISTENT, partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + continue; + } + } + + /* + * OK, we have all disks and the array is ready to run. Let's + * find the freshest superblock, that one will be the superblock + * that represents the whole array. + */ + if (!mddev->sb) + if (alloc_array_sb(mddev)) + goto abort; + sb = mddev->sb; + freshest = NULL; + + ITERATE_RDEV(mddev,rdev,tmp) { + __u64 ev1, ev2; + /* + * if the checksum is invalid, use the superblock + * only as a last resort. 
(decrease it's age by + * one event) + */ + if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) { + __u64 ev = get_unaligned(&rdev->sb->events); + if (ev != (__u64)0) { + --ev; + put_unaligned(ev,&rdev->sb->events); + } + } + + printk("%s's event counter: %08lx\n", partition_name(rdev->dev), + (unsigned long)get_unaligned(&rdev->sb->events)); + if (!freshest) { + freshest = rdev; + continue; + } + /* + * Find the newest superblock version + */ + ev1 = get_unaligned(&rdev->sb->events); + ev2 = get_unaligned(&freshest->sb->events); + if (ev1 != ev2) { + out_of_date = 1; + if (ev1 > ev2) + freshest = rdev; + } + } + if (out_of_date) { + printk(OUT_OF_DATE); + printk("freshest: %s\n", partition_name(freshest->dev)); + } + memcpy (sb, freshest->sb, sizeof(*sb)); + + /* + * at this point we have picked the 'best' superblock + * from all available superblocks. + * now we validate this superblock and kick out possibly + * failed disks. + */ + ITERATE_RDEV(mddev,rdev,tmp) { + /* + * Kick all non-fresh devices faulty + */ + __u64 ev1, ev2; + ev1 = get_unaligned(&rdev->sb->events); + ev2 = get_unaligned(&sb->events); + ++ev1; + if (ev1 < ev2) { + printk("md: kicking non-fresh %s from array!\n", + partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + continue; + } + } + + /* + * Fix up changed device names ... but only if this disk has a + * recent update time. Use faulty checksum ones too. + */ + ITERATE_RDEV(mddev,rdev,tmp) { + __u64 ev1, ev2, ev3; + if (rdev->faulty) { /* REMOVEME */ + MD_BUG(); + goto abort; + } + ev1 = get_unaligned(&rdev->sb->events); + ev2 = get_unaligned(&sb->events); + ev3 = ev2; + --ev3; + if ((rdev->dev != rdev->old_dev) && + ((ev1 == ev2) || (ev1 == ev3))) { + mdp_disk_t *desc; + + printk("md: device name has changed from %s to %s since last import!\n", partition_name(rdev->old_dev), partition_name(rdev->dev)); + if (rdev->desc_nr == -1) { + MD_BUG(); + goto abort; + } + desc = &sb->disks[rdev->desc_nr]; + if (rdev->old_dev != MKDEV(desc->major, desc->minor)) { + MD_BUG(); + goto abort; + } + desc->major = MAJOR(rdev->dev); + desc->minor = MINOR(rdev->dev); + desc = &rdev->sb->this_disk; + desc->major = MAJOR(rdev->dev); + desc->minor = MINOR(rdev->dev); + } + } + + /* + * Remove unavailable and faulty devices ... + * + * note that if an array becomes completely unrunnable due to + * missing devices, we do not write the superblock back, so the + * administrator has a chance to fix things up. The removal thus + * only happens if it's nonfatal to the contents of the array. + */ + for (i = 0; i < MD_SB_DISKS; i++) { + int found; + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + /* + * We kick faulty devices/descriptors immediately. + */ + if (disk_faulty(desc)) { + found = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr != desc->number) + continue; + printk("md%d: kicking faulty %s!\n", + mdidx(mddev),partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + found = 1; + break; + } + if (!found) { + if (dev == MKDEV(0,0)) + continue; + printk("md%d: removing former faulty %s!\n", + mdidx(mddev), partition_name(dev)); + } + remove_descriptor(desc, sb); + continue; + } + + if (dev == MKDEV(0,0)) + continue; + /* + * Is this device present in the rdev ring? 
+		 */
+		found = 0;
+		ITERATE_RDEV(mddev,rdev,tmp) {
+			if (rdev->desc_nr == desc->number) {
+				found = 1;
+				break;
+			}
+		}
+		if (found)
+			continue;
+
+		printk("md%d: former device %s is unavailable, removing from array!\n", mdidx(mddev), partition_name(dev));
+		remove_descriptor(desc, sb);
+	}
+
+	/*
+	 * Double check whether all devices mentioned in the
+	 * superblock are in the rdev ring.
+	 */
+	for (i = 0; i < MD_SB_DISKS; i++) {
+		mdp_disk_t *desc;
+		kdev_t dev;
+
+		desc = sb->disks + i;
+		dev = MKDEV(desc->major, desc->minor);
+
+		if (dev == MKDEV(0,0))
+			continue;
+
+		if (disk_faulty(desc)) {
+			MD_BUG();
+			goto abort;
+		}
+
+		rdev = find_rdev(mddev, dev);
+		if (!rdev) {
+			MD_BUG();
+			goto abort;
+		}
+	}
+
+	/*
+	 * Do a final reality check.
+	 */
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->desc_nr == -1) {
+			MD_BUG();
+			goto abort;
+		}
+		/*
+		 * is the desc_nr unique?
+		 */
+		ITERATE_RDEV(mddev,rdev2,tmp2) {
+			if ((rdev2 != rdev) &&
+					(rdev2->desc_nr == rdev->desc_nr)) {
+				MD_BUG();
+				goto abort;
+			}
+		}
+		/*
+		 * is the device unique?
+		 */
+		ITERATE_RDEV(mddev,rdev2,tmp2) {
+			if ((rdev2 != rdev) &&
+					(rdev2->dev == rdev->dev)) {
+				MD_BUG();
+				goto abort;
+			}
+		}
+	}
+
+	/*
+	 * Check if we can support this RAID array
+	 */
+	if (sb->major_version != MD_MAJOR_VERSION ||
+			sb->minor_version > MD_MINOR_VERSION) {
+
+		printk (OLD_VERSION, mdidx(mddev), sb->major_version,
+				sb->minor_version, sb->patch_version);
+		goto abort;
+	}
+
+	if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
+			(sb->level == 4) || (sb->level == 5)))
+		printk (NOT_CLEAN_IGNORE, mdidx(mddev));
+
+	return 0;
+abort:
+	return 1;
+}
+
+#undef INCONSISTENT
+#undef OUT_OF_DATE
+#undef OLD_VERSION
+#undef OLD_LEVEL
+
+static int device_size_calculation (mddev_t * mddev)
+{
+	int data_disks = 0, persistent;
+	unsigned int readahead;
+	mdp_super_t *sb = mddev->sb;
+	struct md_list_head *tmp;
+	mdk_rdev_t *rdev;
+
+	/*
+	 * Do device size calculation. Bail out if too small.
+	 * (we have to do this after having validated chunk_size,
+	 * because device size has to be modulo chunk_size)
+	 */
+	persistent = !mddev->sb->not_persistent;
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->faulty)
+			continue;
+		if (rdev->size) {
+			MD_BUG();
+			continue;
+		}
+		rdev->size = calc_dev_size(rdev->dev, mddev, persistent);
+		if (rdev->size < sb->chunk_size / 1024) {
+			printk (KERN_WARNING
+				"Dev %s smaller than chunk_size: %dk < %dk\n",
+				partition_name(rdev->dev),
+				rdev->size, sb->chunk_size / 1024);
+			return -EINVAL;
+		}
+	}
+
+	switch (sb->level) {
+		case -3:
+			data_disks = 1;
+			break;
+		case -2:
+			data_disks = 1;
+			break;
+		case -1:
+			zoned_raid_size(mddev);
+			data_disks = 1;
+			break;
+		case 0:
+			zoned_raid_size(mddev);
+			data_disks = sb->raid_disks;
+			break;
+		case 1:
+			data_disks = 1;
+			break;
+		case 4:
+		case 5:
+			data_disks = sb->raid_disks-1;
+			break;
+		default:
+			printk (UNKNOWN_LEVEL, mdidx(mddev), sb->level);
+			goto abort;
+	}
+	if (!md_size[mdidx(mddev)])
+		md_size[mdidx(mddev)] = sb->size * data_disks;
+
+	readahead = MD_READAHEAD;
+	if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5))
+		readahead = mddev->sb->chunk_size * 4 * data_disks;
+	if (readahead < data_disks * MAX_SECTORS*512*2)
+		readahead = data_disks * MAX_SECTORS*512*2;
+	else {
+		if (sb->level == -3)
+			readahead = 0;
+	}
+	md_maxreadahead[mdidx(mddev)] = readahead;
+
+	printk(KERN_INFO "md%d: max total readahead window set to %dk\n",
+		mdidx(mddev), readahead/1024);
+
+	printk(KERN_INFO
+		"md%d: %d data-disks, max readahead per data-disk: %dk\n",
+			mdidx(mddev), data_disks, readahead/data_disks/1024);
+	return 0;
+abort:
+	return 1;
+}
+
+
+#define TOO_BIG_CHUNKSIZE KERN_ERR \
+"too big chunk_size: %d > %d\n"
+
+#define TOO_SMALL_CHUNKSIZE KERN_ERR \
+"too small chunk_size: %d < %ld\n"
+
+#define BAD_CHUNKSIZE KERN_ERR \
+"no chunksize specified, see 'man raidtab'\n"
+
+static int do_md_run (mddev_t * mddev)
+{
+	int pnum, err;
+	int chunk_size;
+	struct md_list_head *tmp;
+	mdk_rdev_t *rdev;
+
+
+	if (!mddev->nb_dev) {
+		MD_BUG();
+		return -EINVAL;
+	}
+
+	if (mddev->pers)
+		return -EBUSY;
+
+	/*
+	 * Resize disks to align partition sizes on a given
+	 * chunk size.
+	 */
+	md_size[mdidx(mddev)] = 0;
+
+	/*
+	 * Analyze all RAID superblock(s)
+	 */
+	if (analyze_sbs(mddev)) {
+		MD_BUG();
+		return -EINVAL;
+	}
+
+	chunk_size = mddev->sb->chunk_size;
+	pnum = level_to_pers(mddev->sb->level);
+
+	mddev->param.chunk_size = chunk_size;
+	mddev->param.personality = pnum;
+
+	if (chunk_size > MAX_CHUNK_SIZE) {
+		printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
+		return -EINVAL;
+	}
+	/*
+	 * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
+	 */
+	if ( (1 << ffz(~chunk_size)) != chunk_size) {
+		MD_BUG();
+		return -EINVAL;
+	}
+	if (chunk_size < PAGE_SIZE) {
+		printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
+		return -EINVAL;
+	}
+
+	if (pnum >= MAX_PERSONALITY) {
+		MD_BUG();
+		return -EINVAL;
+	}
+
+	if ((pnum != RAID1) && (pnum != LINEAR) && !chunk_size) {
+		/*
+		 * 'default chunksize' in the old md code used to
+		 * be PAGE_SIZE, baaad.
+		 * we abort here to be on the safe side. We don't
+		 * want to continue the bad practice.
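+		 *
+		 * (To recap the chunk_size rules enforced above: it has
+		 *  to be a power of two, at least PAGE_SIZE and at most
+		 *  MAX_CHUNK_SIZE. The ffz()-based test works because
+		 *  ffz(~x) is the index of the lowest set bit of x; a
+		 *  portable sketch of the same power-of-two check is:
+		 *
+		 *	valid = chunk_size &&
+		 *		!(chunk_size & (chunk_size - 1));
+		 *  )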
+ */ + printk(BAD_CHUNKSIZE); + return -EINVAL; + } + + if (!pers[pnum]) + { +#ifdef CONFIG_KMOD + char module_name[80]; + sprintf (module_name, "md-personality-%d", pnum); + request_module (module_name); + if (!pers[pnum]) +#endif + return -EINVAL; + } + + if (device_size_calculation(mddev)) + return -EINVAL; + + /* + * Drop all container device buffers, from now on + * the only valid external interface is through the md + * device. + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + fsync_dev(rdev->dev); + invalidate_buffers(rdev->dev); + } + + mddev->pers = pers[pnum]; + + err = mddev->pers->run(mddev); + if (err) { + printk("pers->run() failed ...\n"); + mddev->pers = NULL; + return -EINVAL; + } + + mddev->sb->state &= ~(1 << MD_SB_CLEAN); + md_update_sb(mddev); + + /* + * md_size has units of 1K blocks, which are + * twice as large as sectors. + */ + md_hd_struct[mdidx(mddev)].start_sect = 0; + md_hd_struct[mdidx(mddev)].nr_sects = md_size[mdidx(mddev)] << 1; + + read_ahead[MD_MAJOR] = 1024; + return (0); +} + +#undef TOO_BIG_CHUNKSIZE +#undef BAD_CHUNKSIZE + +#define OUT(x) do { err = (x); goto out; } while (0) + +static int restart_array (mddev_t *mddev) +{ + int err = 0; + + /* + * Complain if it has no devices + */ + if (!mddev->nb_dev) + OUT(-ENXIO); + + if (mddev->pers) { + if (!mddev->ro) + OUT(-EBUSY); + + mddev->ro = 0; + set_device_ro(mddev_to_kdev(mddev), 0); + + printk (KERN_INFO + "md%d switched to read-write mode.\n", mdidx(mddev)); + /* + * Kick recovery or resync if necessary + */ + md_recover_arrays(); + if (mddev->pers->restart_resync) + mddev->pers->restart_resync(mddev); + } else + err = -EINVAL; + +out: + return err; +} + +#define STILL_MOUNTED KERN_WARNING \ +"md: md%d still mounted.\n" + +static int do_md_stop (mddev_t * mddev, int ro) +{ + int err = 0, resync_interrupted = 0; + kdev_t dev = mddev_to_kdev(mddev); + + if (!ro && !fs_may_mount (dev)) { + printk (STILL_MOUNTED, mdidx(mddev)); + OUT(-EBUSY); + } + + /* + * complain if it's already stopped + */ + if (!mddev->nb_dev) + OUT(-ENXIO); + + if (mddev->pers) { + /* + * It is safe to call stop here, it only frees private + * data. Also, it tells us if a device is unstoppable + * (eg. resyncing is in progress) + */ + if (mddev->pers->stop_resync) + if (mddev->pers->stop_resync(mddev)) + resync_interrupted = 1; + + if (mddev->recovery_running) + md_interrupt_thread(md_recovery_thread); + + /* + * This synchronizes with signal delivery to the + * resync or reconstruction thread. It also nicely + * hangs the process if some reconstruction has not + * finished. + */ + down(&mddev->recovery_sem); + up(&mddev->recovery_sem); + + /* + * sync and invalidate buffers because we cannot kill the + * main thread with valid IO transfers still around. + * the kernel lock protects us from new requests being + * added after invalidate_buffers(). + */ + fsync_dev (mddev_to_kdev(mddev)); + fsync_dev (dev); + invalidate_buffers (dev); + + if (ro) { + if (mddev->ro) + OUT(-ENXIO); + mddev->ro = 1; + } else { + if (mddev->ro) + set_device_ro(dev, 0); + if (mddev->pers->stop(mddev)) { + if (mddev->ro) + set_device_ro(dev, 1); + OUT(-EBUSY); + } + if (mddev->ro) + mddev->ro = 0; + } + if (mddev->sb) { + /* + * mark it clean only if there was no resync + * interrupted. 
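+			 *
+			 * (The clean bit matters on the next start:
+			 *  analyze_sbs() schedules background
+			 *  reconstruction for RAID1/4/5 whenever the
+			 *  chosen superblock lacks it:
+			 *
+			 *	if ((sb->state != (1 << MD_SB_CLEAN)) && ...)
+			 *		printk(NOT_CLEAN_IGNORE, mdidx(mddev));
+			 *  )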
+ */ + if (!mddev->recovery_running && !resync_interrupted) { + printk("marking sb clean...\n"); + mddev->sb->state |= 1 << MD_SB_CLEAN; + } + md_update_sb(mddev); + } + if (ro) + set_device_ro(dev, 1); + } + + /* + * Free resources if final stop + */ + if (!ro) { + export_array(mddev); + md_size[mdidx(mddev)] = 0; + md_hd_struct[mdidx(mddev)].nr_sects = 0; + free_mddev(mddev); + + printk (KERN_INFO "md%d stopped.\n", mdidx(mddev)); + } else + printk (KERN_INFO + "md%d switched to read-only mode.\n", mdidx(mddev)); +out: + return err; +} + +#undef OUT + +/* + * We have to safely support old arrays too. + */ +int detect_old_array (mdp_super_t *sb) +{ + if (sb->major_version > 0) + return 0; + if (sb->minor_version >= 90) + return 0; + + return -EINVAL; +} + + +static void autorun_array (mddev_t *mddev) +{ + mdk_rdev_t *rdev; + struct md_list_head *tmp; + int err; + + if (mddev->disks.prev == &mddev->disks) { + MD_BUG(); + return; + } + + printk("running: "); + + ITERATE_RDEV(mddev,rdev,tmp) { + printk("<%s>", partition_name(rdev->dev)); + } + printk("\nnow!\n"); + + err = do_md_run (mddev); + if (err) { + printk("do_md_run() returned %d\n", err); + /* + * prevent the writeback of an unrunnable array + */ + mddev->sb_dirty = 0; + do_md_stop (mddev, 0); + } +} + +/* + * lets try to run arrays based on all disks that have arrived + * until now. (those are in the ->pending list) + * + * the method: pick the first pending disk, collect all disks with + * the same UUID, remove all from the pending list and put them into + * the 'same_array' list. Then order this list based on superblock + * update time (freshest comes first), kick out 'old' disks and + * compare superblocks. If everything's fine then run it. + */ +static void autorun_devices (void) +{ + struct md_list_head candidates; + struct md_list_head *tmp; + mdk_rdev_t *rdev0, *rdev; + mddev_t *mddev; + kdev_t md_kdev; + + + printk("autorun ...\n"); + while (pending_raid_disks.next != &pending_raid_disks) { + rdev0 = md_list_entry(pending_raid_disks.next, + mdk_rdev_t, pending); + + printk("considering %s ...\n", partition_name(rdev0->dev)); + MD_INIT_LIST_HEAD(&candidates); + ITERATE_RDEV_PENDING(rdev,tmp) { + if (uuid_equal(rdev0, rdev)) { + if (!sb_equal(rdev0->sb, rdev->sb)) { + printk("%s has same UUID as %s, but superblocks differ ...\n", partition_name(rdev->dev), partition_name(rdev0->dev)); + continue; + } + printk(" adding %s ...\n", partition_name(rdev->dev)); + md_list_del(&rdev->pending); + md_list_add(&rdev->pending, &candidates); + } + } /* - * Check the superblock for consistency. - * The personality itself has to check whether it's getting - * added with the proper flags. The personality has to be - * checked too. ;) + * now we have a set of devices, with all of them having + * mostly sane superblocks. It's time to allocate the + * mddev. */ - if (analyze_one_sb (realdev)) + md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor); + mddev = kdev_to_mddev(md_kdev); + if (mddev) { + printk("md%d already running, cannot run %s\n", + mdidx(mddev), partition_name(rdev0->dev)); + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) + export_rdev(rdev); + continue; + } + mddev = alloc_mddev(md_kdev); + printk("created md%d\n", mdidx(mddev)); + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) { + bind_rdev_to_array(rdev, mddev); + md_list_del(&rdev->pending); + MD_INIT_LIST_HEAD(&rdev->pending); + } + autorun_array(mddev); + } + printk("... 
autorun DONE.\n"); +} + +/* + * import RAID devices based on one partition + * if possible, the array gets run as well. + */ + +#define BAD_VERSION KERN_ERR \ +"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n" + +#define OUT_OF_MEM KERN_ALERT \ +"md: out of memory.\n" + +#define NO_DEVICE KERN_ERR \ +"md: disabled device %s\n" + +#define AUTOADD_FAILED KERN_ERR \ +"md: auto-adding devices to md%d FAILED (error %d).\n" + +#define AUTOADD_FAILED_USED KERN_ERR \ +"md: cannot auto-add device %s to md%d, already used.\n" + +#define AUTORUN_FAILED KERN_ERR \ +"md: auto-running md%d FAILED (error %d).\n" + +#define MDDEV_BUSY KERN_ERR \ +"md: cannot auto-add to md%d, already running.\n" + +#define AUTOADDING KERN_INFO \ +"md: auto-adding devices to md%d, based on %s's superblock.\n" + +#define AUTORUNNING KERN_INFO \ +"md: auto-running md%d.\n" + +static int autostart_array (kdev_t startdev) +{ + int err = -EINVAL, i; + mdp_super_t *sb = NULL; + mdk_rdev_t *start_rdev = NULL, *rdev; + + if (md_import_device(startdev, 1)) { + printk("could not import %s!\n", partition_name(startdev)); + goto abort; + } + + start_rdev = find_rdev_all(startdev); + if (!start_rdev) { + MD_BUG(); + goto abort; + } + if (start_rdev->faulty) { + printk("can not autostart based on faulty %s!\n", + partition_name(startdev)); + goto abort; + } + md_list_add(&start_rdev->pending, &pending_raid_disks); + + sb = start_rdev->sb; + + err = detect_old_array(sb); + if (err) { + printk("array version is too old to be autostarted, use raidtools 0.90 mkraid --upgrade\nto upgrade the array without data loss!\n"); + goto abort; + } + + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + if (dev == MKDEV(0,0)) + continue; + if (dev == startdev) + continue; + if (md_import_device(dev, 1)) { + printk("could not import %s, trying to run array nevertheless.\n", partition_name(dev)); + continue; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + goto abort; + } + md_list_add(&rdev->pending, &pending_raid_disks); + } + + /* + * possibly return codes + */ + autorun_devices(); + return 0; + +abort: + if (start_rdev) + export_rdev(start_rdev); + return err; +} + +#undef BAD_VERSION +#undef OUT_OF_MEM +#undef NO_DEVICE +#undef AUTOADD_FAILED_USED +#undef AUTOADD_FAILED +#undef AUTORUN_FAILED +#undef AUTOADDING +#undef AUTORUNNING + +struct { + int set; + int noautodetect; + +} raid_setup_args md__initdata = { 0, 0 }; + +/* + * Searches all registered partitions for autorun RAID arrays + * at boot time. + */ +md__initfunc(void autodetect_raid(void)) +{ +#ifdef CONFIG_AUTODETECT_RAID + struct gendisk *disk; + mdk_rdev_t *rdev; + int i; + + if (raid_setup_args.noautodetect) { + printk(KERN_INFO "skipping autodetection of RAID arrays\n"); + return; + } + printk(KERN_INFO "autodetecting RAID arrays\n"); + + for (disk = gendisk_head ; disk ; disk = disk->next) { + for (i = 0; i < disk->max_p*disk->max_nr; i++) { + kdev_t dev = MKDEV(disk->major,i); + + if (disk->part[i].type == LINUX_OLD_RAID_PARTITION) { + printk(KERN_ALERT +"md: %s's partition type has to be changed from type 0x86 to type 0xfd\n" +" to maintain interoperability with other OSs! Autodetection support for\n" +" type 0x86 will be deleted after some migration timeout. 
Sorry.\n", + partition_name(dev)); + disk->part[i].type = LINUX_RAID_PARTITION; + } + if (disk->part[i].type != LINUX_RAID_PARTITION) + continue; + + if (md_import_device(dev,1)) { + printk(KERN_ALERT "could not import %s!\n", + partition_name(dev)); + continue; + } + /* + * Sanity checks: + */ + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + continue; + } + if (rdev->faulty) { + MD_BUG(); + continue; + } + md_list_add(&rdev->pending, &pending_raid_disks); + } + } + + autorun_devices(); +#endif +} + +static int get_version (void * arg) +{ + mdu_version_t ver; + + ver.major = MD_MAJOR_VERSION; + ver.minor = MD_MINOR_VERSION; + ver.patchlevel = MD_PATCHLEVEL_VERSION; + + if (md_copy_to_user(arg, &ver, sizeof(ver))) + return -EFAULT; + + return 0; +} + +#define SET_FROM_SB(x) info.x = mddev->sb->x +static int get_array_info (mddev_t * mddev, void * arg) +{ + mdu_array_info_t info; + + if (!mddev->sb) + return -EINVAL; + + SET_FROM_SB(major_version); + SET_FROM_SB(minor_version); + SET_FROM_SB(patch_version); + SET_FROM_SB(ctime); + SET_FROM_SB(level); + SET_FROM_SB(size); + SET_FROM_SB(nr_disks); + SET_FROM_SB(raid_disks); + SET_FROM_SB(md_minor); + SET_FROM_SB(not_persistent); + + SET_FROM_SB(utime); + SET_FROM_SB(state); + SET_FROM_SB(active_disks); + SET_FROM_SB(working_disks); + SET_FROM_SB(failed_disks); + SET_FROM_SB(spare_disks); + + SET_FROM_SB(layout); + SET_FROM_SB(chunk_size); + + if (md_copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + + return 0; +} +#undef SET_FROM_SB + +#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x +static int get_disk_info (mddev_t * mddev, void * arg) +{ + mdu_disk_info_t info; + unsigned int nr; + + if (!mddev->sb) + return -EINVAL; + + if (md_copy_from_user(&info, arg, sizeof(info))) + return -EFAULT; + + nr = info.number; + if (nr >= mddev->sb->nr_disks) + return -EINVAL; + + SET_FROM_SB(major); + SET_FROM_SB(minor); + SET_FROM_SB(raid_disk); + SET_FROM_SB(state); + + if (md_copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + + return 0; +} +#undef SET_FROM_SB + +#define SET_SB(x) mddev->sb->disks[nr].x = info.x + +static int add_new_disk (mddev_t * mddev, void * arg) +{ + int err, size, persistent; + mdu_disk_info_t info; + mdk_rdev_t *rdev; + unsigned int nr; + kdev_t dev; + + if (!mddev->sb) + return -EINVAL; + + if (md_copy_from_user(&info, arg, sizeof(info))) + return -EFAULT; + + nr = info.number; + if (nr >= mddev->sb->nr_disks) + return -EINVAL; + + dev = MKDEV(info.major,info.minor); + + if (find_rdev_all(dev)) { + printk("device %s already used in a RAID array!\n", + partition_name(dev)); + return -EBUSY; + } + + SET_SB(number); + SET_SB(major); + SET_SB(minor); + SET_SB(raid_disk); + SET_SB(state); + + if ((info.state & (1<old_dev = dev; + rdev->desc_nr = info.number; + + bind_rdev_to_array(rdev, mddev); + + persistent = !mddev->sb->not_persistent; + if (!persistent) + printk("nonpersistent superblock ...\n"); + if (!mddev->sb->chunk_size) + printk("no chunksize?\n"); + + size = calc_dev_size(dev, mddev, persistent); + rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent); + + if (!mddev->sb->size || (mddev->sb->size > size)) + mddev->sb->size = size; + } + + /* + * sync all other superblocks with the main superblock + */ + sync_sbs(mddev); + + return 0; +} +#undef SET_SB + +static int hot_remove_disk (mddev_t * mddev, kdev_t dev) +{ + int err; + mdk_rdev_t *rdev; + mdp_disk_t *disk; + + if (!mddev->pers) + return -ENODEV; + + printk("trying to remove %s from md%d ... 
\n", + partition_name(dev), mdidx(mddev)); + + if (!mddev->pers->diskop) { + printk("md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + + rdev = find_rdev(mddev, dev); + if (!rdev) + return -ENXIO; + + if (rdev->desc_nr == -1) { + MD_BUG(); + return -EINVAL; + } + disk = &mddev->sb->disks[rdev->desc_nr]; + if (disk_active(disk)) + goto busy; + if (disk_removed(disk)) { + MD_BUG(); + return -EINVAL; + } + + err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK); + if (err == -EBUSY) + goto busy; + if (err) { + MD_BUG(); + return -EINVAL; + } + + remove_descriptor(disk, mddev->sb); + kick_rdev_from_array(rdev); + mddev->sb_dirty = 1; + md_update_sb(mddev); + + return 0; +busy: + printk("cannot remove active disk %s from md%d ... \n", + partition_name(dev), mdidx(mddev)); + return -EBUSY; +} + +static int hot_add_disk (mddev_t * mddev, kdev_t dev) +{ + int i, err, persistent; + unsigned int size; + mdk_rdev_t *rdev; + mdp_disk_t *disk; + + if (!mddev->pers) + return -ENODEV; + + printk("trying to hot-add %s to md%d ... \n", + partition_name(dev), mdidx(mddev)); + + if (!mddev->pers->diskop) { + printk("md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + + persistent = !mddev->sb->not_persistent; + size = calc_dev_size(dev, mddev, persistent); + + if (size < mddev->sb->size) { + printk("md%d: disk size %d blocks < array size %d\n", + mdidx(mddev), size, mddev->sb->size); + return -ENOSPC; + } + + rdev = find_rdev(mddev, dev); + if (rdev) + return -EBUSY; + + err = md_import_device (dev, 0); + if (err) { + printk("md: error, md_import_device() returned %d\n", err); + return -EINVAL; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + return -EINVAL; + } + if (rdev->faulty) { + printk("md: can not hot-add faulty %s disk to md%d!\n", + partition_name(dev), mdidx(mddev)); + err = -EINVAL; + goto abort_export; + } + bind_rdev_to_array(rdev, mddev); + + /* + * The rest should better be atomic, we can have disk failures + * noticed in interrupt contexts ... 
+ */ + cli(); + rdev->old_dev = dev; + rdev->size = size; + rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent); + + disk = mddev->sb->disks + mddev->sb->raid_disks; + for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) { + disk = mddev->sb->disks + i; + + if (!disk->major && !disk->minor) + break; + if (disk_removed(disk)) + break; + } + if (i == MD_SB_DISKS) { + sti(); + printk("md%d: can not hot-add to full array!\n", mdidx(mddev)); + err = -EBUSY; + goto abort_unbind_export; + } + + if (disk_removed(disk)) { /* - * hot_add has to bump up nb_dev itself + * reuse slot */ - if (md_dev[minor].pers->hot_add_disk (&md_dev[minor], dev)) { - /* - * FIXME: here we should free up the inode and stuff - */ - printk ("FIXME\n"); - return -EINVAL; + if (disk->number != i) { + sti(); + MD_BUG(); + err = -EINVAL; + goto abort_unbind_export; } - } else - md_dev[minor].nb_dev++; + } else { + disk->number = i; + } - printk ("REGISTER_DEV %s to md%x done\n", partition_name(dev), minor); - return (0); + disk->raid_disk = disk->number; + disk->major = MAJOR(dev); + disk->minor = MINOR(dev); + + if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) { + sti(); + MD_BUG(); + err = -EINVAL; + goto abort_unbind_export; + } + + mark_disk_spare(disk); + mddev->sb->nr_disks++; + mddev->sb->spare_disks++; + mddev->sb->working_disks++; + + mddev->sb_dirty = 1; + + sti(); + md_update_sb(mddev); + + /* + * Kick recovery, maybe this spare has to be added to the + * array immediately. + */ + md_recover_arrays(); + + return 0; + +abort_unbind_export: + unbind_rdev_from_array(rdev); + +abort_export: + export_rdev(rdev); + return err; +} + +#define SET_SB(x) mddev->sb->x = info.x +static int set_array_info (mddev_t * mddev, void * arg) +{ + mdu_array_info_t info; + + if (mddev->sb) { + printk("array md%d already has a superblock!\n", + mdidx(mddev)); + return -EBUSY; + } + + if (md_copy_from_user(&info, arg, sizeof(info))) + return -EFAULT; + + if (alloc_array_sb(mddev)) + return -ENOMEM; + + mddev->sb->major_version = MD_MAJOR_VERSION; + mddev->sb->minor_version = MD_MINOR_VERSION; + mddev->sb->patch_version = MD_PATCHLEVEL_VERSION; + mddev->sb->ctime = CURRENT_TIME; + + SET_SB(level); + SET_SB(size); + SET_SB(nr_disks); + SET_SB(raid_disks); + SET_SB(md_minor); + SET_SB(not_persistent); + + SET_SB(state); + SET_SB(active_disks); + SET_SB(working_disks); + SET_SB(failed_disks); + SET_SB(spare_disks); + + SET_SB(layout); + SET_SB(chunk_size); + + mddev->sb->md_magic = MD_SB_MAGIC; + + /* + * Generate a 128 bit UUID + */ + get_random_bytes(&mddev->sb->set_uuid0, 4); + get_random_bytes(&mddev->sb->set_uuid1, 4); + get_random_bytes(&mddev->sb->set_uuid2, 4); + get_random_bytes(&mddev->sb->set_uuid3, 4); + + return 0; +} +#undef SET_SB + +static int set_disk_info (mddev_t * mddev, void * arg) +{ + printk("not yet"); + return -EINVAL; +} + +static int clear_array (mddev_t * mddev) +{ + printk("not yet"); + return -EINVAL; +} + +static int write_raid_info (mddev_t * mddev) +{ + printk("not yet"); + return -EINVAL; +} + +static int protect_array (mddev_t * mddev) +{ + printk("not yet"); + return -EINVAL; +} + +static int unprotect_array (mddev_t * mddev) +{ + printk("not yet"); + return -EINVAL; +} + +static int set_disk_faulty (mddev_t *mddev, kdev_t dev) +{ + int ret; + + fsync_dev(mddev_to_kdev(mddev)); + ret = md_error(mddev, dev); + return ret; } static int md_ioctl (struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { - int minor, err; - struct hd_geometry *loc = (struct hd_geometry 
*) arg; + unsigned int minor; + int err = 0; + struct hd_geometry *loc = (struct hd_geometry *) arg; + mddev_t *mddev = NULL; + kdev_t dev; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + if (!md_capable_admin()) + return -EACCES; - if (((minor=MINOR(inode->i_rdev)) & 0x80) && - (minor & 0x7f) < MAX_PERSONALITY && - pers[minor & 0x7f] && - pers[minor & 0x7f]->ioctl) - return (pers[minor & 0x7f]->ioctl (inode, file, cmd, arg)); - - if (minor >= MAX_MD_DEV) - return -EINVAL; + dev = inode->i_rdev; + minor = MINOR(dev); + if (minor >= MAX_MD_DEVS) + return -EINVAL; - switch (cmd) - { - case REGISTER_DEV: - return do_md_add (minor, to_kdev_t ((dev_t) arg)); + /* + * Commands dealing with the RAID driver but not any + * particular array: + */ + switch (cmd) + { + case RAID_VERSION: + err = get_version((void *)arg); + goto done; + + case PRINT_RAID_DEBUG: + err = 0; + md_print_devices(); + goto done_unlock; + + case BLKGETSIZE: /* Return device size */ + if (!arg) { + err = -EINVAL; + goto abort; + } + err = md_put_user(md_hd_struct[minor].nr_sects, + (long *) arg); + goto done; - case START_MD: - return do_md_run (minor, (int) arg); + case BLKFLSBUF: + fsync_dev(dev); + invalidate_buffers(dev); + goto done; - case STOP_MD: - return do_md_stop (minor, inode); - - case BLKGETSIZE: /* Return device size */ - if (!arg) return -EINVAL; - err = put_user (md_hd_struct[MINOR(inode->i_rdev)].nr_sects, (long *) arg); - if (err) - return err; - break; - - case BLKFLSBUF: - fsync_dev (inode->i_rdev); - invalidate_buffers (inode->i_rdev); - break; - - case BLKRASET: - if (arg > 0xff) - return -EINVAL; - read_ahead[MAJOR(inode->i_rdev)] = arg; - return 0; - - case BLKRAGET: - if (!arg) return -EINVAL; - err = put_user (read_ahead[MAJOR(inode->i_rdev)], (long *) arg); - if (err) - return err; - break; - - /* We have a problem here : there is no easy way to give a CHS - virtual geometry. We currently pretend that we have a 2 heads - 4 sectors (with a BIG number of cylinders...). This drives dosfs - just mad... ;-) */ - - case HDIO_GETGEO: - if (!loc) return -EINVAL; - err = put_user (2, (char *) &loc->heads); - if (err) - return err; - err = put_user (4, (char *) &loc->sectors); - if (err) - return err; - err = put_user (md_hd_struct[minor].nr_sects/8, (short *) &loc->cylinders); - if (err) - return err; - err = put_user (md_hd_struct[MINOR(inode->i_rdev)].start_sect, - (long *) &loc->start); - if (err) - return err; - break; - - RO_IOCTLS(inode->i_rdev,arg); + case BLKRASET: + if (arg > 0xff) { + err = -EINVAL; + goto abort; + } + read_ahead[MAJOR(dev)] = arg; + goto done; - default: - return -EINVAL; - } + case BLKRAGET: + if (!arg) { + err = -EINVAL; + goto abort; + } + err = md_put_user (read_ahead[ + MAJOR(dev)], (long *) arg); + goto done; + default: + } + + /* + * Commands creating/starting a new array: + */ + + mddev = kdev_to_mddev(dev); + + switch (cmd) + { + case SET_ARRAY_INFO: + case START_ARRAY: + if (mddev) { + printk("array md%d already exists!\n", + mdidx(mddev)); + err = -EEXIST; + goto abort; + } + default: + } + + switch (cmd) + { + case SET_ARRAY_INFO: + mddev = alloc_mddev(dev); + if (!mddev) { + err = -ENOMEM; + goto abort; + } + /* + * alloc_mddev() should possibly self-lock. + */ + err = lock_mddev(mddev); + if (err) { + printk("ioctl, reason %d, cmd %d\n", err, cmd); + goto abort; + } + err = set_array_info(mddev, (void *)arg); + if (err) { + printk("couldnt set array info. 
%d\n", err); + goto abort; + } + goto done_unlock; + + case START_ARRAY: + /* + * possibly make it lock the array ... + */ + err = autostart_array((kdev_t)arg); + if (err) { + printk("autostart %s failed!\n", + partition_name((kdev_t)arg)); + goto abort; + } + goto done; + + default: + } + + /* + * Commands querying/configuring an existing array: + */ + + if (!mddev) { + err = -ENODEV; + goto abort; + } + err = lock_mddev(mddev); + if (err) { + printk("ioctl lock interrupted, reason %d, cmd %d\n",err, cmd); + goto abort; + } + + /* + * Commands even a read-only array can execute: + */ + switch (cmd) + { + case GET_ARRAY_INFO: + err = get_array_info(mddev, (void *)arg); + goto done_unlock; + + case GET_DISK_INFO: + err = get_disk_info(mddev, (void *)arg); + goto done_unlock; + + case RESTART_ARRAY_RW: + err = restart_array(mddev); + goto done_unlock; + + case STOP_ARRAY: + err = do_md_stop (mddev, 0); + goto done_unlock; + + case STOP_ARRAY_RO: + err = do_md_stop (mddev, 1); + goto done_unlock; + + /* + * We have a problem here : there is no easy way to give a CHS + * virtual geometry. We currently pretend that we have a 2 heads + * 4 sectors (with a BIG number of cylinders...). This drives + * dosfs just mad... ;-) + */ + case HDIO_GETGEO: + if (!loc) { + err = -EINVAL; + goto abort_unlock; + } + err = md_put_user (2, (char *) &loc->heads); + if (err) + goto abort_unlock; + err = md_put_user (4, (char *) &loc->sectors); + if (err) + goto abort_unlock; + err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8, + (short *) &loc->cylinders); + if (err) + goto abort_unlock; + err = md_put_user (md_hd_struct[minor].start_sect, + (long *) &loc->start); + goto done_unlock; + } - return (0); + /* + * The remaining ioctls are changing the state of the + * superblock, so we do not allow read-only arrays + * here: + */ + if (mddev->ro) { + err = -EROFS; + goto abort_unlock; + } + + switch (cmd) + { + case CLEAR_ARRAY: + err = clear_array(mddev); + goto done_unlock; + + case ADD_NEW_DISK: + err = add_new_disk(mddev, (void *)arg); + goto done_unlock; + + case HOT_REMOVE_DISK: + err = hot_remove_disk(mddev, (kdev_t)arg); + goto done_unlock; + + case HOT_ADD_DISK: + err = hot_add_disk(mddev, (kdev_t)arg); + goto done_unlock; + + case SET_DISK_INFO: + err = set_disk_info(mddev, (void *)arg); + goto done_unlock; + + case WRITE_RAID_INFO: + err = write_raid_info(mddev); + goto done_unlock; + + case UNPROTECT_ARRAY: + err = unprotect_array(mddev); + goto done_unlock; + + case PROTECT_ARRAY: + err = protect_array(mddev); + goto done_unlock; + + case SET_DISK_FAULTY: + err = set_disk_faulty(mddev, (kdev_t)arg); + goto done_unlock; + + case RUN_ARRAY: + { + mdu_param_t param; + + err = md_copy_from_user(¶m, (mdu_param_t *)arg, + sizeof(param)); + if (err) + goto abort_unlock; + + err = do_md_run (mddev); + /* + * we have to clean up the mess if + * the array cannot be run for some + * reason ... 
+ */ + if (err) { + mddev->sb_dirty = 0; + do_md_stop (mddev, 0); + } + goto done_unlock; + } + + default: + printk(KERN_WARNING "%s(pid %d) used obsolete MD ioctl, upgrade your software to use new ictls.\n", current->comm, current->pid); + err = -EINVAL; + goto abort_unlock; + } + +done_unlock: +abort_unlock: + if (mddev) + unlock_mddev(mddev); + else + printk("huh11?\n"); + + return err; +done: + if (err) + printk("huh12?\n"); +abort: + return err; } + +#if LINUX_VERSION_CODE < LinuxVersionCode(2,1,0) + static int md_open (struct inode *inode, struct file *file) { - int minor=MINOR(inode->i_rdev); + /* + * Always succeed + */ + return (0); +} + +static void md_release (struct inode *inode, struct file *file) +{ + sync_dev(inode->i_rdev); +} + + +static int md_read (struct inode *inode, struct file *file, + char *buf, int count) +{ + mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev); - md_dev[minor].busy++; - return (0); /* Always succeed */ + if (!mddev || !mddev->pers) + return -ENXIO; + + return block_read (inode, file, buf, count); } +static int md_write (struct inode *inode, struct file *file, + const char *buf, int count) +{ + mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev); + + if (!mddev || !mddev->pers) + return -ENXIO; -static int md_release (struct inode *inode, struct file *file) + return block_write (inode, file, buf, count); +} + +static struct file_operations md_fops= { - int minor=MINOR(inode->i_rdev); + NULL, + md_read, + md_write, + NULL, + NULL, + md_ioctl, + NULL, + md_open, + md_release, + block_fsync +}; + +#else - sync_dev (inode->i_rdev); - md_dev[minor].busy--; - return 0; +static int md_open (struct inode *inode, struct file *file) +{ + /* + * Always succeed + */ + return (0); } +static int md_release (struct inode *inode, struct file *file) +{ + sync_dev(inode->i_rdev); + return 0; +} static ssize_t md_read (struct file *file, char *buf, size_t count, loff_t *ppos) { - int minor=MINOR(file->f_dentry->d_inode->i_rdev); + mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev); - if (!md_dev[minor].pers) /* Check if device is being run */ - return -ENXIO; + if (!mddev || !mddev->pers) + return -ENXIO; - return block_read(file, buf, count, ppos); + return block_read(file, buf, count, ppos); } static ssize_t md_write (struct file *file, const char *buf, size_t count, loff_t *ppos) { - int minor=MINOR(file->f_dentry->d_inode->i_rdev); + mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev); - if (!md_dev[minor].pers) /* Check if device is being run */ - return -ENXIO; + if (!mddev || !mddev->pers) + return -ENXIO; - return block_write(file, buf, count, ppos); + return block_write(file, buf, count, ppos); } static struct file_operations md_fops= { - NULL, - md_read, - md_write, - NULL, - NULL, - md_ioctl, - NULL, - md_open, - NULL, - md_release, - block_fsync + NULL, + md_read, + md_write, + NULL, + NULL, + md_ioctl, + NULL, + md_open, + NULL, + md_release, + block_fsync }; -int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size) +#endif + +int md_map (kdev_t dev, kdev_t *rdev, unsigned long *rsector, unsigned long size, int rw) { - if ((unsigned int) minor >= MAX_MD_DEV) - { - printk ("Bad md device %d\n", minor); - return (-1); - } - - if (!md_dev[minor].pers) - { - printk ("Oops ! 
md%d not running, giving up !\n", minor); - return (-1); - } + int err; + mddev_t *mddev = kdev_to_mddev(dev); - return (md_dev[minor].pers->map(md_dev+minor, rdev, rsector, size)); + if (!mddev || !mddev->pers) { + err = -ENXIO; + goto out; + } + + err = mddev->pers->map(mddev, dev, rdev, rsector, size); +out: + return err; } -int md_make_request (int minor, int rw, struct buffer_head * bh) +int md_make_request (struct buffer_head * bh, int rw) { - if (md_dev [minor].pers->make_request) { - if (buffer_locked(bh)) - return 0; + int err; + mddev_t *mddev = kdev_to_mddev(bh->b_rdev); + + if (!mddev || !mddev->pers) { + err = -ENXIO; + goto out; + } + + if (mddev->pers->make_request) { + if (buffer_locked(bh)) { + err = 0; + goto out; + } set_bit(BH_Lock, &bh->b_state); if (rw == WRITE || rw == WRITEA) { if (!buffer_dirty(bh)) { - bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state)); - return 0; + bh->b_end_io(bh, buffer_uptodate(bh)); + err = 0; + goto out; } } if (rw == READ || rw == READA) { if (buffer_uptodate(bh)) { - bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state)); - return 0; + bh->b_end_io(bh, buffer_uptodate(bh)); + err = 0; + goto out; } } - return (md_dev[minor].pers->make_request(md_dev+minor, rw, bh)); + err = mddev->pers->make_request(mddev, rw, bh); } else { make_request (MAJOR(bh->b_rdev), rw, bh); - return 0; + err = 0; } +out: + return err; } static void do_md_request (void) { - printk ("Got md request, not good..."); - return; + printk(KERN_ALERT "Got md request, not good..."); + return; +} + +int md_thread(void * arg) +{ + mdk_thread_t *thread = arg; + + md_lock_kernel(); + exit_mm(current); + exit_files(current); + exit_fs(current); + + /* + * Detach thread + */ + sys_setsid(); + sprintf(current->comm, thread->name); + md_init_signals(); + md_flush_signals(); + thread->tsk = current; + + /* + * md_thread is a 'system-thread', it's priority should be very + * high. We avoid resource deadlocks individually in each + * raid personality. (RAID5 does preallocation) We also use RR and + * the very same RT priority as kswapd, thus we will never get + * into a priority inversion deadlock. + * + * we definitely have to have equal or higher priority than + * bdflush, otherwise bdflush will deadlock if there are too + * many dirty RAID5 blocks. 
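+	 *
+	 * (Typical use of this thread machinery by a personality,
+	 *  sketched after the raid1/raid5 pattern; the names raid1d
+	 *  and conf here are illustrative only:
+	 *
+	 *	mdk_thread_t *t;
+	 *
+	 *	t = md_register_thread(raid1d, conf, "raid1d");
+	 *	...
+	 *	md_wakeup_thread(t);
+	 *	...
+	 *	md_unregister_thread(t);
+	 *  )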
+ */ + current->policy = SCHED_OTHER; + current->priority = 40; + + up(thread->sem); + + for (;;) { + cli(); + if (!test_bit(THREAD_WAKEUP, &thread->flags)) { + if (!thread->run) + break; + interruptible_sleep_on(&thread->wqueue); + } + sti(); + clear_bit(THREAD_WAKEUP, &thread->flags); + if (thread->run) { + thread->run(thread->data); + run_task_queue(&tq_disk); + } + if (md_signal_pending(current)) { + printk("%8s(%d) flushing signals.\n", current->comm, + current->pid); + md_flush_signals(); + } + } + sti(); + up(thread->sem); + return 0; } -void md_wakeup_thread(struct md_thread *thread) +void md_wakeup_thread(mdk_thread_t *thread) { set_bit(THREAD_WAKEUP, &thread->flags); wake_up(&thread->wqueue); } -struct md_thread *md_register_thread (void (*run) (void *), void *data) +mdk_thread_t *md_register_thread (void (*run) (void *), + void *data, const char *name) { - struct md_thread *thread = (struct md_thread *) - kmalloc(sizeof(struct md_thread), GFP_KERNEL); + mdk_thread_t *thread; int ret; struct semaphore sem = MUTEX_LOCKED; - if (!thread) return NULL; + thread = (mdk_thread_t *) kmalloc + (sizeof(mdk_thread_t), GFP_KERNEL); + if (!thread) + return NULL; - memset(thread, 0, sizeof(struct md_thread)); + memset(thread, 0, sizeof(mdk_thread_t)); init_waitqueue(&thread->wqueue); thread->sem = &sem; thread->run = run; thread->data = data; + thread->name = name; ret = kernel_thread(md_thread, thread, 0); if (ret < 0) { kfree(thread); @@ -836,270 +3031,406 @@ return thread; } -void md_unregister_thread (struct md_thread *thread) +void md_interrupt_thread (mdk_thread_t *thread) +{ + if (!thread->tsk) { + MD_BUG(); + return; + } + printk("interrupting MD-thread pid %d\n", thread->tsk->pid); + send_sig(SIGKILL, thread->tsk, 1); +} + +void md_unregister_thread (mdk_thread_t *thread) { struct semaphore sem = MUTEX_LOCKED; thread->sem = &sem; thread->run = NULL; - if (thread->tsk) - printk("Killing md_thread %d %p %s\n", - thread->tsk->pid, thread->tsk, thread->tsk->comm); - else - printk("Aiee. 
md_thread has 0 tsk\n"); - send_sig(SIGKILL, thread->tsk, 1); - printk("downing on %p\n", &sem); + thread->name = NULL; + if (!thread->tsk) { + MD_BUG(); + return; + } + md_interrupt_thread(thread); down(&sem); } -#define SHUTDOWN_SIGS (sigmask(SIGKILL)|sigmask(SIGINT)|sigmask(SIGTERM)) - -int md_thread(void * arg) +void md_recover_arrays (void) { - struct md_thread *thread = arg; - - lock_kernel(); - exit_mm(current); - exit_files(current); - exit_fs(current); - - current->session = 1; - current->pgrp = 1; - sprintf(current->comm, "md_thread"); - siginitsetinv(¤t->blocked, SHUTDOWN_SIGS); - thread->tsk = current; - up(thread->sem); - - for (;;) { - cli(); - if (!test_bit(THREAD_WAKEUP, &thread->flags)) { - do { - spin_lock(¤t->sigmask_lock); - flush_signals(current); - spin_unlock(¤t->sigmask_lock); - interruptible_sleep_on(&thread->wqueue); - cli(); - if (test_bit(THREAD_WAKEUP, &thread->flags)) - break; - if (!thread->run) { - sti(); - up(thread->sem); - return 0; - } - } while (signal_pending(current)); - } - sti(); - clear_bit(THREAD_WAKEUP, &thread->flags); - if (thread->run) { - thread->run(thread->data); - run_task_queue(&tq_disk); - } + if (!md_recovery_thread) { + MD_BUG(); + return; } + md_wakeup_thread(md_recovery_thread); } -EXPORT_SYMBOL(md_size); -EXPORT_SYMBOL(md_maxreadahead); -EXPORT_SYMBOL(register_md_personality); -EXPORT_SYMBOL(unregister_md_personality); -EXPORT_SYMBOL(partition_name); -EXPORT_SYMBOL(md_dev); -EXPORT_SYMBOL(md_error); -EXPORT_SYMBOL(md_register_thread); -EXPORT_SYMBOL(md_unregister_thread); -EXPORT_SYMBOL(md_update_sb); -EXPORT_SYMBOL(md_map); -EXPORT_SYMBOL(md_wakeup_thread); -EXPORT_SYMBOL(md_do_sync); -#ifdef CONFIG_PROC_FS -static struct proc_dir_entry proc_md = { - PROC_MD, 6, "mdstat", - S_IFREG | S_IRUGO, 1, 0, 0, - 0, &proc_array_inode_operations, -}; +int md_error (mddev_t *mddev, kdev_t rdev) +{ + mdk_rdev_t * rrdev; + int rc; + + if (!mddev) { + MD_BUG(); + return 0; + } + rrdev = find_rdev(mddev, rdev); + mark_rdev_faulty(rrdev); + /* + * if recovery was running, stop it now. + */ + if (mddev->pers->stop_resync) + mddev->pers->stop_resync(mddev); + if (mddev->recovery_running) + md_interrupt_thread(md_recovery_thread); + if (mddev->pers->error_handler) { + rc = mddev->pers->error_handler(mddev, rdev); + md_recover_arrays(); + return rc; + } +#if 0 + /* + * Drop all buffers in the failed array. + * _not_. This is called from IRQ handlers ... + */ + invalidate_buffers(rdev); #endif + return 0; +} -static void md_geninit (struct gendisk *gdisk) +static int status_unused (char * page) { - int i; - - for(i=0;isame_set.next && !rdev->same_set.prev) { + /* + * The device is not yet used by any array. 
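+			 * These are the devices that show up on the
+			 * "unused devices:" line of /proc/mdstat, e.g.
+			 * (output sketch):
+			 *
+			 *	unused devices: sdc1 sdd1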
+			 */
+			i++;
+			sz += sprintf(page + sz, "%s ",
+				partition_name(rdev->dev));
+		}
+	}
+	if (!i)
+		sz += sprintf(page + sz, "<none>");
+
+	sz += sprintf(page + sz, "\n");
+	return sz;
+}
 
-int md_error (kdev_t mddev, kdev_t rdev)
+
+static int status_resync (char * page, mddev_t * mddev)
 {
-	unsigned int minor = MINOR (mddev);
-	int rc;
+	int sz = 0;
+	unsigned int blocksize, max_blocks, resync, res, dt, tt, et;
 
-	if (MAJOR(mddev) != MD_MAJOR || minor > MAX_MD_DEV)
-		panic ("md_error gets unknown device\n");
-	if (!md_dev [minor].pers)
-		panic ("md_error gets an error for an unknown device\n");
-	if (md_dev [minor].pers->error_handler) {
-		rc = md_dev [minor].pers->error_handler (md_dev+minor, rdev);
-#if SUPPORT_RECONSTRUCTION
-		md_wakeup_thread(md_sync_thread);
-#endif /* SUPPORT_RECONSTRUCTION */
-		return rc;
-	}
-	return 0;
+	resync = mddev->curr_resync;
+	blocksize = blksize_size[MD_MAJOR][mdidx(mddev)];
+	max_blocks = blk_size[MD_MAJOR][mdidx(mddev)] / (blocksize >> 10);
+
+	/*
+	 * Should not happen.
+	 */
+	if (!max_blocks) {
+		MD_BUG();
+		return 0;
+	}
+	res = resync*100/max_blocks;
+	if (!mddev->recovery_running)
+		/*
+		 * true resync
+		 */
+		sz += sprintf(page + sz, " resync=%u%%", res);
+	else
+		/*
+		 * recovery ...
+		 */
+		sz += sprintf(page + sz, " recovery=%u%%", res);
+
+	/*
+	 * We do not want to overflow, so the order of operands and
+	 * the * 100 / 100 trick are important. We do a +1 to be
+	 * safe against division by zero. We only estimate anyway.
+	 *
+	 * dt: time until now
+	 * tt: total time
+	 * et: estimated finish time
+	 */
+	dt = ((jiffies - mddev->resync_start) / HZ);
+	tt = (dt * (max_blocks / (resync/100+1)))/100;
+	if (tt > dt)
+		et = tt - dt;
+	else
+		/*
+		 * ignore rounding effects near finish time
+		 */
+		et = 0;
+
+	sz += sprintf(page + sz, " finish=%u.%umin", et / 60, (et % 60)/6);
+
+	return sz;
 }
 
 int get_md_status (char *page)
 {
-	int sz=0, i, j, size;
-
-	sz+=sprintf( page+sz, "Personalities : ");
-	for (i=0; i<MAX_PERSONALITY; i++)
-		if (pers[i])
-			sz+=sprintf (page+sz, "[%d %s] ", i, pers[i]->name);
-
-	page[sz-1]='\n';
-
-	sz+=sprintf (page+sz, "read_ahead ");
-	if (read_ahead[MD_MAJOR]==INT_MAX)
-		sz+=sprintf (page+sz, "not set\n");
-	else
-		sz+=sprintf (page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);
+	int sz = 0, j, size;
+	struct md_list_head *tmp, *tmp2;
+	mdk_rdev_t *rdev;
+	mddev_t *mddev;
+
+	sz += sprintf(page + sz, "Personalities : ");
+	for (j = 0; j < MAX_PERSONALITY; j++)
+		if (pers[j])
+			sz += sprintf(page+sz, "[%s] ", pers[j]->name);
+
+	sz += sprintf(page+sz, "\n");
+
+
+	sz += sprintf(page+sz, "read_ahead ");
+	if (read_ahead[MD_MAJOR] == INT_MAX)
+		sz += sprintf(page+sz, "not set\n");
+	else
+		sz += sprintf(page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);
 
-	for (i=0; i<MAX_MD_DEV; i++)
-	{
-		sz+=sprintf (page+sz, "md%d : %sactive", i,
-				md_dev[i].pers ? "" : "in");
-
-		if (md_dev[i].pers)
-			sz+=sprintf (page+sz, " %s", md_dev[i].pers->name);
+	ITERATE_MDDEV(mddev,tmp) {
+		sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev),
"" : "in"); + if (mddev->pers) { + if (mddev->ro) + sz += sprintf(page + sz, " (read-only)"); + sz += sprintf(page + sz, " %s", mddev->pers->name); + } - size=0; - for (j=0; jdev), rdev->desc_nr); + if (rdev->faulty) { + sz += sprintf(page + sz, "(F)"); + continue; + } + size += rdev->size; + } - if (md_dev[i].nb_dev) { - if (md_dev[i].pers) - sz+=sprintf (page+sz, " %d blocks", md_size[i]); - else - sz+=sprintf (page+sz, " %d blocks", size); - } + if (mddev->nb_dev) { + if (mddev->pers) + sz += sprintf(page + sz, " %d blocks", + md_size[mdidx(mddev)]); + else + sz += sprintf(page + sz, " %d blocks", size); + } - if (!md_dev[i].pers) - { - sz+=sprintf (page+sz, "\n"); - continue; - } + if (!mddev->pers) { + sz += sprintf(page+sz, "\n"); + continue; + } - if (md_dev[i].pers->max_invalid_dev) - sz+=sprintf (page+sz, " maxfault=%ld", MAX_FAULT(md_dev+i)); + sz += mddev->pers->status (page+sz, mddev); - sz+=md_dev[i].pers->status (page+sz, i, md_dev+i); - sz+=sprintf (page+sz, "\n"); - } + if (mddev->curr_resync) + sz += status_resync (page+sz, mddev); + else { + if (md_atomic_read(&mddev->resync_sem.count) != 1) + sz += sprintf(page + sz, " resync=DELAYED"); + } + sz += sprintf(page + sz, "\n"); + } + sz += status_unused (page + sz); - return (sz); + return (sz); } -int register_md_personality (int p_num, struct md_personality *p) +int register_md_personality (int pnum, mdk_personality_t *p) { - int i=(p_num >> PERSONALITY_SHIFT); - - if (i >= MAX_PERSONALITY) - return -EINVAL; + if (pnum >= MAX_PERSONALITY) + return -EINVAL; - if (pers[i]) - return -EBUSY; + if (pers[pnum]) + return -EBUSY; - pers[i]=p; - printk ("%s personality registered\n", p->name); - return 0; + pers[pnum] = p; + printk(KERN_INFO "%s personality registered\n", p->name); + return 0; } -int unregister_md_personality (int p_num) +int unregister_md_personality (int pnum) { - int i=(p_num >> PERSONALITY_SHIFT); - - if (i >= MAX_PERSONALITY) - return -EINVAL; + if (pnum >= MAX_PERSONALITY) + return -EINVAL; - printk ("%s personality unregistered\n", pers[i]->name); - pers[i]=NULL; - return 0; + printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name); + pers[pnum] = NULL; + return 0; } -static md_descriptor_t *get_spare(struct md_dev *mddev) +static mdp_disk_t *get_spare(mddev_t *mddev) { - int i; - md_superblock_t *sb = mddev->sb; - md_descriptor_t *descriptor; - struct real_dev *realdev; - - for (i = 0; i < mddev->nb_dev; i++) { - realdev = &mddev->devices[i]; - if (!realdev->sb) + mdp_super_t *sb = mddev->sb; + mdp_disk_t *disk; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (!rdev->sb) { + MD_BUG(); continue; - descriptor = &sb->disks[realdev->sb->descriptor.number]; - if (descriptor->state & (1 << MD_FAULTY_DEVICE)) + } + disk = &sb->disks[rdev->desc_nr]; + if (disk_faulty(disk)) { + MD_BUG(); continue; - if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) + } + if (disk_active(disk)) continue; - return descriptor; + return disk; } return NULL; } +static int is_mddev_idle (mddev_t *mddev) +{ + mdk_rdev_t * rdev; + struct md_list_head *tmp; + int idle; + unsigned long curr_events; + + idle = 1; + ITERATE_RDEV(mddev,rdev,tmp) { + curr_events = io_events[MAJOR(rdev->dev)]; + + if (curr_events != rdev->last_events) { +// printk("!I(%d)", curr_events-rdev->last_events); + rdev->last_events = curr_events; + idle = 0; + } + } + return idle; +} + /* * parallel resyncing thread. 
 
 /*
  * parallel resyncing thread.
- *
- * FIXME: - make it abort with a dirty array on mdstop, now it just blocks
- *        - fix read error handing
  */
-int md_do_sync(struct md_dev *mddev)
+
+/*
+ * Determine correct block size for this device.
+ */
+unsigned int device_bsize (kdev_t dev)
+{
+	unsigned int i, correct_size;
+
+	correct_size = BLOCK_SIZE;
+	if (blksize_size[MAJOR(dev)]) {
+		i = blksize_size[MAJOR(dev)][MINOR(dev)];
+		if (i)
+			correct_size = i;
+	}
+
+	return correct_size;
+}
+
+static struct wait_queue *resync_wait = (struct wait_queue *)NULL;
+
+#define RA_ORDER (1)
+#define RA_PAGE_SIZE (PAGE_SIZE*(1<<RA_ORDER))
+#define MAX_NR_BLOCKS (RA_PAGE_SIZE/sizeof(struct buffer_head *))
+
+int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
+{
+	mddev_t *mddev2;
+	struct md_list_head *tmp;
+	unsigned int max_blocks, blocksize, curr_bsize, chunk, window, nr_blocks;
+	int i, ii, j, k, err, serialize;
+	int max_read_errors = 2*MAX_NR_BLOCKS, max_write_errors = 2*MAX_NR_BLOCKS;
+	kdev_t read_disk = mddev_to_kdev(mddev);
+	int major = MAJOR(read_disk), minor = MINOR(read_disk);
+	unsigned long starttime;
+	struct buffer_head **bh;
+
+	bh = (struct buffer_head **) __get_free_pages(GFP_KERNEL, RA_ORDER);
+	if (!bh)
+		return -ENOMEM;
+
+	err = down_interruptible(&mddev->resync_sem);
+	if (err)
+		goto out_nolock;
+
+recheck:
+	serialize = 0;
+	ITERATE_MDDEV(mddev2,tmp) {
+		if (mddev2 == mddev)
+			continue;
+		if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
+			printk(KERN_INFO "md: serializing resync, md%d has overlapping physical units with md%d!\n", mdidx(mddev), mdidx(mddev2));
+			serialize = 1;
+			break;
+		}
+	}
+	if (serialize) {
+		interruptible_sleep_on(&resync_wait);
+		if (md_signal_pending(current)) {
+			md_flush_signals();
+			err = -EINTR;
+			goto out;
+		}
+		goto recheck;
+	}
+
+	mddev->curr_resync = 1;
 
-	blocksize = blksize_size[major][minor];
+	blocksize = device_bsize(read_disk);
 	max_blocks = blk_size[major][minor] / (blocksize >> 10);
 
-	printk("... resync log\n");
-	printk(" ....   mddev->nb_dev: %d\n", mddev->nb_dev);
-	printk(" ....   raid array: %s\n", kdevname(read_disk));
-	printk(" ....   max_blocks: %d blocksize: %d\n", max_blocks, blocksize);
-	printk("md: syncing RAID array %s\n", kdevname(read_disk));
+	printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
+	printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec.\n",
+						sysctl_speed_limit);
+	printk(KERN_INFO "md: using maximum available idle IO bandwidth for reconstruction.\n");
+
+	/*
+	 * Resync has low priority.
+	 */
+	current->priority = 1;
+
+	is_mddev_idle(mddev);	/* this also initializes IO event counters */
+	starttime = jiffies;
+	mddev->resync_start = starttime;
 
-	mddev->busy++;
+	/*
+	 * Tune reconstruction:
+	 */
+	window = md_maxreadahead[mdidx(mddev)]/1024;
+	nr_blocks = window / (blocksize >> 10);
+	if (!nr_blocks || (nr_blocks > MAX_NR_BLOCKS))
+		nr_blocks = MAX_NR_BLOCKS;
+	printk(KERN_INFO "md: using %dk window.\n",window);
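+	/*
+	 * Example with made-up numbers: a 128k window and 1k blocks
+	 * give nr_blocks = 128, capped to MAX_NR_BLOCKS, the number
+	 * of buffer head pointers that fit into the RA_ORDER pages
+	 * allocated above. Each loop iteration below then copies one
+	 * such group of blocks through the buffer cache.
+	 */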
 
-	starttime=jiffies;
-	for (j = 0; j < max_blocks; j++) {
+	for (j = 0; j < max_blocks; j += nr_blocks) {
+		if (j)
+			mddev->curr_resync = j;
 		/*
 		 * B careful. When some1 mounts a non-'blocksize' filesystem
 		 * then we get the blocksize changed right under us. Go deal
 		 * with it transparently, recalculate 'blocksize', 'j' and
 		 * 'max_blocks':
 		 */
-		curr_bsize = blksize_size[major][minor];
+		curr_bsize = device_bsize(read_disk);
 		if (curr_bsize != blocksize) {
-		diff_blocksize:
+			printk(KERN_INFO "md%d: blocksize changed\n",
+								mdidx(mddev));
+retry_read:
 			if (curr_bsize > blocksize)
 				/*
 				 * this is safe, rounds downwards.
@@ -1109,114 +3440,384 @@
 				 */
 				j *= blocksize/curr_bsize;
 
 			blocksize = curr_bsize;
+			nr_blocks = window / (blocksize >> 10);
+			if (!nr_blocks || (nr_blocks > MAX_NR_BLOCKS))
+				nr_blocks = MAX_NR_BLOCKS;
 			max_blocks = blk_size[major][minor] / (blocksize >> 10);
-		}
-		if ((bh = breada (read_disk, j, blocksize, j * blocksize,
-					max_blocks * blocksize)) != NULL) {
-			mark_buffer_dirty(bh, 1);
-			brelse(bh);
-		} else {
+			printk("nr_blocks changed to %d (blocksize %d, j %d, max_blocks %d)\n",
+					nr_blocks, blocksize, j, max_blocks);
 			/*
-			 * FIXME: Ugly, but set_blocksize() isnt safe ...
+			 * We will retry the current block-group
 			 */
-			curr_bsize = blksize_size[major][minor];
-			if (curr_bsize != blocksize)
-				goto diff_blocksize;
+		}
 
-			/*
-			 * It's a real read problem. FIXME, handle this
-			 * a better way.
-			 */
-			printk ( KERN_ALERT
-				"read error, stopping reconstruction.\n");
-			mddev->busy--;
-			return 1;
+		/*
+		 * Cleanup routines expect this
+		 */
+		for (k = 0; k < nr_blocks; k++)
+			bh[k] = NULL;
+
+		chunk = nr_blocks;
+		if (chunk > max_blocks-j)
+			chunk = max_blocks-j;
+
+		/*
+		 * request buffer heads ...
+		 */
+		for (i = 0; i < chunk; i++) {
+			bh[i] = getblk (read_disk, j+i, blocksize);
+			if (!bh[i])
+				goto read_error;
+			if (!buffer_dirty(bh[i]))
+				mark_buffer_lowprio(bh[i]);
 		}
 
 		/*
-		 * Let's sleep some if we are faster than our speed limit:
+		 * read buffer heads ...
 		 */
-		while (blocksize*j/(jiffies-starttime+1)*HZ/1024 > SPEED_LIMIT)
-		{
-			current->state = TASK_INTERRUPTIBLE;
-			schedule_timeout(1);
+		ll_rw_block (READ, chunk, bh);
+		run_task_queue(&tq_disk);
+
+		/*
+		 * verify that all of them are OK ...
+		 */
+		for (i = 0; i < chunk; i++) {
+			ii = chunk-i-1;
+			wait_on_buffer(bh[ii]);
+			if (!buffer_uptodate(bh[ii]))
+				goto read_error;
+		}
+
+retry_write:
+		for (i = 0; i < chunk; i++)
+			mark_buffer_dirty_lowprio(bh[i]);
+
+		ll_rw_block(WRITE, chunk, bh);
+		run_task_queue(&tq_disk);
+
+		for (i = 0; i < chunk; i++) {
+			ii = chunk-i-1;
+			wait_on_buffer(bh[ii]);
+
+			if (spare && disk_faulty(spare)) {
+				for (k = 0; k < chunk; k++)
+					brelse(bh[k]);
+				printk(" \n ");
+				err = -EIO;
+				goto out;
+			}
+
+			if (!buffer_uptodate(bh[ii])) {
+				curr_bsize = device_bsize(read_disk);
+				if (curr_bsize != blocksize) {
+					printk(KERN_INFO
+						"md%d: blocksize changed during write\n",
+						mdidx(mddev));
+					for (k = 0; k < chunk; k++)
+						if (bh[k]) {
+							if (buffer_lowprio(bh[k]))
+								mark_buffer_clean(bh[k]);
+							brelse(bh[k]);
+						}
+					goto retry_read;
+				}
+				printk("<BAD WRITE %8d>\n", j);
+				/*
+				 * Ouch, write error, retry or bail out.
+				 */
+				if (max_write_errors) {
+					max_write_errors--;
+					printk ( KERN_WARNING "md%d: write error while reconstructing, at block %u(%d).\n", mdidx(mddev), j, blocksize);
+					goto retry_write;
+				}
+				printk ( KERN_ALERT
+					"too many write errors, stopping reconstruction.\n");
+				for (k = 0; k < chunk; k++)
+					if (bh[k]) {
+						if (buffer_lowprio(bh[k]))
+							mark_buffer_clean(bh[k]);
+						brelse(bh[k]);
+					}
+				err = -EIO;
+				goto out;
+			}
 		}
 
 		/*
-		 * FIXME: put this status bar thing into /proc
+		 * This is the normal 'everything went OK' case,
+		 * do a 'free-behind' logic, we sure don't need
+		 * this buffer if it was the only user.
 		 */
-		if (!(j%(max_blocks/100))) {
-			if (!(percent%10))
-				printk (" %03d%% done.\n",percent);
+		for (i = 0; i < chunk; i++)
+			if (buffer_dirty(bh[i]))
+				brelse(bh[i]);
 			else
-				printk (".");
-			percent++;
+				bforget(bh[i]);
+
+
+		if (md_signal_pending(current)) {
+			/*
+			 * got a signal, exit.
+			 */
+			mddev->curr_resync = 0;
+			printk("md_do_sync() got signal ... exiting\n");
+			md_flush_signals();
+			err = -EINTR;
+			goto out;
 		}
+
+		/*
+		 * this loop exits only if we are slower than the 'hard'
+		 * speed limit, or if the system was IO-idle for a jiffy.
+		 * the system might be non-idle CPU-wise, but we only care
+		 * about not overloading the IO subsystem. (things like an
+		 * e2fsck being done on the RAID array should execute fast)
+		 */
+repeat:
+		if (md_need_resched(current))
+			schedule();
+
+		if ((blocksize/1024)*j/((jiffies-starttime)/HZ + 1) + 1
+						> sysctl_speed_limit) {
+			current->priority = 1;
+
+			if (!is_mddev_idle(mddev)) {
+				current->state = TASK_INTERRUPTIBLE;
+				md_schedule_timeout(HZ/2);
+				if (!md_signal_pending(current))
+					goto repeat;
+			}
+		} else
+			current->priority = 40;
 	}
 	fsync_dev(read_disk);
-	printk("md: %s: sync done.\n", kdevname(read_disk));
-	mddev->busy--;
-	return 0;
+	printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
+	err = 0;
+	/*
+	 * this also signals 'finished resyncing' to md_stop
+	 */
+out:
+	up(&mddev->resync_sem);
+out_nolock:
+	free_pages((unsigned long)bh, RA_ORDER);
+	mddev->curr_resync = 0;
+	wake_up(&resync_wait);
+	return err;
+
+read_error:
+	/*
+	 * set_blocksize() might change the blocksize. This
+	 * should not happen often, but it happens when e.g.
+	 * someone mounts a filesystem that has non-1k
+	 * blocksize. set_blocksize() doesn't touch our
+	 * buffer, but to avoid aliasing problems we change
+	 * our internal blocksize too and retry the read.
+	 */
+	curr_bsize = device_bsize(read_disk);
+	if (curr_bsize != blocksize) {
+		printk(KERN_INFO "md%d: blocksize changed during read\n",
+			mdidx(mddev));
+		for (k = 0; k < chunk; k++)
+			if (bh[k]) {
+				if (buffer_lowprio(bh[k]))
+					mark_buffer_clean(bh[k]);
+				brelse(bh[k]);
+			}
+		goto retry_read;
+	}
+
+	/*
+	 * It's a real read problem. We retry and bail out
+	 * only if it's excessive.
+	 */
+	if (max_read_errors) {
+		max_read_errors--;
+		printk ( KERN_WARNING "md%d: read error while reconstructing, at block %u(%d).\n", mdidx(mddev), j, blocksize);
+		for (k = 0; k < chunk; k++)
+			if (bh[k]) {
+				if (buffer_lowprio(bh[k]))
+					mark_buffer_clean(bh[k]);
+				brelse(bh[k]);
+			}
+		goto retry_read;
+	}
+	printk ( KERN_ALERT "too many read errors, stopping reconstruction.\n");
+	for (k = 0; k < chunk; k++)
+		if (bh[k]) {
+			if (buffer_lowprio(bh[k]))
+				mark_buffer_clean(bh[k]);
+			brelse(bh[k]);
+		}
+	err = -EIO;
+	goto out;
 }
 
+#undef MAX_NR_BLOCKS
+
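[For reference, the throttle condition in the loop above, restated as a standalone sketch with invented names and simplified units; the patch's real inputs are sysctl_speed_limit and the io_events counters:]

#include <stdio.h>

static unsigned int rate_kb_per_sec(unsigned int blocks_done,
				    unsigned int block_kb,
				    unsigned int elapsed_sec)
{
	/* +1 avoids division by zero right at the start */
	return block_kb * blocks_done / (elapsed_sec + 1) + 1;
}

int main(void)
{
	unsigned int speed_limit = 100;	/* guaranteed minimum, KB/sec */

	/* 50000 1k blocks in 60s: about 820 KB/sec, above the minimum */
	unsigned int rate = rate_kb_per_sec(50000, 1, 60);
	int devices_busy = 1;		/* pretend other I/O is going on */

	/* back off only when fast enough AND the disks have other work */
	if (rate > speed_limit && devices_busy)
		printf("rate %u KB/sec: sleep and retry\n", rate);
	else
		printf("rate %u KB/sec: keep syncing\n", rate);
	return 0;
}
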
 /*
- * This is a kernel thread which: syncs a spare disk with the active array
+ * This is a kernel thread which syncs a spare disk with the active array
  *
  * the amount of foolproofing might seem to be a tad excessive, but an
  * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
  * of my root partition with the first 0.5 gigs of my /home partition ... so
  * i'm a bit nervous ;)
  */
-void mdsyncd (void *data)
+void md_do_recovery (void *data)
 {
-	int i;
-	struct md_dev *mddev;
-	md_superblock_t *sb;
-	md_descriptor_t *spare;
+	int err;
+	mddev_t *mddev;
+	mdp_super_t *sb;
+	mdp_disk_t *spare;
 	unsigned long flags;
+	struct md_list_head *tmp;
 
-	for (i = 0, mddev = md_dev; i < MAX_MD_DEV; i++, mddev++) {
-		if ((sb = mddev->sb) == NULL)
+	printk(KERN_INFO "md: recovery thread got woken up ...\n");
+restart:
+	ITERATE_MDDEV(mddev,tmp) {
+		sb = mddev->sb;
+		if (!sb)
+			continue;
+		if (mddev->recovery_running)
 			continue;
 		if (sb->active_disks == sb->raid_disks)
 			continue;
-		if (!sb->spare_disks)
+		if (!sb->spare_disks) {
+			printk(KERN_ERR "md%d: no spare disk to reconstruct array! -- continuing in degraded mode\n", mdidx(mddev));
 			continue;
+		}
+		/*
+		 * now here we get the spare and resync it.
+		 */
 		if ((spare = get_spare(mddev)) == NULL)
 			continue;
-		if (!mddev->pers->mark_spare)
+		printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
+		if (!mddev->pers->diskop)
 			continue;
-		if (mddev->pers->mark_spare(mddev, spare, SPARE_WRITE))
+		if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
 			continue;
-		if (md_do_sync(mddev) || (spare->state & (1 << MD_FAULTY_DEVICE))) {
-			mddev->pers->mark_spare(mddev, spare, SPARE_INACTIVE);
+
+		down(&mddev->recovery_sem);
+		mddev->recovery_running = 1;
+		err = md_do_sync(mddev, spare);
+		if (err == -EIO) {
+			printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
+			if (!disk_faulty(spare)) {
+				mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
+				mark_disk_faulty(spare);
+				mark_disk_nonsync(spare);
+				mark_disk_inactive(spare);
+				sb->spare_disks--;
+				sb->working_disks--;
+				sb->failed_disks++;
+			}
+		} else
+			if (disk_faulty(spare))
+				mddev->pers->diskop(mddev, &spare,
+						DISKOP_SPARE_INACTIVE);
+		if (err == -EINTR) {
+			/*
+			 * Recovery got interrupted ...
+			 * signal back that we have finished using the array.
+			 */
+			mddev->pers->diskop(mddev, &spare,
+					DISKOP_SPARE_INACTIVE);
+			up(&mddev->recovery_sem);
+			mddev->recovery_running = 0;
 			continue;
+		} else {
+			mddev->recovery_running = 0;
+			up(&mddev->recovery_sem);
 		}
 		save_flags(flags);
 		cli();
-		mddev->pers->mark_spare(mddev, spare, SPARE_ACTIVE);
-		spare->state |= (1 << MD_SYNC_DEVICE);
-		spare->state |= (1 << MD_ACTIVE_DEVICE);
-		sb->spare_disks--;
-		sb->active_disks++;
-		mddev->sb_dirty = 1;
-		md_update_sb(mddev - md_dev);
+		if (!disk_faulty(spare)) {
+			/*
+			 * the SPARE_ACTIVE diskop possibly changes the
+			 * pointer too
+			 */
+			mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
+			mark_disk_sync(spare);
+			mark_disk_active(spare);
+			sb->active_disks++;
+			sb->spare_disks--;
+		}
 		restore_flags(flags);
+		mddev->sb_dirty = 1;
+		md_update_sb(mddev);
+		goto restart;
 	}
+	printk(KERN_INFO "md: recovery thread finished ...\n");
 }
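
[The overall shape of the recovery pass above can be condensed into a toy state machine; the enum and helper below are illustrative only and gloss over all the superblock bookkeeping:]

#include <stdio.h>

enum disk_state { SPARE, WRITE_ONLY, ACTIVE, FAULTY };

static int sync_ok;	/* outcome of the block-by-block copy */

static enum disk_state recover_spare(enum disk_state d)
{
	if (d != SPARE)
		return d;
	d = WRITE_ONLY;			/* DISKOP_SPARE_WRITE: mirror writes  */
	d = sync_ok ? ACTIVE		/* DISKOP_SPARE_ACTIVE: promote       */
		    : FAULTY;		/* DISKOP_SPARE_INACTIVE + mark bad   */
	return d;
}

int main(void)
{
	sync_ok = 1;
	printf("%d\n", recover_spare(SPARE));	/* 2 == ACTIVE */
	sync_ok = 0;
	printf("%d\n", recover_spare(SPARE));	/* 3 == FAULTY */
	return 0;
}
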
 
+int md_notify_reboot(struct notifier_block *this,
+					unsigned long code, void *x)
+{
+	struct md_list_head *tmp;
+	mddev_t *mddev;
+
+	if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
+					|| (code == MD_SYS_POWER_OFF)) {
+
+		printk(KERN_INFO "stopping all md devices.\n");
+
+		ITERATE_MDDEV(mddev,tmp)
+			do_md_stop (mddev, 1);
+		/*
+		 * certain more exotic SCSI devices are known to be
+		 * volatile wrt too early system reboots. While the
+		 * right place to handle this issue is the given
+		 * driver, we do want to have a safe RAID driver ...
+		 */
+		md_mdelay(1000*1);
+	}
+	return NOTIFY_DONE;
+}
+
+struct notifier_block md_notifier = {
+	md_notify_reboot,
+	NULL,
+	0
+};
+
+md__initfunc(void raid_setup(char *str, int *ints))
+{
+	char tmpline[100];
+	int len, pos, nr, i;
+
+	len = strlen(str) + 1;
+	nr = 0;
+	pos = 0;
+
+	for (i = 0; i < len; i++) {
+		char c = str[i];
+
+		if (c == ',' || !c) {
+			tmpline[pos] = 0;
+			if (!strcmp(tmpline,"noautodetect"))
+				raid_setup_args.noautodetect = 1;
+			nr++;
+			pos = 0;
+			continue;
+		}
+		tmpline[pos] = c;
+		pos++;
+	}
+	raid_setup_args.set = 1;
+	return;
+}
+
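[raid_setup() above parses the raid= boot option in a single pass, cutting at ',' and at the trailing NUL. A user-space rendition of the same parser; the second option, "verbose", is made up to show the pattern, and unlike the kernel version this sketch bound-checks tmpline:]

#include <stdio.h>
#include <string.h>

static int noautodetect, verbose;

static void parse_raid_options(const char *str)
{
	char tmpline[100];
	int pos = 0;
	size_t i, len = strlen(str) + 1;	/* include the NUL */

	for (i = 0; i < len; i++) {
		char c = str[i];

		if (c == ',' || !c) {		/* token complete */
			tmpline[pos] = 0;
			if (!strcmp(tmpline, "noautodetect"))
				noautodetect = 1;
			if (!strcmp(tmpline, "verbose"))
				verbose = 1;
			pos = 0;
			continue;
		}
		if (pos < (int)sizeof(tmpline) - 1)
			tmpline[pos++] = c;
	}
}

int main(void)
{
	parse_raid_options("verbose,noautodetect");
	printf("noautodetect=%d verbose=%d\n", noautodetect, verbose);
	return 0;
}
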
"Recompile the kernel with striped mode enabled!\n"); #endif break; @@ -1278,79 +3884,147 @@ break; */ default: - printk ("md: Unknown or not supported raid level %d.\n", ints[--i]); + printk (KERN_WARNING "md: Unknown or not supported raid level %d.\n", ints[--i]); return; } - if(pers) { + if (pers) { - factor=ints[i++]; /* Chunksize */ - fault =ints[i++]; /* Faultlevel */ + chunk_size = ints[i++]; /* Chunksize */ + fault = ints[i++]; /* Faultlevel */ - pers=pers | factor | (fault << FAULT_SHIFT); + pers = pers | chunk_size | (fault << FAULT_SHIFT); - while( str && (dev = name_to_kdev_t(str))) { - do_md_add (minor, dev); - if((str = strchr (str, ',')) != NULL) - str++; - } + while( str && (dev = name_to_kdev_t(str))) { + do_md_add (minor, dev); + if((str = strchr (str, ',')) != NULL) + str++; + } - do_md_run (minor, pers); - printk ("md: Loading md%d.\n",minor); + do_md_run (minor, pers); + printk (KERN_INFO "md: Loading md%d.\n",minor); } - +#endif } #endif +void hsm_init (void); +void translucent_init (void); void linear_init (void); void raid0_init (void); void raid1_init (void); void raid5_init (void); -__initfunc(int md_init (void)) +md__initfunc(int md_init (void)) { - printk ("md driver %d.%d.%d MAX_MD_DEV=%d, MAX_REAL=%d\n", - MD_MAJOR_VERSION, MD_MINOR_VERSION, MD_PATCHLEVEL_VERSION, - MAX_MD_DEV, MAX_REAL); - - if (register_blkdev (MD_MAJOR, "md", &md_fops)) - { - printk ("Unable to get major %d for md\n", MD_MAJOR); - return (-1); - } - - blk_dev[MD_MAJOR].request_fn=DEVICE_REQUEST; - blk_dev[MD_MAJOR].current_request=NULL; - read_ahead[MD_MAJOR]=INT_MAX; - memset(md_dev, 0, MAX_MD_DEV * sizeof (struct md_dev)); - md_gendisk.next=gendisk_head; - - gendisk_head=&md_gendisk; - -#if SUPPORT_RECONSTRUCTION - if ((md_sync_thread = md_register_thread(mdsyncd, NULL)) == NULL) - printk("md: bug: md_sync_thread == NULL\n"); -#endif /* SUPPORT_RECONSTRUCTION */ + static char * name = "mdrecoveryd"; + + printk (KERN_INFO "md driver %d.%d.%d MAX_MD_DEVS=%d, MAX_REAL=%d\n", + MD_MAJOR_VERSION, MD_MINOR_VERSION, + MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MAX_REAL); + + if (register_blkdev (MD_MAJOR, "md", &md_fops)) + { + printk (KERN_ALERT "Unable to get major %d for md\n", MD_MAJOR); + return (-1); + } + + blk_dev[MD_MAJOR].request_fn = DEVICE_REQUEST; + blk_dev[MD_MAJOR].current_request = NULL; + blk_dev[MD_MAJOR].makerq_fn=md_make_request; + blk_dev[MD_MAJOR].map_fn=md_map; + read_ahead[MD_MAJOR] = INT_MAX; + md_gendisk.next = gendisk_head; + + gendisk_head = &md_gendisk; + + md_recovery_thread = md_register_thread(md_do_recovery, NULL, name); + if (!md_recovery_thread) + printk(KERN_ALERT "bug: couldn't allocate md_recovery_thread\n"); + md_register_reboot_notifier(&md_notifier); + md_register_sysctl(); + +#ifdef CONFIG_MD_HSM + hsm_init (); +#endif +#ifdef CONFIG_MD_TRANSLUCENT + translucent_init (); +#endif #ifdef CONFIG_MD_LINEAR - linear_init (); + linear_init (); #endif #ifdef CONFIG_MD_STRIPED - raid0_init (); + raid0_init (); #endif #ifdef CONFIG_MD_MIRRORING - raid1_init (); + raid1_init (); #endif #ifdef CONFIG_MD_RAID5 - raid5_init (); + raid5_init (); +#endif +#if defined(CONFIG_MD_RAID5) || defined(CONFIG_MD_RAID5_MODULE) + /* + * pick a XOR routine, runtime. 
+ */ + calibrate_xor_block(); #endif - return (0); + + return (0); } #ifdef CONFIG_MD_BOOT -__initfunc(void md_setup_drive(void)) +md__initfunc(void md_setup_drive(void)) { if(md_setup_args.set) do_md_setup(md_setup_args.str, md_setup_args.ints); } #endif + +MD_EXPORT_SYMBOL(md_size); +MD_EXPORT_SYMBOL(register_md_personality); +MD_EXPORT_SYMBOL(unregister_md_personality); +MD_EXPORT_SYMBOL(partition_name); +MD_EXPORT_SYMBOL(md_error); +MD_EXPORT_SYMBOL(md_recover_arrays); +MD_EXPORT_SYMBOL(md_register_thread); +MD_EXPORT_SYMBOL(md_unregister_thread); +MD_EXPORT_SYMBOL(md_update_sb); +MD_EXPORT_SYMBOL(md_map); +MD_EXPORT_SYMBOL(md_wakeup_thread); +MD_EXPORT_SYMBOL(md_do_sync); +MD_EXPORT_SYMBOL(md_print_devices); +MD_EXPORT_SYMBOL(find_rdev_nr); +MD_EXPORT_SYMBOL(md_check_ordering); +MD_EXPORT_SYMBOL(md_interrupt_thread); +MD_EXPORT_SYMBOL(mddev_map); + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry proc_md = { + PROC_MD, 6, "mdstat", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_array_inode_operations, +}; +#endif + +static void md_geninit (struct gendisk *gdisk) +{ + int i; + + for(i = 0; i < MAX_MD_DEVS; i++) { + md_blocksizes[i] = 1024; + md_maxreadahead[i] = MD_READAHEAD; + md_gendisk.part[i].start_sect = -1; /* avoid partition check */ + md_gendisk.part[i].nr_sects = 0; + } + + printk("md.c: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); + + blksize_size[MD_MAJOR] = md_blocksizes; + md_set_global_readahead(md_maxreadahead); + +#ifdef CONFIG_PROC_FS + proc_register(&proc_root, &proc_md); +#endif +} + diff -urN 2.2.18/drivers/block/raid0.c 2.2.18aa1/drivers/block/raid0.c --- 2.2.18/drivers/block/raid0.c Tue Sep 5 02:28:40 2000 +++ 2.2.18aa1/drivers/block/raid0.c Mon Dec 11 17:20:54 2000 @@ -1,4 +1,3 @@ - /* raid0.c : Multiple Devices driver for Linux Copyright (C) 1994-96 Marc ZYNGIER @@ -18,146 +17,201 @@ */ #include -#include -#include -#include +#include #define MAJOR_NR MD_MAJOR #define MD_DRIVER #define MD_PERSONALITY -static int create_strip_zones (int minor, struct md_dev *mddev) +static int create_strip_zones (mddev_t *mddev) { - int i, j, c=0; - int current_offset=0; - struct real_dev *smallest_by_zone; - struct raid0_data *data=(struct raid0_data *) mddev->private; - - data->nr_strip_zones=1; - - for (i=1; inb_dev; i++) - { - for (j=0; jdevices[i].size==mddev->devices[j].size) - { - c=1; - break; - } - - if (!c) - data->nr_strip_zones++; - - c=0; - } - - if ((data->strip_zone=vmalloc(sizeof(struct strip_zone)*data->nr_strip_zones)) == NULL) - return 1; - - data->smallest=NULL; - - for (i=0; inr_strip_zones; i++) - { - data->strip_zone[i].dev_offset=current_offset; - smallest_by_zone=NULL; - c=0; - - for (j=0; jnb_dev; j++) - if (mddev->devices[j].size>current_offset) - { - data->strip_zone[i].dev[c++]=mddev->devices+j; - if (!smallest_by_zone || - smallest_by_zone->size > mddev->devices[j].size) - smallest_by_zone=mddev->devices+j; - } - - data->strip_zone[i].nb_dev=c; - data->strip_zone[i].size=(smallest_by_zone->size-current_offset)*c; - - if (!data->smallest || - data->smallest->size > data->strip_zone[i].size) - data->smallest=data->strip_zone+i; - - data->strip_zone[i].zone_offset=i ? 
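[Before the raid0.c changes below, it may help to see what create_strip_zones() computes. This toy program derives the same kind of zone table from a made-up set of device sizes; every zone stripes over the devices that still have capacity left at that offset:]

#include <stdio.h>

int main(void)
{
	/* three disks, sizes in blocks, sorted ascending */
	int size[3] = {1000, 3000, 3000};
	int ndisks = 3, zone = 0, offset = 0;

	while (offset < size[ndisks - 1]) {
		int smallest = 0, width = 0, i;

		/* devices extending past 'offset' take part in this zone */
		for (i = 0; i < ndisks; i++)
			if (size[i] > offset) {
				if (!width || size[i] < smallest)
					smallest = size[i];
				width++;
			}
		printf("zone %d: %d devices, %d blocks\n",
			zone++, width, (smallest - offset) * width);
		offset = smallest;
	}
	return 0;	/* prints a 3-wide 3000-block zone, then a 2-wide 4000-block zone */
}
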
diff -urN 2.2.18/drivers/block/raid0.c 2.2.18aa1/drivers/block/raid0.c
--- 2.2.18/drivers/block/raid0.c	Tue Sep  5 02:28:40 2000
+++ 2.2.18aa1/drivers/block/raid0.c	Mon Dec 11 17:20:54 2000
@@ -1,4 +1,3 @@
-
 /*
    raid0.c : Multiple Devices driver for Linux
              Copyright (C) 1994-96 Marc ZYNGIER
@@ -18,146 +17,201 @@
  */
 
 #include 
-#include 
-#include 
-#include 
+#include 
 
 #define MAJOR_NR MD_MAJOR
 #define MD_DRIVER
 #define MD_PERSONALITY
 
-static int create_strip_zones (int minor, struct md_dev *mddev)
+static int create_strip_zones (mddev_t *mddev)
 {
-	int i, j, c=0;
-	int current_offset=0;
-	struct real_dev *smallest_by_zone;
-	struct raid0_data *data=(struct raid0_data *) mddev->private;
-
-	data->nr_strip_zones=1;
-
-	for (i=1; i<mddev->nb_dev; i++)
-	{
-		for (j=0; j<i; j++)
-			if (mddev->devices[i].size==mddev->devices[j].size)
-			{
-				c=1;
-				break;
-			}
-
-		if (!c)
-			data->nr_strip_zones++;
-
-		c=0;
-	}
-
-	if ((data->strip_zone=vmalloc(sizeof(struct strip_zone)*data->nr_strip_zones)) == NULL)
-		return 1;
-
-	data->smallest=NULL;
-
-	for (i=0; i<data->nr_strip_zones; i++)
-	{
-		data->strip_zone[i].dev_offset=current_offset;
-		smallest_by_zone=NULL;
-		c=0;
-
-		for (j=0; j<mddev->nb_dev; j++)
-			if (mddev->devices[j].size>current_offset)
-			{
-				data->strip_zone[i].dev[c++]=mddev->devices+j;
-				if (!smallest_by_zone ||
-				    smallest_by_zone->size > mddev->devices[j].size)
-					smallest_by_zone=mddev->devices+j;
-			}
-
-		data->strip_zone[i].nb_dev=c;
-		data->strip_zone[i].size=(smallest_by_zone->size-current_offset)*c;
-
-		if (!data->smallest ||
-		    data->smallest->size > data->strip_zone[i].size)
-			data->smallest=data->strip_zone+i;
-
-		data->strip_zone[i].zone_offset=i ? (data->strip_zone[i-1].zone_offset+
-						     data->strip_zone[i-1].size) : 0;
-		current_offset=smallest_by_zone->size;
-	}
-	return 0;
+	int i, c, j, j1, j2;
+	int current_offset, curr_zone_offset;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+	mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
+
+	/*
+	 * The number of 'same size groups'
+	 */
+	conf->nr_strip_zones = 0;
+
+	ITERATE_RDEV_ORDERED(mddev,rdev1,j1) {
+		printk("raid0: looking at %s\n", partition_name(rdev1->dev));
+		c = 0;
+		ITERATE_RDEV_ORDERED(mddev,rdev2,j2) {
+			printk("raid0: comparing %s(%d) with %s(%d)\n", partition_name(rdev1->dev), rdev1->size, partition_name(rdev2->dev), rdev2->size);
+			if (rdev2 == rdev1) {
+				printk("raid0: END\n");
+				break;
+			}
+			if (rdev2->size == rdev1->size)
+			{
+				/*
+				 * Not unique, dont count it as a new
+				 * group
+				 */
+				printk("raid0: EQUAL\n");
+				c = 1;
+				break;
+			}
+			printk("raid0: NOT EQUAL\n");
+		}
+		if (!c) {
+			printk("raid0: ==> UNIQUE\n");
+			conf->nr_strip_zones++;
+			printk("raid0: %d zones\n", conf->nr_strip_zones);
+		}
+	}
+	printk("raid0: FINAL %d zones\n", conf->nr_strip_zones);
+
+	conf->strip_zone = vmalloc(sizeof(struct strip_zone)*
+				conf->nr_strip_zones);
+	if (!conf->strip_zone)
+		return 1;
+
+
+	conf->smallest = NULL;
+	current_offset = 0;
+	curr_zone_offset = 0;
+
+	for (i = 0; i < conf->nr_strip_zones; i++)
+	{
+		struct strip_zone *zone = conf->strip_zone + i;
+
+		printk("zone %d\n", i);
+		zone->dev_offset = current_offset;
+		smallest = NULL;
+		c = 0;
+
+		ITERATE_RDEV_ORDERED(mddev,rdev,j) {
+
+			printk(" checking %s ...", partition_name(rdev->dev));
+			if (rdev->size > current_offset)
+			{
+				printk(" contained as device %d\n", c);
+				zone->dev[c] = rdev;
+				c++;
+				if (!smallest || (rdev->size < smallest->size)) {
+					smallest = rdev;
+					printk(" (%d) is smallest!.\n", rdev->size);
+				}
+			} else
+				printk(" nope.\n");
+		}
+
+		zone->nb_dev = c;
+		zone->size = (smallest->size - current_offset) * c;
+		printk(" zone->nb_dev: %d, size: %d\n",zone->nb_dev,zone->size);
+
+		if (!conf->smallest || (zone->size < conf->smallest->size))
+			conf->smallest = zone;
+
+		zone->zone_offset = curr_zone_offset;
+		curr_zone_offset += zone->size;
+
+		current_offset = smallest->size;
+		printk("current zone offset: %d\n", current_offset);
+	}
+	printk("done.\n");
+	return 0;
 }
 
-static int raid0_run (int minor, struct md_dev *mddev)
+static int raid0_run (mddev_t *mddev)
 {
-	int cur=0, i=0, size, zone0_size, nb_zone;
-	struct raid0_data *data;
-
-	MOD_INC_USE_COUNT;
+	int cur=0, i=0, size, zone0_size, nb_zone;
+	raid0_conf_t *conf;
 
-	if ((mddev->private=vmalloc (sizeof (struct raid0_data))) == NULL) return 1;
-	data=(struct raid0_data *) mddev->private;
-
-	if (create_strip_zones (minor, mddev))
-	{
-		vfree(data);
-		return 1;
-	}
-
-	nb_zone=data->nr_zones=
-		md_size[minor]/data->smallest->size +
-		(md_size[minor]%data->smallest->size ? 1 : 0);
-
-	printk ("raid0 : Allocating %ld bytes for hash.\n",(long)sizeof(struct raid0_hash)*nb_zone);
-	if ((data->hash_table=vmalloc (sizeof (struct raid0_hash)*nb_zone)) == NULL)
-	{
-		vfree(data->strip_zone);
-		vfree(data);
-		return 1;
-	}
-	size=data->strip_zone[cur].size;
-
-	i=0;
-	while (cur<data->nr_strip_zones)
-	{
-		data->hash_table[i].zone0=data->strip_zone+cur;
-
-		if (size>=data->smallest->size)/* If we completely fill the slot */
-		{
-			data->hash_table[i++].zone1=NULL;
-			size-=data->smallest->size;
-
-			if (!size)
-			{
-				if (++cur==data->nr_strip_zones) continue;
-				size=data->strip_zone[cur].size;
-			}
-
-			continue;
-		}
-
-		if (++cur==data->nr_strip_zones) /* Last dev, set unit1 as NULL */
-		{
-			data->hash_table[i].zone1=NULL;
-			continue;
-		}
-
-		zone0_size=size; /* Here, we use a 2nd dev to fill the slot */
-		size=data->strip_zone[cur].size;
-		data->hash_table[i++].zone1=data->strip_zone+cur;
-		size-=(data->smallest->size - zone0_size);
-	}
+	MOD_INC_USE_COUNT;
 
-	return (0);
+	conf = vmalloc(sizeof (raid0_conf_t));
+	if (!conf)
+		goto out;
+	mddev->private = (void *)conf;
+
+	if (md_check_ordering(mddev)) {
+		printk("raid0: disks are not ordered, aborting!\n");
+		goto out_free_conf;
+	}
+
+	if (create_strip_zones (mddev))
+		goto out_free_conf;
+
+	printk("raid0 : md_size is %d blocks.\n", md_size[mdidx(mddev)]);
+	printk("raid0 : conf->smallest->size is %d blocks.\n", conf->smallest->size);
+	nb_zone = md_size[mdidx(mddev)]/conf->smallest->size +
+			(md_size[mdidx(mddev)] % conf->smallest->size ? 1 : 0);
+	printk("raid0 : nb_zone is %d.\n", nb_zone);
+	conf->nr_zones = nb_zone;
+
+	printk("raid0 : Allocating %d bytes for hash.\n",
+				sizeof(struct raid0_hash)*nb_zone);
+
+	conf->hash_table = vmalloc (sizeof (struct raid0_hash)*nb_zone);
+	if (!conf->hash_table)
+		goto out_free_zone_conf;
+	size = conf->strip_zone[cur].size;
+
+	i = 0;
+	while (cur < conf->nr_strip_zones) {
+		conf->hash_table[i].zone0 = conf->strip_zone + cur;
+
+		/*
+		 * If we completely fill the slot
+		 */
+		if (size >= conf->smallest->size) {
+			conf->hash_table[i++].zone1 = NULL;
+			size -= conf->smallest->size;
+
+			if (!size) {
+				if (++cur == conf->nr_strip_zones)
+					continue;
+				size = conf->strip_zone[cur].size;
+			}
+			continue;
+		}
+		if (++cur == conf->nr_strip_zones) {
+			/*
+			 * Last dev, set unit1 as NULL
+			 */
+			conf->hash_table[i].zone1=NULL;
+			continue;
+		}
+
+		/*
+		 * Here we use a 2nd dev to fill the slot
+		 */
+		zone0_size = size;
+		size = conf->strip_zone[cur].size;
+		conf->hash_table[i++].zone1 = conf->strip_zone + cur;
+		size -= (conf->smallest->size - zone0_size);
+	}
+	return 0;
+
+out_free_zone_conf:
+	vfree(conf->strip_zone);
+	conf->strip_zone = NULL;
+
+out_free_conf:
+	vfree(conf);
+	mddev->private = NULL;
+out:
+	MOD_DEC_USE_COUNT;
+	return 1;
 }
 
-static int raid0_stop (int minor, struct md_dev *mddev)
+static int raid0_stop (mddev_t *mddev)
 {
-	struct raid0_data *data=(struct raid0_data *) mddev->private;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
 
-	vfree (data->hash_table);
-	vfree (data->strip_zone);
-	vfree (data);
+	vfree (conf->hash_table);
+	conf->hash_table = NULL;
+	vfree (conf->strip_zone);
+	conf->strip_zone = NULL;
+	vfree (conf);
+	mddev->private = NULL;
 
-	MOD_DEC_USE_COUNT;
-	return 0;
+	MOD_DEC_USE_COUNT;
+	return 0;
 }
 
 /*
@@ -167,135 +221,140 @@
  * Of course, those facts may not be valid anymore (and surely won't...)
  * Hey guys, there's some work out there ;-)
  */
-static int raid0_map (struct md_dev *mddev, kdev_t *rdev,
+static int raid0_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
 		      unsigned long *rsector, unsigned long size)
 {
-	struct raid0_data *data=(struct raid0_data *) mddev->private;
-	static struct raid0_hash *hash;
-	struct strip_zone *zone;
-	struct real_dev *tmp_dev;
-	int blk_in_chunk, factor, chunk, chunk_size;
-	long block, rblock;
-
-	factor=FACTOR(mddev);
-	chunk_size=(1UL << FACTOR_SHIFT(factor));
-	block=*rsector >> 1;
-	hash=data->hash_table+(block/data->smallest->size);
-
-	if (hash - data->hash_table > data->nr_zones)
-	{
-		printk(KERN_DEBUG "raid0_map: invalid block %li\n", block);
-		return -1;
-	}
-
-	/* Sanity check */
-	if ((chunk_size*2)<(*rsector % (chunk_size*2))+size)
-	{
-		printk ("raid0_convert : can't convert block across chunks or bigger than %dk %ld %ld\n", chunk_size, *rsector, size);
-		return (-1);
-	}
-
-	if (block >= (hash->zone0->size +
-		      hash->zone0->zone_offset))
-	{
-		if (!hash->zone1)
-		{
-			printk ("raid0_convert : hash->zone1==NULL for block %ld\n", block);
-			return (-1);
-		}
-
-		zone=hash->zone1;
-	}
-	else
-		zone=hash->zone0;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+	struct raid0_hash *hash;
+	struct strip_zone *zone;
+	mdk_rdev_t *tmp_dev;
+	int blk_in_chunk, chunksize_bits, chunk, chunk_size;
+	long block, rblock;
+
+	chunk_size = mddev->param.chunk_size >> 10;
+	chunksize_bits = ffz(~chunk_size);
+	block = *rsector >> 1;
+	hash = conf->hash_table + block / conf->smallest->size;
+
+	if (hash - conf->hash_table > conf->nr_zones) {
+		printk(KERN_DEBUG "raid0_map: invalid block %lu\n", block);
+		return -1;
+	}
+
+	/* Sanity check */
+	if ((chunk_size * 2) < (*rsector % (chunk_size * 2)) + size)
+		goto bad_map;
+
+	if (!hash)
+		goto bad_hash;
+
+	if (!hash->zone0)
+		goto bad_zone0;
+
+	if (block >= (hash->zone0->size + hash->zone0->zone_offset)) {
+		if (!hash->zone1)
+			goto bad_zone1;
+		zone = hash->zone1;
+	} else
+		zone = hash->zone0;
 
-	blk_in_chunk=block & (chunk_size -1);
-	chunk=(block - zone->zone_offset) / (zone->nb_dev<<FACTOR_SHIFT(factor));
-	tmp_dev=zone->dev[(block >> FACTOR_SHIFT(factor)) % zone->nb_dev];
-	rblock=(chunk << FACTOR_SHIFT(factor)) + blk_in_chunk + zone->dev_offset;
+	blk_in_chunk = block & (chunk_size -1);
+	chunk = (block - zone->zone_offset) / (zone->nb_dev << chunksize_bits);
+	tmp_dev = zone->dev[(block >> chunksize_bits) % zone->nb_dev];
+	rblock = (chunk << chunksize_bits) + blk_in_chunk + zone->dev_offset;
 
-	*rdev=tmp_dev->dev;
-	*rsector=rblock<<1;
+	*rdev = tmp_dev->dev;
+	*rsector = rblock << 1;
 
-	return (0);
+	return 0;
+
+bad_map:
+	printk ("raid0_map bug: can't convert block across chunks or bigger than %dk %ld %ld\n", chunk_size, *rsector, size);
+	return -1;
+bad_hash:
+	printk("raid0_map bug: hash==NULL for block %ld\n", block);
+	return -1;
+bad_zone0:
+	printk ("raid0_map bug: hash->zone0==NULL for block %ld\n", block);
+	return -1;
+bad_zone1:
+	printk ("raid0_map bug: hash->zone1==NULL for block %ld\n", block);
+	return -1;
 }
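
[The mapping arithmetic of raid0_map() above, replayed with fixture numbers so the shifts and modulos can be checked by hand; all values are invented:]

#include <stdio.h>

int main(void)
{
	int chunksize_bits = 5;		/* 32 blocks per chunk (32k / 1k) */
	int nb_dev = 3;			/* stripe width of this zone      */
	int zone_offset = 0;		/* zone starts at logical block 0 */
	int dev_offset = 100;		/* zone starts here on each disk  */
	long block = 1000;		/* the logical block we want      */

	int blk_in_chunk = block & ((1 << chunksize_bits) - 1);
	long chunk = (block - zone_offset) / (nb_dev << chunksize_bits);
	int dev = (block >> chunksize_bits) % nb_dev;
	long rblock = (chunk << chunksize_bits) + blk_in_chunk + dev_offset;

	/* logical block 1000 -> chunk 10 on device 1, on-disk block 428 */
	printf("device %d, block %ld\n", dev, rblock);
	return 0;
}
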
 
-static int raid0_status (char *page, int minor, struct md_dev *mddev)
+static int raid0_status (char *page, mddev_t *mddev)
 {
-	int sz=0;
+	int sz = 0;
 #undef MD_DEBUG
 #ifdef MD_DEBUG
-	int j, k;
-	struct raid0_data *data=(struct raid0_data *) mddev->private;
+	int j, k;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
 
-	sz+=sprintf (page+sz, "      ");
-	for (j=0; j<data->nr_zones; j++)
-	{
-		sz+=sprintf (page+sz, "[z%d",
-				data->hash_table[j].zone0-data->strip_zone);
-		if (data->hash_table[j].zone1)
-			sz+=sprintf (page+sz, "/z%d] ",
-				data->hash_table[j].zone1-data->strip_zone);
-		else
-			sz+=sprintf (page+sz, "] ");
-	}
+	sz += sprintf(page + sz, "      ");
+	for (j = 0; j < conf->nr_zones; j++) {
+		sz += sprintf(page + sz, "[z%d",
+				conf->hash_table[j].zone0 - conf->strip_zone);
+		if (conf->hash_table[j].zone1)
+			sz += sprintf(page+sz, "/z%d] ",
+				conf->hash_table[j].zone1 - conf->strip_zone);
+		else
+			sz += sprintf(page+sz, "] ");
+	}
  
-	sz+=sprintf (page+sz, "\n");
+	sz += sprintf(page + sz, "\n");
  
-	for (j=0; j<data->nr_strip_zones; j++)
-	{
-		sz+=sprintf (page+sz, "      z%d=[", j);
-		for (k=0; k<data->strip_zone[j].nb_dev; k++)
-			sz+=sprintf (page+sz, "%s/",
-				partition_name(data->strip_zone[j].dev[k]->dev));
-		sz--;
-		sz+=sprintf (page+sz, "] zo=%d do=%d s=%d\n",
-				data->strip_zone[j].zone_offset,
-				data->strip_zone[j].dev_offset,
-				data->strip_zone[j].size);
-	}
+	for (j = 0; j < conf->nr_strip_zones; j++) {
+		sz += sprintf(page + sz, "      z%d=[", j);
+		for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
+			sz += sprintf (page+sz, "%s/", partition_name(
+				conf->strip_zone[j].dev[k]->dev));
+		sz--;
+		sz += sprintf (page+sz, "] zo=%d do=%d s=%d\n",
+				conf->strip_zone[j].zone_offset,
+				conf->strip_zone[j].dev_offset,
+				conf->strip_zone[j].size);
+	}
 #endif
-	sz+=sprintf (page+sz, " %dk chunks", 1<<FACTOR_SHIFT(FACTOR(mddev)));
+	sz += sprintf (page + sz, " %dk chunks", mddev->param.chunk_size/1024);
 	return sz;
 }
 
-static struct md_personality raid0_personality=
+static mdk_personality_t raid0_personality=
 {
-	"raid0",
-	raid0_map,
-	NULL,				/* no special make_request */
-	NULL,				/* no special end_request */
-	raid0_run,
-	raid0_stop,
-	raid0_status,
-	NULL,				/* no ioctls */
-	0,
-	NULL,				/* no error_handler */
-	NULL,				/* hot_add_disk */
-	NULL,				/* hot_remove_disk */
-	NULL				/* mark_spare */
+	"raid0",
+	raid0_map,
+	NULL,			/* no special make_request */
+	NULL,			/* no special end_request */
+	raid0_run,
+	raid0_stop,
+	raid0_status,
+	NULL,			/* no ioctls */
+	0,
+	NULL,			/* no error_handler */
+	NULL,			/* no diskop */
+	NULL,			/* no stop resync */
+	NULL			/* no restart resync */
 };
 
-
 #ifndef MODULE
 
 void raid0_init (void)
 {
-	register_md_personality (RAID0, &raid0_personality);
+	register_md_personality (RAID0, &raid0_personality);
 }
 
 #else
 
 int init_module (void)
 {
-	return (register_md_personality (RAID0, &raid0_personality));
+	return (register_md_personality (RAID0, &raid0_personality));
 }
 
 void cleanup_module (void)
 {
-	unregister_md_personality (RAID0);
+	unregister_md_personality (RAID0);
 }
 
 #endif
+
diff -urN 2.2.18/drivers/block/raid1.c 2.2.18aa1/drivers/block/raid1.c
--- 2.2.18/drivers/block/raid1.c	Mon Dec 11 16:57:48 2000
+++ 2.2.18aa1/drivers/block/raid1.c	Mon Dec 11 17:20:55 2000
@@ -1,6 +1,6 @@
-/************************************************************************
+/*
  * raid1.c : Multiple Devices driver for Linux
- *           Copyright (C) 1996 Ingo Molnar, Miguel de Icaza, Gadi Oxman
+ *           Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
  *
  * RAID-1 management functions.
  *
@@ -15,50 +15,55 @@
  */
 
 #include 
-#include 
 #include 
-#include 
-#include 
-#include 
+#include 
 #include 
 
 #define MAJOR_NR MD_MAJOR
 #define MD_DRIVER
 #define MD_PERSONALITY
 
-/*
- * The following can be used to debug the driver
- */
-/*#define RAID1_DEBUG*/
-#ifdef RAID1_DEBUG
-#define PRINTK(x)   do { printk x; } while (0);
-#else
-#define PRINTK(x)   do { ; } while (0);
-#endif
+#define MAX_LINEAR_SECTORS 128
 
 #define MAX(a,b)	((a) > (b) ? (a) : (b))
 #define MIN(a,b)	((a) < (b) ? (a) : (b))
 
-static struct md_personality raid1_personality;
-static struct md_thread *raid1_thread = NULL;
+static mdk_personality_t raid1_personality;
 struct buffer_head *raid1_retry_list = NULL;
 
-static int __raid1_map (struct md_dev *mddev, kdev_t *rdev,
+static void * raid1_kmalloc (int size)
+{
+	void * ptr;
+	/*
+	 * now we'd rather be fault tolerant than nice, but
+	 * there are a couple of places in the RAID code where we
+	 * simply can not afford to fail an allocation because
+	 * there is no failure return path (e.g. make_request())
+	 */
+	while (!(ptr = kmalloc (size, GFP_BUFFER))) {
+		printk ("raid1: out of memory, retrying...\n");
+		current->state = TASK_UNINTERRUPTIBLE;
+		schedule_timeout(HZ/10);
+	}
+
+	memset(ptr, 0, size);
+	return ptr;
+}
+
+static int __raid1_map (mddev_t *mddev, kdev_t *rdev,
 			unsigned long *rsector, unsigned long size)
 {
-	struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
-	int i, n = raid_conf->raid_disks;
+	raid1_conf_t *conf = mddev_to_conf(mddev);
+	int i, disks = MD_SB_DISKS;
 
 	/*
 	 * Later we do read balancing on the read side
 	 * now we use the first available disk.
 	 */
 
-	PRINTK(("raid1_map().\n"));
-
-	for (i=0; i<n; i++)
-		if (raid_conf->mirrors[i].operational) {
-			*rdev = raid_conf->mirrors[i].dev;
+	for (i = 0; i < disks; i++) {
+		if (conf->mirrors[i].operational) {
+			*rdev = conf->mirrors[i].dev;
 			return (0);
 		}
+	}
@@ -67,29 +72,29 @@
 	return (-1);
 }
 
-static int raid1_map (struct md_dev *mddev, kdev_t *rdev,
+static int raid1_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
 		      unsigned long *rsector, unsigned long size)
 {
 	return 0;
 }
 
-void raid1_reschedule_retry (struct buffer_head *bh)
+static void raid1_reschedule_retry (struct buffer_head *bh)
 {
 	struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
-
-	PRINTK(("raid1_reschedule_retry().\n"));
+	mddev_t *mddev = r1_bh->mddev;
+	raid1_conf_t *conf = mddev_to_conf(mddev);
 
 	r1_bh->next_retry = raid1_retry_list;
 	raid1_retry_list = bh;
-	md_wakeup_thread(raid1_thread);
+	md_wakeup_thread(conf->thread);
 }
 
 /*
- * raid1_end_buffer_io() is called when we have finished servicing a mirrored
+ * raid1_end_bh_io() is called when we have finished servicing a mirrored
  * operation and are ready to return a success/failure code to the buffer
  * cache layer.
  */
-static inline void raid1_end_buffer_io(struct raid1_bh *r1_bh, int uptodate)
+static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
 {
 	struct buffer_head *bh = r1_bh->master_bh;
 
@@ -97,8 +102,6 @@
 	kfree(r1_bh);
 }
 
-int raid1_one_error=0;
-
 void raid1_end_request (struct buffer_head *bh, int uptodate)
 {
 	struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
@@ -106,17 +109,12 @@
 
 	save_flags(flags);
 	cli();
-	PRINTK(("raid1_end_request().\n"));
-	if (raid1_one_error) {
-		raid1_one_error=0;
-		uptodate=0;
-	}
 
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
 	if (!uptodate)
-		md_error (bh->b_dev, bh->b_rdev);
+		md_error (r1_bh->mddev, bh->b_rdev);
 	else {
 		/*
 		 * Set BH_Uptodate in our master buffer_head, so that
@@ -136,15 +134,11 @@
 	 */
 
 	if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
-
-		PRINTK(("raid1_end_request(), read branch.\n"));
-
 		/*
 		 * we have only one buffer_head on the read side
 		 */
 		if (uptodate) {
-			PRINTK(("raid1_end_request(), read branch, uptodate.\n"));
-			raid1_end_buffer_io(r1_bh, uptodate);
+			raid1_end_bh_io(r1_bh, uptodate);
 			restore_flags(flags);
 			return;
 		}
@@ -152,71 +146,55 @@
 		 * oops, read error:
 		 */
 		printk(KERN_ERR "raid1: %s: rescheduling block %lu\n", 
-			 kdevname(bh->b_dev), bh->b_blocknr);
-		raid1_reschedule_retry (bh);
+			 partition_name(mddev_to_kdev(r1_bh->mddev)), bh->b_blocknr);
+		raid1_reschedule_retry(bh);
 		restore_flags(flags);
 		return;
 	}
 
 	/*
-	 * WRITE or WRITEA.
-	 */
-	PRINTK(("raid1_end_request(), write branch.\n"));
-
-	/*
+	 * WRITE:
+	 *
 	 * Let's see if all mirrored write operations have finished
-	 * already [we have irqs off, so we can decrease]:
+	 * already.
 	 */
 
-	if (!--r1_bh->remaining) {
-		struct md_dev *mddev = r1_bh->mddev;
-		struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
-		int i, n = raid_conf->raid_disks;
-
-		PRINTK(("raid1_end_request(), remaining == 0.\n"));
+	if (atomic_dec_and_test(&r1_bh->remaining)) {
+		int i, disks = MD_SB_DISKS;
 
-		for ( i=0; i<n; i++)
-			if (r1_bh->mirror_bh[i]) kfree(r1_bh->mirror_bh[i]);
+		for ( i = 0; i < disks; i++)
+			if (r1_bh->mirror_bh[i])
+				kfree(r1_bh->mirror_bh[i]);
 
-		raid1_end_buffer_io(r1_bh, test_bit(BH_Uptodate, &r1_bh->state));
+		raid1_end_bh_io(r1_bh, test_bit(BH_Uptodate, &r1_bh->state));
 	}
-	else PRINTK(("raid1_end_request(), remaining == %u.\n", r1_bh->remaining));
 	restore_flags(flags);
 }
 
-/* This routine checks if the undelying device is an md device and in that
- * case it maps the blocks before putting the request on the queue
+/*
+ * This routine checks if the underlying device is an md device
+ * and in that case it maps the blocks before putting the
+ * request on the queue
  */
-static inline void
-map_and_make_request (int rw, struct buffer_head *bh)
+static void map_and_make_request (int rw, struct buffer_head *bh)
 {
 	if (MAJOR (bh->b_rdev) == MD_MAJOR)
-		md_map (MINOR (bh->b_rdev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9);
+		md_map (bh->b_rdev, &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9, rw);
 	clear_bit(BH_Lock, &bh->b_state);
 	make_request (MAJOR (bh->b_rdev), rw, bh);
 }
 
-static int
-raid1_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh)
+static int raid1_make_request (mddev_t *mddev, int rw,
+			       struct buffer_head * bh)
 {
-
-	struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
+	raid1_conf_t *conf = mddev_to_conf(mddev);
 	struct buffer_head *mirror_bh[MD_SB_DISKS], *bh_req;
 	struct raid1_bh * r1_bh;
-	int n = raid_conf->raid_disks, i, sum_bhs = 0, switch_disks = 0, sectors;
+	int disks = MD_SB_DISKS;
+	int i, sum_bhs = 0, switch_disks = 0, sectors, lowprio = 0;
 	struct mirror_info *mirror;
 
-	PRINTK(("raid1_make_request().\n"));
-
-	while (!( /* FIXME: now we are rather fault tolerant than nice */
-	r1_bh = kmalloc (sizeof (struct raid1_bh), GFP_BUFFER)
-	) )
-	{
-		printk ("raid1_make_request(#1): out of memory\n");
-		current->policy |= SCHED_YIELD;
-		schedule();
-	}
-	memset (r1_bh, 0, sizeof (struct raid1_bh));
+	r1_bh = raid1_kmalloc (sizeof (struct raid1_bh));
 
 	/*
 	 * make_request() can abort the operation when READA or WRITEA are being
@@ -227,43 +205,65 @@
 	if (rw == READA) rw = READ;
 	if (rw == WRITEA) rw = WRITE;
 
-	if (rw == WRITE || rw == WRITEA)
-		mark_buffer_clean(bh);		/* Too early ? */
+	if (rw == WRITE) {
+		/*
+		 * Too early ?
+		 */
+		mark_buffer_clean(bh);
+		/*
+		 * not too early. we _first_ clean the bh, then we start
+		 * the IO, then when the IO has finished, we unlock the
+		 * bh and mark it uptodate. This way we do not miss the
+		 * case when the bh got dirty again during the IO.
+		 */
+	}
+
+	/*
+	 * special flag for 'lowprio' reconstruction requests ...
+	 */
+	if (buffer_lowprio(bh))
+		lowprio = 1;
 
 	/*
-	 * i think the read and write branch should be separated completely, since we want
-	 * to do read balancing on the read side for example. Comments? :) --mingo
+	 * I think the read and write branch should be separated completely,
+	 * since we want to do read balancing on the read side for example.
+	 * Comments? :) --mingo
	 */
 
 	r1_bh->master_bh=bh;
 	r1_bh->mddev=mddev;
 	r1_bh->cmd = rw;
 
-	if (rw==READ || rw==READA) {
-		int last_used = raid_conf->last_used;
-		PRINTK(("raid1_make_request(), read branch.\n"));
-		mirror = raid_conf->mirrors + last_used;
+	if (rw==READ) {
+		int last_used = conf->last_used;
+
+		/*
+		 * read balancing logic:
+		 */
+		mirror = conf->mirrors + last_used;
 		bh->b_rdev = mirror->dev;
 		sectors = bh->b_size >> 9;
-		if (bh->b_blocknr * sectors == raid_conf->next_sect) {
-			raid_conf->sect_count += sectors;
-			if (raid_conf->sect_count >= mirror->sect_limit)
+
+		if (bh->b_blocknr * sectors == conf->next_sect) {
+			conf->sect_count += sectors;
+			if (conf->sect_count >= mirror->sect_limit)
 				switch_disks = 1;
 		} else
 			switch_disks = 1;
-		raid_conf->next_sect = (bh->b_blocknr + 1) * sectors;
-		if (switch_disks) {
-			PRINTK(("read-balancing: switching %d -> %d (%d sectors)\n", last_used, mirror->next, raid_conf->sect_count));
-			raid_conf->sect_count = 0;
-			last_used = raid_conf->last_used = mirror->next;
+		conf->next_sect = (bh->b_blocknr + 1) * sectors;
+		/*
+		 * Do not switch disks if full resync is in progress ...
+		 */
+		if (switch_disks && !conf->resync_mirrors) {
+			conf->sect_count = 0;
+			last_used = conf->last_used = mirror->next;
 			/*
-			 * Do not switch to write-only disks ... resyncing
-			 * is in progress
+			 * Do not switch to write-only disks ...
+			 * reconstruction is in progress
 			 */
-			while (raid_conf->mirrors[last_used].write_only)
-				raid_conf->last_used = raid_conf->mirrors[last_used].next;
+			while (conf->mirrors[last_used].write_only)
+				conf->last_used = conf->mirrors[last_used].next;
 		}
-		PRINTK (("raid1 read queue: %d %d\n", MAJOR (bh->b_rdev), MINOR (bh->b_rdev)));
 		bh_req = &r1_bh->bh_req;
 		memcpy(bh_req, bh, sizeof(*bh));
 		bh_req->b_end_io = raid1_end_request;
@@ -273,13 +273,12 @@
 	}
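
[The read-balancing branch above, modelled in user space; the mirror ring, the limits and all names are invented for the example:]

#include <stdio.h>

#define MIRRORS 2

static int next_disk[MIRRORS] = {1, 0};	/* ring of operational mirrors */
static int last_used, sect_count;
static long next_sect = -1;

static int pick_mirror(long sector, int sectors, int sect_limit)
{
	int switch_disks = 0;

	if (sector == next_sect) {		/* sequential continuation */
		sect_count += sectors;
		if (sect_count >= sect_limit)
			switch_disks = 1;
	} else
		switch_disks = 1;

	next_sect = sector + sectors;
	if (switch_disks) {
		sect_count = 0;
		last_used = next_disk[last_used];
	}
	return last_used;
}

int main(void)
{
	long s;
	/* a sequential run rotates mirrors every sect_limit sectors */
	for (s = 0; s < 64; s += 8)
		printf("sector %2ld -> mirror %d\n", s, pick_mirror(s, 8, 32));
	return 0;
}
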
 
 	/*
-	 * WRITE or WRITEA.
+	 * WRITE:
 	 */
-	PRINTK(("raid1_make_request(n=%d), write branch.\n",n));
 
-	for (i = 0; i < n; i++) {
+	for (i = 0; i < disks; i++) {
 
-		if (!raid_conf->mirrors [i].operational) {
+		if (!conf->mirrors[i].operational) {
 			/*
 			 * the r1_bh->mirror_bh[i] pointer remains NULL
 			 */
@@ -287,89 +286,91 @@
 			continue;
 		}
 
+		/*
+		 * special case for reconstruction ...
+		 */
+		if (lowprio && (i == conf->last_used)) {
+			mirror_bh[i] = NULL;
+			continue;
+		}
+
+		/*
+		 * We should use a private pool (size depending on NR_REQUEST),
+		 * to avoid writes filling up the memory with bhs
+		 *
+		 * Such pools are much faster than kmalloc anyways (so we waste
+		 * almost nothing by not using the master bh when writing and
+		 * win a lot of cleanness) but for now we are cool enough. --mingo
+		 *
+		 * It's safe to sleep here, buffer heads cannot be used in a shared
+		 * manner in the write branch. Look how we lock the buffer at the
+		 * beginning of this function to grok the difference ;)
+		 */
+		mirror_bh[i] = raid1_kmalloc(sizeof(struct buffer_head));
+		/*
+		 * prepare mirrored bh (fields ordered for max mem throughput):
+		 */
+		mirror_bh[i]->b_blocknr    = bh->b_blocknr;
+		mirror_bh[i]->b_dev        = bh->b_dev;
+		mirror_bh[i]->b_rdev	   = conf->mirrors[i].dev;
+		mirror_bh[i]->b_rsector    = bh->b_rsector;
+		mirror_bh[i]->b_state      = (1<<BH_Req) | (1<<BH_Dirty);
+		if (lowprio)
+			mirror_bh[i]->b_state |= (1<<BH_LowPrio);
+		mirror_bh[i]->b_count      = 1;
+		mirror_bh[i]->b_size       = bh->b_size;
+		mirror_bh[i]->b_data       = bh->b_data;
+		mirror_bh[i]->b_list       = BUF_LOCKED;
+		mirror_bh[i]->b_end_io     = raid1_end_request;
+		mirror_bh[i]->b_dev_id     = r1_bh;
+
+		r1_bh->mirror_bh[i] = mirror_bh[i];
+		sum_bhs++;
+	}
+
+	md_atomic_set(&r1_bh->remaining, sum_bhs);
+
 	/*
-	 * We should use a private pool (size depending on NR_REQUEST),
-	 * to avoid writes filling up the memory with bhs
-	 *
-	 * Such pools are much faster than kmalloc anyways (so we waste almost
-	 * nothing by not using the master bh when writing and win alot of cleanness)
-	 *
-	 * but for now we are cool enough. --mingo
-	 *
-	 * It's safe to sleep here, buffer heads cannot be used in a shared
-	 * manner in the write branch. Look how we lock the buffer at the beginning
-	 * of this function to grok the difference ;)
-	 */
-		while (!( /* FIXME: now we are rather fault tolerant than nice */
-		mirror_bh[i] = kmalloc (sizeof (struct buffer_head), GFP_BUFFER)
-		) )
-		{
-			printk ("raid1_make_request(#2): out of memory\n");
-			current->policy |= SCHED_YIELD;
-			schedule();
-		}
-		memset (mirror_bh[i], 0, sizeof (struct buffer_head));
-
-	/*
-	 * prepare mirrored bh (fields ordered for max mem throughput):
-	 */
-		mirror_bh [i]->b_blocknr    = bh->b_blocknr;
-		mirror_bh [i]->b_dev        = bh->b_dev;
-		mirror_bh [i]->b_rdev 	    = raid_conf->mirrors [i].dev;
-		mirror_bh [i]->b_rsector    = bh->b_rsector;
-		mirror_bh [i]->b_state      = (1<<BH_Req) | (1<<BH_Dirty);
-		mirror_bh [i]->b_count      = 1;
-		mirror_bh [i]->b_size       = bh->b_size;
-		mirror_bh [i]->b_data       = bh->b_data;
-		mirror_bh [i]->b_list       = BUF_LOCKED;
-		mirror_bh [i]->b_end_io     = raid1_end_request;
-		mirror_bh [i]->b_dev_id     = r1_bh;
-
-		r1_bh->mirror_bh[i] = mirror_bh[i];
-		sum_bhs++;
-	}
-
-	r1_bh->remaining = sum_bhs;
-
-	PRINTK(("raid1_make_request(), write branch, sum_bhs=%d.\n",sum_bhs));
-
-	/*
-	 * We have to be a bit careful about the semaphore above, thats why we
-	 * start the requests separately. Since kmalloc() could fail, sleep and
-	 * make_request() can sleep too, this is the safer solution. Imagine,
-	 * end_request decreasing the semaphore before we could have set it up ...
-	 * We could play tricks with the semaphore (presetting it and correcting
-	 * at the end if sum_bhs is not 'n' but we have to do end_request by hand
-	 * if all requests finish until we had a chance to set up the semaphore
-	 * correctly ... lots of races).
-	 */
-	for (i = 0; i < n; i++)
-		if (mirror_bh [i] != NULL)
-			map_and_make_request (rw, mirror_bh [i]);
+	 * We have to be a bit careful about the semaphore above, that's
+	 * why we start the requests separately. Since kmalloc() could
+	 * fail, sleep and make_request() can sleep too, this is the
+	 * safer solution. Imagine, end_request decreasing the semaphore
+	 * before we could have set it up ... We could play tricks with
+	 * the semaphore (presetting it and correcting at the end if
+	 * sum_bhs is not 'n' but we have to do end_request by hand if
+	 * all requests finish until we had a chance to set up the
+	 * semaphore correctly ... lots of races).
+	 */
+	for (i = 0; i < disks; i++)
+		if (mirror_bh[i])
+			map_and_make_request(rw, mirror_bh[i]);
 
 	return (0);
 }
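
[The write-completion scheme above relies on a shared countdown; here it is in isolation, using C11 atomics in place of the kernel's md_atomic_* wrappers, names invented:]

#include <stdatomic.h>
#include <stdio.h>

static atomic_int remaining;

static void end_one_write(int mirror)
{
	/* only the decrement that reaches zero finishes the master bh,
	 * so no lock is needed around the final decision */
	if (atomic_fetch_sub(&remaining, 1) == 1)
		printf("mirror %d was last: complete master bh\n", mirror);
	else
		printf("mirror %d done, still waiting\n", mirror);
}

int main(void)
{
	atomic_store(&remaining, 3);	/* three mirrored writes issued */
	end_one_write(0);
	end_one_write(2);
	end_one_write(1);		/* this one completes the master */
	return 0;
}
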
 
-static int raid1_status (char *page, int minor, struct md_dev *mddev)
+static int raid1_status (char *page, mddev_t *mddev)
 {
-	struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
+	raid1_conf_t *conf = mddev_to_conf(mddev);
 	int sz = 0, i;
 
-	sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks);
-	for (i = 0; i < raid_conf->raid_disks; i++)
-		sz += sprintf (page+sz, "%s", raid_conf->mirrors [i].operational ? "U" : "_");
+	sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
+						 conf->working_disks);
+	for (i = 0; i < conf->raid_disks; i++)
+		sz += sprintf (page+sz, "%s",
+			conf->mirrors[i].operational ? "U" : "_");
 	sz += sprintf (page+sz, "]");
 	return sz;
 }
 
-static void raid1_fix_links (struct raid1_data *raid_conf, int failed_index)
+static void unlink_disk (raid1_conf_t *conf, int target)
 {
-	int disks = raid_conf->raid_disks;
-	int j;
+	int disks = MD_SB_DISKS;
+	int i;
 
-	for (j = 0; j < disks; j++)
-		if (raid_conf->mirrors [j].next == failed_index)
-			raid_conf->mirrors [j].next = raid_conf->mirrors [failed_index].next;
+	for (i = 0; i < disks; i++)
+		if (conf->mirrors[i].next == target)
+			conf->mirrors[i].next = conf->mirrors[target].next;
 }
 
 #define LAST_DISK KERN_ALERT \
@@ -388,48 +389,53 @@
 #define ALREADY_SYNCING KERN_INFO \
 "raid1: syncing already in progress.\n"
 
-static int raid1_error (struct md_dev *mddev, kdev_t dev)
+static void mark_disk_bad (mddev_t *mddev, int failed)
 {
-	struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
-	struct mirror_info *mirror;
-	md_superblock_t *sb = mddev->sb;
-	int disks = raid_conf->raid_disks;
-	int i;
+	raid1_conf_t *conf = mddev_to_conf(mddev);
+	struct mirror_info *mirror = conf->mirrors+failed;
+	mdp_super_t *sb = mddev->sb;
+
+	mirror->operational = 0;
+	unlink_disk(conf, failed);
+	mark_disk_faulty(sb->disks+mirror->number);
+	mark_disk_nonsync(sb->disks+mirror->number);
+	mark_disk_inactive(sb->disks+mirror->number);
+	sb->active_disks--;
+	sb->working_disks--;
+	sb->failed_disks++;
+	mddev->sb_dirty = 1;
+	md_wakeup_thread(conf->thread);
+	conf->working_disks--;
+	printk (DISK_FAILED, partition_name (mirror->dev),
+				 conf->working_disks);
+}
 
-	PRINTK(("raid1_error called\n"));
+static int raid1_error (mddev_t *mddev, kdev_t dev)
+{
+	raid1_conf_t *conf = mddev_to_conf(mddev);
+	struct mirror_info * mirrors = conf->mirrors;
+	int disks = MD_SB_DISKS;
+	int i;
 
-	if (raid_conf->working_disks == 1) {
+	if (conf->working_disks == 1) {
 		/*
 		 * Uh oh, we can do nothing if this is our last disk, but
 		 * first check if this is a queued request for a device
 		 * which has just failed.
 		 */
-		for (i = 0, mirror = raid_conf->mirrors; i < disks;
-				 i++, mirror++)
-			if (mirror->dev == dev && !mirror->operational)
+		for (i = 0; i < disks; i++) {
+			if (mirrors[i].dev==dev && !mirrors[i].operational)
 				return 0;
+		}
 		printk (LAST_DISK);
 	} else {
-		/* Mark disk as unusable */
-		for (i = 0, mirror = raid_conf->mirrors; i < disks;
-				 i++, mirror++) {
-			if (mirror->dev == dev && mirror->operational){
-				mirror->operational = 0;
-				raid1_fix_links (raid_conf, i);
-				sb->disks[mirror->number].state |=
-						(1 << MD_FAULTY_DEVICE);
-				sb->disks[mirror->number].state &=
-						~(1 << MD_SYNC_DEVICE);
-				sb->disks[mirror->number].state &=
-						~(1 << MD_ACTIVE_DEVICE);
-				sb->active_disks--;
-				sb->working_disks--;
-				sb->failed_disks++;
-				mddev->sb_dirty = 1;
-				md_wakeup_thread(raid1_thread);
-				raid_conf->working_disks--;
-				printk (DISK_FAILED, kdevname (dev),
-						raid_conf->working_disks);
+		/*
+		 * Mark disk as unusable
+		 */
+		for (i = 0; i < disks; i++) {
+			if (mirrors[i].dev==dev && mirrors[i].operational) {
+				mark_disk_bad (mddev, i);
+				break;
 			}
 		}
 	}
@@ -442,219 +448,396 @@
 #undef START_SYNCING
 
 /*
- * This is the personality-specific hot-addition routine
+ * Insert the spare disk into the drive-ring
  */
+static void link_disk(raid1_conf_t *conf, struct mirror_info *mirror)
+{
+	int j, next;
+	int disks = MD_SB_DISKS;
+	struct mirror_info *p = conf->mirrors;
 
-#define NO_SUPERBLOCK KERN_ERR \
-"raid1: cannot hot-add disk to the array with no RAID superblock\n"
+	for (j = 0; j < disks; j++, p++)
+		if (p->operational && !p->write_only) {
+			next = p->next;
+			p->next = mirror->raid_disk;
+			mirror->next = next;
+			return;
+		}
 
-#define WRONG_LEVEL KERN_ERR \
-"raid1: hot-add: level of disk is not RAID-1\n"
+	printk("raid1: bug: no read-operational devices\n");
+}
 
-#define HOT_ADD_SUCCEEDED KERN_INFO \
-"raid1: device %s hot-added\n"
+static void print_raid1_conf (raid1_conf_t *conf)
+{
+	int i;
+	struct mirror_info *tmp;
 
-static int raid1_hot_add_disk (struct md_dev *mddev, kdev_t dev)
+	printk("RAID1 conf printout:\n");
+	if (!conf) {
+		printk("(conf==NULL)\n");
+		return;
+	}
+	printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
+			 conf->raid_disks, conf->nr_disks);
+
+	for (i = 0; i < MD_SB_DISKS; i++) {
+		tmp = conf->mirrors + i;
+		printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
+			i, tmp->spare,tmp->operational,
+			tmp->number,tmp->raid_disk,tmp->used_slot,
+			partition_name(tmp->dev));
+	}
+}
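
[The DISKOP_SPARE_ACTIVE case below performs a double swap that is worth seeing in isolation. A toy version with plain structs; xchg_values is reimplemented with a GCC statement macro and the field set is reduced to three members:]

#include <stdio.h>

#define xchg_values(x, y) \
	do { __typeof__(x) __tmp = x; x = y; y = __tmp; } while (0)

struct disk { int number; int raid_disk; int faulty; };

int main(void)
{
	struct disk failed = { .number = 0, .raid_disk = 0, .faulty = 1 };
	struct disk spare  = { .number = 2, .raid_disk = 2, .faulty = 0 };

	xchg_values(spare, failed);			/* swap the slots     */
	xchg_values(spare.raid_disk, failed.raid_disk);	/* keep positions     */
	xchg_values(spare.number, failed.number);	/* keep numbering     */

	/* the active slot now holds the healthy disk, the spare slot
	 * parks the failed hardware */
	printf("active slot: number=%d faulty=%d\n",
		failed.number, failed.faulty);
	return 0;
}
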
*/ + switch (state) { - n = mddev->nb_dev++; - realdev = &mddev->devices[n]; - if (!realdev->sb) { - printk (NO_SUPERBLOCK); - return -EINVAL; - } - if (realdev->sb->level != 1) { - printk (WRONG_LEVEL); - return -EINVAL; + case DISKOP_SPARE_ACTIVE: + + /* + * Find the failed disk within the RAID1 configuration ... + * (this can only be in the first conf->working_disks part) + */ + for (i = 0; i < conf->raid_disks; i++) { + tmp = conf->mirrors + i; + if ((!tmp->operational && !tmp->spare) || + !tmp->used_slot) { + failed_disk = i; + break; + } + } + /* + * When we activate a spare disk we _must_ have a disk in + * the lower (active) part of the array to replace. + */ + if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + /* fall through */ + + case DISKOP_SPARE_WRITE: + case DISKOP_SPARE_INACTIVE: + + /* + * Find the spare disk ... (can only be in the 'high' + * area of the array) + */ + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (tmp->spare && tmp->number == (*d)->number) { + spare_disk = i; + break; + } + } + if (spare_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_REMOVE_DISK: + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (tmp->used_slot && (tmp->number == (*d)->number)) { + if (tmp->operational) { + err = -EBUSY; + goto abort; + } + removed_disk = i; + break; + } + } + if (removed_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_ADD_DISK: + + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (!tmp->used_slot) { + added_disk = i; + break; + } + } + if (added_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; } - /* FIXME: are there other things left we could sanity-check? */ + switch (state) { /* - * We have to disable interrupts, as our RAID-1 state is used - * from irq handlers as well. + * Switch the spare disk to write-only mode: */ - save_flags(flags); - cli(); + case DISKOP_SPARE_WRITE: + sdisk = conf->mirrors + spare_disk; + sdisk->operational = 1; + sdisk->write_only = 1; + break; + /* + * Deactivate a spare disk: + */ + case DISKOP_SPARE_INACTIVE: + sdisk = conf->mirrors + spare_disk; + sdisk->operational = 0; + sdisk->write_only = 0; + break; + /* + * Activate (mark read-write) the (now sync) spare disk, + * which means we switch it's 'raid position' (->raid_disk) + * with the failed disk. 
(only the first 'conf->nr_disks' + * slots are used for 'real' disks and we must preserve this + * property) + */ + case DISKOP_SPARE_ACTIVE: - raid_conf->raid_disks++; - mirror = raid_conf->mirrors+n; + sdisk = conf->mirrors + spare_disk; + fdisk = conf->mirrors + failed_disk; - mirror->number=n; - mirror->raid_disk=n; - mirror->dev=dev; - mirror->next=0; /* FIXME */ - mirror->sect_limit=128; - - mirror->operational=0; - mirror->spare=1; - mirror->write_only=0; - - sb->disks[n].state |= (1 << MD_FAULTY_DEVICE); - sb->disks[n].state &= ~(1 << MD_SYNC_DEVICE); - sb->disks[n].state &= ~(1 << MD_ACTIVE_DEVICE); - sb->nr_disks++; - sb->spare_disks++; + spare_desc = &sb->disks[sdisk->number]; + failed_desc = &sb->disks[fdisk->number]; - restore_flags(flags); + if (spare_desc != *d) { + MD_BUG(); + err = 1; + goto abort; + } - md_update_sb(MINOR(dev)); + if (spare_desc->raid_disk != sdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (sdisk->raid_disk != spare_disk) { + MD_BUG(); + err = 1; + goto abort; + } - printk (HOT_ADD_SUCCEEDED, kdevname(realdev->dev)); + if (failed_desc->raid_disk != fdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } - return 0; -} + if (fdisk->raid_disk != failed_disk) { + MD_BUG(); + err = 1; + goto abort; + } -#undef NO_SUPERBLOCK -#undef WRONG_LEVEL -#undef HOT_ADD_SUCCEEDED + /* + * do the switch finally + */ + xchg_values(*spare_desc, *failed_desc); + xchg_values(*fdisk, *sdisk); -/* - * Insert the spare disk into the drive-ring - */ -static void add_ring(struct raid1_data *raid_conf, struct mirror_info *mirror) -{ - int j, next; - struct mirror_info *p = raid_conf->mirrors; + /* + * (careful, 'failed' and 'spare' are switched from now on) + * + * we want to preserve linear numbering and we want to + * give the proper raid_disk number to the now activated + * disk. (this means we switch back these values) + */ + + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk); + xchg_values(sdisk->raid_disk, fdisk->raid_disk); + xchg_values(spare_desc->number, failed_desc->number); + xchg_values(sdisk->number, fdisk->number); - for (j = 0; j < raid_conf->raid_disks; j++, p++) - if (p->operational && !p->write_only) { - next = p->next; - p->next = mirror->raid_disk; - mirror->next = next; - return; - } - printk("raid1: bug: no read-operational devices\n"); -} + *d = failed_desc; -static int raid1_mark_spare(struct md_dev *mddev, md_descriptor_t *spare, - int state) -{ - int i = 0, failed_disk = -1; - struct raid1_data *raid_conf = mddev->private; - struct mirror_info *mirror = raid_conf->mirrors; - md_descriptor_t *descriptor; - unsigned long flags; + if (sdisk->dev == MKDEV(0,0)) + sdisk->used_slot = 0; + /* + * this really activates the spare. + */ + fdisk->spare = 0; + fdisk->write_only = 0; + link_disk(conf, fdisk); - for (i = 0; i < MD_SB_DISKS; i++, mirror++) { - if (mirror->spare && mirror->number == spare->number) - goto found; - } - return 1; -found: - for (i = 0, mirror = raid_conf->mirrors; i < raid_conf->raid_disks; - i++, mirror++) - if (!mirror->operational) - failed_disk = i; + /* + * if we activate a spare, we definitely replace a + * non-operational disk slot in the 'low' area of + * the disk array. 
+	 */
-	save_flags(flags);
-	cli();
-	switch (state) {
-	case SPARE_WRITE:
-		mirror->operational = 1;
-		mirror->write_only = 1;
-		raid_conf->raid_disks = MAX(raid_conf->raid_disks,
-						mirror->raid_disk + 1);
-		break;
-	case SPARE_INACTIVE:
-		mirror->operational = 0;
-		mirror->write_only = 0;
-		break;
-	case SPARE_ACTIVE:
-		mirror->spare = 0;
-		mirror->write_only = 0;
-		raid_conf->working_disks++;
-		add_ring(raid_conf, mirror);
-
-		if (failed_disk != -1) {
-			descriptor = &mddev->sb->disks[raid_conf->mirrors[failed_disk].number];
-			i = spare->raid_disk;
-			spare->raid_disk = descriptor->raid_disk;
-			descriptor->raid_disk = i;
-		}
-		break;
-	default:
-		printk("raid1_mark_spare: bug: state == %d\n", state);
-		restore_flags(flags);
-		return 1;
+		conf->working_disks++;
+
+		break;
+
+	case DISKOP_HOT_REMOVE_DISK:
+		rdisk = conf->mirrors + removed_disk;
+
+		if (rdisk->spare && (removed_disk < conf->raid_disks)) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+		rdisk->dev = MKDEV(0,0);
+		rdisk->used_slot = 0;
+		conf->nr_disks--;
+		break;
+
+	case DISKOP_HOT_ADD_DISK:
+		adisk = conf->mirrors + added_disk;
+		added_desc = *d;
+
+		if (added_disk != added_desc->number) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+
+		adisk->number = added_desc->number;
+		adisk->raid_disk = added_desc->raid_disk;
+		adisk->dev = MKDEV(added_desc->major,added_desc->minor);
+
+		adisk->operational = 0;
+		adisk->write_only = 0;
+		adisk->spare = 1;
+		adisk->used_slot = 1;
+		conf->nr_disks++;
+
+		break;
+
+	default:
+		MD_BUG();
+		err = 1;
+		goto abort;
 	}
+abort:
 	restore_flags(flags);
-	return 0;
+	print_raid1_conf(conf);
+	return err;
 }
+
+#define IO_ERROR KERN_ALERT \
+"raid1: %s: unrecoverable I/O read error for block %lu\n"
+
+#define REDIRECT_SECTOR KERN_ERR \
+"raid1: %s: redirecting sector %lu to another mirror\n"
+
 /*
  * This is a kernel thread which:
  *
  * 1. Retries failed read operations on working mirrors.
  * 2. Updates the raid superblock when problems are encountered.
  */
-void raid1d (void *data)
+static void raid1d (void *data)
 {
 	struct buffer_head *bh;
 	kdev_t dev;
 	unsigned long flags;
-	struct raid1_bh * r1_bh;
-	struct md_dev *mddev;
+	struct raid1_bh *r1_bh;
+	mddev_t *mddev;
-	PRINTK(("raid1d() active\n"));
-	save_flags(flags);
-	cli();
 	while (raid1_retry_list) {
+		save_flags(flags);
+		cli();
 		bh = raid1_retry_list;
 		r1_bh = (struct raid1_bh *)(bh->b_dev_id);
 		raid1_retry_list = r1_bh->next_retry;
 		restore_flags(flags);
-		mddev = md_dev + MINOR(bh->b_dev);
+		mddev = r1_bh->mddev;
 		if (mddev->sb_dirty) {
-			printk("dirty sb detected, updating.\n");
+			printk(KERN_INFO "dirty sb detected, updating.\n");
 			mddev->sb_dirty = 0;
-			md_update_sb(MINOR(bh->b_dev));
+			md_update_sb(mddev);
 		}
 		dev = bh->b_rdev;
-		__raid1_map (md_dev + MINOR(bh->b_dev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9);
+		__raid1_map (mddev, &bh->b_rdev, &bh->b_rsector,
+				bh->b_size >> 9);
 		if (bh->b_rdev == dev) {
-			printk (KERN_ALERT
-				"raid1: %s: unrecoverable I/O read error for block %lu\n",
-				kdevname(bh->b_dev), bh->b_blocknr);
-			raid1_end_buffer_io(r1_bh, 0);
+			printk (IO_ERROR, partition_name(mddev_to_kdev(mddev)), bh->b_blocknr);
+			raid1_end_bh_io(r1_bh, 0);
 		} else {
-			printk (KERN_ERR "raid1: %s: redirecting sector %lu to another mirror\n",
-				kdevname(bh->b_dev), bh->b_blocknr);
+			printk (REDIRECT_SECTOR,
+				partition_name(mddev_to_kdev(mddev)), bh->b_blocknr);
 			map_and_make_request (r1_bh->cmd, bh);
 		}
-		cli();
 	}
-	restore_flags(flags);
+}
+#undef IO_ERROR
+#undef REDIRECT_SECTOR
+
+/*
+ * Private kernel thread to reconstruct mirrors after an unclean
+ * shutdown.
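The reconstruction thread defined just below is driven by a small tri-state flag, conf->resync_mirrors: 0 means no reconstruction is pending, 1 means the thread should run md_do_sync(), and 2 is set by raid1_stop_resync() to make the thread bail out without clearing the flag, so raid1_restart_resync() can re-arm it later. A minimal standalone sketch of that protocol follows; the *_like() function names and the simplified bodies are illustrative assumptions, not the patch's API:

    #include <stdio.h>

    static int resync_mirrors; /* 0 = idle, 1 = requested, 2 = interrupted */

    static void raid1syncd_like(void)
    {
        if (!resync_mirrors)
            return;             /* nothing to do */
        if (resync_mirrors == 2)
            return;             /* told to stop; keep state for a restart */
        puts("resyncing mirrors...");
        resync_mirrors = 0;     /* cleared only if the sync completed */
    }

    static int stop_resync_like(void)
    {
        if (resync_mirrors) {
            resync_mirrors = 2; /* remember: restart next time */
            return 1;
        }
        return 0;
    }

    static int restart_resync_like(void)
    {
        if (resync_mirrors) {
            resync_mirrors = 1; /* re-arm the sync thread */
            return 1;
        }
        return 0;
    }

    int main(void)
    {
        resync_mirrors = 1;     /* array was not clean */
        stop_resync_like();     /* e.g. array stopped mid-resync */
        raid1syncd_like();      /* bails out, state stays 2 */
        restart_resync_like();
        raid1syncd_like();      /* now completes and clears the flag */
        return 0;
    }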
+ */ +static void raid1syncd (void *data) +{ + raid1_conf_t *conf = data; + mddev_t *mddev = conf->mddev; + + if (!conf->resync_mirrors) + return; + if (conf->resync_mirrors == 2) + return; + down(&mddev->recovery_sem); + if (md_do_sync(mddev, NULL)) { + up(&mddev->recovery_sem); + return; + } + /* + * Only if everything went Ok. + */ + conf->resync_mirrors = 0; + up(&mddev->recovery_sem); } + /* * This will catch the scenario in which one of the mirrors was * mounted as a normal device rather than as a part of a raid set. + * + * check_consistency is very personality-dependent, eg. RAID5 cannot + * do this check, it uses another method. */ -static int __check_consistency (struct md_dev *mddev, int row) +static int __check_consistency (mddev_t *mddev, int row) { - struct raid1_data *raid_conf = mddev->private; + raid1_conf_t *conf = mddev_to_conf(mddev); + int disks = MD_SB_DISKS; kdev_t dev; struct buffer_head *bh = NULL; int i, rc = 0; char *buffer = NULL; - for (i = 0; i < raid_conf->raid_disks; i++) { - if (!raid_conf->mirrors[i].operational) + for (i = 0; i < disks; i++) { + printk("(checking disk %d)\n",i); + if (!conf->mirrors[i].operational) continue; - dev = raid_conf->mirrors[i].dev; + printk("(really checking disk %d)\n",i); + dev = conf->mirrors[i].dev; set_blocksize(dev, 4096); if ((bh = bread(dev, row / 4, 4096)) == NULL) break; @@ -683,167 +866,342 @@ return rc; } -static int check_consistency (struct md_dev *mddev) +static int check_consistency (mddev_t *mddev) { - int size = mddev->sb->size; - int row; + if (__check_consistency(mddev, 0)) +/* + * we do not do this currently, as it's perfectly possible to + * have an inconsistent array when it's freshly created. Only + * newly written data has to be consistent. + */ + return 0; - for (row = 0; row < size; row += size / 8) - if (__check_consistency(mddev, row)) - return 1; return 0; } -static int raid1_run (int minor, struct md_dev *mddev) +#define INVALID_LEVEL KERN_WARNING \ +"raid1: md%d: raid level not set to mirroring (%d)\n" + +#define NO_SB KERN_ERR \ +"raid1: disabled mirror %s (couldn't access raid superblock)\n" + +#define ERRORS KERN_ERR \ +"raid1: disabled mirror %s (errors detected)\n" + +#define NOT_IN_SYNC KERN_ERR \ +"raid1: disabled mirror %s (not in sync)\n" + +#define INCONSISTENT KERN_ERR \ +"raid1: disabled mirror %s (inconsistent descriptor)\n" + +#define ALREADY_RUNNING KERN_ERR \ +"raid1: disabled mirror %s (mirror %d already operational)\n" + +#define OPERATIONAL KERN_INFO \ +"raid1: device %s operational as mirror %d\n" + +#define MEM_ERROR KERN_ERR \ +"raid1: couldn't allocate memory for md%d\n" + +#define SPARE KERN_INFO \ +"raid1: spare disk %s\n" + +#define NONE_OPERATIONAL KERN_ERR \ +"raid1: no operational mirrors for md%d\n" + +#define RUNNING_CKRAID KERN_ERR \ +"raid1: detected mirror differences -- running resync\n" + +#define ARRAY_IS_ACTIVE KERN_INFO \ +"raid1: raid set md%d active with %d out of %d mirrors\n" + +#define THREAD_ERROR KERN_ERR \ +"raid1: couldn't allocate thread for md%d\n" + +#define START_RESYNC KERN_WARNING \ +"raid1: raid set md%d not clean; reconstructing mirrors\n" + +static int raid1_run (mddev_t *mddev) { - struct raid1_data *raid_conf; - int i, j, raid_disk; - md_superblock_t *sb = mddev->sb; - md_descriptor_t *descriptor; - struct real_dev *realdev; + raid1_conf_t *conf; + int i, j, disk_idx; + struct mirror_info *disk; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *descriptor; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + int start_recovery = 0; 
MOD_INC_USE_COUNT; if (sb->level != 1) { - printk("raid1: %s: raid level not set to mirroring (%d)\n", - kdevname(MKDEV(MD_MAJOR, minor)), sb->level); - MOD_DEC_USE_COUNT; - return -EIO; - } - /**** - * copy the now verified devices into our private RAID1 bookkeeping - * area. [whatever we allocate in raid1_run(), should be freed in - * raid1_stop()] + printk(INVALID_LEVEL, mdidx(mddev), sb->level); + goto out; + } + /* + * copy the already verified devices into our private RAID1 + * bookkeeping area. [whatever we allocate in raid1_run(), + * should be freed in raid1_stop()] */ - while (!( /* FIXME: now we are rather fault tolerant than nice */ - mddev->private = kmalloc (sizeof (struct raid1_data), GFP_KERNEL) - ) ) - { - printk ("raid1_run(): out of memory\n"); - current->policy |= SCHED_YIELD; - schedule(); - } - raid_conf = mddev->private; - memset(raid_conf, 0, sizeof(*raid_conf)); - - PRINTK(("raid1_run(%d) called.\n", minor)); - - for (i = 0; i < mddev->nb_dev; i++) { - realdev = &mddev->devices[i]; - if (!realdev->sb) { - printk(KERN_ERR "raid1: disabled mirror %s (couldn't access raid superblock)\n", kdevname(realdev->dev)); + conf = raid1_kmalloc(sizeof(raid1_conf_t)); + mddev->private = conf; + if (!conf) { + printk(MEM_ERROR, mdidx(mddev)); + goto out; + } + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + printk(ERRORS, partition_name(rdev->dev)); + } else { + if (!rdev->sb) { + MD_BUG(); + continue; + } + } + if (rdev->desc_nr == -1) { + MD_BUG(); continue; } - - /* - * This is important -- we are using the descriptor on - * the disk only to get a pointer to the descriptor on - * the main superblock, which might be more recent. - */ - descriptor = &sb->disks[realdev->sb->descriptor.number]; - if (descriptor->state & (1 << MD_FAULTY_DEVICE)) { - printk(KERN_ERR "raid1: disabled mirror %s (errors detected)\n", kdevname(realdev->dev)); + descriptor = &sb->disks[rdev->desc_nr]; + disk_idx = descriptor->raid_disk; + disk = conf->mirrors + disk_idx; + + if (disk_faulty(descriptor)) { + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_LINEAR_SECTORS; + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; continue; } - if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) { - if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) { - printk(KERN_ERR "raid1: disabled mirror %s (not in sync)\n", kdevname(realdev->dev)); + if (disk_active(descriptor)) { + if (!disk_sync(descriptor)) { + printk(NOT_IN_SYNC, + partition_name(rdev->dev)); continue; } - raid_disk = descriptor->raid_disk; - if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) { - printk(KERN_ERR "raid1: disabled mirror %s (inconsistent descriptor)\n", kdevname(realdev->dev)); + if ((descriptor->number > MD_SB_DISKS) || + (disk_idx > sb->raid_disks)) { + + printk(INCONSISTENT, + partition_name(rdev->dev)); continue; } - if (raid_conf->mirrors[raid_disk].operational) { - printk(KERN_ERR "raid1: disabled mirror %s (mirror %d already operational)\n", kdevname(realdev->dev), raid_disk); + if (disk->operational) { + printk(ALREADY_RUNNING, + partition_name(rdev->dev), + disk_idx); continue; } - printk(KERN_INFO "raid1: device %s operational as mirror %d\n", kdevname(realdev->dev), raid_disk); - raid_conf->mirrors[raid_disk].number = descriptor->number; - raid_conf->mirrors[raid_disk].raid_disk = raid_disk; - raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev; - raid_conf->mirrors[raid_disk].operational = 1; - 
raid_conf->mirrors[raid_disk].sect_limit = 128; - raid_conf->working_disks++; + printk(OPERATIONAL, partition_name(rdev->dev), + disk_idx); + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_LINEAR_SECTORS; + disk->operational = 1; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + conf->working_disks++; } else { /* * Must be a spare disk .. */ - printk(KERN_INFO "raid1: spare disk %s\n", kdevname(realdev->dev)); - raid_disk = descriptor->raid_disk; - raid_conf->mirrors[raid_disk].number = descriptor->number; - raid_conf->mirrors[raid_disk].raid_disk = raid_disk; - raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev; - raid_conf->mirrors[raid_disk].sect_limit = 128; - - raid_conf->mirrors[raid_disk].operational = 0; - raid_conf->mirrors[raid_disk].write_only = 0; - raid_conf->mirrors[raid_disk].spare = 1; - } - } - if (!raid_conf->working_disks) { - printk(KERN_ERR "raid1: no operational mirrors for %s\n", kdevname(MKDEV(MD_MAJOR, minor))); - kfree(raid_conf); - mddev->private = NULL; - MOD_DEC_USE_COUNT; - return -EIO; - } - - raid_conf->raid_disks = sb->raid_disks; - raid_conf->mddev = mddev; - - for (j = 0; !raid_conf->mirrors[j].operational; j++); - raid_conf->last_used = j; - for (i = raid_conf->raid_disks - 1; i >= 0; i--) { - if (raid_conf->mirrors[i].operational) { - PRINTK(("raid_conf->mirrors[%d].next == %d\n", i, j)); - raid_conf->mirrors[i].next = j; + printk(SPARE, partition_name(rdev->dev)); + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_LINEAR_SECTORS; + disk->operational = 0; + disk->write_only = 0; + disk->spare = 1; + disk->used_slot = 1; + } + } + if (!conf->working_disks) { + printk(NONE_OPERATIONAL, mdidx(mddev)); + goto out_free_conf; + } + + conf->raid_disks = sb->raid_disks; + conf->nr_disks = sb->nr_disks; + conf->mddev = mddev; + + for (i = 0; i < MD_SB_DISKS; i++) { + + descriptor = sb->disks+i; + disk_idx = descriptor->raid_disk; + disk = conf->mirrors + disk_idx; + + if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) && + !disk->used_slot) { + + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = MKDEV(0,0); + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + } + } + + /* + * find the first working one and use it as a starting point + * to read balancing. + */ + for (j = 0; !conf->mirrors[j].operational; j++) + /* nothing */; + conf->last_used = j; + + /* + * initialize the 'working disks' list. + */ + for (i = conf->raid_disks - 1; i >= 0; i--) { + if (conf->mirrors[i].operational) { + conf->mirrors[i].next = j; j = i; } } - if (check_consistency(mddev)) { - printk(KERN_ERR "raid1: detected mirror differences -- run ckraid\n"); - sb->state |= 1 << MD_SB_ERRORS; - kfree(raid_conf); - mddev->private = NULL; - MOD_DEC_USE_COUNT; - return -EIO; + if (conf->working_disks != sb->raid_disks) { + printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev)); + start_recovery = 1; } + if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN))) { + /* + * we do sanity checks even if the device says + * it's clean ... 
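The two loops a few lines above build the read-balancing ring: conf->last_used remembers the first operational mirror, and each operational mirror's ->next field points to the next operational one, wrapping around; failed and spare slots are simply skipped. A standalone sketch of the same construction (the four-disk layout and failure pattern are invented for illustration):

    #include <stdio.h>

    #define NDISKS 4

    struct mirror { int operational; int next; };

    int main(void)
    {
        struct mirror m[NDISKS] = { {1, 0}, {0, 0}, {1, 0}, {1, 0} }; /* disk 1 failed */
        int i, j, last_used;

        /* first operational disk becomes the read-balancing start point */
        for (j = 0; !m[j].operational; j++)
            /* nothing */;
        last_used = j;

        /* walk backwards, threading every operational disk into a ring */
        for (i = NDISKS - 1; i >= 0; i--) {
            if (m[i].operational) {
                m[i].next = j;
                j = i;
            }
        }

        /* follow the ring once around: 0 -> 2 -> 3 -> 0 */
        i = last_used;
        do {
            printf("disk %d -> disk %d\n", i, m[i].next);
            i = m[i].next;
        } while (i != last_used);
        return 0;
    }

Walking backwards is what closes the ring in one pass: by the time slot i is linked, j already holds the next operational slot above it.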
+ */ + if (check_consistency(mddev)) { + printk(RUNNING_CKRAID); + sb->state &= ~(1 << MD_SB_CLEAN); + } + } + + { + const char * name = "raid1d"; + + conf->thread = md_register_thread(raid1d, conf, name); + if (!conf->thread) { + printk(THREAD_ERROR, mdidx(mddev)); + goto out_free_conf; + } + } + + if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) { + const char * name = "raid1syncd"; + + conf->resync_thread = md_register_thread(raid1syncd, conf,name); + if (!conf->resync_thread) { + printk(THREAD_ERROR, mdidx(mddev)); + goto out_free_conf; + } + + printk(START_RESYNC, mdidx(mddev)); + conf->resync_mirrors = 1; + md_wakeup_thread(conf->resync_thread); + } + /* * Regenerate the "device is in sync with the raid set" bit for * each device. */ - for (i = 0; i < sb->nr_disks ; i++) { - sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE); + for (i = 0; i < MD_SB_DISKS; i++) { + mark_disk_nonsync(sb->disks+i); for (j = 0; j < sb->raid_disks; j++) { - if (!raid_conf->mirrors[j].operational) + if (!conf->mirrors[j].operational) continue; - if (sb->disks[i].number == raid_conf->mirrors[j].number) - sb->disks[i].state |= 1 << MD_SYNC_DEVICE; + if (sb->disks[i].number == conf->mirrors[j].number) + mark_disk_sync(sb->disks+i); } } - sb->active_disks = raid_conf->working_disks; + sb->active_disks = conf->working_disks; - printk("raid1: raid set %s active with %d out of %d mirrors\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks); - /* Ok, everything is just fine now */ - return (0); + if (start_recovery) + md_recover_arrays(); + + + printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks); + /* + * Ok, everything is just fine now + */ + return 0; + +out_free_conf: + kfree(conf); + mddev->private = NULL; +out: + MOD_DEC_USE_COUNT; + return -EIO; +} + +#undef INVALID_LEVEL +#undef NO_SB +#undef ERRORS +#undef NOT_IN_SYNC +#undef INCONSISTENT +#undef ALREADY_RUNNING +#undef OPERATIONAL +#undef SPARE +#undef NONE_OPERATIONAL +#undef RUNNING_CKRAID +#undef ARRAY_IS_ACTIVE + +static int raid1_stop_resync (mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + + if (conf->resync_thread) { + if (conf->resync_mirrors) { + conf->resync_mirrors = 2; + md_interrupt_thread(conf->resync_thread); + printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n"); + return 1; + } + return 0; + } + return 0; +} + +static int raid1_restart_resync (mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + + if (conf->resync_mirrors) { + if (!conf->resync_thread) { + MD_BUG(); + return 0; + } + conf->resync_mirrors = 1; + md_wakeup_thread(conf->resync_thread); + return 1; + } + return 0; } -static int raid1_stop (int minor, struct md_dev *mddev) +static int raid1_stop (mddev_t *mddev) { - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; + raid1_conf_t *conf = mddev_to_conf(mddev); - kfree (raid_conf); + md_unregister_thread(conf->thread); + if (conf->resync_thread) + md_unregister_thread(conf->resync_thread); + kfree(conf); mddev->private = NULL; MOD_DEC_USE_COUNT; return 0; } -static struct md_personality raid1_personality= +static mdk_personality_t raid1_personality= { "raid1", raid1_map, @@ -855,15 +1213,13 @@ NULL, /* no ioctls */ 0, raid1_error, - raid1_hot_add_disk, - /* raid1_hot_remove_drive */ NULL, - raid1_mark_spare + raid1_diskop, + raid1_stop_resync, + raid1_restart_resync }; int raid1_init (void) { - if ((raid1_thread = md_register_thread(raid1d, NULL)) == NULL) - return -EBUSY; return register_md_personality 
(RAID1, &raid1_personality); } @@ -875,7 +1231,6 @@ void cleanup_module (void) { - md_unregister_thread (raid1_thread); unregister_md_personality (RAID1); } #endif diff -urN 2.2.18/drivers/block/raid5.c 2.2.18aa1/drivers/block/raid5.c --- 2.2.18/drivers/block/raid5.c Mon Jan 17 16:44:36 2000 +++ 2.2.18aa1/drivers/block/raid5.c Mon Dec 11 17:20:54 2000 @@ -1,4 +1,4 @@ -/***************************************************************************** +/* * raid5.c : Multiple Devices driver for Linux * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman * @@ -14,16 +14,15 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + #include #include #include -#include -#include +#include #include #include -#include -static struct md_personality raid5_personality; +static mdk_personality_t raid5_personality; /* * Stripe cache @@ -33,7 +32,7 @@ #define HASH_PAGES_ORDER 0 #define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *)) #define HASH_MASK (NR_HASH - 1) -#define stripe_hash(raid_conf, sect, size) ((raid_conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK]) +#define stripe_hash(conf, sect, size) ((conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK]) /* * The following can be used to debug the driver @@ -46,6 +45,8 @@ #define PRINTK(x) do { ; } while (0) #endif +static void print_raid5_conf (raid5_conf_t *conf); + static inline int stripe_locked(struct stripe_head *sh) { return test_bit(STRIPE_LOCKED, &sh->state); @@ -61,32 +62,32 @@ */ static inline void lock_stripe(struct stripe_head *sh) { - struct raid5_data *raid_conf = sh->raid_conf; - if (!test_and_set_bit(STRIPE_LOCKED, &sh->state)) { + raid5_conf_t *conf = sh->raid_conf; + if (!md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) { PRINTK(("locking stripe %lu\n", sh->sector)); - raid_conf->nr_locked_stripes++; + conf->nr_locked_stripes++; } } static inline void unlock_stripe(struct stripe_head *sh) { - struct raid5_data *raid_conf = sh->raid_conf; - if (test_and_clear_bit(STRIPE_LOCKED, &sh->state)) { + raid5_conf_t *conf = sh->raid_conf; + if (md_test_and_clear_bit(STRIPE_LOCKED, &sh->state)) { PRINTK(("unlocking stripe %lu\n", sh->sector)); - raid_conf->nr_locked_stripes--; + conf->nr_locked_stripes--; wake_up(&sh->wait); } } static inline void finish_stripe(struct stripe_head *sh) { - struct raid5_data *raid_conf = sh->raid_conf; + raid5_conf_t *conf = sh->raid_conf; unlock_stripe(sh); sh->cmd = STRIPE_NONE; sh->phase = PHASE_COMPLETE; - raid_conf->nr_pending_stripes--; - raid_conf->nr_cached_stripes++; - wake_up(&raid_conf->wait_for_stripe); + conf->nr_pending_stripes--; + conf->nr_cached_stripes++; + wake_up(&conf->wait_for_stripe); } void __wait_on_stripe(struct stripe_head *sh) @@ -114,7 +115,7 @@ __wait_on_stripe(sh); } -static inline void remove_hash(struct raid5_data *raid_conf, struct stripe_head *sh) +static inline void remove_hash(raid5_conf_t *conf, struct stripe_head *sh) { PRINTK(("remove_hash(), stripe %lu\n", sh->sector)); @@ -123,21 +124,22 @@ sh->hash_next->hash_pprev = sh->hash_pprev; *sh->hash_pprev = sh->hash_next; sh->hash_pprev = NULL; - raid_conf->nr_hashed_stripes--; + conf->nr_hashed_stripes--; } } -static inline void insert_hash(struct raid5_data *raid_conf, struct stripe_head *sh) +static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) { - struct stripe_head **shp = &stripe_hash(raid_conf, sh->sector, sh->size); + struct stripe_head **shp = &stripe_hash(conf, sh->sector, sh->size); - PRINTK(("insert_hash(), stripe %lu, nr_hashed_stripes 
%d\n", sh->sector, raid_conf->nr_hashed_stripes)); + PRINTK(("insert_hash(), stripe %lu, nr_hashed_stripes %d\n", + sh->sector, conf->nr_hashed_stripes)); if ((sh->hash_next = *shp) != NULL) (*shp)->hash_pprev = &sh->hash_next; *shp = sh; sh->hash_pprev = shp; - raid_conf->nr_hashed_stripes++; + conf->nr_hashed_stripes++; } static struct buffer_head *get_free_buffer(struct stripe_head *sh, int b_size) @@ -145,13 +147,15 @@ struct buffer_head *bh; unsigned long flags; - save_flags(flags); - cli(); - if ((bh = sh->buffer_pool) == NULL) - return NULL; + md_spin_lock_irqsave(&sh->stripe_lock, flags); + bh = sh->buffer_pool; + if (!bh) + goto out_unlock; sh->buffer_pool = bh->b_next; bh->b_size = b_size; - restore_flags(flags); +out_unlock: + md_spin_unlock_irqrestore(&sh->stripe_lock, flags); + return bh; } @@ -160,12 +164,14 @@ struct buffer_head *bh; unsigned long flags; - save_flags(flags); - cli(); - if ((bh = sh->bh_pool) == NULL) - return NULL; + md_spin_lock_irqsave(&sh->stripe_lock, flags); + bh = sh->bh_pool; + if (!bh) + goto out_unlock; sh->bh_pool = bh->b_next; - restore_flags(flags); +out_unlock: + md_spin_unlock_irqrestore(&sh->stripe_lock, flags); + return bh; } @@ -173,54 +179,52 @@ { unsigned long flags; - save_flags(flags); - cli(); + md_spin_lock_irqsave(&sh->stripe_lock, flags); bh->b_next = sh->buffer_pool; sh->buffer_pool = bh; - restore_flags(flags); + md_spin_unlock_irqrestore(&sh->stripe_lock, flags); } static void put_free_bh(struct stripe_head *sh, struct buffer_head *bh) { unsigned long flags; - save_flags(flags); - cli(); + md_spin_lock_irqsave(&sh->stripe_lock, flags); bh->b_next = sh->bh_pool; sh->bh_pool = bh; - restore_flags(flags); + md_spin_unlock_irqrestore(&sh->stripe_lock, flags); } -static struct stripe_head *get_free_stripe(struct raid5_data *raid_conf) +static struct stripe_head *get_free_stripe(raid5_conf_t *conf) { struct stripe_head *sh; unsigned long flags; save_flags(flags); cli(); - if ((sh = raid_conf->free_sh_list) == NULL) { + if ((sh = conf->free_sh_list) == NULL) { restore_flags(flags); return NULL; } - raid_conf->free_sh_list = sh->free_next; - raid_conf->nr_free_sh--; - if (!raid_conf->nr_free_sh && raid_conf->free_sh_list) + conf->free_sh_list = sh->free_next; + conf->nr_free_sh--; + if (!conf->nr_free_sh && conf->free_sh_list) printk ("raid5: bug: free_sh_list != NULL, nr_free_sh == 0\n"); restore_flags(flags); - if (sh->hash_pprev || sh->nr_pending || sh->count) + if (sh->hash_pprev || md_atomic_read(&sh->nr_pending) || sh->count) printk("get_free_stripe(): bug\n"); return sh; } -static void put_free_stripe(struct raid5_data *raid_conf, struct stripe_head *sh) +static void put_free_stripe(raid5_conf_t *conf, struct stripe_head *sh) { unsigned long flags; save_flags(flags); cli(); - sh->free_next = raid_conf->free_sh_list; - raid_conf->free_sh_list = sh; - raid_conf->nr_free_sh++; + sh->free_next = conf->free_sh_list; + conf->free_sh_list = sh; + conf->nr_free_sh++; restore_flags(flags); } @@ -324,8 +328,8 @@ static void kfree_stripe(struct stripe_head *sh) { - struct raid5_data *raid_conf = sh->raid_conf; - int disks = raid_conf->raid_disks, j; + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks, j; PRINTK(("kfree_stripe called, stripe %lu\n", sh->sector)); if (sh->phase != PHASE_COMPLETE || stripe_locked(sh) || sh->count) { @@ -338,19 +342,19 @@ if (sh->bh_new[j] || sh->bh_copy[j]) printk("raid5: bug: sector %lu, new %p, copy %p\n", sh->sector, sh->bh_new[j], sh->bh_copy[j]); } - remove_hash(raid_conf, sh); - 
put_free_stripe(raid_conf, sh); + remove_hash(conf, sh); + put_free_stripe(conf, sh); } -static int shrink_stripe_cache(struct raid5_data *raid_conf, int nr) +static int shrink_stripe_cache(raid5_conf_t *conf, int nr) { struct stripe_head *sh; int i, count = 0; - PRINTK(("shrink_stripe_cache called, %d/%d, clock %d\n", nr, raid_conf->nr_hashed_stripes, raid_conf->clock)); + PRINTK(("shrink_stripe_cache called, %d/%d, clock %d\n", nr, conf->nr_hashed_stripes, conf->clock)); for (i = 0; i < NR_HASH; i++) { repeat: - sh = raid_conf->stripe_hashtbl[(i + raid_conf->clock) & HASH_MASK]; + sh = conf->stripe_hashtbl[(i + conf->clock) & HASH_MASK]; for (; sh; sh = sh->hash_next) { if (sh->phase != PHASE_COMPLETE) continue; @@ -360,30 +364,30 @@ continue; kfree_stripe(sh); if (++count == nr) { - PRINTK(("shrink completed, nr_hashed_stripes %d\n", raid_conf->nr_hashed_stripes)); - raid_conf->clock = (i + raid_conf->clock) & HASH_MASK; + PRINTK(("shrink completed, nr_hashed_stripes %d\n", conf->nr_hashed_stripes)); + conf->clock = (i + conf->clock) & HASH_MASK; return nr; } goto repeat; } } - PRINTK(("shrink completed, nr_hashed_stripes %d\n", raid_conf->nr_hashed_stripes)); + PRINTK(("shrink completed, nr_hashed_stripes %d\n", conf->nr_hashed_stripes)); return count; } -static struct stripe_head *find_stripe(struct raid5_data *raid_conf, unsigned long sector, int size) +static struct stripe_head *find_stripe(raid5_conf_t *conf, unsigned long sector, int size) { struct stripe_head *sh; - if (raid_conf->buffer_size != size) { - PRINTK(("switching size, %d --> %d\n", raid_conf->buffer_size, size)); - shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes); - raid_conf->buffer_size = size; + if (conf->buffer_size != size) { + PRINTK(("switching size, %d --> %d\n", conf->buffer_size, size)); + shrink_stripe_cache(conf, conf->max_nr_stripes); + conf->buffer_size = size; } PRINTK(("find_stripe, sector %lu\n", sector)); - for (sh = stripe_hash(raid_conf, sector, size); sh; sh = sh->hash_next) - if (sh->sector == sector && sh->raid_conf == raid_conf) { + for (sh = stripe_hash(conf, sector, size); sh; sh = sh->hash_next) + if (sh->sector == sector && sh->raid_conf == conf) { if (sh->size == size) { PRINTK(("found stripe %lu\n", sector)); return sh; @@ -397,7 +401,7 @@ return NULL; } -static int grow_stripes(struct raid5_data *raid_conf, int num, int priority) +static int grow_stripes(raid5_conf_t *conf, int num, int priority) { struct stripe_head *sh; @@ -405,62 +409,64 @@ if ((sh = kmalloc(sizeof(struct stripe_head), priority)) == NULL) return 1; memset(sh, 0, sizeof(*sh)); - if (grow_buffers(sh, 2 * raid_conf->raid_disks, PAGE_SIZE, priority)) { - shrink_buffers(sh, 2 * raid_conf->raid_disks); + sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED; + + if (grow_buffers(sh, 2 * conf->raid_disks, PAGE_SIZE, priority)) { + shrink_buffers(sh, 2 * conf->raid_disks); kfree(sh); return 1; } - if (grow_bh(sh, raid_conf->raid_disks, priority)) { - shrink_buffers(sh, 2 * raid_conf->raid_disks); - shrink_bh(sh, raid_conf->raid_disks); + if (grow_bh(sh, conf->raid_disks, priority)) { + shrink_buffers(sh, 2 * conf->raid_disks); + shrink_bh(sh, conf->raid_disks); kfree(sh); return 1; } - put_free_stripe(raid_conf, sh); - raid_conf->nr_stripes++; + put_free_stripe(conf, sh); + conf->nr_stripes++; } return 0; } -static void shrink_stripes(struct raid5_data *raid_conf, int num) +static void shrink_stripes(raid5_conf_t *conf, int num) { struct stripe_head *sh; while (num--) { - sh = get_free_stripe(raid_conf); + sh = 
get_free_stripe(conf); if (!sh) break; - shrink_buffers(sh, raid_conf->raid_disks * 2); - shrink_bh(sh, raid_conf->raid_disks); + shrink_buffers(sh, conf->raid_disks * 2); + shrink_bh(sh, conf->raid_disks); kfree(sh); - raid_conf->nr_stripes--; + conf->nr_stripes--; } } -static struct stripe_head *kmalloc_stripe(struct raid5_data *raid_conf, unsigned long sector, int size) +static struct stripe_head *kmalloc_stripe(raid5_conf_t *conf, unsigned long sector, int size) { struct stripe_head *sh = NULL, *tmp; struct buffer_head *buffer_pool, *bh_pool; PRINTK(("kmalloc_stripe called\n")); - while ((sh = get_free_stripe(raid_conf)) == NULL) { - shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes / 8); - if ((sh = get_free_stripe(raid_conf)) != NULL) + while ((sh = get_free_stripe(conf)) == NULL) { + shrink_stripe_cache(conf, conf->max_nr_stripes / 8); + if ((sh = get_free_stripe(conf)) != NULL) break; - if (!raid_conf->nr_pending_stripes) + if (!conf->nr_pending_stripes) printk("raid5: bug: nr_free_sh == 0, nr_pending_stripes == 0\n"); - md_wakeup_thread(raid_conf->thread); + md_wakeup_thread(conf->thread); PRINTK(("waiting for some stripes to complete\n")); - sleep_on(&raid_conf->wait_for_stripe); + sleep_on(&conf->wait_for_stripe); } /* * The above might have slept, so perhaps another process * already created the stripe for us.. */ - if ((tmp = find_stripe(raid_conf, sector, size)) != NULL) { - put_free_stripe(raid_conf, sh); + if ((tmp = find_stripe(conf, sector, size)) != NULL) { + put_free_stripe(conf, sh); wait_on_stripe(tmp); return tmp; } @@ -472,25 +478,25 @@ sh->bh_pool = bh_pool; sh->phase = PHASE_COMPLETE; sh->cmd = STRIPE_NONE; - sh->raid_conf = raid_conf; + sh->raid_conf = conf; sh->sector = sector; sh->size = size; - raid_conf->nr_cached_stripes++; - insert_hash(raid_conf, sh); + conf->nr_cached_stripes++; + insert_hash(conf, sh); } else printk("raid5: bug: kmalloc_stripe() == NULL\n"); return sh; } -static struct stripe_head *get_stripe(struct raid5_data *raid_conf, unsigned long sector, int size) +static struct stripe_head *get_stripe(raid5_conf_t *conf, unsigned long sector, int size) { struct stripe_head *sh; PRINTK(("get_stripe, sector %lu\n", sector)); - sh = find_stripe(raid_conf, sector, size); + sh = find_stripe(conf, sector, size); if (sh) wait_on_stripe(sh); else - sh = kmalloc_stripe(raid_conf, sector, size); + sh = kmalloc_stripe(conf, sector, size); return sh; } @@ -523,7 +529,7 @@ bh->b_end_io(bh, uptodate); if (!uptodate) printk(KERN_ALERT "raid5: %s: unrecoverable I/O error for " - "block %lu\n", kdevname(bh->b_dev), bh->b_blocknr); + "block %lu\n", partition_name(mddev_to_kdev(sh->raid_conf->mddev)), bh->b_blocknr); } static inline void raid5_mark_buffer_uptodate (struct buffer_head *bh, int uptodate) @@ -537,36 +543,35 @@ static void raid5_end_request (struct buffer_head * bh, int uptodate) { struct stripe_head *sh = bh->b_dev_id; - struct raid5_data *raid_conf = sh->raid_conf; - int disks = raid_conf->raid_disks, i; + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks, i; unsigned long flags; PRINTK(("end_request %lu, nr_pending %d\n", sh->sector, sh->nr_pending)); - save_flags(flags); - cli(); + md_spin_lock_irqsave(&sh->stripe_lock, flags); raid5_mark_buffer_uptodate(bh, uptodate); - --sh->nr_pending; - if (!sh->nr_pending) { - md_wakeup_thread(raid_conf->thread); - atomic_inc(&raid_conf->nr_handle); + if (atomic_dec_and_test(&sh->nr_pending)) { + md_wakeup_thread(conf->thread); + atomic_inc(&conf->nr_handle); } - if (!uptodate) - 
md_error(bh->b_dev, bh->b_rdev); - if (raid_conf->failed_disks) { + if (!uptodate) { + md_error(conf->mddev, bh->b_rdev); + } + if (conf->failed_disks) { for (i = 0; i < disks; i++) { - if (raid_conf->disks[i].operational) + if (conf->disks[i].operational) continue; if (bh != sh->bh_old[i] && bh != sh->bh_req[i] && bh != sh->bh_copy[i]) continue; - if (bh->b_rdev != raid_conf->disks[i].dev) + if (bh->b_rdev != conf->disks[i].dev) continue; set_bit(STRIPE_ERROR, &sh->state); } } - restore_flags(flags); + md_spin_unlock_irqrestore(&sh->stripe_lock, flags); } -static int raid5_map (struct md_dev *mddev, kdev_t *rdev, +static int raid5_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev, unsigned long *rsector, unsigned long size) { /* No complex mapping used: the core of the work is done in the @@ -577,11 +582,10 @@ static void raid5_build_block (struct stripe_head *sh, struct buffer_head *bh, int i) { - struct raid5_data *raid_conf = sh->raid_conf; - struct md_dev *mddev = raid_conf->mddev; - int minor = (int) (mddev - md_dev); + raid5_conf_t *conf = sh->raid_conf; + mddev_t *mddev = conf->mddev; char *b_data; - kdev_t dev = MKDEV(MD_MAJOR, minor); + kdev_t dev = mddev_to_kdev(mddev); int block = sh->sector / (sh->size >> 9); b_data = ((volatile struct buffer_head *) bh)->b_data; @@ -589,7 +593,7 @@ init_buffer(bh, dev, block, raid5_end_request, sh); ((volatile struct buffer_head *) bh)->b_data = b_data; - bh->b_rdev = raid_conf->disks[i].dev; + bh->b_rdev = conf->disks[i].dev; bh->b_rsector = sh->sector; bh->b_state = (1 << BH_Req); @@ -597,33 +601,62 @@ bh->b_list = BUF_LOCKED; } -static int raid5_error (struct md_dev *mddev, kdev_t dev) +static int raid5_error (mddev_t *mddev, kdev_t dev) { - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private; - md_superblock_t *sb = mddev->sb; + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + mdp_super_t *sb = mddev->sb; struct disk_info *disk; int i; PRINTK(("raid5_error called\n")); - raid_conf->resync_parity = 0; - for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++) + conf->resync_parity = 0; + for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) { if (disk->dev == dev && disk->operational) { disk->operational = 0; - sb->disks[disk->number].state |= (1 << MD_FAULTY_DEVICE); - sb->disks[disk->number].state &= ~(1 << MD_SYNC_DEVICE); - sb->disks[disk->number].state &= ~(1 << MD_ACTIVE_DEVICE); + mark_disk_faulty(sb->disks+disk->number); + mark_disk_nonsync(sb->disks+disk->number); + mark_disk_inactive(sb->disks+disk->number); sb->active_disks--; sb->working_disks--; sb->failed_disks++; mddev->sb_dirty = 1; - raid_conf->working_disks--; - raid_conf->failed_disks++; - md_wakeup_thread(raid_conf->thread); + conf->working_disks--; + conf->failed_disks++; + md_wakeup_thread(conf->thread); printk (KERN_ALERT - "RAID5: Disk failure on %s, disabling device." - "Operation continuing on %d devices\n", - kdevname (dev), raid_conf->working_disks); + "raid5: Disk failure on %s, disabling device." 
+ " Operation continuing on %d devices\n", + partition_name (dev), conf->working_disks); + return -EIO; } + } + /* + * handle errors in spares (during reconstruction) + */ + if (conf->spare) { + disk = conf->spare; + if (disk->dev == dev) { + printk (KERN_ALERT + "raid5: Disk failure on spare %s\n", + partition_name (dev)); + if (!conf->spare->operational) { + MD_BUG(); + return -EIO; + } + disk->operational = 0; + disk->write_only = 0; + conf->spare = NULL; + mark_disk_faulty(sb->disks+disk->number); + mark_disk_nonsync(sb->disks+disk->number); + mark_disk_inactive(sb->disks+disk->number); + sb->spare_disks--; + sb->working_disks--; + sb->failed_disks++; + + return -EIO; + } + } + MD_BUG(); return 0; } @@ -634,12 +667,12 @@ static inline unsigned long raid5_compute_sector (int r_sector, unsigned int raid_disks, unsigned int data_disks, unsigned int * dd_idx, unsigned int * pd_idx, - struct raid5_data *raid_conf) + raid5_conf_t *conf) { unsigned int stripe; int chunk_number, chunk_offset; unsigned long new_sector; - int sectors_per_chunk = raid_conf->chunk_size >> 9; + int sectors_per_chunk = conf->chunk_size >> 9; /* First compute the information on this sector */ @@ -662,9 +695,9 @@ /* * Select the parity disk based on the user selected algorithm. */ - if (raid_conf->level == 4) + if (conf->level == 4) *pd_idx = data_disks; - else switch (raid_conf->algorithm) { + else switch (conf->algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: *pd_idx = data_disks - stripe % raid_disks; if (*dd_idx >= *pd_idx) @@ -684,7 +717,7 @@ *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; break; default: - printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm); + printk ("raid5: unsupported algorithm %d\n", conf->algorithm); } /* @@ -705,16 +738,16 @@ static unsigned long compute_blocknr(struct stripe_head *sh, int i) { - struct raid5_data *raid_conf = sh->raid_conf; - int raid_disks = raid_conf->raid_disks, data_disks = raid_disks - 1; + raid5_conf_t *conf = sh->raid_conf; + int raid_disks = conf->raid_disks, data_disks = raid_disks - 1; unsigned long new_sector = sh->sector, check; - int sectors_per_chunk = raid_conf->chunk_size >> 9; + int sectors_per_chunk = conf->chunk_size >> 9; unsigned long stripe = new_sector / sectors_per_chunk; int chunk_offset = new_sector % sectors_per_chunk; int chunk_number, dummy1, dummy2, dd_idx = i; unsigned long r_sector, blocknr; - switch (raid_conf->algorithm) { + switch (conf->algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: case ALGORITHM_RIGHT_ASYMMETRIC: if (i > sh->pd_idx) @@ -727,14 +760,14 @@ i -= (sh->pd_idx + 1); break; default: - printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm); + printk ("raid5: unsupported algorithm %d\n", conf->algorithm); } chunk_number = stripe * data_disks + i; r_sector = chunk_number * sectors_per_chunk + chunk_offset; blocknr = r_sector / (sh->size >> 9); - check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, raid_conf); + check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf); if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { printk("compute_blocknr: map not correct\n"); return 0; @@ -742,36 +775,11 @@ return blocknr; } -#ifdef HAVE_ARCH_XORBLOCK -static void xor_block(struct buffer_head *dest, struct buffer_head *source) -{ - __xor_block((char *) dest->b_data, (char *) source->b_data, dest->b_size); -} -#else -static void xor_block(struct buffer_head *dest, struct buffer_head *source) -{ - long lines = dest->b_size / (sizeof (long)) / 
8, i; - long *destp = (long *) dest->b_data, *sourcep = (long *) source->b_data; - - for (i = lines; i > 0; i--) { - *(destp + 0) ^= *(sourcep + 0); - *(destp + 1) ^= *(sourcep + 1); - *(destp + 2) ^= *(sourcep + 2); - *(destp + 3) ^= *(sourcep + 3); - *(destp + 4) ^= *(sourcep + 4); - *(destp + 5) ^= *(sourcep + 5); - *(destp + 6) ^= *(sourcep + 6); - *(destp + 7) ^= *(sourcep + 7); - destp += 8; - sourcep += 8; - } -} -#endif - static void compute_block(struct stripe_head *sh, int dd_idx) { - struct raid5_data *raid_conf = sh->raid_conf; - int i, disks = raid_conf->raid_disks; + raid5_conf_t *conf = sh->raid_conf; + int i, count, disks = conf->raid_disks; + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS]; PRINTK(("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx)); @@ -780,69 +788,100 @@ raid5_build_block(sh, sh->bh_old[dd_idx], dd_idx); memset(sh->bh_old[dd_idx]->b_data, 0, sh->size); + bh_ptr[0] = sh->bh_old[dd_idx]; + count = 1; for (i = 0; i < disks; i++) { if (i == dd_idx) continue; if (sh->bh_old[i]) { - xor_block(sh->bh_old[dd_idx], sh->bh_old[i]); - continue; - } else + bh_ptr[count++] = sh->bh_old[i]; + } else { printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i); + } + if (count == MAX_XOR_BLOCKS) { + xor_block(count, &bh_ptr[0]); + count = 1; + } + } + if(count != 1) { + xor_block(count, &bh_ptr[0]); } raid5_mark_buffer_uptodate(sh->bh_old[dd_idx], 1); } static void compute_parity(struct stripe_head *sh, int method) { - struct raid5_data *raid_conf = sh->raid_conf; - int i, pd_idx = sh->pd_idx, disks = raid_conf->raid_disks; + raid5_conf_t *conf = sh->raid_conf; + int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, lowprio, count; + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS]; PRINTK(("compute_parity, stripe %lu, method %d\n", sh->sector, method)); + lowprio = 1; for (i = 0; i < disks; i++) { if (i == pd_idx || !sh->bh_new[i]) continue; if (!sh->bh_copy[i]) sh->bh_copy[i] = raid5_kmalloc_buffer(sh, sh->size); raid5_build_block(sh, sh->bh_copy[i], i); + if (!buffer_lowprio(sh->bh_new[i])) + lowprio = 0; + else + mark_buffer_lowprio(sh->bh_copy[i]); mark_buffer_clean(sh->bh_new[i]); memcpy(sh->bh_copy[i]->b_data, sh->bh_new[i]->b_data, sh->size); } if (sh->bh_copy[pd_idx] == NULL) sh->bh_copy[pd_idx] = raid5_kmalloc_buffer(sh, sh->size); raid5_build_block(sh, sh->bh_copy[pd_idx], sh->pd_idx); + if (lowprio) + mark_buffer_lowprio(sh->bh_copy[pd_idx]); if (method == RECONSTRUCT_WRITE) { memset(sh->bh_copy[pd_idx]->b_data, 0, sh->size); + bh_ptr[0] = sh->bh_copy[pd_idx]; + count = 1; for (i = 0; i < disks; i++) { if (i == sh->pd_idx) continue; if (sh->bh_new[i]) { - xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]); - continue; + bh_ptr[count++] = sh->bh_copy[i]; + } else if (sh->bh_old[i]) { + bh_ptr[count++] = sh->bh_old[i]; } - if (sh->bh_old[i]) { - xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]); - continue; + if (count == MAX_XOR_BLOCKS) { + xor_block(count, &bh_ptr[0]); + count = 1; } } + if (count != 1) { + xor_block(count, &bh_ptr[0]); + } } else if (method == READ_MODIFY_WRITE) { memcpy(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size); + bh_ptr[0] = sh->bh_copy[pd_idx]; + count = 1; for (i = 0; i < disks; i++) { if (i == sh->pd_idx) continue; if (sh->bh_new[i] && sh->bh_old[i]) { - xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]); - xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]); - continue; + bh_ptr[count++] = sh->bh_copy[i]; + bh_ptr[count++] = sh->bh_old[i]; + } + if (count >= (MAX_XOR_BLOCKS - 1)) { + xor_block(count, 
&bh_ptr[0]); + count = 1; } } + if (count != 1) { + xor_block(count, &bh_ptr[0]); + } } raid5_mark_buffer_uptodate(sh->bh_copy[pd_idx], 1); } static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw) { - struct raid5_data *raid_conf = sh->raid_conf; + raid5_conf_t *conf = sh->raid_conf; struct buffer_head *bh_req; if (sh->bh_new[dd_idx]) { @@ -860,19 +899,22 @@ if (sh->phase == PHASE_COMPLETE && sh->cmd == STRIPE_NONE) { sh->phase = PHASE_BEGIN; sh->cmd = (rw == READ) ? STRIPE_READ : STRIPE_WRITE; - raid_conf->nr_pending_stripes++; - atomic_inc(&raid_conf->nr_handle); + conf->nr_pending_stripes++; + atomic_inc(&conf->nr_handle); } sh->bh_new[dd_idx] = bh; sh->bh_req[dd_idx] = bh_req; sh->cmd_new[dd_idx] = rw; sh->new[dd_idx] = 1; + + if (buffer_lowprio(bh)) + mark_buffer_lowprio(bh_req); } static void complete_stripe(struct stripe_head *sh) { - struct raid5_data *raid_conf = sh->raid_conf; - int disks = raid_conf->raid_disks; + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks; int i, new = 0; PRINTK(("complete_stripe %lu\n", sh->sector)); @@ -909,6 +951,22 @@ } } + +static int is_stripe_lowprio(struct stripe_head *sh, int disks) +{ + int i, lowprio = 1; + + for (i = 0; i < disks; i++) { + if (sh->bh_new[i]) + if (!buffer_lowprio(sh->bh_new[i])) + lowprio = 0; + if (sh->bh_old[i]) + if (!buffer_lowprio(sh->bh_old[i])) + lowprio = 0; + } + return lowprio; +} + /* * handle_stripe() is our main logic routine. Note that: * @@ -919,28 +977,27 @@ * 2. We should be careful to set sh->nr_pending whenever we sleep, * to prevent re-entry of handle_stripe() for the same sh. * - * 3. raid_conf->failed_disks and disk->operational can be changed + * 3. conf->failed_disks and disk->operational can be changed * from an interrupt. This complicates things a bit, but it allows * us to stop issuing requests for a failed drive as soon as possible. 
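compute_block() and compute_parity() above no longer XOR one buffer at a time; they accumulate up to MAX_XOR_BLOCKS buffer pointers (with the running destination kept in slot 0) and flush batches through the new xor_block(count, bh_ptr) call, which lets an architecture-optimized XOR routine fold several sources per pass. A standalone sketch of the batching loop; the buffer size, the sample xor_block() body and the MAX_XOR_BLOCKS value are illustrative assumptions, only the call shape mirrors the code above:

    #include <stdio.h>
    #include <string.h>

    #define MAX_XOR_BLOCKS 5  /* value assumed for illustration */
    #define BSIZE 8

    /* XOR bufs[1..count-1] into bufs[0], the destination */
    static void xor_block(int count, unsigned char **bufs)
    {
        int i, j;
        for (i = 1; i < count; i++)
            for (j = 0; j < BSIZE; j++)
                bufs[0][j] ^= bufs[i][j];
    }

    int main(void)
    {
        unsigned char parity[BSIZE], data[7][BSIZE];
        unsigned char *bh_ptr[MAX_XOR_BLOCKS];
        int i, count;

        for (i = 0; i < 7; i++)
            memset(data[i], i + 1, BSIZE);
        memset(parity, 0, BSIZE);

        /* accumulate sources, flushing whenever the array fills up;
         * slot 0 always carries the running destination */
        bh_ptr[0] = parity;
        count = 1;
        for (i = 0; i < 7; i++) {
            bh_ptr[count++] = data[i];
            if (count == MAX_XOR_BLOCKS) {
                xor_block(count, bh_ptr);
                count = 1;
            }
        }
        if (count != 1)
            xor_block(count, bh_ptr);

        printf("parity[0] = %#x\n", parity[0]);  /* 1^2^...^7 == 0 */
        return 0;
    }

Keeping the destination in slot 0 across flushes is what allows any number of sources to be folded in with a fixed-size pointer array.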
  */
 static void handle_stripe(struct stripe_head *sh)
 {
-	struct raid5_data *raid_conf = sh->raid_conf;
-	struct md_dev *mddev = raid_conf->mddev;
-	int minor = (int) (mddev - md_dev);
+	raid5_conf_t *conf = sh->raid_conf;
+	mddev_t *mddev = conf->mddev;
 	struct buffer_head *bh;
-	int disks = raid_conf->raid_disks;
-	int i, nr = 0, nr_read = 0, nr_write = 0;
+	int disks = conf->raid_disks;
+	int i, nr = 0, nr_read = 0, nr_write = 0, lowprio;
 	int nr_cache = 0, nr_cache_other = 0, nr_cache_overwrite = 0, parity = 0;
 	int nr_failed_other = 0, nr_failed_overwrite = 0, parity_failed = 0;
 	int reading = 0, nr_writing = 0;
 	int method1 = INT_MAX, method2 = INT_MAX;
 	int block;
 	unsigned long flags;
-	int operational[MD_SB_DISKS], failed_disks = raid_conf->failed_disks;
+	int operational[MD_SB_DISKS], failed_disks = conf->failed_disks;
 	PRINTK(("handle_stripe(), stripe %lu\n", sh->sector));
-	if (sh->nr_pending) {
+	if (md_atomic_read(&sh->nr_pending)) {
 		printk("handle_stripe(), stripe %lu, io still pending\n", sh->sector);
 		return;
 	}
@@ -949,9 +1006,9 @@
 		return;
 	}
-	atomic_dec(&raid_conf->nr_handle);
+	atomic_dec(&conf->nr_handle);
-	if (test_and_clear_bit(STRIPE_ERROR, &sh->state)) {
+	if (md_test_and_clear_bit(STRIPE_ERROR, &sh->state)) {
 		printk("raid5: restarting stripe %lu\n", sh->sector);
 		sh->phase = PHASE_BEGIN;
 	}
@@ -969,11 +1026,11 @@
 	save_flags(flags);
 	cli();
 	for (i = 0; i < disks; i++) {
-		operational[i] = raid_conf->disks[i].operational;
-		if (i == sh->pd_idx && raid_conf->resync_parity)
+		operational[i] = conf->disks[i].operational;
+		if (i == sh->pd_idx && conf->resync_parity)
 			operational[i] = 0;
 	}
-	failed_disks = raid_conf->failed_disks;
+	failed_disks = conf->failed_disks;
 	restore_flags(flags);
 	if (failed_disks > 1) {
@@ -1017,7 +1074,7 @@
 	}
 	if (nr_write && nr_read)
 		printk("raid5: bug, nr_write == %d, nr_read == %d, sh->cmd == %d\n", nr_write, nr_read, sh->cmd);
 	if (nr_write) {
 		/*
@@ -1030,7 +1087,7 @@
 			if (sh->bh_new[i])
 				continue;
 			block = (int) compute_blocknr(sh, i);
-			bh = find_buffer(MKDEV(MD_MAJOR, minor), block, sh->size);
+			bh = find_buffer(mddev_to_kdev(mddev), block, sh->size);
 			if (bh && bh->b_count == 0 && buffer_dirty(bh) && !buffer_locked(bh)) {
 				PRINTK(("Whee.. sector %lu, index %d (%d) found in the buffer cache!\n", sh->sector, i, block));
 				add_stripe_bh(sh, bh, i, WRITE);
@@ -1064,21 +1121,22 @@
 	if (!method1 || !method2) {
 		lock_stripe(sh);
-		sh->nr_pending++;
+		lowprio = is_stripe_lowprio(sh, disks);
+		atomic_inc(&sh->nr_pending);
 		sh->phase = PHASE_WRITE;
 		compute_parity(sh, method1 <= method2 ?
RECONSTRUCT_WRITE : READ_MODIFY_WRITE); for (i = 0; i < disks; i++) { - if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity) + if (!operational[i] && !conf->spare && !conf->resync_parity) continue; if (i == sh->pd_idx || sh->bh_new[i]) nr_writing++; } - sh->nr_pending = nr_writing; - PRINTK(("handle_stripe() %lu, writing back %d\n", sh->sector, sh->nr_pending)); + md_atomic_set(&sh->nr_pending, nr_writing); + PRINTK(("handle_stripe() %lu, writing back %d\n", sh->sector, md_atomic_read(&sh->nr_pending))); for (i = 0; i < disks; i++) { - if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity) + if (!operational[i] && !conf->spare && !conf->resync_parity) continue; bh = sh->bh_copy[i]; if (i != sh->pd_idx && ((bh == NULL) ^ (sh->bh_new[i] == NULL))) @@ -1089,18 +1147,30 @@ bh->b_state |= (1<b_state); - if (!operational[i] && !raid_conf->resync_parity) { - bh->b_rdev = raid_conf->spare->dev; - make_request(MAJOR(raid_conf->spare->dev), WRITE, bh); - } else - make_request(MAJOR(raid_conf->disks[i].dev), WRITE, bh); + if (!operational[i] && !conf->resync_parity) { + bh->b_rdev = conf->spare->dev; + make_request(MAJOR(conf->spare->dev), WRITE, bh); + } else { +#if 0 + make_request(MAJOR(conf->disks[i].dev), WRITE, bh); +#else + if (!lowprio || (i==sh->pd_idx)) + make_request(MAJOR(conf->disks[i].dev), WRITE, bh); + else { + mark_buffer_clean(bh); + raid5_end_request(bh,1); + sh->new[i] = 0; + } +#endif + } } } return; } lock_stripe(sh); - sh->nr_pending++; + lowprio = is_stripe_lowprio(sh, disks); + atomic_inc(&sh->nr_pending); if (method1 < method2) { sh->write_method = RECONSTRUCT_WRITE; for (i = 0; i < disks; i++) { @@ -1110,6 +1180,8 @@ continue; sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size); raid5_build_block(sh, sh->bh_old[i], i); + if (lowprio) + mark_buffer_lowprio(sh->bh_old[i]); reading++; } } else { @@ -1121,19 +1193,21 @@ continue; sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size); raid5_build_block(sh, sh->bh_old[i], i); + if (lowprio) + mark_buffer_lowprio(sh->bh_old[i]); reading++; } } sh->phase = PHASE_READ_OLD; - sh->nr_pending = reading; - PRINTK(("handle_stripe() %lu, reading %d old buffers\n", sh->sector, sh->nr_pending)); + md_atomic_set(&sh->nr_pending, reading); + PRINTK(("handle_stripe() %lu, reading %d old buffers\n", sh->sector, md_atomic_read(&sh->nr_pending))); for (i = 0; i < disks; i++) { if (!sh->bh_old[i]) continue; if (buffer_uptodate(sh->bh_old[i])) continue; clear_bit(BH_Lock, &sh->bh_old[i]->b_state); - make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]); + make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_old[i]); } } else { /* @@ -1141,7 +1215,8 @@ */ method1 = nr_read - nr_cache_overwrite; lock_stripe(sh); - sh->nr_pending++; + lowprio = is_stripe_lowprio(sh,disks); + atomic_inc(&sh->nr_pending); PRINTK(("handle_stripe(), sector %lu, nr_read %d, nr_cache %d, method1 %d\n", sh->sector, nr_read, nr_cache, method1)); if (!method1 || (method1 == 1 && nr_cache == disks - 1)) { @@ -1149,18 +1224,22 @@ for (i = 0; i < disks; i++) { if (!sh->bh_new[i]) continue; - if (!sh->bh_old[i]) + if (!sh->bh_old[i]) { compute_block(sh, i); + if (lowprio) + mark_buffer_lowprio + (sh->bh_old[i]); + } memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size); } - sh->nr_pending--; + atomic_dec(&sh->nr_pending); complete_stripe(sh); return; } if (nr_failed_overwrite) { sh->phase = PHASE_READ_OLD; - sh->nr_pending = (disks - 1) - nr_cache; - PRINTK(("handle_stripe() %lu, phase READ_OLD, pending %d\n", sh->sector, 
sh->nr_pending)); + md_atomic_set(&sh->nr_pending, (disks - 1) - nr_cache); + PRINTK(("handle_stripe() %lu, phase READ_OLD, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending))); for (i = 0; i < disks; i++) { if (sh->bh_old[i]) continue; @@ -1168,13 +1247,16 @@ continue; sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size); raid5_build_block(sh, sh->bh_old[i], i); + if (lowprio) + mark_buffer_lowprio(sh->bh_old[i]); clear_bit(BH_Lock, &sh->bh_old[i]->b_state); - make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]); + make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_old[i]); } } else { sh->phase = PHASE_READ; - sh->nr_pending = nr_read - nr_cache_overwrite; - PRINTK(("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, sh->nr_pending)); + md_atomic_set(&sh->nr_pending, + nr_read - nr_cache_overwrite); + PRINTK(("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending))); for (i = 0; i < disks; i++) { if (!sh->bh_new[i]) continue; @@ -1182,16 +1264,16 @@ memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size); continue; } - make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_req[i]); + make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_req[i]); } } } } -static int raid5_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh) +static int raid5_make_request (mddev_t *mddev, int rw, struct buffer_head * bh) { - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private; - const unsigned int raid_disks = raid_conf->raid_disks; + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + const unsigned int raid_disks = conf->raid_disks; const unsigned int data_disks = raid_disks - 1; unsigned int dd_idx, pd_idx; unsigned long new_sector; @@ -1202,15 +1284,15 @@ if (rw == WRITEA) rw = WRITE; new_sector = raid5_compute_sector(bh->b_rsector, raid_disks, data_disks, - &dd_idx, &pd_idx, raid_conf); + &dd_idx, &pd_idx, conf); PRINTK(("raid5_make_request, sector %lu\n", new_sector)); repeat: - sh = get_stripe(raid_conf, new_sector, bh->b_size); + sh = get_stripe(conf, new_sector, bh->b_size); if ((rw == READ && sh->cmd == STRIPE_WRITE) || (rw == WRITE && sh->cmd == STRIPE_READ)) { PRINTK(("raid5: lock contention, rw == %d, sh->cmd == %d\n", rw, sh->cmd)); lock_stripe(sh); - if (!sh->nr_pending) + if (!md_atomic_read(&sh->nr_pending)) handle_stripe(sh); goto repeat; } @@ -1221,24 +1303,24 @@ printk("raid5: bug: stripe->bh_new[%d], sector %lu exists\n", dd_idx, sh->sector); printk("raid5: bh %p, bh_new %p\n", bh, sh->bh_new[dd_idx]); lock_stripe(sh); - md_wakeup_thread(raid_conf->thread); + md_wakeup_thread(conf->thread); wait_on_stripe(sh); goto repeat; } add_stripe_bh(sh, bh, dd_idx, rw); - md_wakeup_thread(raid_conf->thread); + md_wakeup_thread(conf->thread); return 0; } static void unplug_devices(struct stripe_head *sh) { #if 0 - struct raid5_data *raid_conf = sh->raid_conf; + raid5_conf_t *conf = sh->raid_conf; int i; - for (i = 0; i < raid_conf->raid_disks; i++) - unplug_device(blk_dev + MAJOR(raid_conf->disks[i].dev)); + for (i = 0; i < conf->raid_disks; i++) + unplug_device(blk_dev + MAJOR(conf->disks[i].dev)); #endif } @@ -1252,8 +1334,8 @@ static void raid5d (void *data) { struct stripe_head *sh; - struct raid5_data *raid_conf = data; - struct md_dev *mddev = raid_conf->mddev; + raid5_conf_t *conf = data; + mddev_t *mddev = conf->mddev; int i, handled = 0, unplug = 0; unsigned long flags; @@ -1261,47 +1343,47 @@ if (mddev->sb_dirty) { mddev->sb_dirty = 0; - md_update_sb((int) (mddev - md_dev)); + 
md_update_sb(mddev); } for (i = 0; i < NR_HASH; i++) { repeat: - sh = raid_conf->stripe_hashtbl[i]; + sh = conf->stripe_hashtbl[i]; for (; sh; sh = sh->hash_next) { - if (sh->raid_conf != raid_conf) + if (sh->raid_conf != conf) continue; if (sh->phase == PHASE_COMPLETE) continue; - if (sh->nr_pending) + if (md_atomic_read(&sh->nr_pending)) continue; - if (sh->sector == raid_conf->next_sector) { - raid_conf->sector_count += (sh->size >> 9); - if (raid_conf->sector_count >= 128) + if (sh->sector == conf->next_sector) { + conf->sector_count += (sh->size >> 9); + if (conf->sector_count >= 128) unplug = 1; } else unplug = 1; if (unplug) { - PRINTK(("unplugging devices, sector == %lu, count == %d\n", sh->sector, raid_conf->sector_count)); + PRINTK(("unplugging devices, sector == %lu, count == %d\n", sh->sector, conf->sector_count)); unplug_devices(sh); unplug = 0; - raid_conf->sector_count = 0; + conf->sector_count = 0; } - raid_conf->next_sector = sh->sector + (sh->size >> 9); + conf->next_sector = sh->sector + (sh->size >> 9); handled++; handle_stripe(sh); goto repeat; } } - if (raid_conf) { - PRINTK(("%d stripes handled, nr_handle %d\n", handled, atomic_read(&raid_conf->nr_handle))); + if (conf) { + PRINTK(("%d stripes handled, nr_handle %d\n", handled, md_atomic_read(&conf->nr_handle))); save_flags(flags); cli(); - if (!atomic_read(&raid_conf->nr_handle)) - clear_bit(THREAD_WAKEUP, &raid_conf->thread->flags); + if (!md_atomic_read(&conf->nr_handle)) + clear_bit(THREAD_WAKEUP, &conf->thread->flags); + restore_flags(flags); } PRINTK(("--- raid5d inactive\n")); } -#if SUPPORT_RECONSTRUCTION /* * Private kernel thread for parity reconstruction after an unclean * shutdown. Reconstruction on spare drives in case of a failed drive @@ -1309,44 +1391,64 @@ */ static void raid5syncd (void *data) { - struct raid5_data *raid_conf = data; - struct md_dev *mddev = raid_conf->mddev; + raid5_conf_t *conf = data; + mddev_t *mddev = conf->mddev; - if (!raid_conf->resync_parity) + if (!conf->resync_parity) + return; + if (conf->resync_parity == 2) + return; + down(&mddev->recovery_sem); + if (md_do_sync(mddev,NULL)) { + up(&mddev->recovery_sem); + printk("raid5: resync aborted!\n"); return; - md_do_sync(mddev); - raid_conf->resync_parity = 0; + } + conf->resync_parity = 0; + up(&mddev->recovery_sem); + printk("raid5: resync finished.\n"); } -#endif /* SUPPORT_RECONSTRUCTION */ -static int __check_consistency (struct md_dev *mddev, int row) +static int __check_consistency (mddev_t *mddev, int row) { - struct raid5_data *raid_conf = mddev->private; + raid5_conf_t *conf = mddev->private; kdev_t dev; struct buffer_head *bh[MD_SB_DISKS], tmp; - int i, rc = 0, nr = 0; + int i, rc = 0, nr = 0, count; + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS]; - if (raid_conf->working_disks != raid_conf->raid_disks) + if (conf->working_disks != conf->raid_disks) return 0; tmp.b_size = 4096; if ((tmp.b_data = (char *) get_free_page(GFP_KERNEL)) == NULL) return 0; + md_clear_page((unsigned long)tmp.b_data); memset(bh, 0, MD_SB_DISKS * sizeof(struct buffer_head *)); - for (i = 0; i < raid_conf->raid_disks; i++) { - dev = raid_conf->disks[i].dev; + for (i = 0; i < conf->raid_disks; i++) { + dev = conf->disks[i].dev; set_blocksize(dev, 4096); if ((bh[i] = bread(dev, row / 4, 4096)) == NULL) break; nr++; } - if (nr == raid_conf->raid_disks) { - for (i = 1; i < nr; i++) - xor_block(&tmp, bh[i]); + if (nr == conf->raid_disks) { + bh_ptr[0] = &tmp; + count = 1; + for (i = 1; i < nr; i++) { + bh_ptr[count++] = bh[i]; + if (count == 
MAX_XOR_BLOCKS) { + xor_block(count, &bh_ptr[0]); + count = 1; + } + } + if (count != 1) { + xor_block(count, &bh_ptr[0]); + } if (memcmp(tmp.b_data, bh[0]->b_data, 4096)) rc = 1; } - for (i = 0; i < raid_conf->raid_disks; i++) { - dev = raid_conf->disks[i].dev; + for (i = 0; i < conf->raid_disks; i++) { + dev = conf->disks[i].dev; if (bh[i]) { bforget(bh[i]); bh[i] = NULL; @@ -1358,285 +1460,607 @@ return rc; } -static int check_consistency (struct md_dev *mddev) +static int check_consistency (mddev_t *mddev) { - int size = mddev->sb->size; - int row; + if (__check_consistency(mddev, 0)) +/* + * We are not checking this currently, as it's legitimate to have + * an inconsistent array, at creation time. + */ + return 0; - for (row = 0; row < size; row += size / 8) - if (__check_consistency(mddev, row)) - return 1; return 0; } -static int raid5_run (int minor, struct md_dev *mddev) +static int raid5_run (mddev_t *mddev) { - struct raid5_data *raid_conf; + raid5_conf_t *conf; int i, j, raid_disk, memory; - md_superblock_t *sb = mddev->sb; - md_descriptor_t *descriptor; - struct real_dev *realdev; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *desc; + mdk_rdev_t *rdev; + struct disk_info *disk; + struct md_list_head *tmp; + int start_recovery = 0; MOD_INC_USE_COUNT; if (sb->level != 5 && sb->level != 4) { - printk("raid5: %s: raid level not set to 4/5 (%d)\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->level); + printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level); MOD_DEC_USE_COUNT; return -EIO; } - mddev->private = kmalloc (sizeof (struct raid5_data), GFP_KERNEL); - if ((raid_conf = mddev->private) == NULL) + mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL); + if ((conf = mddev->private) == NULL) goto abort; - memset (raid_conf, 0, sizeof (*raid_conf)); - raid_conf->mddev = mddev; + memset (conf, 0, sizeof (*conf)); + conf->mddev = mddev; - if ((raid_conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL) + if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL) goto abort; - memset(raid_conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); + memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); - init_waitqueue(&raid_conf->wait_for_stripe); - PRINTK(("raid5_run(%d) called.\n", minor)); - - for (i = 0; i < mddev->nb_dev; i++) { - realdev = &mddev->devices[i]; - if (!realdev->sb) { - printk(KERN_ERR "raid5: disabled device %s (couldn't access raid superblock)\n", kdevname(realdev->dev)); - continue; - } + init_waitqueue(&conf->wait_for_stripe); + PRINTK(("raid5_run(md%d) called.\n", mdidx(mddev))); + ITERATE_RDEV(mddev,rdev,tmp) { /* * This is important -- we are using the descriptor on * the disk only to get a pointer to the descriptor on * the main superblock, which might be more recent. 
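 *
 * Editor's note (a sketch, not part of the patch): the branches below
 * reduce to a three-way classification on the descriptor's state
 * bits, using the same helpers the patch itself relies on:
 *
 *	desc = sb->disks + rdev->desc_nr;
 *	if (disk_faulty(desc))
 *		mark the slot failed    (operational = 0, used_slot = 1)
 *	else if (disk_active(desc))
 *		mark the slot working   (operational = 1, used_slot = 1)
 *		-- an active descriptor that is not disk_sync() is
 *		   treated as a bug and aborts the run
 *	else
 *		record it as a spare    (spare = 1, used_slot = 1)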
*/ - descriptor = &sb->disks[realdev->sb->descriptor.number]; - if (descriptor->state & (1 << MD_FAULTY_DEVICE)) { - printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", kdevname(realdev->dev)); + desc = sb->disks + rdev->desc_nr; + raid_disk = desc->raid_disk; + disk = conf->disks + raid_disk; + + if (disk_faulty(desc)) { + printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev)); + if (!rdev->faulty) { + MD_BUG(); + goto abort; + } + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = rdev->dev; + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; continue; } - if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) { - if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) { - printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", kdevname(realdev->dev)); - continue; + if (disk_active(desc)) { + if (!disk_sync(desc)) { + printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev)); + MD_BUG(); + goto abort; } - raid_disk = descriptor->raid_disk; - if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) { - printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", kdevname(realdev->dev)); + if (raid_disk > sb->raid_disks) { + printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev)); continue; } - if (raid_conf->disks[raid_disk].operational) { - printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", kdevname(realdev->dev), raid_disk); + if (disk->operational) { + printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", partition_name(rdev->dev), raid_disk); continue; } - printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", kdevname(realdev->dev), raid_disk); + printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk); - raid_conf->disks[raid_disk].number = descriptor->number; - raid_conf->disks[raid_disk].raid_disk = raid_disk; - raid_conf->disks[raid_disk].dev = mddev->devices[i].dev; - raid_conf->disks[raid_disk].operational = 1; + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = rdev->dev; + disk->operational = 1; + disk->used_slot = 1; - raid_conf->working_disks++; + conf->working_disks++; } else { /* * Must be a spare disk .. 
*/ - printk(KERN_INFO "raid5: spare disk %s\n", kdevname(realdev->dev)); - raid_disk = descriptor->raid_disk; - raid_conf->disks[raid_disk].number = descriptor->number; - raid_conf->disks[raid_disk].raid_disk = raid_disk; - raid_conf->disks[raid_disk].dev = mddev->devices [i].dev; - - raid_conf->disks[raid_disk].operational = 0; - raid_conf->disks[raid_disk].write_only = 0; - raid_conf->disks[raid_disk].spare = 1; - } - } - raid_conf->raid_disks = sb->raid_disks; - raid_conf->failed_disks = raid_conf->raid_disks - raid_conf->working_disks; - raid_conf->mddev = mddev; - raid_conf->chunk_size = sb->chunk_size; - raid_conf->level = sb->level; - raid_conf->algorithm = sb->parity_algorithm; - raid_conf->max_nr_stripes = NR_STRIPES; + printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev)); + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = rdev->dev; - if (raid_conf->working_disks != sb->raid_disks && sb->state != (1 << MD_SB_CLEAN)) { - printk(KERN_ALERT "raid5: raid set %s not clean and not all disks are operational -- run ckraid\n", kdevname(MKDEV(MD_MAJOR, minor))); - goto abort; + disk->operational = 0; + disk->write_only = 0; + disk->spare = 1; + disk->used_slot = 1; + } } - if (!raid_conf->chunk_size || raid_conf->chunk_size % 4) { - printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", raid_conf->chunk_size, kdevname(MKDEV(MD_MAJOR, minor))); + + for (i = 0; i < MD_SB_DISKS; i++) { + desc = sb->disks + i; + raid_disk = desc->raid_disk; + disk = conf->disks + raid_disk; + + if (disk_faulty(desc) && (raid_disk < sb->raid_disks) && + !conf->disks[raid_disk].used_slot) { + + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = MKDEV(0,0); + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + } + } + + conf->raid_disks = sb->raid_disks; + /* + * 0 for a fully functional array, 1 for a degraded array. 
+ */ + conf->failed_disks = conf->raid_disks - conf->working_disks; + conf->mddev = mddev; + conf->chunk_size = sb->chunk_size; + conf->level = sb->level; + conf->algorithm = sb->layout; + conf->max_nr_stripes = NR_STRIPES; + +#if 0 + for (i = 0; i < conf->raid_disks; i++) { + if (!conf->disks[i].used_slot) { + MD_BUG(); + goto abort; + } + } +#endif + if (!conf->chunk_size || conf->chunk_size % 4) { + printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev)); goto abort; } - if (raid_conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { - printk(KERN_ERR "raid5: unsupported parity algorithm %d for %s\n", raid_conf->algorithm, kdevname(MKDEV(MD_MAJOR, minor))); + if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { + printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev)); goto abort; } - if (raid_conf->failed_disks > 1) { - printk(KERN_ERR "raid5: not enough operational devices for %s (%d/%d failed)\n", kdevname(MKDEV(MD_MAJOR, minor)), raid_conf->failed_disks, raid_conf->raid_disks); + if (conf->failed_disks > 1) { + printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks); goto abort; } - if ((sb->state & (1 << MD_SB_CLEAN)) && check_consistency(mddev)) { - printk(KERN_ERR "raid5: detected raid-5 xor inconsistenty -- run ckraid\n"); - sb->state |= 1 << MD_SB_ERRORS; - goto abort; + if (conf->working_disks != sb->raid_disks) { + printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev)); + start_recovery = 1; } - if ((raid_conf->thread = md_register_thread(raid5d, raid_conf)) == NULL) { - printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor))); - goto abort; + if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN)) && + check_consistency(mddev)) { + printk(KERN_ERR "raid5: detected raid-5 superblock xor inconsistency -- running resync\n"); + sb->state &= ~(1 << MD_SB_CLEAN); } -#if SUPPORT_RECONSTRUCTION - if ((raid_conf->resync_thread = md_register_thread(raid5syncd, raid_conf)) == NULL) { - printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor))); - goto abort; + { + const char * name = "raid5d"; + + conf->thread = md_register_thread(raid5d, conf, name); + if (!conf->thread) { + printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev)); + goto abort; + } } -#endif /* SUPPORT_RECONSTRUCTION */ - memory = raid_conf->max_nr_stripes * (sizeof(struct stripe_head) + - raid_conf->raid_disks * (sizeof(struct buffer_head) + + memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + + conf->raid_disks * (sizeof(struct buffer_head) + 2 * (sizeof(struct buffer_head) + PAGE_SIZE))) / 1024; - if (grow_stripes(raid_conf, raid_conf->max_nr_stripes, GFP_KERNEL)) { + if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) { printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory); - shrink_stripes(raid_conf, raid_conf->max_nr_stripes); + shrink_stripes(conf, conf->max_nr_stripes); goto abort; } else - printk(KERN_INFO "raid5: allocated %dkB for %s\n", memory, kdevname(MKDEV(MD_MAJOR, minor))); + printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev)); /* * Regenerate the "device is in sync with the raid set" bit for * each device. 
*/ - for (i = 0; i < sb->nr_disks ; i++) { - sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE); + for (i = 0; i < MD_SB_DISKS ; i++) { + mark_disk_nonsync(sb->disks + i); for (j = 0; j < sb->raid_disks; j++) { - if (!raid_conf->disks[j].operational) + if (!conf->disks[j].operational) continue; - if (sb->disks[i].number == raid_conf->disks[j].number) - sb->disks[i].state |= 1 << MD_SYNC_DEVICE; + if (sb->disks[i].number == conf->disks[j].number) + mark_disk_sync(sb->disks + i); } } - sb->active_disks = raid_conf->working_disks; + sb->active_disks = conf->working_disks; if (sb->active_disks == sb->raid_disks) - printk("raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm); + printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm); else - printk(KERN_ALERT "raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm); + printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm); + + if (!start_recovery && ((sb->state & (1 << MD_SB_CLEAN))==0)) { + const char * name = "raid5syncd"; + + conf->resync_thread = md_register_thread(raid5syncd, conf,name); + if (!conf->resync_thread) { + printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev)); + goto abort; + } - if ((sb->state & (1 << MD_SB_CLEAN)) == 0) { - printk("raid5: raid set %s not clean; re-constructing parity\n", kdevname(MKDEV(MD_MAJOR, minor))); - raid_conf->resync_parity = 1; -#if SUPPORT_RECONSTRUCTION - md_wakeup_thread(raid_conf->resync_thread); -#endif /* SUPPORT_RECONSTRUCTION */ + printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev)); + conf->resync_parity = 1; + md_wakeup_thread(conf->resync_thread); } + print_raid5_conf(conf); + if (start_recovery) + md_recover_arrays(); + print_raid5_conf(conf); + /* Ok, everything is just fine now */ return (0); abort: - if (raid_conf) { - if (raid_conf->stripe_hashtbl) - free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER); - kfree(raid_conf); + if (conf) { + print_raid5_conf(conf); + if (conf->stripe_hashtbl) + free_pages((unsigned long) conf->stripe_hashtbl, + HASH_PAGES_ORDER); + kfree(conf); } mddev->private = NULL; - printk(KERN_ALERT "raid5: failed to run raid set %s\n", kdevname(MKDEV(MD_MAJOR, minor))); + printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev)); MOD_DEC_USE_COUNT; return -EIO; } -static int raid5_stop (int minor, struct md_dev *mddev) +static int raid5_stop_resync (mddev_t *mddev) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + mdk_thread_t *thread = conf->resync_thread; + + if (thread) { + if (conf->resync_parity) { + conf->resync_parity = 2; + md_interrupt_thread(thread); + printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n"); + return 1; + } + return 0; + } + return 0; +} + +static int raid5_restart_resync (mddev_t *mddev) { - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private; + raid5_conf_t *conf = mddev_to_conf(mddev); - shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes); - shrink_stripes(raid_conf, raid_conf->max_nr_stripes); - 
md_unregister_thread(raid_conf->thread); -#if SUPPORT_RECONSTRUCTION - md_unregister_thread(raid_conf->resync_thread); -#endif /* SUPPORT_RECONSTRUCTION */ - free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER); - kfree(raid_conf); + if (conf->resync_parity) { + if (!conf->resync_thread) { + MD_BUG(); + return 0; + } + printk("raid5: waking up raid5resync.\n"); + conf->resync_parity = 1; + md_wakeup_thread(conf->resync_thread); + return 1; + } else + printk("raid5: no restart-resync needed.\n"); + return 0; +} + + +static int raid5_stop (mddev_t *mddev) +{ + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + + shrink_stripe_cache(conf, conf->max_nr_stripes); + shrink_stripes(conf, conf->max_nr_stripes); + md_unregister_thread(conf->thread); + if (conf->resync_thread) + md_unregister_thread(conf->resync_thread); + free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); + kfree(conf); mddev->private = NULL; MOD_DEC_USE_COUNT; return 0; } -static int raid5_status (char *page, int minor, struct md_dev *mddev) +static int raid5_status (char *page, mddev_t *mddev) { - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private; - md_superblock_t *sb = mddev->sb; + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + mdp_super_t *sb = mddev->sb; int sz = 0, i; - sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->parity_algorithm); - sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks); - for (i = 0; i < raid_conf->raid_disks; i++) - sz += sprintf (page+sz, "%s", raid_conf->disks[i].operational ? "U" : "_"); + sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout); + sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks); + for (i = 0; i < conf->raid_disks; i++) + sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_"); sz += sprintf (page+sz, "]"); return sz; } -static int raid5_mark_spare(struct md_dev *mddev, md_descriptor_t *spare, int state) +static void print_raid5_conf (raid5_conf_t *conf) +{ + int i; + struct disk_info *tmp; + + printk("RAID5 conf printout:\n"); + if (!conf) { + printk("(conf==NULL)\n"); + return; + } + printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks, + conf->working_disks, conf->failed_disks); + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n", + i, tmp->spare,tmp->operational, + tmp->number,tmp->raid_disk,tmp->used_slot, + partition_name(tmp->dev)); + } +} + +static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state) { - int i = 0, failed_disk = -1; - struct raid5_data *raid_conf = mddev->private; - struct disk_info *disk = raid_conf->disks; + int err = 0; + int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1; + raid5_conf_t *conf = mddev->private; + struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk; unsigned long flags; - md_superblock_t *sb = mddev->sb; - md_descriptor_t *descriptor; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *failed_desc, *spare_desc, *added_desc; - for (i = 0; i < MD_SB_DISKS; i++, disk++) { - if (disk->spare && disk->number == spare->number) - goto found; - } - return 1; -found: - for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++) - if (!disk->operational) - failed_disk = i; - if (failed_disk == -1) - return 1; save_flags(flags); cli(); + + print_raid5_conf(conf); + /* + * find the disk ... 
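 *
 * Editor's summary of the lookup phase (a sketch of the switch that
 * follows, included for orientation):
 *
 *	DISKOP_SPARE_ACTIVE     failed slot in [0, raid_disks), then
 *	                        falls through to locate the spare
 *	DISKOP_SPARE_WRITE      spare slot in [raid_disks, MD_SB_DISKS)
 *	DISKOP_SPARE_INACTIVE   likewise
 *	DISKOP_HOT_REMOVE_DISK  matching non-operational slot, any index
 *	                        (an operational match returns -EBUSY)
 *	DISKOP_HOT_ADD_DISK     first unused slot in [raid_disks,
 *	                        MD_SB_DISKS)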
+ */ switch (state) { - case SPARE_WRITE: - disk->operational = 1; - disk->write_only = 1; - raid_conf->spare = disk; - break; - case SPARE_INACTIVE: - disk->operational = 0; - disk->write_only = 0; - raid_conf->spare = NULL; - break; - case SPARE_ACTIVE: - disk->spare = 0; - disk->write_only = 0; - descriptor = &sb->disks[raid_conf->disks[failed_disk].number]; - i = spare->raid_disk; - disk->raid_disk = spare->raid_disk = descriptor->raid_disk; - if (disk->raid_disk != failed_disk) - printk("raid5: disk->raid_disk != failed_disk"); - descriptor->raid_disk = i; - - raid_conf->spare = NULL; - raid_conf->working_disks++; - raid_conf->failed_disks--; - raid_conf->disks[failed_disk] = *disk; - break; - default: - printk("raid5_mark_spare: bug: state == %d\n", state); - restore_flags(flags); - return 1; + case DISKOP_SPARE_ACTIVE: + + /* + * Find the failed disk within the RAID5 configuration ... + * (this can only be in the first conf->raid_disks part) + */ + for (i = 0; i < conf->raid_disks; i++) { + tmp = conf->disks + i; + if ((!tmp->operational && !tmp->spare) || + !tmp->used_slot) { + failed_disk = i; + break; + } + } + /* + * When we activate a spare disk we _must_ have a disk in + * the lower (active) part of the array to replace. + */ + if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + /* fall through */ + + case DISKOP_SPARE_WRITE: + case DISKOP_SPARE_INACTIVE: + + /* + * Find the spare disk ... (can only be in the 'high' + * area of the array) + */ + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + if (tmp->spare && tmp->number == (*d)->number) { + spare_disk = i; + break; + } + } + if (spare_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_REMOVE_DISK: + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + if (tmp->used_slot && (tmp->number == (*d)->number)) { + if (tmp->operational) { + err = -EBUSY; + goto abort; + } + removed_disk = i; + break; + } + } + if (removed_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_ADD_DISK: + + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + if (!tmp->used_slot) { + added_disk = i; + break; + } + } + if (added_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + } + + switch (state) { + /* + * Switch the spare disk to write-only mode: + */ + case DISKOP_SPARE_WRITE: + if (conf->spare) { + MD_BUG(); + err = 1; + goto abort; + } + sdisk = conf->disks + spare_disk; + sdisk->operational = 1; + sdisk->write_only = 1; + conf->spare = sdisk; + break; + /* + * Deactivate a spare disk: + */ + case DISKOP_SPARE_INACTIVE: + sdisk = conf->disks + spare_disk; + sdisk->operational = 0; + sdisk->write_only = 0; + /* + * Was the spare being resynced? + */ + if (conf->spare == sdisk) + conf->spare = NULL; + break; + /* + * Activate (mark read-write) the (now sync) spare disk, + * which means we switch it's 'raid position' (->raid_disk) + * with the failed disk. 
(only the first 'conf->raid_disks' + * slots are used for 'real' disks and we must preserve this + * property) + */ + case DISKOP_SPARE_ACTIVE: + if (!conf->spare) { + MD_BUG(); + err = 1; + goto abort; + } + sdisk = conf->disks + spare_disk; + fdisk = conf->disks + failed_disk; + + spare_desc = &sb->disks[sdisk->number]; + failed_desc = &sb->disks[fdisk->number]; + + if (spare_desc != *d) { + MD_BUG(); + err = 1; + goto abort; + } + + if (spare_desc->raid_disk != sdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (sdisk->raid_disk != spare_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (failed_desc->raid_disk != fdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (fdisk->raid_disk != failed_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + /* + * do the switch finally + */ + xchg_values(*spare_desc, *failed_desc); + xchg_values(*fdisk, *sdisk); + + /* + * (careful, 'failed' and 'spare' are switched from now on) + * + * we want to preserve linear numbering and we want to + * give the proper raid_disk number to the now activated + * disk. (this means we switch back these values) + */ + + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk); + xchg_values(sdisk->raid_disk, fdisk->raid_disk); + xchg_values(spare_desc->number, failed_desc->number); + xchg_values(sdisk->number, fdisk->number); + + *d = failed_desc; + + if (sdisk->dev == MKDEV(0,0)) + sdisk->used_slot = 0; + + /* + * this really activates the spare. + */ + fdisk->spare = 0; + fdisk->write_only = 0; + + /* + * if we activate a spare, we definitely replace a + * non-operational disk slot in the 'low' area of + * the disk array. + */ + conf->failed_disks--; + conf->working_disks++; + conf->spare = NULL; + + break; + + case DISKOP_HOT_REMOVE_DISK: + rdisk = conf->disks + removed_disk; + + if (rdisk->spare && (removed_disk < conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + rdisk->dev = MKDEV(0,0); + rdisk->used_slot = 0; + + break; + + case DISKOP_HOT_ADD_DISK: + adisk = conf->disks + added_disk; + added_desc = *d; + + if (added_disk != added_desc->number) { + MD_BUG(); + err = 1; + goto abort; + } + + adisk->number = added_desc->number; + adisk->raid_disk = added_desc->raid_disk; + adisk->dev = MKDEV(added_desc->major,added_desc->minor); + + adisk->operational = 0; + adisk->write_only = 0; + adisk->spare = 1; + adisk->used_slot = 1; + + + break; + + default: + MD_BUG(); + err = 1; + goto abort; } +abort: restore_flags(flags); - return 0; + print_raid5_conf(conf); + return err; } -static struct md_personality raid5_personality= +static mdk_personality_t raid5_personality= { "raid5", raid5_map, @@ -1648,14 +2072,19 @@ NULL, /* no ioctls */ 0, raid5_error, - /* raid5_hot_add_disk, */ NULL, - /* raid1_hot_remove_drive */ NULL, - raid5_mark_spare + raid5_diskop, + raid5_stop_resync, + raid5_restart_resync }; int raid5_init (void) { - return register_md_personality (RAID5, &raid5_personality); + int err; + + err = register_md_personality (RAID5, &raid5_personality); + if (err) + return err; + return 0; } #ifdef MODULE diff -urN 2.2.18/drivers/block/rd.c 2.2.18aa1/drivers/block/rd.c --- 2.2.18/drivers/block/rd.c Mon Dec 11 16:57:48 2000 +++ 2.2.18aa1/drivers/block/rd.c Mon Dec 11 17:20:52 2000 @@ -173,7 +173,7 @@ if (CURRENT->cmd == READ) memset(CURRENT->buffer, 0, len); else - set_bit(BH_Protected, &CURRENT->bh->b_state); + mark_buffer_protected(CURRENT->bh); end_request(1); goto repeat; diff -urN 2.2.18/drivers/block/translucent.c 2.2.18aa1/drivers/block/translucent.c --- 
2.2.18/drivers/block/translucent.c Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/drivers/block/translucent.c Mon Dec 11 17:20:54 2000 @@ -0,0 +1,136 @@ +/* + translucent.c : Translucent RAID driver for Linux + Copyright (C) 1998 Ingo Molnar + + Translucent mode management functions. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include + +#include +#include + +#include + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER +#define MD_PERSONALITY + +static int translucent_run (mddev_t *mddev) +{ + translucent_conf_t *conf; + mdk_rdev_t *rdev; + int i; + + MOD_INC_USE_COUNT; + + conf = kmalloc (sizeof (*conf), GFP_KERNEL); + if (!conf) + goto out; + mddev->private = conf; + + if (mddev->nb_dev != 2) { + printk("translucent: this mode needs 2 disks, aborting!\n"); + goto out; + } + + if (md_check_ordering(mddev)) { + printk("translucent: disks are not ordered, aborting!\n"); + goto out; + } + + ITERATE_RDEV_ORDERED(mddev,rdev,i) { + dev_info_t *disk = conf->disks + i; + + disk->dev = rdev->dev; + disk->size = rdev->size; + } + + return 0; + +out: + if (conf) + kfree(conf); + + MOD_DEC_USE_COUNT; + return 1; +} + +static int translucent_stop (mddev_t *mddev) +{ + translucent_conf_t *conf = mddev_to_conf(mddev); + + kfree(conf); + + MOD_DEC_USE_COUNT; + + return 0; +} + + +static int translucent_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev, + unsigned long *rsector, unsigned long size) +{ + translucent_conf_t *conf = mddev_to_conf(mddev); + + *rdev = conf->disks[0].dev; + + return 0; +} + +static int translucent_status (char *page, mddev_t *mddev) +{ + int sz = 0; + + sz += sprintf(page+sz, " %d%% full", 10); + return sz; +} + + +static mdk_personality_t translucent_personality= +{ + "translucent", + translucent_map, + NULL, + NULL, + translucent_run, + translucent_stop, + translucent_status, + NULL, + 0, + NULL, + NULL, + NULL, + NULL +}; + +#ifndef MODULE + +md__initfunc(void translucent_init (void)) +{ + register_md_personality (TRANSLUCENT, &translucent_personality); +} + +#else + +int init_module (void) +{ + return (register_md_personality (TRANSLUCENT, &translucent_personality)); +} + +void cleanup_module (void) +{ + unregister_md_personality (TRANSLUCENT); +} + +#endif + diff -urN 2.2.18/drivers/block/xor.c 2.2.18aa1/drivers/block/xor.c --- 2.2.18/drivers/block/xor.c Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/drivers/block/xor.c Mon Dec 11 17:20:54 2000 @@ -0,0 +1,1894 @@ +/* + * xor.c : Multiple Devices driver for Linux + * + * Copyright (C) 1996, 1997, 1998, 1999 Ingo Molnar, Matti Aarnio, Jakub Jelinek + * + * + * optimized RAID-5 checksumming functions. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
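 *
 * Editor's reference point (a sketch, not part of the patch): every
 * optimized routine in this file computes the same thing -- the
 * word-wise XOR of up to four source blocks into the destination
 * block bh_ptr[0], which is how RAID-4/5 parity is generated and
 * repaired. A portable equivalent:
 *
 *	static void xor_block_naive(unsigned int count,
 *				    struct buffer_head **bh_ptr)
 *	{
 *		unsigned long *dest = (unsigned long *) bh_ptr[0]->b_data;
 *		unsigned int words = bh_ptr[0]->b_size / sizeof(long);
 *		unsigned int i, j;
 *
 *		for (i = 1; i < count; i++) {
 *			unsigned long *src =
 *				(unsigned long *) bh_ptr[i]->b_data;
 *			for (j = 0; j < words; j++)
 *				dest[j] ^= src[j];
 *		}
 *	}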
+ */ +#include +#include +#ifdef __sparc_v9__ +#include +#include +#include +#endif + +/* + * we use the 'XOR function template' to register multiple xor + * functions runtime. The kernel measures their speed upon bootup + * and decides which one to use. (compile-time registration is + * not enough as certain CPU features like MMX can only be detected + * runtime) + * + * this architecture makes it pretty easy to add new routines + * that are faster on certain CPUs, without killing other CPU's + * 'native' routine. Although the current routines are belived + * to be the physically fastest ones on all CPUs tested, but + * feel free to prove me wrong and add yet another routine =B-) + * --mingo + */ + +#define MAX_XOR_BLOCKS 5 + +#define XOR_ARGS (unsigned int count, struct buffer_head **bh_ptr) + +typedef void (*xor_block_t) XOR_ARGS; +xor_block_t xor_block = NULL; + +#ifndef __sparc_v9__ + +struct xor_block_template; + +struct xor_block_template { + char * name; + xor_block_t xor_block; + int speed; + struct xor_block_template * next; +}; + +struct xor_block_template * xor_functions = NULL; + +#define XORBLOCK_TEMPLATE(x) \ +static void xor_block_##x XOR_ARGS; \ +static struct xor_block_template t_xor_block_##x = \ + { #x, xor_block_##x, 0, NULL }; \ +static void xor_block_##x XOR_ARGS + +#ifdef __i386__ + +#ifdef CONFIG_X86_XMM +/* + * Cache avoiding checksumming functions utilizing KNI instructions + * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) + */ + +XORBLOCK_TEMPLATE(pIII_kni) +{ + char xmm_save[16*4]; + int cr0; + int lines = (bh_ptr[0]->b_size>>8); + + __asm__ __volatile__ ( + "movl %%cr0,%0 ;\n\t" + "clts ;\n\t" + "movups %%xmm0,(%1) ;\n\t" + "movups %%xmm1,0x10(%1) ;\n\t" + "movups %%xmm2,0x20(%1) ;\n\t" + "movups %%xmm3,0x30(%1) ;\n\t" + : "=r" (cr0) + : "r" (xmm_save) + : "memory" ); + +#define OFFS(x) "8*("#x"*2)" +#define PF0(x) \ + " prefetcht0 "OFFS(x)"(%1) ;\n" +#define LD(x,y) \ + " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n" +#define ST(x,y) \ + " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n" +#define PF1(x) \ + " prefetchnta "OFFS(x)"(%2) ;\n" +#define PF2(x) \ + " prefetchnta "OFFS(x)"(%3) ;\n" +#define PF3(x) \ + " prefetchnta "OFFS(x)"(%4) ;\n" +#define PF4(x) \ + " prefetchnta "OFFS(x)"(%5) ;\n" +#define PF5(x) \ + " prefetchnta "OFFS(x)"(%6) ;\n" +#define XO1(x,y) \ + " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n" +#define XO2(x,y) \ + " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n" +#define XO3(x,y) \ + " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n" +#define XO4(x,y) \ + " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n" +#define XO5(x,y) \ + " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n" + + switch(count) { + case 2: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + PF1(i) \ + PF1(i+2) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF0(i+4) \ + PF0(i+6) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data) + : "memory" ); + break; + case 3: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i+2) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF2(i) \ + PF2(i+2) \ + PF0(i+4) \ + PF0(i+6) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + 
ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " addl $256, %3 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data) + : "memory" ); + break; + case 4: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i+2) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF2(i) \ + PF2(i+2) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + PF3(i) \ + PF3(i+2) \ + PF0(i+4) \ + PF0(i+6) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + XO3(i,0) \ + XO3(i+1,1) \ + XO3(i+2,2) \ + XO3(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " addl $256, %3 ;\n" + " addl $256, %4 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data) + : "memory" ); + break; + case 5: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i+2) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF2(i) \ + PF2(i+2) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + PF3(i) \ + PF3(i+2) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + PF4(i) \ + PF4(i+2) \ + PF0(i+4) \ + PF0(i+6) \ + XO3(i,0) \ + XO3(i+1,1) \ + XO3(i+2,2) \ + XO3(i+3,3) \ + XO4(i,0) \ + XO4(i+1,1) \ + XO4(i+2,2) \ + XO4(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " addl $256, %3 ;\n" + " addl $256, %4 ;\n" + " addl $256, %5 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data), + "r" (bh_ptr[4]->b_data) + : "memory"); + break; + } + + __asm__ __volatile__ ( + "sfence ;\n\t" + "movups (%1),%%xmm0 ;\n\t" + "movups 0x10(%1),%%xmm1 ;\n\t" + "movups 0x20(%1),%%xmm2 ;\n\t" + "movups 0x30(%1),%%xmm3 ;\n\t" + "movl %0,%%cr0 ;\n\t" + : + : "r" (cr0), "r" (xmm_save) + : "memory" ); +} + +#undef OFFS +#undef LD +#undef ST +#undef PF0 +#undef PF1 +#undef PF2 +#undef PF3 +#undef PF4 +#undef PF5 +#undef XO1 +#undef XO2 +#undef XO3 +#undef XO4 +#undef XO5 +#undef BLOCK + +#endif /* CONFIG_X86_XMM */ + +/* + * high-speed RAID5 checksumming functions utilizing MMX instructions + * Copyright (C) 1998 Ingo Molnar + */ +XORBLOCK_TEMPLATE(pII_mmx) +{ + char fpu_save[108]; + int lines = (bh_ptr[0]->b_size>>7); + + if (!(current->flags & PF_USEDFPU)) + __asm__ __volatile__ ( " clts;\n"); + + __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) ); + +#define LD(x,y) \ + " movq 8*("#x")(%1), %%mm"#y" ;\n" +#define ST(x,y) \ + " movq %%mm"#y", 8*("#x")(%1) ;\n" +#define XO1(x,y) \ + " pxor 8*("#x")(%2), %%mm"#y" ;\n" +#define XO2(x,y) \ + " pxor 8*("#x")(%3), %%mm"#y" ;\n" +#define XO3(x,y) \ + " pxor 8*("#x")(%4), %%mm"#y" ;\n" +#define XO4(x,y) \ + " pxor 8*("#x")(%5), %%mm"#y" ;\n" + + switch(count) { + case 2: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + XO1(i,0) \ + ST(i,0) \ + XO1(i+1,1) \ + ST(i+1,1) \ + XO1(i+2,2) \ + 
ST(i+2,2) \ + XO1(i+3,3) \ + ST(i+3,3) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data) + : "memory"); + break; + case 3: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + XO2(i,0) \ + ST(i,0) \ + XO2(i+1,1) \ + ST(i+1,1) \ + XO2(i+2,2) \ + ST(i+2,2) \ + XO2(i+3,3) \ + ST(i+3,3) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " addl $128, %3 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data) + : "memory"); + break; + case 4: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + XO3(i,0) \ + ST(i,0) \ + XO3(i+1,1) \ + ST(i+1,1) \ + XO3(i+2,2) \ + ST(i+2,2) \ + XO3(i+3,3) \ + ST(i+3,3) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " addl $128, %3 ;\n" + " addl $128, %4 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data) + : "memory"); + break; + case 5: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + XO3(i,0) \ + XO3(i+1,1) \ + XO3(i+2,2) \ + XO3(i+3,3) \ + XO4(i,0) \ + ST(i,0) \ + XO4(i+1,1) \ + ST(i+1,1) \ + XO4(i+2,2) \ + ST(i+2,2) \ + XO4(i+3,3) \ + ST(i+3,3) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " addl $128, %3 ;\n" + " addl $128, %4 ;\n" + " addl $128, %5 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data), + "r" (bh_ptr[4]->b_data) + : "memory"); + break; + } + + __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) ); + + if (!(current->flags & PF_USEDFPU)) + stts(); +} + +#undef LD +#undef XO1 +#undef XO2 +#undef XO3 +#undef XO4 +#undef ST +#undef BLOCK + +XORBLOCK_TEMPLATE(p5_mmx) +{ + char fpu_save[108]; + int lines = (bh_ptr[0]->b_size>>6); + + if (!(current->flags & PF_USEDFPU)) + __asm__ __volatile__ ( " clts;\n"); + + __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) ); + + switch(count) { + case 2: + __asm__ __volatile__ ( + + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " movq 16(%1), %%mm2 ;\n" + " movq %%mm0, (%1) ;\n" + " pxor 8(%2), %%mm1 ;\n" + " movq 24(%1), %%mm3 ;\n" + " movq %%mm1, 8(%1) ;\n" + " pxor 16(%2), %%mm2 ;\n" + " movq 32(%1), %%mm4 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 24(%2), %%mm3 ;\n" + " movq 40(%1), %%mm5 ;\n" + " movq %%mm3, 24(%1) ;\n" + " pxor 32(%2), %%mm4 ;\n" + " movq 48(%1), %%mm6 ;\n" + " movq %%mm4, 32(%1) ;\n" + " pxor 40(%2), %%mm5 ;\n" + " movq 56(%1), %%mm7 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 48(%2), %%mm6 
;\n" + " pxor 56(%2), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data) + : "memory" ); + break; + case 3: + __asm__ __volatile__ ( + + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " movq 16(%1), %%mm2 ;\n" + " pxor 8(%2), %%mm1 ;\n" + " pxor (%3), %%mm0 ;\n" + " pxor 16(%2), %%mm2 ;\n" + " movq %%mm0, (%1) ;\n" + " pxor 8(%3), %%mm1 ;\n" + " pxor 16(%3), %%mm2 ;\n" + " movq 24(%1), %%mm3 ;\n" + " movq %%mm1, 8(%1) ;\n" + " movq 32(%1), %%mm4 ;\n" + " movq 40(%1), %%mm5 ;\n" + " pxor 24(%2), %%mm3 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 32(%2), %%mm4 ;\n" + " pxor 24(%3), %%mm3 ;\n" + " pxor 40(%2), %%mm5 ;\n" + " movq %%mm3, 24(%1) ;\n" + " pxor 32(%3), %%mm4 ;\n" + " pxor 40(%3), %%mm5 ;\n" + " movq 48(%1), %%mm6 ;\n" + " movq %%mm4, 32(%1) ;\n" + " movq 56(%1), %%mm7 ;\n" + " pxor 48(%2), %%mm6 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 56(%2), %%mm7 ;\n" + " pxor 48(%3), %%mm6 ;\n" + " pxor 56(%3), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " addl $64, %3 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data) + : "memory" ); + break; + case 4: + __asm__ __volatile__ ( + + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " movq 16(%1), %%mm2 ;\n" + " pxor 8(%2), %%mm1 ;\n" + " pxor (%3), %%mm0 ;\n" + " pxor 16(%2), %%mm2 ;\n" + " pxor 8(%3), %%mm1 ;\n" + " pxor (%4), %%mm0 ;\n" + " movq 24(%1), %%mm3 ;\n" + " pxor 16(%3), %%mm2 ;\n" + " pxor 8(%4), %%mm1 ;\n" + " movq %%mm0, (%1) ;\n" + " movq 32(%1), %%mm4 ;\n" + " pxor 24(%2), %%mm3 ;\n" + " pxor 16(%4), %%mm2 ;\n" + " movq %%mm1, 8(%1) ;\n" + " movq 40(%1), %%mm5 ;\n" + " pxor 32(%2), %%mm4 ;\n" + " pxor 24(%3), %%mm3 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 40(%2), %%mm5 ;\n" + " pxor 32(%3), %%mm4 ;\n" + " pxor 24(%4), %%mm3 ;\n" + " movq %%mm3, 24(%1) ;\n" + " movq 56(%1), %%mm7 ;\n" + " movq 48(%1), %%mm6 ;\n" + " pxor 40(%3), %%mm5 ;\n" + " pxor 32(%4), %%mm4 ;\n" + " pxor 48(%2), %%mm6 ;\n" + " movq %%mm4, 32(%1) ;\n" + " pxor 56(%2), %%mm7 ;\n" + " pxor 40(%4), %%mm5 ;\n" + " pxor 48(%3), %%mm6 ;\n" + " pxor 56(%3), %%mm7 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 48(%4), %%mm6 ;\n" + " pxor 56(%4), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " addl $64, %3 ;\n" + " addl $64, %4 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data) + : "memory" ); + break; + case 5: + __asm__ __volatile__ ( + + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " pxor 8(%2), %%mm1 ;\n" + " movq 16(%1), %%mm2 ;\n" + " pxor (%3), %%mm0 ;\n" + " pxor 8(%3), %%mm1 ;\n" + " pxor 16(%2), %%mm2 ;\n" + " pxor (%4), %%mm0 ;\n" + " pxor 8(%4), %%mm1 ;\n" + " pxor 16(%3), %%mm2 ;\n" + " movq 24(%1), %%mm3 ;\n" + " pxor (%5), %%mm0 ;\n" + " pxor 8(%5), %%mm1 ;\n" + " movq %%mm0, (%1) ;\n" + " pxor 16(%4), %%mm2 ;\n" + " pxor 24(%2), %%mm3 ;\n" + " movq %%mm1, 8(%1) ;\n" + " pxor 16(%5), %%mm2 ;\n" + " pxor 24(%3), %%mm3 ;\n" + " movq 32(%1), %%mm4 ;\n" 
+ " movq %%mm2, 16(%1) ;\n" + " pxor 24(%4), %%mm3 ;\n" + " pxor 32(%2), %%mm4 ;\n" + " movq 40(%1), %%mm5 ;\n" + " pxor 24(%5), %%mm3 ;\n" + " pxor 32(%3), %%mm4 ;\n" + " pxor 40(%2), %%mm5 ;\n" + " movq %%mm3, 24(%1) ;\n" + " pxor 32(%4), %%mm4 ;\n" + " pxor 40(%3), %%mm5 ;\n" + " movq 48(%1), %%mm6 ;\n" + " movq 56(%1), %%mm7 ;\n" + " pxor 32(%5), %%mm4 ;\n" + " pxor 40(%4), %%mm5 ;\n" + " pxor 48(%2), %%mm6 ;\n" + " pxor 56(%2), %%mm7 ;\n" + " movq %%mm4, 32(%1) ;\n" + " pxor 48(%3), %%mm6 ;\n" + " pxor 56(%3), %%mm7 ;\n" + " pxor 40(%5), %%mm5 ;\n" + " pxor 48(%4), %%mm6 ;\n" + " pxor 56(%4), %%mm7 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 48(%5), %%mm6 ;\n" + " pxor 56(%5), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " addl $64, %3 ;\n" + " addl $64, %4 ;\n" + " addl $64, %5 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data), + "r" (bh_ptr[4]->b_data) + : "memory" ); + break; + } + + __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) ); + + if (!(current->flags & PF_USEDFPU)) + stts(); +} +#endif /* __i386__ */ +#endif /* !__sparc_v9__ */ + +#ifdef __sparc_v9__ +/* + * High speed xor_block operation for RAID4/5 utilizing the + * UltraSparc Visual Instruction Set. + * + * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz) + * + * Requirements: + * !(((long)dest | (long)sourceN) & (64 - 1)) && + * !(len & 127) && len >= 256 + * + * It is done in pure assembly, as otherwise gcc makes it + * a non-leaf function, which is not what we want. + * Also, we don't measure the speeds as on other architectures, + * as the measuring routine does not take into account cold caches + * and the fact that xor_block_VIS bypasses the caches. + * xor_block_32regs might be 5% faster for count 2 if caches are hot + * and things just right (for count 3 VIS is about as fast as 32regs for + * hot caches and for count 4 and 5 VIS is faster by good margin always), + * but I think it is better not to pollute the caches. + * Actually, if I'd just fight for speed for hot caches, I could + * write a hybrid VIS/integer routine, which would do always two + * 64B blocks in VIS and two in IEUs, but I really care more about + * caches. 
+ */ +extern void *VISenter(void); +extern void xor_block_VIS XOR_ARGS; + +void __xor_block_VIS(void) +{ +__asm__ (" + .globl xor_block_VIS +xor_block_VIS: + ldx [%%o1 + 0], %%o4 + ldx [%%o1 + 8], %%o3 + ldx [%%o4 + %1], %%g5 + ldx [%%o4 + %0], %%o4 + ldx [%%o3 + %0], %%o3 + rd %%fprs, %%o5 + andcc %%o5, %2, %%g0 + be,pt %%icc, 297f + sethi %%hi(%5), %%g1 + jmpl %%g1 + %%lo(%5), %%g7 + add %%g7, 8, %%g7 +297: wr %%g0, %4, %%fprs + membar #LoadStore|#StoreLoad|#StoreStore + sub %%g5, 64, %%g5 + ldda [%%o4] %3, %%f0 + ldda [%%o3] %3, %%f16 + cmp %%o0, 4 + bgeu,pt %%xcc, 10f + cmp %%o0, 3 + be,pn %%xcc, 13f + mov -64, %%g1 + sub %%g5, 64, %%g5 + rd %%asi, %%g1 + wr %%g0, %3, %%asi + +2: ldda [%%o4 + 64] %%asi, %%f32 + fxor %%f0, %%f16, %%f16 + fxor %%f2, %%f18, %%f18 + fxor %%f4, %%f20, %%f20 + fxor %%f6, %%f22, %%f22 + fxor %%f8, %%f24, %%f24 + fxor %%f10, %%f26, %%f26 + fxor %%f12, %%f28, %%f28 + fxor %%f14, %%f30, %%f30 + stda %%f16, [%%o4] %3 + ldda [%%o3 + 64] %%asi, %%f48 + ldda [%%o4 + 128] %%asi, %%f0 + fxor %%f32, %%f48, %%f48 + fxor %%f34, %%f50, %%f50 + add %%o4, 128, %%o4 + fxor %%f36, %%f52, %%f52 + add %%o3, 128, %%o3 + fxor %%f38, %%f54, %%f54 + subcc %%g5, 128, %%g5 + fxor %%f40, %%f56, %%f56 + fxor %%f42, %%f58, %%f58 + fxor %%f44, %%f60, %%f60 + fxor %%f46, %%f62, %%f62 + stda %%f48, [%%o4 - 64] %%asi + bne,pt %%xcc, 2b + ldda [%%o3] %3, %%f16 + + ldda [%%o4 + 64] %%asi, %%f32 + fxor %%f0, %%f16, %%f16 + fxor %%f2, %%f18, %%f18 + fxor %%f4, %%f20, %%f20 + fxor %%f6, %%f22, %%f22 + fxor %%f8, %%f24, %%f24 + fxor %%f10, %%f26, %%f26 + fxor %%f12, %%f28, %%f28 + fxor %%f14, %%f30, %%f30 + stda %%f16, [%%o4] %3 + ldda [%%o3 + 64] %%asi, %%f48 + membar #Sync + fxor %%f32, %%f48, %%f48 + fxor %%f34, %%f50, %%f50 + fxor %%f36, %%f52, %%f52 + fxor %%f38, %%f54, %%f54 + fxor %%f40, %%f56, %%f56 + fxor %%f42, %%f58, %%f58 + fxor %%f44, %%f60, %%f60 + fxor %%f46, %%f62, %%f62 + stda %%f48, [%%o4 + 64] %%asi + membar #Sync|#StoreStore|#StoreLoad + wr %%g0, 0, %%fprs + retl + wr %%g1, %%g0, %%asi + +13: ldx [%%o1 + 16], %%o2 + ldx [%%o2 + %0], %%o2 + +3: ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f48 + fxor %%f2, %%f18, %%f50 + add %%o4, 64, %%o4 + fxor %%f4, %%f20, %%f52 + fxor %%f6, %%f22, %%f54 + add %%o3, 64, %%o3 + fxor %%f8, %%f24, %%f56 + fxor %%f10, %%f26, %%f58 + fxor %%f12, %%f28, %%f60 + fxor %%f14, %%f30, %%f62 + ldda [%%o4] %3, %%f0 + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + add %%o2, 64, %%o2 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + subcc %%g5, 64, %%g5 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + stda %%f48, [%%o4 + %%g1] %3 + bne,pt %%xcc, 3b + ldda [%%o3] %3, %%f16 + + ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f48 + fxor %%f2, %%f18, %%f50 + fxor %%f4, %%f20, %%f52 + fxor %%f6, %%f22, %%f54 + fxor %%f8, %%f24, %%f56 + fxor %%f10, %%f26, %%f58 + fxor %%f12, %%f28, %%f60 + fxor %%f14, %%f30, %%f62 + membar #Sync + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + stda %%f48, [%%o4] %3 + membar #Sync|#StoreStore|#StoreLoad + retl + wr %%g0, 0, %%fprs + +10: cmp %%o0, 5 + be,pt %%xcc, 15f + mov -64, %%g1 + +14: ldx [%%o1 + 16], %%o2 + ldx [%%o1 + 24], %%o0 + ldx [%%o2 + %0], %%o2 + ldx [%%o0 + %0], %%o0 + +4: ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f16 + fxor %%f2, %%f18, %%f18 + add %%o4, 64, %%o4 + fxor %%f4, 
%%f20, %%f20 + fxor %%f6, %%f22, %%f22 + add %%o3, 64, %%o3 + fxor %%f8, %%f24, %%f24 + fxor %%f10, %%f26, %%f26 + fxor %%f12, %%f28, %%f28 + fxor %%f14, %%f30, %%f30 + ldda [%%o0] %3, %%f48 + fxor %%f16, %%f32, %%f32 + fxor %%f18, %%f34, %%f34 + fxor %%f20, %%f36, %%f36 + fxor %%f22, %%f38, %%f38 + add %%o2, 64, %%o2 + fxor %%f24, %%f40, %%f40 + fxor %%f26, %%f42, %%f42 + fxor %%f28, %%f44, %%f44 + fxor %%f30, %%f46, %%f46 + ldda [%%o4] %3, %%f0 + fxor %%f32, %%f48, %%f48 + fxor %%f34, %%f50, %%f50 + fxor %%f36, %%f52, %%f52 + add %%o0, 64, %%o0 + fxor %%f38, %%f54, %%f54 + fxor %%f40, %%f56, %%f56 + fxor %%f42, %%f58, %%f58 + subcc %%g5, 64, %%g5 + fxor %%f44, %%f60, %%f60 + fxor %%f46, %%f62, %%f62 + stda %%f48, [%%o4 + %%g1] %3 + bne,pt %%xcc, 4b + ldda [%%o3] %3, %%f16 + + ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f16 + fxor %%f2, %%f18, %%f18 + fxor %%f4, %%f20, %%f20 + fxor %%f6, %%f22, %%f22 + fxor %%f8, %%f24, %%f24 + fxor %%f10, %%f26, %%f26 + fxor %%f12, %%f28, %%f28 + fxor %%f14, %%f30, %%f30 + ldda [%%o0] %3, %%f48 + fxor %%f16, %%f32, %%f32 + fxor %%f18, %%f34, %%f34 + fxor %%f20, %%f36, %%f36 + fxor %%f22, %%f38, %%f38 + fxor %%f24, %%f40, %%f40 + fxor %%f26, %%f42, %%f42 + fxor %%f28, %%f44, %%f44 + fxor %%f30, %%f46, %%f46 + membar #Sync + fxor %%f32, %%f48, %%f48 + fxor %%f34, %%f50, %%f50 + fxor %%f36, %%f52, %%f52 + fxor %%f38, %%f54, %%f54 + fxor %%f40, %%f56, %%f56 + fxor %%f42, %%f58, %%f58 + fxor %%f44, %%f60, %%f60 + fxor %%f46, %%f62, %%f62 + stda %%f48, [%%o4] %3 + membar #Sync|#StoreStore|#StoreLoad + retl + wr %%g0, 0, %%fprs + +15: ldx [%%o1 + 16], %%o2 + ldx [%%o1 + 24], %%o0 + ldx [%%o1 + 32], %%o1 + ldx [%%o2 + %0], %%o2 + ldx [%%o0 + %0], %%o0 + ldx [%%o1 + %0], %%o1 + +5: ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f48 + fxor %%f2, %%f18, %%f50 + add %%o4, 64, %%o4 + fxor %%f4, %%f20, %%f52 + fxor %%f6, %%f22, %%f54 + add %%o3, 64, %%o3 + fxor %%f8, %%f24, %%f56 + fxor %%f10, %%f26, %%f58 + fxor %%f12, %%f28, %%f60 + fxor %%f14, %%f30, %%f62 + ldda [%%o0] %3, %%f16 + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + add %%o2, 64, %%o2 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + ldda [%%o1] %3, %%f32 + fxor %%f48, %%f16, %%f48 + fxor %%f50, %%f18, %%f50 + add %%o0, 64, %%o0 + fxor %%f52, %%f20, %%f52 + fxor %%f54, %%f22, %%f54 + add %%o1, 64, %%o1 + fxor %%f56, %%f24, %%f56 + fxor %%f58, %%f26, %%f58 + fxor %%f60, %%f28, %%f60 + fxor %%f62, %%f30, %%f62 + ldda [%%o4] %3, %%f0 + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + subcc %%g5, 64, %%g5 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + stda %%f48, [%%o4 + %%g1] %3 + bne,pt %%xcc, 5b + ldda [%%o3] %3, %%f16 + + ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f48 + fxor %%f2, %%f18, %%f50 + fxor %%f4, %%f20, %%f52 + fxor %%f6, %%f22, %%f54 + fxor %%f8, %%f24, %%f56 + fxor %%f10, %%f26, %%f58 + fxor %%f12, %%f28, %%f60 + fxor %%f14, %%f30, %%f62 + ldda [%%o0] %3, %%f16 + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + ldda [%%o1] %3, %%f32 + fxor %%f48, %%f16, %%f48 + fxor %%f50, %%f18, %%f50 + fxor %%f52, %%f20, %%f52 + fxor %%f54, %%f22, %%f54 + fxor %%f56, %%f24, %%f56 + fxor %%f58, %%f26, %%f58 
+ fxor %%f60, %%f28, %%f60 + fxor %%f62, %%f30, %%f62 + membar #Sync + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + stda %%f48, [%%o4] %3 + membar #Sync|#StoreStore|#StoreLoad + retl + wr %%g0, 0, %%fprs + " : : + "i" (&((struct buffer_head *)0)->b_data), + "i" (&((struct buffer_head *)0)->b_size), + "i" (FPRS_FEF|FPRS_DU), "i" (ASI_BLK_P), + "i" (FPRS_FEF), "i" (VISenter)); +} +#endif /* __sparc_v9__ */ + +#if defined(__sparc__) && !defined(__sparc_v9__) +/* + * High speed xor_block operation for RAID4/5 utilizing the + * ldd/std SPARC instructions. + * + * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz) + * + */ + +XORBLOCK_TEMPLATE(SPARC) +{ + int size = bh_ptr[0]->b_size; + int lines = size / (sizeof (long)) / 8, i; + long *destp = (long *) bh_ptr[0]->b_data; + long *source1 = (long *) bh_ptr[1]->b_data; + long *source2, *source3, *source4; + + switch (count) { + case 2: + for (i = lines; i > 0; i--) { + __asm__ __volatile__(" + ldd [%0 + 0x00], %%g2 + ldd [%0 + 0x08], %%g4 + ldd [%0 + 0x10], %%o0 + ldd [%0 + 0x18], %%o2 + ldd [%1 + 0x00], %%o4 + ldd [%1 + 0x08], %%l0 + ldd [%1 + 0x10], %%l2 + ldd [%1 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + std %%g2, [%0 + 0x00] + std %%g4, [%0 + 0x08] + std %%o0, [%0 + 0x10] + std %%o2, [%0 + 0x18] + " : : "r" (destp), "r" (source1) : "g2", "g3", "g4", "g5", "o0", + "o1", "o2", "o3", "o4", "o5", "l0", "l1", "l2", "l3", "l4", "l5"); + destp += 8; + source1 += 8; + } + break; + case 3: + source2 = (long *) bh_ptr[2]->b_data; + for (i = lines; i > 0; i--) { + __asm__ __volatile__(" + ldd [%0 + 0x00], %%g2 + ldd [%0 + 0x08], %%g4 + ldd [%0 + 0x10], %%o0 + ldd [%0 + 0x18], %%o2 + ldd [%1 + 0x00], %%o4 + ldd [%1 + 0x08], %%l0 + ldd [%1 + 0x10], %%l2 + ldd [%1 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%2 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%2 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%2 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%2 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + std %%g2, [%0 + 0x00] + std %%g4, [%0 + 0x08] + std %%o0, [%0 + 0x10] + std %%o2, [%0 + 0x18] + " : : "r" (destp), "r" (source1), "r" (source2) + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", + "l0", "l1", "l2", "l3", "l4", "l5"); + destp += 8; + source1 += 8; + source2 += 8; + } + break; + case 4: + source2 = (long *) bh_ptr[2]->b_data; + source3 = (long *) bh_ptr[3]->b_data; + for (i = lines; i > 0; i--) { + __asm__ __volatile__(" + ldd [%0 + 0x00], %%g2 + ldd [%0 + 0x08], %%g4 + ldd [%0 + 0x10], %%o0 + ldd [%0 + 0x18], %%o2 + ldd [%1 + 0x00], %%o4 + ldd [%1 + 0x08], %%l0 + ldd [%1 + 0x10], %%l2 + ldd [%1 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%2 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%2 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%2 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%2 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%3 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 
+ xor %%g5, %%l1, %%g5 + ldd [%3 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%3 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%3 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + std %%g2, [%0 + 0x00] + std %%g4, [%0 + 0x08] + std %%o0, [%0 + 0x10] + std %%o2, [%0 + 0x18] + " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3) + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", + "l0", "l1", "l2", "l3", "l4", "l5"); + destp += 8; + source1 += 8; + source2 += 8; + source3 += 8; + } + break; + case 5: + source2 = (long *) bh_ptr[2]->b_data; + source3 = (long *) bh_ptr[3]->b_data; + source4 = (long *) bh_ptr[4]->b_data; + for (i = lines; i > 0; i--) { + __asm__ __volatile__(" + ldd [%0 + 0x00], %%g2 + ldd [%0 + 0x08], %%g4 + ldd [%0 + 0x10], %%o0 + ldd [%0 + 0x18], %%o2 + ldd [%1 + 0x00], %%o4 + ldd [%1 + 0x08], %%l0 + ldd [%1 + 0x10], %%l2 + ldd [%1 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%2 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%2 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%2 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%2 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%3 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%3 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%3 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%3 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%4 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%4 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%4 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%4 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + std %%g2, [%0 + 0x00] + std %%g4, [%0 + 0x08] + std %%o0, [%0 + 0x10] + std %%o2, [%0 + 0x18] + " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3), "r" (source4) + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", + "l0", "l1", "l2", "l3", "l4", "l5"); + destp += 8; + source1 += 8; + source2 += 8; + source3 += 8; + source4 += 8; + } + break; + } +} +#endif /* __sparc_v[78]__ */ + +#ifndef __sparc_v9__ + +/* + * this one works reasonably on any x86 CPU + * (send me an assembly version for inclusion if you can make it faster) + * + * this one is just as fast as written in pure assembly on x86. + * the reason for this separate version is that the + * fast open-coded xor routine "32reg" produces suboptimal code + * on x86, due to lack of registers. 
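 *
 * Editor's sketch: the boot-time selection pass described at the top
 * of this file is not part of this excerpt; one consistent with the
 * xor_block_template structure declared above (name, function pointer,
 * measured speed, next pointer) would be:
 *
 *	static void pick_fastest_xor(void)
 *	{
 *		struct xor_block_template *f, *best = xor_functions;
 *
 *		for (f = xor_functions; f; f = f->next)
 *			if (f->speed > best->speed)
 *				best = f;
 *		if (best)
 *			xor_block = best->xor_block;
 *	}
 *
 * (speed is assumed to have been filled in by a calibration loop at
 * boot, per the registration comment near the top of the file.)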
+ */ +XORBLOCK_TEMPLATE(8regs) +{ + int len = bh_ptr[0]->b_size; + long *destp = (long *) bh_ptr[0]->b_data; + long *source1, *source2, *source3, *source4; + long lines = len / (sizeof (long)) / 8, i; + + switch(count) { + case 2: + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + *(destp + 0) ^= *(source1 + 0); + *(destp + 1) ^= *(source1 + 1); + *(destp + 2) ^= *(source1 + 2); + *(destp + 3) ^= *(source1 + 3); + *(destp + 4) ^= *(source1 + 4); + *(destp + 5) ^= *(source1 + 5); + *(destp + 6) ^= *(source1 + 6); + *(destp + 7) ^= *(source1 + 7); + source1 += 8; + destp += 8; + } + break; + case 3: + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + *(destp + 0) ^= *(source1 + 0); + *(destp + 0) ^= *(source2 + 0); + *(destp + 1) ^= *(source1 + 1); + *(destp + 1) ^= *(source2 + 1); + *(destp + 2) ^= *(source1 + 2); + *(destp + 2) ^= *(source2 + 2); + *(destp + 3) ^= *(source1 + 3); + *(destp + 3) ^= *(source2 + 3); + *(destp + 4) ^= *(source1 + 4); + *(destp + 4) ^= *(source2 + 4); + *(destp + 5) ^= *(source1 + 5); + *(destp + 5) ^= *(source2 + 5); + *(destp + 6) ^= *(source1 + 6); + *(destp + 6) ^= *(source2 + 6); + *(destp + 7) ^= *(source1 + 7); + *(destp + 7) ^= *(source2 + 7); + source1 += 8; + source2 += 8; + destp += 8; + } + break; + case 4: + source3 = (long *) bh_ptr[3]->b_data; + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + *(destp + 0) ^= *(source1 + 0); + *(destp + 0) ^= *(source2 + 0); + *(destp + 0) ^= *(source3 + 0); + *(destp + 1) ^= *(source1 + 1); + *(destp + 1) ^= *(source2 + 1); + *(destp + 1) ^= *(source3 + 1); + *(destp + 2) ^= *(source1 + 2); + *(destp + 2) ^= *(source2 + 2); + *(destp + 2) ^= *(source3 + 2); + *(destp + 3) ^= *(source1 + 3); + *(destp + 3) ^= *(source2 + 3); + *(destp + 3) ^= *(source3 + 3); + *(destp + 4) ^= *(source1 + 4); + *(destp + 4) ^= *(source2 + 4); + *(destp + 4) ^= *(source3 + 4); + *(destp + 5) ^= *(source1 + 5); + *(destp + 5) ^= *(source2 + 5); + *(destp + 5) ^= *(source3 + 5); + *(destp + 6) ^= *(source1 + 6); + *(destp + 6) ^= *(source2 + 6); + *(destp + 6) ^= *(source3 + 6); + *(destp + 7) ^= *(source1 + 7); + *(destp + 7) ^= *(source2 + 7); + *(destp + 7) ^= *(source3 + 7); + source1 += 8; + source2 += 8; + source3 += 8; + destp += 8; + } + break; + case 5: + source4 = (long *) bh_ptr[4]->b_data; + source3 = (long *) bh_ptr[3]->b_data; + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + *(destp + 0) ^= *(source1 + 0); + *(destp + 0) ^= *(source2 + 0); + *(destp + 0) ^= *(source3 + 0); + *(destp + 0) ^= *(source4 + 0); + *(destp + 1) ^= *(source1 + 1); + *(destp + 1) ^= *(source2 + 1); + *(destp + 1) ^= *(source3 + 1); + *(destp + 1) ^= *(source4 + 1); + *(destp + 2) ^= *(source1 + 2); + *(destp + 2) ^= *(source2 + 2); + *(destp + 2) ^= *(source3 + 2); + *(destp + 2) ^= *(source4 + 2); + *(destp + 3) ^= *(source1 + 3); + *(destp + 3) ^= *(source2 + 3); + *(destp + 3) ^= *(source3 + 3); + *(destp + 3) ^= *(source4 + 3); + *(destp + 4) ^= *(source1 + 4); + *(destp + 4) ^= *(source2 + 4); + *(destp + 4) ^= *(source3 + 4); + *(destp + 4) ^= *(source4 + 4); + *(destp + 5) ^= *(source1 + 5); + *(destp + 5) ^= *(source2 + 5); + *(destp + 5) ^= *(source3 + 5); + *(destp + 5) ^= *(source4 + 5); + *(destp + 6) ^= *(source1 + 6); + *(destp + 6) ^= *(source2 + 6); + *(destp + 6) ^= *(source3 + 6); + *(destp + 6) ^= *(source4 + 6); + 
*(destp + 7) ^= *(source1 + 7);
+				*(destp + 7) ^= *(source2 + 7);
+				*(destp + 7) ^= *(source3 + 7);
+				*(destp + 7) ^= *(source4 + 7);
+				source1 += 8;
+				source2 += 8;
+				source3 += 8;
+				source4 += 8;
+				destp += 8;
+			}
+			break;
+	}
+}
+
+/*
+ * platform independent RAID5 checksum calculation, this should
+ * be very fast on any platform that has a decent amount of
+ * registers. (32 or more)
+ */
+XORBLOCK_TEMPLATE(32regs)
+{
+	int size = bh_ptr[0]->b_size;
+	int lines = size / (sizeof (long)) / 8, i;
+	long *destp = (long *) bh_ptr[0]->b_data;
+	long *source1, *source2, *source3, *source4;
+
+	/* LOTS of registers available...
+	   We do explicit loop-unrolling here for code which
+	   favours RISC machines. In fact this is almost direct
+	   RISC assembly on Alpha and SPARC :-) */
+
+
+	switch(count) {
+		case 2:
+			source1 = (long *) bh_ptr[1]->b_data;
+			for (i = lines; i > 0; i--) {
+				register long d0, d1, d2, d3, d4, d5, d6, d7;
+				d0 = destp[0];	/* Pull the stuff into registers */
+				d1 = destp[1];	/* ... in bursts, if possible. */
+				d2 = destp[2];
+				d3 = destp[3];
+				d4 = destp[4];
+				d5 = destp[5];
+				d6 = destp[6];
+				d7 = destp[7];
+				d0 ^= source1[0];
+				d1 ^= source1[1];
+				d2 ^= source1[2];
+				d3 ^= source1[3];
+				d4 ^= source1[4];
+				d5 ^= source1[5];
+				d6 ^= source1[6];
+				d7 ^= source1[7];
+				destp[0] = d0;	/* Store the result (in bursts) */
+				destp[1] = d1;
+				destp[2] = d2;
+				destp[3] = d3;
+				destp[4] = d4;	/* Store the result (in bursts) */
+				destp[5] = d5;
+				destp[6] = d6;
+				destp[7] = d7;
+				source1 += 8;
+				destp += 8;
+			}
+			break;
+		case 3:
+			source2 = (long *) bh_ptr[2]->b_data;
+			source1 = (long *) bh_ptr[1]->b_data;
+			for (i = lines; i > 0; i--) {
+				register long d0, d1, d2, d3, d4, d5, d6, d7;
+				d0 = destp[0];	/* Pull the stuff into registers */
+				d1 = destp[1];	/* ... in bursts, if possible. */
+				d2 = destp[2];
+				d3 = destp[3];
+				d4 = destp[4];
+				d5 = destp[5];
+				d6 = destp[6];
+				d7 = destp[7];
+				d0 ^= source1[0];
+				d1 ^= source1[1];
+				d2 ^= source1[2];
+				d3 ^= source1[3];
+				d4 ^= source1[4];
+				d5 ^= source1[5];
+				d6 ^= source1[6];
+				d7 ^= source1[7];
+				d0 ^= source2[0];
+				d1 ^= source2[1];
+				d2 ^= source2[2];
+				d3 ^= source2[3];
+				d4 ^= source2[4];
+				d5 ^= source2[5];
+				d6 ^= source2[6];
+				d7 ^= source2[7];
+				destp[0] = d0;	/* Store the result (in bursts) */
+				destp[1] = d1;
+				destp[2] = d2;
+				destp[3] = d3;
+				destp[4] = d4;	/* Store the result (in bursts) */
+				destp[5] = d5;
+				destp[6] = d6;
+				destp[7] = d7;
+				source1 += 8;
+				source2 += 8;
+				destp += 8;
+			}
+			break;
+		case 4:
+			source3 = (long *) bh_ptr[3]->b_data;
+			source2 = (long *) bh_ptr[2]->b_data;
+			source1 = (long *) bh_ptr[1]->b_data;
+			for (i = lines; i > 0; i--) {
+				register long d0, d1, d2, d3, d4, d5, d6, d7;
+				d0 = destp[0];	/* Pull the stuff into registers */
+				d1 = destp[1];	/* ... in bursts, if possible. */
+				d2 = destp[2];
+				d3 = destp[3];
+				d4 = destp[4];
+				d5 = destp[5];
+				d6 = destp[6];
+				d7 = destp[7];
+				d0 ^= source1[0];
+				d1 ^= source1[1];
+				d2 ^= source1[2];
+				d3 ^= source1[3];
+				d4 ^= source1[4];
+				d5 ^= source1[5];
+				d6 ^= source1[6];
+				d7 ^= source1[7];
+				d0 ^= source2[0];
+				d1 ^= source2[1];
+				d2 ^= source2[2];
+				d3 ^= source2[3];
+				d4 ^= source2[4];
+				d5 ^= source2[5];
+				d6 ^= source2[6];
+				d7 ^= source2[7];
+				d0 ^= source3[0];
+				d1 ^= source3[1];
+				d2 ^= source3[2];
+				d3 ^= source3[3];
+				d4 ^= source3[4];
+				d5 ^= source3[5];
+				d6 ^= source3[6];
+				d7 ^= source3[7];
+				destp[0] = d0;	/* Store the result (in bursts) */
+				destp[1] = d1;
+				destp[2] = d2;
+				destp[3] = d3;
+				destp[4] = d4;	/* Store the result (in bursts) */
+				destp[5] = d5;
+				destp[6] = d6;
+				destp[7] = d7;
+				source1 += 8;
+				source2 += 8;
+				source3 += 8;
+				destp += 8;
+			}
+			break;
+		case 5:
+			source4 = (long *) bh_ptr[4]->b_data;
+			source3 = (long *) bh_ptr[3]->b_data;
+			source2 = (long *) bh_ptr[2]->b_data;
+			source1 = (long *) bh_ptr[1]->b_data;
+			for (i = lines; i > 0; i--) {
+				register long d0, d1, d2, d3, d4, d5, d6, d7;
+				d0 = destp[0];	/* Pull the stuff into registers */
+				d1 = destp[1];	/* ... in bursts, if possible. */
+				d2 = destp[2];
+				d3 = destp[3];
+				d4 = destp[4];
+				d5 = destp[5];
+				d6 = destp[6];
+				d7 = destp[7];
+				d0 ^= source1[0];
+				d1 ^= source1[1];
+				d2 ^= source1[2];
+				d3 ^= source1[3];
+				d4 ^= source1[4];
+				d5 ^= source1[5];
+				d6 ^= source1[6];
+				d7 ^= source1[7];
+				d0 ^= source2[0];
+				d1 ^= source2[1];
+				d2 ^= source2[2];
+				d3 ^= source2[3];
+				d4 ^= source2[4];
+				d5 ^= source2[5];
+				d6 ^= source2[6];
+				d7 ^= source2[7];
+				d0 ^= source3[0];
+				d1 ^= source3[1];
+				d2 ^= source3[2];
+				d3 ^= source3[3];
+				d4 ^= source3[4];
+				d5 ^= source3[5];
+				d6 ^= source3[6];
+				d7 ^= source3[7];
+				d0 ^= source4[0];
+				d1 ^= source4[1];
+				d2 ^= source4[2];
+				d3 ^= source4[3];
+				d4 ^= source4[4];
+				d5 ^= source4[5];
+				d6 ^= source4[6];
+				d7 ^= source4[7];
+				destp[0] = d0;	/* Store the result (in bursts) */
+				destp[1] = d1;
+				destp[2] = d2;
+				destp[3] = d3;
+				destp[4] = d4;	/* Store the result (in bursts) */
+				destp[5] = d5;
+				destp[6] = d6;
+				destp[7] = d7;
+				source1 += 8;
+				source2 += 8;
+				source3 += 8;
+				source4 += 8;
+				destp += 8;
+			}
+			break;
+	}
+}
+
+/*
+ * (the -6*32 shift factor colors the cache)
+ */
+#define SIZE (PAGE_SIZE-6*32)
+
+static void xor_speed ( struct xor_block_template * func,
+			struct buffer_head *b1, struct buffer_head *b2)
+{
+	int speed;
+	unsigned long now;
+	int i, count, max;
+	struct buffer_head *bh_ptr[6];
+
+	func->next = xor_functions;
+	xor_functions = func;
+	bh_ptr[0] = b1;
+	bh_ptr[1] = b2;
+
+	/*
+	 * count the number of XORs done during a whole jiffy.
+	 * calculate the speed of checksumming from this.
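+	 * e.g. (illustrative figures): with HZ=100 and 4k pages,
+	 * SIZE is 4096 - 192 = 3904 bytes, so HZ*SIZE/1024 == 381;
+	 * a routine managing max = 2000 passes per jiffy is scored
+	 * speed = 762000 and printed below as "762.000 MB/sec".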
+ * (we use a 2-page allocation to have guaranteed + * color L1-cache layout) + */ + max = 0; + for (i = 0; i < 5; i++) { + now = jiffies; + count = 0; + while (jiffies == now) { + mb(); + func->xor_block(2,bh_ptr); + mb(); + count++; + mb(); + } + if (count > max) + max = count; + } + + speed = max * (HZ*SIZE/1024); + func->speed = speed; + + printk( " %-10s: %5d.%03d MB/sec\n", func->name, + speed / 1000, speed % 1000); +} + +static inline void pick_fastest_function(void) +{ + struct xor_block_template *f, *fastest; + + fastest = xor_functions; + for (f = fastest; f; f = f->next) { + if (f->speed > fastest->speed) + fastest = f; + } +#ifdef CONFIG_X86_XMM + if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) { + fastest = &t_xor_block_pIII_kni; + } +#endif + xor_block = fastest->xor_block; + printk( "using fastest function: %s (%d.%03d MB/sec)\n", fastest->name, + fastest->speed / 1000, fastest->speed % 1000); +} + + +void calibrate_xor_block(void) +{ + struct buffer_head b1, b2; + + memset(&b1,0,sizeof(b1)); + b2 = b1; + + b1.b_data = (char *) md__get_free_pages(GFP_KERNEL,2); + if (!b1.b_data) { + pick_fastest_function(); + return; + } + b2.b_data = b1.b_data + 2*PAGE_SIZE + SIZE; + + b1.b_size = SIZE; + + printk(KERN_INFO "raid5: measuring checksumming speed\n"); + + sti(); /* should be safe */ + +#if defined(__sparc__) && !defined(__sparc_v9__) + printk(KERN_INFO "raid5: trying high-speed SPARC checksum routine\n"); + xor_speed(&t_xor_block_SPARC,&b1,&b2); +#endif + +#ifdef CONFIG_X86_XMM + if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) { + printk(KERN_INFO + "raid5: KNI detected, trying cache-avoiding KNI checksum routine\n"); + /* we force the use of the KNI xor block because it + can write around l2. we may also be able + to load into the l1 only depending on how + the cpu deals with a load to a line that is + being prefetched. 
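+		   (KNI stands for the Katmai New Instructions, i.e.
+		   SSE; the cache avoidance of the pIII_kni routine
+		   comes from its movntps non-temporal stores and
+		   prefetchnta prefetches.)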
+ */ + xor_speed(&t_xor_block_pIII_kni,&b1,&b2); + } +#endif /* CONFIG_X86_XMM */ + +#ifdef __i386__ + + if (md_cpu_has_mmx() && !cpu_has_fxsr) { + printk(KERN_INFO + "raid5: MMX detected, trying high-speed MMX checksum routines\n"); + xor_speed(&t_xor_block_pII_mmx,&b1,&b2); + xor_speed(&t_xor_block_p5_mmx,&b1,&b2); + } + +#endif /* __i386__ */ + + + xor_speed(&t_xor_block_8regs,&b1,&b2); + xor_speed(&t_xor_block_32regs,&b1,&b2); + + free_pages((unsigned long)b1.b_data,2); + pick_fastest_function(); +} + +#else /* __sparc_v9__ */ + +void calibrate_xor_block(void) +{ + printk(KERN_INFO "raid5: using high-speed VIS checksum routine\n"); + xor_block = xor_block_VIS; +} + +#endif /* __sparc_v9__ */ + +MD_EXPORT_SYMBOL(xor_block); + diff -urN 2.2.18/drivers/char/Config.in 2.2.18aa1/drivers/char/Config.in --- 2.2.18/drivers/char/Config.in Mon Dec 11 16:57:48 2000 +++ 2.2.18aa1/drivers/char/Config.in Mon Dec 11 17:20:49 2000 @@ -118,6 +118,9 @@ tristate '/dev/nvram support' CONFIG_NVRAM bool 'Enhanced Real Time Clock Support' CONFIG_RTC +if [ "$CONFIG_RTC" = "y" -a "$ARCH" = "alpha" ]; then + bool ' Use only lightweight version (no interrupts)' CONFIG_RTC_LIGHT +fi if [ "$CONFIG_ALPHA_BOOK1" = "y" ]; then bool 'Tadpole ANA H8 Support' CONFIG_H8 fi diff -urN 2.2.18/drivers/char/Makefile 2.2.18aa1/drivers/char/Makefile --- 2.2.18/drivers/char/Makefile Mon Dec 11 16:57:48 2000 +++ 2.2.18aa1/drivers/char/Makefile Mon Dec 11 17:20:48 2000 @@ -20,7 +20,7 @@ O_TARGET := char.o M_OBJS := -O_OBJS := tty_io.o n_tty.o tty_ioctl.o mem.o random.o +O_OBJS := tty_io.o n_tty.o tty_ioctl.o mem.o random.o raw.o OX_OBJS := pty.o misc.o obj-y := obj-m := diff -urN 2.2.18/drivers/char/adbmouse.c 2.2.18aa1/drivers/char/adbmouse.c --- 2.2.18/drivers/char/adbmouse.c Tue Sep 5 02:28:40 2000 +++ 2.2.18aa1/drivers/char/adbmouse.c Mon Dec 11 17:20:44 2000 @@ -135,7 +135,7 @@ mouse.ready = 1; wake_up_interruptible(&mouse.wait); if (mouse.fasyncptr) - kill_fasync(mouse.fasyncptr, SIGIO); + kill_fasync(mouse.fasyncptr, SIGIO, POLL_IN); } static int fasync_mouse(int fd, struct file *filp, int on) diff -urN 2.2.18/drivers/char/amigamouse.c 2.2.18aa1/drivers/char/amigamouse.c --- 2.2.18/drivers/char/amigamouse.c Mon Jan 17 16:44:37 2000 +++ 2.2.18aa1/drivers/char/amigamouse.c Mon Dec 11 17:20:44 2000 @@ -154,7 +154,7 @@ mouse.dy = 2048; if (mouse.fasyncptr) - kill_fasync(mouse.fasyncptr, SIGIO); + kill_fasync(mouse.fasyncptr, SIGIO, POLL_IN); } AMI_MSE_INT_ON(); } diff -urN 2.2.18/drivers/char/atarimouse.c 2.2.18aa1/drivers/char/atarimouse.c --- 2.2.18/drivers/char/atarimouse.c Mon Jan 17 16:44:37 2000 +++ 2.2.18aa1/drivers/char/atarimouse.c Mon Dec 11 17:20:44 2000 @@ -49,7 +49,7 @@ mouse.ready = 1; wake_up_interruptible(&mouse.wait); if (mouse.fasyncptr) - kill_fasync(mouse.fasyncptr, SIGIO); + kill_fasync(mouse.fasyncptr, SIGIO, POLL_IN); /* ikbd_mouse_rel_pos(); */ } diff -urN 2.2.18/drivers/char/atixlmouse.c 2.2.18aa1/drivers/char/atixlmouse.c --- 2.2.18/drivers/char/atixlmouse.c Mon Jan 17 16:44:37 2000 +++ 2.2.18aa1/drivers/char/atixlmouse.c Mon Dec 11 17:20:44 2000 @@ -90,7 +90,7 @@ mouse.ready = 1; wake_up_interruptible(&mouse.wait); if (mouse.fasync) - kill_fasync(mouse.fasync, SIGIO); + kill_fasync(mouse.fasync, SIGIO, POLL_IN); } ATIXL_MSE_ENABLE_UPDATE(); } diff -urN 2.2.18/drivers/char/busmouse.c 2.2.18aa1/drivers/char/busmouse.c --- 2.2.18/drivers/char/busmouse.c Sun Apr 2 21:07:48 2000 +++ 2.2.18aa1/drivers/char/busmouse.c Mon Dec 11 17:20:44 2000 @@ -105,7 +105,7 @@ mouse.dy = 2048; if (mouse.fasyncptr) - 
kill_fasync(mouse.fasyncptr, SIGIO); + kill_fasync(mouse.fasyncptr, SIGIO, POLL_IN); } MSE_INT_ON(); } diff -urN 2.2.18/drivers/char/dn_keyb.c 2.2.18aa1/drivers/char/dn_keyb.c --- 2.2.18/drivers/char/dn_keyb.c Mon Jan 17 16:44:37 2000 +++ 2.2.18aa1/drivers/char/dn_keyb.c Mon Dec 11 17:20:44 2000 @@ -468,7 +468,7 @@ if (mouse_dy > 2048) mouse_dy = 2048; if (mouse_fasyncptr) - kill_fasync(mouse_fasyncptr, SIGIO); + kill_fasync(mouse_fasyncptr, SIGIO, POLL_IN); } mouse_byte_count=0; /* printk("mouse: %d, %d, %x\n",mouse_x,mouse_y,buttons); */ diff -urN 2.2.18/drivers/char/drm/fops.c 2.2.18aa1/drivers/char/drm/fops.c --- 2.2.18/drivers/char/drm/fops.c Mon Dec 11 16:57:48 2000 +++ 2.2.18aa1/drivers/char/drm/fops.c Mon Dec 11 17:20:44 2000 @@ -216,7 +216,7 @@ } #if LINUX_VERSION_CODE < 0x020400 - if (dev->buf_async) kill_fasync(dev->buf_async, SIGIO); + if (dev->buf_async) kill_fasync(dev->buf_async, SIGIO, POLL_IN); #else /* Type of first parameter changed in Linux 2.4.0-test2... */ diff -urN 2.2.18/drivers/char/mem.c 2.2.18aa1/drivers/char/mem.c --- 2.2.18/drivers/char/mem.c Mon Dec 11 16:57:49 2000 +++ 2.2.18aa1/drivers/char/mem.c Mon Dec 11 17:20:48 2000 @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -608,6 +609,7 @@ if (register_chrdev(MEM_MAJOR,"mem",&memory_fops)) printk("unable to get major %d for memory devs\n", MEM_MAJOR); rand_initialize(); + raw_init(); #if defined (CONFIG_FB) fbmem_init(); #endif diff -urN 2.2.18/drivers/char/msbusmouse.c 2.2.18aa1/drivers/char/msbusmouse.c --- 2.2.18/drivers/char/msbusmouse.c Sun Apr 2 21:07:48 2000 +++ 2.2.18aa1/drivers/char/msbusmouse.c Mon Dec 11 17:20:44 2000 @@ -89,7 +89,7 @@ mouse.ready = 1; wake_up_interruptible(&mouse.wait); if (mouse.fasyncptr) - kill_fasync(mouse.fasyncptr, SIGIO); + kill_fasync(mouse.fasyncptr, SIGIO, POLL_IN); } } diff -urN 2.2.18/drivers/char/n_hdlc.c 2.2.18aa1/drivers/char/n_hdlc.c --- 2.2.18/drivers/char/n_hdlc.c Mon Dec 11 16:57:49 2000 +++ 2.2.18aa1/drivers/char/n_hdlc.c Mon Dec 11 17:20:44 2000 @@ -653,11 +653,7 @@ wake_up_interruptible (&n_hdlc->read_wait); wake_up_interruptible (&n_hdlc->poll_wait); if (n_hdlc->tty->fasync != NULL) -#if LINUX_VERSION_CODE < VERSION(2,3,0) - kill_fasync (n_hdlc->tty->fasync, SIGIO); -#else kill_fasync (n_hdlc->tty->fasync, SIGIO, POLL_IN); -#endif } /* end of n_hdlc_tty_receive() */ /* n_hdlc_tty_read() diff -urN 2.2.18/drivers/char/n_tty.c 2.2.18aa1/drivers/char/n_tty.c --- 2.2.18/drivers/char/n_tty.c Tue Sep 5 02:28:41 2000 +++ 2.2.18aa1/drivers/char/n_tty.c Mon Dec 11 17:20:44 2000 @@ -635,7 +635,7 @@ tty->canon_head = tty->read_head; tty->canon_data++; if (tty->fasync) - kill_fasync(tty->fasync, SIGIO); + kill_fasync(tty->fasync, SIGIO, POLL_IN); if (tty->read_wait || tty->poll_wait) { wake_up_interruptible(&tty->read_wait); @@ -743,7 +743,7 @@ if (!tty->icanon && (tty->read_cnt >= tty->minimum_to_wake)) { if (tty->fasync) - kill_fasync(tty->fasync, SIGIO); + kill_fasync(tty->fasync, SIGIO, POLL_IN); if (tty->read_wait||tty->poll_wait) { wake_up_interruptible(&tty->read_wait); diff -urN 2.2.18/drivers/char/pc110pad.c 2.2.18aa1/drivers/char/pc110pad.c --- 2.2.18/drivers/char/pc110pad.c Mon Jan 17 16:44:37 2000 +++ 2.2.18aa1/drivers/char/pc110pad.c Mon Dec 11 17:20:44 2000 @@ -75,7 +75,7 @@ { wake_up_interruptible(&queue); if(asyncptr) - kill_fasync(asyncptr, SIGIO); + kill_fasync(asyncptr, SIGIO, POLL_IN); } diff -urN 2.2.18/drivers/char/pc_keyb.c 2.2.18aa1/drivers/char/pc_keyb.c --- 2.2.18/drivers/char/pc_keyb.c Mon Dec 11 16:57:49 2000 +++ 
2.2.18aa1/drivers/char/pc_keyb.c Mon Dec 11 17:20:44 2000 @@ -424,7 +424,7 @@ if (head != queue->tail) { queue->head = head; if (queue->fasync) - kill_fasync(queue->fasync, SIGIO); + kill_fasync(queue->fasync, SIGIO, POLL_IN); wake_up_interruptible(&queue->proc_list); } } diff -urN 2.2.18/drivers/char/qpmouse.c 2.2.18aa1/drivers/char/qpmouse.c --- 2.2.18/drivers/char/qpmouse.c Mon Dec 11 16:57:49 2000 +++ 2.2.18aa1/drivers/char/qpmouse.c Mon Dec 11 17:20:44 2000 @@ -134,7 +134,7 @@ } queue->head = head; if (queue->fasync) - kill_fasync(queue->fasync, SIGIO); + kill_fasync(queue->fasync, SIGIO, POLL_IN); wake_up_interruptible(&queue->proc_list); } diff -urN 2.2.18/drivers/char/raw.c 2.2.18aa1/drivers/char/raw.c --- 2.2.18/drivers/char/raw.c Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/drivers/char/raw.c Mon Dec 11 17:20:48 2000 @@ -0,0 +1,387 @@ +/* + * linux/drivers/char/raw.c + * + * Front-end raw character devices. These can be bound to any block + * devices to provide genuine Unix raw character device semantics. + * + * We reserve minor number 0 for a control interface. ioctl()s on this + * device are used to bind the other minor numbers to block devices. + */ + +#include +#include +#include +#include +#include +#include + +#define dprintk(x...) + +static kdev_t raw_device_bindings[256] = {}; +static int raw_device_inuse[256] = {}; +static int raw_device_sector_size[256] = {}; +static int raw_device_sector_bits[256] = {}; + +extern struct file_operations * get_blkfops(unsigned int major); + +static ssize_t rw_raw_dev(int rw, struct file *, char *, size_t, loff_t *); + +ssize_t raw_read(struct file *, char *, size_t, loff_t *); +ssize_t raw_write(struct file *, const char *, size_t, loff_t *); +int raw_open(struct inode *, struct file *); +int raw_release(struct inode *, struct file *); +int raw_ctl_ioctl(struct inode *, struct file *, unsigned int, unsigned long); + + +static struct file_operations raw_fops = { + NULL, /* llseek */ + raw_read, /* read */ + raw_write, /* write */ + NULL, /* readdir */ + NULL, /* poll */ + NULL, /* ioctl */ + NULL, /* mmap */ + raw_open, /* open */ + NULL, /* flush */ + raw_release, /* release */ + NULL /* fsync */ +}; + +static struct file_operations raw_ctl_fops = { + NULL, /* llseek */ + NULL, /* read */ + NULL, /* write */ + NULL, /* readdir */ + NULL, /* poll */ + raw_ctl_ioctl, /* ioctl */ + NULL, /* mmap */ + raw_open, /* open */ + NULL, /* flush */ + NULL, /* no special release code */ + NULL /* fsync */ +}; + + + +void __init raw_init(void) +{ + register_chrdev(RAW_MAJOR, "raw", &raw_fops); +} + + +/* + * The raw IO open and release code needs to fake appropriate + * open/release calls to the underlying block devices. + */ + +static int bdev_open(kdev_t dev, int mode) +{ + int err = 0; + struct file dummy_file = {}; + struct dentry dummy_dentry = {}; + struct inode * inode = get_empty_inode(); + + if (!inode) + return -ENOMEM; + + dummy_file.f_op = get_blkfops(MAJOR(dev)); + if (!dummy_file.f_op) { + err = -ENODEV; + goto done; + } + + if (dummy_file.f_op->open) { + inode->i_rdev = dev; + dummy_dentry.d_inode = inode; + dummy_file.f_dentry = &dummy_dentry; + dummy_file.f_mode = mode; + err = dummy_file.f_op->open(inode, &dummy_file); + } + + done: + iput(inode); + return err; +} + +static int bdev_close(kdev_t dev) +{ + int err; + struct inode * inode = get_empty_inode(); + + if (!inode) + return -ENOMEM; + + inode->i_rdev = dev; + err = blkdev_release(inode); + iput(inode); + return err; +} + + + +/* + * Open/close code for raw IO. 
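+ *
+ * Every open of a bound raw minor fakes an open of the underlying
+ * block device via bdev_open() below, and the first opener also
+ * switches the block device to a sector-sized blocksize (or keeps
+ * the mounted blocksize if the device is in use by a filesystem),
+ * so raw IO stays sector-granular without corrupting a live fs.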
+ */ + +int raw_open(struct inode *inode, struct file *filp) +{ + int minor; + kdev_t bdev; + int err; + int sector_size; + int sector_bits; + + minor = MINOR(inode->i_rdev); + + /* + * Is it the control device? + */ + + if (minor == 0) { + filp->f_op = &raw_ctl_fops; + return 0; + } + + /* + * No, it is a normal raw device. All we need to do on open is + * to check that the device is bound, and force the underlying + * block device to a sector-size blocksize. + */ + + bdev = raw_device_bindings[minor]; + if (bdev == NODEV) + return -ENODEV; + + err = bdev_open(bdev, filp->f_mode); + if (err) + return err; + + /* + * Don't change the blocksize if we already have users using + * this device + */ + + if (raw_device_inuse[minor]++) + return 0; + + /* + * Don't interfere with mounted devices: we cannot safely set + * the blocksize on a device which is already mounted. + */ + + sector_size = 512; + if (lookup_vfsmnt(bdev) != NULL) { + if (blksize_size[MAJOR(bdev)]) + sector_size = blksize_size[MAJOR(bdev)][MINOR(bdev)]; + } else { + if (hardsect_size[MAJOR(bdev)]) + sector_size = hardsect_size[MAJOR(bdev)][MINOR(bdev)]; + } + + set_blocksize(bdev, sector_size); + raw_device_sector_size[minor] = sector_size; + + for (sector_bits = 0; !(sector_size & 1); ) + sector_size>>=1, sector_bits++; + raw_device_sector_bits[minor] = sector_bits; + + return 0; +} + +int raw_release(struct inode *inode, struct file *filp) +{ + int minor; + kdev_t bdev; + + minor = MINOR(inode->i_rdev); + bdev = raw_device_bindings[minor]; + bdev_close(bdev); + raw_device_inuse[minor]--; + return 0; +} + + + +/* + * Deal with ioctls against the raw-device control interface, to bind + * and unbind other raw devices. + */ + +int raw_ctl_ioctl(struct inode *inode, + struct file *flip, + unsigned int command, + unsigned long arg) +{ + struct raw_config_request rq; + int err = 0; + int minor; + + switch (command) { + case RAW_SETBIND: + case RAW_GETBIND: + + /* First, find out which raw minor we want */ + + err = copy_from_user(&rq, (void *) arg, sizeof(rq)); + if (err) + break; + + minor = rq.raw_minor; + if (minor == 0 || minor > MINORMASK) { + err = -EINVAL; + break; + } + + if (command == RAW_SETBIND) { + /* + * For now, we don't need to check that the underlying + * block device is present or not: we can do that when + * the raw device is opened. Just check that the + * major/minor numbers make sense. 
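+			 *
+			 * A user-space binder would go something like
+			 * this (sketch only; the control node is the
+			 * char device <RAW_MAJOR, minor 0>, called
+			 * /dev/rawctl here just for illustration):
+			 *
+			 *	struct raw_config_request rq;
+			 *	int fd = open("/dev/rawctl", O_RDWR);
+			 *	rq.raw_minor   = 1;
+			 *	rq.block_major = 8;	/* e.g. sda1... */
+			 *	rq.block_minor = 1;
+			 *	ioctl(fd, RAW_SETBIND, &rq);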
+ */ + + if (rq.block_major == NODEV || + rq.block_major > MAX_BLKDEV || + rq.block_minor > MINORMASK) { + err = -EINVAL; + break; + } + + if (raw_device_inuse[minor]) { + err = -EBUSY; + break; + } + raw_device_bindings[minor] = + MKDEV(rq.block_major, rq.block_minor); + } else { + rq.block_major = MAJOR(raw_device_bindings[minor]); + rq.block_minor = MINOR(raw_device_bindings[minor]); + err = copy_to_user((void *) arg, &rq, sizeof(rq)); + } + break; + + default: + err = -EINVAL; + } + + return err; +} + + + +ssize_t raw_read(struct file *filp, char * buf, + size_t size, loff_t *offp) +{ + return rw_raw_dev(READ, filp, buf, size, offp); +} + +ssize_t raw_write(struct file *filp, const char *buf, + size_t size, loff_t *offp) +{ + return rw_raw_dev(WRITE, filp, (char *) buf, size, offp); +} + +#define SECTOR_BITS 9 +#define SECTOR_SIZE (1U << SECTOR_BITS) +#define SECTOR_MASK (SECTOR_SIZE - 1) + +ssize_t rw_raw_dev(int rw, struct file *filp, char *buf, + size_t size, loff_t *offp) +{ + struct kiobuf * iobuf; + int err; + unsigned long blocknr, blocks; + unsigned long b[KIO_MAX_SECTORS]; + size_t transferred; + int iosize; + int i; + int minor; + kdev_t dev; + unsigned long limit; + + int sector_size, sector_bits, sector_mask; + int max_sectors; + + /* + * First, a few checks on device size limits + */ + + minor = MINOR(filp->f_dentry->d_inode->i_rdev); + dev = raw_device_bindings[minor]; + sector_size = raw_device_sector_size[minor]; + sector_bits = raw_device_sector_bits[minor]; + sector_mask = sector_size- 1; + max_sectors = KIO_MAX_SECTORS >> (sector_bits - 9); + + if (blk_size[MAJOR(dev)]) + limit = (((loff_t) blk_size[MAJOR(dev)][MINOR(dev)]) << BLOCK_SIZE_BITS) >> sector_bits; + else + limit = INT_MAX; + dprintk ("rw_raw_dev: dev %d:%d (+%d)\n", + MAJOR(dev), MINOR(dev), limit); + + if ((*offp & sector_mask) || (size & sector_mask)) + return -EINVAL; + if ((*offp >> sector_bits) >= limit) { + if (size) + return -ENXIO; + return 0; + } + + /* + * We'll just use one kiobuf + */ + + err = alloc_kiovec(1, &iobuf); + if (err) + return err; + + /* + * Split the IO into KIO_MAX_SECTORS chunks, mapping and + * unmapping the single kiobuf as we go to perform each chunk of + * IO. + */ + + transferred = 0; + blocknr = *offp >> sector_bits; + while (size > 0) { + blocks = size >> sector_bits; + if (blocks > max_sectors) + blocks = max_sectors; + if (blocks > limit - blocknr) + blocks = limit - blocknr; + if (!blocks) + break; + + iosize = blocks << sector_bits; + + err = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize); + if (err) + break; + + for (i=0; i < blocks; i++) + b[i] = blocknr++; + + err = brw_kiovec(rw, 1, &iobuf, dev, b, sector_size); + + if (err >= 0) { + transferred += err; + size -= err; + buf += err; + } + + unmap_kiobuf(iobuf); + + if (err != iosize) + break; + } + + free_kiovec(1, &iobuf); + + if (transferred) { + *offp += transferred; + return transferred; + } + + return err; +} diff -urN 2.2.18/drivers/char/rtc.c 2.2.18aa1/drivers/char/rtc.c --- 2.2.18/drivers/char/rtc.c Mon Dec 11 16:57:50 2000 +++ 2.2.18aa1/drivers/char/rtc.c Mon Dec 11 17:20:49 2000 @@ -121,6 +121,7 @@ * (See ./arch/XXXX/kernel/time.c for the set_rtc_mmss() function.) */ +#ifndef CONFIG_RTC_LIGHT static void rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) { /* @@ -141,6 +142,7 @@ if (rtc_status & RTC_TIMER_ON) mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 2*HZ/100); } +#endif /* * Now all the various file operations that we export. 
@@ -154,6 +156,9 @@ static ssize_t rtc_read(struct file *file, char *buf, size_t count, loff_t *ppos) { +#ifdef CONFIG_RTC_LIGHT + return -EIO; +#else struct wait_queue wait = { current, NULL }; unsigned long data; ssize_t retval; @@ -185,6 +190,7 @@ remove_wait_queue(&rtc_wait, &wait); return retval; +#endif } static int rtc_ioctl(struct inode *inode, struct file *file, unsigned int cmd, @@ -195,6 +201,7 @@ struct rtc_time wtime; switch (cmd) { +#ifndef CONFIG_RTC_LIGHT case RTC_AIE_OFF: /* Mask alarm int. enab. bit */ { mask_rtc_irq_bit(RTC_AIE); @@ -242,6 +249,7 @@ set_rtc_irq_bit(RTC_UIE); return 0; } +#endif case RTC_ALM_READ: /* Read the present alarm time */ { /* @@ -379,6 +387,7 @@ { return put_user(rtc_freq, (unsigned long *)arg); } +#ifndef CONFIG_RTC_LIGHT case RTC_IRQP_SET: /* Set periodic IRQ rate. */ { int tmp = 0; @@ -414,6 +423,7 @@ spin_unlock_irqrestore(&rtc_lock, flags); return 0; } +#endif #ifdef __alpha__ case RTC_EPOCH_READ: /* Read the epoch. */ { @@ -463,6 +473,7 @@ * in use, and clear the data. */ +#ifndef CONFIG_RTC_LIGHT unsigned char tmp; unsigned long flags; @@ -481,10 +492,12 @@ } rtc_irq_data = 0; +#endif rtc_status &= ~RTC_IS_OPEN; return 0; } +#ifndef CONFIG_RTC_LIGHT static unsigned int rtc_poll(struct file *file, poll_table *wait) { poll_wait(file, &rtc_wait, wait); @@ -492,6 +505,7 @@ return POLLIN | POLLRDNORM; return 0; } +#endif /* * The various file operations we support. @@ -502,7 +516,11 @@ rtc_read, NULL, /* No write */ NULL, /* No readdir */ +#ifdef CONFIG_RTC_LIGHT + NULL, +#else rtc_poll, +#endif rtc_ioctl, NULL, /* No mmap */ rtc_open, @@ -526,12 +544,14 @@ char *guess = NULL; #endif printk(KERN_INFO "Real Time Clock Driver v%s\n", RTC_VERSION); +#ifndef CONFIG_RTC_LIGHT if(request_irq(RTC_IRQ, rtc_interrupt, SA_INTERRUPT, "rtc", NULL)) { /* Yeah right, seeing as irq 8 doesn't even hit the bus. */ printk(KERN_ERR "rtc: IRQ %d is not free.\n", RTC_IRQ); return -EIO; } +#endif misc_register(&rtc_dev); /* Check region? Naaah! Just snarf it up. */ request_region(RTC_PORT(0), RTC_IO_EXTENT, "rtc"); @@ -557,13 +577,17 @@ if (year > 10 && year < 44) { epoch = 1980; guess = "ARC console"; - } else if (year < 96) { + } else if (year >= 48 && year < 70) { epoch = 1952; guess = "Digital UNIX"; + } else if (year >= 70 && year < 100) { + epoch = 1928; + guess = "Digital DECstation"; } if (guess) printk("rtc: %s epoch (%lu) detected\n", guess, epoch); #endif +#ifndef CONFIG_RTC_LIGHT init_timer(&rtc_irq_timer); rtc_irq_timer.function = rtc_dropped_irq; rtc_wait = NULL; @@ -571,6 +595,7 @@ /* Initialize periodic freq. to CMOS reset default, which is 1024Hz */ CMOS_WRITE(((CMOS_READ(RTC_FREQ_SELECT) & 0xF0) | 0x06), RTC_FREQ_SELECT); spin_unlock_irqrestore(&rtc_lock, flags); +#endif rtc_freq = 1024; return 0; } @@ -587,6 +612,7 @@ * for something that requires a steady > 1KHz signal anyways.) */ +#ifndef CONFIG_RTC_LIGHT void rtc_dropped_irq(unsigned long data) { unsigned long flags; @@ -600,6 +626,7 @@ rtc_irq_data |= (CMOS_READ(RTC_INTR_FLAGS) & 0xF0); /* restart */ spin_unlock_irqrestore(&rtc_lock, flags); } +#endif /* * Info exported via "/proc/rtc". @@ -782,6 +809,7 @@ * meddles with the interrupt enable/disable bits. 
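 * (Under CONFIG_RTC_LIGHT all of these IRQ bit helpers are compiled
 * out, matching the #ifndef stubs around rtc_interrupt() and the
 * periodic/alarm ioctl cases earlier in this file.)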
*/ +#ifndef CONFIG_RTC_LIGHT void mask_rtc_irq_bit(unsigned char bit) { unsigned char val; @@ -809,3 +837,4 @@ rtc_irq_data = 0; spin_unlock_irqrestore(&rtc_lock, flags); } +#endif diff -urN 2.2.18/drivers/char/tty_ioctl.c 2.2.18aa1/drivers/char/tty_ioctl.c --- 2.2.18/drivers/char/tty_ioctl.c Thu May 4 13:00:37 2000 +++ 2.2.18aa1/drivers/char/tty_ioctl.c Mon Dec 11 17:20:54 2000 @@ -96,7 +96,7 @@ old->c_cc[i] : termios->c_cc[i]; } -static void change_termios(struct tty_struct * tty, struct termios * new_termios) +void change_termios(struct tty_struct * tty, struct termios * new_termios) { int canon_change; struct termios old_termios = *tty->termios; diff -urN 2.2.18/drivers/i2o/i2o_config.c 2.2.18aa1/drivers/i2o/i2o_config.c --- 2.2.18/drivers/i2o/i2o_config.c Tue Jun 13 03:48:13 2000 +++ 2.2.18aa1/drivers/i2o/i2o_config.c Mon Dec 11 17:20:44 2000 @@ -164,7 +164,7 @@ // inf->fp, inf->q_id, inf->q_len); if(inf->fasync) - kill_fasync(inf->fasync, SIGIO); + kill_fasync(inf->fasync, SIGIO, POLL_IN); } return; diff -urN 2.2.18/drivers/net/ppp.c 2.2.18aa1/drivers/net/ppp.c --- 2.2.18/drivers/net/ppp.c Tue Sep 5 02:28:43 2000 +++ 2.2.18aa1/drivers/net/ppp.c Mon Dec 11 17:20:44 2000 @@ -2382,7 +2382,7 @@ wake_up_interruptible (&ppp->read_wait); if (ppp->tty->fasync != NULL) - kill_fasync (ppp->tty->fasync, SIGIO); + kill_fasync (ppp->tty->fasync, SIGIO, POLL_IN); return 1; } diff -urN 2.2.18/drivers/sbus/char/pcikbd.c 2.2.18aa1/drivers/sbus/char/pcikbd.c --- 2.2.18/drivers/sbus/char/pcikbd.c Thu May 4 13:00:39 2000 +++ 2.2.18aa1/drivers/sbus/char/pcikbd.c Mon Dec 11 17:20:44 2000 @@ -779,7 +779,7 @@ queue->head = head; aux_ready = 1; if (queue->fasync) - kill_fasync(queue->fasync, SIGIO); + kill_fasync(queue->fasync, SIGIO, POLL_IN); wake_up_interruptible(&queue->proc_list); } diff -urN 2.2.18/drivers/sbus/char/sunkbd.c 2.2.18aa1/drivers/sbus/char/sunkbd.c --- 2.2.18/drivers/sbus/char/sunkbd.c Mon Jan 17 16:44:39 2000 +++ 2.2.18aa1/drivers/sbus/char/sunkbd.c Mon Dec 11 17:20:44 2000 @@ -1278,7 +1278,7 @@ kbd_head = next; } if (kb_fasync) - kill_fasync (kb_fasync, SIGIO); + kill_fasync (kb_fasync, SIGIO, POLL_IN); wake_up_interruptible (&kbd_wait); } diff -urN 2.2.18/drivers/sbus/char/sunmouse.c 2.2.18aa1/drivers/sbus/char/sunmouse.c --- 2.2.18/drivers/sbus/char/sunmouse.c Tue Jun 13 03:48:13 2000 +++ 2.2.18aa1/drivers/sbus/char/sunmouse.c Mon Dec 11 17:20:44 2000 @@ -137,7 +137,7 @@ } sunmouse.ready = 1; if (sunmouse.fasync) - kill_fasync (sunmouse.fasync, SIGIO); + kill_fasync (sunmouse.fasync, SIGIO, POLL_IN); wake_up_interruptible (&sunmouse.proc_list); } @@ -365,7 +365,7 @@ */ sunmouse.ready = 1; if (sunmouse.fasync) - kill_fasync (sunmouse.fasync, SIGIO); + kill_fasync (sunmouse.fasync, SIGIO, POLL_IN); wake_up_interruptible(&sunmouse.proc_list); } return; diff -urN 2.2.18/drivers/scsi/Makefile 2.2.18aa1/drivers/scsi/Makefile --- 2.2.18/drivers/scsi/Makefile Mon Dec 11 16:57:53 2000 +++ 2.2.18aa1/drivers/scsi/Makefile Mon Dec 11 17:20:54 2000 @@ -62,10 +62,12 @@ endif ifeq ($(CONFIG_BLK_DEV_SD),y) -L_OBJS += sd.o sd_ioctl.o +L_OBJS += sd_ioctl.o +LX_OBJS += sd.o else ifeq ($(CONFIG_BLK_DEV_SD),m) M_OBJS += sd_mod.o + MIX_OBJS += sd.o endif endif @@ -729,9 +731,9 @@ megaraid.o: megaraid.c $(CC) $(CFLAGS) -c megaraid.c -scsi_mod.o: $(MIX_OBJS) hosts.o scsi.o scsi_ioctl.o constants.o \ +scsi_mod.o: $(MIX_OBJS:%sd.o=%) hosts.o scsi.o scsi_ioctl.o constants.o \ scsicam.o scsi_proc.o scsi_error.o scsi_obsolete.o scsi_queue.o - $(LD) $(LD_RFLAG) -r -o $@ $(MIX_OBJS) hosts.o scsi.o scsi_ioctl.o \ + 
$(LD) $(LD_RFLAG) -r -o $@ $(MIX_OBJS:%sd.o=%) hosts.o scsi.o scsi_ioctl.o \ constants.o scsicam.o scsi_proc.o \ scsi_error.o scsi_obsolete.o scsi_queue.o \ diff -urN 2.2.18/drivers/scsi/constants.c 2.2.18aa1/drivers/scsi/constants.c --- 2.2.18/drivers/scsi/constants.c Mon Jan 17 16:44:40 2000 +++ 2.2.18aa1/drivers/scsi/constants.c Mon Dec 11 17:20:54 2000 @@ -74,6 +74,14 @@ }; +static const char *group_4_commands[] = { +/* 80-87 */ unknown, unknown, unknown, "Dlock", unknown, unknown, unknown, unknown, +/* 88-8F */ unknown, unknown, unknown, unknown, unknown, unknown, unknown, unknown, +/* 90-97 */ unknown, unknown, unknown, unknown, unknown, unknown, unknown, unknown, +/* 98-9F */ unknown, unknown, unknown, unknown, unknown, unknown, unknown, unknown, +}; + + /* The following are 12 byte commands in group 5 */ static const char *group_5_commands[] = { /* a0-a5 */ unknown, unknown, unknown, unknown, unknown, @@ -97,7 +105,7 @@ static const char **commands[] = { group_0_commands, group_1_commands, group_2_commands, - (const char **) RESERVED_GROUP, (const char **) RESERVED_GROUP, + (const char **) RESERVED_GROUP, group_4_commands, group_5_commands, (const char **) VENDOR_GROUP, (const char **) VENDOR_GROUP }; diff -urN 2.2.18/drivers/scsi/scsi.c 2.2.18aa1/drivers/scsi/scsi.c --- 2.2.18/drivers/scsi/scsi.c Mon Dec 11 16:57:56 2000 +++ 2.2.18aa1/drivers/scsi/scsi.c Mon Dec 11 17:20:54 2000 @@ -120,9 +120,9 @@ */ unsigned long scsi_pid = 0; Scsi_Cmnd * last_cmnd = NULL; -/* Command groups 3 and 4 are reserved and should never be used. */ +/* Command group 3 is reserved and should never be used. */ const unsigned char scsi_command_size[8] = { 6, 10, 10, 12, - 12, 12, 10, 10 }; + 16, 12, 10, 10 }; static unsigned long serial_number = 0; static Scsi_Cmnd * scsi_bh_queue_head = NULL; static Scsi_Cmnd * scsi_bh_queue_tail = NULL; @@ -1486,12 +1486,13 @@ { int i; int target = SCpnt->target; + int size = COMMAND_SIZE(((const unsigned char *)cmnd)[0]); printk ("scsi_do_cmd (host = %d, channel = %d target = %d, " "buffer =%p, bufflen = %d, done = %p, timeout = %d, " "retries = %d)\n" "command : " , host->host_no, SCpnt->channel, target, buffer, bufflen, done, timeout, retries); - for (i = 0; i < 10; ++i) + for (i = 0; i < size; ++i) printk ("%02x ", ((unsigned char *) cmnd)[i]); printk("\n"); }); @@ -1534,7 +1535,10 @@ * the completion function for the high level driver. */ - memcpy ((void *) SCpnt->data_cmnd , (const void *) cmnd, 12); + if (SCpnt->cmd_len == 0) + SCpnt->cmd_len = COMMAND_SIZE(((const unsigned char *)cmnd)[0]); + + memcpy ((void *) SCpnt->data_cmnd , (const void *) cmnd, SCpnt->cmd_len); SCpnt->reset_chain = NULL; SCpnt->serial_number = 0; SCpnt->serial_number_at_timeout = 0; @@ -1546,7 +1550,7 @@ SCpnt->done = done; SCpnt->timeout_per_command = timeout; - memcpy ((void *) SCpnt->cmnd , (const void *) cmnd, 12); + memcpy ((void *) SCpnt->cmnd , (const void *) cmnd, SCpnt->cmd_len); /* Zero the sense buffer. Some host adapters automatically request * sense on error. 0 is not a valid sense code. */ @@ -1554,8 +1558,6 @@ SCpnt->request_buffer = buffer; SCpnt->request_bufflen = bufflen; SCpnt->old_use_sg = SCpnt->use_sg; - if (SCpnt->cmd_len == 0) - SCpnt->cmd_len = COMMAND_SIZE(SCpnt->cmnd[0]); SCpnt->old_cmd_len = SCpnt->cmd_len; /* Start the timer ticking. 
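 	 * (SCpnt->cmd_len above comes from COMMAND_SIZE(), which indexes
 	 * scsi_command_size[] with the opcode's top three bits: READ_10,
 	 * opcode 0x28, is group 1 and gets 10 bytes, while the newly
 	 * described group 4, opcodes 0x80-0x9f, gets 16.)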
*/ diff -urN 2.2.18/drivers/scsi/scsi.h 2.2.18aa1/drivers/scsi/scsi.h --- 2.2.18/drivers/scsi/scsi.h Thu Nov 9 19:41:01 2000 +++ 2.2.18aa1/drivers/scsi/scsi.h Mon Dec 11 19:24:55 2000 @@ -287,7 +287,7 @@ #define DRIVER_MASK 0x0f #define SUGGEST_MASK 0xf0 -#define MAX_COMMAND_SIZE 12 +#define MAX_COMMAND_SIZE 16 /* * SCSI command sets @@ -548,14 +548,14 @@ unsigned char old_cmd_len; /* These elements define the operation we are about to perform */ - unsigned char cmnd[12]; + unsigned char cmnd[MAX_COMMAND_SIZE]; unsigned request_bufflen; /* Actual request size */ struct timer_list eh_timeout; /* Used to time out the command. */ void * request_buffer; /* Actual requested buffer */ /* These elements define the operation we ultimately want to perform */ - unsigned char data_cmnd[12]; + unsigned char data_cmnd[MAX_COMMAND_SIZE]; unsigned short old_use_sg; /* We save use_sg here when requesting * sense info */ unsigned short use_sg; /* Number of pieces of scatter-gather */ diff -urN 2.2.18/drivers/scsi/sd.c 2.2.18aa1/drivers/scsi/sd.c --- 2.2.18/drivers/scsi/sd.c Mon Dec 11 16:57:56 2000 +++ 2.2.18aa1/drivers/scsi/sd.c Mon Dec 11 17:20:54 2000 @@ -83,6 +83,7 @@ struct hd_struct * sd; Scsi_Disk * rscsi_disks = NULL; +EXPORT_SYMBOL(rscsi_disks); static int * sd_sizes; static int * sd_blocksizes; static int * sd_hardsizes; /* Hardware sector size */ @@ -1571,6 +1572,7 @@ sd_gendisks[i].next = sd_gendisks + i + 1; sd_gendisks[i].real_devices = (void *) (rscsi_disks + i * SCSI_DISKS_PER_MAJOR); + sd_gendisks[i].device_names = NULL; } LAST_SD_GENDISK.max_nr = diff -urN 2.2.18/drivers/scsi/sg.c 2.2.18aa1/drivers/scsi/sg.c --- 2.2.18/drivers/scsi/sg.c Mon Dec 11 16:57:56 2000 +++ 2.2.18aa1/drivers/scsi/sg.c Mon Dec 11 17:20:44 2000 @@ -831,7 +831,7 @@ if (sfp && srp) { wake_up_interruptible(&sfp->read_wait); if (sfp->async_qp) - kill_fasync(sfp->async_qp, SIGPOLL); + kill_fasync(sfp->async_qp, SIGIO, POLL_IN); } } diff -urN 2.2.18/drivers/sgi/char/shmiq.c 2.2.18aa1/drivers/sgi/char/shmiq.c --- 2.2.18/drivers/sgi/char/shmiq.c Mon Jan 17 16:44:40 2000 +++ 2.2.18aa1/drivers/sgi/char/shmiq.c Mon Dec 11 17:20:44 2000 @@ -118,7 +118,7 @@ s->tail = tail_next; shmiqs [device].tail = tail_next; if (shmiqs [device].fasync) - kill_fasync (shmiqs [device].fasync, SIGIO); + kill_fasync (shmiqs [device].fasync, SIGIO, POLL_IN); wake_up_interruptible (&shmiqs [device].proc_list); } diff -urN 2.2.18/drivers/telephony/ixj.c 2.2.18aa1/drivers/telephony/ixj.c --- 2.2.18/drivers/telephony/ixj.c Mon Dec 11 16:57:57 2000 +++ 2.2.18aa1/drivers/telephony/ixj.c Mon Dec 11 17:20:44 2000 @@ -536,7 +536,7 @@ extern __inline__ void ixj_kill_fasync(int board) { if (ixj[board].async_queue) - kill_fasync(ixj[board].async_queue, SIGIO); // Send apps notice of change + kill_fasync(ixj[board].async_queue, SIGIO, POLL_IN); // Send apps notice of change } static void ixj_timeout(unsigned long ptr) diff -urN 2.2.18/drivers/usb/evdev.c 2.2.18aa1/drivers/usb/evdev.c --- 2.2.18/drivers/usb/evdev.c Mon Dec 11 16:57:58 2000 +++ 2.2.18aa1/drivers/usb/evdev.c Mon Dec 11 17:20:44 2000 @@ -74,7 +74,7 @@ list->buffer[list->head].value = value; list->head = (list->head + 1) & (EVDEV_BUFFER_SIZE - 1); - kill_fasync(list->fasync, SIGIO); + kill_fasync(list->fasync, SIGIO, POLL_IN); list = list->next; } diff -urN 2.2.18/drivers/usb/inode.c 2.2.18aa1/drivers/usb/inode.c --- 2.2.18/drivers/usb/inode.c Mon Dec 11 16:57:58 2000 +++ 2.2.18aa1/drivers/usb/inode.c Mon Dec 11 17:20:50 2000 @@ -302,14 +302,14 @@ i = filp->f_pos; switch (i) { case 0: - if 
(filldir(dirent, ".", 1, i, IROOT) < 0) + if (filldir(dirent, ".", 1, i, IROOT, DT_DIR) < 0) return 0; filp->f_pos++; i++; /* fall through */ case 1: - if (filldir(dirent, "..", 2, i, IROOT) < 0) + if (filldir(dirent, "..", 2, i, IROOT, DT_DIR) < 0) return 0; filp->f_pos++; i++; @@ -319,7 +319,7 @@ while (i >= 2 && i < 2+NRSPECIAL) { spec = &special[filp->f_pos-2]; - if (filldir(dirent, spec->name, strlen(spec->name), i, ISPECIAL | (filp->f_pos-2+IROOT)) < 0) + if (filldir(dirent, spec->name, strlen(spec->name), i, ISPECIAL | (filp->f_pos-2+IROOT), DT_UNKNOWN) < 0) return 0; filp->f_pos++; i++; @@ -335,7 +335,7 @@ } bus = list_entry(list, struct usb_bus, bus_list); sprintf(numbuf, "%03d", bus->busnum); - if (filldir(dirent, numbuf, 3, filp->f_pos, IBUS | ((bus->busnum & 0xff) << 8)) < 0) + if (filldir(dirent, numbuf, 3, filp->f_pos, IBUS | ((bus->busnum & 0xff) << 8), DT_UNKNOWN) < 0) break; filp->f_pos++; } @@ -355,7 +355,7 @@ if (pos > 0) pos--; else { - if (filldir(dirent, numbuf, 3, filp->f_pos, ino | (dev->devnum & 0xff)) < 0) + if (filldir(dirent, numbuf, 3, filp->f_pos, ino | (dev->devnum & 0xff), DT_UNKNOWN) < 0) return -1; filp->f_pos++; } @@ -380,13 +380,13 @@ return -EINVAL; switch ((unsigned int)filp->f_pos) { case 0: - if (filldir(dirent, ".", 1, filp->f_pos, ino) < 0) + if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0) return 0; filp->f_pos++; /* fall through */ case 1: - if (filldir(dirent, "..", 2, filp->f_pos, IROOT) < 0) + if (filldir(dirent, "..", 2, filp->f_pos, IROOT, DT_DIR) < 0) return 0; filp->f_pos++; /* fall through */ diff -urN 2.2.18/drivers/usb/joydev.c 2.2.18aa1/drivers/usb/joydev.c --- 2.2.18/drivers/usb/joydev.c Mon Dec 11 16:57:58 2000 +++ 2.2.18aa1/drivers/usb/joydev.c Mon Dec 11 17:20:44 2000 @@ -143,7 +143,7 @@ if (list->tail == (list->head = (list->head + 1) & (JOYDEV_BUFFER_SIZE - 1))) list->startup = 0; - kill_fasync(list->fasync, SIGIO); + kill_fasync(list->fasync, SIGIO, POLL_IN); list = list->next; } diff -urN 2.2.18/drivers/usb/mousedev.c 2.2.18aa1/drivers/usb/mousedev.c --- 2.2.18/drivers/usb/mousedev.c Mon Dec 11 16:57:58 2000 +++ 2.2.18aa1/drivers/usb/mousedev.c Mon Dec 11 17:20:44 2000 @@ -143,7 +143,7 @@ list->ready = 1; - kill_fasync(list->fasync, SIGIO); + kill_fasync(list->fasync, SIGIO, POLL_IN); list = list->next; } @@ -321,7 +321,7 @@ list->buffer = list->bufsiz; } - kill_fasync(list->fasync, SIGIO); + kill_fasync(list->fasync, SIGIO, POLL_IN); wake_up_interruptible(&list->mousedev->wait); diff -urN 2.2.18/fs/Makefile 2.2.18aa1/fs/Makefile --- 2.2.18/fs/Makefile Thu Aug 26 14:20:19 1999 +++ 2.2.18aa1/fs/Makefile Mon Dec 11 17:20:48 2000 @@ -13,7 +13,7 @@ O_OBJS = open.o read_write.o devices.o file_table.o buffer.o \ super.o block_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ ioctl.o readdir.o select.o fifo.o locks.o filesystems.o \ - dcache.o inode.o attr.o bad_inode.o file.o $(BINFMTS) + dcache.o inode.o attr.o bad_inode.o file.o iobuf.o $(BINFMTS) MOD_LIST_NAME := FS_MODULES ALL_SUB_DIRS = coda minix ext2 fat msdos vfat proc isofs nfs umsdos ntfs \ diff -urN 2.2.18/fs/adfs/dir.c 2.2.18aa1/fs/adfs/dir.c --- 2.2.18/fs/adfs/dir.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/adfs/dir.c Mon Dec 11 17:20:50 2000 @@ -40,12 +40,12 @@ switch ((unsigned long)filp->f_pos) { case 0: - if (filldir(dirent, ".", 1, 0, inode->i_ino) < 0) + if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) goto free_out; filp->f_pos += 1; case 1: - if (filldir(dirent, "..", 2, 1, dir.parent_id) < 0) + if (filldir(dirent, "..", 2, 1, dir.parent_id, DT_DIR) 
< 0) goto free_out; filp->f_pos += 1; @@ -60,7 +60,7 @@ goto unlock_out; while (ops->getnext(&dir, &obj) == 0) { if (filldir(dirent, obj.name, obj.name_len, - filp->f_pos, obj.file_id) < 0) + filp->f_pos, obj.file_id, DT_UNKNOWN) < 0) goto unlock_out; filp->f_pos += 1; } diff -urN 2.2.18/fs/affs/dir.c 2.2.18aa1/fs/affs/dir.c --- 2.2.18/fs/affs/dir.c Mon Jan 17 16:44:41 2000 +++ 2.2.18aa1/fs/affs/dir.c Mon Dec 11 17:20:50 2000 @@ -98,14 +98,14 @@ if (filp->f_pos == 0) { filp->private_data = (void *)0; - if (filldir(dirent,".",1,filp->f_pos,inode->i_ino) < 0) { + if (filldir(dirent,".",1,filp->f_pos,inode->i_ino, DT_DIR) < 0) { return 0; } ++filp->f_pos; stored++; } if (filp->f_pos == 1) { - if (filldir(dirent,"..",2,filp->f_pos,affs_parent_ino(inode)) < 0) { + if (filldir(dirent,"..",2,filp->f_pos,affs_parent_ino(inode), DT_DIR) < 0) { return stored; } filp->f_pos = 2; @@ -161,7 +161,7 @@ pr_debug("AFFS: readdir(): filldir(\"%.*s\",ino=%lu), i=%d\n", namelen,name,ino,i); filp->private_data = (void *)ino; - if (filldir(dirent,name,namelen,filp->f_pos,ino) < 0) + if (filldir(dirent,name,namelen,filp->f_pos,ino, DT_UNKNOWN) < 0) goto readdir_done; filp->private_data = (void *)i; affs_brelse(fh_bh); diff -urN 2.2.18/fs/affs/file.c 2.2.18aa1/fs/affs/file.c --- 2.2.18/fs/affs/file.c Tue Sep 5 02:28:47 2000 +++ 2.2.18aa1/fs/affs/file.c Mon Dec 11 17:20:50 2000 @@ -582,17 +582,17 @@ affs_file_write(struct file *filp, const char *buf, size_t count, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; - off_t pos; + loff_t pos; ssize_t written; ssize_t c; - ssize_t blocksize; + ssize_t blocksize, blockshift; struct buffer_head *bh; char *p; if (!count) return 0; - pr_debug("AFFS: file_write(ino=%lu,pos=%lu,count=%d)\n",inode->i_ino, - (unsigned long)*ppos,count); + pr_debug("AFFS: file_write(ino=%lu,pos=%Lu,count=%d)\n",inode->i_ino, + *ppos,count); if (!inode) { affs_error(inode->i_sb,"file_write","Inode = NULL"); @@ -611,16 +611,22 @@ else pos = *ppos; written = 0; - blocksize = AFFS_I2BSIZE(inode); + blocksize = AFFS_I2BSIZE(inode); + blockshift = AFFS_I2BITS(inode); + + if (pos >= 0x7fffffff) /* Max size: 2G-1 */ + return -EFBIG; + if ((pos + count) > 0x7fffffff) + count = 0x7fffffff - pos; while (written < count) { - bh = affs_getblock(inode,pos / blocksize); + bh = affs_getblock(inode, pos >> blockshift); if (!bh) { if (!written) written = -ENOSPC; break; } - c = blocksize - (pos % blocksize); + c = blocksize - (pos & (blocksize -1)); if (c > count - written) c = count - written; if (c != blocksize && !buffer_uptodate(bh)) { @@ -633,7 +639,7 @@ break; } } - p = (pos % blocksize) + bh->b_data; + p = (pos & (blocksize -1)) + bh->b_data; c -= copy_from_user(p,buf,c); if (!c) { affs_brelse(bh); @@ -664,7 +670,7 @@ off_t pos; ssize_t written; ssize_t c; - ssize_t blocksize; + ssize_t blocksize, blockshift; struct buffer_head *bh; char *p; @@ -692,15 +698,16 @@ bh = NULL; blocksize = AFFS_I2BSIZE(inode) - 24; + blockshift = AFFS_I2BITS(inode); written = 0; while (written < count) { - bh = affs_getblock(inode,pos / blocksize); + bh = affs_getblock(inode,pos >> blockshift); if (!bh) { if (!written) written = -ENOSPC; break; } - c = blocksize - (pos % blocksize); + c = blocksize - (pos & (blocksize -1)); if (c > count - written) c = count - written; if (c != blocksize && !buffer_uptodate(bh)) { @@ -713,7 +720,7 @@ break; } } - p = (pos % blocksize) + bh->b_data + 24; + p = (pos & (blocksize -1)) + bh->b_data + 24; c -= copy_from_user(p,buf,c); if (!c) { affs_brelse(bh); @@ -782,10 +789,10 @@ 
int rem; int ext; - pr_debug("AFFS: truncate(inode=%ld,size=%lu)\n",inode->i_ino,inode->i_size); + pr_debug("AFFS: truncate(inode=%ld,size=%Lu)\n",inode->i_ino,inode->i_size); net_blocksize = blocksize - ((inode->i_sb->u.affs_sb.s_flags & SF_OFS) ? 24 : 0); - first = (inode->i_size + net_blocksize - 1) / net_blocksize; + first = (u_long)(inode->i_size + net_blocksize - 1) / net_blocksize; if (inode->u.affs_i.i_lastblock < first - 1) { /* There has to be at least one new block to be allocated */ if (!inode->u.affs_i.i_ec && alloc_ext_cache(inode)) { @@ -795,9 +802,9 @@ bh = affs_getblock(inode,first - 1); if (!bh) { affs_warning(inode->i_sb,"truncate","Cannot extend file"); - inode->i_size = net_blocksize * (inode->u.affs_i.i_lastblock + 1); + inode->i_size = (inode->u.affs_i.i_lastblock + 1) * net_blocksize; } else if (inode->i_sb->u.affs_sb.s_flags & SF_OFS) { - rem = inode->i_size % net_blocksize; + rem = ((u_long)inode->i_size) & (net_blocksize -1); DATA_FRONT(bh)->data_size = cpu_to_be32(rem ? rem : net_blocksize); affs_fix_checksum(blocksize,bh->b_data,5); mark_buffer_dirty(bh,0); @@ -864,7 +871,7 @@ affs_free_block(inode->i_sb,ekey); ekey = key; } - block = ((inode->i_size + net_blocksize - 1) / net_blocksize) - 1; + block = (((u_long)inode->i_size + net_blocksize - 1) / net_blocksize) - 1; inode->u.affs_i.i_lastblock = block; /* If the file is not truncated to a block boundary, @@ -872,7 +879,7 @@ * so it cannot become accessible again. */ - rem = inode->i_size % net_blocksize; + rem = inode->i_size & (net_blocksize -1); if (rem) { if ((inode->i_sb->u.affs_sb.s_flags & SF_OFS)) rem += 24; diff -urN 2.2.18/fs/affs/inode.c 2.2.18aa1/fs/affs/inode.c --- 2.2.18/fs/affs/inode.c Sun Apr 2 21:07:49 2000 +++ 2.2.18aa1/fs/affs/inode.c Mon Dec 11 17:20:50 2000 @@ -146,7 +146,7 @@ block = AFFS_I2BSIZE(inode) - 24; else block = AFFS_I2BSIZE(inode); - inode->u.affs_i.i_lastblock = ((inode->i_size + block - 1) / block) - 1; + inode->u.affs_i.i_lastblock = (((u_long)inode->i_size + block - 1) / block) - 1; break; case ST_SOFTLINK: inode->i_mode |= S_IFLNK; diff -urN 2.2.18/fs/autofs/dir.c 2.2.18aa1/fs/autofs/dir.c --- 2.2.18/fs/autofs/dir.c Mon Jan 17 16:44:41 2000 +++ 2.2.18aa1/fs/autofs/dir.c Mon Dec 11 17:20:50 2000 @@ -20,12 +20,12 @@ switch((unsigned long) filp->f_pos) { case 0: - if (filldir(dirent, ".", 1, 0, inode->i_ino) < 0) + if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) return 0; filp->f_pos++; /* fall through */ case 1: - if (filldir(dirent, "..", 2, 1, AUTOFS_ROOT_INO) < 0) + if (filldir(dirent, "..", 2, 1, AUTOFS_ROOT_INO, DT_DIR) < 0) return 0; filp->f_pos++; /* fall through */ diff -urN 2.2.18/fs/autofs/root.c 2.2.18aa1/fs/autofs/root.c --- 2.2.18/fs/autofs/root.c Mon Jan 17 16:44:41 2000 +++ 2.2.18aa1/fs/autofs/root.c Mon Dec 11 17:20:50 2000 @@ -79,19 +79,19 @@ switch(nr) { case 0: - if (filldir(dirent, ".", 1, nr, inode->i_ino) < 0) + if (filldir(dirent, ".", 1, nr, inode->i_ino, DT_DIR) < 0) return 0; filp->f_pos = ++nr; /* fall through */ case 1: - if (filldir(dirent, "..", 2, nr, inode->i_ino) < 0) + if (filldir(dirent, "..", 2, nr, inode->i_ino, DT_DIR) < 0) return 0; filp->f_pos = ++nr; /* fall through */ default: while ( onr = nr, ent = autofs_hash_enum(dirhash,&nr,ent) ) { if ( !ent->dentry || ent->dentry->d_mounts != ent->dentry ) { - if (filldir(dirent,ent->name,ent->len,onr,ent->ino) < 0) + if (filldir(dirent,ent->name,ent->len,onr,ent->ino, DT_UNKNOWN) < 0) return 0; filp->f_pos = nr; } diff -urN 2.2.18/fs/binfmt_aout.c 2.2.18aa1/fs/binfmt_aout.c --- 
2.2.18/fs/binfmt_aout.c Sun Apr 2 21:07:49 2000 +++ 2.2.18aa1/fs/binfmt_aout.c Mon Dec 11 17:20:52 2000 @@ -62,9 +62,9 @@ static int dump_write(struct file *file, const void *addr, int nr) { int r; - down(&file->f_dentry->d_inode->i_sem); + fs_down(&file->f_dentry->d_inode->i_sem); r = file->f_op->write(file, addr, nr, &file->f_pos) == nr; - up(&file->f_dentry->d_inode->i_sem); + fs_up(&file->f_dentry->d_inode->i_sem); return r; } @@ -410,7 +410,11 @@ file = fget(fd); if (!file->f_op || !file->f_op->mmap || +#if 0 fd_offset & (bprm->dentry->d_inode->i_sb->s_blocksize-1)) { +#else /* LFS enforces PAGE_SIZE file offset granularity in mmap */ + fd_offset & ~PAGE_MASK) { +#endif if (warnings++<10) printk(KERN_NOTICE "fd_offset is not blocksize aligned. Loading %s in anonymous memory.\n", @@ -532,7 +536,11 @@ start_addr = ex.a_entry & 0xfffff000; +#if 0 if (N_TXTOFF(ex) & (inode->i_sb->s_blocksize-1)) { +#else /* LFS enforces PAGE_SIZE file offset granularity in mmap */ + if (N_TXTOFF(ex) & ~PAGE_MASK) { +#endif if (warnings++<10) printk(KERN_NOTICE "N_TXTOFF is not blocksize aligned. Loading library %s in anonymous memory.\n", diff -urN 2.2.18/fs/binfmt_elf.c 2.2.18aa1/fs/binfmt_elf.c --- 2.2.18/fs/binfmt_elf.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/binfmt_elf.c Mon Dec 11 17:20:52 2000 @@ -945,12 +945,12 @@ * These are the only things you should do on a core-file: use only these * functions to write out all the necessary info. */ -static int dump_write(struct file *file, const void *addr, int nr) +static int dump_write(struct file *file, const void *addr, size_t nr) { int r; - down(&file->f_dentry->d_inode->i_sem); + fs_down(&file->f_dentry->d_inode->i_sem); r = file->f_op->write(file, addr, nr, &file->f_pos) == nr; - up(&file->f_dentry->d_inode->i_sem); + fs_up(&file->f_dentry->d_inode->i_sem); return r; } diff -urN 2.2.18/fs/buffer.c 2.2.18aa1/fs/buffer.c --- 2.2.18/fs/buffer.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/buffer.c Mon Dec 11 17:20:54 2000 @@ -27,9 +27,7 @@ /* invalidate_buffers/set_blocksize/sync_dev race conditions and fs corruption fixes, 1999, Andrea Arcangeli */ -/* Wait for dirty buffers to sync in sync_page_buffers. - * 2000, Marcelo Tosatti - */ +/* async buffer flushing, 1999 Andrea Arcangeli */ #include #include @@ -43,6 +41,7 @@ #include #include #include +#include #include #include @@ -83,6 +82,7 @@ static int nr_buffers = 0; static int nr_buffers_type[NR_LIST] = {0,}; +static unsigned long size_buffers_type[NR_LIST]; static int nr_buffer_heads = 0; static int nr_unused_buffer_heads = 0; static int nr_hashed_buffers = 0; @@ -122,7 +122,7 @@ /* These are the min and max parameter values that we will allow to be assigned */ int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 1*HZ, 1, 1}; -int bdflush_max[N_PARAM] = {100,5000, 2000, 2000,60*HZ, 600*HZ, 600*HZ, 2047, 5}; +int bdflush_max[N_PARAM] = {100,5000, 2000, 2000,INT_MAX, 600*HZ, 600*HZ, 2047, 5}; void wakeup_bdflush(int); @@ -143,13 +143,13 @@ bh->b_count++; wait.task = tsk; add_wait_queue(&bh->b_wait, &wait); -repeat: - tsk->state = TASK_UNINTERRUPTIBLE; - run_task_queue(&tq_disk); - if (buffer_locked(bh)) { + do { + set_current_state(TASK_UNINTERRUPTIBLE); + run_task_queue(&tq_disk); + if (!buffer_locked(bh)) + break; schedule(); - goto repeat; - } + } while (buffer_locked(bh)); tsk->state = TASK_RUNNING; remove_wait_queue(&bh->b_wait, &wait); bh->b_count--; @@ -165,7 +165,7 @@ * We will ultimately want to put these in a separate list, but for * now we search all of the lists for dirty buffers. 
 */
-static int sync_buffers(kdev_t dev, int wait)
+int sync_buffers(kdev_t dev, int wait)
 {
 	int i, retry, pass = 0, err = 0;
 	struct buffer_head * bh, *next;
@@ -359,9 +359,9 @@
 	goto out_putf;
 	/* We need to protect against concurrent writers.. */
-	down(&inode->i_sem);
+	fs_down(&inode->i_sem);
 	err = file->f_op->fsync(file, dentry);
-	up(&inode->i_sem);
+	fs_up(&inode->i_sem);
 out_putf:
 	fput(file);
@@ -396,9 +396,9 @@
 	goto out_putf;
 	/* this needs further work, at the moment it is identical to fsync() */
-	down(&inode->i_sem);
+	fs_down(&inode->i_sem);
 	err = file->f_op->fsync(file, dentry);
-	up(&inode->i_sem);
+	fs_up(&inode->i_sem);
 out_putf:
 	fput(file);
@@ -474,6 +474,7 @@
 		return;
 	}
 	nr_buffers_type[bh->b_list]--;
+	size_buffers_type[bh->b_list] -= bh->b_size;
 	remove_from_hash_queue(bh);
 	remove_from_lru_list(bh);
 }
@@ -481,7 +482,12 @@
 static void put_last_free(struct buffer_head * bh)
 {
 	if (bh) {
-		struct buffer_head **bhp = &free_list[BUFSIZE_INDEX(bh->b_size)];
+		struct buffer_head **bhp;
+
+		if (bh->b_rm_fn)
+			bh->b_rm_fn(bh);
+
+		bhp = &free_list[BUFSIZE_INDEX(bh->b_size)];
 		bh->b_count = 0;
 		bh->b_state = 0;
@@ -523,6 +529,7 @@
 	(*bhp)->b_prev_free = bh;
 	nr_buffers_type[bh->b_list]++;
+	size_buffers_type[bh->b_list] += bh->b_size;
 	/* Put the buffer in new hash-queue if it has a device. */
 	bh->b_next = NULL;
@@ -571,8 +578,10 @@
 {
 	struct buffer_head * bh;
 	bh = find_buffer(dev,block,size);
-	if (bh)
+	if (bh) {
 		bh->b_count++;
+		touch_buffer(bh);
+	}
 	return bh;
 }
@@ -732,6 +741,8 @@
 	bh->b_blocknr = block;
 	bh->b_end_io = handler;
 	bh->b_dev_id = dev_id;
+	bh->b_pdata = NULL;
+	bh->b_rm_fn = NULL;
 }
 
 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
@@ -816,6 +827,46 @@
 	insert_into_queues(bh);
 }
 
+/* -1 -> no need to flush
+    0 -> async flush
+    1 -> sync flush (wait for I/O completion) */
+static int balance_dirty_state(kdev_t dev)
+{
+	unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
+
+	dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
+	tot = (buffermem >> PAGE_SHIFT) + nr_free_pages;
+	tot -= size_buffers_type[BUF_PROTECTED] >> PAGE_SHIFT;
+
+	dirty *= 200;
+	soft_dirty_limit = tot * bdf_prm.b_un.nfract;
+	hard_dirty_limit = soft_dirty_limit * 2;
+
+	if (dirty > soft_dirty_limit)
+	{
+		if (dirty > hard_dirty_limit)
+			return 1;
+		return 0;
+	}
+	return -1;
+}
+
+/*
+ * if a new dirty buffer is created we need to balance bdflush.
+ *
+ * in the future we might want to make bdflush aware of different
+ * pressures on different devices - thus the (currently unused)
+ * 'dev' parameter.
+ */
+void balance_dirty(kdev_t dev)
+{
+	int state = balance_dirty_state(dev);
+
+	if (state < 0)
+		return;
+	wakeup_bdflush(state);
+}
+
 /*
  * A buffer may need to be moved from one buffer list to another
  * (e.g. in case it is not shared any more). Handle this.
@@ -828,7 +879,9 @@
 		printk("Attempt to refile free buffer\n");
 		return;
 	}
-	if (buffer_dirty(buf))
+	if (buffer_protected(buf))
+		dispose = BUF_PROTECTED;
+	else if (buffer_dirty(buf))
 		dispose = BUF_DIRTY;
 	else if (buffer_locked(buf))
 		dispose = BUF_LOCKED;
@@ -837,13 +890,7 @@
 	if(dispose != buf->b_list) {
 		file_buffer(buf, dispose);
 		if(dispose == BUF_DIRTY) {
-			int too_many = (nr_buffers * bdf_prm.b_un.nfract/100);
-
-			/* This buffer is dirty, maybe we need to start flushing.
-			 * If too high a percentage of the buffers are dirty...
-			 */
-			if (nr_buffers_type[BUF_DIRTY] > too_many)
-				wakeup_bdflush(1);
+			balance_dirty(buf->b_dev);
 
 			/* If this is a loop device, and
 			 * more than half of the buffers are dirty...
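 			 * (For scale: with the stock bdflush nfract of 40,
 			 *  the dirty*200 > tot*nfract test in
 			 *  balance_dirty_state() above kicks off async
 			 *  flushing once dirty buffers pass roughly 20%
 			 *  of cacheable memory, and throttles writers
 			 *  synchronously past twice that, i.e. 40%.)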
@@ -864,7 +911,6 @@ /* If dirty, mark the time this buffer should be written back. */ set_writetime(buf, 0); refile_buffer(buf); - touch_buffer(buf); if (buf->b_count) { buf->b_count--; @@ -978,6 +1024,9 @@ */ static void put_unused_buffer_head(struct buffer_head * bh) { + if (bh->b_rm_fn != NULL) + bh->b_rm_fn(bh); + if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) { nr_buffer_heads--; kmem_cache_free(bh_cachep, bh); @@ -1109,6 +1158,8 @@ bh->b_data = (char *) (page+offset); bh->b_list = 0; + bh->b_pdata = NULL; + bh->b_rm_fn = NULL; } return head; /* @@ -1168,7 +1219,7 @@ #endif } if (test_and_clear_bit(PG_swap_unlock_after, &page->flags)) - swap_after_unlock_page(page->offset); + swap_after_unlock_page(pgoff2ulong(page->index)); if (test_and_clear_bit(PG_free_after, &page->flags)) __free_page(page); } @@ -1261,6 +1312,223 @@ return; } + +/* + * For brw_kiovec: submit a set of buffer_head temporary IOs and wait + * for them to complete. Clean up the buffer_heads afterwards. + */ + +#define dprintk(x...) + +static int do_kio(int rw, int nr, struct buffer_head *bh[], int size) +{ + int iosize; + int i; + int err; + struct buffer_head *tmp; + + dprintk ("do_kio start\n"); + + ll_rw_block(rw, nr, bh); + iosize = err = 0; + + for (i = nr; --i >= 0; ) { + tmp = bh[i]; + wait_on_buffer(tmp); + if (!buffer_uptodate(tmp)) { + err = -EIO; + /* We are waiting on bh'es in reverse order so + clearing iosize on error calculates the + amount of IO before the first error. */ + iosize = 0; + } + + put_unused_buffer_head(tmp); + iosize += size; + } + wake_up(&buffer_wait); + + dprintk ("do_kio end %d %d\n", iosize, err); + + if (iosize) + return iosize; + else + return err; + } + +/* + * Clean up the bounce buffers potentially used by brw_kiovec. All of + * the kiovec's bounce buffers must be cleared of temporarily allocated + * bounce pages, but only READ pages for which IO completed successfully + * can actually be transferred back to user space. + */ + +void cleanup_bounce_buffers(int rw, int nr, struct kiobuf *iovec[], + int transferred) +{ + int i; + for (i = 0; i < nr; i++) { + struct kiobuf *iobuf = iovec[i]; + if (iobuf->bounced) { + if (transferred > 0 && !(rw & WRITE)) + kiobuf_copy_bounce(iobuf, COPY_FROM_BOUNCE, + transferred); + + clear_kiobuf_bounce_pages(iobuf); + } + transferred -= iobuf->length; + } +} + +/* + * Start I/O on a physical range of kernel memory, defined by a vector + * of kiobuf structs (much like a user-space iovec list). + * + * IO is performed synchronously here: do_kio() above waits for each + * batch of buffer_heads to complete before the next one is submitted. + * + * It is up to the caller to make sure that there are enough blocks + * passed in to completely map the iobufs to disk. + */ + +int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, unsigned long b[], int size) +{ + int err; + int length; + int transferred; + int i; + int bufind; + int pageind; + int bhind; + int offset; + unsigned long blocknr; + struct kiobuf * iobuf = NULL; + unsigned long page; + unsigned long bounce; + struct page * map; + struct buffer_head *tmp, *bh[KIO_MAX_SECTORS]; + + /* + * First, do some alignment and validity checks + */ + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + if ((iobuf->offset & (size-1)) || + (iobuf->length & (size-1))) + return -EINVAL; + if (!iobuf->nr_pages) + panic("brw_kiovec: iobuf not initialised"); + } + + /* DEBUG */ +#if 0 + return iobuf->length; +#endif + dprintk ("brw_kiovec: start\n"); + + /* + * OK to walk down the iovec doing page IO on each page we find.
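+ *
+ * A minimal call might look like this (hypothetical sketch, assuming
+ * the iobuf has already been set up and b[] filled with one block
+ * number per `size' bytes, e.g. via the filesystem's bmap):
+ *
+ *	unsigned long b[KIO_MAX_SECTORS];
+ *	int err = brw_kiovec(READ, 1, &iobuf, dev, b, size);
+ *	if (err < 0)
+ *		return err;	(negative errno on total failure)
+ *	(a positive return is the number of bytes transferred)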
+ */ + bufind = bhind = transferred = err = 0; + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + err = setup_kiobuf_bounce_pages(iobuf, GFP_USER); + if (err) + goto finished; + if (rw & WRITE) + kiobuf_copy_bounce(iobuf, COPY_TO_BOUNCE, -1); + + offset = iobuf->offset; + length = iobuf->length; + dprintk ("iobuf %d %d %d\n", offset, length, size); + + for (pageind = 0; pageind < iobuf->nr_pages; pageind++) { + map = iobuf->maplist[pageind]; + bounce = iobuf->bouncelist[pageind]; + + if (bounce) + page = bounce; + else + page = iobuf->pagelist[pageind]; + + while (length > 0) { + blocknr = b[bufind++]; + tmp = get_unused_buffer_head(0); + if (!tmp) { + err = -ENOMEM; + goto error; + } + + tmp->b_dev = B_FREE; + tmp->b_size = size; + tmp->b_data = (char *) (page + offset); + tmp->b_this_page = tmp; + + init_buffer(tmp, dev, blocknr, + end_buffer_io_sync, NULL); + if (rw == WRITE) { + set_bit(BH_Uptodate, &tmp->b_state); + set_bit(BH_Dirty, &tmp->b_state); + } + + dprintk ("buffer %d (%d) at %p\n", + bhind, tmp->b_blocknr, tmp->b_data); + bh[bhind++] = tmp; + length -= size; + offset += size; + + /* + * Start the IO if we have got too much or if + * this is the end of the last iobuf + */ + if (bhind >= KIO_MAX_SECTORS) { + err = do_kio(rw, bhind, bh, size); + if (err >= 0) + transferred += err; + else + goto finished; + bhind = 0; + } + + if (offset >= PAGE_SIZE) { + offset = 0; + break; + } + } /* End of block loop */ + } /* End of page loop */ + } /* End of iovec loop */ + + /* Is there any IO still left to submit? */ + if (bhind) { + err = do_kio(rw, bhind, bh, size); + if (err >= 0) + transferred += err; + else + goto finished; + } + + finished: + dprintk ("brw_kiovec: end (%d, %d)\n", transferred, err); + + cleanup_bounce_buffers(rw, nr, iovec, transferred); + + if (transferred) + return transferred; + return err; + + error: + /* We got an error allocating the bh'es. Just free the current + buffer_heads and exit. */ + for (i = 0; i < bhind; i++) + put_unused_buffer_head(bh[i]); + wake_up(&buffer_wait); + + clear_kiobuf_bounce_pages(iobuf); + + goto finished; +} + /* * Start I/O on a page. * This function expects the page to be locked and may return before I/O is complete. */ @@ -1395,15 +1663,46 @@ set_bit(PG_locked, &page->flags); set_bit(PG_free_after, &page->flags); + /* Blocks within a page */ i = PAGE_SIZE >> inode->i_sb->s_blocksize_bits; - block = page->offset >> inode->i_sb->s_blocksize_bits; - p = nr; - do { - *p = inode->i_op->bmap(inode, block); - i--; - block++; - p++; - } while (i > 0); + + block = pgoff2ulong(page->index); + /* Scaled already by PAGE_SHIFT, and said shift should + be the same or larger than that of any filesystem in + this system -- that is, at i386 with 4k pages one + can't use 8k (primitive) blocks at the filesystems... */ + + if (i > 0) { + /* Filesystem blocksize is same, or smaller than CPU + page size, we can easily process this.. */ + + if (i > 1) + block *= i; + /* Scale by FS blocks per page, presuming FS-blocks are smaller + than the processor page... */ + + p = nr; + do { + *p = inode->i_op->bmap(inode, block); + i--; + block++; + p++; + } while (i > 0); + } else { + /* Filesystem blocksize is larger than CPU page size, + but if the underlying storage system block size is + smaller than CPU page size, all is well, else we + are in deep trouble -- for direct paging in at least.. */ + /* Nobody needs such monstrous fs block sizes ? + Well, it is the only way to get files in terabyte + range.. Nobody needs them ? You are in for a surprise.. 
+ However EXT2 (at least) needs access to internal + blocks and there it needs allocations of 8k/16k (or + whatever the block size is) for internal uses.. + Fixing this function alone isn't enough, although + perhaps fairly trivial.. */ + /* FIXME: WRITE THE CODE HERE !!! */ + } /* IO start */ brw_page(READ, page, inode->i_dev, nr, inode->i_sb->s_blocksize, 1); @@ -1457,6 +1756,7 @@ } tmp->b_this_page = bh; free_list[isize] = bh; + mem_map[MAP_NR(page)].flags = 0; mem_map[MAP_NR(page)].buffers = bh; buffermem += PAGE_SIZE; return 1; @@ -1468,33 +1768,34 @@ #define BUFFER_BUSY_BITS ((1<b_count || ((bh)->b_state & BUFFER_BUSY_BITS)) -static int sync_page_buffers(struct page * page, int wait) +static void sync_page_buffers(struct page * page) { - struct buffer_head * bh = page->buffers; - struct buffer_head * tmp = bh; + struct buffer_head * tmp, * bh = page->buffers; + /* + * Here we'll probably sleep and so we must make sure that + * the page doesn't go away from under us. We also prefer any + * concurrent try_to_free_buffers() not to work in any way on + * our current page from under us since we're just working on it. + * As always in 2.2.x we're serialized by the big kernel lock + * during those hacky page-visibility manipulations. + * + * SUBTLE NOTE: for things like LVM snapshotting WRITEA will block too! + */ page->buffers = NULL; + tmp = bh; do { struct buffer_head *p = tmp; tmp = tmp->b_this_page; - if (buffer_locked(p)) { - if (wait) - __wait_on_buffer(p); - } else if (buffer_dirty(p)) - ll_rw_block(WRITE, 1, &p); - } while (tmp != bh); - page->buffers = bh; - - do { - struct buffer_head *p = tmp; - tmp = tmp->b_this_page; - if (buffer_busy(p)) - return 1; + if (buffer_dirty(p)) + if (test_and_set_bit(BH_Wait_IO, &p->b_state)) + ll_rw_block(WRITE, 1, &p); } while (tmp != bh); - return 0; + /* Restore the visibility of the page before returning. */ + page->buffers = bh; } /* @@ -1504,10 +1805,9 @@ * Wake up bdflush() if this fails - if we're running low on memory due * to dirty buffers, we need to flush them out as quickly as possible. */ -int try_to_free_buffers(struct page * page_map, int wait) +int try_to_free_buffers(struct page * page_map, int gfp_mask) { struct buffer_head * tmp, * bh = page_map->buffers; - int too_many; tmp = bh; do { @@ -1516,8 +1816,6 @@ tmp = tmp->b_this_page; } while (tmp != bh); - succeed: - tmp = bh; do { struct buffer_head * p = tmp; tmp = tmp->b_this_page; @@ -1536,25 +1834,12 @@ return 1; busy: - too_many = (nr_buffers * bdf_prm.b_un.nfract/100); + if (gfp_mask & __GFP_IO) + sync_page_buffers(page_map); - if (!sync_page_buffers(page_map, wait)) { - - /* If a high percentage of the buffers are dirty, - * wake kflushd - */ - if (nr_buffers_type[BUF_DIRTY] > too_many) - wakeup_bdflush(0); - - /* - * We can jump after the busy check because - * we rely on the kernel lock. 
- */ - goto succeed; - } - - if(nr_buffers_type[BUF_DIRTY] > too_many) + if (balance_dirty_state(NODEV) >= 0) wakeup_bdflush(0); + return 0; } @@ -1566,7 +1851,7 @@ int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0; int protected = 0; int nlist; - static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY"}; + static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY","PROTECTED",}; printk("Buffer memory: %8ldkB\n",buffermem>>10); printk("Buffer heads: %6d\n",nr_buffer_heads); @@ -1590,7 +1875,7 @@ used++, lastused = found; bh = bh->b_next_free; } while (bh != lru_list[nlist]); - printk("%8s: %d buffers, %d used (last=%d), " + printk("%9s: %d buffers, %d used (last=%d), " "%d locked, %d protected, %d dirty\n", buf_types[nlist], found, used, lastused, locked, protected, dirty); @@ -1657,6 +1942,7 @@ bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC); if (!bh) break; + memset(bh, 0, sizeof(*bh)); put_unused_buffer_head(bh); nr_buffer_heads++; } @@ -1762,7 +2048,6 @@ if (ncount) printk("sync_old_buffers: %d dirty buffers not on dirty list\n", ncount); printk("Wrote %d/%d buffers\n", nwritten, ndirty); #endif - run_task_queue(&tq_disk); return 0; } @@ -1935,7 +2220,8 @@ /* If there are still a lot of dirty buffers around, skip the sleep and flush some more */ - if(ndirty == 0 || nr_buffers_type[BUF_DIRTY] <= nr_buffers * bdf_prm.b_un.nfract/100) { + if (!ndirty || balance_dirty_state(NODEV) < 0) + { spin_lock_irq(&current->sigmask_lock); flush_signals(current); spin_unlock_irq(&current->sigmask_lock); @@ -1959,13 +2245,18 @@ tsk->session = 1; tsk->pgrp = 1; strcpy(tsk->comm, "kupdate"); + + /* sigstop and sigcont will stop and wakeup kupdate */ + spin_lock_irq(&tsk->sigmask_lock); sigfillset(&tsk->blocked); - /* sigcont will wakeup kupdate after setting interval to 0 */ sigdelset(&tsk->blocked, SIGCONT); + sigdelset(&tsk->blocked, SIGSTOP); + spin_unlock_irq(&tsk->sigmask_lock); lock_kernel(); for (;;) { + /* update interval */ interval = bdf_prm.b_un.interval; if (interval) { @@ -1974,8 +2265,24 @@ } else { + stop_kupdate: tsk->state = TASK_STOPPED; schedule(); /* wait for SIGCONT */ + } + /* check for sigstop */ + if (signal_pending(tsk)) + { + int stopped = 0; + spin_lock_irq(&tsk->sigmask_lock); + if (sigismember(&tsk->signal, SIGSTOP)) + { + sigdelset(&tsk->signal, SIGSTOP); + stopped = 1; + } + recalc_sigpending(tsk); + spin_unlock_irq(&tsk->sigmask_lock); + if (stopped) + goto stop_kupdate; } #ifdef DEBUG printk("kupdate() activated...\n"); diff -urN 2.2.18/fs/coda/dir.c 2.2.18aa1/fs/coda/dir.c --- 2.2.18/fs/coda/dir.c Tue Sep 5 02:28:47 2000 +++ 2.2.18aa1/fs/coda/dir.c Mon Dec 11 17:20:50 2000 @@ -749,7 +749,7 @@ char *name = vdirent->d_name; errfill = filldir(getdent, name, namlen, - offs, ino); + offs, ino, DT_UNKNOWN); CDEBUG(D_FILE, "entry %d: ino %ld, namlen %d, reclen %d, type %d, pos %d, string_offs %d, name %*s, offset %d, result: %d, errfill: %d.\n", i,vdirent->d_fileno, vdirent->d_namlen, vdirent->d_reclen, vdirent->d_type, pos, string_offset, vdirent->d_namlen, vdirent->d_name, (u_int) offs, result, errfill); /* errfill means no space for filling in this round */ if ( errfill < 0 ) { diff -urN 2.2.18/fs/coda/file.c 2.2.18aa1/fs/coda/file.c --- 2.2.18/fs/coda/file.c Tue Sep 5 02:28:47 2000 +++ 2.2.18aa1/fs/coda/file.c Mon Dec 11 17:20:52 2000 @@ -99,7 +99,7 @@ &cont_file, &cont_dentry); CDEBUG(D_INODE, "coda ino: %ld, cached ino %ld, page offset: %lx\n", - coda_inode->i_ino, cii->c_ovp->i_ino, page->offset); + coda_inode->i_ino, cii->c_ovp->i_ino, pgoff2ulong(page->index));
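	/*
	 * (Sketch, not from the original patch: throughout this patch the
	 * page cache switches from byte offsets to page indexes, so where
	 * 2.2 code read page->offset, the equivalent byte offset is now
	 * recovered as, in effect,
	 *
	 *	loff_t off = (loff_t) pgoff2ulong(page->index) << PAGE_SHIFT;
	 *
	 * assuming pgoff2ulong() merely unwraps the pgoff type.)
	 */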
generic_readpage(&cont_file, page); EXIT; @@ -190,10 +190,10 @@ return -1; } - down(&cont_inode->i_sem); + fs_down(&cont_inode->i_sem); result = cont_file.f_op->write(&cont_file , buff, count, &(cont_file.f_pos)); - up(&cont_inode->i_sem); + fs_up(&cont_inode->i_sem); coda_restore_codafile(coda_inode, coda_file, cont_inode, &cont_file); if (result) @@ -228,14 +228,14 @@ coda_prepare_openfile(coda_inode, coda_file, cont_inode, &cont_file, &cont_dentry); - down(&cont_inode->i_sem); + fs_down(&cont_inode->i_sem); result = file_fsync(&cont_file ,&cont_dentry); if ( result == 0 ) { result = venus_fsync(coda_inode->i_sb, &(cnp->c_fid)); } - up(&cont_inode->i_sem); + fs_up(&cont_inode->i_sem); coda_restore_codafile(coda_inode, coda_file, cont_inode, &cont_file); return result; diff -urN 2.2.18/fs/dcache.c 2.2.18aa1/fs/dcache.c --- 2.2.18/fs/dcache.c Tue Jun 13 03:48:14 2000 +++ 2.2.18aa1/fs/dcache.c Mon Dec 11 17:20:52 2000 @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -253,10 +254,15 @@ if (tmp == &dentry_unused) break; - dentry_stat.nr_unused--; list_del(tmp); - INIT_LIST_HEAD(tmp); dentry = list_entry(tmp, struct dentry, d_lru); + if (dentry->d_flags & DCACHE_REFERENCED) { + dentry->d_flags &= ~DCACHE_REFERENCED; + list_add(&dentry->d_lru, &dentry_unused); + continue; + } + dentry_stat.nr_unused--; + INIT_LIST_HEAD(tmp); if (!dentry->d_count) { i_nr -= prune_one_dentry(dentry); if (!i_nr) @@ -475,9 +481,9 @@ */ void shrink_dcache_memory(int priority, unsigned int gfp_mask) { - if (gfp_mask & __GFP_IO) { + if (gfp_mask & __GFP_IO && !current->fs_locks) { int count = 0; - if (priority) + if (priority > 1) count = dentry_stat.nr_unused / priority; prune_dcache(count, -1); } @@ -598,6 +604,7 @@ if (memcmp(dentry->d_name.name, str, len)) continue; } + dentry->d_flags |= DCACHE_REFERENCED; return dget(dentry); } return NULL; @@ -927,7 +934,11 @@ if (!dentry_cache) panic("Cannot create dentry cache"); +#ifndef CONFIG_BIGMEM memory_size = num_physpages << PAGE_SHIFT; +#else + memory_size = bigmem_mapnr << PAGE_SHIFT; +#endif memory_size >>= 13; memory_size *= 2 * sizeof(void *); for (order = 0; ((1UL << order) << PAGE_SHIFT) < memory_size; order++); diff -urN 2.2.18/fs/devpts/root.c 2.2.18aa1/fs/devpts/root.c --- 2.2.18/fs/devpts/root.c Mon Jan 17 16:44:41 2000 +++ 2.2.18aa1/fs/devpts/root.c Mon Dec 11 17:20:50 2000 @@ -86,12 +86,12 @@ switch(nr) { case 0: - if (filldir(dirent, ".", 1, nr, inode->i_ino) < 0) + if (filldir(dirent, ".", 1, nr, inode->i_ino, DT_DIR) < 0) return 0; filp->f_pos = ++nr; /* fall through */ case 1: - if (filldir(dirent, "..", 2, nr, inode->i_ino) < 0) + if (filldir(dirent, "..", 2, nr, inode->i_ino, DT_DIR) < 0) return 0; filp->f_pos = ++nr; /* fall through */ @@ -100,7 +100,7 @@ int ptynr = nr - 2; if ( sbi->inodes[ptynr] ) { genptsname(numbuf, ptynr); - if ( filldir(dirent, numbuf, strlen(numbuf), nr, nr) < 0 ) + if ( filldir(dirent, numbuf, strlen(numbuf), nr, nr, DT_CHR) < 0 ) return 0; } filp->f_pos = ++nr; diff -urN 2.2.18/fs/dquot.c 2.2.18aa1/fs/dquot.c --- 2.2.18/fs/dquot.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/dquot.c Mon Dec 11 17:20:50 2000 @@ -570,7 +570,7 @@ */ if (prune_dcache(0, 128)) { - free_inode_memory(10); + free_inode_memory(); goto repeat; } @@ -1535,7 +1535,7 @@ if (!S_ISREG(inode->i_mode)) goto out_f; error = -EINVAL; - if (inode->i_size == 0 || (inode->i_size % sizeof(struct dqblk)) != 0) + if (inode->i_size == 0 || ((off_t)inode->i_size % sizeof(struct dqblk)) != 0) goto out_f; dquot_drop(inode); /* We don't want 
quota on quota files */ diff -urN 2.2.18/fs/efs/dir.c 2.2.18aa1/fs/efs/dir.c --- 2.2.18/fs/efs/dir.c Mon Jan 17 16:44:41 2000 +++ 2.2.18aa1/fs/efs/dir.c Mon Dec 11 17:20:50 2000 @@ -107,7 +107,7 @@ filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot; /* copy filename and data in dirslot */ - filldir(dirent, nameptr, namelen, filp->f_pos, inodenum); + filldir(dirent, nameptr, namelen, filp->f_pos, inodenum, DT_UNKNOWN); /* sanity check */ if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) { diff -urN 2.2.18/fs/ext2/dir.c 2.2.18aa1/fs/ext2/dir.c --- 2.2.18/fs/ext2/dir.c Thu May 4 13:00:39 2000 +++ 2.2.18aa1/fs/ext2/dir.c Mon Dec 11 17:20:50 2000 @@ -32,6 +32,10 @@ return -EISDIR; } +static unsigned char ext2_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + static int ext2_readdir(struct file *, void *, filldir_t); static struct file_operations ext2_dir_operations = { @@ -201,10 +205,14 @@ * the descriptor. */ unsigned long version = filp->f_version; + unsigned char d_type = DT_UNKNOWN; + if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE) + && de->file_type < EXT2_FT_MAX) + d_type = ext2_filetype_table[de->file_type]; error = filldir(dirent, de->name, de->name_len, - filp->f_pos, le32_to_cpu(de->inode)); + filp->f_pos, le32_to_cpu(de->inode), d_type); if (error) break; if (version != filp->f_version) diff -urN 2.2.18/fs/ext2/file.c 2.2.18aa1/fs/ext2/file.c --- 2.2.18/fs/ext2/file.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/ext2/file.c Mon Dec 11 17:20:50 2000 @@ -39,10 +39,6 @@ static long long ext2_file_lseek(struct file *, long long, int); static ssize_t ext2_file_write (struct file *, const char *, size_t, loff_t *); static int ext2_release_file (struct inode *, struct file *); -#if BITS_PER_LONG < 64 -static int ext2_open_file (struct inode *, struct file *); - -#else #define EXT2_MAX_SIZE(bits) \ (((EXT2_NDIR_BLOCKS + (1LL << (bits - 2)) + \ @@ -55,8 +51,6 @@ EXT2_MAX_SIZE(10), EXT2_MAX_SIZE(11), EXT2_MAX_SIZE(12), EXT2_MAX_SIZE(13) }; -#endif - /* * We have mostly NULL's here: the current defaults are ok for * the ext2 filesystem. @@ -69,11 +63,7 @@ NULL, /* poll - default */ ext2_ioctl, /* ioctl */ generic_file_mmap, /* mmap */ -#if BITS_PER_LONG == 64 NULL, /* no special open is needed */ -#else - ext2_open_file, -#endif NULL, /* flush */ ext2_release_file, /* release */ ext2_sync_file, /* fsync */ @@ -120,14 +110,9 @@ case 1: offset += file->f_pos; } -#if BITS_PER_LONG < 64 - if (offset >> 31) - return -EINVAL; -#else if (offset < 0 || offset > ext2_max_sizes[EXT2_BLOCK_SIZE_BITS(inode->i_sb)]) return -EINVAL; -#endif if (offset != file->f_pos) { file->f_pos = offset; file->f_reada = 0; @@ -155,10 +140,10 @@ size_t count, loff_t *ppos) { struct inode * inode = filp->f_dentry->d_inode; - off_t pos; + loff_t pos; long block; int offset; - int written, c; + size_t written, c; struct buffer_head * bh, *bufferlist[NBUF]; struct super_block * sb; int err; @@ -170,7 +155,7 @@ return 0; /* This makes the bounds-checking arithmetic later on much more * sane. */ - if (((signed) count) < 0) + if (((ssize_t) count) < 0) return -EINVAL; write_error = buffercount = 0; @@ -202,24 +187,18 @@ /* Check for overflow.. */ -#if BITS_PER_LONG < 64 - /* If the fd's pos is already greater than or equal to the file - * descriptor's offset maximum, then we need to return EFBIG for - * any non-zero count (and we already tested for zero above). 
*/ - if (((unsigned) pos) >= 0x7FFFFFFFUL) - return -EFBIG; - - /* If we are about to overflow the maximum file size, we also - * need to return the error, but only if no bytes can be written - * successfully. */ - if (((unsigned) pos + count) > 0x7FFFFFFFUL) { - count = 0x7FFFFFFFL - pos; - if (((signed) count) < 0) + /* L-F-S spec 2.2.1.27: */ + if (!(filp->f_flags & O_LARGEFILE)) { + if (pos >= 0x7fffffffULL) /* pos@2G forbidden */ return -EFBIG; + + if (pos + count > 0x7fffffffULL) + /* Write only until end of allowed region */ + count = 0x7fffffffULL - pos; } -#else + { - off_t max = ext2_max_sizes[EXT2_BLOCK_SIZE_BITS(sb)]; + loff_t max = ext2_max_sizes[EXT2_BLOCK_SIZE_BITS(sb)]; if (pos >= max) return -EFBIG; @@ -239,20 +218,18 @@ mark_buffer_dirty(sb->u.ext2_sb.s_sbh, 1); } } -#endif /* From SUS: We must generate a SIGXFSZ for file size overflow * only if no bytes were actually written to the file. --sct */ limit = current->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit < RLIM_INFINITY) { - if (((unsigned) pos+count) >= limit) { - count = limit - pos; - if (((signed) count) <= 0) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } + if (limit != RLIM_INFINITY) { + if (pos >= limit) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; } + if (pos+count > limit) + count = limit - pos; } /* @@ -382,15 +359,3 @@ return 0; } -#if BITS_PER_LONG < 64 -/* - * Called when an inode is about to be open. - * We use this to disallow opening RW large files on 32bit systems. - */ -static int ext2_open_file (struct inode * inode, struct file * filp) -{ - if (inode->u.ext2_i.i_high_size && (filp->f_mode & FMODE_WRITE)) - return -EFBIG; - return 0; -} -#endif diff -urN 2.2.18/fs/ext2/inode.c 2.2.18aa1/fs/ext2/inode.c --- 2.2.18/fs/ext2/inode.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/ext2/inode.c Mon Dec 11 17:20:50 2000 @@ -537,15 +537,8 @@ inode->u.ext2_i.i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); else { inode->u.ext2_i.i_dir_acl = 0; - inode->u.ext2_i.i_high_size = - le32_to_cpu(raw_inode->i_size_high); -#if BITS_PER_LONG < 64 - if (raw_inode->i_size_high) - inode->i_size = (__u32)-1; -#else - inode->i_size |= ((__u64)le32_to_cpu(raw_inode->i_size_high)) - << 32; -#endif + inode->i_size = ((__u64)(inode->i_size & 0xFFFFFFFFUL)) | + (((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32); } inode->u.ext2_i.i_block_group = block_group; inode->u.ext2_i.i_next_alloc_block = 0; @@ -667,12 +660,7 @@ if (S_ISDIR(inode->i_mode)) raw_inode->i_dir_acl = cpu_to_le32(inode->u.ext2_i.i_dir_acl); else { -#if BITS_PER_LONG < 64 - raw_inode->i_size_high = - cpu_to_le32(inode->u.ext2_i.i_high_size); -#else raw_inode->i_size_high = cpu_to_le32(inode->i_size >> 32); -#endif } if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) raw_inode->i_block[0] = cpu_to_le32(kdev_t_to_nr(inode->i_rdev)); @@ -724,22 +712,19 @@ } if (iattr->ia_valid & ATTR_SIZE) { - off_t size = iattr->ia_size; + loff_t size = iattr->ia_size; unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur; if (size < 0) return -EINVAL; -#if BITS_PER_LONG == 64 if (size > ext2_max_sizes[EXT2_BLOCK_SIZE_BITS(inode->i_sb)]) return -EFBIG; -#endif - if (limit < RLIM_INFINITY && size > limit) { + if (limit != RLIM_INFINITY && size > limit) { send_sig(SIGXFSZ, current, 0); return -EFBIG; } -#if BITS_PER_LONG == 64 - if (size >> 33) { + if (size >> 31) { struct super_block *sb = inode->i_sb; struct ext2_super_block *es = sb->u.ext2_sb.s_es; if (!(es->s_feature_ro_compat & @@ -751,7 +736,6 @@ mark_buffer_dirty(sb->u.ext2_sb.s_sbh, 1); } } -#endif } retval = 
inode_change_ok(inode, iattr); diff -urN 2.2.18/fs/ext2/truncate.c 2.2.18aa1/fs/ext2/truncate.c --- 2.2.18/fs/ext2/truncate.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/ext2/truncate.c Mon Dec 11 17:20:50 2000 @@ -53,9 +53,10 @@ * Currently we always hold the inode semaphore during truncate, so * there's no need to test for changes during the operation. */ -#define DIRECT_BLOCK(inode) \ - ((inode->i_size + inode->i_sb->s_blocksize - 1) / \ - inode->i_sb->s_blocksize) +#define DIRECT_BLOCK(inode) \ + ((long) \ + ((inode->i_size + inode->i_sb->s_blocksize - 1) >> \ + inode->i_sb->s_blocksize_bits)) #define INDIRECT_BLOCK(inode,offset) ((int)DIRECT_BLOCK(inode) - offset) #define DINDIRECT_BLOCK(inode,offset) \ (INDIRECT_BLOCK(inode,offset) / addr_per_block) diff -urN 2.2.18/fs/fat/dir.c 2.2.18aa1/fs/fat/dir.c --- 2.2.18/fs/fat/dir.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/fat/dir.c Mon Dec 11 17:20:50 2000 @@ -315,7 +315,7 @@ /* Fake . and .. for the root directory. */ if (inode->i_ino == MSDOS_ROOT_INO) { while (cpos < 2) { - if (filldir(dirent, "..", cpos+1, cpos, MSDOS_ROOT_INO) < 0) + if (filldir(dirent, "..", cpos+1, cpos, MSDOS_ROOT_INO, DT_DIR) < 0) return 0; cpos++; filp->f_pos++; @@ -458,7 +458,8 @@ if (!long_slots||shortnames) { if (both) bufname[i] = '\0'; - if (filldir(dirent, bufname, i, *furrfu, inum) < 0) + if (filldir(dirent, bufname, i, *furrfu, inum, + (de->attr & ATTR_DIR) ? DT_DIR : DT_REG) < 0) goto FillFailed; } else { char longname[275]; @@ -469,7 +470,8 @@ memcpy(&longname[long_len+1], bufname, i); long_len += i; } - if (filldir(dirent, longname, long_len, *furrfu, inum) < 0) + if (filldir(dirent, longname, long_len, *furrfu, inum, + (de->attr & ATTR_DIR) ? DT_DIR : DT_REG) < 0) goto FillFailed; } @@ -499,7 +501,8 @@ const char * name, int name_len, off_t offset, - ino_t ino) + ino_t ino, + unsigned int d_type) { struct dirent *d1 = (struct dirent *)buf; struct dirent *d2 = d1 + 1; diff -urN 2.2.18/fs/fat/file.c 2.2.18aa1/fs/fat/file.c --- 2.2.18/fs/fat/file.c Mon Jan 17 16:44:42 2000 +++ 2.2.18aa1/fs/fat/file.c Mon Dec 11 17:20:50 2000 @@ -227,7 +227,7 @@ Each time we process one block in bhlist, we replace it by a new prefetch block if needed. */ - PRINTK (("#### ino %ld pos %ld size %ld count %d\n",inode->i_ino,*ppos,inode->i_size,count)); + PRINTK (("#### ino %ld pos %ld size %ld count %d\n",inode->i_ino,*ppos,(u_long)inode->i_size,count)); { /* We must prefetch complete block, so we must @@ -253,7 +253,7 @@ } pre.nolist = 0; PRINTK (("count %d ahead %d nblist %d\n",count,read_ahead[MAJOR(inode->i_dev)],pre.nblist)); - while ((left_in_file = inode->i_size - *ppos) > 0 + while ((left_in_file = (u_long)inode->i_size - *ppos) > 0 && buf < end){ struct buffer_head *bh = pre.bhlist[pre.nolist]; char *data; @@ -451,7 +451,7 @@ void fat_truncate(struct inode *inode) { - int cluster; + int cluster_bytes, cluster_shift; /* Why no return value? Surely the disk could fail... 
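 * (The cluster arithmetic below replaces a division with a shift:
 * with, say, cluster_size == 4 sectors, cluster_bytes is 2048 and
 * fslog2(2048) presumably yields 11, so the old
 * (i_size + 2047) / 2048 becomes (i_size + 2047) >> 11 -- which,
 * unlike the division, stays cheap now that i_size is a 64-bit
 * loff_t. Sample figures are the editor's illustration only.)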
*/ if (IS_IMMUTABLE(inode)) @@ -460,8 +460,10 @@ printk("FAT: fat_truncate called though fs is read-only, uhh...\n"); return /* -EROFS */; } - cluster = SECTOR_SIZE*MSDOS_SB(inode->i_sb)->cluster_size; - (void) fat_free(inode,(inode->i_size+(cluster-1))/cluster); + cluster_bytes = SECTOR_SIZE * MSDOS_SB(inode->i_sb)->cluster_size; + cluster_shift = fslog2(cluster_bytes); + (void) fat_free(inode, + (inode->i_size+(cluster_bytes-1)) >> cluster_shift); MSDOS_I(inode)->i_attrs |= ATTR_ARCH; mark_inode_dirty(inode); } diff -urN 2.2.18/fs/fat/inode.c 2.2.18aa1/fs/fat/inode.c --- 2.2.18/fs/fat/inode.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/fat/inode.c Mon Dec 11 17:20:50 2000 @@ -399,8 +399,9 @@ sizeof(struct msdos_dir_entry); } inode->i_blksize = MSDOS_SB(sb)->cluster_size* SECTOR_SIZE; - inode->i_blocks = (inode->i_size+inode->i_blksize-1)/ - inode->i_blksize*MSDOS_SB(sb)->cluster_size; + inode->i_blocks = (((inode->i_size+inode->i_blksize-1) >> + fslog2(inode->i_blksize)) * + MSDOS_SB(sb)->cluster_size); MSDOS_I(inode)->i_logstart = 0; MSDOS_I(inode)->i_attrs = 0; @@ -830,8 +831,9 @@ MSDOS_I(inode)->i_attrs = de->attr & ATTR_UNUSED; /* this is as close to the truth as we can get ... */ inode->i_blksize = MSDOS_SB(sb)->cluster_size*SECTOR_SIZE; - inode->i_blocks = (inode->i_size+inode->i_blksize-1)/ - inode->i_blksize*MSDOS_SB(sb)->cluster_size; + inode->i_blocks = (((inode->i_size+inode->i_blksize-1) >> + fslog2(inode->i_blksize)) * + MSDOS_SB(sb)->cluster_size); inode->i_mtime = inode->i_atime = date_dos2unix(CF_LE_W(de->time),CF_LE_W(de->date)); inode->i_ctime = diff -urN 2.2.18/fs/fcntl.c 2.2.18aa1/fs/fcntl.c --- 2.2.18/fs/fcntl.c Mon Jan 17 16:44:43 2000 +++ 2.2.18aa1/fs/fcntl.c Mon Dec 11 17:20:50 2000 @@ -8,6 +8,8 @@ #include #include +#include +#include #include extern int sock_fcntl (struct file *, unsigned int cmd, unsigned long arg); @@ -143,17 +145,11 @@ return 0; } -asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - struct file * filp; - long err = -EBADF; - - lock_kernel(); - filp = fget(fd); - if (!filp) - goto out; +static long do_fcntl(unsigned int fd, unsigned int cmd, + unsigned long arg, struct file * filp) +{ + long err = 0; - err = 0; switch (cmd) { case F_DUPFD: err = dupfd(fd, arg); @@ -217,13 +213,74 @@ err = sock_fcntl (filp, cmd, arg); break; } + + return err; +} + +asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg) +{ + struct file * filp; + long err = -EBADF; + + lock_kernel(); + filp = fget(fd); + if (!filp) + goto out; + + err = do_fcntl(fd, cmd, arg, filp); + fput(filp); out: unlock_kernel(); return err; } -static void send_sigio(struct fown_struct *fown, struct fasync_struct *fa) +#if BITS_PER_LONG == 32 +asmlinkage long sys_fcntl64(unsigned int fd, unsigned int cmd, unsigned long arg) +{ + struct file * filp; + long err = -EBADF; + + lock_kernel(); + filp = fget(fd); + if (!filp) + goto out; + + switch (cmd) { + case F_GETLK64: + err = fcntl_getlk64(fd, (struct flock64 *) arg); + break; + case F_SETLK64: + err = fcntl_setlk64(fd, cmd, (struct flock64 *) arg); + break; + case F_SETLKW64: + err = fcntl_setlk64(fd, cmd, (struct flock64 *) arg); + break; + default: + err = do_fcntl(fd, cmd, arg, filp); + break; + } + + fput(filp); +out: + unlock_kernel(); + return err; +} +#endif + +/* Table to convert sigio signal codes into poll band bitmaps */ + +static int band_table[NSIGPOLL] = { + POLLIN | POLLRDNORM, /* POLL_IN */ + POLLOUT | POLLWRNORM | POLLWRBAND, /* POLL_OUT */ + POLLIN | POLLRDNORM | 
POLLMSG, /* POLL_MSG */ + POLLERR, /* POLL_ERR */ + POLLPRI | POLLRDBAND, /* POLL_PRI */ + POLLHUP | POLLERR /* POLL_HUP */ +}; + +static void send_sigio(struct fown_struct *fown, struct fasync_struct *fa, + int reason) { struct task_struct * p; int pid = fown->pid; @@ -252,9 +309,12 @@ back to SIGIO in that case. --sct */ si.si_signo = fown->signum; si.si_errno = 0; - si.si_code = SI_SIGIO; - si.si_pid = pid; - si.si_uid = uid; + si.si_code = reason; + if (reason - POLL_IN >= NSIGPOLL || + reason <= 0) + panic("send_sigio got `reason' != POLL_*"); + else + si.si_band = band_table[reason - POLL_IN]; si.si_fd = fa->fa_fd; if (!send_sig_info(fown->signum, &si, p)) break; @@ -266,7 +326,7 @@ read_unlock(&tasklist_lock); } -void kill_fasync(struct fasync_struct *fa, int sig) +void kill_fasync(struct fasync_struct *fa, int sig, int band) { while (fa) { struct fown_struct * fown; @@ -276,8 +336,11 @@ return; } fown = &fa->fa_file->f_owner; - if (fown->pid) - send_sigio(fown, fa); + /* Don't send SIGURG to processes which have not set a + queued signum: SIGURG has its own default signalling + mechanism. */ + if (fown->pid && !(sig == SIGURG && fown->signum == 0)) + send_sigio(fown, fa, band); fa = fa->fa_next; } } diff -urN 2.2.18/fs/hfs/dir_cap.c 2.2.18aa1/fs/hfs/dir_cap.c --- 2.2.18/fs/hfs/dir_cap.c Mon Jan 17 16:44:42 2000 +++ 2.2.18aa1/fs/hfs/dir_cap.c Mon Dec 11 17:20:50 2000 @@ -243,7 +243,7 @@ if (filp->f_pos == 0) { /* Entry 0 is for "." */ - if (filldir(dirent, DOT->Name, DOT_LEN, 0, dir->i_ino)) { + if (filldir(dirent, DOT->Name, DOT_LEN, 0, dir->i_ino, DT_DIR)) { return 0; } filp->f_pos = 1; @@ -260,7 +260,7 @@ } if (filldir(dirent, DOT_DOT->Name, - DOT_DOT_LEN, 1, ntohl(cnid))) { + DOT_DOT_LEN, 1, ntohl(cnid), DT_DIR)) { return 0; } filp->f_pos = 2; @@ -287,7 +287,7 @@ len = hfs_namein(dir, tmp_name, &((struct hfs_cat_key *)brec.key)->CName); if (filldir(dirent, tmp_name, len, - filp->f_pos, ino)) { + filp->f_pos, ino, DT_UNKNOWN)) { hfs_cat_close(entry, &brec); return 0; } @@ -303,7 +303,7 @@ /* In root dir last-2 entry is for ".rootinfo" */ if (filldir(dirent, DOT_ROOTINFO->Name, DOT_ROOTINFO_LEN, filp->f_pos, - ntohl(entry->cnid) | HFS_CAP_FNDR)) { + ntohl(entry->cnid) | HFS_CAP_FNDR, DT_UNKNOWN)) { return 0; } } @@ -315,7 +315,7 @@ /* In normal dirs last-1 entry is for ".finderinfo" */ if (filldir(dirent, DOT_FINDERINFO->Name, DOT_FINDERINFO_LEN, filp->f_pos, - ntohl(entry->cnid) | HFS_CAP_FDIR)) { + ntohl(entry->cnid) | HFS_CAP_FDIR, DT_UNKNOWN)) { return 0; } } @@ -327,7 +327,7 @@ /* In normal dirs last entry is for ".resource" */ if (filldir(dirent, DOT_RESOURCE->Name, DOT_RESOURCE_LEN, filp->f_pos, - ntohl(entry->cnid) | HFS_CAP_RDIR)) { + ntohl(entry->cnid) | HFS_CAP_RDIR, DT_UNKNOWN)) { return 0; } } diff -urN 2.2.18/fs/hfs/dir_dbl.c 2.2.18aa1/fs/hfs/dir_dbl.c --- 2.2.18/fs/hfs/dir_dbl.c Mon Jan 17 16:44:42 2000 +++ 2.2.18aa1/fs/hfs/dir_dbl.c Mon Dec 11 17:20:50 2000 @@ -206,7 +206,7 @@ if (filp->f_pos == 0) { /* Entry 0 is for "." */ - if (filldir(dirent, DOT->Name, DOT_LEN, 0, dir->i_ino)) { + if (filldir(dirent, DOT->Name, DOT_LEN, 0, dir->i_ino, DT_DIR)) { return 0; } filp->f_pos = 1; @@ -215,7 +215,7 @@ if (filp->f_pos == 1) { /* Entry 1 is for ".." 
*/ if (filldir(dirent, DOT_DOT->Name, DOT_DOT_LEN, 1, - hfs_get_hl(entry->key.ParID))) { + hfs_get_hl(entry->key.ParID), DT_DIR)) { return 0; } filp->f_pos = 2; @@ -252,7 +252,7 @@ &((struct hfs_cat_key *)brec.key)->CName); } - if (filldir(dirent, tmp_name, len, filp->f_pos, ino)) { + if (filldir(dirent, tmp_name, len, filp->f_pos, ino, DT_UNKNOWN)) { hfs_cat_close(entry, &brec); return 0; } @@ -266,7 +266,7 @@ /* In root dir last entry is for "%RootInfo" */ if (filldir(dirent, PCNT_ROOTINFO->Name, PCNT_ROOTINFO_LEN, filp->f_pos, - ntohl(entry->cnid) | HFS_DBL_HDR)) { + ntohl(entry->cnid) | HFS_DBL_HDR, DT_UNKNOWN)) { return 0; } } diff -urN 2.2.18/fs/hfs/dir_nat.c 2.2.18aa1/fs/hfs/dir_nat.c --- 2.2.18/fs/hfs/dir_nat.c Mon Jan 17 16:44:42 2000 +++ 2.2.18aa1/fs/hfs/dir_nat.c Mon Dec 11 17:20:50 2000 @@ -231,7 +231,7 @@ if (filp->f_pos == 0) { /* Entry 0 is for "." */ - if (filldir(dirent, DOT->Name, DOT_LEN, 0, dir->i_ino)) { + if (filldir(dirent, DOT->Name, DOT_LEN, 0, dir->i_ino, DT_DIR)) { return 0; } filp->f_pos = 1; @@ -248,7 +248,7 @@ } if (filldir(dirent, DOT_DOT->Name, - DOT_DOT_LEN, 1, ntohl(cnid))) { + DOT_DOT_LEN, 1, ntohl(cnid), DT_DIR)) { return 0; } filp->f_pos = 2; @@ -275,7 +275,7 @@ len = hfs_namein(dir, tmp_name, &((struct hfs_cat_key *)brec.key)->CName); if (filldir(dirent, tmp_name, len, - filp->f_pos, ino)) { + filp->f_pos, ino, DT_UNKNOWN)) { hfs_cat_close(entry, &brec); return 0; } @@ -290,14 +290,14 @@ /* In normal dirs entry 2 is for ".AppleDouble" */ if (filldir(dirent, DOT_APPLEDOUBLE->Name, DOT_APPLEDOUBLE_LEN, filp->f_pos, - ntohl(entry->cnid) | HFS_NAT_HDIR)) { + ntohl(entry->cnid) | HFS_NAT_HDIR, DT_UNKNOWN)) { return 0; } } else if (type == HFS_NAT_HDIR) { /* In .AppleDouble entry 2 is for ".Parent" */ if (filldir(dirent, DOT_PARENT->Name, DOT_PARENT_LEN, filp->f_pos, - ntohl(entry->cnid) | HFS_NAT_HDR)) { + ntohl(entry->cnid) | HFS_NAT_HDR, DT_UNKNOWN)) { return 0; } } @@ -310,7 +310,7 @@ (type == HFS_NAT_HDIR)) { if (filldir(dirent, ROOTINFO->Name, ROOTINFO_LEN, filp->f_pos, - ntohl(entry->cnid) | HFS_NAT_HDR)) { + ntohl(entry->cnid) | HFS_NAT_HDR, DT_UNKNOWN)) { return 0; } } diff -urN 2.2.18/fs/hpfs/hpfs_fs.c 2.2.18aa1/fs/hpfs/hpfs_fs.c --- 2.2.18/fs/hpfs/hpfs_fs.c Mon Jan 17 16:44:42 2000 +++ 2.2.18aa1/fs/hpfs/hpfs_fs.c Mon Dec 11 17:20:50 2000 @@ -1376,13 +1376,13 @@ break; case 0: - if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino) < 0) + if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0) break; filp->f_pos = -1; /* fall through */ case -1: - if (filldir(dirent, "..", 2, filp->f_pos, inode->i_hpfs_parent_dir) < 0) + if (filldir(dirent, "..", 2, filp->f_pos, inode->i_hpfs_parent_dir, DT_DIR) < 0) break; filp->f_pos = 1; /* fall through */ @@ -1402,7 +1402,7 @@ else ino = file_ino(de->fnode); brelse4(&qbh); - if (filldir(dirent, tempname, namelen, old_pos, ino) < 0) { + if (filldir(dirent, tempname, namelen, old_pos, ino, DT_UNKNOWN) < 0) { filp->f_pos = old_pos; break; } diff -urN 2.2.18/fs/inode.c 2.2.18aa1/fs/inode.c --- 2.2.18/fs/inode.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/inode.c Mon Dec 11 17:20:46 2000 @@ -11,6 +11,8 @@ #include #include #include +#include +#include /* * New inode.c implementation. @@ -29,9 +31,8 @@ * Inode lookup is no longer as critical as it used to be: * most of the lookups are going to be through the dcache. 
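 * (The dynamically sized table set up in inode_init() below scales
 * with RAM; roughly, with 4k pages: 64MB gives 8192 hash entries,
 * 128MB gives 16384, 512MB gives 32768, and 2GB or more hits the
 * 65536-entry cap. These sample figures were worked out by hand from
 * the code below, so treat them as illustrative only.)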
*/ -#define HASH_BITS 8 -#define HASH_SIZE (1UL << HASH_BITS) -#define HASH_MASK (HASH_SIZE-1) +#define HASH_BITS i_hash_bits +#define HASH_MASK i_hash_mask /* * Each inode can be on two separate lists. One is @@ -47,7 +48,9 @@ LIST_HEAD(inode_in_use); static LIST_HEAD(inode_unused); -static struct list_head inode_hashtable[HASH_SIZE]; +static unsigned int i_hash_bits; +static unsigned int i_hash_mask; +static struct list_head *inode_hashtable; __u32 inode_generation_count = 0; @@ -435,7 +438,7 @@ * This is the externally visible routine for * inode memory management. */ -void free_inode_memory(int goal) +void free_inode_memory(void) { spin_lock(&inode_lock); free_inodes(); @@ -680,7 +683,7 @@ static inline unsigned long hash(struct super_block *sb, unsigned long i_ino) { - unsigned long tmp = i_ino | (unsigned long) sb; + unsigned long tmp = i_ino + (unsigned long) sb / (sizeof(struct super_block) & ~(sizeof(struct super_block) - 1)); tmp = tmp + (tmp >> HASH_BITS) + (tmp >> HASH_BITS*2); return tmp & HASH_MASK; } @@ -835,29 +838,84 @@ return 0; } -/* - * Initialize the hash tables and default - * value for max inodes - */ -#define MAX_INODE (16384) - void __init inode_init(void) { - int i, max; - struct list_head *head = inode_hashtable; + int i, order; + struct list_head *d; + unsigned long nr_hash, hash_size, tmp; + +#ifndef CONFIG_BIGMEM + nr_hash = num_physpages; +#else + nr_hash = bigmem_mapnr; +#endif + nr_hash <<= PAGE_SHIFT; + nr_hash >>= 13; + + /* scale logarithmically over 32768 inodes */ + if (nr_hash > 16384) { + if (nr_hash > 32768) + nr_hash >>= 1; + else + nr_hash = 16384; + } + if (nr_hash > 32768) { + if (nr_hash > 65536) + nr_hash >>= 1; + else + nr_hash = 32768; + } + + /* This limit triggers with more than 1G of RAM */ + if (nr_hash > 65536) + nr_hash = 65536; + + max_inodes = nr_hash; + + hash_size = nr_hash * sizeof(struct list_head); + + if (hash_size < PAGE_SIZE) { + /* Embedded systems */ + inode_hashtable = kmalloc(hash_size, GFP_ATOMIC); + + i_hash_mask = (nr_hash - 1); - i = HASH_SIZE; + tmp = nr_hash; + i_hash_bits = 0; + while((tmp >>= 1UL) != 0UL) + i_hash_bits++; + } else { + for (order = 0; ((1UL << order) << PAGE_SHIFT) < hash_size; + order++); + + do { + hash_size = 1UL << (order+PAGE_SHIFT); + nr_hash = hash_size / sizeof(struct list_head); + + i_hash_mask = (nr_hash - 1); + + tmp = nr_hash; + i_hash_bits = 0; + while((tmp >>= 1UL) != 0UL) + i_hash_bits++; + + inode_hashtable = (struct list_head *) __get_free_pages(GFP_ATOMIC, order); + } while(inode_hashtable == NULL && --order >= 0); + } + + printk("Inode hash table entries: %lu (%ldk), inode-max: %d\n", + nr_hash, hash_size >> 10, max_inodes); + + if (!inode_hashtable) + panic("Failed to allocate inode hash table\n"); + + d = inode_hashtable; + i = nr_hash; do { - INIT_LIST_HEAD(head); - head++; + INIT_LIST_HEAD(d); + d++; i--; } while (i); - - /* Initial guess at reasonable inode number */ - max = num_physpages >> 1; - if (max > MAX_INODE) - max = MAX_INODE; - max_inodes = max; /* Get a random number. */ get_random_bytes (&inode_generation_count, diff -urN 2.2.18/fs/iobuf.c 2.2.18aa1/fs/iobuf.c --- 2.2.18/fs/iobuf.c Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/fs/iobuf.c Mon Dec 11 17:20:48 2000 @@ -0,0 +1,236 @@ +/* + * iobuf.c + * + * Keep track of the general-purpose IO-buffer structures used to track + * abstract kernel-space io buffers.
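+ *
+ * A rough usage sketch (hypothetical -- it assumes the caller
+ * populates nr_pages, pagelist[]/maplist[], offset and length by
+ * hand, since no mapping helper appears in this file):
+ *
+ *	struct kiobuf *iobuf;
+ *
+ *	if (alloc_kiovec(1, &iobuf))
+ *		return -ENOMEM;
+ *	(fill in iobuf->nr_pages, pagelist[], maplist[],
+ *	 offset and length here)
+ *	err = brw_kiovec(READ, 1, &iobuf, dev, blocks, blocksize);
+ *	free_kiovec(1, &iobuf);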
+ * + */ + +#include +#include +#include +#include + +static kmem_cache_t *kiobuf_cachep; + +void __init kiobuf_init(void) +{ + kiobuf_cachep = kmem_cache_create("kiobuf", + sizeof(struct kiobuf), + 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if(!kiobuf_cachep) + panic("Cannot create kernel iobuf cache\n"); +} + + +int alloc_kiovec(int nr, struct kiobuf **bufp) +{ + int i; + struct kiobuf *iobuf; + + for (i = 0; i < nr; i++) { + iobuf = kmem_cache_alloc(kiobuf_cachep, SLAB_KERNEL); + if (!iobuf) { + free_kiovec(i, bufp); + return -ENOMEM; + } + + memset(iobuf, 0, sizeof(*iobuf)); + iobuf->array_len = KIO_STATIC_PAGES; + iobuf->pagelist = iobuf->page_array; + iobuf->maplist = iobuf->map_array; + iobuf->bouncelist = iobuf->bounce_array; + *bufp++ = iobuf; + } + + return 0; +} + +void clear_kiobuf_bounce_pages(struct kiobuf *iobuf) +{ + int i; + + if (!iobuf->bounced) + return; + + for (i = 0; i < iobuf->nr_pages; i++) { + unsigned long page = iobuf->bouncelist[i]; + if (page) + free_page(page); + } + iobuf->bounced = 0; +} + +void free_kiovec(int nr, struct kiobuf **bufp) +{ + struct kiobuf *iobuf; + int i; + + for (i = 0; i < nr; i++) { + iobuf = bufp[i]; + clear_kiobuf_bounce_pages(iobuf); + if (iobuf->array_len > KIO_STATIC_PAGES) { + kfree (iobuf->pagelist); + } + kmem_cache_free(kiobuf_cachep, bufp[i]); + } +} + +int expand_kiobuf(struct kiobuf *iobuf, int wanted) +{ + unsigned long * pagelist, * bouncelist; + struct page ** maplist; + + if (iobuf->array_len >= wanted) + return 0; + + /* + * kmalloc enough space for the page, map and bounce lists all + * at once. + */ + pagelist = (unsigned long *) + kmalloc(3 * wanted * sizeof(unsigned long), GFP_KERNEL); + if (!pagelist) + return -ENOMEM; + + /* Did it grow while we waited? */ + if (iobuf->array_len >= wanted) { + kfree(pagelist); + return 0; + } + + maplist = (struct page **) (pagelist + wanted); + bouncelist = pagelist + 2 * wanted; + + memcpy (pagelist, iobuf->pagelist, + iobuf->array_len * sizeof(unsigned long)); + memcpy (maplist, iobuf->maplist, + iobuf->array_len * sizeof(struct page **)); + memcpy (bouncelist, iobuf->bouncelist, + iobuf->array_len * sizeof(unsigned long)); + + if (iobuf->array_len > KIO_STATIC_PAGES) + kfree (iobuf->pagelist); + + iobuf->pagelist = pagelist; + iobuf->maplist = maplist; + iobuf->bouncelist = bouncelist; + iobuf->array_len = wanted; + return 0; +} + + +/* + * Test whether a given page from the bounce buffer matches the given + * gfp_mask. Return true if a bounce buffer is required for this + * page. + */ + +static inline int test_bounce_page(unsigned long page, + struct page * map, + int gfp_mask) +{ + /* Unmapped pages from PCI memory or BIGMEM pages always need a + * bounce buffer unless the caller is prepared to accept + * GFP_BIGMEM pages. 
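+ *
+ * In tabular form (one reading of the checks below):
+ *
+ *	!map or PageBIGMEM(map)	-> bounce, unless __GFP_BIGMEM set
+ *	PageDMA(map)		-> never bounce
+ *	anything else		-> bounce only when __GFP_DMA asked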
*/ + + if (!map || PageBIGMEM(map) ) + /* Careful, the following must return the right value + * even if CONFIG_BIGMEM is not set */ + return !(gfp_mask & __GFP_BIGMEM); + + /* A DMA-able page never needs a bounce buffer */ + if (PageDMA(map)) + return 0; + + /* Otherwise it is a non-ISA-DMA-capable page and needs bounce + * buffers if GFP_DMA is requested */ + return gfp_mask & __GFP_DMA; +} + +int setup_kiobuf_bounce_pages(struct kiobuf *iobuf, int gfp_mask) +{ + int i; + + clear_kiobuf_bounce_pages(iobuf); + + for (i = 0; i < iobuf->nr_pages; i++) { + struct page *map = iobuf->maplist[i]; + unsigned long page = iobuf->pagelist[i]; + unsigned long bounce_page; + + if (!test_bounce_page(page, map, gfp_mask)) { + iobuf->bouncelist[i] = 0; + continue; + } + + bounce_page = __get_free_page(gfp_mask); + if (!bounce_page) + goto error; + + iobuf->bouncelist[i] = bounce_page; + iobuf->bounced = 1; + } + return 0; + + error: + clear_kiobuf_bounce_pages(iobuf); + return -ENOMEM; +} + +/* + * Copy a bounce buffer. For completion of partially-failed read IOs, + * we need to be able to place an upper limit on the data successfully + * transferred from bounce buffers to the user's own buffers. + */ + +void kiobuf_copy_bounce(struct kiobuf *iobuf, int direction, int max) +{ + int i; + int offset, length; + + if (!iobuf->bounced) + return; + + offset = iobuf->offset; + length = iobuf->length; + if (max >= 0 && length > max) + length = max; + + i = 0; + + if (offset > PAGE_SIZE) { + i = (offset >> PAGE_SHIFT); + offset &= ~PAGE_MASK; + } + + for (; i < iobuf->nr_pages && length > 0; i++) { + unsigned long page = iobuf->pagelist[i]; + unsigned long bounce_page = iobuf->bouncelist[i]; + unsigned long kin, kout; + int pagelen = length; + + if ((pagelen+offset) > PAGE_SIZE) + pagelen = PAGE_SIZE - offset; + + if (bounce_page) { + if (direction == COPY_TO_BOUNCE) { + kin = kmap(page, KM_READ); + kout = kmap(bounce_page, KM_WRITE); + } else { + kin = kmap(bounce_page, KM_READ); + kout = kmap(page, KM_WRITE); + } + + memcpy((char *) (kout+offset), + (char *) (kin+offset), + pagelen); + kunmap(kout, KM_WRITE); + kunmap(kin, KM_READ); + } + + length -= pagelen; + offset = 0; + } +} diff -urN 2.2.18/fs/isofs/dir.c 2.2.18aa1/fs/isofs/dir.c --- 2.2.18/fs/isofs/dir.c Tue Sep 5 02:28:47 2000 +++ 2.2.18aa1/fs/isofs/dir.c Mon Dec 11 17:20:50 2000 @@ -220,7 +220,7 @@ /* Handle the case of the '.' directory */ if (de->name_len[0] == 1 && de->name[0] == 0) { - if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino) < 0) + if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0) break; filp->f_pos += de_len; continue; @@ -231,7 +231,7 @@ /* Handle the case of the '..' 
directory */ if (de->name_len[0] == 1 && de->name[0] == 1) { inode_number = filp->f_dentry->d_parent->d_inode->i_ino; - if (filldir(dirent, "..", 2, filp->f_pos, inode_number) < 0) + if (filldir(dirent, "..", 2, filp->f_pos, inode_number, DT_DIR) < 0) break; filp->f_pos += de_len; continue; @@ -276,7 +276,7 @@ } } if (len > 0) { - if (filldir(dirent, p, len, filp->f_pos, inode_number) < 0) + if (filldir(dirent, p, len, filp->f_pos, inode_number, DT_UNKNOWN) < 0) break; } filp->f_pos += de_len; diff -urN 2.2.18/fs/isofs/inode.c 2.2.18aa1/fs/isofs/inode.c --- 2.2.18/fs/isofs/inode.c Tue Sep 5 02:28:47 2000 +++ 2.2.18aa1/fs/isofs/inode.c Mon Dec 11 17:20:50 2000 @@ -898,7 +898,8 @@ int isofs_bmap(struct inode * inode,int block) { - off_t b_off, offset, size; + loff_t b_off; + unsigned offset, size; struct inode *ino; unsigned int firstext; unsigned long nextino; @@ -909,7 +910,7 @@ return 0; } - b_off = block << ISOFS_BUFFER_BITS(inode); + b_off = (loff_t)block << ISOFS_BUFFER_BITS(inode); /* * If we are beyond the end of this file, don't give out any @@ -917,7 +918,7 @@ */ if( b_off >= inode->i_size ) { - off_t max_legal_read_offset; + loff_t max_legal_read_offset; /* * If we are *way* beyond the end of the file, print a message. @@ -928,20 +929,21 @@ * I/O errors. */ max_legal_read_offset = (inode->i_size + PAGE_SIZE - 1) - & ~(PAGE_SIZE - 1); + & ~(loff_t)(PAGE_SIZE - 1); if( b_off >= max_legal_read_offset ) { printk("_isofs_bmap: block>= EOF(%d, %ld)\n", block, - inode->i_size); + (u_long)((inode->i_size >> ISOFS_BUFFER_BITS(inode)) + + ((inode->i_size & ((1 << ISOFS_BUFFER_BITS(inode))-1)) != 0))); } return 0; } offset = 0; firstext = inode->u.isofs_i.i_first_extent; - size = inode->u.isofs_i.i_section_size; - nextino = inode->u.isofs_i.i_next_section_ino; + size = inode->u.isofs_i.i_section_size; + nextino = inode->u.isofs_i.i_next_section_ino; #ifdef DEBUG printk("first inode: inode=%x nextino=%x firstext=%u size=%lu\n", inode->i_ino, nextino, firstext, size); @@ -1180,7 +1182,7 @@ #ifdef DEBUG printk("Get inode %x: %d %d: %d\n",inode->i_ino, block, - ((int)pnt) & 0x3ff, inode->i_size); + ((int)pnt) & 0x3ff, (u_long)inode->i_size); #endif inode->i_mtime = inode->i_atime = inode->i_ctime = diff -urN 2.2.18/fs/lockd/clntlock.c 2.2.18aa1/fs/lockd/clntlock.c --- 2.2.18/fs/lockd/clntlock.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/lockd/clntlock.c Mon Dec 11 17:20:52 2000 @@ -138,7 +138,7 @@ void nlmclnt_recovery(struct nlm_host *host, u32 newstate) { - if (!host->h_reclaiming++) { + if (host->h_reclaiming++) { if (host->h_nsmstate == newstate) return; printk(KERN_WARNING diff -urN 2.2.18/fs/lockd/host.c 2.2.18aa1/fs/lockd/host.c --- 2.2.18/fs/lockd/host.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/lockd/host.c Mon Dec 11 17:20:52 2000 @@ -51,7 +51,8 @@ struct nlm_host * nlmsvc_lookup_host(struct svc_rqst *rqstp) { - return nlm_lookup_host(rqstp->rq_client, &rqstp->rq_addr, 0, 0); + return nlm_lookup_host(rqstp->rq_client, &rqstp->rq_addr, + rqstp->rq_prot, rqstp->rq_vers); } /* @@ -97,7 +98,9 @@ nlm_gc_hosts(); for (hp = &nlm_hosts[hash]; (host = *hp); hp = &host->h_next) { - if (host->h_version != version || host->h_proto != proto) + if (proto && host->h_proto != proto) + continue; + if (version && host->h_version != version) continue; if (nlm_match_host(host, clnt, sin)) { diff -urN 2.2.18/fs/lockd/mon.c 2.2.18aa1/fs/lockd/mon.c --- 2.2.18/fs/lockd/mon.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/lockd/mon.c Mon Dec 11 17:20:52 2000 @@ -47,7 +47,7 @@ args.addr = 
host->h_addr.sin_addr.s_addr; args.prog = NLM_PROGRAM; - args.vers = 1; + args.vers = host->h_version; args.proc = NLMPROC_NSM_NOTIFY; memset(res, 0, sizeof(*res)); diff -urN 2.2.18/fs/lockd/svc.c 2.2.18aa1/fs/lockd/svc.c --- 2.2.18/fs/lockd/svc.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/lockd/svc.c Mon Dec 11 17:20:52 2000 @@ -349,7 +349,7 @@ * Define NLM program and procedures */ static struct svc_version nlmsvc_version1 = { - 1, 16, nlmsvc_procedures, NULL + 1, 17, nlmsvc_procedures, NULL }; static struct svc_version nlmsvc_version3 = { 3, 24, nlmsvc_procedures, NULL diff -urN 2.2.18/fs/lockd/svc4proc.c 2.2.18aa1/fs/lockd/svc4proc.c --- 2.2.18/fs/lockd/svc4proc.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/lockd/svc4proc.c Mon Dec 11 17:20:52 2000 @@ -422,6 +422,8 @@ void *resp) { struct sockaddr_in saddr = rqstp->rq_addr; + int vers = rqstp->rq_vers; + int prot = rqstp->rq_prot; struct nlm_host *host; dprintk("lockd: SM_NOTIFY called\n"); @@ -438,7 +440,7 @@ * reclaim all locks we hold on this server. */ saddr.sin_addr.s_addr = argp->addr; - if ((host = nlm_lookup_host(NULL, &saddr, IPPROTO_UDP, 1)) != NULL) { + if ((host = nlmclnt_lookup_host(&saddr, prot, vers)) != NULL) { nlmclnt_recovery(host, argp->state); nlm_release_host(host); } @@ -551,7 +553,8 @@ PROC(cancel_res, cancelres, norep, res, void), PROC(unlock_res, unlockres, norep, res, void), PROC(granted_res, grantedres, norep, res, void), - PROC(none, void, void, void, void), + /* statd callback */ + PROC(sm_notify, reboot, void, reboot, void), PROC(none, void, void, void, void), PROC(none, void, void, void, void), PROC(none, void, void, void, void), @@ -560,6 +563,4 @@ PROC(nm_lock, lockargs, res, args, res), PROC(free_all, notify, void, args, void), - /* statd callback */ - PROC(sm_notify, reboot, void, reboot, void), }; diff -urN 2.2.18/fs/lockd/svclock.c 2.2.18aa1/fs/lockd/svclock.c --- 2.2.18/fs/lockd/svclock.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/lockd/svclock.c Mon Dec 11 17:20:50 2000 @@ -94,14 +94,18 @@ struct nlm_block **head, *block; struct file_lock *fl; - dprintk("lockd: nlmsvc_lookup_block f=%p pd=%d %ld-%ld ty=%d\n", - file, lock->fl.fl_pid, lock->fl.fl_start, - lock->fl.fl_end, lock->fl.fl_type); + dprintk("lockd: nlmsvc_lookup_block f=%p pd=%d %Ld-%Ld ty=%d\n", + file, lock->fl.fl_pid, + (long long)lock->fl.fl_start, + (long long)lock->fl.fl_end, + lock->fl.fl_type); for (head = &nlm_blocked; (block = *head); head = &block->b_next) { fl = &block->b_call.a_args.lock.fl; - dprintk("lockd: check f=%p pd=%d %ld-%ld ty=%d cookie=%x\n", - block->b_file, fl->fl_pid, fl->fl_start, - fl->fl_end, fl->fl_type, + dprintk("lockd: check f=%p pd=%d %Ld-%Ld ty=%d cookie=%x\n", + block->b_file, fl->fl_pid, + (long long)fl->fl_start, + (long long)fl->fl_end, + fl->fl_type, *(u32 *)(&block->b_call.a_args.cookie.data)); if (block->b_file == file && nlm_compare_locks(fl, &lock->fl)) { if (remove) @@ -287,12 +291,12 @@ struct inode *inode = file->f_file.f_dentry->d_inode; int error; - dprintk("lockd: nlmsvc_lock(%04x/%ld, ty=%d, pi=%d, %ld-%ld, bl=%d)\n", + dprintk("lockd: nlmsvc_lock(%04x/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n", file->f_file.f_dentry->d_inode->i_dev, file->f_file.f_dentry->d_inode->i_ino, lock->fl.fl_type, lock->fl.fl_pid, - lock->fl.fl_start, - lock->fl.fl_end, + (long long)lock->fl.fl_start, + (long long)lock->fl.fl_end, wait); /* Checking for read only file system */ @@ -365,16 +369,18 @@ { struct file_lock *fl; - dprintk("lockd: nlmsvc_testlock(%04x/%ld, ty=%d, %ld-%ld)\n", + dprintk("lockd: 
nlmsvc_testlock(%04x/%ld, ty=%d, %Ld-%Ld)\n", file->f_file.f_dentry->d_inode->i_dev, file->f_file.f_dentry->d_inode->i_ino, lock->fl.fl_type, - lock->fl.fl_start, - lock->fl.fl_end); + (long long)lock->fl.fl_start, + (long long)lock->fl.fl_end); if ((fl = posix_test_lock(&file->f_file, &lock->fl)) != NULL) { - dprintk("lockd: conflicting lock(ty=%d, %ld-%ld)\n", - fl->fl_type, fl->fl_start, fl->fl_end ); + dprintk("lockd: conflicting lock(ty=%d, %Ld-%Ld)\n", + fl->fl_type, + (long long)fl->fl_start, + (long long)fl->fl_end); conflock->caller = "somehost"; /* FIXME */ conflock->oh.len = 0; /* don't return OH info */ @@ -397,12 +403,12 @@ { int error; - dprintk("lockd: nlmsvc_unlock(%04x/%ld, pi=%d, %ld-%ld)\n", + dprintk("lockd: nlmsvc_unlock(%04x/%ld, pi=%d, %Ld-%Ld)\n", file->f_file.f_dentry->d_inode->i_dev, file->f_file.f_dentry->d_inode->i_ino, lock->fl.fl_pid, - lock->fl.fl_start, - lock->fl.fl_end); + (long long)lock->fl.fl_start, + (long long)lock->fl.fl_end); /* First, cancel any lock that might be there */ nlmsvc_cancel_blocked(file, lock); @@ -425,12 +431,12 @@ { struct nlm_block *block; - dprintk("lockd: nlmsvc_cancel(%04x/%ld, pi=%d, %ld-%ld)\n", + dprintk("lockd: nlmsvc_cancel(%04x/%ld, pi=%d, %Ld-%Ld)\n", file->f_file.f_dentry->d_inode->i_dev, file->f_file.f_dentry->d_inode->i_ino, lock->fl.fl_pid, - lock->fl.fl_start, - lock->fl.fl_end); + (long long)lock->fl.fl_start, + (long long)lock->fl.fl_end); down(&file->f_sema); if ((block = nlmsvc_lookup_block(file, lock, 1)) != NULL) diff -urN 2.2.18/fs/lockd/svcproc.c 2.2.18aa1/fs/lockd/svcproc.c --- 2.2.18/fs/lockd/svcproc.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/lockd/svcproc.c Mon Dec 11 17:20:52 2000 @@ -435,6 +435,8 @@ void *resp) { struct sockaddr_in saddr = rqstp->rq_addr; + int vers = rqstp->rq_vers; + int prot = rqstp->rq_prot; struct nlm_host *host; dprintk("lockd: SM_NOTIFY called\n"); @@ -450,8 +452,8 @@ /* Obtain the host pointer for this NFS server and try to * reclaim all locks we hold on this server. 
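 * (Byte-order note: the htonl() below suggests the v1/v3 XDR decoder
 * hands argp->addr over in host order, while the NLM4 handler in
 * svc4proc.c above stores it unchanged; this reading is inferred
 * from the visible diff rather than stated by the patch itself.)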
*/ - saddr.sin_addr.s_addr = argp->addr; - if ((host = nlm_lookup_host(NULL, &saddr, IPPROTO_UDP, 1)) != NULL) { + saddr.sin_addr.s_addr = htonl(argp->addr); + if ((host = nlmclnt_lookup_host(&saddr, prot, vers)) != NULL) { nlmclnt_recovery(host, argp->state); nlm_release_host(host); } @@ -585,7 +587,8 @@ PROC(cancel_res, cancelres, norep, res, void), PROC(unlock_res, unlockres, norep, res, void), PROC(granted_res, grantedres, norep, res, void), - PROC(none, void, void, void, void), + /* statd callback */ + PROC(sm_notify, reboot, void, reboot, void), PROC(none, void, void, void, void), PROC(none, void, void, void, void), PROC(none, void, void, void, void), @@ -594,6 +597,4 @@ PROC(nm_lock, lockargs, res, args, res), PROC(free_all, notify, void, args, void), - /* statd callback */ - PROC(sm_notify, reboot, void, reboot, void), }; diff -urN 2.2.18/fs/lockd/svcsubs.c 2.2.18aa1/fs/lockd/svcsubs.c --- 2.2.18/fs/lockd/svcsubs.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/lockd/svcsubs.c Mon Dec 11 17:20:50 2000 @@ -169,7 +169,7 @@ lock.fl_type = F_UNLCK; lock.fl_start = 0; - lock.fl_end = NLM_OFFSET_MAX; + lock.fl_end = OFFSET_MAX; if (posix_lock_file(&file->f_file, &lock, 0) < 0) { printk("lockd: unlock failure in %s:%d\n", __FILE__, __LINE__); diff -urN 2.2.18/fs/lockd/xdr.c 2.2.18aa1/fs/lockd/xdr.c --- 2.2.18/fs/lockd/xdr.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/lockd/xdr.c Mon Dec 11 17:20:50 2000 @@ -24,7 +24,6 @@ #define NLMDBG_FACILITY NLMDBG_XDR #define NLM_MAXSTRLEN 1024 -#define OFFSET_MAX LONG_MAX #define QUADLEN(len) (((len) + 3) >> 2) @@ -37,6 +36,25 @@ static void nlm_register_stats(void); static void nlm_unregister_stats(void); +static inline loff_t +s32_to_loff_t(__s32 offset) +{ + return (loff_t)offset; +} + +static inline __s32 +loff_t_to_s32(loff_t offset) +{ + __s32 res; + if (offset >= NLM_OFFSET_MAX) + res = NLM_OFFSET_MAX; + else if (offset <= -NLM_OFFSET_MAX) + res = -NLM_OFFSET_MAX; + else + res = offset; + return res; +} + /* * Initialization of NFS status variables */ @@ -157,7 +175,7 @@ nlm_decode_lock(u32 *p, struct nlm_lock *lock) { struct file_lock *fl = &lock->fl; - int len; + s32 start, len, end; if (!(p = xdr_decode_string(p, &lock->caller, &len, NLM_MAXSTRLEN)) || !(p = nlm_decode_fh(p, &lock->fh)) @@ -169,10 +187,16 @@ fl->fl_pid = ntohl(*p++); fl->fl_flags = FL_POSIX; fl->fl_type = F_RDLCK; /* as good as anything else */ - fl->fl_start = ntohl(*p++); + start = ntohl(*p++); len = ntohl(*p++); - if (len == 0 || (fl->fl_end = fl->fl_start + len - 1) < 0) + end = start + len - 1; + + fl->fl_start = s32_to_loff_t(start); + + if (len == 0 || end < 0) fl->fl_end = OFFSET_MAX; + else + fl->fl_end = s32_to_loff_t(end); return p; } @@ -183,6 +207,7 @@ nlm_encode_lock(u32 *p, struct nlm_lock *lock) { struct file_lock *fl = &lock->fl; + __s32 start, len; if (!(p = xdr_encode_string(p, lock->caller, -1)) || !(p = nlm_encode_fh(p, &lock->fh)) @@ -193,12 +218,15 @@ || (fl->fl_end > NLM_OFFSET_MAX && fl->fl_end != OFFSET_MAX)) return NULL; - *p++ = htonl(fl->fl_pid); - *p++ = htonl(fl->fl_start); + start = loff_t_to_s32(fl->fl_start); if (fl->fl_end == OFFSET_MAX) - *p++ = xdr_zero; + len = 0; else - *p++ = htonl(fl->fl_end - fl->fl_start + 1); + len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1); + + *p++ = htonl(fl->fl_pid); + *p++ = htonl(start); + *p++ = htonl(len); return p; } @@ -209,6 +237,8 @@ static u32 * nlm_encode_testres(u32 *p, struct nlm_res *resp) { + s32 start, len; + if (!(p = nlm_encode_cookie(p, &resp->cookie))) return 0; *p++ = resp->status; @@ -223,11 
+253,14 @@ if (!(p = xdr_encode_netobj(p, &resp->lock.oh))) return 0; - *p++ = htonl(fl->fl_start); + start = loff_t_to_s32(fl->fl_start); if (fl->fl_end == OFFSET_MAX) - *p++ = xdr_zero; + len = xdr_zero; else - *p++ = htonl(fl->fl_end - fl->fl_start + 1); + len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1); + + *p++ = htonl(start); + *p++ = htonl(len); } return p; @@ -446,7 +479,8 @@ resp->status = ntohl(*p++); if (resp->status == NLM_LCK_DENIED) { struct file_lock *fl = &resp->lock.fl; - u32 excl, len; + u32 excl; + s32 start, len, end; memset(&resp->lock, 0, sizeof(resp->lock)); excl = ntohl(*p++); @@ -456,10 +490,15 @@ fl->fl_flags = FL_POSIX; fl->fl_type = excl? F_WRLCK : F_RDLCK; - fl->fl_start = ntohl(*p++); + start = ntohl(*p++); len = ntohl(*p++); - if (len == 0 || (fl->fl_end = fl->fl_start + len - 1) < 0) + end = start + len - 1; + + fl->fl_start = s32_to_loff_t(start); + if (len == 0 || end < 0) fl->fl_end = OFFSET_MAX; + else + fl->fl_end = s32_to_loff_t(end); } return 0; } diff -urN 2.2.18/fs/lockd/xdr4.c 2.2.18aa1/fs/lockd/xdr4.c --- 2.2.18/fs/lockd/xdr4.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/lockd/xdr4.c Mon Dec 11 17:20:50 2000 @@ -23,7 +23,6 @@ #define NLMDBG_FACILITY NLMDBG_XDR #define NLM_MAXSTRLEN 1024 -#define OFFSET_MAX ((off_t)LONG_MAX) #define QUADLEN(len) (((len) + 3) >> 2) @@ -34,11 +33,23 @@ typedef struct nlm_args nlm_args; -static inline off_t -size_to_off_t(__s64 size) +static inline loff_t +s64_to_loff_t(__s64 offset) { - size = (size > (__s64)LONG_MAX) ? (off_t)LONG_MAX : (off_t) size; - return (size < (__s64)-LONG_MAX) ? (off_t)-LONG_MAX : (off_t) size; + return (loff_t)offset; +} + +static inline s64 +loff_t_to_s64(loff_t offset) +{ + s64 res; + if (offset > NLM4_OFFSET_MAX) + res = NLM4_OFFSET_MAX; + else if (offset < -NLM4_OFFSET_MAX) + res = -NLM4_OFFSET_MAX; + else + res = offset; + return res; } /* @@ -139,11 +150,12 @@ p = xdr_decode_hyper(p, &len); end = start + len - 1; - fl->fl_start = size_to_off_t(start); - fl->fl_end = size_to_off_t(end); + fl->fl_start = s64_to_loff_t(start); - if (len == 0 || fl->fl_end < 0) + if (len == 0 || end < 0) fl->fl_end = OFFSET_MAX; + else + fl->fl_end = s64_to_loff_t(end); return p; } @@ -154,18 +166,26 @@ nlm4_encode_lock(u32 *p, struct nlm_lock *lock) { struct file_lock *fl = &lock->fl; + __s64 start, len; if (!(p = xdr_encode_string(p, lock->caller, -1)) || !(p = nlm4_encode_fh(p, &lock->fh)) || !(p = nlm4_encode_oh(p, &lock->oh))) return NULL; - *p++ = htonl(fl->fl_pid); - p = xdr_encode_hyper(p, fl->fl_start); + if (fl->fl_start > NLM4_OFFSET_MAX + || (fl->fl_end > NLM4_OFFSET_MAX && fl->fl_end != OFFSET_MAX)) + return NULL; + + start = loff_t_to_s64(fl->fl_start); if (fl->fl_end == OFFSET_MAX) - p = xdr_encode_hyper(p, 0); + len = 0; else - p = xdr_encode_hyper(p, fl->fl_end - fl->fl_start + 1); + len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1); + + *p++ = htonl(fl->fl_pid); + p = xdr_encode_hyper(p, start); + p = xdr_encode_hyper(p, len); return p; } @@ -176,6 +196,7 @@ static u32 * nlm4_encode_testres(u32 *p, struct nlm_res *resp) { + s64 start, len; dprintk("xdr: before encode_testres (p %p resp %p)\n", p, resp); if (!(p = nlm4_encode_cookie(p, &resp->cookie))) @@ -192,14 +213,17 @@ if (!(p = xdr_encode_netobj(p, &resp->lock.oh))) return 0; - p = xdr_encode_hyper(p, fl->fl_start); + start = loff_t_to_s64(fl->fl_start); if (fl->fl_end == OFFSET_MAX) - p = xdr_encode_hyper(p, 0); + len = 0; else - p = xdr_encode_hyper(p, fl->fl_end - fl->fl_start + 1); - dprintk("xdr: encode_testres (status %d pid 
%d type %d start %ld end %ld)\n", resp->status, fl->fl_pid, fl->fl_type, fl->fl_start, fl->fl_end); - - + len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1); + + p = xdr_encode_hyper(p, start); + p = xdr_encode_hyper(p, len); + dprintk("xdr: encode_testres (status %d pid %d type %d start %Ld end %Ld)\n", + resp->status, fl->fl_pid, fl->fl_type, + fl->fl_start, fl->fl_end); } dprintk("xdr: after encode_testres (p %p resp %p)\n", p, resp); @@ -435,10 +459,11 @@ p = xdr_decode_hyper(p, &len); end = start + len - 1; - fl->fl_start = size_to_off_t(start); - fl->fl_end = size_to_off_t(end); - if (len == 0 || fl->fl_end < 0) + fl->fl_start = s64_to_loff_t(start); + if (len == 0 || end < 0) fl->fl_end = OFFSET_MAX; + else + fl->fl_end = s64_to_loff_t(end); } return 0; } diff -urN 2.2.18/fs/locks.c 2.2.18aa1/fs/locks.c --- 2.2.18/fs/locks.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/locks.c Mon Dec 11 17:20:54 2000 @@ -111,12 +111,12 @@ #include -#define OFFSET_MAX ((off_t)LONG_MAX) /* FIXME: move elsewhere? */ - static int flock_make_lock(struct file *filp, struct file_lock *fl, unsigned int cmd); -static int posix_make_lock(struct file *filp, struct file_lock *fl, +static int flock_to_posix_lock(struct file *filp, struct file_lock *fl, struct flock *l); +static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, + struct flock64 *l); static int flock_locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl); static int posix_locks_conflict(struct file_lock *caller_fl, @@ -195,7 +195,7 @@ if (waiter->fl_prevblock) { printk(KERN_ERR "locks_insert_block: remove duplicated lock " - "(pid=%d %ld-%ld type=%d)\n", + "(pid=%d %Ld-%Ld type=%d)\n", waiter->fl_pid, waiter->fl_start, waiter->fl_end, waiter->fl_type); locks_delete_block(waiter->fl_prevblock, waiter); @@ -307,6 +307,12 @@ error = -EBADF; if ((file_lock.fl_type != F_UNLCK) && !(filp->f_mode & 3)) goto out_putf; + if( filp->f_op->lock ) { + error = filp->f_op->lock(filp, + (((cmd&LOCK_NB)==LOCK_NB)?F_SETLK:F_SETLKW), + &file_lock); + if(error <0) goto out_putf; + } error = flock_lock_file(filp, &file_lock, (cmd & (LOCK_UN | LOCK_NB)) ? 0 : 1); out_putf: @@ -342,7 +348,7 @@ if (!filp->f_dentry || !filp->f_dentry->d_inode || !filp->f_op) goto out_putf; - if (!posix_make_lock(filp, &file_lock, &flock)) + if (!flock_to_posix_lock(filp, &file_lock, &flock)) goto out_putf; if (filp->f_op->lock) { @@ -361,6 +367,18 @@ flock.l_type = F_UNLCK; if (fl != NULL) { flock.l_pid = fl->fl_pid; +#if BITS_PER_LONG == 32 + /* + * Make sure we can represent the posix lock via + * legacy 32bit flock. + */ + error = -EOVERFLOW; + if (fl->fl_start > OFFT_OFFSET_MAX) + goto out_putf; + if ((fl->fl_end != OFFSET_MAX) + && (fl->fl_end > OFFT_OFFSET_MAX)) + goto out_putf; +#endif flock.l_start = fl->fl_start; flock.l_len = fl->fl_end == OFFSET_MAX ? 0 : fl->fl_end - fl->fl_start + 1; @@ -423,7 +441,7 @@ } error = -EINVAL; - if (!posix_make_lock(filp, &file_lock, &flock)) + if (!flock_to_posix_lock(filp, &file_lock, &flock)) goto out_putf; error = -EBADF; @@ -473,6 +491,169 @@ return error; } +#if BITS_PER_LONG == 32 +/* Report the first existing lock that would conflict with l. + * This implements the F_GETLK command of fcntl(). 
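
The fcntl_getlk64()/fcntl_setlk64() pair introduced here is the kernel half of the LFS locking interface. A hypothetical user-space counterpart, shown only as a sketch: it assumes a C library that exposes struct flock64, F_GETLK64 and O_LARGEFILE under -D_LARGEFILE64_SOURCE and that can reach these new entry points; none of this user code is part of the patch.

	#define _LARGEFILE64_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		struct flock64 fl;
		int fd;

		if (argc < 2 || (fd = open(argv[1], O_RDONLY | O_LARGEFILE)) < 0)
			return 1;
		fl.l_type = F_RDLCK;		/* would a read lock be granted? */
		fl.l_whence = SEEK_SET;
		fl.l_start = 3LL << 30;		/* 3GB: past the old 2G-1 limit */
		fl.l_len = 0;			/* ... through end of file */
		if (fcntl(fd, F_GETLK64, &fl) == 0 && fl.l_type != F_UNLCK)
			printf("pid %d holds a conflicting lock at %lld\n",
			       (int) fl.l_pid, (long long) fl.l_start);
		close(fd);
		return 0;
	}

Note that the legacy fcntl_getlk() above now answers with -EOVERFLOW instead of silently truncating when such a lock cannot be described in the 32-bit struct flock.
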
+ */ +int fcntl_getlk64(unsigned int fd, struct flock64 *l) +{ + struct file *filp; + struct file_lock *fl,file_lock; + struct flock64 flock; + int error; + + error = -EFAULT; + if (copy_from_user(&flock, l, sizeof(flock))) + goto out; + error = -EINVAL; + if ((flock.l_type != F_RDLCK) && (flock.l_type != F_WRLCK)) + goto out; + + error = -EBADF; + filp = fget(fd); + if (!filp) + goto out; + + error = -EINVAL; + if (!filp->f_dentry || !filp->f_dentry->d_inode || !filp->f_op) + goto out_putf; + + if (!flock64_to_posix_lock(filp, &file_lock, &flock)) + goto out_putf; + + if (filp->f_op->lock) { + error = filp->f_op->lock(filp, F_GETLK, &file_lock); + if (error < 0) + goto out_putf; + else if (error == LOCK_USE_CLNT) + /* Bypass for NFS with no locking - 2.0.36 compat */ + fl = posix_test_lock(filp, &file_lock); + else + fl = (file_lock.fl_type == F_UNLCK ? NULL : &file_lock); + } else { + fl = posix_test_lock(filp, &file_lock); + } + + flock.l_type = F_UNLCK; + if (fl != NULL) { + flock.l_pid = fl->fl_pid; + flock.l_start = fl->fl_start; + flock.l_len = fl->fl_end == OFFSET_MAX ? 0 : + fl->fl_end - fl->fl_start + 1; + flock.l_whence = 0; + flock.l_type = fl->fl_type; + } + error = -EFAULT; + if (!copy_to_user(l, &flock, sizeof(flock))) + error = 0; + +out_putf: + fput(filp); +out: + return error; +} + +/* Apply the lock described by l to an open file descriptor. + * This implements both the F_SETLK and F_SETLKW commands of fcntl(). + */ +int fcntl_setlk64(unsigned int fd, unsigned int cmd, struct flock64 *l) +{ + struct file *filp; + struct file_lock file_lock; + struct flock64 flock; + struct dentry * dentry; + struct inode *inode; + int error; + + /* + * This might block, so we do it before checking the inode. + */ + error = -EFAULT; + if (copy_from_user(&flock, l, sizeof(flock))) + goto out; + + /* Get arguments and validate them ... + */ + + error = -EBADF; + filp = fget(fd); + if (!filp) + goto out; + + error = -EINVAL; + if (!(dentry = filp->f_dentry)) + goto out_putf; + if (!(inode = dentry->d_inode)) + goto out_putf; + if (!filp->f_op) + goto out_putf; + + /* Don't allow mandatory locks on files that may be memory mapped + * and shared. + */ + if (IS_MANDLOCK(inode) && + (inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID && + inode->i_mmap) { + struct vm_area_struct *vma = inode->i_mmap; + error = -EAGAIN; + do { + if (vma->vm_flags & VM_MAYSHARE) + goto out_putf; + } while ((vma = vma->vm_next_share) != NULL); + } + + error = -EINVAL; + if (!flock64_to_posix_lock(filp, &file_lock, &flock)) + goto out_putf; + + error = -EBADF; + switch (flock.l_type) { + case F_RDLCK: + if (!(filp->f_mode & FMODE_READ)) + goto out_putf; + break; + case F_WRLCK: + if (!(filp->f_mode & FMODE_WRITE)) + goto out_putf; + break; + case F_UNLCK: + break; + case F_SHLCK: + case F_EXLCK: +#ifdef __sparc__ +/* warn a bit for now, but don't overdo it */ +{ + static int count = 0; + if (!count) { + count=1; + printk(KERN_WARNING + "fcntl_setlk() called by process %d (%s) with broken flock() emulation\n", + current->pid, current->comm); + } +} + if (!(filp->f_mode & 3)) + goto out_putf; + break; +#endif + default: + error = -EINVAL; + goto out_putf; + } + + if (filp->f_op->lock != NULL) { + error = filp->f_op->lock(filp, cmd, &file_lock); + if (error < 0) + goto out_putf; + } + error = posix_lock_file(filp, &file_lock, cmd == F_SETLKW64); + +out_putf: + fput(filp); +out: + return error; +} +#endif /* BITS_PER_LONG == 32 */ + /* * This function is called when the file is being removed * from the task's fd array. 
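
Restating the representability rule that the BITS_PER_LONG == 32 branch added to fcntl_getlk() enforces, as a standalone sketch (the helper name and local definitions are illustrative, not from the patch): a lock is reportable through the legacy struct flock only when both ends fit in 31 bits, with an end-of-file lock (fl_end == OFFSET_MAX) encoded as l_len == 0.

	#include <limits.h>

	typedef long long loff_t;		/* as on 32-bit Linux */
	#define OFFT_OFFSET_MAX	0x7fffffffLL	/* largest legacy off_t */
	#define OFFSET_MAX	LLONG_MAX	/* assumed: "to end of file" */

	static int fits_legacy_flock(loff_t start, loff_t end)
	{
		if (start > OFFT_OFFSET_MAX)
			return 0;	/* caller returns -EOVERFLOW */
		if (end != OFFSET_MAX && end > OFFT_OFFSET_MAX)
			return 0;	/* finite end does not fit either */
		return 1;
	}

fcntl_getlk64() needs no such check, since struct flock64 can represent any posix lock the kernel holds.
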
@@ -653,10 +834,10 @@ /* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX * style lock. */ -static int posix_make_lock(struct file *filp, struct file_lock *fl, - struct flock *l) +static int flock_to_posix_lock(struct file *filp, struct file_lock *fl, + struct flock *l) { - off_t start; + loff_t start; memset(fl, 0, sizeof(*fl)); @@ -702,6 +883,57 @@ return (1); } +#if BITS_PER_LONG == 32 +static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, + struct flock64 *l) +{ + loff_t start; + + memset(fl, 0, sizeof(*fl)); + + fl->fl_flags = FL_POSIX; + + switch (l->l_type) { + case F_RDLCK: + case F_WRLCK: + case F_UNLCK: + fl->fl_type = l->l_type; + break; + default: + return (0); + } + + switch (l->l_whence) { + case 0: /*SEEK_SET*/ + start = 0; + break; + case 1: /*SEEK_CUR*/ + start = filp->f_pos; + break; + case 2: /*SEEK_END*/ + start = filp->f_dentry->d_inode->i_size; + break; + default: + return (0); + } + + if (((start += l->l_start) < 0) || (l->l_len < 0)) + return (0); + fl->fl_end = start + l->l_len - 1; + if (l->l_len > 0 && fl->fl_end < 0) + return (0); + fl->fl_start = start; /* we record the absolute position */ + if (l->l_len == 0) + fl->fl_end = OFFSET_MAX; + + fl->fl_file = filp; + fl->fl_owner = current->files; + fl->fl_pid = current->pid; + + return (1); +} +#endif + /* Verify a call to flock() and fill in a file_lock structure with * an appropriate FLOCK lock. */ @@ -1215,7 +1447,7 @@ p += sprintf(p, "FLOCK ADVISORY "); } p += sprintf(p, "%s ", (fl->fl_type == F_RDLCK) ? "READ " : "WRITE"); - p += sprintf(p, "%d %s:%ld %ld %ld ", + p += sprintf(p, "%d %s:%ld %Ld %Ld ", fl->fl_pid, kdevname(inode->i_dev), inode->i_ino, fl->fl_start, fl->fl_end); @@ -1279,6 +1511,3 @@ *start = buffer; return (q - buffer); } - - - diff -urN 2.2.18/fs/minix/dir.c 2.2.18aa1/fs/minix/dir.c --- 2.2.18/fs/minix/dir.c Mon Jan 17 16:44:42 2000 +++ 2.2.18aa1/fs/minix/dir.c Mon Dec 11 17:20:50 2000 @@ -82,7 +82,7 @@ de = (struct minix_dir_entry *) (offset + bh->b_data); if (de->inode) { int size = strnlen(de->name, info->s_namelen); - if (filldir(dirent, de->name, size, filp->f_pos, de->inode) < 0) { + if (filldir(dirent, de->name, size, filp->f_pos, de->inode, DT_UNKNOWN) < 0) { brelse(bh); return 0; } diff -urN 2.2.18/fs/minix/file.c 2.2.18aa1/fs/minix/file.c --- 2.2.18/fs/minix/file.c Mon Jan 17 16:44:42 2000 +++ 2.2.18aa1/fs/minix/file.c Mon Dec 11 17:20:50 2000 @@ -70,8 +70,8 @@ size_t count, loff_t *ppos) { struct inode * inode = filp->f_dentry->d_inode; - off_t pos; - ssize_t written, c; + loff_t pos; + ssize_t written, c, m; struct buffer_head * bh; char * p; @@ -87,15 +87,34 @@ pos = inode->i_size; else pos = *ppos; + + /* L-F-S spec 2.2.1.27: */ + if (!(filp->f_flags & O_LARGEFILE)) { + if (pos >= 0x7fffffffULL) /* pos@2G forbidden */ + return -EFBIG; + + if (pos + count > 0x7fffffffULL) + /* Write only until end of allowed region */ + count = 0x7fffffffULL - pos; + } + /* MINIX i-node file-size can't exceed 4G-1 */ + /* With 1k blocks and triple indirection MINIX can have files + up to 16 GB in size -- filesystem maximum is then 4G*1k = 4T */ + if (pos >= 0xffffffffULL) + return -EFBIG; /* Absolutely too much! */ + if ((pos + count) >= 0x100000000ULL) /* too much to write! 
*/ + count = 0xffffffffULL - pos; + written = 0; while (written < count) { - bh = minix_getblk(inode,pos/BLOCK_SIZE,1); + bh = minix_getblk(inode, pos >> BLOCK_SIZE_BITS, 1); if (!bh) { if (!written) written = -ENOSPC; break; } - c = BLOCK_SIZE - (pos % BLOCK_SIZE); + m = pos & (BLOCK_SIZE - 1); + c = BLOCK_SIZE - m; if (c > count-written) c = count-written; if (c != BLOCK_SIZE && !buffer_uptodate(bh)) { @@ -108,7 +127,7 @@ break; } } - p = (pos % BLOCK_SIZE) + bh->b_data; + p = bh->b_data + m; c -= copy_from_user(p,buf,c); if (!c) { brelse(bh); diff -urN 2.2.18/fs/ncpfs/dir.c 2.2.18aa1/fs/ncpfs/dir.c --- 2.2.18/fs/ncpfs/dir.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/ncpfs/dir.c Mon Dec 11 17:20:50 2000 @@ -449,14 +449,14 @@ result = 0; if (filp->f_pos == 0) { ncp_invalid_dir_cache(inode); - if (filldir(dirent, ".", 1, 0, inode->i_ino) < 0) { + if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) { goto finished; } filp->f_pos = 1; } if (filp->f_pos == 1) { if (filldir(dirent, "..", 2, 1, - dentry->d_parent->d_inode->i_ino) < 0) { + dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) { goto finished; } filp->f_pos = 2; @@ -537,7 +537,7 @@ ino = ncp_invent_inos(1); if (filldir(dirent, entry->i.entryName, entry->i.nameLen, - entry->f_pos, ino) < 0) { + entry->f_pos, ino, DT_UNKNOWN) < 0) { break; } if ((inode->i_dev != c_dev) diff -urN 2.2.18/fs/ncpfs/file.c 2.2.18aa1/fs/ncpfs/file.c --- 2.2.18/fs/ncpfs/file.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/ncpfs/file.c Mon Dec 11 17:20:50 2000 @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "ncplib_kernel.h" @@ -161,7 +162,7 @@ /* First read in as much as possible for each bufsize. */ while (already_read < count) { int read_this_time; - size_t to_read = min(bufsize - (pos % bufsize), + size_t to_read = min(bufsize - (pos & (bufsize-1)), count - already_read); error = ncp_read_bounce(NCP_SERVER(inode), @@ -201,7 +202,7 @@ struct dentry *dentry = file->f_dentry; struct inode *inode = dentry->d_inode; size_t already_written = 0; - off_t pos; + loff_t pos; size_t bufsize; int errno; void* bouncebuffer; @@ -238,12 +239,18 @@ already_written = 0; + /* Maximum file size: 2G-1 */ + if (pos >= 0x7fffffffULL) + return -EFBIG; + if ((pos + count) >= 0x7fffffffULL) + count = 0x7fffffffULL - pos; + bouncebuffer = kmalloc(bufsize, GFP_NFS); if (!bouncebuffer) return -EIO; /* -ENOMEM */ while (already_written < count) { int written_this_time; - size_t to_write = min(bufsize - (pos % bufsize), + size_t to_write = min(bufsize - (pos & (bufsize-1)), count - already_written); if (copy_from_user(bouncebuffer, buf, to_write)) { diff -urN 2.2.18/fs/ncpfs/inode.c 2.2.18aa1/fs/ncpfs/inode.c --- 2.2.18/fs/ncpfs/inode.c Tue Jun 13 03:48:14 2000 +++ 2.2.18aa1/fs/ncpfs/inode.c Mon Dec 11 17:20:50 2000 @@ -131,7 +131,7 @@ } inode->i_blocks = 0; if ((inode->i_size)&&(inode->i_blksize)) { - inode->i_blocks = (inode->i_size-1)/(inode->i_blksize)+1; + inode->i_blocks = ((inode->i_size-1) >> fslog2(inode->i_blksize)) +1; } inode->i_mtime = ncp_date_dos2unix(le16_to_cpu(nwi->modifyTime), @@ -201,8 +201,7 @@ inode->i_blocks = 0; if ((inode->i_blksize != 0) && (inode->i_size != 0)) { - inode->i_blocks = - (inode->i_size - 1) / inode->i_blksize + 1; + inode->i_blocks = ((inode->i_size - 1) >> fslog2(inode->i_blksize)) + 1; } inode->i_mtime = ncp_date_dos2unix(le16_to_cpu(nwi->modifyTime), diff -urN 2.2.18/fs/nfs/dir.c 2.2.18aa1/fs/nfs/dir.c --- 2.2.18/fs/nfs/dir.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/nfs/dir.c Mon Dec 11 17:20:50 2000 @@ -136,7 +136,7 
@@ int plus = NFS_USE_READDIRPLUS(inode); int error; - dfprintk(VFS, "NFS: nfs_readdir_filler() reading cookie %Lu into page %lu.\n", (long long)desc->entry->cookie, page->offset); + dfprintk(VFS, "NFS: nfs_readdir_filler() reading cookie %Lu into page %Lu.\n", (long long)desc->entry->cookie, (long long) nfs_page_offset(page)); again: error = NFS_CALL(readdir, inode, (dir, &dir_attr, @@ -159,7 +159,7 @@ * Note: assumes we have exclusive access to this inode either * throught inode->i_sem or some other mechanism. */ - if (page->offset == 0) + if (page_index(page) == 0) invalidate_inode_pages(inode); nfs_unlock_page(page); return 0; @@ -295,7 +295,7 @@ * retrieving the current dirent on the server */ fileid = nfs_fileid_to_ino_t(entry->ino); res = filldir(dirent, entry->name, entry->len, - entry->prev_cookie, fileid); + entry->prev_cookie, fileid, DT_UNKNOWN); if (res < 0) break; file->f_pos = desc->target = entry->cookie; diff -urN 2.2.18/fs/nfs/file.c 2.2.18aa1/fs/nfs/file.c --- 2.2.18/fs/nfs/file.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/nfs/file.c Mon Dec 11 17:20:50 2000 @@ -162,6 +162,9 @@ static int nfs_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) { + if (!NFS_PROTO(file->f_dentry->d_inode)->bigfiles && + page_index(page) > (0x7fffffff>>PAGE_SHIFT)) + return -EFBIG; return nfs_flush_incompatible(file, page); } @@ -232,10 +235,10 @@ struct inode * inode = dentry->d_inode; int status = 0; - dprintk("NFS: nfs_lock(f=%4x/%ld, t=%x, fl=%x, r=%ld:%ld)\n", + dprintk("NFS: nfs_lock(f=%4x/%ld, t=%x, fl=%x, r=%Ld:%Ld)\n", inode->i_dev, inode->i_ino, fl->fl_type, fl->fl_flags, - fl->fl_start, fl->fl_end); + (long long)fl->fl_start, (long long)fl->fl_end); if (!inode) return -EINVAL; diff -urN 2.2.18/fs/nfs/inode.c 2.2.18aa1/fs/nfs/inode.c --- 2.2.18/fs/nfs/inode.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/nfs/inode.c Mon Dec 11 17:20:51 2000 @@ -687,7 +687,7 @@ * Preset the size and mtime, as there's no need * to invalidate the caches. 
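
The EFBIG check added to nfs_prepare_write() above, and the matching checks in the inode code below, all key off a single per-protocol capability: the new last field of the nfs_rpc_ops tables (0 for NFSv2 in fs/nfs/proc.c, 1 for NFSv3 in fs/nfs/nfs3proc.c). A sketch of the idea with a hypothetical helper; the patch itself open-codes this test at each site:

	/* illustrative only -- not part of the patch */
	static int nfs_offset_representable(int bigfiles, long long pos)
	{
		if (bigfiles)
			return 1;		/* NFSv3: 64-bit offsets on the wire */
		return pos <= 0x7fffffff;	/* NFSv2: hard 2G-1 ceiling */
	}

The open path likewise strips O_LARGEFILE when the server protocol cannot do big files, so the generic 2G enforcement keeps working unchanged.
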
*/ - inode->i_size = nfs_size_to_off_t(fattr->size); + inode->i_size = nfs_size_to_loff_t(fattr->size); inode->i_mtime = nfs_time_to_secs(fattr->mtime); inode->i_atime = nfs_time_to_secs(fattr->atime); inode->i_ctime = nfs_time_to_secs(fattr->ctime); @@ -891,6 +891,11 @@ if (!S_ISREG(inode->i_mode)) attr->ia_valid &= ~ATTR_SIZE; + error = -EFBIG; + if ((attr->ia_valid & ATTR_SIZE) && !NFS_PROTO(inode)->bigfiles && + attr->ia_size > 0x7fffffff) + goto out; + error = nfs_wb_all(inode); if (error < 0) goto out; @@ -982,6 +987,8 @@ struct rpc_auth *auth = NFS_CLIENT(inode)->cl_auth; struct nfs_file *data; + if (!NFS_PROTO(filp->f_dentry->d_inode)->bigfiles) + filp->f_flags &= ~O_LARGEFILE; data = nfs_file_alloc(); if (!data) return -ENOMEM; @@ -1100,8 +1107,8 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) { - off_t new_size, new_isize; - __u64 new_mtime; + __u64 new_size, new_mtime; + loff_t new_isize; int invalid = 0; int error = -EIO; @@ -1148,7 +1155,7 @@ new_mtime = fattr->mtime; new_size = fattr->size; - new_isize = nfs_size_to_off_t(fattr->size); + new_isize = nfs_size_to_loff_t(fattr->size); error = 0; diff -urN 2.2.18/fs/nfs/nfs3proc.c 2.2.18aa1/fs/nfs/nfs3proc.c --- 2.2.18/fs/nfs/nfs3proc.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/nfs/nfs3proc.c Mon Dec 11 17:20:51 2000 @@ -143,7 +143,7 @@ static int nfs3_proc_read(struct dentry *dentry, struct nfs_fattr *fattr, struct rpc_cred *cred, int flags, - unsigned long offset, unsigned int count, + loff_t offset, unsigned int count, void *buffer, int *eofp) { struct nfs_readargs arg = { NFS_FH(dentry), offset, count, 1, @@ -153,7 +153,7 @@ struct rpc_message msg = { NFS3PROC_READ, &arg, &res, cred }; int status; - dprintk("NFS call read %d @ %ld\n", count, offset); + dprintk("NFS call read %d @ %Ld\n", count, (long long) offset); fattr->valid = 0; status = rpc_call_sync(NFS_CLIENT(dentry->d_inode), &msg, flags); dprintk("NFS reply read: %d\n", status); @@ -164,7 +164,7 @@ static int nfs3_proc_write(struct dentry *dentry, struct nfs_fattr *fattr, struct rpc_cred *cred, int flags, - unsigned long offset, unsigned int count, + loff_t offset, unsigned int count, void *buffer, struct nfs_writeverf *verf) { struct nfs_writeargs arg = { NFS_FH(dentry), offset, count, @@ -175,7 +175,7 @@ struct rpc_message msg = { NFS3PROC_WRITE, &arg, &res, cred }; int status, rpcflags = 0; - dprintk("NFS call write %d @ %ld\n", count, offset); + dprintk("NFS call write %d @ %Ld\n", count, (long long) offset); fattr->valid = 0; if (flags & NFS_RW_SWAP) rpcflags |= NFS_RPC_SWAPFLAGS; @@ -506,4 +506,6 @@ nfs3_proc_statfs, nfs3_decode_dirent, + + 1, }; diff -urN 2.2.18/fs/nfs/proc.c 2.2.18aa1/fs/nfs/proc.c --- 2.2.18/fs/nfs/proc.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/nfs/proc.c Mon Dec 11 17:20:51 2000 @@ -138,7 +138,7 @@ static int nfs_proc_read(struct dentry *dentry, fattr *fattr, struct rpc_cred *cred, int flags, - unsigned long offset, unsigned int count, + loff_t offset, unsigned int count, void *buffer, int *eofp) { struct nfs_readargs arg = { NFS_FH(dentry), offset, count, 1, @@ -148,7 +148,7 @@ struct rpc_message msg = { NFSPROC_READ, &arg, &res, cred }; int status; - dprintk("NFS call read %d @ %ld\n", count, offset); + dprintk("NFS call read %d @ %Ld\n", count, (long long) offset); fattr->valid = 0; status = rpc_call_sync(NFS_CLIENT(dentry->d_inode), &msg, flags); @@ -160,7 +160,7 @@ static int nfs_proc_write(struct dentry *dentry, fattr *fattr, struct rpc_cred *cred, int how, - unsigned long offset, unsigned int count, + loff_t 
offset, unsigned int count, void *buffer, struct nfs_writeverf *verf) { struct nfs_writeargs arg = {NFS_FH(dentry), offset, count, @@ -171,7 +171,7 @@ struct rpc_message msg = { NFSPROC_WRITE, &arg, &res, cred }; int status, flags = 0; - dprintk("NFS call write %d @ %ld\n", count, offset); + dprintk("NFS call write %d @ %Ld\n", count, (long long) offset); fattr->valid = 0; if (how & NFS_RW_SWAP) flags |= NFS_RPC_SWAPFLAGS; @@ -431,4 +431,5 @@ nfs_proc_mknod, nfs_proc_statfs, nfs_decode_dirent, + 0, }; diff -urN 2.2.18/fs/nfs/read.c 2.2.18aa1/fs/nfs/read.c --- 2.2.18/fs/nfs/read.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/nfs/read.c Mon Dec 11 17:20:51 2000 @@ -93,7 +93,7 @@ struct inode *inode = dentry->d_inode; struct rpc_cred *cred = nfs_file_cred(file); struct nfs_fattr fattr; - unsigned long offset = nfs_page_offset(page); + loff_t offset = nfs_page_offset(page); char *buffer = (char *) page_address(page); int rsize = NFS_SERVER(inode)->rsize; int result, refresh = 0; @@ -110,10 +110,10 @@ if ((chunk = rsize) > count) chunk = count; - dprintk("NFS: nfs_proc_read(%s, (%s/%s), %ld, %d, %p)\n", + dprintk("NFS: nfs_proc_read(%s, (%s/%s), %Ld, %d, %p)\n", NFS_SERVER(inode)->hostname, dentry->d_parent->d_name.name, dentry->d_name.name, - offset, chunk, buffer); + (long long) offset, chunk, buffer); result = NFS_CALL(read, inode, (dentry, &fattr, cred, flags, offset, chunk, buffer, &eof)); @@ -440,11 +440,11 @@ set_bit(PG_error, &page->flags); nfs_unlock_page(page); - dprintk("NFS: read (%s/%s %d@%ld)\n", + dprintk("NFS: read (%s/%s %d@%Ld)\n", req->wb_dentry->d_parent->d_name.name, req->wb_dentry->d_name.name, req->wb_bytes, - (nfs_page_offset(page) + req->wb_offset)); + (long long)(nfs_page_offset(page) + req->wb_offset)); nfs_unlock_request(req); nfs_release_request(req); } @@ -473,8 +473,8 @@ while (!nfs_lock_page(page)) wait_on_page(page); - dprintk("NFS: nfs_readpage (%p %d@%ld)\n", - page, rsize, page->offset); + dprintk("NFS: nfs_readpage (%p %d@%Ld)\n", + page, rsize, (long long) nfs_page_offset(page)); /* * Try to flush any pending writes to the file diff -urN 2.2.18/fs/nfs/write.c 2.2.18aa1/fs/nfs/write.c --- 2.2.18/fs/nfs/write.c Mon Dec 11 16:58:00 2000 +++ 2.2.18aa1/fs/nfs/write.c Mon Dec 11 17:20:51 2000 @@ -144,7 +144,7 @@ */ static int nfs_writepage_sync(struct file *file, struct page *page, - unsigned long offset, unsigned int count) + unsigned int offset, unsigned int count) { struct dentry *dentry = file->f_dentry; struct inode *inode = dentry->d_inode; @@ -154,13 +154,14 @@ u8 *buffer; struct nfs_fattr fattr; struct nfs_writeverf verifier; + loff_t base; - dprintk("NFS: nfs_writepage_sync(%s/%s %d@%ld)\n", + dprintk("NFS: nfs_writepage_sync(%s/%s %d@%Ld)\n", dentry->d_parent->d_name.name, dentry->d_name.name, - count, nfs_page_offset(page) + offset); + count, (long long) (nfs_page_offset(page) + offset)); buffer = (u8 *) page_address(page) + offset; - offset += nfs_page_offset(page); + base = nfs_page_offset(page) + offset; flags = ((IS_SWAPFILE(inode)) ? NFS_RW_SWAP : 0) | NFS_RW_SYNC; @@ -169,7 +170,7 @@ wsize = count; result = NFS_PROTO(inode)->write(dentry, &fattr, cred, flags, - offset, wsize, buffer, + base, wsize, buffer, &verifier); nfs_write_attributes(inode, &fattr); @@ -183,15 +184,15 @@ wsize, result); refresh = 1; buffer += wsize; - offset += wsize; + base += wsize; written += wsize; count -= wsize; /* * If we've extended the file, update the inode * now so we don't invalidate the cache. 
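
nfs_writepage_sync() above now keeps the running file position in a 64-bit loff_t ("base") instead of folding it into the unsigned long page offset; on a 32-bit host the old arithmetic silently wraps at 4G. A standalone demonstration (plain user-space C, not from the patch):

	#include <stdio.h>

	int main(void)
	{
		unsigned long page_off = 0xfffff000UL;	/* last page below 4G */
		long long sum64 = (long long) page_off + 0x1000;

		/* on a 32-bit host the first value prints 0, not 4294967296 */
		printf("32-bit: %lu  64-bit: %lld\n",
		       page_off + 0x1000, sum64);
		return 0;
	}
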
*/ - if (offset > inode->i_size) - inode->i_size = offset; + if (base > inode->i_size) + inode->i_size = base; } while (count); io_error: @@ -208,9 +209,9 @@ struct inode *inode = file->f_dentry->d_inode; unsigned offset = PAGE_CACHE_SIZE; - if (page->offset >= inode->i_size) + if (nfs_page_offset(page) >= inode->i_size) return -EIO; - if (page->offset + offset > inode->i_size) + if (nfs_page_offset(page) + offset > inode->i_size) offset = inode->i_size & (PAGE_CACHE_SIZE-1); return nfs_writepage_sync(file, page, 0, offset); } @@ -223,7 +224,7 @@ region_locked(struct inode *inode, struct nfs_page *req) { struct file_lock *fl; - unsigned long rqstart, rqend; + loff_t rqstart, rqend; /* Don't optimize writes if we don't use NLM */ if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM) @@ -882,9 +883,9 @@ struct nfs_page *req; int status = 0; - dprintk("NFS: nfs_updatepage(%s/%s %d@%ld, sync=%d)\n", + dprintk("NFS: nfs_updatepage(%s/%s %d@%Ld, sync=%d)\n", dentry->d_parent->d_name.name, dentry->d_name.name, - count, nfs_page_offset(page)+offset, sync); + count, (long long) (nfs_page_offset(page)+offset), sync); /* * If wsize is smaller than page size, update and write @@ -934,8 +935,8 @@ } nfs_release_request(req); done: - dprintk("NFS: nfs_updatepage returns %d (isize %ld)\n", - status, inode->i_size); + dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n", + status, (long long) inode->i_size); if (status < 0) clear_bit(PG_uptodate, &page->flags); return status; @@ -1183,13 +1184,13 @@ struct nfs_page *req; struct dentry *dentry; struct inode *inode; - unsigned long start, end, len; + loff_t start, end, len; /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ end = 0; - start = ~0; + start = NFS_OFFSET_MAX; req = nfs_list_entry(head->next); dentry = req->wb_dentry; data->dentry = dentry; @@ -1197,7 +1198,7 @@ inode = dentry->d_inode; while (!list_empty(head)) { struct nfs_page *req; - unsigned long rqstart, rqend; + loff_t rqstart, rqend; req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_list_add_request(req, &data->pages); @@ -1211,7 +1212,7 @@ data->args.fh = NFS_FH(dentry); data->args.offset = start; len = end - start; - if (end >= inode->i_size || len > (~((u32)0) >> 1)) + if (end >= inode->i_size || len < 0 || len > (~((u32)0) >> 1)) len = 0; data->res.count = data->args.count = (u32)len; data->res.fattr = &data->fattr; @@ -1290,11 +1291,11 @@ req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - dprintk("NFS: commit (%s/%s %d@%ld)", + dprintk("NFS: commit (%s/%s %d@%Ld)", req->wb_dentry->d_parent->d_name.name, req->wb_dentry->d_name.name, req->wb_bytes, - nfs_page_offset(req->wb_page) + req->wb_offset); + (long long) (nfs_page_offset(req->wb_page) + req->wb_offset)); if (task->tk_status < 0) { if (req->wb_file) req->wb_file->f_error = task->tk_status; diff -urN 2.2.18/fs/nfsd/nfs3xdr.c 2.2.18aa1/fs/nfsd/nfs3xdr.c --- 2.2.18/fs/nfsd/nfs3xdr.c Mon Dec 11 16:58:01 2000 +++ 2.2.18aa1/fs/nfsd/nfs3xdr.c Mon Dec 11 17:20:51 2000 @@ -142,9 +142,9 @@ iap->ia_valid |= ATTR_SIZE; p = dec64(p, &newsize); if (newsize <= NFS_OFFSET_MAX) - iap->ia_size = (u32) newsize; + iap->ia_size = newsize; else - iap->ia_size = ~(size_t) 0; + iap->ia_size = NFS_OFFSET_MAX; } if ((tmp = ntohl(*p++)) == 1) { /* set to server time */ iap->ia_valid |= ATTR_ATIME; @@ -689,11 +689,6 @@ if (name == 0) return 0; - /* - dprintk("encode_entry(%.*s @%ld%s)\n", - namlen, name, (long) offset, plus? 
" plus" : ""); - */ - /* truncate filename if too long */ if (namlen > NFS3_MAXNAMLEN) namlen = NFS3_MAXNAMLEN; @@ -746,14 +741,14 @@ int nfs3svc_encode_entry(struct readdir_cd *cd, const char *name, - int namlen, off_t offset, ino_t ino) + int namlen, off_t offset, ino_t ino, unsigned int d_type) { return encode_entry(cd, name, namlen, offset, ino, 0); } int nfs3svc_encode_entry_plus(struct readdir_cd *cd, const char *name, - int namlen, off_t offset, ino_t ino) + int namlen, off_t offset, ino_t ino, unsigned int d_type) { return encode_entry(cd, name, namlen, offset, ino, 1); } diff -urN 2.2.18/fs/nfsd/nfsfh.c 2.2.18aa1/fs/nfsd/nfsfh.c --- 2.2.18/fs/nfsd/nfsfh.c Mon Dec 11 16:58:01 2000 +++ 2.2.18aa1/fs/nfsd/nfsfh.c Mon Dec 11 17:20:51 2000 @@ -41,7 +41,7 @@ * the name matching the specified inode number. */ static int filldir_one(void * __buf, const char * name, int len, - off_t pos, ino_t ino) + off_t pos, ino_t ino, unsigned int d_type) { struct nfsd_getdents_callback *buf = __buf; struct qstr *qs = buf->name; diff -urN 2.2.18/fs/nfsd/nfssvc.c 2.2.18aa1/fs/nfsd/nfssvc.c --- 2.2.18/fs/nfsd/nfssvc.c Mon Dec 11 16:58:01 2000 +++ 2.2.18aa1/fs/nfsd/nfssvc.c Mon Dec 11 17:20:51 2000 @@ -115,6 +115,7 @@ current->session = 1; current->pgrp = 1; current->fs->umask = 0; + current->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; /* Count active threads */ atomic_inc(&nfsd_active); diff -urN 2.2.18/fs/nfsd/nfsxdr.c 2.2.18aa1/fs/nfsd/nfsxdr.c --- 2.2.18/fs/nfsd/nfsxdr.c Mon Dec 11 16:58:01 2000 +++ 2.2.18aa1/fs/nfsd/nfsxdr.c Mon Dec 11 17:20:51 2000 @@ -443,7 +443,7 @@ int nfssvc_encode_entry(struct readdir_cd *cd, const char *name, - int namlen, off_t offset, ino_t ino) + int namlen, off_t offset, ino_t ino, unsigned int d_type) { u32 *p = cd->buffer; int buflen, slen; diff -urN 2.2.18/fs/nfsd/vfs.c 2.2.18aa1/fs/nfsd/vfs.c --- 2.2.18/fs/nfsd/vfs.c Mon Dec 11 16:58:01 2000 +++ 2.2.18aa1/fs/nfsd/vfs.c Mon Dec 11 17:20:51 2000 @@ -509,11 +509,11 @@ filp->f_count = 1; filp->f_dentry = dentry; if (access & MAY_WRITE) { - filp->f_flags = O_WRONLY; + filp->f_flags = O_WRONLY | O_LARGEFILE; filp->f_mode = FMODE_WRITE; DQUOT_INIT(inode); } else { - filp->f_flags = O_RDONLY; + filp->f_flags = O_RDONLY | O_LARGEFILE; filp->f_mode = FMODE_READ; } @@ -655,8 +655,9 @@ /* Write back readahead params */ if (ra != NULL) { dprintk("nfsd: raparms %ld %ld %ld %ld %ld\n", - file.f_reada, file.f_ramax, file.f_raend, - file.f_ralen, file.f_rawin); + (u_long)file.f_reada, (u_long)file.f_ramax, + (u_long)file.f_raend, (u_long)file.f_ralen, + (u_long)file.f_rawin); ra->p_reada = file.f_reada; ra->p_ramax = file.f_ramax; ra->p_raend = file.f_raend; diff -urN 2.2.18/fs/ntfs/fs.c 2.2.18aa1/fs/ntfs/fs.c --- 2.2.18/fs/ntfs/fs.c Tue Sep 5 02:28:49 2000 +++ 2.2.18aa1/fs/ntfs/fs.c Mon Dec 11 17:20:51 2000 @@ -199,7 +199,7 @@ /* filldir expects an off_t rather than an loff_t. 
Hope we don't have more than 65535 index records */ error=nf->filldir(nf->dirent,nf->name,nf->namelen, - (nf->ph<<16)|nf->pl,inum); + (nf->ph<<16)|nf->pl,inum,DT_UNKNOWN); ntfs_free(nf->name); /* Linux filldir errors are negative, other errors positive */ return error; @@ -225,11 +225,11 @@ if(cb.ph==0xFFFF){ /* FIXME: Maybe we can return those with the previous call */ switch(cb.pl){ - case 0: filldir(dirent,".",1,filp->f_pos,dir->i_ino); + case 0: filldir(dirent,".",1,filp->f_pos,dir->i_ino,DT_DIR); filp->f_pos=0xFFFF0001; return 0; /* FIXME: parent directory */ - case 1: filldir(dirent,"..",2,filp->f_pos,0); + case 1: filldir(dirent,"..",2,filp->f_pos,0,DT_DIR); filp->f_pos=0xFFFF0002; return 0; } @@ -822,6 +822,7 @@ struct statfs fs; struct inode *mft; ntfs_volume *vol; + ntfs_u64 size; int error; ntfs_debug(DEBUG_OTHER, "ntfs_statfs\n"); @@ -830,16 +831,17 @@ fs.f_type=NTFS_SUPER_MAGIC; fs.f_bsize=vol->clustersize; - error = ntfs_get_volumesize( NTFS_SB2VOL( sb ), &fs.f_blocks ); + error = ntfs_get_volumesize( NTFS_SB2VOL( sb ), &size ); if( error ) return -error; + fs.f_blocks = size; fs.f_bfree=ntfs_get_free_cluster_count(vol->bitmap); fs.f_bavail=fs.f_bfree; /* Number of files is limited by free space only, so we lie here */ fs.f_ffree=0; mft=iget(sb,FILE_MFT); - fs.f_files=mft->i_size/vol->mft_recordsize; + fs.f_files = (long)mft->i_size / vol->mft_recordsize; iput(mft); /* should be read from volume */ diff -urN 2.2.18/fs/ntfs/super.c 2.2.18aa1/fs/ntfs/super.c --- 2.2.18/fs/ntfs/super.c Mon Dec 11 16:58:03 2000 +++ 2.2.18aa1/fs/ntfs/super.c Mon Dec 11 17:20:51 2000 @@ -304,7 +304,7 @@ * Writes the volume size into vol_size. Returns 0 if successful * or error. */ -int ntfs_get_volumesize(ntfs_volume *vol, long *vol_size ) +int ntfs_get_volumesize(ntfs_volume *vol, ntfs_u64 *vol_size ) { ntfs_io io; ntfs_u64 size; @@ -325,9 +325,7 @@ ntfs_getput_clusters(vol,0,0,&io); size=NTFS_GETU64(cluster0+0x28); ntfs_free(cluster0); - /* FIXME: more than 2**32 cluster */ - /* FIXME: gcc will emit udivdi3 if we don't truncate it */ - *vol_size = ((unsigned long)size)/vol->clusterfactor; + *vol_size = size; return 0; } diff -urN 2.2.18/fs/ntfs/super.h 2.2.18aa1/fs/ntfs/super.h --- 2.2.18/fs/ntfs/super.h Mon Jan 17 16:44:42 2000 +++ 2.2.18aa1/fs/ntfs/super.h Mon Dec 11 17:20:51 2000 @@ -10,7 +10,7 @@ #define ALLOC_REQUIRE_SIZE 2 int ntfs_get_free_cluster_count(ntfs_inode *bitmap); -int ntfs_get_volumesize(ntfs_volume *vol, long *vol_size ); +int ntfs_get_volumesize(ntfs_volume *vol, ntfs_u64 *vol_size ); int ntfs_init_volume(ntfs_volume *vol,char *boot); int ntfs_load_special_files(ntfs_volume *vol); int ntfs_release_volume(ntfs_volume *vol); diff -urN 2.2.18/fs/open.c 2.2.18aa1/fs/open.c --- 2.2.18/fs/open.c Mon Dec 11 16:58:03 2000 +++ 2.2.18aa1/fs/open.c Mon Dec 11 17:20:52 2000 @@ -12,7 +12,7 @@ #include -asmlinkage int sys_statfs(const char * path, struct statfs * buf) +asmlinkage long sys_statfs(const char * path, struct statfs * buf) { struct dentry * dentry; int error; @@ -34,7 +34,7 @@ return error; } -asmlinkage int sys_fstatfs(unsigned int fd, struct statfs * buf) +asmlinkage long sys_fstatfs(unsigned int fd, struct statfs * buf) { struct file * file; struct inode * inode; @@ -63,17 +63,18 @@ return error; } -int do_truncate(struct dentry *dentry, unsigned long length) +int do_truncate(struct dentry *dentry, loff_t length) { struct inode *inode = dentry->d_inode; int error; struct iattr newattrs; - /* Not pretty: "inode->i_size" shouldn't really be "off_t". But it is. 
*/ - if ((off_t) length < 0) - return -EINVAL; + /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ + error = -EINVAL; + if (length < 0) + goto out; - down(&inode->i_sem); + fs_down(&inode->i_sem); newattrs.ia_size = length; newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; error = notify_change(dentry, &newattrs); @@ -83,16 +84,21 @@ if (inode->i_op && inode->i_op->truncate) inode->i_op->truncate(inode); } - up(&inode->i_sem); + fs_up(&inode->i_sem); +out: return error; } -asmlinkage int sys_truncate(const char * path, unsigned long length) +static inline long do_sys_truncate(const char * path, loff_t length) { struct dentry * dentry; struct inode * inode; int error; + error = -EINVAL; + if (length < 0) + goto out_nolock; + lock_kernel(); dentry = namei(path); @@ -133,10 +139,16 @@ dput(dentry); out: unlock_kernel(); +out_nolock: return error; } -asmlinkage int sys_ftruncate(unsigned int fd, unsigned long length) +asmlinkage long sys_truncate(const char * path, unsigned long length) +{ + return do_sys_truncate(path, length); +} + +static inline long do_sys_ftruncate(unsigned int fd, loff_t length) { struct inode * inode; struct dentry *dentry; @@ -171,6 +183,24 @@ return error; } +asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length) +{ + return do_sys_ftruncate(fd, length); +} + +/* LFS versions of truncate are only needed on 32 bit machines */ +#if BITS_PER_LONG == 32 +asmlinkage long sys_truncate64(const char * path, loff_t length) +{ + return do_sys_truncate(path, length); +} + +asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length) +{ + return do_sys_ftruncate(fd, length); +} +#endif + #ifndef __alpha__ /* @@ -184,7 +214,7 @@ * must be owner or have write permission. * Else, update from *times, must be owner or super user. */ -asmlinkage int sys_utime(char * filename, struct utimbuf * times) +asmlinkage long sys_utime(char * filename, struct utimbuf * times) { int error; struct dentry * dentry; @@ -232,7 +262,7 @@ * must be owner or have write permission. * Else, update from *times, must be owner or super user. */ -asmlinkage int sys_utimes(char * filename, struct timeval * utimes) +asmlinkage long sys_utimes(char * filename, struct timeval * utimes) { int error; struct dentry * dentry; @@ -278,7 +308,7 @@ * We do this by temporarily clearing all FS-related capabilities and * switching the fsuid/fsgid around to the real ones. 
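
A hypothetical user-space view of the sys_truncate64()/sys_ftruncate64() entry points added above, assuming a C library that wraps them as truncate64()/ftruncate64() under -D_LARGEFILE64_SOURCE (the wrapper names are the usual LFS ones, not something this patch provides):

	#define _LARGEFILE64_SOURCE
	#include <fcntl.h>
	#include <unistd.h>

	/* create a sparse 3GB file -- out of reach for 32-bit ftruncate() */
	int make_sparse_3g(const char *path)
	{
		int rc, fd = open(path, O_WRONLY | O_CREAT | O_LARGEFILE, 0644);

		if (fd < 0)
			return -1;
		rc = ftruncate64(fd, 3LL << 30);	/* 3GB endpoint */
		close(fd);
		return rc;
	}

Note that both the old and new syscalls now funnel through the shared do_sys_truncate()/do_sys_ftruncate() helpers, which take a loff_t and reject negative lengths up front.
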
*/ -asmlinkage int sys_access(const char * filename, int mode) +asmlinkage long sys_access(const char * filename, int mode) { struct dentry * dentry; int old_fsuid, old_fsgid; @@ -319,7 +349,7 @@ return res; } -asmlinkage int sys_chdir(const char * filename) +asmlinkage long sys_chdir(const char * filename) { int error; struct inode *inode; @@ -354,7 +384,7 @@ return error; } -asmlinkage int sys_fchdir(unsigned int fd) +asmlinkage long sys_fchdir(unsigned int fd) { struct file *file; struct dentry *dentry; @@ -391,7 +421,7 @@ return error; } -asmlinkage int sys_chroot(const char * filename) +asmlinkage long sys_chroot(const char * filename) { int error; struct inode *inode; @@ -431,7 +461,7 @@ return error; } -asmlinkage int sys_fchmod(unsigned int fd, mode_t mode) +asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) { struct inode * inode; struct dentry * dentry; @@ -469,7 +499,7 @@ return err; } -asmlinkage int sys_chmod(const char * filename, mode_t mode) +asmlinkage long sys_chmod(const char * filename, mode_t mode) { struct dentry * dentry; struct inode * inode; @@ -565,7 +595,7 @@ return error; } -asmlinkage int sys_chown(const char * filename, uid_t user, gid_t group) +asmlinkage long sys_chown(const char * filename, uid_t user, gid_t group) { struct dentry * dentry; int error; @@ -582,7 +612,7 @@ return error; } -asmlinkage int sys_lchown(const char * filename, uid_t user, gid_t group) +asmlinkage long sys_lchown(const char * filename, uid_t user, gid_t group) { struct dentry * dentry; int error; @@ -600,7 +630,7 @@ } -asmlinkage int sys_fchown(unsigned int fd, uid_t user, gid_t group) +asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group) { struct dentry * dentry; struct file * file; @@ -645,6 +675,9 @@ f = get_empty_filp(); if (!f) goto out; +#if BITS_PER_LONG != 32 + flags |= O_LARGEFILE; +#endif f->f_flags = flag = flags; f->f_mode = (flag+1) & O_ACCMODE; if (f->f_mode) @@ -783,7 +816,7 @@ * For backward compatibility? Maybe this should be moved * into arch/i386 instead? */ -asmlinkage int sys_creat(const char * pathname, int mode) +asmlinkage long sys_creat(const char * pathname, int mode) { return sys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode); } @@ -856,7 +889,7 @@ * This routine simulates a hangup on the tty, to arrange that users * are given clean terminals at login time. */ -asmlinkage int sys_vhangup(void) +asmlinkage long sys_vhangup(void) { int ret = -EPERM; diff -urN 2.2.18/fs/proc/array.c 2.2.18aa1/fs/proc/array.c --- 2.2.18/fs/proc/array.c Mon Dec 11 16:58:03 2000 +++ 2.2.18aa1/fs/proc/array.c Mon Dec 11 17:20:51 2000 @@ -42,6 +42,8 @@ * Alan Cox : security fixes. 
* * + * Gerhard Wichert : added BIGMEM support + * Siemens AG */ #include @@ -389,6 +391,8 @@ "MemShared: %8lu kB\n" "Buffers: %8lu kB\n" "Cached: %8lu kB\n" + "BigTotal: %8lu kB\n" + "BigFree: %8lu kB\n" "SwapTotal: %8lu kB\n" "SwapFree: %8lu kB\n", i.totalram >> 10, @@ -396,6 +400,8 @@ i.sharedram >> 10, i.bufferram >> 10, page_cache_size << (PAGE_SHIFT - 10), + i.totalbig >> 10, + i.freebig >> 10, i.totalswap >> 10, i.freeswap >> 10); } @@ -451,6 +457,8 @@ return pte_page(pte) + (ptr & ~PAGE_MASK); } +#include + static int get_array(struct task_struct *p, unsigned long start, unsigned long end, char * buffer) { unsigned long addr; @@ -463,6 +471,7 @@ addr = get_phys_addr(p, start); if (!addr) return result; + addr = kmap(addr, KM_READ); do { c = *(char *) addr; if (!c) @@ -470,12 +479,19 @@ if (size < PAGE_SIZE) buffer[size++] = c; else + { + kunmap(addr, KM_READ); return result; + } addr++; start++; if (!c && start >= end) + { + kunmap(addr, KM_READ); return result; + } } while (addr & ~PAGE_MASK); + kunmap(addr-1, KM_READ); } return result; } @@ -1157,11 +1173,11 @@ * + (index into the line) */ /* for systems with sizeof(void*) == 4: */ -#define MAPS_LINE_FORMAT4 "%08lx-%08lx %s %08lx %s %lu" -#define MAPS_LINE_MAX4 49 /* sum of 8 1 8 1 4 1 8 1 5 1 10 1 */ +#define MAPS_LINE_FORMAT4 "%08lx-%08lx %s %016Lx %s %lu" +#define MAPS_LINE_MAX4 57 /* sum of 8 1 8 1 4 1 16 1 5 1 10 1 */ /* for systems with sizeof(void*) == 8: */ -#define MAPS_LINE_FORMAT8 "%016lx-%016lx %s %016lx %s %lu" +#define MAPS_LINE_FORMAT8 "%016lx-%016lx %s %016Lx %s %lu" #define MAPS_LINE_MAX8 73 /* sum of 16 1 16 1 4 1 16 1 5 1 10 1 */ #define MAPS_LINE_MAX MAPS_LINE_MAX8 diff -urN 2.2.18/fs/proc/fd.c 2.2.18aa1/fs/proc/fd.c --- 2.2.18/fs/proc/fd.c Sun Oct 31 23:31:32 1999 +++ 2.2.18aa1/fs/proc/fd.c Mon Dec 11 17:20:51 2000 @@ -87,7 +87,6 @@ fd = 0; len = dentry->d_name.len; name = dentry->d_name.name; - if (len > 1 && *name == '0') goto out; while (len-- > 0) { c = *name - '0'; name++; @@ -147,7 +146,7 @@ ino = inode->i_ino; if (fd) ino = (ino & 0xffff0000) | PROC_PID_INO; - if (filldir(dirent, "..", fd+1, fd, ino) < 0) + if (filldir(dirent, "..", fd+1, fd, ino, DT_DIR) < 0) goto out; } @@ -177,7 +176,7 @@ read_unlock(&tasklist_lock); ino = (pid << 16) + PROC_PID_FD_DIR + fd; - if (filldir(dirent, buf+j, NUMBUF-j, fd+2, ino) < 0) + if (filldir(dirent, buf+j, NUMBUF-j, fd+2, ino, DT_LNK) < 0) goto out; read_lock(&tasklist_lock); diff -urN 2.2.18/fs/proc/mem.c 2.2.18aa1/fs/proc/mem.c --- 2.2.18/fs/proc/mem.c Mon Dec 11 16:58:03 2000 +++ 2.2.18aa1/fs/proc/mem.c Mon Dec 11 17:20:48 2000 @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -120,7 +121,9 @@ i = PAGE_SIZE-(addr & ~PAGE_MASK); if (i > scount) i = scount; + page = (char *) kmap((unsigned long) page, KM_READ); copy_to_user(tmp, page, i); + kunmap((unsigned long) page, KM_READ); addr += i; tmp += i; scount -= i; @@ -177,7 +180,9 @@ i = PAGE_SIZE-(addr & ~PAGE_MASK); if (i > count) i = count; + page = (unsigned long) kmap((unsigned long) page, KM_WRITE); copy_from_user(page, tmp, i); + kunmap((unsigned long) page, KM_WRITE); addr += i; tmp += i; count -= i; diff -urN 2.2.18/fs/proc/openpromfs.c 2.2.18aa1/fs/proc/openpromfs.c --- 2.2.18/fs/proc/openpromfs.c Mon Jan 17 16:44:43 2000 +++ 2.2.18aa1/fs/proc/openpromfs.c Mon Dec 11 17:20:51 2000 @@ -846,14 +846,14 @@ i = filp->f_pos; switch (i) { case 0: - if (filldir(dirent, ".", 1, i, ino) < 0) return 0; + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) return 0; i++; filp->f_pos++; /* fall 
thru */ case 1: if (filldir(dirent, "..", 2, i, (NODE(ino).parent == 0xffff) ? - PROC_ROOT_INO : NODE2INO(NODE(ino).parent)) < 0) + PROC_ROOT_INO : NODE2INO(NODE(ino).parent), DT_DIR) < 0) return 0; i++; filp->f_pos++; @@ -869,14 +869,14 @@ if (prom_getname (nodes[node].node, buffer, 128) < 0) return 0; if (filldir(dirent, buffer, strlen(buffer), - filp->f_pos, NODE2INO(node)) < 0) + filp->f_pos, NODE2INO(node), DT_DIR) < 0) return 0; filp->f_pos++; node = nodes[node].next; } j = NODEP2INO(NODE(ino).first_prop); if (!i) { - if (filldir(dirent, ".node", 5, filp->f_pos, j) < 0) + if (filldir(dirent, ".node", 5, filp->f_pos, j, DT_REG) < 0) return 0; filp->f_pos++; } else @@ -887,7 +887,7 @@ if (alias_names [i]) { if (filldir (dirent, alias_names [i], strlen (alias_names [i]), - filp->f_pos, j) < 0) return 0; + filp->f_pos, j, DT_REG) < 0) return 0; filp->f_pos++; } } @@ -899,7 +899,7 @@ if (i) i--; else { if (filldir(dirent, p, strlen(p), - filp->f_pos, j) < 0) + filp->f_pos, j, DT_REG) < 0) return 0; filp->f_pos++; } @@ -911,7 +911,7 @@ else { if (filldir(dirent, d->name, strlen(d->name), - filp->f_pos, d->inode) < 0) + filp->f_pos, d->inode, d->mode >> 12) < 0) return 0; filp->f_pos++; } diff -urN 2.2.18/fs/proc/root.c 2.2.18aa1/fs/proc/root.c --- 2.2.18/fs/proc/root.c Mon Dec 11 16:58:03 2000 +++ 2.2.18aa1/fs/proc/root.c Mon Dec 11 17:20:51 2000 @@ -844,7 +844,6 @@ } pid *= 10; pid += c; - if (!pid) break; if (pid & 0xffff0000) { pid = 0; break; @@ -891,13 +890,13 @@ i = filp->f_pos; switch (i) { case 0: - if (filldir(dirent, ".", 1, i, ino) < 0) + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) return 0; i++; filp->f_pos++; /* fall through */ case 1: - if (filldir(dirent, "..", 2, i, de->parent->low_ino) < 0) + if (filldir(dirent, "..", 2, i, de->parent->low_ino, DT_DIR) < 0) return 0; i++; filp->f_pos++; @@ -916,7 +915,7 @@ } do { - if (filldir(dirent, de->name, de->namelen, filp->f_pos, ino | de->low_ino) < 0) + if (filldir(dirent, de->name, de->namelen, filp->f_pos, ino | de->low_ino, de->mode >> 12) < 0) return 0; filp->f_pos++; de = de->next; @@ -983,7 +982,7 @@ pid /= 10; } while (pid); - if (filldir(dirent, buf+j, PROC_NUMBUF-j, filp->f_pos, ino) < 0) + if (filldir(dirent, buf+j, PROC_NUMBUF-j, filp->f_pos, ino, DT_DIR) < 0) break; filp->f_pos++; } diff -urN 2.2.18/fs/qnx4/dir.c 2.2.18aa1/fs/qnx4/dir.c --- 2.2.18/fs/qnx4/dir.c Thu May 4 13:00:40 2000 +++ 2.2.18aa1/fs/qnx4/dir.c Mon Dec 11 17:20:51 2000 @@ -61,7 +61,7 @@ QNX4_INODES_PER_BLOCK + le->dl_inode_ndx; } - if (filldir(dirent, de->di_fname, size, filp->f_pos, ino) < 0) { + if (filldir(dirent, de->di_fname, size, filp->f_pos, ino, DT_UNKNOWN) < 0) { brelse(bh); return 0; } diff -urN 2.2.18/fs/read_write.c 2.2.18aa1/fs/read_write.c --- 2.2.18/fs/read_write.c Tue Sep 5 02:28:49 2000 +++ 2.2.18aa1/fs/read_write.c Mon Dec 11 17:20:52 2000 @@ -48,7 +48,7 @@ asmlinkage off_t sys_lseek(unsigned int fd, off_t offset, unsigned int origin) { - off_t retval; + off_t retval, oldpos; struct file * file; struct dentry * dentry; struct inode * inode; @@ -62,9 +62,19 @@ if (!(dentry = file->f_dentry) || !(inode = dentry->d_inode)) goto out_putf; + oldpos = file->f_pos; retval = -EINVAL; if (origin <= 2) retval = llseek(file, offset, origin); + + /* Demand L-F-S compliance only from normal files, + thus raw devices can do whatever they please.. 
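
User-space view of the check just added to sys_lseek() (a sketch, not part of the patch): on a descriptor opened without O_LARGEFILE, seeking a regular file to 2G-1 still succeeds, but any seek whose resulting position crosses 2G-1 must fail with EOVERFLOW and restore the old position.

	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("big.dat", O_RDONLY);	/* note: no O_LARGEFILE */

		if (fd < 0)
			return 1;
		if (lseek(fd, 0x7fffffffL, SEEK_SET) == 0x7fffffffL &&
		    lseek(fd, 1, SEEK_CUR) == (off_t) -1 && errno == EOVERFLOW)
			printf("seek refused, position still %ld\n",
			       (long) lseek(fd, 0, SEEK_CUR));
		close(fd);
		return 0;
	}

The same rule is applied to sys_llseek() below; raw devices stay exempt because only S_ISREG inodes are checked.
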
*/ + if (!(file->f_flags & O_LARGEFILE) && + retval >= 0 && S_ISREG(inode->i_mode) && + file->f_pos > 0x7fffffff) { + file->f_pos = oldpos; + retval = -EOVERFLOW; + } out_putf: fput(file); bad: @@ -81,7 +91,7 @@ struct file * file; struct dentry * dentry; struct inode * inode; - loff_t offset; + loff_t offset, oldpos; lock_kernel(); retval = -EBADF; @@ -96,6 +106,7 @@ if (origin > 2) goto out_putf; + oldpos = file->f_pos; offset = llseek(file, ((loff_t) offset_high << 32) | offset_low, origin); @@ -105,6 +116,14 @@ if (!copy_to_user(result, &offset, sizeof(offset))) retval = 0; } + if (!(file->f_flags & O_LARGEFILE) && S_ISREG(inode->i_mode) && + file->f_pos > 0x7fffffff) { + /* The target position isn't presentable without + O_LARGEFILE flag being set --> yield error, and + restore the file position. */ + file->f_pos = oldpos; + retval = -EOVERFLOW; + } out_putf: fput(file); bad: @@ -166,9 +185,9 @@ if (!file->f_op || !(write = file->f_op->write)) goto out; - down(&inode->i_sem); + fs_down(&inode->i_sem); ret = write(file, buf, count, &file->f_pos); - up(&inode->i_sem); + fs_up(&inode->i_sem); out: fput(file); bad_file: @@ -314,9 +333,9 @@ if (!file) goto bad_file; if (file->f_op && file->f_op->write && (file->f_mode & FMODE_WRITE)) { - down(&file->f_dentry->d_inode->i_sem); + fs_down(&file->f_dentry->d_inode->i_sem); ret = do_readv_writev(VERIFY_READ, file, vector, count); - up(&file->f_dentry->d_inode->i_sem); + fs_up(&file->f_dentry->d_inode->i_sem); } fput(file); @@ -335,6 +354,7 @@ ssize_t ret; struct file * file; ssize_t (*read)(struct file *, char *, size_t, loff_t *); + struct inode * inode; lock_kernel(); @@ -342,10 +362,13 @@ file = fget(fd); if (!file) goto bad_file; + + inode = file->f_dentry->d_inode; + if (!(file->f_mode & FMODE_READ)) goto out; - ret = locks_verify_area(FLOCK_VERIFY_READ, file->f_dentry->d_inode, - file, pos, count); + + ret = locks_verify_area(FLOCK_VERIFY_READ, inode, file, pos, count); if (ret) goto out; ret = -EINVAL; @@ -367,6 +390,7 @@ ssize_t ret; struct file * file; ssize_t (*write)(struct file *, const char *, size_t, loff_t *); + struct inode * inode; lock_kernel(); @@ -376,8 +400,10 @@ goto bad_file; if (!(file->f_mode & FMODE_WRITE)) goto out; - ret = locks_verify_area(FLOCK_VERIFY_WRITE, file->f_dentry->d_inode, - file, pos, count); + + inode = file->f_dentry->d_inode; + + ret = locks_verify_area(FLOCK_VERIFY_WRITE, inode, file, pos, count); if (ret) goto out; ret = -EINVAL; @@ -386,9 +412,9 @@ if (pos < 0) goto out; - down(&file->f_dentry->d_inode->i_sem); + fs_down(&inode->i_sem); ret = write(file, buf, count, &pos); - up(&file->f_dentry->d_inode->i_sem); + fs_up(&inode->i_sem); out: fput(file); diff -urN 2.2.18/fs/readdir.c 2.2.18aa1/fs/readdir.c --- 2.2.18/fs/readdir.c Mon Jan 17 16:44:43 2000 +++ 2.2.18aa1/fs/readdir.c Mon Dec 11 17:20:51 2000 @@ -36,7 +36,7 @@ int count; }; -static int fillonedir(void * __buf, const char * name, int namlen, off_t offset, ino_t ino) +static int fillonedir(void * __buf, const char * name, int namlen, off_t offset, ino_t ino, unsigned int d_type) { struct readdir_callback * buf = (struct readdir_callback *) __buf; struct old_linux_dirent * dirent; @@ -118,7 +118,7 @@ int error; }; -static int filldir(void * __buf, const char * name, int namlen, off_t offset, ino_t ino) +static int filldir(void * __buf, const char * name, int namlen, off_t offset, ino_t ino, unsigned int d_type) { struct linux_dirent * dirent; struct getdents_callback * buf = (struct getdents_callback *) __buf; @@ -188,6 +188,123 @@ if 
(lastdirent) { put_user(file->f_pos, &lastdirent->d_off); error = count - buf.count; + } + +out_putf: + fput(file); +out: + unlock_kernel(); + return error; +} + + +/* + * And even better one including d_type field and 64bit d_ino and d_off. + */ +struct linux_dirent64 { + u64 d_ino; + s64 d_off; + unsigned short d_reclen; + unsigned char d_type; + char d_name[0]; +}; + +#define ROUND_UP64(x) (((x)+sizeof(u64)-1) & ~(sizeof(u64)-1)) + +struct getdents_callback64 { + struct linux_dirent64 * current_dir; + struct linux_dirent64 * previous; + int count; + int error; +}; + +static int filldir64(void * __buf, const char * name, int namlen, off_t offset, + ino_t ino, unsigned int d_type) +{ + struct linux_dirent64 * dirent, d; + struct getdents_callback64 * buf = (struct getdents_callback64 *) __buf; + int reclen = ROUND_UP64(NAME_OFFSET(dirent) + namlen + 1); + + buf->error = -EINVAL; /* only used if we fail.. */ + if (reclen > buf->count) + return -EINVAL; + dirent = buf->previous; + if (dirent) { +#if BITS_PER_LONG < 64 + d.d_off = offset; + copy_to_user(&dirent->d_off, &d.d_off, sizeof(d.d_off)); +#else + put_user(offset, &dirent->d_off); +#endif + } + dirent = buf->current_dir; + buf->previous = dirent; + memset(&d, 0, NAME_OFFSET(&d)); + d.d_ino = ino; + d.d_reclen = reclen; + d.d_type = d_type; + copy_to_user(dirent, &d, NAME_OFFSET(&d)); + copy_to_user(dirent->d_name, name, namlen); + put_user(0, dirent->d_name + namlen); + ((char *) dirent) += reclen; + buf->current_dir = dirent; + buf->count -= reclen; + return 0; +} + +asmlinkage int sys_getdents64(unsigned int fd, void * dirent, unsigned int count) +{ + struct file * file; + struct dentry * dentry; + struct inode * inode; + struct linux_dirent64 * lastdirent; + struct getdents_callback64 buf; + int error; + + lock_kernel(); + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + + dentry = file->f_dentry; + if (!dentry) + goto out_putf; + + inode = dentry->d_inode; + if (!inode) + goto out_putf; + + buf.current_dir = (struct linux_dirent64 *) dirent; + buf.previous = NULL; + buf.count = count; + buf.error = 0; + + error = -ENOTDIR; + if (!file->f_op || !file->f_op->readdir) + goto out_putf; + + + /* + * Get the inode's semaphore to prevent changes + * to the directory while we read it. 
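
A hypothetical user-space consumer of the buffer sys_getdents64() fills here (the struct mirrors linux_dirent64 above; obtaining the buffer through the new syscall is left out, since the syscall number is architecture-specific). The d_type byte carries the DT_* values, which are simply the S_IFMT bits of i_mode shifted right by 12 -- which is why the /proc hunks earlier can pass "de->mode >> 12" straight through, and why romfs gets by with a small translation table below.

	#include <stdio.h>

	struct linux_dirent64 {
		unsigned long long	d_ino;
		long long		d_off;
		unsigned short		d_reclen;
		unsigned char		d_type;
		char			d_name[1];
	};

	/* walk one buffer returned by getdents64() */
	void walk(const char *buf, int nread)
	{
		int bpos = 0;

		while (bpos < nread) {
			const struct linux_dirent64 *d =
				(const struct linux_dirent64 *) (buf + bpos);
			printf("%llu\t%u\t%s\n", d->d_ino,
			       (unsigned) d->d_type, d->d_name);
			bpos += d->d_reclen;	/* records are 8-byte aligned */
		}
	}
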
+ */ + down(&inode->i_sem); + error = file->f_op->readdir(file, &buf, filldir64); + up(&inode->i_sem); + if (error < 0) + goto out_putf; + error = buf.error; + lastdirent = buf.previous; + if (lastdirent) { +#if BITS_PER_LONG < 64 + s64 d_off; + d_off = file->f_pos; + copy_to_user(&lastdirent->d_off, &d_off, sizeof(d_off)); + error = count - buf.count; +#else + put_user(file->f_pos, &lastdirent->d_off); +#endif } out_putf: diff -urN 2.2.18/fs/romfs/inode.c 2.2.18aa1/fs/romfs/inode.c --- 2.2.18/fs/romfs/inode.c Thu May 4 13:00:40 2000 +++ 2.2.18aa1/fs/romfs/inode.c Mon Dec 11 17:20:51 2000 @@ -258,6 +258,10 @@ return res; } +static unsigned char romfs_dtype_table[] = { + DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO +}; + static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir) { @@ -302,7 +306,8 @@ nextfh = ntohl(ri.next); if ((nextfh & ROMFH_TYPE) == ROMFH_HRD) ino = ntohl(ri.spec); - if (filldir(dirent, fsname, j, offset, ino) < 0) { + if (filldir(dirent, fsname, j, offset, ino, + romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) { return stored; } stored++; @@ -396,7 +401,7 @@ buf = page_address(page); clear_bit(PG_uptodate, &page->flags); clear_bit(PG_error, &page->flags); - offset = page->offset; + offset = pgoff2loff(page->index); if (offset < inode->i_size) { avail = inode->i_size-offset; readlen = min(avail, PAGE_SIZE); diff -urN 2.2.18/fs/smbfs/cache.c 2.2.18aa1/fs/smbfs/cache.c --- 2.2.18/fs/smbfs/cache.c Tue Sep 5 02:28:49 2000 +++ 2.2.18aa1/fs/smbfs/cache.c Mon Dec 11 17:20:51 2000 @@ -40,7 +40,7 @@ struct cache_head * cachep; VERBOSE("finding cache for %s/%s\n", DENTRY_PATH(dentry)); - cachep = (struct cache_head *) get_cached_page(inode, 0, 1); + cachep = (struct cache_head *) get_cached_page(inode, ulong2pgoff(0), 1); if (!cachep) goto out; if (cachep->valid) @@ -61,9 +61,10 @@ PARANOIA("cache %s/%s has existing block!\n", DENTRY_PATH(dentry)); #endif - offset = PAGE_SIZE + (i << PAGE_SHIFT); - block = (struct cache_block *) get_cached_page(inode, - offset, 0); + /* byte_offset = PAGE_SIZE + (i << PAGE_SHIFT); */ + /* --> page_offset = 1 + i */ + block = (struct cache_block *) + get_cached_page(inode, ulong2pgoff(i+1), 0); if (!block) goto out; index->block = block; @@ -128,7 +129,7 @@ struct inode * inode = get_cache_inode(cachep); struct cache_index * index; struct cache_block * block; - unsigned long page_off; + pgoff_t page_off; unsigned int nent, offset, len = entry->len; unsigned int needed = len + sizeof(struct cache_entry); @@ -180,14 +181,15 @@ */ get_block: cachep->pages++; - page_off = PAGE_SIZE + (cachep->idx << PAGE_SHIFT); + /* page_byte_off = PAGE_SIZE + (cachep->idx << PAGE_SHIFT); */ + page_off = ulong2pgoff(1 + cachep->idx); block = (struct cache_block *) get_cached_page(inode, page_off, 1); if (block) { index->block = block; index->space = PAGE_SIZE; VERBOSE("inode=%p, pages=%d, block at %ld\n", - inode, cachep->pages, page_off); + inode, cachep->pages, (u_long)pgoff2loff(page_off)); goto add_entry; } /* diff -urN 2.2.18/fs/smbfs/dir.c 2.2.18aa1/fs/smbfs/dir.c --- 2.2.18/fs/smbfs/dir.c Mon Dec 11 16:58:03 2000 +++ 2.2.18aa1/fs/smbfs/dir.c Mon Dec 11 17:20:51 2000 @@ -91,12 +91,12 @@ switch ((unsigned int) filp->f_pos) { case 0: - if (filldir(dirent, ".", 1, 0, dir->i_ino) < 0) + if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0) goto out; filp->f_pos = 1; case 1: if (filldir(dirent, "..", 2, 1, - dentry->d_parent->d_inode->i_ino) < 0) + dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) goto out; filp->f_pos = 2; } @@ 
-151,7 +151,7 @@ } if (filldir(dirent, entry->name, entry->len, - filp->f_pos, entry->ino) < 0) + filp->f_pos, entry->ino, DT_UNKNOWN) < 0) break; filp->f_pos += 1; } diff -urN 2.2.18/fs/smbfs/file.c 2.2.18aa1/fs/smbfs/file.c --- 2.2.18/fs/smbfs/file.c Tue Sep 5 02:28:49 2000 +++ 2.2.18aa1/fs/smbfs/file.c Mon Dec 11 17:20:51 2000 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -47,15 +48,15 @@ smb_readpage_sync(struct dentry *dentry, struct page *page) { char *buffer = (char *) page_address(page); - unsigned long offset = page->offset; + loff_t loffset = pgoff2loff(page->index); int rsize = smb_get_rsize(server_from_dentry(dentry)); int count = PAGE_SIZE; int result; clear_bit(PG_error, &page->flags); - VERBOSE("file %s/%s, count=%d@%ld, rsize=%d\n", - DENTRY_PATH(dentry), count, offset, rsize); + VERBOSE("file %s/%s, count=%d@%Ld, rsize=%d\n", + DENTRY_PATH(dentry), count, loffset, rsize); result = smb_open(dentry, SMB_O_RDONLY); if (result < 0) { @@ -68,12 +69,12 @@ if (count < rsize) rsize = count; - result = smb_proc_read(dentry, offset, rsize, buffer); + result = smb_proc_read(dentry, loffset, rsize, buffer); if (result < 0) goto io_error; count -= result; - offset += result; + loffset += result; buffer += result; dentry->d_inode->i_atime = CURRENT_TIME; if (result < rsize) @@ -113,23 +114,36 @@ * Offset is the data offset within the page. */ static int -smb_writepage_sync(struct dentry *dentry, struct page *page, +smb_writepage_sync(struct file *file, struct page *page, unsigned long offset, unsigned int count) { + struct dentry * dentry = file->f_dentry; struct inode *inode = dentry->d_inode; u8 *buffer = (u8 *) page_address(page) + offset; int wsize = smb_get_wsize(server_from_dentry(dentry)); int result, written = 0; + loff_t loffset = pgoff2loff(page->index) + offset; + + VERBOSE("file %s/%s, count=%d@%Ld, wsize=%d\n", + DENTRY_PATH(dentry), count, loffset, wsize); - offset += page->offset; - VERBOSE("file %s/%s, count=%d@%ld, wsize=%d\n", - DENTRY_PATH(dentry), count, offset, wsize); + if (!(file->f_flags & O_LARGEFILE)) { + if (loffset >= 0x7fffffffULL) + return -EFBIG; + if (loffset + count > 0x7fffffffULL) + count = 0x7fffffff - loffset; + } + + if (loffset >= 0xffffffffULL) /* 4G-1 ??? Or 2G-1 ??? */ + return -EFBIG; + if ((loffset + count) > 0xffffffffULL) + count = 0xffffffffULL - loffset; do { if (count < wsize) wsize = count; - result = smb_proc_write(dentry, offset, wsize, buffer); + result = smb_proc_write(dentry, loffset, wsize, buffer); if (result < 0) break; /* N.B. what if result < wsize?? */ @@ -138,29 +152,27 @@ printk(KERN_DEBUG "short write, wsize=%d, result=%d\n", wsize, result); #endif - buffer += wsize; - offset += wsize; + buffer += wsize; + loffset += wsize; written += wsize; - count -= wsize; + count -= wsize; /* * Update the inode now rather than waiting for a refresh. */ inode->i_mtime = inode->i_atime = CURRENT_TIME; - if (offset > inode->i_size) - inode->i_size = offset; + if (loffset > inode->i_size) + inode->i_size = loffset; inode->u.smbfs_i.cache_valid |= SMB_F_LOCALWRITE; } while (count); return written ? written : result; } /* - * Write a page to the server. This will be used for NFS swapping only - * (for now), and we currently do this synchronously only. + * Write a page to the server. 
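
The checks added to smb_writepage_sync() above follow the large-file-summit rule quoted elsewhere in this patch as "L-F-S spec 2.2.1.27": a descriptor opened without O_LARGEFILE must not write at or beyond 2^31-1, and the SMB path additionally refuses offsets the wire format cannot carry (the patch caps at 4G-1, with the author's 2G-vs-4G question left open in the comment). A minimal standalone sketch of that clamping logic, with an illustrative helper name that is not part of the patch:

    /* Clamp a write of *count bytes at pos; return -EFBIG when even
     * the first byte would land beyond the applicable limit. */
    static int lfs_clamp_write(loff_t pos, unsigned long *count, int largefile)
    {
        unsigned long long limit = largefile ? 0xffffffffULL : 0x7fffffffULL;

        if (pos >= limit)
            return -EFBIG;
        if (pos + *count > limit)
            *count = limit - pos;    /* allow a short write up to the limit */
        return 0;
    }
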
*/ static int smb_writepage(struct file *file, struct page *page) { - struct dentry *dentry = file->f_dentry; int result; #ifdef SMBFS_PARANOIA @@ -169,7 +181,7 @@ #endif set_bit(PG_locked, &page->flags); atomic_inc(&page->count); - result = smb_writepage_sync(dentry, page, 0, PAGE_SIZE); + result = smb_writepage_sync(file, page, 0, PAGE_SIZE); smb_unlock_page(page); free_page(page_address(page)); return result; @@ -180,10 +192,10 @@ { struct dentry *dentry = file->f_dentry; - DEBUG1("(%s/%s %d@%ld, sync=%d)\n", - DENTRY_PATH(dentry), count, page->offset+offset, sync); + DEBUG1("(%s/%s %d@%Ld, sync=%d)\n", + DENTRY_PATH(dentry), count, pgoff2loff(page->index)+offset, sync); - return smb_writepage_sync(dentry, page, offset, count); + return smb_writepage_sync(file, page, offset, count); } static ssize_t @@ -192,8 +204,8 @@ struct dentry * dentry = file->f_dentry; ssize_t status; - VERBOSE("file %s/%s, count=%lu@%lu\n", DENTRY_PATH(dentry), - (unsigned long) count, (unsigned long) *ppos); + VERBOSE("file %s/%s, count=%lu@%Lu\n", DENTRY_PATH(dentry), + (unsigned long) count, *ppos); status = smb_revalidate_inode(dentry); if (status) @@ -242,8 +254,8 @@ struct dentry * dentry = file->f_dentry; ssize_t result; - VERBOSE("file %s/%s, count=%lu@%lu, pages=%ld\n", DENTRY_PATH(dentry), - (unsigned long) count, (unsigned long) *ppos, + VERBOSE("file %s/%s, count=%lu@%Lu, pages=%ld\n", DENTRY_PATH(dentry), + (unsigned long) count, *ppos, dentry->d_inode->i_nrpages); result = smb_revalidate_inode(dentry); @@ -261,8 +273,8 @@ if (count > 0) { result = generic_file_write(file, buf, count, ppos); - VERBOSE("pos=%ld, size=%ld, mtime=%ld, atime=%ld\n", - (long) file->f_pos, dentry->d_inode->i_size, + VERBOSE("pos=%Ld, size=%Ld, mtime=%ld, atime=%ld\n", + file->f_pos, dentry->d_inode->i_size, dentry->d_inode->i_mtime, dentry->d_inode->i_atime); } out: diff -urN 2.2.18/fs/smbfs/proc.c 2.2.18aa1/fs/smbfs/proc.c --- 2.2.18/fs/smbfs/proc.c Mon Dec 11 16:58:03 2000 +++ 2.2.18aa1/fs/smbfs/proc.c Mon Dec 11 17:20:51 2000 @@ -1079,13 +1079,16 @@ file-id would not be valid after a reconnection. 
*/ int -smb_proc_read(struct dentry *dentry, off_t offset, int count, char *data) +smb_proc_read(struct dentry *dentry, loff_t offset, int count, char *data) { struct smb_sb_info *server = server_from_dentry(dentry); __u16 returned_count, data_len; unsigned char *buf; int result; + if (offset > 0xffffffff) + return -EIO; + smb_lock_server(server); smb_setup_header(server, SMBread, 5, 0); buf = server->packet; @@ -1128,13 +1131,16 @@ } int -smb_proc_write(struct dentry *dentry, off_t offset, int count, const char *data) +smb_proc_write(struct dentry *dentry, loff_t offset, int count, const char *data) { struct smb_sb_info *server = server_from_dentry(dentry); int result; __u8 *p; - VERBOSE("file %s/%s, count=%d@%ld, packet_size=%d\n", + if (offset > 0xffffffff) + return -EIO; + + VERBOSE("file %s/%s, count=%d@%Ld, packet_size=%d\n", DENTRY_PATH(dentry), count, offset, server->packet_size); smb_lock_server(server); diff -urN 2.2.18/fs/smbfs/sock.c 2.2.18aa1/fs/smbfs/sock.c --- 2.2.18/fs/smbfs/sock.c Mon Dec 11 16:58:03 2000 +++ 2.2.18aa1/fs/smbfs/sock.c Mon Dec 11 17:20:44 2000 @@ -96,7 +96,7 @@ */ if(!sk->dead) { wake_up_interruptible(sk->sleep); - sock_wake_async(sk->socket,1); + sock_wake_async(sk->socket,1,POLL_IN); } } diff -urN 2.2.18/fs/stat.c 2.2.18aa1/fs/stat.c --- 2.2.18/fs/stat.c Mon Jan 17 16:44:43 2000 +++ 2.2.18aa1/fs/stat.c Mon Dec 11 17:20:51 2000 @@ -48,6 +48,10 @@ tmp.st_uid = inode->i_uid; tmp.st_gid = inode->i_gid; tmp.st_rdev = kdev_t_to_nr(inode->i_rdev); +#if BITS_PER_LONG == 32 + if (inode->i_size > 0x7fffffff) + return -EOVERFLOW; +#endif tmp.st_size = inode->i_size; tmp.st_atime = inode->i_atime; tmp.st_mtime = inode->i_mtime; @@ -70,6 +74,10 @@ tmp.st_uid = inode->i_uid; tmp.st_gid = inode->i_gid; tmp.st_rdev = kdev_t_to_nr(inode->i_rdev); +#if BITS_PER_LONG == 32 + if (inode->i_size > 0x7fffffff) + return -EOVERFLOW; +#endif tmp.st_size = inode->i_size; tmp.st_atime = inode->i_atime; tmp.st_mtime = inode->i_mtime; @@ -280,3 +288,127 @@ unlock_kernel(); return error; } + + +/* ---------- LFS-64 ----------- */ +#if !defined(__alpha__) + +static long cp_new_stat64(struct inode * inode, struct stat64 * statbuf) +{ + struct stat64 tmp; + unsigned int blocks, indirect; + + memset(&tmp, 0, sizeof(tmp)); + tmp.st_dev = kdev_t_to_nr(inode->i_dev); + tmp.st_ino = inode->i_ino; +#ifdef STAT64_HAS_BROKEN_ST_INO + tmp.__st_ino = inode->i_ino; +#endif + tmp.st_mode = inode->i_mode; + tmp.st_nlink = inode->i_nlink; + tmp.st_uid = inode->i_uid; + tmp.st_gid = inode->i_gid; + tmp.st_rdev = kdev_t_to_nr(inode->i_rdev); + tmp.st_atime = inode->i_atime; + tmp.st_mtime = inode->i_mtime; + tmp.st_ctime = inode->i_ctime; + tmp.st_size = inode->i_size; +/* + * st_blocks and st_blksize are approximated with a simple algorithm if + * they aren't supported directly by the filesystem. The minix and msdos + * filesystems don't keep track of blocks, so they would either have to + * be counted explicitly (by delving into the file itself), or by using + * this simple algorithm to get a reasonable (although not 100% accurate) + * value. + */ + +/* + * Use minix fs values for the number of direct and indirect blocks. The + * count is now exact for the minix fs except that it counts zero blocks. + * Everything is in units of BLOCK_SIZE until the assignment to + * tmp.st_blksize. 
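
A worked instance of the estimate described above, using the minix parameters the code adopts (BLOCK_SIZE = 1024, hence D_B = 7 direct blocks and I_B = 512 pointers per indirect block), for a 1 MiB file:

    data blocks     = (1048576 + 1023) / 1024  = 1024
    single indirect = (1024 - 7 + 511) / 512   = 2
    double indirect = (2 - 1 + 511) / 512      = 1
    total blocks    = 1024 + 2 + 1             = 1027
    st_blocks       = (1024 / 512) * 1027      = 2054 512-byte units
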
+ */ +#define D_B 7 +#define I_B (BLOCK_SIZE / sizeof(unsigned short)) + + if (!inode->i_blksize) { + blocks = (tmp.st_size + BLOCK_SIZE - 1) >> BLOCK_SIZE_BITS; + if (blocks > D_B) { + indirect = (blocks - D_B + I_B - 1) / I_B; + blocks += indirect; + if (indirect > 1) { + indirect = (indirect - 1 + I_B - 1) / I_B; + blocks += indirect; + if (indirect > 1) + blocks++; + } + } + tmp.st_blocks = (BLOCK_SIZE / 512) * blocks; + tmp.st_blksize = BLOCK_SIZE; + } else { + tmp.st_blocks = inode->i_blocks; + tmp.st_blksize = inode->i_blksize; + } + return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; +} + +asmlinkage long sys_stat64(char * filename, struct stat64 * statbuf, long flags) +{ + struct dentry * dentry; + int error; + + lock_kernel(); + dentry = namei(filename); + + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + error = do_revalidate(dentry); + if (!error) + error = cp_new_stat64(dentry->d_inode, statbuf); + + dput(dentry); + } + unlock_kernel(); + return error; +} + +asmlinkage long sys_lstat64(char * filename, struct stat64 * statbuf, long flags) +{ + struct dentry * dentry; + int error; + + lock_kernel(); + dentry = lnamei(filename); + + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + error = do_revalidate(dentry); + if (!error) + error = cp_new_stat64(dentry->d_inode, statbuf); + + dput(dentry); + } + unlock_kernel(); + return error; +} + +asmlinkage long sys_fstat64(unsigned long fd, struct stat64 * statbuf, long flags) +{ + struct file * f; + int err = -EBADF; + + lock_kernel(); + f = fget(fd); + if (f) { + struct dentry * dentry = f->f_dentry; + + err = do_revalidate(dentry); + if (!err) + err = cp_new_stat64(dentry->d_inode, statbuf); + fput(f); + } + unlock_kernel(); + return err; +} + +#endif /* LFS-64 */ diff -urN 2.2.18/fs/sysv/dir.c 2.2.18aa1/fs/sysv/dir.c --- 2.2.18/fs/sysv/dir.c Mon Jan 17 16:44:43 2000 +++ 2.2.18aa1/fs/sysv/dir.c Mon Dec 11 17:20:51 2000 @@ -100,7 +100,7 @@ inode->i_ino, (off_t) filp->f_pos, sde.inode); i = strnlen(sde.name, SYSV_NAMELEN); - if (filldir(dirent, sde.name, i, filp->f_pos, sde.inode) < 0) { + if (filldir(dirent, sde.name, i, filp->f_pos, sde.inode, DT_UNKNOWN) < 0) { brelse(bh); return 0; } diff -urN 2.2.18/fs/sysv/file.c 2.2.18aa1/fs/sysv/file.c --- 2.2.18/fs/sysv/file.c Mon Jan 17 16:44:43 2000 +++ 2.2.18aa1/fs/sysv/file.c Mon Dec 11 17:20:51 2000 @@ -207,7 +207,7 @@ { struct inode * inode = filp->f_dentry->d_inode; struct super_block * sb = inode->i_sb; - off_t pos; + loff_t pos; ssize_t written, c; struct buffer_head * bh; char * p; @@ -232,6 +232,21 @@ else pos = *ppos; written = 0; + + /* L-F-S spec 2.2.1.27: */ + if (!(filp->f_flags & O_LARGEFILE)) { + if (pos >= 0x7fffffffULL) /* pos@2G forbidden */ + return -EFBIG; + + if (pos + count > 0x7fffffffULL) + /* Write only until end of allowed region */ + count = 0x7fffffffULL - pos; + } + if (pos >= 0xffffffffULL) + return -EFBIG; /* Only up to 4G-1! 
*/ + if ((pos + count) > 0xffffffffULL) + count = 0xffffffffULL - pos; + while (written> sb->sv_block_size_bits, 1); if (!bh) { diff -urN 2.2.18/fs/ufs/balloc.c 2.2.18aa1/fs/ufs/balloc.c --- 2.2.18/fs/ufs/balloc.c Mon Jan 17 16:44:43 2000 +++ 2.2.18aa1/fs/ufs/balloc.c Mon Dec 11 17:20:51 2000 @@ -660,9 +660,9 @@ struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; struct ufs_cylinder_group * ucg; - unsigned start, length, location, result; - unsigned possition, fragsize, blockmap, mask; - unsigned swab; + unsigned int start, length, location, result; + unsigned int possition, fragsize, blockmap, mask; + unsigned int swab; UFSD(("ENTER, cg %u, goal %u, count %u\n", ucpi->c_cgx, goal, count)) @@ -676,7 +676,7 @@ else start = ucpi->c_frotor >> 3; - length = howmany(uspi->s_fpg, 8) - start; + length = ((uspi->s_fpg + 7) >> 3) - start; location = ubh_scanc(UCPI_UBH, ucpi->c_freeoff + start, length, (uspi->s_fpb == 8) ? ufs_fragtable_8fpb : ufs_fragtable_other, 1 << (count - 1 + (uspi->s_fpb & 7))); diff -urN 2.2.18/fs/ufs/dir.c 2.2.18aa1/fs/ufs/dir.c --- 2.2.18/fs/ufs/dir.c Thu May 4 13:00:40 2000 +++ 2.2.18aa1/fs/ufs/dir.c Mon Dec 11 17:20:51 2000 @@ -15,6 +15,7 @@ #include #include +#include #include "swab.h" #include "util.h" @@ -124,11 +125,14 @@ * not the directory has been modified * during the copy operation. */ unsigned long version = inode->i_version; + unsigned char d_type = DT_UNKNOWN; UFSD(("filldir(%s,%u)\n", de->d_name, SWAB32(de->d_ino))) UFSD(("namlen %u\n", ufs_get_de_namlen(de))) + if ((flags & UFS_DE_MASK) == UFS_DE_44BSD) + d_type = de->d_u.d_44.d_type; error = filldir(dirent, de->d_name, ufs_get_de_namlen(de), - filp->f_pos, SWAB32(de->d_ino)); + filp->f_pos, SWAB32(de->d_ino), d_type); if (error) break; if (version != inode->i_version) @@ -170,7 +174,7 @@ error_msg = "inode out of bounds"; if (error_msg != NULL) - ufs_error (sb, function, "bad entry in directory #%lu, size %lu: %s - " + ufs_error (sb, function, "bad entry in directory #%lu, size %Lu: %s - " "offset=%lu, inode=%lu, reclen=%d, namlen=%d", dir->i_ino, dir->i_size, error_msg, offset, (unsigned long) SWAB32(de->d_ino), diff -urN 2.2.18/fs/ufs/file.c 2.2.18aa1/fs/ufs/file.c --- 2.2.18/fs/ufs/file.c Sun Apr 2 21:07:49 2000 +++ 2.2.18aa1/fs/ufs/file.c Mon Dec 11 17:20:51 2000 @@ -140,7 +140,7 @@ loff_t *ppos ) { struct inode * inode = filp->f_dentry->d_inode; - __u32 pos; + loff_t pos; long block; int offset; int written, c; @@ -177,11 +177,14 @@ return -EINVAL; } - /* Check for overflow.. 
*/ - if (pos > (__u32) (pos + count)) { - count = ~pos; /* == 0xFFFFFFFF - pos */ - if (!count) + /* L-F-S spec 2.2.1.27: */ + if (!(filp->f_flags & O_LARGEFILE)) { + if (pos >= 0x7fffffffULL) /* pos@2G forbidden */ return -EFBIG; + + if (pos + count > 0x7fffffffULL) + /* Write only until end of allowed region */ + count = 0x7fffffffULL - pos; } /* diff -urN 2.2.18/fs/ufs/inode.c 2.2.18aa1/fs/ufs/inode.c --- 2.2.18/fs/ufs/inode.c Tue Jun 13 03:48:15 2000 +++ 2.2.18aa1/fs/ufs/inode.c Mon Dec 11 17:20:51 2000 @@ -54,7 +54,7 @@ { unsigned swab = inode->i_sb->u.ufs_sb.s_swab; printk("ino %lu mode 0%6.6o nlink %d uid %d uid32 %u" - " gid %d gid32 %u size %lu blocks %lu\n", + " gid %d gid32 %u size %Lu blocks %lu\n", inode->i_ino, inode->i_mode, inode->i_nlink, inode->i_uid, inode->u.ufs_i.i_uid, inode->i_gid, inode->u.ufs_i.i_gid, inode->i_size, inode->i_blocks); @@ -213,13 +213,14 @@ if (!create) return NULL; limit = current->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit < RLIM_INFINITY) { + if (limit != RLIM_INFINITY) { limit >>= sb->s_blocksize_bits; if (new_fragment >= limit) { send_sig(SIGXFSZ, current, 0); return NULL; } } + lastblock = ufs_fragstoblks (lastfrag); lastblockoff = ufs_fragnum (lastfrag); /* @@ -321,7 +322,8 @@ brelse (result); goto repeat; } - if (!create || new_fragment >= (current->rlim[RLIMIT_FSIZE].rlim_cur >> sb->s_blocksize)) { + if (!create || (current->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY && + new_fragment >= (current->rlim[RLIMIT_FSIZE].rlim_cur >> sb->s_blocksize))) { brelse (bh); *err = -EFBIG; return NULL; @@ -497,13 +499,10 @@ } /* - * Linux i_size can be 32 on some architectures. We will mark - * big files as read only and let user access first 32 bits. + * Linux i_size used to be 32 bits on some architectures. + * These days we allow access to the entire file as is.. */ - inode->u.ufs_i.i_size = SWAB64(ufs_inode->ui_size); - inode->i_size = (off_t) inode->u.ufs_i.i_size; - if (sizeof(off_t) == 4 && (inode->u.ufs_i.i_size >> 32)) - inode->i_size = (__u32)-1; + inode->i_size = SWAB64(ufs_inode->ui_size); inode->i_atime = SWAB32(ufs_inode->ui_atime.tv_sec); inode->i_ctime = SWAB32(ufs_inode->ui_ctime.tv_sec); @@ -516,7 +515,7 @@ inode->u.ufs_i.i_gen = SWAB32(ufs_inode->ui_gen); inode->u.ufs_i.i_shadow = SWAB32(ufs_inode->ui_u3.ui_sun.ui_shadow); inode->u.ufs_i.i_oeftflag = SWAB32(ufs_inode->ui_u3.ui_sun.ui_oeftflag); - inode->u.ufs_i.i_lastfrag = howmany (inode->i_size, uspi->s_fsize); + inode->u.ufs_i.i_lastfrag = (inode->i_size + uspi->s_fsize -1) >> uspi->s_fshift; if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) inode->i_rdev = to_kdev_t(SWAB32(ufs_inode->ui_u2.ui_addr.ui_db[0])); diff -urN 2.2.18/fs/ufs/super.c 2.2.18aa1/fs/ufs/super.c --- 2.2.18/fs/ufs/super.c Mon Jan 17 16:44:43 2000 +++ 2.2.18aa1/fs/ufs/super.c Mon Dec 11 17:20:51 2000 @@ -328,7 +328,7 @@ * on the device. 
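
The howmany()-to-shift rewrites in this and the following ufs hunks are not just style: with i_size now a 64-bit loff_t, expressions like DIRECT_FRAGMENT would otherwise need a 64-bit division, which the kernel avoids because it pulls in libgcc helpers on 32-bit targets. Fragment and block sizes are powers of two, so ceil-division reduces to an add and a shift; the identity, with a quick check (macro name is illustrative):

    /* For y == 1 << k:  howmany(x, y) == (x + y - 1) >> k */
    #define CEIL_DIV_POW2(x, k)  (((x) + (1ULL << (k)) - 1) >> (k))

    /* e.g. CEIL_DIV_POW2(8193, 12) == 3, matching howmany(8193, 4096):
     * three 4096-byte fragments are needed to hold 8193 bytes. */
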
*/ size = uspi->s_cssize; - blks = howmany(size, uspi->s_fsize); + blks = (size + uspi->s_fsize-1) >> uspi->s_fshift; base = space = kmalloc(size, GFP_KERNEL); if (!base) goto failed; @@ -405,7 +405,7 @@ uspi = sb->u.ufs_sb.s_uspi; size = uspi->s_cssize; - blks = howmany(size, uspi->s_fsize); + blks = (size + uspi->s_fsize-1) >> uspi->s_fshift; base = space = (char*) sb->u.ufs_sb.s_csp[0]; for (i = 0; i < blks; i += uspi->s_fpb) { size = uspi->s_bsize; diff -urN 2.2.18/fs/ufs/truncate.c 2.2.18aa1/fs/ufs/truncate.c --- 2.2.18/fs/ufs/truncate.c Mon Jan 17 16:44:43 2000 +++ 2.2.18aa1/fs/ufs/truncate.c Mon Dec 11 17:20:51 2000 @@ -59,8 +59,8 @@ * Linus */ -#define DIRECT_BLOCK howmany (inode->i_size, uspi->s_bsize) -#define DIRECT_FRAGMENT howmany (inode->i_size, uspi->s_fsize) +#define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize -1) >> uspi->s_bshift) +#define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize -1) >> uspi->s_fshift) static int ufs_trunc_direct (struct inode * inode) { @@ -194,7 +194,7 @@ } -static int ufs_trunc_indirect (struct inode * inode, unsigned offset, u32 * p) +static int ufs_trunc_indirect (struct inode * inode, u_long offset, u32 * p) { struct super_block * sb; struct ufs_sb_private_info * uspi; @@ -297,7 +297,7 @@ struct super_block * sb; struct ufs_sb_private_info * uspi; struct ufs_buffer_head * dind_bh; - unsigned i, tmp, dindirect_block; + unsigned int i, tmp, dindirect_block; u32 * dind; int retry = 0; unsigned swab; @@ -308,8 +308,8 @@ swab = sb->u.ufs_sb.s_swab; uspi = sb->u.ufs_sb.s_uspi; - dindirect_block = (DIRECT_BLOCK > offset) - ? ((DIRECT_BLOCK - offset) / uspi->s_apb) : 0; + dindirect_block = ((DIRECT_BLOCK > offset) ? + ((DIRECT_BLOCK - offset) >> uspi->s_apbshift) : 0); retry = 0; tmp = SWAB32(*p); @@ -379,7 +379,7 @@ retry = 0; tindirect_block = (DIRECT_BLOCK > (UFS_NDADDR + uspi->s_apb + uspi->s_2apb)) - ? ((DIRECT_BLOCK - UFS_NDADDR - uspi->s_apb - uspi->s_2apb) / uspi->s_2apb) : 0; + ? 
((DIRECT_BLOCK - UFS_NDADDR - uspi->s_apb - uspi->s_2apb) >> uspi->s_2apbshift) : 0; p = inode->u.ufs_i.i_u1.i_data + UFS_TIND_BLOCK; if (!(tmp = SWAB32(*p))) return 0; @@ -467,7 +467,8 @@ } } inode->i_mtime = inode->i_ctime = CURRENT_TIME; - inode->u.ufs_i.i_lastfrag = howmany (inode->i_size, uspi->s_fsize); + inode->u.ufs_i.i_lastfrag = + (inode->i_size + uspi->s_fsize -1) >> uspi->s_fshift; mark_inode_dirty(inode); UFSD(("EXIT\n")) } diff -urN 2.2.18/fs/ufs/util.h 2.2.18aa1/fs/ufs/util.h --- 2.2.18/fs/ufs/util.h Sat May 20 00:06:21 2000 +++ 2.2.18aa1/fs/ufs/util.h Mon Dec 11 17:20:51 2000 @@ -14,7 +14,6 @@ * some useful macros */ #define in_range(b,first,len) ((b)>=(first)&&(b)<(first)+(len)) -#define howmany(x,y) (((x)+(y)-1)/(y)) #define min(x,y) ((x)<(y)?(x):(y)) #define max(x,y) ((x)>(y)?(x):(y)) diff -urN 2.2.18/fs/umsdos/dir.c 2.2.18aa1/fs/umsdos/dir.c --- 2.2.18/fs/umsdos/dir.c Thu May 4 13:00:40 2000 +++ 2.2.18aa1/fs/umsdos/dir.c Mon Dec 11 17:20:51 2000 @@ -90,7 +90,7 @@ if (d->count == 0) { PRINTK ((KERN_DEBUG "dir_once :%.*s: offset %Ld\n", len, name, offset)); - ret = d->filldir (d->dirbuf, name, len, offset, ino); + ret = d->filldir (d->dirbuf, name, len, offset, ino, DT_UNKNOWN); d->stop = ret < 0; d->count = 1; } @@ -136,7 +136,7 @@ Printk ((KERN_WARNING "umsdos_readdir_x: pseudo_root thing UMSDOS_SPECIAL_DIRFPOS\n")); if (filldir (dirbuf, "DOS", 3, - UMSDOS_SPECIAL_DIRFPOS, UMSDOS_ROOT_INO) == 0) { + UMSDOS_SPECIAL_DIRFPOS, UMSDOS_ROOT_INO, DT_DIR) == 0) { filp->f_pos++; } goto out_end; @@ -255,7 +255,7 @@ if (inode != pseudo_root && (internal_read || !(entry.flags & UMSDOS_HIDDEN))) { if (filldir (dirbuf, entry.name, entry.name_len, - cur_f_pos, inode->i_ino) < 0) { + cur_f_pos, inode->i_ino, DT_UNKNOWN) < 0) { new_filp.f_pos = cur_f_pos; } Printk(("umsdos_readdir_x: got %s/%s, ino=%ld\n", diff -urN 2.2.18/fs/umsdos/rdir.c 2.2.18aa1/fs/umsdos/rdir.c --- 2.2.18/fs/umsdos/rdir.c Thu May 4 13:00:40 2000 +++ 2.2.18aa1/fs/umsdos/rdir.c Mon Dec 11 17:20:51 2000 @@ -33,7 +33,8 @@ const char *name, int name_len, off_t offset, - ino_t ino) + ino_t ino, + unsigned int d_type) { int ret = 0; struct RDIR_FILLDIR *d = (struct RDIR_FILLDIR *) buf; @@ -48,11 +49,11 @@ /* Make sure the .. entry points back to the pseudo_root */ ino = pseudo_root->i_ino; } - ret = d->filldir (d->dirbuf, name, name_len, offset, ino); + ret = d->filldir (d->dirbuf, name, name_len, offset, ino, DT_UNKNOWN); } } else { /* Any DOS directory */ - ret = d->filldir (d->dirbuf, name, name_len, offset, ino); + ret = d->filldir (d->dirbuf, name, name_len, offset, ino, DT_UNKNOWN); } return ret; } diff -urN 2.2.18/include/asm-alpha/bigmem.h 2.2.18aa1/include/asm-alpha/bigmem.h --- 2.2.18/include/asm-alpha/bigmem.h Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/include/asm-alpha/bigmem.h Mon Dec 11 17:20:48 2000 @@ -0,0 +1,27 @@ +/* + * linux/include/asm-alpha/bigmem.h + * + * On alpha we can address all the VM with a flat mapping. We need + * to differentiate BIGMEM memory only because the default PCI DMA window + * is currently limited to 2g. Thus kmap/kunmap are noops here. + * + * With bigmem support the alpha now is capable of allocating up to + * 2048Giga of memory. 
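
Since the alpha reaches all physical memory through a flat kernel mapping, the kmap()/kunmap() pair defined a few lines below can collapse to the identity and a no-op; BIGMEM here only tags memory beyond the default 2GB PCI DMA window. A sketch of the calling convention the interface expects, shared with the real i386 implementation later in this patch (the copy-out helper is illustrative; KM_READ comes from the new kmap_types.h):

    /* Copy one page out through the kmap interface.  On alpha the
     * kmap() is the identity mapping and the kunmap() compiles away. */
    static void copy_page_out(struct page *page, void *buffer)
    {
        unsigned long vaddr = kmap(page_address(page), KM_READ);

        memcpy(buffer, (void *) vaddr, PAGE_SIZE);
        kunmap(vaddr, KM_READ);
    }
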
+ * + * Copyright (C) 2000 Andrea Arcangeli, SuSE GmbH + */ + +#ifndef _ASM_BIGMEM_H +#define _ASM_BIGMEM_H + +#include + +#undef BIGMEM_DEBUG /* undef for production */ + +/* declarations for bigmem.c */ +extern unsigned long bigmem_start, bigmem_end; + +#define kmap(kaddr, type) kaddr +#define kunmap(vaddr, type) do { } while (0) + +#endif /* _ASM_BIGMEM_H */ diff -urN 2.2.18/include/asm-alpha/bitops.h 2.2.18aa1/include/asm-alpha/bitops.h --- 2.2.18/include/asm-alpha/bitops.h Tue Sep 5 02:28:49 2000 +++ 2.2.18aa1/include/asm-alpha/bitops.h Mon Dec 11 17:20:52 2000 @@ -38,6 +38,30 @@ :"Ir" (1UL << (nr & 31)), "m" (*m)); } +/* + * WARNING: non atomic version. + */ +extern __inline__ void __set_bit(unsigned long nr, volatile void * addr) +{ + unsigned int * m = ((unsigned int *) addr) + (nr >> 5); + /* + * Asm and C produce the same thing, so let + * the compiler do its good work. + */ +#if 0 + int tmp; + + __asm__ __volatile__( + "ldl %0,%3\n\t" + "bis %0,%2,%0\n\t" + "stl %0,%1" + : "=&r" (tmp), "=m" (*m) + : "Ir" (1UL << (nr & 31)), "m" (*m)); +#else + *m |= 1UL << (nr & 31); +#endif +} + extern __inline__ void clear_bit(unsigned long nr, volatile void * addr) { unsigned long oldbit; @@ -96,6 +120,29 @@ "3: br 1b\n" ".previous" :"=&r" (temp), "=m" (*m), "=&r" (oldbit) + :"Ir" (1UL << (nr & 31)), "m" (*m) : "memory"); + + return oldbit != 0; +} + +/* + * WARNING: non atomic version. + */ +extern __inline__ unsigned long __test_and_set_bit(unsigned long nr, + volatile void * addr) +{ + unsigned long oldbit; + unsigned long temp; + unsigned int * m = ((unsigned int *) addr) + (nr >> 5); + + __asm__ __volatile__( + " ldl %0,%4\n" + " and %0,%3,%2\n" + " bne %2,1f\n" + " xor %0,%3,%0\n" + " stl %0,%1\n" + "1:\n" + :"=&r" (temp), "=m" (*m), "=&r" (oldbit) :"Ir" (1UL << (nr & 31)), "m" (*m)); return oldbit != 0; @@ -121,6 +168,29 @@ "3: br 1b\n" ".previous" :"=&r" (temp), "=m" (*m), "=&r" (oldbit) + :"Ir" (1UL << (nr & 31)), "m" (*m) : "memory"); + + return oldbit != 0; +} + +/* + * WARNING: non atomic version. + */ +extern __inline__ unsigned long __test_and_clear_bit(unsigned long nr, + volatile void * addr) +{ + unsigned long oldbit; + unsigned long temp; + unsigned int * m = ((unsigned int *) addr) + (nr >> 5); + + __asm__ __volatile__( + " ldl %0,%4\n" + " and %0,%3,%2\n" + " beq %2,1f\n" + " xor %0,%3,%0\n" + " stl %0,%1\n" + "1:\n" + :"=&r" (temp), "=m" (*m), "=&r" (oldbit) :"Ir" (1UL << (nr & 31)), "m" (*m)); return oldbit != 0; @@ -144,7 +214,7 @@ "3: br 1b\n" ".previous" :"=&r" (temp), "=m" (*m), "=&r" (oldbit) - :"Ir" (1UL << (nr & 31)), "m" (*m)); + :"Ir" (1UL << (nr & 31)), "m" (*m) : "memory"); return oldbit != 0; } @@ -285,15 +355,15 @@ #ifdef __KERNEL__ -#define ext2_set_bit test_and_set_bit -#define ext2_clear_bit test_and_clear_bit +#define ext2_set_bit __test_and_set_bit +#define ext2_clear_bit __test_and_clear_bit #define ext2_test_bit test_bit #define ext2_find_first_zero_bit find_first_zero_bit #define ext2_find_next_zero_bit find_next_zero_bit /* Bitmap functions for the minix filesystem. 
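
Remapping ext2_set_bit and the minix helpers below to the __ variants relies on the assumption that those filesystem bitmaps are only modified under a lock the caller already holds (the big kernel lock or the superblock lock in 2.2), so the load-locked/store-conditional loops bought nothing. A sketch of the serialization pattern being assumed (names illustrative):

    /* Caller-serialized bitmap allocation: with the lock held, the
     * cheaper non-atomic test-and-set is sufficient. */
    static int alloc_bit(void *bitmap, unsigned long bit, spinlock_t *lock)
    {
        int was_set;

        spin_lock(lock);
        was_set = __test_and_set_bit(bit, bitmap);
        spin_unlock(lock);
        return was_set ? -1 : (int) bit;
    }
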
*/ -#define minix_set_bit(nr,addr) test_and_set_bit(nr,addr) -#define minix_clear_bit(nr,addr) test_and_clear_bit(nr,addr) +#define minix_set_bit(nr,addr) __test_and_set_bit(nr,addr) +#define minix_clear_bit(nr,addr) __test_and_clear_bit(nr,addr) #define minix_test_bit(nr,addr) test_bit(nr,addr) #define minix_find_first_zero_bit(addr,size) find_first_zero_bit(addr,size) diff -urN 2.2.18/include/asm-alpha/fcntl.h 2.2.18aa1/include/asm-alpha/fcntl.h --- 2.2.18/include/asm-alpha/fcntl.h Mon Jan 17 16:44:43 2000 +++ 2.2.18aa1/include/asm-alpha/fcntl.h Mon Dec 11 17:20:51 2000 @@ -20,6 +20,7 @@ #define O_DIRECT 040000 /* direct disk access - should check with OSF/1 */ #define O_DIRECTORY 0100000 /* must be a directory */ #define O_NOFOLLOW 0200000 /* don't follow links */ +#define O_LARGEFILE 0400000 /* will be set by the kernel on every open */ #define F_DUPFD 0 /* dup */ #define F_GETFD 1 /* get f_flags */ @@ -61,5 +62,9 @@ __kernel_off_t l_len; __kernel_pid_t l_pid; }; + +#ifdef __KERNEL__ +#define flock64 flock +#endif #endif diff -urN 2.2.18/include/asm-alpha/md.h 2.2.18aa1/include/asm-alpha/md.h --- 2.2.18/include/asm-alpha/md.h Mon Jan 17 16:44:43 2000 +++ 2.2.18aa1/include/asm-alpha/md.h Thu Jan 1 01:00:00 1970 @@ -1,13 +0,0 @@ -/* $Id: md.h,v 1.1 1997/12/15 15:11:48 jj Exp $ - * md.h: High speed xor_block operation for RAID4/5 - * - */ - -#ifndef __ASM_MD_H -#define __ASM_MD_H - -/* #define HAVE_ARCH_XORBLOCK */ - -#define MD_XORBLOCK_ALIGNMENT sizeof(long) - -#endif /* __ASM_MD_H */ diff -urN 2.2.18/include/asm-alpha/mmu_context.h 2.2.18aa1/include/asm-alpha/mmu_context.h --- 2.2.18/include/asm-alpha/mmu_context.h Thu Aug 24 17:44:06 2000 +++ 2.2.18aa1/include/asm-alpha/mmu_context.h Mon Dec 11 19:23:36 2000 @@ -65,12 +65,7 @@ #endif /* __SMP__ */ #define WIDTH_HARDWARE_ASN 8 -#ifdef __SMP__ -#define WIDTH_THIS_PROCESSOR 5 -#else -#define WIDTH_THIS_PROCESSOR 0 -#endif -#define ASN_FIRST_VERSION (1UL << (WIDTH_THIS_PROCESSOR + WIDTH_HARDWARE_ASN)) +#define ASN_FIRST_VERSION (1UL << WIDTH_HARDWARE_ASN) #define HARDWARE_ASN_MASK ((1UL << WIDTH_HARDWARE_ASN) - 1) /* @@ -100,6 +95,7 @@ /* If we've wrapped, flush the whole user TLB. */ if ((asn & HARDWARE_ASN_MASK) >= MAX_ASN) { tbiap(); + imb(); next = (asn & ~HARDWARE_ASN_MASK) + ASN_FIRST_VERSION; } cpu_last_asn(smp_processor_id()) = next; @@ -125,19 +121,21 @@ __EXTERN_INLINE void ev5_get_mmu_context(struct task_struct *p) { - /* Check if our ASN is of an older version, or on a different CPU, - and thus invalid. */ - /* ??? If we have two threads on different cpus, we'll continually - fight over the context. Find a way to record a per-mm, per-cpu - value for the asn. */ - - unsigned long asn = cpu_last_asn(smp_processor_id()); - struct mm_struct *mm = p->mm; - unsigned long mmc = mm->context; + /* Check if our ASN is of an older version, and thus invalid. */ + int cpu; + unsigned long asn; + struct mm_struct *mm; + unsigned long mmc; + cpu = smp_processor_id(); + mm = p->mm; + ctx_cli(); + asn = cpu_last_asn(cpu); + mmc = mm->context[cpu]; + if ((mmc ^ asn) & ~HARDWARE_ASN_MASK) { mmc = __get_new_mmu_context(); - mm->context = mmc; + mm->context[cpu] = mmc; } /* Always update the PCB ASN. 
Another thread may have allocated @@ -159,7 +157,10 @@ extern inline void init_new_context(struct mm_struct *mm) { - mm->context = 0; + int i; + + for (i = 0; i < smp_num_cpus; i++) + mm->context[cpu_logical_map(i)] = 0; } extern inline void diff -urN 2.2.18/include/asm-alpha/pgtable.h 2.2.18aa1/include/asm-alpha/pgtable.h --- 2.2.18/include/asm-alpha/pgtable.h Thu Aug 24 19:11:39 2000 +++ 2.2.18aa1/include/asm-alpha/pgtable.h Mon Dec 11 19:23:36 2000 @@ -73,7 +73,13 @@ __EXTERN_INLINE void ev5_flush_tlb_other(struct mm_struct *mm) { - mm->context = 0; + long * mmc = &mm->context[smp_processor_id()]; + /* + * Check it's not zero first to avoid cacheline ping pong when + * possible. + */ + if (*mmc) + *mmc = 0; } /* diff -urN 2.2.18/include/asm-alpha/ptrace.h 2.2.18aa1/include/asm-alpha/ptrace.h --- 2.2.18/include/asm-alpha/ptrace.h Mon Jan 17 16:44:43 2000 +++ 2.2.18aa1/include/asm-alpha/ptrace.h Mon Dec 11 17:20:45 2000 @@ -70,6 +70,7 @@ #define user_mode(regs) (((regs)->ps & 8) != 0) #define instruction_pointer(regs) ((regs)->pc) extern void show_regs(struct pt_regs *); +extern void __show_regs(struct pt_regs *); #endif #endif diff -urN 2.2.18/include/asm-alpha/smplock.h 2.2.18aa1/include/asm-alpha/smplock.h --- 2.2.18/include/asm-alpha/smplock.h Mon Dec 11 16:58:03 2000 +++ 2.2.18aa1/include/asm-alpha/smplock.h Mon Dec 11 19:23:35 2000 @@ -28,6 +28,25 @@ spin_lock(&kernel_flag); } +#define DECLARE_LOCAL_LOCK_DEPTH(x) int x + +#define release_kernel_lock_save(local_depth) \ +do { \ + (local_depth) = current->lock_depth; \ + if ((local_depth) >= 0) { \ + current->lock_depth = -1; \ + spin_unlock(&kernel_flag); \ + } \ +} while (0) + +#define reacquire_kernel_lock_restore(local_depth) \ +do { \ + if ((local_depth) >= 0) { \ + current->lock_depth = local_depth; \ + spin_lock(&kernel_flag); \ + } \ +} while (0) + /* * Getting the big kernel lock. 
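
The three macros above let code that may or may not hold the big kernel lock drop it across a long operation and put it back exactly as found; the asm-alpha/uaccess.h changes later in this patch wrap the user-copy primitives this way so large copies can reschedule. A minimal sketch of the pattern, mirroring what __copy_to_user becomes:

    static long copy_dropping_bkl(void *to, const void *from, long n)
    {
        long ret;
        DECLARE_LOCAL_LOCK_DEPTH(lock_depth);

        release_kernel_lock_save(lock_depth);    /* unlocks only if held */
        ret = __copy_tofrom_user_nocheck(to, from, n);
        conditional_schedule();    /* low-latency hook from this patchset */
        reacquire_kernel_lock_restore(lock_depth);
        return ret;
    }
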
* diff -urN 2.2.18/include/asm-alpha/softirq.h 2.2.18aa1/include/asm-alpha/softirq.h --- 2.2.18/include/asm-alpha/softirq.h Thu Aug 24 17:44:04 2000 +++ 2.2.18aa1/include/asm-alpha/softirq.h Mon Dec 11 19:23:35 2000 @@ -9,6 +9,7 @@ extern unsigned long local_bh_count; #else #define local_bh_count (cpu_data[smp_processor_id()].bh_count) +extern spinlock_t alpha_bh_lock; #endif #define get_active_bhs() (bh_mask & bh_active) @@ -28,24 +29,6 @@ :"Ir" (x), "m" (bh_active)); } -extern inline void init_bh(int nr, void (*routine)(void)) -{ - bh_base[nr] = routine; - atomic_set(&bh_mask_count[nr], 0); - bh_mask |= 1 << nr; -} - -extern inline void remove_bh(int nr) -{ - bh_base[nr] = NULL; - bh_mask &= ~(1 << nr); -} - -extern inline void mark_bh(int nr) -{ - set_bit(nr, &bh_active); -} - #ifdef __SMP__ /* @@ -113,21 +96,58 @@ #endif /* SMP */ +extern inline void init_bh(int nr, void (*routine)(void)) +{ + unsigned long flags; + + bh_base[nr] = routine; + atomic_set(&bh_mask_count[nr], 0); + + spin_lock_irqsave(&alpha_bh_lock, flags); + bh_mask |= 1 << nr; + spin_unlock_irqrestore(&alpha_bh_lock, flags); +} + +extern inline void remove_bh(int nr) +{ + unsigned long flags; + + spin_lock_irqsave(&alpha_bh_lock, flags); + bh_mask &= ~(1 << nr); + spin_unlock_irqrestore(&alpha_bh_lock, flags); + + synchronize_bh(); + bh_base[nr] = NULL; +} + +extern inline void mark_bh(int nr) +{ + set_bit(nr, &bh_active); +} + /* * These use a mask count to correctly handle * nested disable/enable calls */ extern inline void disable_bh(int nr) { + unsigned long flags; + + spin_lock_irqsave(&alpha_bh_lock, flags); bh_mask &= ~(1 << nr); atomic_inc(&bh_mask_count[nr]); + spin_unlock_irqrestore(&alpha_bh_lock, flags); synchronize_bh(); } extern inline void enable_bh(int nr) { + unsigned long flags; + + spin_lock_irqsave(&alpha_bh_lock, flags); if (atomic_dec_and_test(&bh_mask_count[nr])) bh_mask |= 1 << nr; + spin_unlock_irqrestore(&alpha_bh_lock, flags); } #endif /* _ALPHA_SOFTIRQ_H */ diff -urN 2.2.18/include/asm-alpha/spinlock.h 2.2.18aa1/include/asm-alpha/spinlock.h --- 2.2.18/include/asm-alpha/spinlock.h Thu Aug 24 17:44:04 2000 +++ 2.2.18aa1/include/asm-alpha/spinlock.h Mon Dec 11 19:23:34 2000 @@ -151,7 +151,7 @@ " br 1b\n" ".previous" : "=r" (tmp), "=m" (__dummy_lock(lock)) - : "m"(__dummy_lock(lock))); + : "m"(__dummy_lock(lock)) : "memory"); } #define spin_trylock(lock) (!test_and_set_bit(0,(lock))) @@ -197,7 +197,7 @@ ".previous" : "=m" (__dummy_lock(lock)), "=&r" (regx) : "0" (__dummy_lock(lock)) - ); + : "memory"); } static inline void read_lock(rwlock_t * lock) @@ -218,7 +218,7 @@ ".previous" : "=m" (__dummy_lock(lock)), "=&r" (regx) : "m" (__dummy_lock(lock)) - ); + : "memory"); } #endif /* DEBUG_RWLOCK */ @@ -231,6 +231,7 @@ static inline void read_unlock(rwlock_t * lock) { long regx; + mb(); __asm__ __volatile__( "1: ldl_l %1,%0\n" " addl %1,2,%1\n" diff -urN 2.2.18/include/asm-alpha/system.h 2.2.18aa1/include/asm-alpha/system.h --- 2.2.18/include/asm-alpha/system.h Mon Dec 11 16:58:03 2000 +++ 2.2.18aa1/include/asm-alpha/system.h Mon Dec 11 19:23:34 2000 @@ -112,12 +112,21 @@ extern void halt(void) __attribute__((noreturn)); +#ifdef CONFIG_SMP +#define ctx_cli() __cli() +#define ctx_sti() __sti() +#else +#define ctx_cli() do { } while(0) +#define ctx_sti() do { } while(0) +#endif + #define switch_to(prev,next,last) \ do { \ unsigned long pcbb; \ current = (next); \ pcbb = virt_to_phys(¤t->tss); \ (last) = alpha_switch_to(pcbb, (prev)); \ + ctx_sti(); \ } while (0) extern struct task_struct* 
alpha_switch_to(unsigned long, struct task_struct*); diff -urN 2.2.18/include/asm-alpha/uaccess.h 2.2.18aa1/include/asm-alpha/uaccess.h --- 2.2.18/include/asm-alpha/uaccess.h Thu Aug 24 19:11:39 2000 +++ 2.2.18aa1/include/asm-alpha/uaccess.h Mon Dec 11 19:23:35 2000 @@ -3,6 +3,8 @@ #include #include +#include +#include /* @@ -402,8 +404,17 @@ return len; } -#define __copy_to_user(to,from,n) __copy_tofrom_user_nocheck((to),(from),(n)) -#define __copy_from_user(to,from,n) __copy_tofrom_user_nocheck((to),(from),(n)) +#define __copy_to_user(to,from,n) \ +({ \ + long ret; \ + DECLARE_LOCAL_LOCK_DEPTH(lock_depth); \ + release_kernel_lock_save(lock_depth); \ + ret = __copy_tofrom_user_nocheck((to),(from),(n)); \ + conditional_schedule(); \ + reacquire_kernel_lock_restore(lock_depth); \ + ret; \ +}) +#define __copy_from_user(to,from,n) __copy_to_user(to,from,n) extern inline long copy_to_user(void *to, const void *from, long n) @@ -430,7 +441,7 @@ extern void __do_clear_user(void); extern inline long -__clear_user(void *to, long len) +____clear_user(void *to, long len) { /* This little bit of silliness is to get the GP loaded for a function that ordinarily wouldn't. Otherwise we could @@ -448,20 +459,22 @@ return __cl_len; } +#define __clear_user(to,len) \ +({ \ + long ret; \ + DECLARE_LOCAL_LOCK_DEPTH(lock_depth); \ + release_kernel_lock_save(lock_depth); \ + ret = ____clear_user((to),(len)); \ + conditional_schedule(); \ + reacquire_kernel_lock_restore(lock_depth); \ + ret; \ +}) + extern inline long clear_user(void *to, long len) { - if (__access_ok((long)to, len, get_fs())) { - register void * pv __asm__("$27") = __do_clear_user; - register void * __cl_to __asm__("$6") = to; - register long __cl_len __asm__("$0") = len; - __asm__ __volatile__( - "jsr $28,(%2),__do_clear_user\n\tldgp $29,0($28)" - : "=r"(__cl_len), "=r"(__cl_to), "=r"(pv) - : "0"(__cl_len), "1"(__cl_to), "2"(pv) - : "$1","$2","$3","$4","$5","$28","memory"); - len = __cl_len; - } + if (__access_ok((long)to, len, get_fs())) + len = __clear_user(to, len); return len; } @@ -474,8 +487,13 @@ strncpy_from_user(char *to, const char *from, long n) { long ret = -EFAULT; - if (__access_ok((long)from, 0, get_fs())) + if (__access_ok((long)from, 0, get_fs())) { + DECLARE_LOCAL_LOCK_DEPTH(lock_depth); + release_kernel_lock_save(lock_depth); ret = __strncpy_from_user(to, from, n); + conditional_schedule(); + reacquire_kernel_lock_restore(lock_depth); + } return ret; } @@ -484,7 +502,15 @@ extern inline long strlen_user(const char *str) { - return access_ok(VERIFY_READ,str,0) ? __strlen_user(str) : 0; + long ret = 0; + if (access_ok(VERIFY_READ,str,0)) { + DECLARE_LOCAL_LOCK_DEPTH(lock_depth); + release_kernel_lock_save(lock_depth); + ret = __strlen_user(str); + conditional_schedule(); + reacquire_kernel_lock_restore(lock_depth); + } + return ret; } /* Returns: 0 if exception before NUL or reaching the supplied limit (N), @@ -493,7 +519,15 @@ extern inline long strnlen_user(const char *str, long n) { - return access_ok(VERIFY_READ,str,0) ? 
__strnlen_user(str, n) : 0; + long ret = 0; + if (access_ok(VERIFY_READ,str,0)) { + DECLARE_LOCAL_LOCK_DEPTH(lock_depth); + release_kernel_lock_save(lock_depth); + ret = __strnlen_user(str, n); + conditional_schedule(); + reacquire_kernel_lock_restore(lock_depth); + } + return ret; } /* diff -urN 2.2.18/include/asm-alpha/unistd.h 2.2.18aa1/include/asm-alpha/unistd.h --- 2.2.18/include/asm-alpha/unistd.h Mon Dec 11 16:58:03 2000 +++ 2.2.18aa1/include/asm-alpha/unistd.h Mon Dec 11 19:23:29 2000 @@ -314,6 +314,7 @@ #define __NR_pivot_root 374 /* implemented in 2.3 */ #define __NR_mincore 375 /* implemented in 2.3 */ #define __NR_pciconfig_iobase 376 +#define __NR_getdents64 377 #if defined(__GNUC__) diff -urN 2.2.18/include/asm-arm/fcntl.h 2.2.18aa1/include/asm-arm/fcntl.h --- 2.2.18/include/asm-arm/fcntl.h Mon Jan 17 16:44:44 2000 +++ 2.2.18aa1/include/asm-arm/fcntl.h Mon Dec 11 17:20:51 2000 @@ -18,6 +18,8 @@ #define FASYNC 020000 /* fcntl, for BSD compatibility */ #define O_DIRECTORY 040000 /* must be a directory */ #define O_NOFOLLOW 0100000 /* don't follow links */ +#define O_DIRECT 0200000 /* direct disk access hint - currently ignored */ +#define O_LARGEFILE 0400000 #define F_DUPFD 0 /* dup */ #define F_GETFD 1 /* get f_flags */ @@ -33,6 +35,10 @@ #define F_SETSIG 10 /* for sockets. */ #define F_GETSIG 11 /* for sockets. */ +#define F_GETLK64 12 /* using 'struct flock64' */ +#define F_SETLK64 13 +#define F_SETLKW64 14 + /* for F_[GET|SET]FL */ #define FD_CLOEXEC 1 /* actually anything with low bit set goes */ @@ -58,6 +64,14 @@ off_t l_start; off_t l_len; pid_t l_pid; +}; + +struct flock64 { + short l_type; + short l_whence; + loff_t l_start; + loff_t l_len; + pid_t l_pid; }; #endif diff -urN 2.2.18/include/asm-arm/stat.h 2.2.18aa1/include/asm-arm/stat.h --- 2.2.18/include/asm-arm/stat.h Mon Jan 17 16:44:44 2000 +++ 2.2.18aa1/include/asm-arm/stat.h Mon Dec 11 17:20:51 2000 @@ -38,4 +38,5 @@ unsigned long __unused5; }; +/* Someone please add a glibc/arm compatible stat64 struct here. */ #endif diff -urN 2.2.18/include/asm-arm/unistd.h 2.2.18aa1/include/asm-arm/unistd.h --- 2.2.18/include/asm-arm/unistd.h Mon Dec 11 16:58:04 2000 +++ 2.2.18aa1/include/asm-arm/unistd.h Mon Dec 11 17:20:51 2000 @@ -198,6 +198,13 @@ /* 188 reserved */ /* 189 reserved */ #define __NR_vfork (__NR_SYSCALL_BASE+190) +/* #define __NR_getrlimit (__NR_SYSCALL_BASE+191) */ +#define __NR_mmap2 (__NR_SYSCALL_BASE+192) +#define __NR_truncate64 (__NR_SYSCALL_BASE+193) +#define __NR_ftruncate64 (__NR_SYSCALL_BASE+194) +#define __NR_stat64 (__NR_SYSCALL_BASE+195) +#define __NR_lstat64 (__NR_SYSCALL_BASE+196) +#define __NR_fstat64 (__NR_SYSCALL_BASE+197) #define __sys2(x) #x #define __sys1(x) __sys2(x) diff -urN 2.2.18/include/asm-i386/bigmem.h 2.2.18aa1/include/asm-i386/bigmem.h --- 2.2.18/include/asm-i386/bigmem.h Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/include/asm-i386/bigmem.h Mon Dec 11 17:41:07 2000 @@ -0,0 +1,69 @@ +/* + * bigmem.h: virtual kernel memory mappings for big memory + * + * Used in CONFIG_BIGMEM systems for memory pages which are not + * addressable by direct kernel virtual addresses. 
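
Unlike the alpha stub earlier, the i386 kmap() defined below really installs a temporary pte: a page at or above bigmem_start is mapped into a per-CPU fixmap slot chosen by the km_type, so every kmap() must be paired with a kunmap() of the same type and the mapping must not be held across a switch to another CPU. A sketch of a caller (the helper is illustrative):

    /* Zero a page that may live above bigmem_start.  KM_WRITE selects
     * this CPU's write slot; the slot stays busy until kunmap(). */
    static void zero_possibly_big_page(unsigned long kaddr)
    {
        unsigned long vaddr = kmap(kaddr, KM_WRITE);

        memset((void *) vaddr, 0, PAGE_SIZE);
        kunmap(vaddr, KM_WRITE);
    }
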
+ * + * Copyright (C) 1999 Gerhard Wichert, Siemens AG + * Gerhard.Wichert@pdb.siemens.de + */ + +#ifndef _ASM_BIGMEM_H +#define _ASM_BIGMEM_H + +#include + +#undef BIGMEM_DEBUG /* undef for production */ + +/* declarations for bigmem.c */ +extern unsigned long bigmem_start, bigmem_end; +extern int nr_free_bigpages; + +extern pte_t *kmap_pte; +extern pgprot_t kmap_prot; + +extern void kmap_init(void) __init; + +/* kmap helper functions necessary to access the bigmem pages in the kernel */ +#include +#include + +extern inline unsigned long kmap(unsigned long kaddr, enum km_type type) +{ + if (__pa(kaddr) < bigmem_start) + return kaddr; + { + enum fixed_addresses idx = type+KM_TYPE_NR*smp_processor_id(); + unsigned long vaddr = __fix_to_virt(FIX_KMAP_BEGIN+idx); + +#ifdef BIGMEM_DEBUG + if (!pte_none(*(kmap_pte-idx))) + { + __label__ here; + here: + printk(KERN_ERR "not null pte on CPU %d from %p\n", + smp_processor_id(), &&here); + } +#endif + set_pte(kmap_pte-idx, mk_pte(kaddr & PAGE_MASK, kmap_prot)); + __flush_tlb_one(vaddr); + + return vaddr | (kaddr & ~PAGE_MASK); + } +} + +extern inline void kunmap(unsigned long vaddr, enum km_type type) +{ +#ifdef BIGMEM_DEBUG + enum fixed_addresses idx = type+KM_TYPE_NR*smp_processor_id(); + if ((vaddr & PAGE_MASK) == __fix_to_virt(FIX_KMAP_BEGIN+idx)) + { + /* force other mappings to Oops if they try to access + this pte without first remapping it */ + pte_clear(kmap_pte-idx); + __flush_tlb_one(vaddr); + } +#endif +} + +#endif /* _ASM_BIGMEM_H */ diff -urN 2.2.18/include/asm-i386/bitops.h 2.2.18aa1/include/asm-i386/bitops.h --- 2.2.18/include/asm-i386/bitops.h Tue Nov 14 23:08:23 2000 +++ 2.2.18aa1/include/asm-i386/bitops.h Mon Dec 11 17:29:53 2000 @@ -49,6 +49,15 @@ :"Ir" (nr)); } +/* WARNING: non atomic and it can be reordered! */ +static __inline__ void __set_bit(int nr, volatile void * addr) +{ + __asm__( + "btsl %1,%0" + :"=m" (ADDR) + :"Ir" (nr)); +} + extern __inline__ void clear_bit(int nr, volatile void * addr) { __asm__ __volatile__( LOCK_PREFIX @@ -72,6 +81,18 @@ __asm__ __volatile__( LOCK_PREFIX "btsl %2,%1\n\tsbbl %0,%0" :"=r" (oldbit),"=m" (ADDR) + :"Ir" (nr) : "memory"); + return oldbit; +} + +/* WARNING: non atomic and it can be reordered! */ +static __inline__ int __test_and_set_bit(int nr, volatile void * addr) +{ + int oldbit; + + __asm__( + "btsl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit),"=m" (ADDR) :"Ir" (nr)); return oldbit; } @@ -83,6 +104,18 @@ __asm__ __volatile__( LOCK_PREFIX "btrl %2,%1\n\tsbbl %0,%0" :"=r" (oldbit),"=m" (ADDR) + :"Ir" (nr) : "memory"); + return oldbit; +} + +/* WARNING: non atomic and it can be reordered! */ +static __inline__ int __test_and_clear_bit(int nr, volatile void * addr) +{ + int oldbit; + + __asm__( + "btrl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit),"=m" (ADDR) :"Ir" (nr)); return oldbit; } @@ -94,7 +127,7 @@ __asm__ __volatile__( LOCK_PREFIX "btcl %2,%1\n\tsbbl %0,%0" :"=r" (oldbit),"=m" (ADDR) - :"Ir" (nr)); + :"Ir" (nr) : "memory"); return oldbit; } @@ -219,15 +252,15 @@ #ifdef __KERNEL__ -#define ext2_set_bit test_and_set_bit -#define ext2_clear_bit test_and_clear_bit +#define ext2_set_bit __test_and_set_bit +#define ext2_clear_bit __test_and_clear_bit #define ext2_test_bit test_bit #define ext2_find_first_zero_bit find_first_zero_bit #define ext2_find_next_zero_bit find_next_zero_bit /* Bitmap functions for the minix filesystem. 
*/ -#define minix_set_bit(nr,addr) test_and_set_bit(nr,addr) -#define minix_clear_bit(nr,addr) test_and_clear_bit(nr,addr) +#define minix_set_bit(nr,addr) __test_and_set_bit(nr,addr) +#define minix_clear_bit(nr,addr) __test_and_clear_bit(nr,addr) #define minix_test_bit(nr,addr) test_bit(nr,addr) #define minix_find_first_zero_bit(addr,size) find_first_zero_bit(addr,size) diff -urN 2.2.18/include/asm-i386/bugs.h 2.2.18aa1/include/asm-i386/bugs.h --- 2.2.18/include/asm-i386/bugs.h Mon Dec 11 16:58:04 2000 +++ 2.2.18aa1/include/asm-i386/bugs.h Mon Dec 11 17:20:44 2000 @@ -8,6 +8,9 @@ * * - Channing Corn (tests & fixes), * - Andrew D. Balsa (code cleanup). + * + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 */ /* @@ -20,6 +23,7 @@ #include #include #include +#include #define CONFIG_BUGi386 @@ -59,6 +63,7 @@ __initfunc(static void check_fpu(void)) { + extern int disable_x86_fxsr; unsigned short control_word; if (!boot_cpu_data.hard_math) { @@ -69,6 +74,28 @@ #endif return; } + + /* + * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned. + */ + if (offsetof(struct task_struct, tss.i387.fxsave) & 15) { + extern void __buggy_fxsr_alignment(void); + __buggy_fxsr_alignment(); + } + if (!disable_x86_fxsr) { + if (cpu_has_fxsr) { + printk(KERN_INFO "Enabling fast FPU save and restore... "); + set_in_cr4(X86_CR4_OSFXSR); + printk("done.\n"); + } + if (cpu_has_xmm) { + printk(KERN_INFO "Enabling unmasked SIMD FPU exception support... "); + set_in_cr4(X86_CR4_OSXMMEXCPT); + printk("done.\n"); + } + } else + printk(KERN_INFO "Disabling fast FPU save and restore.\n"); + if (mca_pentium_flag) { /* The IBM Model 95 machines with pentiums lock up on * fpu test, so we avoid it. All pentiums have inbuilt diff -urN 2.2.18/include/asm-i386/cache.h 2.2.18aa1/include/asm-i386/cache.h --- 2.2.18/include/asm-i386/cache.h Mon Jan 17 16:44:44 2000 +++ 2.2.18aa1/include/asm-i386/cache.h Mon Dec 11 17:29:53 2000 @@ -5,7 +5,11 @@ #define __ARCH_I386_CACHE_H /* bytes per L1 cache line */ -#if CPU==586 || CPU==686 +#ifdef CONFIG_M686_L1_64 +#define L1_CACHE_BYTES 64 +#elif defined(CONFIG_M686_L1_128) +#define L1_CACHE_BYTES 128 +#elif CPU==586 || CPU==686 #define L1_CACHE_BYTES 32 #else #define L1_CACHE_BYTES 16 diff -urN 2.2.18/include/asm-i386/elf.h 2.2.18aa1/include/asm-i386/elf.h --- 2.2.18/include/asm-i386/elf.h Mon Dec 11 16:58:04 2000 +++ 2.2.18aa1/include/asm-i386/elf.h Mon Dec 11 17:44:00 2000 @@ -15,6 +15,7 @@ typedef elf_greg_t elf_gregset_t[ELF_NGREG]; typedef struct user_i387_struct elf_fpregset_t; +typedef struct user_fxsr_struct elf_fpxregset_t; /* * This is used to ensure we don't load something for the wrong architecture. diff -urN 2.2.18/include/asm-i386/fcntl.h 2.2.18aa1/include/asm-i386/fcntl.h --- 2.2.18/include/asm-i386/fcntl.h Mon Jan 17 16:44:44 2000 +++ 2.2.18aa1/include/asm-i386/fcntl.h Mon Dec 11 17:20:51 2000 @@ -35,6 +35,10 @@ #define F_SETSIG 10 /* for sockets. */ #define F_GETSIG 11 /* for sockets. 
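
The F_*LK64 commands and struct flock64 added just below give 32-bit userland byte-range locks past 2GB; the layout matches struct flock except that l_start and l_len widen to loff_t. A hedged user-space illustration, assuming a libc that passes F_SETLK64 and this structure through to the kernel unmodified:

    #include <fcntl.h>
    #include <unistd.h>

    /* Try to write-lock 1MB starting at the 4GB boundary. */
    static int lock_high_range(int fd)
    {
        struct flock64 fl;

        fl.l_type   = F_WRLCK;
        fl.l_whence = SEEK_SET;
        fl.l_start  = 0x100000000LL;    /* beyond any 32-bit off_t */
        fl.l_len    = 1048576;
        fl.l_pid    = 0;
        return fcntl(fd, F_SETLK64, &fl);
    }
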
*/ +#define F_GETLK64 12 /* using 'struct flock64' */ +#define F_SETLK64 13 +#define F_SETLKW64 14 + /* for F_[GET|SET]FL */ #define FD_CLOEXEC 1 /* actually anything with low bit set goes */ @@ -60,6 +64,14 @@ off_t l_start; off_t l_len; pid_t l_pid; +}; + +struct flock64 { + short l_type; + short l_whence; + loff_t l_start; + loff_t l_len; + pid_t l_pid; }; #endif diff -urN 2.2.18/include/asm-i386/fixmap.h 2.2.18aa1/include/asm-i386/fixmap.h --- 2.2.18/include/asm-i386/fixmap.h Tue Nov 14 23:08:23 2000 +++ 2.2.18aa1/include/asm-i386/fixmap.h Mon Dec 11 17:29:53 2000 @@ -6,6 +6,8 @@ * for more details. * * Copyright (C) 1998 Ingo Molnar + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 */ #ifndef _ASM_FIXMAP_H @@ -14,6 +16,10 @@ #include #include #include +#ifdef CONFIG_BIGMEM +#include +#include +#endif /* * Here we define all the compile-time 'special' virtual @@ -55,6 +61,10 @@ FIX_CO_APIC, /* Cobalt APIC Redirection Table */ FIX_LI_PCIA, /* Lithium PCI Bridge A */ FIX_LI_PCIB, /* Lithium PCI Bridge B */ +#endif +#ifdef CONFIG_BIGMEM + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ + FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, #endif __end_of_fixed_addresses }; diff -urN 2.2.18/include/asm-i386/i387.h 2.2.18aa1/include/asm-i386/i387.h --- 2.2.18/include/asm-i386/i387.h Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/include/asm-i386/i387.h Mon Dec 11 17:45:31 2000 @@ -0,0 +1,83 @@ +/* + * include/asm-i386/i387.h + * + * Copyright (C) 1994 Linus Torvalds + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes, May 2000 + */ + +#ifndef __ASM_I386_I387_H +#define __ASM_I386_I387_H + +#include +#include +#include +#include + +extern void init_fpu(void); +/* + * FPU lazy state save handling... + */ +extern void save_init_fpu( struct task_struct *tsk ); +extern void restore_fpu( struct task_struct *tsk ); + +#define unlazy_fpu( tsk ) do { \ + if ( tsk->flags & PF_USEDFPU ) \ + save_init_fpu( tsk ); \ +} while (0) + +#define clear_fpu( tsk ) do { \ + if ( tsk->flags & PF_USEDFPU ) { \ + tsk->flags &= ~PF_USEDFPU; \ + stts(); \ + } \ +} while (0) + +/* + * FPU state interaction... + */ +extern unsigned short get_fpu_cwd( struct task_struct *tsk ); +extern unsigned short get_fpu_swd( struct task_struct *tsk ); +extern unsigned short get_fpu_twd( struct task_struct *tsk ); +extern unsigned short get_fpu_mxcsr( struct task_struct *tsk ); + +extern void set_fpu_cwd( struct task_struct *tsk, unsigned short cwd ); +extern void set_fpu_swd( struct task_struct *tsk, unsigned short swd ); +extern void set_fpu_twd( struct task_struct *tsk, unsigned short twd ); +extern void set_fpu_mxcsr( struct task_struct *tsk, unsigned short mxcsr ); + +#define load_mxcsr( val ) do { \ + unsigned long __mxcsr = ((unsigned long)(val) & 0xffbf); \ + asm volatile( "ldmxcsr %0" : : "m" (__mxcsr) ); \ +} while (0) + +/* + * Signal frame handlers... + */ +extern int save_i387( struct _fpstate *buf ); +extern int restore_i387( struct _fpstate *buf ); + +/* + * ptrace request handlers... + */ +extern int get_fpregs( struct user_i387_struct *buf, + struct task_struct *tsk ); +extern int set_fpregs( struct task_struct *tsk, + struct user_i387_struct *buf ); + +extern int get_fpxregs( struct user_fxsr_struct *buf, + struct task_struct *tsk ); +extern int set_fpxregs( struct task_struct *tsk, + struct user_fxsr_struct *buf ); + +/* + * FPU state for core dumps... 
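
unlazy_fpu() and clear_fpu() above carry the lazy-FPU protocol that used to live in processor.h: state is written back only if the task touched the FPU since it last got it (PF_USEDFPU), and save_init_fpu()/stts() raise CR0.TS so the next FPU instruction traps and restore_fpu() reloads state on demand. A sketch of the two halves as a scheduler and a teardown path would use them (the surrounding context is assumed, not shown in this hunk):

    /* Switching away: pay for a state save only if the FPU was used. */
    static void fpu_on_switch_out(struct task_struct *prev)
    {
        unlazy_fpu(prev);
    }

    /* Tearing a task down: discard state, let the next user re-init. */
    static void fpu_on_teardown(struct task_struct *tsk)
    {
        clear_fpu(tsk);
    }
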
+ */ +extern int dump_fpu( struct pt_regs *regs, + struct user_i387_struct *fpu ); +extern int dump_extended_fpu( struct pt_regs *regs, + struct user_fxsr_struct *fpu ); + +#endif /* __ASM_I386_I387_H */ diff -urN 2.2.18/include/asm-i386/io.h 2.2.18aa1/include/asm-i386/io.h --- 2.2.18/include/asm-i386/io.h Tue Nov 14 23:09:55 2000 +++ 2.2.18aa1/include/asm-i386/io.h Mon Dec 11 17:30:45 2000 @@ -27,6 +27,7 @@ /* * Bit simplified and optimized by Jan Hubicka + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999. */ #ifdef SLOW_IO_BY_JUMPING @@ -109,12 +110,20 @@ */ extern inline unsigned long virt_to_phys(volatile void * address) { +#ifdef CONFIG_BIGMEM + return __pa(address); +#else return __io_phys(address); +#endif } extern inline void * phys_to_virt(unsigned long address) { +#ifdef CONFIG_BIGMEM + return __va(address); +#else return __io_virt(address); +#endif } extern void * __ioremap(unsigned long offset, unsigned long size, unsigned long flags); @@ -157,9 +166,9 @@ #define writew(b,addr) (*(volatile unsigned short *) __io_virt(addr) = (b)) #define writel(b,addr) (*(volatile unsigned int *) __io_virt(addr) = (b)) -#define memset_io(a,b,c) memset(__io_virt(a),(b),(c)) -#define memcpy_fromio(a,b,c) memcpy((a),__io_virt(b),(c)) -#define memcpy_toio(a,b,c) memcpy(__io_virt(a),(b),(c)) +#define memset_io(a,b,c) __memset_generic(__io_virt(a),(b),(c)) +#define memcpy_fromio(a,b,c) __memcpy((a),__io_virt(b),(c)) +#define memcpy_toio(a,b,c) __memcpy(__io_virt(a),(b),(c)) /* * Again, i386 does not require mem IO specific function. diff -urN 2.2.18/include/asm-i386/kmap_types.h 2.2.18aa1/include/asm-i386/kmap_types.h --- 2.2.18/include/asm-i386/kmap_types.h Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/include/asm-i386/kmap_types.h Mon Dec 11 17:20:48 2000 @@ -0,0 +1,10 @@ +#ifndef _ASM_KMAP_TYPES_H +#define _ASM_KMAP_TYPES_H + +enum km_type { + KM_READ, + KM_WRITE, + KM_TYPE_NR, +}; + +#endif diff -urN 2.2.18/include/asm-i386/md.h 2.2.18aa1/include/asm-i386/md.h --- 2.2.18/include/asm-i386/md.h Mon Jan 17 16:44:44 2000 +++ 2.2.18aa1/include/asm-i386/md.h Thu Jan 1 01:00:00 1970 @@ -1,13 +0,0 @@ -/* $Id: md.h,v 1.1 1997/12/15 15:11:57 jj Exp $ - * md.h: High speed xor_block operation for RAID4/5 - * - */ - -#ifndef __ASM_MD_H -#define __ASM_MD_H - -/* #define HAVE_ARCH_XORBLOCK */ - -#define MD_XORBLOCK_ALIGNMENT sizeof(long) - -#endif /* __ASM_MD_H */ diff -urN 2.2.18/include/asm-i386/page.h 2.2.18aa1/include/asm-i386/page.h --- 2.2.18/include/asm-i386/page.h Mon Dec 11 16:58:04 2000 +++ 2.2.18aa1/include/asm-i386/page.h Mon Dec 11 17:29:53 2000 @@ -107,6 +107,7 @@ #define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) #define MAP_NR(addr) (__pa(addr) >> PAGE_SHIFT) +#define PHYSMAP_NR(addr) ((unsigned long)(addr) >> PAGE_SHIFT) #endif /* __KERNEL__ */ diff -urN 2.2.18/include/asm-i386/processor.h 2.2.18aa1/include/asm-i386/processor.h --- 2.2.18/include/asm-i386/processor.h Mon Dec 11 16:58:04 2000 +++ 2.2.18aa1/include/asm-i386/processor.h Mon Dec 11 17:29:54 2000 @@ -40,7 +40,7 @@ unsigned long *pgd_quick; unsigned long *pte_quick; unsigned long pgtable_cache_sz; -}; +} __attribute__((__aligned__(SMP_CACHE_BYTES))); #define X86_VENDOR_INTEL 0 #define X86_VENDOR_CYRIX 1 @@ -74,14 +74,14 @@ #define X86_FEATURE_CMOV 0x00008000 /* CMOV instruction (FCMOVCC and FCOMI too if FPU present) */ #define X86_FEATURE_PAT 0x00010000 /* Page Attribute Table */ #define X86_FEATURE_PSE36 0x00020000 /* 36-bit PSEs */ -#define X86_FEATURE_18 
0x00040000 +#define X86_FEATURE_PN 0x00040000 /* 96 bit CPU serial # */ #define X86_FEATURE_19 0x00080000 #define X86_FEATURE_20 0x00100000 #define X86_FEATURE_21 0x00200000 #define X86_FEATURE_22 0x00400000 #define X86_FEATURE_MMX 0x00800000 /* multimedia extensions */ #define X86_FEATURE_FXSR 0x01000000 /* FXSAVE and FXRSTOR instructions (fast save and restore of FPU context), and CR4.OSFXSR (OS uses these instructions) available */ -#define X86_FEATURE_25 0x02000000 +#define X86_FEATURE_XMM 0x02000000 /* Intel MMX2 instruction set */ #define X86_FEATURE_26 0x04000000 #define X86_FEATURE_27 0x08000000 #define X86_FEATURE_28 0x10000000 @@ -99,6 +99,23 @@ #define current_cpu_data boot_cpu_data #endif +#define cpu_has_pge \ + (boot_cpu_data.x86_capability & X86_FEATURE_PGE) +#define cpu_has_pse \ + (boot_cpu_data.x86_capability & X86_FEATURE_PSE) +#define cpu_has_pae \ + (boot_cpu_data.x86_capability & X86_FEATURE_PAE) +#define cpu_has_tsc \ + (boot_cpu_data.x86_capability & X86_FEATURE_TSC) +#define cpu_has_de \ + (boot_cpu_data.x86_capability & X86_FEATURE_DE) +#define cpu_has_vme \ + (boot_cpu_data.x86_capability & X86_FEATURE_VME) +#define cpu_has_fxsr \ + (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) +#define cpu_has_xmm \ + (boot_cpu_data.x86_capability & X86_FEATURE_XMM) + extern char ignore_irq13; extern void identify_cpu(struct cpuinfo_x86 *); @@ -120,6 +137,49 @@ } /* + * Intel CPU features in CR4 + */ +#define X86_CR4_VME 0x0001 /* enable vm86 extensions */ +#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */ +#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ +#define X86_CR4_DE 0x0008 /* enable debugging extensions */ +#define X86_CR4_PSE 0x0010 /* enable page size extensions */ +#define X86_CR4_PAE 0x0020 /* enable physical address extensions */ +#define X86_CR4_MCE 0x0040 /* Machine check enable */ +#define X86_CR4_PGE 0x0080 /* enable global pages */ +#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ +#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */ +#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */ + +/* + * Save the cr4 feature set we're using (ie + * Pentium 4MB enable and PPro Global page + * enable), so that any CPUs that boot up + * after us can get the correct flags. 
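
x86_cr4 shadows the live register so CPUs brought up later can copy the boot processor's bits, and the helpers defined just below fold a mask into both the shadow and CR4 itself. check_fpu() in the bugs.h hunk earlier uses them exactly like this:

    /* Mirror of the bugs.h logic: turn the FXSR/SSE support bits on
     * only when CPUID advertised the corresponding features. */
    static void enable_fxsr_sse(void)
    {
        if (cpu_has_fxsr)
            set_in_cr4(X86_CR4_OSFXSR);       /* fxsave/fxrstor in use */
        if (cpu_has_xmm)
            set_in_cr4(X86_CR4_OSXMMEXCPT);   /* unmasked SSE exceptions */
    }
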
+ */ +extern unsigned long x86_cr4; + +static inline void set_in_cr4 (unsigned long mask) +{ + x86_cr4 |= mask; + __asm__("movl %%cr4,%%eax\n\t" + "orl %0,%%eax\n\t" + "movl %%eax,%%cr4\n" + : : "irg" (mask) + :"ax"); +} + +static inline void clear_in_cr4 (unsigned long mask) +{ + x86_cr4 &= ~mask; + __asm__("movl %%cr4,%%eax\n\t" + "andl %0,%%eax\n\t" + "movl %%eax,%%cr4\n" + : : "irg" (~mask) + :"ax"); +} + +/* * Cyrix CPU configuration register indexes */ #define CX86_CCR0 0xc0 @@ -173,7 +233,7 @@ */ #define IO_BITMAP_SIZE 32 -struct i387_hard_struct { +struct i387_fsave_struct { long cwd; long swd; long twd; @@ -185,22 +245,42 @@ long status; /* software status information */ }; -struct i387_soft_struct { - long cwd; - long swd; - long twd; +/* + * has to be 128-bit aligned + */ +struct i387_fxsave_struct { + unsigned short cwd; + unsigned short swd; + unsigned short twd; + unsigned short fop; long fip; long fcs; long foo; long fos; - long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ - unsigned char ftop, changed, lookahead, no_update, rm, alimit; - struct info *info; - unsigned long entry_eip; + long mxcsr; + long reserved; + long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ + long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ + long padding[56]; +} __attribute__ ((aligned (16))); + +struct i387_soft_struct { + long cwd; + long swd; + long twd; + long fip; + long fcs; + long foo; + long fos; + long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ + unsigned char ftop, changed, lookahead, no_update, rm, alimit; + struct info *info; + unsigned long entry_eip; }; union i387_union { - struct i387_hard_struct hard; + struct i387_fsave_struct fsave; + struct i387_fxsave_struct fxsave; struct i387_soft_struct soft; }; @@ -291,27 +371,6 @@ extern void copy_segments(int nr, struct task_struct *p, struct mm_struct * mm); extern void release_segments(struct mm_struct * mm); extern void forget_segments(void); - -/* - * FPU lazy state save handling.. - */ -#define save_fpu(tsk) do { \ - asm volatile("fnsave %0\n\tfwait":"=m" (tsk->tss.i387)); \ - tsk->flags &= ~PF_USEDFPU; \ - stts(); \ -} while (0) - -#define unlazy_fpu(tsk) do { \ - if (tsk->flags & PF_USEDFPU) \ - save_fpu(tsk); \ -} while (0) - -#define clear_fpu(tsk) do { \ - if (tsk->flags & PF_USEDFPU) { \ - tsk->flags &= ~PF_USEDFPU; \ - stts(); \ - } \ -} while (0) /* * Return saved PC of a blocked thread. diff -urN 2.2.18/include/asm-i386/ptrace.h 2.2.18aa1/include/asm-i386/ptrace.h --- 2.2.18/include/asm-i386/ptrace.h Mon Jan 17 16:44:44 2000 +++ 2.2.18aa1/include/asm-i386/ptrace.h Mon Dec 11 17:20:44 2000 @@ -46,6 +46,8 @@ #define PTRACE_SETREGS 13 #define PTRACE_GETFPREGS 14 #define PTRACE_SETFPREGS 15 +#define PTRACE_GETFPXREGS 18 +#define PTRACE_SETFPXREGS 19 #ifdef __KERNEL__ #define user_mode(regs) ((VM_MASK & (regs)->eflags) || (3 & (regs)->xcs)) diff -urN 2.2.18/include/asm-i386/sigcontext.h 2.2.18aa1/include/asm-i386/sigcontext.h --- 2.2.18/include/asm-i386/sigcontext.h Mon Jan 17 16:44:44 2000 +++ 2.2.18aa1/include/asm-i386/sigcontext.h Mon Dec 11 17:20:44 2000 @@ -8,23 +8,52 @@ * normal i387 hardware setup, the extra "status" * word is used to save the coprocessor status word * before entering the handler. + * + * Pentium III FXSR, SSE support + * Gareth Hughes, May 2000 + * + * The FPU state data structure has had to grow to accommodate the + * extended FPU state required by the Streaming SIMD Extensions. + * There is no documented standard to accomplish this at the moment. 
*/ struct _fpreg { unsigned short significand[4]; unsigned short exponent; }; +struct _fpxreg { + unsigned short significand[4]; + unsigned short exponent; + unsigned short padding[3]; +}; + +struct _xmmreg { + unsigned long element[4]; +}; + struct _fpstate { - unsigned long cw, - sw, - tag, - ipoff, - cssel, - dataoff, - datasel; + /* Regular FPU environment */ + unsigned long cw; + unsigned long sw; + unsigned long tag; + unsigned long ipoff; + unsigned long cssel; + unsigned long dataoff; + unsigned long datasel; struct _fpreg _st[8]; - unsigned long status; + unsigned short status; + unsigned short magic; /* 0xffff = regular FPU data only */ + + /* FXSR FPU environment */ + unsigned long _fxsr_env[6]; /* FXSR FPU env is ignored */ + unsigned long mxcsr; + unsigned long reserved; + struct _fpxreg _fxsr_st[8]; /* FXSR FPU reg data is ignored */ + struct _xmmreg _xmm[8]; + unsigned long padding[56]; }; + +#define X86_FXSR_MAGIC 0x0000 struct sigcontext { unsigned short gs, __gsh; diff -urN 2.2.18/include/asm-i386/siginfo.h 2.2.18aa1/include/asm-i386/siginfo.h --- 2.2.18/include/asm-i386/siginfo.h Tue Nov 14 03:36:25 2000 +++ 2.2.18aa1/include/asm-i386/siginfo.h Mon Dec 11 17:29:54 2000 @@ -77,6 +77,25 @@ #define si_band _sifields._sigpoll._band #define si_fd _sifields._sigpoll._fd +#ifdef __KERNEL__ +#define __SI_MASK 0xffff0000 +#define __SI_KILL (0 << 16) +#define __SI_TIMER (1 << 16) +#define __SI_POLL (2 << 16) +#define __SI_FAULT (3 << 16) +#define __SI_CHLD (4 << 16) +#define __SI_RT (5 << 16) +#define __SI_CODE(T,N) ((T) << 16 | ((N) & 0xffff)) +#else +#define __SI_KILL 0 +#define __SI_TIMER 0 +#define __SI_POLL 0 +#define __SI_FAULT 0 +#define __SI_CHLD 0 +#define __SI_RT 0 +#define __SI_CODE(T,N) (N) +#endif + /* * si_code values * Digital reserves positive values for kernel-generated signals. diff -urN 2.2.18/include/asm-i386/smplock.h 2.2.18aa1/include/asm-i386/smplock.h --- 2.2.18/include/asm-i386/smplock.h Mon Dec 11 16:58:04 2000 +++ 2.2.18aa1/include/asm-i386/smplock.h Mon Dec 11 17:29:54 2000 @@ -26,6 +26,25 @@ spin_lock(&kernel_flag); \ } while (0) +#define DECLARE_LOCAL_LOCK_DEPTH(x) int x + +#define release_kernel_lock_save(local_depth) \ +do { \ + (local_depth) = current->lock_depth; \ + if ((local_depth) >= 0) { \ + current->lock_depth = -1; \ + spin_unlock(&kernel_flag); \ + } \ +} while (0) + +#define reacquire_kernel_lock_restore(local_depth) \ +do { \ + if ((local_depth) >= 0) { \ + current->lock_depth = local_depth; \ + spin_lock(&kernel_flag); \ + } \ +} while (0) + /* * Getting the big kernel lock. 
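Going back to the _fpstate layout above: user code can tell the two save formats apart through the new magic field (0xffff marks a classic fsave frame, X86_FXSR_MAGIC marks a frame whose appended FXSR area is valid). A hedged sketch of a handler inspecting it, using the old i386 convention of receiving the sigcontext by value after the signal number (illustrative only, not from the patch):

#include <signal.h>
#include <asm/sigcontext.h>

void fpe_handler(int sig, struct sigcontext sc)
{
	struct _fpstate *fp = sc.fpstate;

	if (fp && fp->magic == X86_FXSR_MAGIC) {
		/* mxcsr and _xmm[] describe the SSE state at the fault */
	} else {
		/* magic == 0xffff: only the classic FPU fields are valid */
	}
}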
diff -urN 2.2.18/include/asm-i386/spinlock.h 2.2.18aa1/include/asm-i386/spinlock.h --- 2.2.18/include/asm-i386/spinlock.h Mon Dec 11 16:58:04 2000 +++ 2.2.18aa1/include/asm-i386/spinlock.h Mon Dec 11 17:29:53 2000 @@ -165,12 +165,12 @@ #define spin_lock(lock) \ __asm__ __volatile__( \ spin_lock_string \ - :"=m" (__dummy_lock(lock))) + :"=m" (__dummy_lock(lock)) : : "memory") #define spin_unlock(lock) \ __asm__ __volatile__( \ spin_unlock_string \ - :"=m" (__dummy_lock(lock))) + :"=m" (__dummy_lock(lock)) : : "memory") #define spin_trylock(lock) (!test_and_set_bit(0,(lock))) @@ -220,11 +220,11 @@ "js 3b\n\t" \ "jmp 1b\n" \ ".previous" \ - :"=m" (__dummy_lock(&(rw)->lock))) + :"=m" (__dummy_lock(&(rw)->lock)) : : "memory") #define read_unlock(rw) \ asm volatile("lock ; decl %0" \ - :"=m" (__dummy_lock(&(rw)->lock))) + :"=m" (__dummy_lock(&(rw)->lock)) : : "memory") #define write_lock(rw) \ asm volatile("\n1:\t" \ @@ -239,10 +239,10 @@ "jne 4b\n\t" \ "jmp 1b\n" \ ".previous" \ - :"=m" (__dummy_lock(&(rw)->lock))) + :"=m" (__dummy_lock(&(rw)->lock)) : : "memory") #define write_unlock(rw) \ - asm volatile("lock ; btrl $31,%0":"=m" (__dummy_lock(&(rw)->lock))) + asm volatile("lock ; btrl $31,%0":"=m" (__dummy_lock(&(rw)->lock)) : : "memory") #define read_lock_irq(lock) do { __cli(); read_lock(lock); } while (0) #define read_unlock_irq(lock) do { read_unlock(lock); __sti(); } while (0) diff -urN 2.2.18/include/asm-i386/stat.h 2.2.18aa1/include/asm-i386/stat.h --- 2.2.18/include/asm-i386/stat.h Mon Jan 17 16:44:44 2000 +++ 2.2.18aa1/include/asm-i386/stat.h Mon Dec 11 17:20:51 2000 @@ -38,4 +38,41 @@ unsigned long __unused5; }; +/* This matches struct stat64 in glibc2.1, hence the absolutely + * insane amounts of padding around dev_t's. + */ +struct stat64 { + unsigned short st_dev; + unsigned char __pad0[10]; + +#define STAT64_HAS_BROKEN_ST_INO 1 + unsigned long __st_ino; + + unsigned int st_mode; + unsigned int st_nlink; + + unsigned long st_uid; + unsigned long st_gid; + + unsigned short st_rdev; + unsigned char __pad3[10]; + + long long st_size; + unsigned long st_blksize; + + unsigned long st_blocks; /* Number 512-byte blocks allocated. 
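The spinlock/rwlock hunk above only adds "memory" clobbers, but the change matters: it tells gcc the asm may read or write any memory, so values loaded into registers before spin_lock() cannot be reused inside the critical section, and stores cannot be sunk past spin_unlock(). A sketch of the pattern the clobber makes safe (names hypothetical):

extern spinlock_t counter_lock;
extern int shared_count;

int bump_count(void)
{
	int v;

	spin_lock(&counter_lock);	/* compiler barrier: forces a fresh load */
	v = ++shared_count;
	spin_unlock(&counter_lock);	/* increment cannot be deferred past here */
	return v;
}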
*/ + unsigned long __pad4; /* future possible st_blocks high bits */ + + unsigned long st_atime; + unsigned long __pad5; + + unsigned long st_mtime; + unsigned long __pad6; + + unsigned long st_ctime; + unsigned long __pad7; /* will be high 32 bits of ctime someday */ + + unsigned long long st_ino; +}; + #endif diff -urN 2.2.18/include/asm-i386/uaccess.h 2.2.18aa1/include/asm-i386/uaccess.h --- 2.2.18/include/asm-i386/uaccess.h Tue Nov 14 23:09:56 2000 +++ 2.2.18aa1/include/asm-i386/uaccess.h Mon Dec 11 17:29:54 2000 @@ -6,6 +6,8 @@ */ #include #include +#include +#include #include #define VERIFY_READ 0 @@ -253,6 +255,8 @@ #define __copy_user(to,from,size) \ do { \ int __d0, __d1; \ + DECLARE_LOCAL_LOCK_DEPTH(lock_depth); \ + release_kernel_lock_save(lock_depth); \ __asm__ __volatile__( \ "0: rep; movsl\n" \ " movl %3,%0\n" \ @@ -270,11 +274,15 @@ : "=&c"(size), "=&D" (__d0), "=&S" (__d1) \ : "r"(size & 3), "0"(size / 4), "1"(to), "2"(from) \ : "memory"); \ + conditional_schedule(); \ + reacquire_kernel_lock_restore(lock_depth); \ } while (0) #define __copy_user_zeroing(to,from,size) \ do { \ int __d0, __d1; \ + DECLARE_LOCAL_LOCK_DEPTH(lock_depth); \ + release_kernel_lock_save(lock_depth); \ __asm__ __volatile__( \ "0: rep; movsl\n" \ " movl %3,%0\n" \ @@ -298,6 +306,8 @@ : "=&c"(size), "=&D" (__d0), "=&S" (__d1) \ : "r"(size & 3), "0"(size / 4), "1"(to), "2"(from) \ : "memory"); \ + conditional_schedule(); \ + reacquire_kernel_lock_restore(lock_depth); \ } while (0) /* We let the __ versions of copy_from/to_user inline, because they're often @@ -322,8 +332,10 @@ #define __constant_copy_user(to, from, size) \ do { \ int __d0, __d1; \ + DECLARE_LOCAL_LOCK_DEPTH(lock_depth); \ switch (size & 3) { \ default: \ + release_kernel_lock_save(lock_depth); \ __asm__ __volatile__( \ "0: rep; movsl\n" \ "1:\n" \ @@ -338,6 +350,8 @@ : "=c"(size), "=&S" (__d0), "=&D" (__d1)\ : "1"(from), "2"(to), "0"(size/4) \ : "memory"); \ + conditional_schedule(); \ + reacquire_kernel_lock_restore(lock_depth); \ break; \ case 1: \ __asm__ __volatile__( \ @@ -406,8 +420,10 @@ #define __constant_copy_user_zeroing(to, from, size) \ do { \ int __d0, __d1; \ + DECLARE_LOCAL_LOCK_DEPTH(lock_depth); \ switch (size & 3) { \ default: \ + release_kernel_lock_save(lock_depth); \ __asm__ __volatile__( \ "0: rep; movsl\n" \ "1:\n" \ @@ -428,6 +444,8 @@ : "=c"(size), "=&S" (__d0), "=&D" (__d1)\ : "1"(from), "2"(to), "0"(size/4) \ : "memory"); \ + conditional_schedule(); \ + reacquire_kernel_lock_restore(lock_depth); \ break; \ case 1: \ __asm__ __volatile__( \ diff -urN 2.2.18/include/asm-i386/unistd.h 2.2.18aa1/include/asm-i386/unistd.h --- 2.2.18/include/asm-i386/unistd.h Mon Jan 17 16:44:44 2000 +++ 2.2.18aa1/include/asm-i386/unistd.h Mon Dec 11 17:20:51 2000 @@ -80,7 +80,7 @@ #define __NR_sigpending 73 #define __NR_sethostname 74 #define __NR_setrlimit 75 -#define __NR_getrlimit 76 +#define __NR_getrlimit 76 /* Back compatible 2Gig limited rlimit */ #define __NR_getrusage 77 #define __NR_gettimeofday 78 #define __NR_settimeofday 79 @@ -195,8 +195,42 @@ #define __NR_getpmsg 188 /* some people actually want streams */ #define __NR_putpmsg 189 /* some people actually want streams */ #define __NR_vfork 190 +/* #define __NR_ugetrlimit 191 SuS compliant getrlimit */ +#define __NR_mmap2 192 +#define __NR_truncate64 193 +#define __NR_ftruncate64 194 +#define __NR_stat64 195 +#define __NR_lstat64 196 +#define __NR_fstat64 197 +#if 0 /* 2.3.x */ +#define __NR_lchown32 198 +#define __NR_getuid32 199 +#define __NR_getgid32 200 +#define 
__NR_geteuid32 201 +#define __NR_getegid32 202 +#define __NR_setreuid32 203 +#define __NR_setregid32 204 +#define __NR_getgroups32 205 +#define __NR_setgroups32 206 +#define __NR_fchown32 207 +#define __NR_setresuid32 208 +#define __NR_getresuid32 209 +#define __NR_setresgid32 210 +#define __NR_getresgid32 211 +#define __NR_chown32 212 +#define __NR_setuid32 213 +#define __NR_setgid32 214 +#define __NR_setfsuid32 215 +#define __NR_setfsgid32 216 +#define __NR_pivot_root 217 +#define __NR_mincore 218 +#define __NR_madvise 219 +#define __NR_madvise1 219 /* delete when C lib stub is removed */ +#endif +#define __NR_fcntl64 220 +#define __NR_getdents64 221 -/* user-visible error numbers are in the range -1 - -122: see */ +/* user-visible error numbers are in the range -1 - -124: see */ #define __syscall_return(type, res) \ do { \ @@ -269,6 +303,19 @@ : "=a" (__res) \ : "0" (__NR_##name),"b" ((long)(arg1)),"c" ((long)(arg2)), \ "d" ((long)(arg3)),"S" ((long)(arg4)),"D" ((long)(arg5))); \ +__syscall_return(type,__res); \ +} + +#define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5,type6,arg6) \ +type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5,type6 arg6) \ +{ \ +long __res; \ +__asm__ volatile ("push %%ebp ; movl %%eax,%%ebp ; movl %1,%%eax ; int $0x80 ; pop %%ebp" \ + : "=a" (__res) \ + : "i" (__NR_##name),"b" ((long)(arg1)),"c" ((long)(arg2)), \ + "d" ((long)(arg3)),"S" ((long)(arg4)),"D" ((long)(arg5)), \ + "0" ((long)(arg6))); \ __syscall_return(type,__res); \ } diff -urN 2.2.18/include/asm-i386/user.h 2.2.18aa1/include/asm-i386/user.h --- 2.2.18/include/asm-i386/user.h Tue Nov 14 03:37:33 2000 +++ 2.2.18aa1/include/asm-i386/user.h Mon Dec 11 17:42:28 2000 @@ -30,6 +30,18 @@ The minimum core file size is 3 pages, or 12288 bytes. */ +/* + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + * + * Provide support for the GDB 5.0+ PTRACE_{GET|SET}FPXREGS requests for + * interacting with the FXSR-format floating point environment. Floating + * point data can be accessed in the regular format in the usual manner, + * and both the standard and SIMD floating point data can be accessed via + * the new ptrace requests. In either case, changes to the FPU environment + * will be reflected in the task's state as expected. + */ + struct user_i387_struct { long cwd; long swd; @@ -39,6 +51,22 @@ long foo; long fos; long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ +}; + +struct user_fxsr_struct { + unsigned short cwd; + unsigned short swd; + unsigned short twd; + unsigned short fop; + long fip; + long fcs; + long foo; + long fos; + long mxcsr; + long reserved; + long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ + long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ + long padding[56]; }; /* diff -urN 2.2.18/include/asm-m68k/fcntl.h 2.2.18aa1/include/asm-m68k/fcntl.h --- 2.2.18/include/asm-m68k/fcntl.h Mon Jan 17 16:44:44 2000 +++ 2.2.18aa1/include/asm-m68k/fcntl.h Mon Dec 11 17:20:51 2000 @@ -33,6 +33,10 @@ #define F_SETSIG 10 /* for sockets. */ #define F_GETSIG 11 /* for sockets. 
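The new _syscall6() stub passes the sixth argument in %ebp, which is why it saves and restores that register around int $0x80, and it takes the syscall number as an immediate ("i" constraint). Its first user is mmap2, whose offset argument is expressed in 4 kB pages so byte offsets beyond 2^32 still fit in 32 bits. A hedged user-space sketch (a hand-written binding, until libc provides a wrapper):

#include <asm/unistd.h>

_syscall6(long, mmap2, unsigned long, addr, unsigned long, len,
	  unsigned long, prot, unsigned long, flags,
	  unsigned long, fd, unsigned long, pgoff)

/* e.g. map 64 kB starting 6 GB into a file (0x180000 pages = 6 GB / 4 kB):
 *	mmap2(0, 65536, PROT_READ, MAP_SHARED, fd, 0x180000);
 */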
*/ +#define F_GETLK64 12 /* using 'struct flock64' */ +#define F_SETLK64 13 +#define F_SETLKW64 14 + /* for F_[GET|SET]FL */ #define FD_CLOEXEC 1 /* actually anything with low bit set goes */ @@ -58,6 +62,14 @@ off_t l_start; off_t l_len; pid_t l_pid; +}; + +struct flock64 { + short l_type; + short l_whence; + loff_t l_start; + loff_t l_len; + pid_t l_pid; }; #endif /* _M68K_FCNTL_H */ diff -urN 2.2.18/include/asm-m68k/md.h 2.2.18aa1/include/asm-m68k/md.h --- 2.2.18/include/asm-m68k/md.h Mon Jan 17 16:44:44 2000 +++ 2.2.18aa1/include/asm-m68k/md.h Thu Jan 1 01:00:00 1970 @@ -1,13 +0,0 @@ -/* $Id: md.h,v 1.1 1997/12/15 15:12:04 jj Exp $ - * md.h: High speed xor_block operation for RAID4/5 - * - */ - -#ifndef __ASM_MD_H -#define __ASM_MD_H - -/* #define HAVE_ARCH_XORBLOCK */ - -#define MD_XORBLOCK_ALIGNMENT sizeof(long) - -#endif /* __ASM_MD_H */ diff -urN 2.2.18/include/asm-m68k/stat.h 2.2.18aa1/include/asm-m68k/stat.h --- 2.2.18/include/asm-m68k/stat.h Mon Jan 17 16:44:45 2000 +++ 2.2.18aa1/include/asm-m68k/stat.h Mon Dec 11 17:20:51 2000 @@ -38,4 +38,8 @@ unsigned long __unused5; }; +/* stat64 struct goes here -- someone please make + * it mesh with whatever glibc does in userland on + * m68k's. + */ #endif /* _M68K_STAT_H */ diff -urN 2.2.18/include/asm-m68k/unistd.h 2.2.18aa1/include/asm-m68k/unistd.h --- 2.2.18/include/asm-m68k/unistd.h Mon Jan 17 16:44:45 2000 +++ 2.2.18aa1/include/asm-m68k/unistd.h Mon Dec 11 17:20:51 2000 @@ -80,7 +80,7 @@ #define __NR_sigpending 73 #define __NR_sethostname 74 #define __NR_setrlimit 75 -#define __NR_getrlimit 76 +#define __NR_getrlimit 76 #define __NR_getrusage 77 #define __NR_gettimeofday 78 #define __NR_settimeofday 79 @@ -194,6 +194,13 @@ #define __NR_getpmsg 188 /* some people actually want streams */ #define __NR_putpmsg 189 /* some people actually want streams */ #define __NR_vfork 190 +/* #define __NR_getrlimit 191 */ +#define __NR_mmap2 192 +#define __NR_truncate64 193 +#define __NR_ftruncate64 194 +#define __NR_stat64 195 +#define __NR_lstat64 196 +#define __NR_fstat64 197 /* user-visible error numbers are in the range -1 - -122: see */ diff -urN 2.2.18/include/asm-mips/fcntl.h 2.2.18aa1/include/asm-mips/fcntl.h --- 2.2.18/include/asm-mips/fcntl.h Mon Jan 17 16:44:45 2000 +++ 2.2.18aa1/include/asm-mips/fcntl.h Mon Dec 11 17:20:51 2000 @@ -44,6 +44,10 @@ #define F_SETSIG 10 /* for sockets. */ #define F_GETSIG 11 /* for sockets. */ +#define F_GETLK64 33 /* using 'struct flock64' */ +#define F_SETLK64 34 +#define F_SETLKW64 35 + /* for F_[GET|SET]FL */ #define FD_CLOEXEC 1 /* actually anything with low bit set goes */ @@ -72,5 +76,13 @@ __kernel_pid_t l_pid; long pad[4]; /* ZZZZZZZZZZZZZZZZZZZZZZZZZZ */ } flock_t; + +typedef struct flock64 { + short l_type; + short l_whence; + loff_t l_start; + loff_t l_len; + pid_t l_pid; +} flock64_t; #endif /* __ASM_MIPS_FCNTL_H */ diff -urN 2.2.18/include/asm-ppc/fcntl.h 2.2.18aa1/include/asm-ppc/fcntl.h --- 2.2.18/include/asm-ppc/fcntl.h Mon Jan 17 16:44:45 2000 +++ 2.2.18aa1/include/asm-ppc/fcntl.h Mon Dec 11 17:20:51 2000 @@ -18,6 +18,8 @@ #define FASYNC 020000 /* fcntl, for BSD compatibility */ #define O_DIRECTORY 040000 /* must be a directory */ #define O_NOFOLLOW 0100000 /* don't follow links */ +#define O_LARGEFILE 0200000 +#define O_DIRECT 0400000 /* direct disk access hint - currently ignored */ #define F_DUPFD 0 /* dup */ #define F_GETFD 1 /* get f_flags */ @@ -33,6 +35,10 @@ #define F_SETSIG 10 /* for sockets. */ #define F_GETSIG 11 /* for sockets. 
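struct flock64 mirrors struct flock but carries loff_t start/len, so a lock can cover a region past the 31-bit off_t limit; the F_*LK64 commands select the wide layout. A hedged sketch, assuming the new commands are issued through plain fcntl() before libc grows an LFS-aware wrapper:

#include <unistd.h>
#include <fcntl.h>

/* Write-lock 1 MB starting at the 3 GB mark of an O_LARGEFILE fd: */
int lock_high(int fd)
{
	struct flock64 fl;

	fl.l_type = F_WRLCK;
	fl.l_whence = SEEK_SET;
	fl.l_start = 3LL << 30;		/* beyond what a 32-bit off_t can express */
	fl.l_len = 1024 * 1024;
	return fcntl(fd, F_SETLK64, &fl);
}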
*/ +#define F_GETLK64 12 /* using 'struct flock64' */ +#define F_SETLK64 13 +#define F_SETLKW64 14 + /* for F_[GET|SET]FL */ #define FD_CLOEXEC 1 /* actually anything with low bit set goes */ @@ -64,6 +70,14 @@ off_t l_start; off_t l_len; pid_t l_pid; +}; + +struct flock64 { + short l_type; + short l_whence; + loff_t l_start; + loff_t l_len; + pid_t l_pid; }; #endif diff -urN 2.2.18/include/asm-ppc/md.h 2.2.18aa1/include/asm-ppc/md.h --- 2.2.18/include/asm-ppc/md.h Mon Jan 17 16:44:45 2000 +++ 2.2.18aa1/include/asm-ppc/md.h Thu Jan 1 01:00:00 1970 @@ -1,13 +0,0 @@ -/* $Id: md.h,v 1.1.4.1 1999/08/13 18:30:41 davem dead $ - * md.h: High speed xor_block operation for RAID4/5 - * - */ - -#ifndef __ASM_MD_H -#define __ASM_MD_H - -/* #define HAVE_ARCH_XORBLOCK */ - -#define MD_XORBLOCK_ALIGNMENT sizeof(long) - -#endif /* __ASM_MD_H */ diff -urN 2.2.18/include/asm-ppc/smplock.h 2.2.18aa1/include/asm-ppc/smplock.h --- 2.2.18/include/asm-ppc/smplock.h Mon Dec 11 16:58:05 2000 +++ 2.2.18aa1/include/asm-ppc/smplock.h Mon Dec 11 17:20:52 2000 @@ -26,6 +26,25 @@ spin_lock(&kernel_flag); \ } while (0) +#define DECLARE_LOCAL_LOCK_DEPTH(x) int x + +#define release_kernel_lock_save(local_depth) \ +do { \ + (local_depth) = current->lock_depth; \ + if ((local_depth) >= 0) { \ + current->lock_depth = -1; \ + spin_unlock(&kernel_flag); \ + } \ +} while (0) + +#define reacquire_kernel_lock_restore(local_depth) \ +do { \ + if ((local_depth) >= 0) { \ + current->lock_depth = local_depth; \ + spin_lock(&kernel_flag); \ + } \ +} while (0) + /* * Getting the big kernel lock. diff -urN 2.2.18/include/asm-ppc/stat.h 2.2.18aa1/include/asm-ppc/stat.h --- 2.2.18/include/asm-ppc/stat.h Mon Jan 17 16:44:45 2000 +++ 2.2.18aa1/include/asm-ppc/stat.h Mon Dec 11 17:20:51 2000 @@ -37,4 +37,29 @@ unsigned long __unused5; }; +/* This matches struct stat64 in glibc2.1. + */ +struct stat64 { + unsigned long long st_dev; /* Device. */ + unsigned long long st_ino; /* File serial number. */ + unsigned int st_mode; /* File mode. */ + unsigned int st_nlink; /* Link count. */ + unsigned int st_uid; /* User ID of the file's owner. */ + unsigned int st_gid; /* Group ID of the file's group. */ + unsigned long long st_rdev; /* Device number, if device. */ + unsigned short int __pad2; + long long st_size; /* Size of file, in bytes. */ + long st_blksize; /* Optimal block size for I/O. */ + + long long st_blocks; /* Number 512-byte blocks allocated. */ + long st_atime; /* Time of last access. */ + unsigned long int __unused1; + long st_mtime; /* Time of last modification. */ + unsigned long int __unused2; + long st_ctime; /* Time of last status change. 
*/ + unsigned long int __unused3; + unsigned long int __unused4; + unsigned long int __unused5; +}; + #endif diff -urN 2.2.18/include/asm-ppc/unistd.h 2.2.18aa1/include/asm-ppc/unistd.h --- 2.2.18/include/asm-ppc/unistd.h Mon Dec 11 16:58:05 2000 +++ 2.2.18aa1/include/asm-ppc/unistd.h Mon Dec 11 17:20:51 2000 @@ -194,11 +194,18 @@ #define __NR_getpmsg 187 /* some people actually want streams */ #define __NR_putpmsg 188 /* some people actually want streams */ #define __NR_vfork 189 - +#define __NR_mmap2 192 +#define __NR_truncate64 193 +#define __NR_ftruncate64 194 +#define __NR_stat64 195 +#define __NR_lstat64 196 +#define __NR_fstat64 197 #define __NR_pciconfig_read 198 #define __NR_pciconfig_write 199 #define __NR_pciconfig_iobase 200 #define __NR_multiplexer 201 +#define __NR_getdents64 202 +#define __NR_fcntl64 203 #define __NR(n) #n diff -urN 2.2.18/include/asm-sparc/fcntl.h 2.2.18aa1/include/asm-sparc/fcntl.h --- 2.2.18/include/asm-sparc/fcntl.h Mon Jan 17 16:44:46 2000 +++ 2.2.18aa1/include/asm-sparc/fcntl.h Mon Dec 11 17:20:51 2000 @@ -19,6 +19,7 @@ #define O_NOCTTY 0x8000 /* not fcntl */ #define O_DIRECTORY 0x10000 /* must be a directory */ #define O_NOFOLLOW 0x20000 /* don't follow links */ +#define O_LARGEFILE 0x40000 /* LFS */ #define F_DUPFD 0 /* dup */ #define F_GETFD 1 /* get f_flags */ @@ -32,6 +33,9 @@ #define F_SETLKW 9 #define F_SETSIG 10 /* for sockets. */ #define F_GETSIG 11 /* for sockets. */ +#define F_GETLK64 12 +#define F_SETLK64 13 +#define F_SETLKW64 14 /* for F_[GET|SET]FL */ #define FD_CLOEXEC 1 /* actually anything with low bit set goes */ @@ -57,6 +61,15 @@ short l_whence; off_t l_start; off_t l_len; + pid_t l_pid; + short __unused; +}; + +struct flock64 { + short l_type; + short l_whence; + loff_t l_start; + loff_t l_len; pid_t l_pid; short __unused; }; diff -urN 2.2.18/include/asm-sparc/md.h 2.2.18aa1/include/asm-sparc/md.h --- 2.2.18/include/asm-sparc/md.h Mon Jan 17 16:44:46 2000 +++ 2.2.18aa1/include/asm-sparc/md.h Thu Jan 1 01:00:00 1970 @@ -1,13 +0,0 @@ -/* $Id: md.h,v 1.1 1997/12/15 15:12:39 jj Exp $ - * md.h: High speed xor_block operation for RAID4/5 - * - */ - -#ifndef __ASM_MD_H -#define __ASM_MD_H - -/* #define HAVE_ARCH_XORBLOCK */ - -#define MD_XORBLOCK_ALIGNMENT sizeof(long) - -#endif /* __ASM_MD_H */ diff -urN 2.2.18/include/asm-sparc/poll.h 2.2.18aa1/include/asm-sparc/poll.h --- 2.2.18/include/asm-sparc/poll.h Mon Jan 17 16:44:46 2000 +++ 2.2.18aa1/include/asm-sparc/poll.h Mon Dec 11 17:20:44 2000 @@ -11,6 +11,7 @@ #define POLLWRNORM POLLOUT #define POLLRDBAND 128 #define POLLWRBAND 256 +#define POLLMSG 512 struct pollfd { int fd; diff -urN 2.2.18/include/asm-sparc/smplock.h 2.2.18aa1/include/asm-sparc/smplock.h --- 2.2.18/include/asm-sparc/smplock.h Mon Dec 11 16:58:05 2000 +++ 2.2.18aa1/include/asm-sparc/smplock.h Mon Dec 11 17:20:52 2000 @@ -26,6 +26,25 @@ spin_lock(&kernel_flag); \ } while (0) +#define DECLARE_LOCAL_LOCK_DEPTH(x) int x + +#define release_kernel_lock_save(local_depth) \ +do { \ + (local_depth) = current->lock_depth; \ + if ((local_depth) >= 0) { \ + current->lock_depth = -1; \ + spin_unlock(&kernel_flag); \ + } \ +} while (0) + +#define reacquire_kernel_lock_restore(local_depth) \ +do { \ + if ((local_depth) >= 0) { \ + current->lock_depth = local_depth; \ + spin_lock(&kernel_flag); \ + } \ +} while (0) + /* * Getting the big kernel lock. 
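The same release_kernel_lock_save()/reacquire_kernel_lock_restore() pair is added to every SMP architecture's smplock.h, so generic code can drop the big kernel lock around a slow operation without knowing whether its caller held it; the saved lock_depth preserves nesting when the lock is retaken. A sketch of the intended use (the worker function is hypothetical; the uaccess.h hunks earlier apply exactly this pattern around long user copies):

static void slow_path(void)
{
	DECLARE_LOCAL_LOCK_DEPTH(lock_depth);

	release_kernel_lock_save(lock_depth);	/* no-op if BKL not held */
	do_bulk_work();				/* hypothetical long-running work */
	conditional_schedule();			/* give up the CPU if asked to */
	reacquire_kernel_lock_restore(lock_depth);
}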
diff -urN 2.2.18/include/asm-sparc/stat.h 2.2.18aa1/include/asm-sparc/stat.h --- 2.2.18/include/asm-sparc/stat.h Mon Jan 17 16:44:46 2000 +++ 2.2.18aa1/include/asm-sparc/stat.h Mon Dec 11 17:20:51 2000 @@ -1,4 +1,4 @@ -/* $Id: stat.h,v 1.9 1998/07/26 05:24:39 davem Exp $ */ +/* $Id: stat.h,v 1.10 1999/12/21 14:09:41 jj Exp $ */ #ifndef _SPARC_STAT_H #define _SPARC_STAT_H @@ -36,6 +36,42 @@ off_t st_blksize; off_t st_blocks; unsigned long __unused4[2]; +}; + +struct stat64 { + unsigned char __pad0[6]; + unsigned short st_dev; + + unsigned long long st_ino; + + unsigned int st_mode; + unsigned int st_nlink; + + unsigned int st_uid; + unsigned int st_gid; + + unsigned char __pad2[6]; + unsigned short st_rdev; + + unsigned char __pad3[8]; + + long long st_size; + unsigned int st_blksize; + + unsigned char __pad4[8]; + unsigned int st_blocks; + + unsigned int st_atime; + unsigned int __unused1; + + unsigned int st_mtime; + unsigned int __unused2; + + unsigned int st_ctime; + unsigned int __unused3; + + unsigned int __unused4; + unsigned int __unused5; }; #endif diff -urN 2.2.18/include/asm-sparc/unistd.h 2.2.18aa1/include/asm-sparc/unistd.h --- 2.2.18/include/asm-sparc/unistd.h Mon Jan 17 16:44:46 2000 +++ 2.2.18aa1/include/asm-sparc/unistd.h Mon Dec 11 17:20:51 2000 @@ -71,14 +71,14 @@ /* #define __NR_mctl 53 SunOS specific */ #define __NR_ioctl 54 /* Common */ #define __NR_reboot 55 /* Common */ -/* #define __NR_ni_syscall 56 ENOSYS under SunOS */ +#define __NR_mmap2 56 /* Linux sparc32 Specific */ #define __NR_symlink 57 /* Common */ #define __NR_readlink 58 /* Common */ #define __NR_execve 59 /* Common */ #define __NR_umask 60 /* Common */ #define __NR_chroot 61 /* Common */ #define __NR_fstat 62 /* Common */ -/* #define __NR_ni_syscall 63 ENOSYS under SunOS */ +#define __NR_fstat64 63 /* Linux sparc32 Specific */ #define __NR_getpagesize 64 /* Common */ #define __NR_msync 65 /* Common in newer 1.3.x revs... */ #define __NR_vfork 66 /* Common */ @@ -92,14 +92,14 @@ #define __NR_mprotect 74 /* Common */ /* #define __NR_madvise 75 SunOS Specific */ #define __NR_vhangup 76 /* Common */ -/* #define __NR_ni_syscall 77 ENOSYS under SunOS */ +#define __NR_truncate64 77 /* Linux sparc32 Specific */ /* #define __NR_mincore 78 SunOS Specific */ #define __NR_getgroups 79 /* Common */ #define __NR_setgroups 80 /* Common */ #define __NR_getpgrp 81 /* Common */ /* #define __NR_setpgrp 82 setpgid, same difference... 
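With stat64 and the *stat64 syscall slots now present per architecture, applications can finally see sizes past 2 GB. A hedged user-space sketch assuming an LFS-aware libc with glibc 2.1 naming:

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

/* st_size is a long long in struct stat64, so a large file reports
 * its true size instead of failing the 32-bit stat with EOVERFLOW. */
long long file_size(const char *path)
{
	struct stat64 st;

	if (stat64(path, &st) < 0)
		return -1;
	return st.st_size;
}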
*/ #define __NR_setitimer 83 /* Common */ -/* #define __NR_ni_syscall 84 ENOSYS under SunOS */ +#define __NR_ftruncate64 84 /* Linux sparc32 Specific */ #define __NR_swapon 85 /* Common */ #define __NR_getitimer 86 /* Common */ /* #define __NR_gethostname 87 SunOS Specific */ @@ -147,18 +147,18 @@ #define __NR_truncate 129 /* Common */ #define __NR_ftruncate 130 /* Common */ #define __NR_flock 131 /* Common */ -/* #define __NR_ni_syscall 132 ENOSYS under SunOS */ +#define __NR_lstat64 132 /* Linux sparc32 Specific */ #define __NR_sendto 133 /* Common */ #define __NR_shutdown 134 /* Common */ #define __NR_socketpair 135 /* Common */ #define __NR_mkdir 136 /* Common */ #define __NR_rmdir 137 /* Common */ #define __NR_utimes 138 /* SunOS Specific */ -/* #define __NR_ni_syscall 139 ENOSYS under SunOS */ +#define __NR_stat64 139 /* Linux sparc32 Specific */ /* #define __NR_adjtime 140 SunOS Specific */ #define __NR_getpeername 141 /* Common */ /* #define __NR_gethostid 142 SunOS Specific */ -/* #define __NR_ni_syscall 143 ENOSYS under SunOS */ +#define __NR_fcntl64 143 /* Linux sparc32 Specific */ #define __NR_getrlimit 144 /* Common */ #define __NR_setrlimit 145 /* Common */ /* #define __NR_killpg 146 SunOS Specific */ @@ -169,7 +169,7 @@ /* #define __NR_getmsg 151 SunOS Specific */ /* #define __NR_putmsg 152 SunOS Specific */ #define __NR_poll 153 /* Common */ -/* #define __NR_ni_syscall 154 ENOSYS under SunOS */ +#define __NR_getdents64 154 /* Linux Specific */ /* #define __NR_nfssvc 155 SunOS Specific */ /* #define __NR_getdirentries 156 SunOS Specific */ #define __NR_statfs 157 /* Common */ diff -urN 2.2.18/include/asm-sparc64/fcntl.h 2.2.18aa1/include/asm-sparc64/fcntl.h --- 2.2.18/include/asm-sparc64/fcntl.h Mon Jan 17 16:44:46 2000 +++ 2.2.18aa1/include/asm-sparc64/fcntl.h Mon Dec 11 17:20:51 2000 @@ -19,6 +19,7 @@ #define O_NOCTTY 0x8000 /* not fcntl */ #define O_DIRECTORY 0x10000 /* must be a directory */ #define O_NOFOLLOW 0x20000 /* don't follow links */ +#define O_LARGEFILE 0x40000 #define F_DUPFD 0 /* dup */ #define F_GETFD 1 /* get f_flags */ @@ -32,6 +33,11 @@ #define F_SETLKW 9 #define F_SETSIG 10 /* for sockets. */ #define F_GETSIG 11 /* for sockets. */ +#ifdef __KERNEL__ +#define F_GETLK64 12 +#define F_SETLK64 13 +#define F_SETLKW64 14 +#endif /* for F_[GET|SET]FL */ #define FD_CLOEXEC 1 /* actually anything with low bit set goes */ @@ -58,7 +64,6 @@ off_t l_start; off_t l_len; pid_t l_pid; - short __unused; }; #ifdef __KERNEL__ @@ -70,6 +75,17 @@ __kernel_pid_t32 l_pid; short __unused; }; + +struct flock32_64 { + short l_type; + short l_whence; + __kernel_loff_t32 l_start; + __kernel_loff_t32 l_len; + __kernel_pid_t32 l_pid; + short __unused; +}; + +#define flock64 flock #endif #endif /* !(_SPARC64_FCNTL_H) */ diff -urN 2.2.18/include/asm-sparc64/md.h 2.2.18aa1/include/asm-sparc64/md.h --- 2.2.18/include/asm-sparc64/md.h Mon Jan 17 16:44:46 2000 +++ 2.2.18aa1/include/asm-sparc64/md.h Thu Jan 1 01:00:00 1970 @@ -1,91 +0,0 @@ -/* $Id: md.h,v 1.2 1997/12/27 16:28:38 jj Exp $ - * md.h: High speed xor_block operation for RAID4/5 - * utilizing the UltraSparc Visual Instruction Set. 
- * - * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) - */ - -#ifndef __ASM_MD_H -#define __ASM_MD_H - -#include -#include - -#define HAVE_ARCH_XORBLOCK - -#define MD_XORBLOCK_ALIGNMENT 64 - -/* void __xor_block (char *dest, char *src, long len) - * { - * while (len--) *dest++ ^= *src++; - * } - * - * Requirements: - * !(((long)dest | (long)src) & (MD_XORBLOCK_ALIGNMENT - 1)) && - * !(len & 127) && len >= 256 - */ - -static inline void __xor_block (char *dest, char *src, long len) -{ - __asm__ __volatile__ (" - wr %%g0, %3, %%fprs - wr %%g0, %4, %%asi - membar #LoadStore|#StoreLoad|#StoreStore - sub %2, 128, %2 - ldda [%0] %4, %%f0 - ldda [%1] %4, %%f16 -1: ldda [%0 + 64] %%asi, %%f32 - fxor %%f0, %%f16, %%f16 - fxor %%f2, %%f18, %%f18 - fxor %%f4, %%f20, %%f20 - fxor %%f6, %%f22, %%f22 - fxor %%f8, %%f24, %%f24 - fxor %%f10, %%f26, %%f26 - fxor %%f12, %%f28, %%f28 - fxor %%f14, %%f30, %%f30 - stda %%f16, [%0] %4 - ldda [%1 + 64] %%asi, %%f48 - ldda [%0 + 128] %%asi, %%f0 - fxor %%f32, %%f48, %%f48 - fxor %%f34, %%f50, %%f50 - add %0, 128, %0 - fxor %%f36, %%f52, %%f52 - add %1, 128, %1 - fxor %%f38, %%f54, %%f54 - subcc %2, 128, %2 - fxor %%f40, %%f56, %%f56 - fxor %%f42, %%f58, %%f58 - fxor %%f44, %%f60, %%f60 - fxor %%f46, %%f62, %%f62 - stda %%f48, [%0 - 64] %%asi - bne,pt %%xcc, 1b - ldda [%1] %4, %%f16 - ldda [%0 + 64] %%asi, %%f32 - fxor %%f0, %%f16, %%f16 - fxor %%f2, %%f18, %%f18 - fxor %%f4, %%f20, %%f20 - fxor %%f6, %%f22, %%f22 - fxor %%f8, %%f24, %%f24 - fxor %%f10, %%f26, %%f26 - fxor %%f12, %%f28, %%f28 - fxor %%f14, %%f30, %%f30 - stda %%f16, [%0] %4 - ldda [%1 + 64] %%asi, %%f48 - membar #Sync - fxor %%f32, %%f48, %%f48 - fxor %%f34, %%f50, %%f50 - fxor %%f36, %%f52, %%f52 - fxor %%f38, %%f54, %%f54 - fxor %%f40, %%f56, %%f56 - fxor %%f42, %%f58, %%f58 - fxor %%f44, %%f60, %%f60 - fxor %%f46, %%f62, %%f62 - stda %%f48, [%0 + 64] %%asi - membar #Sync|#StoreStore|#StoreLoad - wr %%g0, 0, %%fprs - " : : - "r" (dest), "r" (src), "r" (len), "i" (FPRS_FEF), "i" (ASI_BLK_P) : - "cc", "memory"); -} - -#endif /* __ASM_MD_H */ diff -urN 2.2.18/include/asm-sparc64/poll.h 2.2.18aa1/include/asm-sparc64/poll.h --- 2.2.18/include/asm-sparc64/poll.h Mon Jan 17 16:44:46 2000 +++ 2.2.18aa1/include/asm-sparc64/poll.h Mon Dec 11 17:20:44 2000 @@ -11,6 +11,7 @@ #define POLLWRNORM POLLOUT #define POLLRDBAND 128 #define POLLWRBAND 256 +#define POLLMSG 512 struct pollfd { int fd; diff -urN 2.2.18/include/asm-sparc64/smplock.h 2.2.18aa1/include/asm-sparc64/smplock.h --- 2.2.18/include/asm-sparc64/smplock.h Mon Dec 11 16:58:05 2000 +++ 2.2.18aa1/include/asm-sparc64/smplock.h Mon Dec 11 17:20:52 2000 @@ -27,6 +27,25 @@ spin_lock(&kernel_flag); \ } while (0) +#define DECLARE_LOCAL_LOCK_DEPTH(x) int x + +#define release_kernel_lock_save(local_depth) \ +do { \ + (local_depth) = current->lock_depth; \ + if ((local_depth) >= 0) { \ + current->lock_depth = -1; \ + spin_unlock(&kernel_flag); \ + } \ +} while (0) + +#define reacquire_kernel_lock_restore(local_depth) \ +do { \ + if ((local_depth) >= 0) { \ + current->lock_depth = local_depth; \ + spin_lock(&kernel_flag); \ + } \ +} while (0) + /* * Getting the big kernel lock. 
diff -urN 2.2.18/include/asm-sparc64/stat.h 2.2.18aa1/include/asm-sparc64/stat.h --- 2.2.18/include/asm-sparc64/stat.h Mon Jan 17 16:44:46 2000 +++ 2.2.18aa1/include/asm-sparc64/stat.h Mon Dec 11 17:20:51 2000 @@ -1,4 +1,4 @@ -/* $Id: stat.h,v 1.5 1998/07/26 05:24:41 davem Exp $ */ +/* $Id: stat.h,v 1.6 1999/12/21 14:09:48 jj Exp $ */ #ifndef _SPARC64_STAT_H #define _SPARC64_STAT_H @@ -41,5 +41,46 @@ off_t st_blocks; unsigned long __unused4[2]; }; + +#ifdef __KERNEL__ +/* This is sparc32 stat64 structure. */ + +struct stat64 { + unsigned char __pad0[6]; + unsigned short st_dev; + + unsigned long long st_ino; + + unsigned int st_mode; + unsigned int st_nlink; + + unsigned int st_uid; + unsigned int st_gid; + + unsigned char __pad2[6]; + unsigned short st_rdev; + + unsigned char __pad3[8]; + + long long st_size; + unsigned int st_blksize; + + unsigned char __pad4[8]; + unsigned int st_blocks; + + unsigned int st_atime; + unsigned int __unused1; + + unsigned int st_mtime; + unsigned int __unused2; + + unsigned int st_ctime; + unsigned int __unused3; + + unsigned int __unused4; + unsigned int __unused5; +}; + +#endif #endif diff -urN 2.2.18/include/asm-sparc64/unistd.h 2.2.18aa1/include/asm-sparc64/unistd.h --- 2.2.18/include/asm-sparc64/unistd.h Mon Jan 17 16:44:46 2000 +++ 2.2.18aa1/include/asm-sparc64/unistd.h Mon Dec 11 17:20:51 2000 @@ -71,14 +71,14 @@ /* #define __NR_mctl 53 SunOS specific */ #define __NR_ioctl 54 /* Common */ #define __NR_reboot 55 /* Common */ -/* #define __NR_ni_syscall 56 ENOSYS under SunOS */ +/* #define __NR_mmap2 56 Linux sparc32 Specific */ #define __NR_symlink 57 /* Common */ #define __NR_readlink 58 /* Common */ #define __NR_execve 59 /* Common */ #define __NR_umask 60 /* Common */ #define __NR_chroot 61 /* Common */ #define __NR_fstat 62 /* Common */ -/* #define __NR_ni_syscall 63 ENOSYS under SunOS */ +/* #define __NR_fstat64 63 Linux sparc32 Specific */ #define __NR_getpagesize 64 /* Common */ #define __NR_msync 65 /* Common in newer 1.3.x revs... */ #define __NR_vfork 66 /* Common */ @@ -92,14 +92,14 @@ #define __NR_mprotect 74 /* Common */ /* #define __NR_madvise 75 SunOS Specific */ #define __NR_vhangup 76 /* Common */ -/* #define __NR_ni_syscall 77 ENOSYS under SunOS */ +/* #define __NR_truncate64 77 Linux sparc32 Specific */ /* #define __NR_mincore 78 SunOS Specific */ #define __NR_getgroups 79 /* Common */ #define __NR_setgroups 80 /* Common */ #define __NR_getpgrp 81 /* Common */ /* #define __NR_setpgrp 82 setpgid, same difference... 
*/ #define __NR_setitimer 83 /* Common */ -/* #define __NR_ni_syscall 84 ENOSYS under SunOS */ +/* #define __NR_ftruncate64 84 Linux sparc32 Specific */ #define __NR_swapon 85 /* Common */ #define __NR_getitimer 86 /* Common */ /* #define __NR_gethostname 87 SunOS Specific */ @@ -147,19 +147,19 @@ #define __NR_truncate 129 /* Common */ #define __NR_ftruncate 130 /* Common */ #define __NR_flock 131 /* Common */ -/* #define __NR_ni_syscall 132 ENOSYS under SunOS */ +/* #define __NR_lstat64 132 Linux sparc32 Specific */ #define __NR_sendto 133 /* Common */ #define __NR_shutdown 134 /* Common */ #define __NR_socketpair 135 /* Common */ #define __NR_mkdir 136 /* Common */ #define __NR_rmdir 137 /* Common */ #define __NR_utimes 138 /* SunOS Specific */ -/* #define __NR_ni_syscall 139 ENOSYS under SunOS */ +/* #define __NR_stat64 139 Linux sparc32 Specific */ /* #define __NR_adjtime 140 SunOS Specific */ #define __NR_getpeername 141 /* Common */ /* #define __NR_gethostid 142 SunOS Specific */ -/* #define __NR_ni_syscall 143 ENOSYS under SunOS */ -#define __NR_getrlimit 144 /* Common */ +/* #define __NR_fcntl64 143 Linux sparc32 Specific */ +#define __NR_getrlimit 144 /* Common */ #define __NR_setrlimit 145 /* Common */ /* #define __NR_killpg 146 SunOS Specific */ #define __NR_prctl 147 /* ENOSYS under SunOS */ @@ -169,8 +169,8 @@ /* #define __NR_getmsg 151 SunOS Specific */ /* #define __NR_putmsg 152 SunOS Specific */ #define __NR_poll 153 /* Common */ -/* #define __NR_ni_syscall 154 ENOSYS under SunOS */ -/* #define __NR_nfssvc 155 SunOS Specific */ +#define __NR_getdents64 154 /* Linux specific */ +/* #define __NR_fcntl64 155 Linux sparc32 Specific */ /* #define __NR_getdirentries 156 SunOS Specific */ #define __NR_statfs 157 /* Common */ #define __NR_fstatfs 158 /* Common */ diff -urN 2.2.18/include/linux/bigmem.h 2.2.18aa1/include/linux/bigmem.h --- 2.2.18/include/linux/bigmem.h Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/include/linux/bigmem.h Mon Dec 11 19:25:20 2000 @@ -0,0 +1,50 @@ +#ifndef _LINUX_BIGMEM_H +#define _LINUX_BIGMEM_H + +#include + +#ifdef CONFIG_BIGMEM + +#include + +/* declarations for linux/mm/bigmem.c */ +extern unsigned long bigmem_mapnr; +extern int nr_free_bigpages; + +extern struct page * prepare_bigmem_swapout(struct page *); +extern struct page * replace_with_bigmem(struct page *); +extern unsigned long prepare_bigmem_shm_swapin(unsigned long); + +#else /* CONFIG_BIGMEM */ + +#define prepare_bigmem_swapout(page) page +#define replace_with_bigmem(page) page +#define prepare_bigmem_shm_swapin(page) page +#define kmap(kaddr, type) kaddr +#define kunmap(vaddr, type) do { } while (0) +#define nr_free_bigpages 0 + +#endif /* CONFIG_BIGMEM */ + +/* when CONFIG_BIGMEM is not set these will be plain clear/copy_page */ +extern inline void clear_bigpage(unsigned long kaddr) +{ + unsigned long vaddr; + + vaddr = kmap(kaddr, KM_WRITE); + clear_page(vaddr); + kunmap(vaddr, KM_WRITE); +} + +extern inline void copy_bigpage(unsigned long to, unsigned long from) +{ + unsigned long vfrom, vto; + + vfrom = kmap(from, KM_READ); + vto = kmap(to, KM_WRITE); + copy_page(vto, vfrom); + kunmap(vfrom, KM_READ); + kunmap(vto, KM_WRITE); +} + +#endif /* _LINUX_BIGMEM_H */ diff -urN 2.2.18/include/linux/blkdev.h 2.2.18aa1/include/linux/blkdev.h --- 2.2.18/include/linux/blkdev.h Mon Dec 11 16:58:05 2000 +++ 2.2.18aa1/include/linux/blkdev.h Mon Dec 11 19:23:36 2000 @@ -36,6 +36,8 @@ }; typedef void (request_fn_proc) (void); +typedef int (makerq_fn_proc) (struct buffer_head *, int rw); +typedef int 
(map_fn_proc) (kdev_t, kdev_t *, unsigned long *, unsigned long, int); typedef struct request ** (queue_proc) (kdev_t dev); typedef struct elevator_s @@ -66,6 +68,8 @@ struct blk_dev_struct { request_fn_proc *request_fn; + makerq_fn_proc *makerq_fn; + map_fn_proc *map_fn; /* * queue_proc has to be atomic */ @@ -89,11 +93,6 @@ extern void resetup_one_dev(struct gendisk *dev, int drive); extern void unplug_device(void * data); extern void make_request(int major,int rw, struct buffer_head * bh); - -/* md needs this function to remap requests */ -extern int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size); -extern int md_make_request (int minor, int rw, struct buffer_head * bh); -extern int md_error (kdev_t mddev, kdev_t rdev); extern int * blk_size[MAX_BLKDEV]; diff -urN 2.2.18/include/linux/condsched.h 2.2.18aa1/include/linux/condsched.h --- 2.2.18/include/linux/condsched.h Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/include/linux/condsched.h Mon Dec 11 17:20:52 2000 @@ -0,0 +1,14 @@ +#ifndef _LINUX_CONDSCHED_H +#define _LINUX_CONDSCHED_H + +#ifndef __ASSEMBLY__ +#define conditional_schedule() \ +do { \ + if (current->need_resched) { \ + current->state = TASK_RUNNING; \ + schedule(); \ + } \ +} while(0) +#endif + +#endif diff -urN 2.2.18/include/linux/dcache.h 2.2.18aa1/include/linux/dcache.h --- 2.2.18/include/linux/dcache.h Mon Dec 11 16:58:05 2000 +++ 2.2.18aa1/include/linux/dcache.h Mon Dec 11 17:20:46 2000 @@ -106,6 +106,10 @@ * If this dentry points to a directory, then * s_nfsd_free_path semaphore will be down */ +#define DCACHE_REFERENCED 0x0008 /* This dentry is been recently + * referenced so try to keep it in + * cache. + */ /* * d_drop() unhashes the entry from the parent @@ -149,7 +153,7 @@ /* dcache memory management */ extern void shrink_dcache_memory(int, unsigned int); extern void check_dcache_memory(void); -extern void free_inode_memory(int); /* defined in fs/inode.c */ +extern void free_inode_memory(void); /* defined in fs/inode.c */ /* only used at mount-time */ extern struct dentry * d_alloc_root(struct inode * root_inode, struct dentry * old_root); diff -urN 2.2.18/include/linux/dirent.h 2.2.18aa1/include/linux/dirent.h --- 2.2.18/include/linux/dirent.h Tue May 25 00:49:30 1999 +++ 2.2.18aa1/include/linux/dirent.h Mon Dec 11 17:20:51 2000 @@ -8,4 +8,12 @@ char d_name[256]; /* We must not include limits.h! 
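conditional_schedule() (linux/condsched.h above) costs a single flag test when no reschedule is pending, so it can be sprinkled through long kernel loops to bound scheduling latency. A sketch, with a hypothetical per-item work function:

static void process_many(int count)
{
	int i;

	for (i = 0; i < count; i++) {
		process_one(i);			/* hypothetical unit of work */
		conditional_schedule();		/* schedule() only if need_resched */
	}
}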
*/ }; +struct dirent64 { + __u64 d_ino; + __s64 d_off; + unsigned short d_reclen; + unsigned char d_type; + char d_name[256]; +}; + #endif diff -urN 2.2.18/include/linux/elfcore.h 2.2.18aa1/include/linux/elfcore.h --- 2.2.18/include/linux/elfcore.h Tue Nov 14 03:38:17 2000 +++ 2.2.18aa1/include/linux/elfcore.h Mon Dec 11 19:27:38 2000 @@ -20,6 +20,7 @@ typedef elf_greg_t greg_t; typedef elf_gregset_t gregset_t; typedef elf_fpregset_t fpregset_t; +typedef elf_fpxregset_t fpxregset_t; #define NGREG ELF_NGREG #endif diff -urN 2.2.18/include/linux/ext2_fs_i.h 2.2.18aa1/include/linux/ext2_fs_i.h --- 2.2.18/include/linux/ext2_fs_i.h Mon Dec 11 16:58:05 2000 +++ 2.2.18aa1/include/linux/ext2_fs_i.h Mon Dec 11 17:20:51 2000 @@ -35,7 +35,6 @@ __u32 i_next_alloc_goal; __u32 i_prealloc_block; __u32 i_prealloc_count; - __u32 i_high_size; int i_new_inode:1; /* Is a freshly allocated inode */ }; diff -urN 2.2.18/include/linux/fs.h 2.2.18aa1/include/linux/fs.h --- 2.2.18/include/linux/fs.h Mon Dec 11 16:58:05 2000 +++ 2.2.18aa1/include/linux/fs.h Mon Dec 11 19:23:34 2000 @@ -185,6 +185,8 @@ #define BH_Lock 2 /* 1 if the buffer is locked */ #define BH_Req 3 /* 0 if the buffer has been invalidated */ #define BH_Protected 6 /* 1 if the buffer is protected */ +#define BH_Wait_IO 7 /* 1 if we should throttle on this buffer */ +#define BH_LowPrio 8 /* 1 if the buffer is lowprio */ /* * Try to keep the most commonly used fields in single cache lines (16 @@ -226,6 +228,8 @@ */ void (*b_end_io)(struct buffer_head *bh, int uptodate); void *b_dev_id; + void *b_pdata; + void (*b_rm_fn)(struct buffer_head *bh); }; typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); @@ -260,6 +264,25 @@ #define buffer_page(bh) (mem_map + MAP_NR((bh)->b_data)) #define touch_buffer(bh) set_bit(PG_referenced, &buffer_page(bh)->flags) +/* log of base-2 for filesystem uses, in case their super-blocks + don't have the shift counts readily calculated.. -- presuming + the divisors in question are power-of-two values! */ +static int fslog2(unsigned long val) __attribute__ ((const)); +static __inline__ int fslog2(unsigned long val) +{ + int i; + for (i = 0; val != 0; ++i, val >>= 1) { + if (val & 1) return i; + } + return 0; +} + +static int off_t_presentable(loff_t) __attribute((const)); +static __inline__ int off_t_presentable(loff_t loff) +{ + return loff >= 0 && loff <= (~0UL >> 1); +} + #include #include #include @@ -280,6 +303,7 @@ #include #include #include +#include /* * Attribute flags. 
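struct dirent64 above pairs with the new getdents64 syscall: 64-bit inode numbers and offsets, plus an inline d_type byte (see the DT_* codes added to fs.h below). A hedged sketch of raw use before libc support, via the standard _syscall3 stub:

#include <stdio.h>
#include <linux/dirent.h>
#include <asm/unistd.h>

_syscall3(int, getdents64, int, fd, char *, buf, unsigned int, count)

void list_dir(int fd)
{
	char buf[4096];
	struct dirent64 *d;
	int n, off;

	while ((n = getdents64(fd, buf, sizeof(buf))) > 0) {
		for (off = 0; off < n; off += d->d_reclen) {
			d = (struct dirent64 *) (buf + off);
			printf("%llu %u %s\n",
			       (unsigned long long) d->d_ino,
			       d->d_type, d->d_name);
		}
	}
}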
These should be or-ed together to figure out what @@ -311,7 +335,7 @@ umode_t ia_mode; uid_t ia_uid; gid_t ia_gid; - off_t ia_size; + loff_t ia_size; time_t ia_atime; time_t ia_mtime; time_t ia_ctime; @@ -346,7 +370,7 @@ uid_t i_uid; gid_t i_gid; kdev_t i_rdev; - off_t i_size; + loff_t i_size; time_t i_atime; time_t i_mtime; time_t i_ctime; @@ -395,6 +419,7 @@ struct adfs_inode_info adfs_i; struct qnx4_inode_info qnx4_i; struct usbdev_inode_info usbdev_i; + struct gfs_inode_info gfs_i; struct socket socket_i; void *generic_ip; } u; @@ -425,7 +450,7 @@ mode_t f_mode; loff_t f_pos; unsigned int f_count, f_flags; - unsigned long f_reada, f_ramax, f_raend, f_ralen, f_rawin; + loff_t f_reada, f_ramax, f_raend, f_ralen, f_rawin; struct fown_struct f_owner; unsigned int f_uid, f_gid; int f_error; @@ -465,8 +490,8 @@ struct file *fl_file; unsigned char fl_flags; unsigned char fl_type; - off_t fl_start; - off_t fl_end; + loff_t fl_start; + loff_t fl_end; void (*fl_notify)(struct file_lock *); /* unblock callback */ void (*fl_insert)(struct file_lock *); /* lock insertion callback */ @@ -477,6 +502,9 @@ } fl_u; }; +#define OFFSET_MAX ((loff_t)((~0ULL)>>1)) +#define OFFT_OFFSET_MAX ((off_t)((~0UL)>>1)) + extern struct file_lock *file_lock_table; #include @@ -484,6 +512,9 @@ extern int fcntl_getlk(unsigned int fd, struct flock *l); extern int fcntl_setlk(unsigned int fd, unsigned int cmd, struct flock *l); +extern int fcntl_getlk64(unsigned int fd, struct flock64 *l); +extern int fcntl_setlk64(unsigned int fd, unsigned int cmd, struct flock64 *l); + /* fs/locks.c */ extern void locks_remove_posix(struct file *, fl_owner_t id); extern void locks_remove_flock(struct file *); @@ -590,12 +621,25 @@ extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); /* + * File types + */ +#define DT_UNKNOWN 0 +#define DT_FIFO 1 +#define DT_CHR 2 +#define DT_DIR 4 +#define DT_BLK 6 +#define DT_REG 8 +#define DT_LNK 10 +#define DT_SOCK 12 +#define DT_WHT 14 + +/* * This is the "filldir" function type, used by readdir() to let * the kernel specify what kind of dirent layout it wants to have. * This allows the kernel to read directories into kernel space or * to have different dirent layouts depending on the binary type. 
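The filldir_t change below is source-visible to every filesystem: the callback gains a final unsigned argument carrying one of the DT_* codes above, letting readdir report entry types without a per-entry stat. A sketch of the call site inside a filesystem's readdir loop (field names are illustrative; filesystems with no cheap type information pass DT_UNKNOWN):

	if (filldir(dirent, de->name, de->name_len,
		    filp->f_pos, de->inode, DT_UNKNOWN) < 0)
		break;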
*/ -typedef int (*filldir_t)(void *, const char *, int, off_t, ino_t); +typedef int (*filldir_t)(void *, const char *, int, off_t, ino_t, unsigned); struct file_operations { loff_t (*llseek) (struct file *, loff_t, int); @@ -725,7 +769,7 @@ asmlinkage int sys_open(const char *, int, int); asmlinkage int sys_close(unsigned int); /* yes, it's really unsigned */ -extern int do_truncate(struct dentry *, unsigned long); +extern int do_truncate(struct dentry *, loff_t); extern struct file *filp_open(const char *, int, int); extern int filp_close(struct file *, fl_owner_t id); @@ -734,7 +778,7 @@ #define __getname() ((char *) __get_free_page(GFP_KERNEL)) #define putname(name) free_page((unsigned long)(name)) -extern void kill_fasync(struct fasync_struct *fa, int sig); +extern void kill_fasync(struct fasync_struct *fa, int sig, int band); extern int register_blkdev(unsigned int, const char *, struct file_operations *); extern int unregister_blkdev(unsigned int major, const char * name); extern int blkdev_open(struct inode * inode, struct file * filp); @@ -777,7 +821,7 @@ extern void refile_buffer(struct buffer_head * buf); extern void set_writetime(struct buffer_head * buf, int flag); -extern int try_to_free_buffers(struct page *, int wait); +extern int try_to_free_buffers(struct page *, int); extern int nr_buffers; extern long buffermem; @@ -786,18 +830,47 @@ #define BUF_CLEAN 0 #define BUF_LOCKED 1 /* Buffers scheduled for write */ #define BUF_DIRTY 2 /* Dirty buffers, not yet scheduled for write */ -#define NR_LIST 3 +#define BUF_PROTECTED 3 /* Ramdisk persistent storage */ +#define NR_LIST 4 void mark_buffer_uptodate(struct buffer_head * bh, int on); +extern inline void mark_buffer_protected(struct buffer_head * bh) +{ + if (!test_and_set_bit(BH_Protected, &bh->b_state)) { + if (bh->b_list != BUF_PROTECTED) + refile_buffer(bh); + } +} + extern inline void mark_buffer_clean(struct buffer_head * bh) { if (test_and_clear_bit(BH_Dirty, &bh->b_state)) { if (bh->b_list == BUF_DIRTY) refile_buffer(bh); + clear_bit(BH_Wait_IO, &bh->b_state); } } +extern inline void mark_buffer_highprio(struct buffer_head * bh) +{ + clear_bit(BH_LowPrio, &bh->b_state); +} + +extern inline void mark_buffer_lowprio(struct buffer_head * bh) +{ + /* + * dirty buffers cannot be marked lowprio. + */ + if (!buffer_dirty(bh)) + set_bit(BH_LowPrio, &bh->b_state); +} + +static inline int buffer_lowprio(struct buffer_head * bh) +{ + return test_bit(BH_LowPrio, &bh->b_state); +} + extern inline void mark_buffer_dirty(struct buffer_head * bh, int flag) { if (!test_and_set_bit(BH_Dirty, &bh->b_state)) { @@ -805,6 +878,23 @@ if (bh->b_list != BUF_DIRTY) refile_buffer(bh); } + /* + * if a buffer gets marked dirty then it has to lose + * it's lowprio state. + */ + mark_buffer_highprio(bh); +} + +extern inline void mark_buffer_dirty_lowprio(struct buffer_head * bh) +{ + if (!test_and_set_bit(BH_Dirty, &bh->b_state)) { + if (bh->b_list != BUF_DIRTY) + refile_buffer(bh); + /* + * Mark it lowprio only if it was not dirty before! 
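The lowprio buffer bits above let bulk writers tag dirty data as background work: mark_buffer_dirty() always clears the flag, so a buffer stays low-priority only if nothing dirtied it through the normal path. A sketch of the producer side (hypothetical driver code):

	/* Stage bulk data as low-priority writeback so it will not
	 * compete with synchronous I/O on the same device. */
	memcpy(bh->b_data, src, bh->b_size);
	mark_buffer_uptodate(bh, 1);
	mark_buffer_dirty_lowprio(bh);	/* sets the flag only if bh was clean */
	brelse(bh);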
+ */ + set_bit(BH_LowPrio, &bh->b_state); + } } extern int check_disk_change(kdev_t dev); @@ -817,6 +907,7 @@ extern void sync_inodes(kdev_t dev); extern void write_inode_now(struct inode *inode); extern void sync_dev(kdev_t dev); +extern int sync_buffers(kdev_t dev, int wait); extern int fsync_dev(kdev_t dev); extern void sync_supers(kdev_t dev); extern int bmap(struct inode * inode,int block); @@ -882,6 +973,7 @@ extern struct buffer_head * find_buffer(kdev_t dev, int block, int size); extern void ll_rw_block(int, int, struct buffer_head * bh[]); extern int is_read_only(kdev_t); +extern int is_device_idle(kdev_t); extern void __brelse(struct buffer_head *); extern inline void brelse(struct buffer_head *buf) { @@ -897,8 +989,12 @@ extern void set_blocksize(kdev_t dev, int size); extern unsigned int get_hardblocksize(kdev_t dev); extern struct buffer_head * bread(kdev_t dev, int block, int size); +extern struct buffer_head * buffer_ready (kdev_t dev, int block, int size); +extern void bread_ahead (kdev_t dev, int block, int size); extern struct buffer_head * breada(kdev_t dev,int block, int size, unsigned int pos, unsigned int filesize); +extern struct buffer_head * breada_blocks(kdev_t dev,int block, + int size, int blocks); extern int brw_page(int, struct page *, kdev_t, int [], int, int); @@ -935,6 +1031,9 @@ extern void inode_setattr(struct inode *, struct iattr *); extern __u32 inode_generation_count; + +#define fs_down(sem) do { current->fs_locks++; down(sem); } while (0) +#define fs_up(sem) do { up(sem); current->fs_locks--; } while (0) #endif /* __KERNEL__ */ diff -urN 2.2.18/include/linux/genhd.h 2.2.18aa1/include/linux/genhd.h --- 2.2.18/include/linux/genhd.h Mon Dec 11 16:58:05 2000 +++ 2.2.18aa1/include/linux/genhd.h Mon Dec 11 19:23:36 2000 @@ -104,6 +104,7 @@ void *real_devices; /* internal use */ struct gendisk *next; + char **device_names; /* specified names */ }; #ifdef CONFIG_SOLARIS_X86_PARTITION diff -urN 2.2.18/include/linux/gfs_fs_i.h 2.2.18aa1/include/linux/gfs_fs_i.h --- 2.2.18/include/linux/gfs_fs_i.h Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/include/linux/gfs_fs_i.h Mon Dec 11 17:20:55 2000 @@ -0,0 +1,21 @@ +/* + * Copyright 1999 Sistina Software, Inc. + * + * Kludge to prevent pipes from writing over our inode pointer + * in 2.2. + */ + +#ifndef _GFS_FS_I +#define _GFS_FS_I + +struct gfs_inode_info +{ + union + { + struct pipe_inode_info crap1; + struct socket crap2; + } crap3; + void *generic_ip; +}; + +#endif /* _GFS_FS_I */ diff -urN 2.2.18/include/linux/iobuf.h 2.2.18aa1/include/linux/iobuf.h --- 2.2.18/include/linux/iobuf.h Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/include/linux/iobuf.h Mon Dec 11 19:23:36 2000 @@ -0,0 +1,82 @@ +/* + * iobuf.h + * + * Defines the structures used to track abstract kernel-space io buffers. + * + */ + +#ifndef __LINUX_IOBUF_H +#define __LINUX_IOBUF_H + +#include +#include + +/* + * The kiobuf structure describes a physical set of pages reserved + * locked for IO. The reference counts on each page will have been + * incremented, and the flags field will indicate whether or not we have + * pre-locked all of the pages for IO. + * + * kiobufs may be passed in arrays to form a kiovec, but we must + * preserve the property that no page is present more than once over the + * entire iovec. 
+ */ + +#define KIO_MAX_ATOMIC_IO 64 /* in kb */ +#define KIO_MAX_ATOMIC_BYTES (64 * 1024) +#define KIO_STATIC_PAGES (KIO_MAX_ATOMIC_IO / (PAGE_SIZE >> 10)) +#define KIO_MAX_SECTORS (KIO_MAX_ATOMIC_IO * 2) + +struct kiobuf +{ + int nr_pages; /* Pages actually referenced */ + int array_len; /* Space in the allocated lists */ + int offset; /* Offset to start of valid data */ + int length; /* Number of valid bytes of data */ + + /* Keep separate track of the physical addresses and page + * structs involved. If we do IO to a memory-mapped device + * region, there won't necessarily be page structs defined for + * every address. */ + + unsigned long * pagelist; + struct page ** maplist; + unsigned long * bouncelist; + + unsigned int locked : 1; /* If set, pages has been locked */ + unsigned int bounced : 1; /* If set, bounce pages are set up */ + + /* Always embed enough struct pages for 64k of IO */ + unsigned long page_array[KIO_STATIC_PAGES]; + struct page * map_array[KIO_STATIC_PAGES]; + unsigned long bounce_array[KIO_STATIC_PAGES]; +}; + + +/* mm/memory.c */ + +int map_user_kiobuf(int rw, struct kiobuf *, unsigned long va, size_t len); +void unmap_kiobuf(struct kiobuf *iobuf); + +/* fs/iobuf.c */ + +void __init kiobuf_init(void); +int alloc_kiovec(int nr, struct kiobuf **); +void free_kiovec(int nr, struct kiobuf **); +int expand_kiobuf(struct kiobuf *, int); +int setup_kiobuf_bounce_pages(struct kiobuf *, int gfp_mask); +void clear_kiobuf_bounce_pages(struct kiobuf *); +void kiobuf_copy_bounce(struct kiobuf *, int direction, int max); + +/* Direction codes for kiobuf_copy_bounce: */ +enum { + COPY_TO_BOUNCE, + COPY_FROM_BOUNCE +}; + +/* fs/buffer.c */ + +int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, unsigned long b[], int size); + +#endif /* __LINUX_IOBUF_H */ diff -urN 2.2.18/include/linux/kernel.h 2.2.18aa1/include/linux/kernel.h --- 2.2.18/include/linux/kernel.h Mon Dec 11 16:58:05 2000 +++ 2.2.18aa1/include/linux/kernel.h Mon Dec 11 17:20:48 2000 @@ -40,7 +40,6 @@ #define FASTCALL(x) x #endif -extern void math_error(void); extern struct notifier_block *panic_notifier_list; NORET_TYPE void panic(const char * fmt, ...) __attribute__ ((NORET_AND format (printf, 1, 2))); @@ -91,7 +90,9 @@ unsigned long totalswap; /* Total swap space size */ unsigned long freeswap; /* swap space still available */ unsigned short procs; /* Number of current processes */ - char _f[22]; /* Pads structure to 64 bytes */ + unsigned long totalbig; /* Total big memory size */ + unsigned long freebig; /* Available big memory size */ + char _f[22-3*sizeof(long)+sizeof(short)]; /* Padding: libc5 uses this.. 
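Taken together, the declarations above define the raw-I/O path: pin the caller's pages with map_user_kiobuf(), drive block I/O straight into them with brw_kiovec(), then release. A hedged kernel-side sketch of a read (error handling shortened; blocks[] follows the brw_kiovec convention of one entry per size-byte block):

#include <linux/iobuf.h>

int raw_read(kdev_t dev, char *ubuf, size_t len,
	     unsigned long blocks[], int blocksize)
{
	struct kiobuf *iobuf;
	int err;

	err = alloc_kiovec(1, &iobuf);
	if (err)
		return err;
	err = map_user_kiobuf(READ, iobuf, (unsigned long) ubuf, len);
	if (!err) {
		err = brw_kiovec(READ, 1, &iobuf, dev, blocks, blocksize);
		unmap_kiobuf(iobuf);
	}
	free_kiovec(1, &iobuf);
	return err;
}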
*/ }; #endif diff -urN 2.2.18/include/linux/lockd/nlm.h 2.2.18aa1/include/linux/lockd/nlm.h --- 2.2.18/include/linux/lockd/nlm.h Mon Dec 11 16:58:06 2000 +++ 2.2.18aa1/include/linux/lockd/nlm.h Mon Dec 11 17:20:52 2000 @@ -45,10 +45,10 @@ #define NLMPROC_CANCEL_RES 13 #define NLMPROC_UNLOCK_RES 14 #define NLMPROC_GRANTED_RES 15 +#define NLMPROC_NSM_NOTIFY 16 /* statd callback */ #define NLMPROC_SHARE 20 #define NLMPROC_UNSHARE 21 #define NLMPROC_NM_LOCK 22 #define NLMPROC_FREE_ALL 23 -#define NLMPROC_NSM_NOTIFY 24 /* statd callback */ #endif /* LINUX_LOCKD_NLM_H */ diff -urN 2.2.18/include/linux/locks.h 2.2.18aa1/include/linux/locks.h --- 2.2.18/include/linux/locks.h Tue Nov 14 23:09:55 2000 +++ 2.2.18aa1/include/linux/locks.h Mon Dec 11 19:23:36 2000 @@ -50,10 +50,12 @@ if (sb->s_lock) __wait_on_super(sb); sb->s_lock = 1; + current->fs_locks++; } extern inline void unlock_super(struct super_block * sb) { + current->fs_locks--; sb->s_lock = 0; wake_up(&sb->s_wait); } diff -urN 2.2.18/include/linux/lvm.h 2.2.18aa1/include/linux/lvm.h --- 2.2.18/include/linux/lvm.h Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/include/linux/lvm.h Mon Dec 11 19:23:54 2000 @@ -0,0 +1,883 @@ +/* + * include/linux/lvm.h + * kernel/lvm.h + * tools/lib/lvm.h + * + * Copyright (C) 1997 - 2000 Heinz Mauelshagen, Sistina Software + * + * February-November 1997 + * May-July 1998 + * January-March,July,September,October,Dezember 1999 + * January,February,July,November 2000 + * + * lvm is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * lvm is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + */ + +/* + * Changelog + * + * 10/10/1997 - beginning of new structure creation + * 12/05/1998 - incorporated structures from lvm_v1.h and deleted lvm_v1.h + * 07/06/1998 - avoided LVM_KMALLOC_MAX define by using vmalloc/vfree + * instead of kmalloc/kfree + * 01/07/1998 - fixed wrong LVM_MAX_SIZE + * 07/07/1998 - extended pe_t structure by ios member (for statistic) + * 02/08/1998 - changes for official char/block major numbers + * 07/08/1998 - avoided init_module() and cleanup_module() to be static + * 29/08/1998 - seprated core and disk structure type definitions + * 01/09/1998 - merged kernel integration version (mike) + * 20/01/1999 - added LVM_PE_DISK_OFFSET macro for use in + * vg_read_with_pv_and_lv(), pv_move_pe(), pv_show_pe_text()... 
+ * 18/02/1999 - added definition of time_disk_t structure for; + * keeps time stamps on disk for nonatomic writes (future) + * 15/03/1999 - corrected LV() and VG() macro definition to use argument + * instead of minor + * 03/07/1999 - define for genhd.c name handling + * 23/07/1999 - implemented snapshot part + * 08/12/1999 - changed LVM_LV_SIZE_MAX macro to reflect current 1TB limit + * 01/01/2000 - extended lv_v2 core structure by wait_queue member + * 12/02/2000 - integrated Andrea Arcagnelli's snapshot work + * 18/02/2000 - seperated user and kernel space parts by + * #ifdef them with __KERNEL__ + * 08/03/2000 - implemented cluster/shared bits for vg_access + * 26/06/2000 - implemented snapshot persistency and resizing support + * 02/11/2000 - added hash table size member to lv structure + * 12/11/2000 - removed unneeded timestamp definitions + * + */ + + +#ifndef _LVM_H_INCLUDE +#define _LVM_H_INCLUDE + +#define _LVM_KERNEL_H_VERSION "LVM 0.9 (13/11/2000)" + +#include +#include + +/* + * preprocessor definitions + */ +/* if you like emergency reset code in the driver */ +#define LVM_TOTAL_RESET + +#ifdef __KERNEL__ +#define LVM_HD_NAME /* display nice names in /proc/partitions */ + +/* lots of debugging output (see driver source) + #define DEBUG_LVM_GET_INFO + #define DEBUG + #define DEBUG_MAP + #define DEBUG_MAP_SIZE + #define DEBUG_IOCTL + #define DEBUG_READ + #define DEBUG_GENDISK + #define DEBUG_VG_CREATE + #define DEBUG_LVM_BLK_OPEN + #define DEBUG_KFREE + */ +#endif /* #ifdef __KERNEL__ */ + +#ifndef __KERNEL__ +#define __KERNEL__ +#include +#include +#undef __KERNEL__ +#else +#include +#include +#endif /* #ifndef __KERNEL__ */ + +#include +#include + +#ifdef __KERNEL__ +#if LINUX_VERSION_CODE >= KERNEL_VERSION ( 2, 3 ,0) +#include +#else +#include +#endif + +#include +#endif /* #ifdef __KERNEL__ */ + +#include + +#if !defined ( LVM_BLK_MAJOR) || !defined ( LVM_CHAR_MAJOR) +#error Bad include/linux/major.h - LVM MAJOR undefined +#endif + +#ifdef BLOCK_SIZE +#undef BLOCK_SIZE +#endif + +#ifdef CONFIG_ARCH_S390 +#define BLOCK_SIZE 4096 +#else +#define BLOCK_SIZE 1024 +#endif + +#ifndef SECTOR_SIZE +#define SECTOR_SIZE 512 +#endif + +#define LVM_STRUCT_VERSION 1 /* structure version */ + +#define LVM_DIR_PREFIX "/dev/" + +#ifndef min +#define min(a,b) (((a)<(b))?(a):(b)) +#endif +#ifndef max +#define max(a,b) (((a)>(b))?(a):(b)) +#endif + +/* set the default structure version */ +#if ( LVM_STRUCT_VERSION == 1) +#define pv_t pv_v2_t +#define lv_t lv_v4_t +#define vg_t vg_v3_t +#define pv_disk_t pv_disk_v2_t +#define lv_disk_t lv_disk_v3_t +#define vg_disk_t vg_disk_v2_t +#define lv_block_exception_t lv_block_exception_v1_t +#define lv_COW_table_disk_t lv_COW_table_disk_v1_t +#endif + + + +/* + * i/o protocol version + * + * defined here for the driver and defined seperate in the + * user land tools/lib/liblvm.h + * + */ +#define LVM_DRIVER_IOP_VERSION 10 + +#define LVM_NAME "lvm" +#define LVM_GLOBAL "global" +#define LVM_DIR "lvm" +#define LVM_VG_SUBDIR "VGs" +#define LVM_LV_SUBDIR "LVs" +#define LVM_PV_SUBDIR "PVs" + +/* + * VG/LV indexing macros + */ +/* character minor maps directly to volume group */ +#define VG_CHR(a) ( a) + +/* block minor indexes into a volume group/logical volume indirection table */ +#define VG_BLK(a) ( vg_lv_map[a].vg_number) +#define LV_BLK(a) ( vg_lv_map[a].lv_number) + +/* + * absolute limits for VGs, PVs per VG and LVs per VG + */ +#define ABS_MAX_VG 99 +#define ABS_MAX_PV 256 +#define ABS_MAX_LV 256 /* caused by 8 bit minor */ + +#define MAX_VG 
+#define MAX_VG ABS_MAX_VG
+#define MAX_LV ABS_MAX_LV
+#define MAX_PV ABS_MAX_PV
+
+#if ( MAX_VG > ABS_MAX_VG)
+#undef MAX_VG
+#define MAX_VG ABS_MAX_VG
+#endif
+
+#if ( MAX_LV > ABS_MAX_LV)
+#undef MAX_LV
+#define MAX_LV ABS_MAX_LV
+#endif
+
+
+/*
+ * VGDA: default disk spaces and offsets
+ *
+ * there's space after the structures for later extensions.
+ *
+ * offset               what                                size
+ * ---------------      ----------------------------------  ------------
+ * 0                    physical volume structure           ~500 byte
+ *
+ * 1K                   volume group structure              ~200 byte
+ *
+ * 5K                   time stamp structure                ~
+ *
+ * 6K                   namelist of physical volumes        128 byte each
+ *
+ * 6k + n * 128byte     n logical volume structures         ~300 byte each
+ *
+ * + m * 328byte        m physical extent alloc. structs    4 byte each
+ *
+ * End of disk -        first physical extent               typically 4 megabyte
+ * PE total *
+ * PE size
+ *
+ *
+ */
+
+/* DONT TOUCH THESE !!! */
+/* base of PV structure in disk partition */
+#define LVM_PV_DISK_BASE 0L
+
+/* size reserved for PV structure on disk */
+#define LVM_PV_DISK_SIZE 1024L
+
+/* base of VG structure in disk partition */
+#define LVM_VG_DISK_BASE LVM_PV_DISK_SIZE
+
+/* size reserved for VG structure */
+#define LVM_VG_DISK_SIZE ( 9 * 512L)
+
+/* size reserved for timekeeping */
+#define LVM_TIMESTAMP_DISK_BASE ( LVM_VG_DISK_BASE + LVM_VG_DISK_SIZE)
+#define LVM_TIMESTAMP_DISK_SIZE 512L /* reserved for timekeeping */
+
+/* name list of physical volumes on disk */
+#define LVM_PV_UUIDLIST_DISK_BASE ( LVM_TIMESTAMP_DISK_BASE + \
+                                    LVM_TIMESTAMP_DISK_SIZE)
+
+/* now for the dynamically calculated parts of the VGDA */
+#define LVM_LV_DISK_OFFSET(a, b) ( (a)->lv_on_disk.base + \
+                                   sizeof ( lv_disk_t) * b)
+#define LVM_DISK_SIZE(pv) ( (pv)->pe_on_disk.base + \
+                            (pv)->pe_on_disk.size)
+#define LVM_PE_DISK_OFFSET(pe, pv) ( pe * pv->pe_size + \
+                                     ( LVM_DISK_SIZE ( pv) / SECTOR_SIZE))
+#define LVM_PE_ON_DISK_BASE(pv) \
+   { int rest; \
+     pv->pe_on_disk.base = pv->lv_on_disk.base + pv->lv_on_disk.size; \
+     if ( ( rest = pv->pe_on_disk.base % SECTOR_SIZE) != 0) \
+        pv->pe_on_disk.base += ( SECTOR_SIZE - rest); \
+   }
+/* END default disk spaces and offsets for PVs */
+
+
+/*
+ * LVM_PE_T_MAX corresponds to:
+ *
+ * 8KB PE size can map a ~512 MB logical volume at the cost of 1MB memory,
+ *
+ * 128MB PE size can map an 8TB logical volume at the same memory cost.
+ *
+ * Default PE size of 4 MB gives a maximum logical volume size of 256 GB.
+ *
+ * Maximum PE size of 16GB gives a maximum logical volume size of 1024 TB.
+ *
+ * AFAIK, current kernels limit this to 1 TB.
+ *
+ * Should be a sufficient spectrum ;*)
+ */
+
+/* This is the usable size of pe_disk_t.le_num !!! v v */
+#define LVM_PE_T_MAX ( ( 1 << ( sizeof ( uint16_t) * 8)) - 2)
+
+#define LVM_LV_SIZE_MAX(a) ( ( long long) LVM_PE_T_MAX * (a)->pe_size > ( long long) 1024*1024/SECTOR_SIZE*1024*1024 ? ( long long) 1024*1024/SECTOR_SIZE*1024*1024 : ( long long) LVM_PE_T_MAX * (a)->pe_size)
+#define LVM_MIN_PE_SIZE ( 8192L / SECTOR_SIZE) /* 8 KB in sectors */
+#define LVM_MAX_PE_SIZE ( 16L * 1024L * 1024L / SECTOR_SIZE * 1024) /* 16GB in sectors */
+#define LVM_DEFAULT_PE_SIZE ( 4096L * 1024 / SECTOR_SIZE) /* 4 MB in sectors */
+#define LVM_DEFAULT_STRIPE_SIZE 16L /* 16 KB */
+#define LVM_MIN_STRIPE_SIZE ( PAGE_SIZE>>9) /* PAGESIZE in sectors */
+#define LVM_MAX_STRIPE_SIZE ( 512L * 1024 / SECTOR_SIZE) /* 512 KB in sectors */
+#define LVM_MAX_STRIPES 128 /* max # of stripes */
+#define LVM_MAX_SIZE ( 1024LU * 1024 / SECTOR_SIZE * 1024 * 1024) /* 1TB[sectors] */
+#define LVM_MAX_MIRRORS 2 /* future use */
+#define LVM_MIN_READ_AHEAD 2 /* minimum read ahead sectors */
+#define LVM_MAX_READ_AHEAD 120 /* maximum read ahead sectors */
+#define LVM_MAX_LV_IO_TIMEOUT 60 /* seconds I/O timeout (future use) */
+#define LVM_PARTITION 0xfe /* LVM partition id */
+#define LVM_NEW_PARTITION 0x8e /* new LVM partition id (10/09/1999) */
+#define LVM_PE_SIZE_PV_SIZE_REL 5 /* max relation between PV size and PE size */
+
+#define LVM_SNAPSHOT_MAX_CHUNK 1024 /* 1024 KB */
+#define LVM_SNAPSHOT_DEF_CHUNK 64 /* 64 KB */
+#define LVM_SNAPSHOT_MIN_CHUNK 1 /* 1 KB */
+
+#define UNDEF -1
+#define FALSE 0
+#define TRUE 1
+
+
+#define LVM_GET_COW_TABLE_CHUNKS_PER_PE(vg, lv) ( \
+   vg->pe_size / lv->lv_chunk_size)
+
+#define LVM_GET_COW_TABLE_ENTRIES_PER_PE(vg, lv) ( \
+{ \
+   int COW_table_entries_per_PE; \
+   int COW_table_chunks_per_PE; \
+\
+   COW_table_entries_per_PE = LVM_GET_COW_TABLE_CHUNKS_PER_PE(vg, lv); \
+   COW_table_chunks_per_PE = ( COW_table_entries_per_PE * sizeof(lv_COW_table_disk_t) / SECTOR_SIZE + lv->lv_chunk_size - 1) / lv->lv_chunk_size; \
+   COW_table_entries_per_PE - COW_table_chunks_per_PE;})
+
+
+/* to disk and to core data conversion macros */
+#if __BYTE_ORDER == __BIG_ENDIAN
+
+#define LVM_TO_CORE16(x) ( \
+   ((uint16_t)((((uint16_t)(x) & 0x00FFU) << 8) | \
+               (((uint16_t)(x) & 0xFF00U) >> 8))))
+
+#define LVM_TO_DISK16(x) LVM_TO_CORE16(x)
+
+#define LVM_TO_CORE32(x) ( \
+   ((uint32_t)((((uint32_t)(x) & 0x000000FFU) << 24) | \
+               (((uint32_t)(x) & 0x0000FF00U) << 8) | \
+               (((uint32_t)(x) & 0x00FF0000U) >> 8) | \
+               (((uint32_t)(x) & 0xFF000000U) >> 24))))
+
+#define LVM_TO_DISK32(x) LVM_TO_CORE32(x)
+
+#define LVM_TO_CORE64(x) \
+   ((uint64_t)((((uint64_t)(x) & 0x00000000000000FFULL) << 56) | \
+               (((uint64_t)(x) & 0x000000000000FF00ULL) << 40) | \
+               (((uint64_t)(x) & 0x0000000000FF0000ULL) << 24) | \
+               (((uint64_t)(x) & 0x00000000FF000000ULL) << 8) | \
+               (((uint64_t)(x) & 0x000000FF00000000ULL) >> 8) | \
+               (((uint64_t)(x) & 0x0000FF0000000000ULL) >> 24) | \
+               (((uint64_t)(x) & 0x00FF000000000000ULL) >> 40) | \
+               (((uint64_t)(x) & 0xFF00000000000000ULL) >> 56)))
+
+#define LVM_TO_DISK64(x) LVM_TO_CORE64(x)
+
+#elif __BYTE_ORDER == __LITTLE_ENDIAN
+
+#define LVM_TO_CORE16(x) x
+#define LVM_TO_DISK16(x) x
+#define LVM_TO_CORE32(x) x
+#define LVM_TO_DISK32(x) x
+#define LVM_TO_CORE64(x) x
+#define LVM_TO_DISK64(x) x
+
+#else
+
+#error "__BYTE_ORDER must be defined as __LITTLE_ENDIAN or __BIG_ENDIAN"
+
+#endif /* #if __BYTE_ORDER == __BIG_ENDIAN */
+
+
+/*
+ * ioctls
+ */
+/* volume group */
+#define VG_CREATE _IOW ( 0xfe, 0x00, 1)
+#define VG_REMOVE _IOW ( 0xfe, 0x01, 1)
+
+#define VG_EXTEND _IOW ( 0xfe, 0x03, 1)
+#define VG_REDUCE _IOW ( 0xfe, 0x04, 1)
+
+#define VG_STATUS _IOWR ( 0xfe, 0x05, 1)
+#define VG_STATUS_GET_COUNT _IOWR ( 0xfe, 0x06, 1)
+#define VG_STATUS_GET_NAMELIST _IOWR ( 0xfe, 0x07, 1)
+
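[Editorial sketch, not part of the patch.] A note on the to-disk/to-core conversion macros defined just above the ioctl list: the VGDA is kept in little endian byte order on disk, so on big endian machines every scalar member has to be swapped on its way in or out, while the little endian expansions compile away to nothing. A minimal, hypothetical sketch under those assumptions (vg_disk_v2_t is defined further down; per the structure comments below, the real conversions happen in the driver's pv_copy_*(), vg_copy_*() and lv_copy_*() functions):

    /* hypothetical example: convert a VG structure in place before it is
     * written to the on-disk VGDA; a no-op on little endian machines */
    static void vg_example_to_disk(vg_disk_v2_t *vg)
    {
            /* vg_uuid/vg_name_dummy are byte arrays: nothing to swap */
            vg->vg_number = LVM_TO_DISK32(vg->vg_number);
            vg->vg_access = LVM_TO_DISK32(vg->vg_access);
            vg->vg_status = LVM_TO_DISK32(vg->vg_status);
            vg->lv_max = LVM_TO_DISK32(vg->lv_max);
            vg->lv_cur = LVM_TO_DISK32(vg->lv_cur);
            vg->pe_size = LVM_TO_DISK32(vg->pe_size);
            vg->pe_total = LVM_TO_DISK32(vg->pe_total);
            vg->pe_allocated = LVM_TO_DISK32(vg->pe_allocated);
            /* ...and so on for the remaining uint32_t members */
    }
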
+#define VG_SET_EXTENDABLE _IOW ( 0xfe, 0x08, 1) +#define VG_RENAME _IOW ( 0xfe, 0x09, 1) + + +/* logical volume */ +#define LV_CREATE _IOW ( 0xfe, 0x20, 1) +#define LV_REMOVE _IOW ( 0xfe, 0x21, 1) + +#define LV_ACTIVATE _IO ( 0xfe, 0x22) +#define LV_DEACTIVATE _IO ( 0xfe, 0x23) + +#define LV_EXTEND _IOW ( 0xfe, 0x24, 1) +#define LV_REDUCE _IOW ( 0xfe, 0x25, 1) + +#define LV_STATUS_BYNAME _IOWR ( 0xfe, 0x26, 1) +#define LV_STATUS_BYINDEX _IOWR ( 0xfe, 0x27, 1) + +#define LV_SET_ACCESS _IOW ( 0xfe, 0x28, 1) +#define LV_SET_ALLOCATION _IOW ( 0xfe, 0x29, 1) +#define LV_SET_STATUS _IOW ( 0xfe, 0x2a, 1) + +#define LE_REMAP _IOW ( 0xfe, 0x2b, 1) + +#define LV_SNAPSHOT_USE_RATE _IOWR ( 0xfe, 0x2c, 1) + +#define LV_STATUS_BYDEV _IOWR ( 0xfe, 0x2e, 1) + +#define LV_RENAME _IOW ( 0xfe, 0x2f, 1) + +#define LV_BMAP _IOWR ( 0xfe, 0x30, 1) + + +/* physical volume */ +#define PV_STATUS _IOWR ( 0xfe, 0x40, 1) +#define PV_CHANGE _IOWR ( 0xfe, 0x41, 1) +#define PV_FLUSH _IOW ( 0xfe, 0x42, 1) + +/* physical extent */ +#define PE_LOCK_UNLOCK _IOW ( 0xfe, 0x50, 1) + +/* i/o protocol version */ +#define LVM_GET_IOP_VERSION _IOR ( 0xfe, 0x98, 1) + +#ifdef LVM_TOTAL_RESET +/* special reset function for testing purposes */ +#define LVM_RESET _IO ( 0xfe, 0x99) +#endif + +/* lock the logical volume manager */ +#define LVM_LOCK_LVM _IO ( 0xfe, 0x100) +/* END ioctls */ + + +/* + * Status flags + */ +/* volume group */ +#define VG_ACTIVE 0x01 /* vg_status */ +#define VG_EXPORTED 0x02 /* " */ +#define VG_EXTENDABLE 0x04 /* " */ + +#define VG_READ 0x01 /* vg_access */ +#define VG_WRITE 0x02 /* " */ +#define VG_CLUSTERED 0x04 /* " */ +#define VG_SHARED 0x08 /* " */ + +/* logical volume */ +#define LV_ACTIVE 0x01 /* lv_status */ +#define LV_SPINDOWN 0x02 /* " */ + +#define LV_READ 0x01 /* lv_access */ +#define LV_WRITE 0x02 /* " */ +#define LV_SNAPSHOT 0x04 /* " */ +#define LV_SNAPSHOT_ORG 0x08 /* " */ + +#define LV_BADBLOCK_ON 0x01 /* lv_badblock */ + +#define LV_STRICT 0x01 /* lv_allocation */ +#define LV_CONTIGUOUS 0x02 /* " */ + +/* physical volume */ +#define PV_ACTIVE 0x01 /* pv_status */ +#define PV_ALLOCATABLE 0x02 /* pv_allocatable */ + + +/* + * Structure definitions core/disk follow + * + * conditional conversion takes place on big endian architectures + * in functions * pv_copy_*(), vg_copy_*() and lv_copy_*() + * + */ + +#define NAME_LEN 128 /* don't change!!! */ +#define UUID_LEN 32 /* don't change!!! 
*/ + +/* copy on write tables in disk format */ +typedef struct { + uint64_t pv_org_number; + uint64_t pv_org_rsector; + uint64_t pv_snap_number; + uint64_t pv_snap_rsector; +} lv_COW_table_disk_v1_t; + +/* remap physical sector/rdev pairs including hash */ +typedef struct { + struct list_head hash; + ulong rsector_org; + kdev_t rdev_org; + ulong rsector_new; + kdev_t rdev_new; +} lv_block_exception_v1_t; + +/* disk stored pe information */ +typedef struct { + uint16_t lv_num; + uint16_t le_num; +} pe_disk_t; + +/* disk stored PV, VG, LV and PE size and offset information */ +typedef struct { + uint32_t base; + uint32_t size; +} lvm_disk_data_t; + + +/* + * Structure Physical Volume (PV) Version 1 + */ + +/* core */ +typedef struct { + char id[2]; /* Identifier */ + unsigned short version; /* HM lvm version */ + lvm_disk_data_t pv_on_disk; + lvm_disk_data_t vg_on_disk; + lvm_disk_data_t pv_namelist_on_disk; + lvm_disk_data_t lv_on_disk; + lvm_disk_data_t pe_on_disk; + char pv_name[NAME_LEN]; + char vg_name[NAME_LEN]; + char system_id[NAME_LEN]; /* for vgexport/vgimport */ + kdev_t pv_dev; + uint pv_number; + uint pv_status; + uint pv_allocatable; + uint pv_size; /* HM */ + uint lv_cur; + uint pe_size; + uint pe_total; + uint pe_allocated; + uint pe_stale; /* for future use */ + pe_disk_t *pe; /* HM */ + struct inode *inode; /* HM */ +} pv_v1_t; + +/* core */ +typedef struct { + char id[2]; /* Identifier */ + unsigned short version; /* HM lvm version */ + lvm_disk_data_t pv_on_disk; + lvm_disk_data_t vg_on_disk; + lvm_disk_data_t pv_uuidlist_on_disk; + lvm_disk_data_t lv_on_disk; + lvm_disk_data_t pe_on_disk; + char pv_name[NAME_LEN]; + char vg_name[NAME_LEN]; + char system_id[NAME_LEN]; /* for vgexport/vgimport */ + kdev_t pv_dev; + uint pv_number; + uint pv_status; + uint pv_allocatable; + uint pv_size; /* HM */ + uint lv_cur; + uint pe_size; + uint pe_total; + uint pe_allocated; + uint pe_stale; /* for future use */ + pe_disk_t *pe; /* HM */ + struct inode *inode; /* HM */ + char pv_uuid[UUID_LEN+1]; +} pv_v2_t; + + +/* disk */ +typedef struct { + uint8_t id[2]; /* Identifier */ + uint16_t version; /* HM lvm version */ + lvm_disk_data_t pv_on_disk; + lvm_disk_data_t vg_on_disk; + lvm_disk_data_t pv_namelist_on_disk; + lvm_disk_data_t lv_on_disk; + lvm_disk_data_t pe_on_disk; + uint8_t pv_name[NAME_LEN]; + uint8_t vg_name[NAME_LEN]; + uint8_t system_id[NAME_LEN]; /* for vgexport/vgimport */ + uint32_t pv_major; + uint32_t pv_number; + uint32_t pv_status; + uint32_t pv_allocatable; + uint32_t pv_size; /* HM */ + uint32_t lv_cur; + uint32_t pe_size; + uint32_t pe_total; + uint32_t pe_allocated; +} pv_disk_v1_t; + +/* disk */ +typedef struct { + uint8_t id[2]; /* Identifier */ + uint16_t version; /* HM lvm version */ + lvm_disk_data_t pv_on_disk; + lvm_disk_data_t vg_on_disk; + lvm_disk_data_t pv_uuidlist_on_disk; + lvm_disk_data_t lv_on_disk; + lvm_disk_data_t pe_on_disk; + uint8_t pv_uuid[NAME_LEN]; + uint8_t vg_name[NAME_LEN]; + uint8_t system_id[NAME_LEN]; /* for vgexport/vgimport */ + uint32_t pv_major; + uint32_t pv_number; + uint32_t pv_status; + uint32_t pv_allocatable; + uint32_t pv_size; /* HM */ + uint32_t lv_cur; + uint32_t pe_size; + uint32_t pe_total; + uint32_t pe_allocated; +} pv_disk_v2_t; + + +/* + * Structures for Logical Volume (LV) + */ + +/* core PE information */ +typedef struct { + kdev_t dev; + ulong pe; /* to be changed if > 2TB */ + ulong reads; + ulong writes; +} pe_t; + +typedef struct { + char lv_name[NAME_LEN]; + kdev_t old_dev; + kdev_t new_dev; + ulong 
old_pe; + ulong new_pe; +} le_remap_req_t; + +typedef struct lv_bmap { + ulong lv_block; + dev_t lv_dev; +} lv_bmap_t; + +/* + * Structure Logical Volume (LV) Version 3 + */ + +/* core */ +typedef struct lv_v4 { + char lv_name[NAME_LEN]; + char vg_name[NAME_LEN]; + uint lv_access; + uint lv_status; + uint lv_open; /* HM */ + kdev_t lv_dev; /* HM */ + uint lv_number; /* HM */ + uint lv_mirror_copies; /* for future use */ + uint lv_recovery; /* " */ + uint lv_schedule; /* " */ + uint lv_size; + pe_t *lv_current_pe; /* HM */ + uint lv_current_le; /* for future use */ + uint lv_allocated_le; + uint lv_stripes; + uint lv_stripesize; + uint lv_badblock; /* for future use */ + uint lv_allocation; + uint lv_io_timeout; /* for future use */ + uint lv_read_ahead; + + /* delta to version 1 starts here */ + struct lv_v4 *lv_snapshot_org; + struct lv_v4 *lv_snapshot_prev; + struct lv_v4 *lv_snapshot_next; + lv_block_exception_t *lv_block_exception; + uint lv_remap_ptr; + uint lv_remap_end; + uint lv_chunk_size; + uint lv_snapshot_minor; +#ifdef __KERNEL__ + struct kiobuf *lv_iobuf; + struct semaphore lv_snapshot_sem; + struct list_head *lv_snapshot_hash_table; + ulong lv_snapshot_hash_table_size; + ulong lv_snapshot_hash_mask; + struct page *lv_COW_table_page; +#if LINUX_VERSION_CODE > KERNEL_VERSION ( 2, 3, 0) + wait_queue_head_t lv_snapshot_wait; +#else + struct wait_queue *lv_snapshot_wait; +#endif + int lv_snapshot_use_rate; + void *vg; +#else + char dummy[200]; +#endif +} lv_v4_t; + +/* disk */ +typedef struct { + uint8_t lv_name[NAME_LEN]; + uint8_t vg_name[NAME_LEN]; + uint32_t lv_access; + uint32_t lv_status; + uint32_t lv_open; /* HM */ + uint32_t lv_dev; /* HM */ + uint32_t lv_number; /* HM */ + uint32_t lv_mirror_copies; /* for future use */ + uint32_t lv_recovery; /* " */ + uint32_t lv_schedule; /* " */ + uint32_t lv_size; + uint32_t lv_snapshot_minor;/* minor number of original */ + uint16_t lv_chunk_size; /* chunk size of snapshot */ + uint16_t dummy; + uint32_t lv_allocated_le; + uint32_t lv_stripes; + uint32_t lv_stripesize; + uint32_t lv_badblock; /* for future use */ + uint32_t lv_allocation; + uint32_t lv_io_timeout; /* for future use */ + uint32_t lv_read_ahead; /* HM */ +} lv_disk_v3_t; + +/* + * Structure Volume Group (VG) Version 1 + */ + +/* core */ +typedef struct { + char vg_name[NAME_LEN]; /* volume group name */ + uint vg_number; /* volume group number */ + uint vg_access; /* read/write */ + uint vg_status; /* active or not */ + uint lv_max; /* maximum logical volumes */ + uint lv_cur; /* current logical volumes */ + uint lv_open; /* open logical volumes */ + uint pv_max; /* maximum physical volumes */ + uint pv_cur; /* current physical volumes FU */ + uint pv_act; /* active physical volumes */ + uint dummy; /* was obsolete max_pe_per_pv */ + uint vgda; /* volume group descriptor arrays FU */ + uint pe_size; /* physical extent size in sectors */ + uint pe_total; /* total of physical extents */ + uint pe_allocated; /* allocated physical extents */ + uint pvg_total; /* physical volume groups FU */ + struct proc_dir_entry *proc; + pv_t *pv[ABS_MAX_PV + 1]; /* physical volume struct pointers */ + lv_t *lv[ABS_MAX_LV + 1]; /* logical volume struct pointers */ +} vg_v1_t; + +typedef struct { + char vg_name[NAME_LEN]; /* volume group name */ + uint vg_number; /* volume group number */ + uint vg_access; /* read/write */ + uint vg_status; /* active or not */ + uint lv_max; /* maximum logical volumes */ + uint lv_cur; /* current logical volumes */ + uint lv_open; /* open logical 
volumes */ + uint pv_max; /* maximum physical volumes */ + uint pv_cur; /* current physical volumes FU */ + uint pv_act; /* active physical volumes */ + uint dummy; /* was obsolete max_pe_per_pv */ + uint vgda; /* volume group descriptor arrays FU */ + uint pe_size; /* physical extent size in sectors */ + uint pe_total; /* total of physical extents */ + uint pe_allocated; /* allocated physical extents */ + uint pvg_total; /* physical volume groups FU */ + struct proc_dir_entry *proc; + pv_t *pv[ABS_MAX_PV + 1]; /* physical volume struct pointers */ + lv_t *lv[ABS_MAX_LV + 1]; /* logical volume struct pointers */ + char vg_uuid[UUID_LEN+1]; /* volume group UUID */ +#ifdef __KERNEL__ + struct proc_dir_entry *vg_dir_pde; + struct proc_dir_entry *lv_subdir_pde; + struct proc_dir_entry *pv_subdir_pde; +#else + char dummy1[200]; +#endif +} vg_v3_t; + + +/* disk */ +typedef struct { + uint8_t vg_name[NAME_LEN]; /* volume group name */ + uint32_t vg_number; /* volume group number */ + uint32_t vg_access; /* read/write */ + uint32_t vg_status; /* active or not */ + uint32_t lv_max; /* maximum logical volumes */ + uint32_t lv_cur; /* current logical volumes */ + uint32_t lv_open; /* open logical volumes */ + uint32_t pv_max; /* maximum physical volumes */ + uint32_t pv_cur; /* current physical volumes FU */ + uint32_t pv_act; /* active physical volumes */ + uint32_t dummy; + uint32_t vgda; /* volume group descriptor arrays FU */ + uint32_t pe_size; /* physical extent size in sectors */ + uint32_t pe_total; /* total of physical extents */ + uint32_t pe_allocated; /* allocated physical extents */ + uint32_t pvg_total; /* physical volume groups FU */ +} vg_disk_v1_t; + +typedef struct { + uint8_t vg_uuid[UUID_LEN]; /* volume group UUID */ + uint8_t vg_name_dummy[NAME_LEN-UUID_LEN]; /* rest of v1 VG name */ + uint32_t vg_number; /* volume group number */ + uint32_t vg_access; /* read/write */ + uint32_t vg_status; /* active or not */ + uint32_t lv_max; /* maximum logical volumes */ + uint32_t lv_cur; /* current logical volumes */ + uint32_t lv_open; /* open logical volumes */ + uint32_t pv_max; /* maximum physical volumes */ + uint32_t pv_cur; /* current physical volumes FU */ + uint32_t pv_act; /* active physical volumes */ + uint32_t dummy; + uint32_t vgda; /* volume group descriptor arrays FU */ + uint32_t pe_size; /* physical extent size in sectors */ + uint32_t pe_total; /* total of physical extents */ + uint32_t pe_allocated; /* allocated physical extents */ + uint32_t pvg_total; /* physical volume groups FU */ +} vg_disk_v2_t; + + +/* + * Request structures for ioctls + */ + +/* Request structure PV_STATUS_BY_NAME... */ +typedef struct { + char pv_name[NAME_LEN]; + pv_t *pv; +} pv_status_req_t, pv_change_req_t; + +/* Request structure PV_FLUSH */ +typedef struct { + char pv_name[NAME_LEN]; + kdev_t pv_dev; +} pv_flush_req_t; + + +/* Request structure PE_MOVE */ +typedef struct { + enum { + LOCK_PE, UNLOCK_PE + } lock; + struct { + kdev_t lv_dev; + kdev_t pv_dev; + ulong pv_offset; + } data; +} pe_lock_req_t; + + +/* Request structure LV_STATUS_BYNAME */ +typedef struct { + char lv_name[NAME_LEN]; + lv_t *lv; +} lv_status_byname_req_t, lv_req_t; + +/* Request structure LV_STATUS_BYINDEX */ +typedef struct { + ulong lv_index; + lv_t *lv; + /* Transfer size because user space and kernel space differ */ + ushort size; +} lv_status_byindex_req_t; + +/* Request structure LV_STATUS_BYDEV... 
*/
+typedef struct {
+	dev_t dev;
+	lv_t *lv;
+} lv_status_bydev_req_t;
+
+
+/* Request structure LV_SNAPSHOT_USE_RATE */
+typedef struct {
+	int block;
+	int rate;
+} lv_snapshot_use_rate_req_t;
+
+#endif /* #ifndef _LVM_H_INCLUDE */
diff -urN 2.2.18/include/linux/major.h 2.2.18aa1/include/linux/major.h
--- 2.2.18/include/linux/major.h Mon Dec 11 16:58:06 2000
+++ 2.2.18aa1/include/linux/major.h Mon Dec 11 17:20:48 2000
@@ -126,6 +126,8 @@
 #define AURORA_MAJOR 79
 
+#define RAW_MAJOR 162
+
 #define UNIX98_PTY_MASTER_MAJOR 128
 #define UNIX98_PTY_MAJOR_COUNT 8
 #define UNIX98_PTY_SLAVE_MAJOR (UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT)
diff -urN 2.2.18/include/linux/md.h 2.2.18aa1/include/linux/md.h
--- 2.2.18/include/linux/md.h Tue Nov 14 23:08:35 2000
+++ 2.2.18aa1/include/linux/md.h Thu Jan 1 01:00:00 1970
@@ -1,300 +0,0 @@
-/*
-   md.h : Multiple Devices driver for Linux
-          Copyright (C) 1994-96 Marc ZYNGIER
-	  <zyngier@ufr-info-p7.ibp.fr> or
-	  <maz@gloups.fdn.fr>
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 2, or (at your option)
-   any later version.
-
-   You should have received a copy of the GNU General Public License
-   (for example /usr/src/linux/COPYING); if not, write to the Free
-   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-*/
-
-#ifndef _MD_H
-#define _MD_H
-
-#include <linux/major.h>
-#include <linux/ioctl.h>
-#include <linux/types.h>
-
-/*
- * Different major versions are not compatible.
- * Different minor versions are only downward compatible.
- * Different patchlevel versions are downward and upward compatible.
- */
-#define MD_MAJOR_VERSION 0
-#define MD_MINOR_VERSION 36
-#define MD_PATCHLEVEL_VERSION 6
-
-#define MD_DEFAULT_DISK_READAHEAD (256 * 1024)
-
-/* ioctls */
-#define REGISTER_DEV _IO (MD_MAJOR, 1)
-#define START_MD _IO (MD_MAJOR, 2)
-#define STOP_MD _IO (MD_MAJOR, 3)
-#define REGISTER_DEV_NEW _IO (MD_MAJOR, 4)
-
-/*
-   personalities :
-   Byte 0 : Chunk size factor
-   Byte 1 : Fault tolerance count for each physical device
-            ( 0 means no fault tolerance,
-              0xFF means always tolerate faults), not used by now.
-   Byte 2 : Personality
-   Byte 3 : Reserved.
- */
-
-#define FAULT_SHIFT 8
-#define PERSONALITY_SHIFT 16
-
-#define FACTOR_MASK 0x000000FFUL
-#define FAULT_MASK 0x0000FF00UL
-#define PERSONALITY_MASK 0x00FF0000UL
-
-#define MD_RESERVED 0 /* Not used by now */
-#define LINEAR (1UL << PERSONALITY_SHIFT)
-#define STRIPED (2UL << PERSONALITY_SHIFT)
-#define RAID0 STRIPED
-#define RAID1 (3UL << PERSONALITY_SHIFT)
-#define RAID5 (4UL << PERSONALITY_SHIFT)
-#define MAX_PERSONALITY 5
-
-/*
- * MD superblock.
- *
- * The MD superblock maintains some statistics on each MD configuration.
- * Each real device in the MD set contains it near the end of the device.
- * Some of the ideas are copied from the ext2fs implementation.
- *
- * We currently use 4096 bytes as follows:
- *
- * word offset     function
- *
- *    0 -   31     Constant generic MD device information.
- *   32 -   63     Generic state information.
- *   64 -  127     Personality specific information.
- *  128 -  511     12 32-words descriptors of the disks in the raid set.
- *  512 -  911     Reserved.
- *  912 - 1023     Disk specific descriptor.
- */
-
-/*
- * If x is the real device size in bytes, we return an apparent size of:
- *
- *	y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
- *
- * and place the 4kB superblock at offset y.
- */ -#define MD_RESERVED_BYTES (64 * 1024) -#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512) -#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE) - -#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS) -#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS) - -#define MD_SB_BYTES 4096 -#define MD_SB_WORDS (MD_SB_BYTES / 4) -#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE) -#define MD_SB_SECTORS (MD_SB_BYTES / 512) - -/* - * The following are counted in 32-bit words - */ -#define MD_SB_GENERIC_OFFSET 0 -#define MD_SB_PERSONALITY_OFFSET 64 -#define MD_SB_DISKS_OFFSET 128 -#define MD_SB_DESCRIPTOR_OFFSET 992 - -#define MD_SB_GENERIC_CONSTANT_WORDS 32 -#define MD_SB_GENERIC_STATE_WORDS 32 -#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS) -#define MD_SB_PERSONALITY_WORDS 64 -#define MD_SB_DISKS_WORDS 384 -#define MD_SB_DESCRIPTOR_WORDS 32 -#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS) -#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS) -#define MD_SB_DISKS (MD_SB_DISKS_WORDS / MD_SB_DESCRIPTOR_WORDS) - -/* - * Device "operational" state bits - */ -#define MD_FAULTY_DEVICE 0 /* Device is faulty / operational */ -#define MD_ACTIVE_DEVICE 1 /* Device is a part or the raid set / spare disk */ -#define MD_SYNC_DEVICE 2 /* Device is in sync with the raid set */ - -typedef struct md_device_descriptor_s { - __u32 number; /* 0 Device number in the entire set */ - __u32 major; /* 1 Device major number */ - __u32 minor; /* 2 Device minor number */ - __u32 raid_disk; /* 3 The role of the device in the raid set */ - __u32 state; /* 4 Operational state */ - __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5]; -} md_descriptor_t; - -#define MD_SB_MAGIC 0xa92b4efc - -/* - * Superblock state bits - */ -#define MD_SB_CLEAN 0 -#define MD_SB_ERRORS 1 - -typedef struct md_superblock_s { - - /* - * Constant generic information - */ - __u32 md_magic; /* 0 MD identifier */ - __u32 major_version; /* 1 major version to which the set conforms */ - __u32 minor_version; /* 2 minor version to which the set conforms */ - __u32 patch_version; /* 3 patchlevel version to which the set conforms */ - __u32 gvalid_words; /* 4 Number of non-reserved words in this section */ - __u32 set_magic; /* 5 Raid set identifier */ - __u32 ctime; /* 6 Creation time */ - __u32 level; /* 7 Raid personality (mirroring, raid5, ...) */ - __u32 size; /* 8 Apparent size of each individual disk, in kB */ - __u32 nr_disks; /* 9 Number of total disks in the raid set */ - __u32 raid_disks; /* 10 Number of disks in a fully functional raid set */ - __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 11]; - - /* - * Generic state information - */ - __u32 utime; /* 0 Superblock update time */ - __u32 state; /* 1 State bits (clean, ...) 
*/ - __u32 active_disks; /* 2 Number of currently active disks (some non-faulty disks might not be in sync) */ - __u32 working_disks; /* 3 Number of working disks */ - __u32 failed_disks; /* 4 Number of failed disks */ - __u32 spare_disks; /* 5 Number of spare disks */ - __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 6]; - - /* - * Personality information - */ - __u32 parity_algorithm; - __u32 chunk_size; - __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 2]; - - /* - * Disks information - */ - md_descriptor_t disks[MD_SB_DISKS]; - - /* - * Reserved - */ - __u32 reserved[MD_SB_RESERVED_WORDS]; - - /* - * Active descriptor - */ - md_descriptor_t descriptor; -} md_superblock_t; - -#ifdef __KERNEL__ - -#include -#include -#include -#include - -/* - * Kernel-based reconstruction is mostly working, but still requires - * some additional work. - */ -#define SUPPORT_RECONSTRUCTION 0 - -#define MAX_REAL 8 /* Max number of physical dev per md dev */ -#define MAX_MD_DEV 4 /* Max number of md dev */ - -#define FACTOR(a) ((a)->repartition & FACTOR_MASK) -#define MAX_FAULT(a) (((a)->repartition & FAULT_MASK)>>8) -#define PERSONALITY(a) ((a)->repartition & PERSONALITY_MASK) - -#define FACTOR_SHIFT(a) (PAGE_SHIFT + (a) - 10) - -struct real_dev -{ - kdev_t dev; /* Device number */ - int size; /* Device size (in blocks) */ - int offset; /* Real device offset (in blocks) in md dev - (only used in linear mode) */ - struct inode *inode; /* Lock inode */ - md_superblock_t *sb; - u32 sb_offset; -}; - -struct md_dev; - -#define SPARE_INACTIVE 0 -#define SPARE_WRITE 1 -#define SPARE_ACTIVE 2 - -struct md_personality -{ - char *name; - int (*map)(struct md_dev *mddev, kdev_t *rdev, - unsigned long *rsector, unsigned long size); - int (*make_request)(struct md_dev *mddev, int rw, struct buffer_head * bh); - void (*end_request)(struct buffer_head * bh, int uptodate); - int (*run)(int minor, struct md_dev *mddev); - int (*stop)(int minor, struct md_dev *mddev); - int (*status)(char *page, int minor, struct md_dev *mddev); - int (*ioctl)(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg); - int max_invalid_dev; - int (*error_handler)(struct md_dev *mddev, kdev_t dev); - -/* - * Some personalities (RAID-1, RAID-5) can get disks hot-added and - * hot-removed. Hot removal is different from failure. 
(failure marks - * a disk inactive, but the disk is still part of the array) - */ - int (*hot_add_disk) (struct md_dev *mddev, kdev_t dev); - int (*hot_remove_disk) (struct md_dev *mddev, kdev_t dev); - int (*mark_spare) (struct md_dev *mddev, md_descriptor_t *descriptor, int state); -}; - -struct md_dev -{ - struct real_dev devices[MAX_REAL]; - struct md_personality *pers; - md_superblock_t *sb; - int sb_dirty; - int repartition; - int busy; - int nb_dev; - void *private; -}; - -struct md_thread { - void (*run) (void *data); - void *data; - struct wait_queue *wqueue; - unsigned long flags; - struct semaphore *sem; - struct task_struct *tsk; -}; - -#define THREAD_WAKEUP 0 - -extern struct md_dev md_dev[MAX_MD_DEV]; -extern int md_size[MAX_MD_DEV]; -extern int md_maxreadahead[MAX_MD_DEV]; - -extern char *partition_name (kdev_t dev); - -extern int register_md_personality (int p_num, struct md_personality *p); -extern int unregister_md_personality (int p_num); -extern struct md_thread *md_register_thread (void (*run) (void *data), void *data); -extern void md_unregister_thread (struct md_thread *thread); -extern void md_wakeup_thread(struct md_thread *thread); -extern int md_update_sb (int minor); -extern int md_do_sync(struct md_dev *mddev); - -#endif __KERNEL__ -#endif _MD_H diff -urN 2.2.18/include/linux/mm.h 2.2.18aa1/include/linux/mm.h --- 2.2.18/include/linux/mm.h Mon Dec 11 16:58:06 2000 +++ 2.2.18aa1/include/linux/mm.h Mon Dec 11 19:23:35 2000 @@ -54,7 +54,7 @@ struct vm_area_struct **vm_pprev_share; struct vm_operations_struct * vm_ops; - unsigned long vm_offset; + loff_t vm_offset; struct file * vm_file; unsigned long vm_pte; /* shared mem */ }; @@ -81,6 +81,8 @@ #define VM_LOCKED 0x2000 #define VM_IO 0x4000 /* Memory mapped I/O or similar */ +#define VM_RESERVED 0x8000 /* Don't unmap it from swap_out */ + #define VM_STACK_FLAGS 0x0177 /* @@ -106,9 +108,46 @@ unsigned long (*wppage)(struct vm_area_struct * area, unsigned long address, unsigned long page); int (*swapout)(struct vm_area_struct *, struct page *); - pte_t (*swapin)(struct vm_area_struct *, unsigned long, unsigned long); + pte_t (*swapin)(struct vm_area_struct *, loff_t, unsigned long); }; + +/* + * pgoff_t type -- a complex one, and its simple alternate. + * The complex one has type that compiler can trap at compile + * time, but the simple one does simpler code (?) + */ + +#if 0 +typedef struct pgoff_t { + unsigned long pgoff; +} pgoff_t; + +#define pgoff2ulong(pgof) ((pgof).pgoff) +extern __inline__ pgoff_t ulong2pgoff(unsigned long ul) { + pgoff_t up; + up.pgoff = ul; + return up; +} + +#define pgoff2loff(pgof) (((loff_t)(pgof).pgoff) << PAGE_SHIFT) +#define loff2pgoff(loff) ulong2pgoff((loff) >> PAGE_SHIFT) + +#else /* Integer scalars -- simpler code.. */ + +typedef unsigned long pgoff_t; + +#define pgoff2ulong(pgof) (pgof) +#define ulong2pgoff(pgof) (pgof) + +#define pgoff2loff(pgof) (((loff_t)(pgof)) << PAGE_SHIFT) +#define loff2pgoff(loff) ulong2pgoff((loff) >> PAGE_SHIFT) + +#endif + +#define PAGE_MASK_loff ((loff_t)(long)(PAGE_MASK)) + + /* * Try to keep the most commonly accessed fields in single cache lines * here (16 bytes or greater). This ordering should be particularly @@ -117,12 +156,13 @@ * The first line is data used in page cache lookup, the second line * is used for linear searches (eg. clock algorithm scans). 
*/ + typedef struct page { /* these must be first (free area handling) */ struct page *next; struct page *prev; + pgoff_t index; struct inode *inode; - unsigned long offset; struct page *next_hash; atomic_t count; unsigned long flags; /* atomic flags, some possibly updated asynchronously */ @@ -144,6 +184,7 @@ #define PG_Slab 9 #define PG_swap_cache 10 #define PG_skip 11 +#define PG_BIGMEM 12 #define PG_reserved 31 /* Make it prettier to test the above... */ @@ -175,6 +216,11 @@ (test_and_clear_bit(PG_dirty, &(page)->flags)) #define PageTestandClearSwapCache(page) \ (test_and_clear_bit(PG_swap_cache, &(page)->flags)) +#ifdef CONFIG_BIGMEM +#define PageBIGMEM(page) (test_bit(PG_BIGMEM, &(page)->flags)) +#else +#define PageBIGMEM(page) 0 /* needed to optimize away at compile time */ +#endif /* * Various page->flags bits: @@ -291,7 +337,7 @@ extern int remap_page_range(unsigned long from, unsigned long to, unsigned long size, pgprot_t prot); extern int zeromap_page_range(unsigned long from, unsigned long size, pgprot_t prot); -extern void vmtruncate(struct inode * inode, unsigned long offset); +extern void vmtruncate(struct inode * inode, loff_t offset); extern int handle_mm_fault(struct task_struct *tsk,struct vm_area_struct *vma, unsigned long address, int write_access); extern int make_pages_present(unsigned long addr, unsigned long end); @@ -311,16 +357,22 @@ extern void exit_mmap(struct mm_struct *); extern unsigned long get_unmapped_area(unsigned long, unsigned long); -extern unsigned long do_mmap(struct file *, unsigned long, unsigned long, - unsigned long, unsigned long, unsigned long); +extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long pgoff); + +extern unsigned long do_mmap(struct file *, unsigned long, + unsigned long, unsigned long, + unsigned long, unsigned long); + extern int do_munmap(unsigned long, size_t); /* filemap.c */ extern void remove_inode_page(struct page *); extern unsigned long page_unuse(struct page *); extern int shrink_mmap(int, int); -extern void truncate_inode_pages(struct inode *, unsigned long); -extern unsigned long get_cached_page(struct inode *, unsigned long, int); +extern void truncate_inode_pages(struct inode *, loff_t); +extern unsigned long get_cached_page(struct inode *, pgoff_t, int); extern void put_cached_page(unsigned long); /* @@ -332,11 +384,17 @@ #define __GFP_HIGH 0x08 #define __GFP_IO 0x10 #define __GFP_SWAP 0x20 +#ifdef CONFIG_BIGMEM +#define __GFP_BIGMEM 0x40 +#else +#define __GFP_BIGMEM 0x0 /* noop */ +#endif #define __GFP_DMA 0x80 #define GFP_BUFFER (__GFP_MED | __GFP_WAIT) #define GFP_ATOMIC (__GFP_HIGH) +#define GFP_BIGUSER (__GFP_LOW | __GFP_WAIT | __GFP_IO | __GFP_BIGMEM) #define GFP_USER (__GFP_LOW | __GFP_WAIT | __GFP_IO) #define GFP_KERNEL (__GFP_MED | __GFP_WAIT | __GFP_IO) #define GFP_NFS (__GFP_HIGH | __GFP_WAIT | __GFP_IO) @@ -347,13 +405,23 @@ #define GFP_DMA __GFP_DMA +/* Flag - indicates that the buffer can be taken from big memory which is not + directly addressable by the kernel */ + +#define GFP_BIGMEM __GFP_BIGMEM + +extern int heap_stack_gap; + /* vma is the first one with address < vma->vm_end, * and even address < vma->vm_start. Have to extend vma. 
*/
-static inline int expand_stack(struct vm_area_struct * vma, unsigned long address)
+static inline int expand_stack(struct vm_area_struct * vma, unsigned long address,
+			       struct vm_area_struct * prev_vma)
 {
 	unsigned long grow;
 
 	address &= PAGE_MASK;
+	if (prev_vma && prev_vma->vm_end + (heap_stack_gap << PAGE_SHIFT) > address)
+		return -ENOMEM;
 	grow = vma->vm_start - address;
 	if ((vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur) ||
@@ -371,6 +439,8 @@
 /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
 extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
+extern struct vm_area_struct * find_vma_prev(struct mm_struct *, unsigned long,
+					     struct vm_area_struct **);
 
 /* Look up the first VMA which intersects the interval start_addr..end_addr-1,
    NULL if none. Assume start_addr < end_addr. */
diff -urN 2.2.18/include/linux/net.h 2.2.18aa1/include/linux/net.h
--- 2.2.18/include/linux/net.h Mon Nov 13 01:51:15 2000
+++ 2.2.18aa1/include/linux/net.h Mon Dec 11 17:29:53 2000
@@ -126,7 +126,7 @@
 };
 
 extern struct net_proto_family *net_families[];
-extern int sock_wake_async(struct socket *sk, int how);
+extern int sock_wake_async(struct socket *sk, int how, int band);
 extern int sock_register(struct net_proto_family *fam);
 extern int sock_unregister(int family);
 extern struct socket *sock_alloc(void);
diff -urN 2.2.18/include/linux/nfs.h 2.2.18aa1/include/linux/nfs.h
--- 2.2.18/include/linux/nfs.h Mon Dec 11 16:58:06 2000
+++ 2.2.18aa1/include/linux/nfs.h Mon Dec 11 17:20:51 2000
@@ -78,11 +78,7 @@
 #define NFS_MNTPROC_MNT 1
 #define NFS_MNTPROC_UMNT 3
 
-/*
- * This is really a general kernel constant, but since nothing like
- * this is defined in the kernel headers, I have to do it here.
- */
-#define NFS_OFFSET_MAX LONG_MAX
+#define NFS_OFFSET_MAX ((__s64)((~(__u64)0) >> 1))
 
 /*
  * These data types are used exclusively by the NFS client implementation.
diff -urN 2.2.18/include/linux/nfs_fs.h 2.2.18aa1/include/linux/nfs_fs.h
--- 2.2.18/include/linux/nfs_fs.h Mon Dec 11 16:58:06 2000
+++ 2.2.18aa1/include/linux/nfs_fs.h Mon Dec 11 19:23:48 2000
@@ -133,15 +133,15 @@
 }
 
 static inline
-unsigned long nfs_page_offset(struct page *page)
+loff_t nfs_page_offset(struct page *page)
 {
-	return page->offset;
+	return ((loff_t)page->index) << PAGE_CACHE_SHIFT;
 }
 
 static inline
 unsigned long page_index(struct page *page)
 {
-	return page->offset >> PAGE_CACHE_SHIFT;
+	return page->index;
 }
 
 /*
@@ -305,6 +305,15 @@
 nfs_size_to_off_t(__u64 size)
 {
 	return (size > (__u64)LONG_MAX) ?
(off_t)LONG_MAX : (off_t) size; +} + +static inline loff_t +nfs_size_to_loff_t(__u64 size) +{ + loff_t maxsz = (((loff_t) ULONG_MAX) << PAGE_CACHE_SHIFT) + PAGE_CACHE_SIZE - 1; + if (size > maxsz) + return maxsz; + return (loff_t) size; } static inline ino_t diff -urN 2.2.18/include/linux/nfs_xdr.h 2.2.18aa1/include/linux/nfs_xdr.h --- 2.2.18/include/linux/nfs_xdr.h Mon Dec 11 16:58:06 2000 +++ 2.2.18aa1/include/linux/nfs_xdr.h Mon Dec 11 17:20:51 2000 @@ -89,7 +89,7 @@ #define NFS_WRITE_MAXIOV 8 struct nfs_writeargs { struct nfs_fh * fh; - __u32 offset; + __u64 offset; __u32 count; enum nfs3_stable_how stable; unsigned int nriov; @@ -330,11 +330,11 @@ void *buffer, unsigned int buflen); int (*read)(struct dentry *, struct nfs_fattr *, struct rpc_cred *, - int flags, unsigned long offset, + int flags, loff_t offset, unsigned int count, void *buffer, int *eofp); int (*write)(struct dentry *, struct nfs_fattr *, struct rpc_cred *, - int flags, unsigned long offset, + int flags, loff_t offset, unsigned int count, void *buffer, struct nfs_writeverf *verfp); int (*commit)(struct dentry *, struct nfs_fattr *, @@ -366,6 +366,7 @@ int (*statfs)(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); __u32 * (*decode_dirent)(__u32 *, struct nfs_entry *, int plus); + int bigfiles; }; /* diff -urN 2.2.18/include/linux/nfsd/nfsd.h 2.2.18aa1/include/linux/nfsd/nfsd.h --- 2.2.18/include/linux/nfsd/nfsd.h Mon Dec 11 16:58:06 2000 +++ 2.2.18aa1/include/linux/nfsd/nfsd.h Mon Dec 11 19:26:37 2000 @@ -57,7 +57,7 @@ char dotonly; }; typedef int (*encode_dent_fn)(struct readdir_cd *, const char *, - int, off_t, ino_t); + int, off_t, ino_t, unsigned int); typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int); /* diff -urN 2.2.18/include/linux/nfsd/xdr.h 2.2.18aa1/include/linux/nfsd/xdr.h --- 2.2.18/include/linux/nfsd/xdr.h Tue Nov 14 03:37:47 2000 +++ 2.2.18aa1/include/linux/nfsd/xdr.h Mon Dec 11 19:26:37 2000 @@ -152,7 +152,7 @@ int nfssvc_encode_readdirres(struct svc_rqst *, u32 *, struct nfsd_readdirres *); int nfssvc_encode_entry(struct readdir_cd *, const char *name, - int namlen, off_t offset, ino_t ino); + int namlen, off_t offset, ino_t ino, unsigned int d_type); int nfssvc_release_fhandle(struct svc_rqst *, u32 *, struct nfsd_fhandle *); diff -urN 2.2.18/include/linux/nfsd/xdr3.h 2.2.18aa1/include/linux/nfsd/xdr3.h --- 2.2.18/include/linux/nfsd/xdr3.h Mon Dec 11 16:58:06 2000 +++ 2.2.18aa1/include/linux/nfsd/xdr3.h Mon Dec 11 19:26:40 2000 @@ -292,9 +292,9 @@ int nfs3svc_release_fhandle2(struct svc_rqst *, u32 *, struct nfsd3_fhandle_pair *); int nfs3svc_encode_entry(struct readdir_cd *, const char *name, - int namlen, off_t offset, ino_t ino); + int namlen, off_t offset, ino_t ino, unsigned int d_type); int nfs3svc_encode_entry_plus(struct readdir_cd *, const char *name, - int namlen, off_t offset, ino_t ino); + int namlen, off_t offset, ino_t ino, unsigned int d_type); #ifdef __KERNEL__ diff -urN 2.2.18/include/linux/pagemap.h 2.2.18aa1/include/linux/pagemap.h --- 2.2.18/include/linux/pagemap.h Mon Dec 11 16:58:06 2000 +++ 2.2.18aa1/include/linux/pagemap.h Mon Dec 11 19:23:36 2000 @@ -28,6 +28,7 @@ #define PAGE_CACHE_SHIFT PAGE_SHIFT #define PAGE_CACHE_SIZE PAGE_SIZE #define PAGE_CACHE_MASK PAGE_MASK +#define PAGE_CACHE_MASK_loff PAGE_MASK_loff #define page_cache_alloc() __get_free_page(GFP_USER) #define page_cache_free(x) free_page(x) @@ -54,10 +55,10 @@ * inode pointer and offsets are distributed (ie, we * roughly know which bits are "significant") */ -static inline unsigned 
long _page_hashfn(struct inode * inode, unsigned long offset) +static inline unsigned long _page_hashfn(struct inode * inode, pgoff_t index) { #define i (((unsigned long) inode)/(sizeof(struct inode) & ~ (sizeof(struct inode) - 1))) -#define o ((offset >> PAGE_SHIFT) + (offset & ~PAGE_MASK)) +#define o (index + (index >> PAGE_HASH_BITS)) return ((i+o) & PAGE_HASH_MASK); #undef i #undef o @@ -65,7 +66,7 @@ #define page_hash(inode,offset) (page_hash_table+_page_hashfn(inode,offset)) -static inline struct page * __find_page(struct inode * inode, unsigned long offset, struct page *page) +static inline struct page * __find_page(struct inode * inode, pgoff_t index, struct page *page) { goto inside; for (;;) { @@ -75,7 +76,7 @@ goto not_found; if (page->inode != inode) continue; - if (page->offset == offset) + if (pgoff2ulong(page->index) == pgoff2ulong(index)) break; } /* Found the page. */ @@ -85,9 +86,9 @@ return page; } -static inline struct page *find_page(struct inode * inode, unsigned long offset) +static inline struct page *find_page(struct inode * inode, pgoff_t poffset) { - return __find_page(inode, offset, *page_hash(inode, offset)); + return __find_page(inode, poffset, *page_hash(inode, poffset)); } static inline void remove_page_from_hash_queue(struct page * page) @@ -110,9 +111,9 @@ page->pprev_hash = p; } -static inline void add_page_to_hash_queue(struct page * page, struct inode * inode, unsigned long offset) +static inline void add_page_to_hash_queue(struct page * page, struct inode * inode, pgoff_t poffset) { - __add_page_to_hash_queue(page, page_hash(inode,offset)); + __add_page_to_hash_queue(page, page_hash(inode,poffset)); } static inline void remove_page_from_inode_queue(struct page * page) @@ -150,8 +151,8 @@ __wait_on_page(page); } -extern void update_vm_cache_conditional(struct inode *, unsigned long, const char *, int, unsigned long); -extern void update_vm_cache(struct inode *, unsigned long, const char *, int); +extern void update_vm_cache_conditional(struct inode *, loff_t, const char *, int, unsigned long); +extern void update_vm_cache(struct inode *, loff_t, const char *, int); typedef int filler_t(void *, struct page*); diff -urN 2.2.18/include/linux/raid/hsm.h 2.2.18aa1/include/linux/raid/hsm.h --- 2.2.18/include/linux/raid/hsm.h Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/include/linux/raid/hsm.h Mon Dec 11 17:20:54 2000 @@ -0,0 +1,65 @@ +#ifndef _HSM_H +#define _HSM_H + +#include + +#if __alpha__ +#error fix cpu_addr on Alpha first +#endif + +#include + +#define index_pv(lv,index) ((lv)->vg->pv_array+(index)->data.phys_nr) +#define index_dev(lv,index) index_pv((lv),(index))->dev +#define index_block(lv,index) (index)->data.phys_block +#define index_child(index) ((lv_lptr_t *)((index)->cpu_addr)) + +#define ptr_to_cpuaddr(ptr) ((__u32) (ptr)) + + +typedef struct pv_bg_desc_s { + unsigned int free_blocks; + pv_block_group_t *bg; +} pv_bg_desc_t; + +typedef struct pv_s pv_t; +typedef struct vg_s vg_t; +typedef struct lv_s lv_t; + +struct pv_s +{ + int phys_nr; + kdev_t dev; + pv_sb_t *pv_sb; + pv_bg_desc_t *bg_array; +}; + +struct lv_s +{ + int log_id; + vg_t *vg; + + unsigned int max_indices; + unsigned int free_indices; + lv_lptr_t root_index; + + kdev_t dev; +}; + +struct vg_s +{ + int nr_pv; + pv_t pv_array [MD_SB_DISKS]; + + int nr_lv; + lv_t lv_array [HSM_MAX_LVS_PER_VG]; + + vg_sb_t *vg_sb; + mddev_t *mddev; +}; + +#define kdev_to_lv(dev) ((lv_t *) mddev_map[MINOR(dev)].data) +#define mddev_to_vg(mddev) ((vg_t *) mddev->private) + +#endif + diff -urN 
2.2.18/include/linux/raid/hsm_p.h 2.2.18aa1/include/linux/raid/hsm_p.h --- 2.2.18/include/linux/raid/hsm_p.h Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/include/linux/raid/hsm_p.h Mon Dec 11 17:20:54 2000 @@ -0,0 +1,237 @@ +#ifndef _HSM_P_H +#define _HSM_P_H + +#define HSM_BLOCKSIZE 4096 +#define HSM_BLOCKSIZE_WORDS (HSM_BLOCKSIZE/4) +#define PACKED __attribute__ ((packed)) + +/* + * Identifies a block in physical space + */ +typedef struct phys_idx_s { + __u16 phys_nr; + __u32 phys_block; + +} PACKED phys_idx_t; + +/* + * Identifies a block in logical space + */ +typedef struct log_idx_s { + __u16 log_id; + __u32 log_index; + +} PACKED log_idx_t; + +/* + * Describes one PV + */ +#define HSM_PV_SB_MAGIC 0xf091ae9fU + +#define HSM_PV_SB_GENERIC_WORDS 32 +#define HSM_PV_SB_RESERVED_WORDS \ + (HSM_BLOCKSIZE_WORDS - HSM_PV_SB_GENERIC_WORDS) + +/* + * On-disk PV identification data, on block 0 in any PV. + */ +typedef struct pv_sb_s +{ + __u32 pv_magic; /* 0 */ + + __u32 pv_uuid0; /* 1 */ + __u32 pv_uuid1; /* 2 */ + __u32 pv_uuid2; /* 3 */ + __u32 pv_uuid3; /* 4 */ + + __u32 pv_major; /* 5 */ + __u32 pv_minor; /* 6 */ + __u32 pv_patch; /* 7 */ + + __u32 pv_ctime; /* 8 Creation time */ + + __u32 pv_total_size; /* 9 size of this PV, in blocks */ + __u32 pv_first_free; /* 10 first free block */ + __u32 pv_first_used; /* 11 first used block */ + __u32 pv_blocks_left; /* 12 unallocated blocks */ + __u32 pv_bg_size; /* 13 size of a block group, in blocks */ + __u32 pv_block_size; /* 14 size of blocks, in bytes */ + __u32 pv_pptr_size; /* 15 size of block descriptor, in bytes */ + __u32 pv_block_groups; /* 16 number of block groups */ + + __u32 __reserved1[HSM_PV_SB_GENERIC_WORDS - 17]; + + /* + * Reserved + */ + __u32 __reserved2[HSM_PV_SB_RESERVED_WORDS]; + +} PACKED pv_sb_t; + +/* + * this is pretty much arbitrary, but has to be less than ~64 + */ +#define HSM_MAX_LVS_PER_VG 32 + +#define HSM_VG_SB_GENERIC_WORDS 32 + +#define LV_DESCRIPTOR_WORDS 8 +#define HSM_VG_SB_RESERVED_WORDS (HSM_BLOCKSIZE_WORDS - \ + LV_DESCRIPTOR_WORDS*HSM_MAX_LVS_PER_VG - HSM_VG_SB_GENERIC_WORDS) + +#if (HSM_PV_SB_RESERVED_WORDS < 0) +#error you messed this one up dude ... +#endif + +typedef struct lv_descriptor_s +{ + __u32 lv_id; /* 0 */ + phys_idx_t lv_root_idx; /* 1 */ + __u16 __reserved; /* 2 */ + __u32 lv_max_indices; /* 3 */ + __u32 lv_free_indices; /* 4 */ + __u32 md_id; /* 5 */ + + __u32 reserved[LV_DESCRIPTOR_WORDS - 6]; + +} PACKED lv_descriptor_t; + +#define HSM_VG_SB_MAGIC 0x98320d7aU +/* + * On-disk VG identification data, in block 1 on all PVs + */ +typedef struct vg_sb_s +{ + __u32 vg_magic; /* 0 */ + __u32 nr_lvs; /* 1 */ + + __u32 __reserved1[HSM_VG_SB_GENERIC_WORDS - 2]; + + lv_descriptor_t lv_array [HSM_MAX_LVS_PER_VG]; + /* + * Reserved + */ + __u32 __reserved2[HSM_VG_SB_RESERVED_WORDS]; + +} PACKED vg_sb_t; + +/* + * Describes one LV + */ + +#define HSM_LV_SB_MAGIC 0xe182bd8aU + +/* do we need lv_sb_t? 
*/ + +typedef struct lv_sb_s +{ + /* + * On-disk LV identifier + */ + __u32 lv_magic; /* 0 LV identifier */ + __u32 lv_uuid0; /* 1 */ + __u32 lv_uuid1; /* 2 */ + __u32 lv_uuid2; /* 3 */ + __u32 lv_uuid3; /* 4 */ + + __u32 lv_major; /* 5 PV identifier */ + __u32 lv_minor; /* 6 PV identifier */ + __u32 lv_patch; /* 7 PV identifier */ + + __u32 ctime; /* 8 Creation time */ + __u32 size; /* 9 size of this LV, in blocks */ + phys_idx_t start; /* 10 position of root index block */ + log_idx_t first_free; /* 11-12 first free index */ + + /* + * Reserved + */ + __u32 reserved[HSM_BLOCKSIZE_WORDS-13]; + +} PACKED lv_sb_t; + +/* + * Pointer pointing from the physical space, points to + * the LV owning this block. It also contains various + * statistics about the physical block. + */ +typedef struct pv_pptr_s +{ + union { + /* case 1 */ + struct { + log_idx_t owner; + log_idx_t predicted; + __u32 last_referenced; + } used; + /* case 2 */ + struct { + __u16 log_id; + __u16 __unused1; + __u32 next_free; + __u32 __unused2; + __u32 __unused3; + } free; + } u; +} PACKED pv_pptr_t; + +static __inline__ int pv_pptr_free (const pv_pptr_t * pptr) +{ + return !pptr->u.free.log_id; +} + + +#define DATA_BLOCKS_PER_BG ((HSM_BLOCKSIZE*8)/(8*sizeof(pv_pptr_t)+1)) + +#define TOTAL_BLOCKS_PER_BG (DATA_BLOCKS_PER_BG+1) +/* + * A table of pointers filling up a single block, managing + * the next DATA_BLOCKS_PER_BG physical blocks. Such block + * groups form the physical space of blocks. + */ +typedef struct pv_block_group_s +{ + __u8 used_bitmap[(DATA_BLOCKS_PER_BG+7)/8]; + + pv_pptr_t blocks[DATA_BLOCKS_PER_BG]; + +} PACKED pv_block_group_t; + +/* + * Pointer from the logical space, points to + * the (PV,block) containing this logical block + */ +typedef struct lv_lptr_s +{ + phys_idx_t data; + __u16 __reserved; + __u32 cpu_addr; + __u32 __reserved2; + +} PACKED lv_lptr_t; + +static __inline__ int index_free (const lv_lptr_t * index) +{ + return !index->data.phys_block; +} + +static __inline__ int index_present (const lv_lptr_t * index) +{ + return index->cpu_addr; +} + + +#define HSM_LPTRS_PER_BLOCK (HSM_BLOCKSIZE/sizeof(lv_lptr_t)) +/* + * A table of pointers filling up a single block, managing + * HSM_LPTRS_PER_BLOCK logical blocks. Such block groups form + * the logical space of blocks. 
+ */ +typedef struct lv_index_block_s +{ + lv_lptr_t blocks[HSM_LPTRS_PER_BLOCK]; + +} PACKED lv_index_block_t; + +#endif + diff -urN 2.2.18/include/linux/raid/linear.h 2.2.18aa1/include/linux/raid/linear.h --- 2.2.18/include/linux/raid/linear.h Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/include/linux/raid/linear.h Mon Dec 11 17:46:31 2000 @@ -0,0 +1,32 @@ +#ifndef _LINEAR_H +#define _LINEAR_H + +#include + +struct dev_info { + kdev_t dev; + int size; + unsigned int offset; +}; + +typedef struct dev_info dev_info_t; + +struct linear_hash +{ + dev_info_t *dev0, *dev1; +}; + +struct linear_private_data +{ + struct linear_hash *hash_table; + dev_info_t disks[MD_SB_DISKS]; + dev_info_t *smallest; + int nr_zones; +}; + + +typedef struct linear_private_data linear_conf_t; + +#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private) + +#endif diff -urN 2.2.18/include/linux/raid/md.h 2.2.18aa1/include/linux/raid/md.h --- 2.2.18/include/linux/raid/md.h Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/include/linux/raid/md.h Mon Dec 11 19:23:54 2000 @@ -0,0 +1,97 @@ +/* + md.h : Multiple Devices driver for Linux + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + Copyright (C) 1994-96 Marc ZYNGIER + or + + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _MD_H +#define _MD_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +/* + * 'md_p.h' holds the 'physical' layout of RAID devices + * 'md_u.h' holds the user <=> kernel API + * + * 'md_k.h' holds kernel internal definitions + */ + +#include +#include +#include + +/* + * Different major versions are not compatible. + * Different minor versions are only downward compatible. + * Different patchlevel versions are downward and upward compatible. 
+ */ +#define MD_MAJOR_VERSION 0 +#define MD_MINOR_VERSION 90 +#define MD_PATCHLEVEL_VERSION 0 + +extern int md_size[MAX_MD_DEVS]; +extern struct hd_struct md_hd_struct[MAX_MD_DEVS]; + +extern void add_mddev_mapping (mddev_t *mddev, kdev_t dev, void *data); +extern void del_mddev_mapping (mddev_t *mddev, kdev_t dev); +extern char * partition_name (kdev_t dev); +extern int register_md_personality (int p_num, mdk_personality_t *p); +extern int unregister_md_personality (int p_num); +extern mdk_thread_t * md_register_thread (void (*run) (void *data), + void *data, const char *name); +extern void md_unregister_thread (mdk_thread_t *thread); +extern void md_wakeup_thread(mdk_thread_t *thread); +extern void md_interrupt_thread (mdk_thread_t *thread); +extern int md_update_sb (mddev_t *mddev); +extern int md_do_sync(mddev_t *mddev, mdp_disk_t *spare); +extern void md_recover_arrays (void); +extern int md_check_ordering (mddev_t *mddev); +extern void autodetect_raid(void); +extern struct gendisk * find_gendisk (kdev_t dev); +extern int md_notify_reboot(struct notifier_block *this, + unsigned long code, void *x); +#if CONFIG_BLK_DEV_MD +extern void raid_setup(char *str,int *ints) md__init; +#endif +#ifdef CONFIG_MD_BOOT +extern void md_setup(char *str,int *ints) md__init; +#endif +extern int md_error (mddev_t * mddev, kdev_t rdev); + +extern void md_print_devices (void); + +#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } + +#endif _MD_H + diff -urN 2.2.18/include/linux/raid/md_compatible.h 2.2.18aa1/include/linux/raid/md_compatible.h --- 2.2.18/include/linux/raid/md_compatible.h Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/include/linux/raid/md_compatible.h Mon Dec 11 19:23:54 2000 @@ -0,0 +1,387 @@ + +/* + md.h : Multiple Devices driver compatibility layer for Linux 2.0/2.2 + Copyright (C) 1998 Ingo Molnar + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+*/ + +#include + +#ifndef _MD_COMPATIBLE_H +#define _MD_COMPATIBLE_H + +#define LinuxVersionCode(v, p, s) (((v)<<16)+((p)<<8)+(s)) + +#if LINUX_VERSION_CODE < LinuxVersionCode(2,1,0) + +/* 000 */ +#define md__get_free_pages(x,y) __get_free_pages(x,y,GFP_KERNEL) + +#ifdef __i386__ +/* 001 */ +extern __inline__ int md_cpu_has_mmx(void) +{ + return x86_capability & 0x00800000; +} +#endif + +/* 002 */ +#define md_clear_page(page) memset((void *)(page), 0, PAGE_SIZE) + +/* 003 */ +/* + * someone please suggest a sane compatibility layer for modules + */ +#define MD_EXPORT_SYMBOL(x) + +/* 004 */ +static inline unsigned long +md_copy_from_user(void *to, const void *from, unsigned long n) +{ + int err; + + err = verify_area(VERIFY_READ,from,n); + if (!err) + memcpy_fromfs(to, from, n); + return err; +} + +/* 005 */ +extern inline unsigned long +md_copy_to_user(void *to, const void *from, unsigned long n) +{ + int err; + + err = verify_area(VERIFY_WRITE,to,n); + if (!err) + memcpy_tofs(to, from, n); + return err; +} + +/* 006 */ +#define md_put_user(x,ptr) \ +({ \ + int __err; \ + \ + __err = verify_area(VERIFY_WRITE,ptr,sizeof(*ptr)); \ + if (!__err) \ + put_user(x,ptr); \ + __err; \ +}) + +/* 007 */ +extern inline int md_capable_admin(void) +{ + return suser(); +} + +/* 008 */ +#define MD_FILE_TO_INODE(file) ((file)->f_inode) + +/* 009 */ +extern inline void md_flush_signals (void) +{ + current->signal = 0; +} + +/* 010 */ +#define __S(nr) (1<<((nr)-1)) +extern inline void md_init_signals (void) +{ + current->exit_signal = SIGCHLD; + current->blocked = ~(__S(SIGKILL)); +} +#undef __S + +/* 011 */ +extern inline unsigned long md_signal_pending (struct task_struct * tsk) +{ + return (tsk->signal & ~tsk->blocked); +} + +/* 012 */ +#define md_set_global_readahead(x) read_ahead[MD_MAJOR] = MD_READAHEAD + +/* 013 */ +#define md_mdelay(n) (\ + {unsigned long msec=(n); while (msec--) udelay(1000);}) + +/* 014 */ +#define MD_SYS_DOWN 0 +#define MD_SYS_HALT 0 +#define MD_SYS_POWER_OFF 0 + +/* 015 */ +#define md_register_reboot_notifier(x) + +/* 016 */ +extern __inline__ unsigned long +md_test_and_set_bit(int nr, void * addr) +{ + unsigned long flags; + unsigned long oldbit; + + save_flags(flags); + cli(); + oldbit = test_bit(nr,addr); + set_bit(nr,addr); + restore_flags(flags); + return oldbit; +} + +/* 017 */ +extern __inline__ unsigned long +md_test_and_clear_bit(int nr, void * addr) +{ + unsigned long flags; + unsigned long oldbit; + + save_flags(flags); + cli(); + oldbit = test_bit(nr,addr); + clear_bit(nr,addr); + restore_flags(flags); + return oldbit; +} + +/* 018 */ +#define md_atomic_read(x) (*(volatile int *)(x)) +#define md_atomic_set(x,y) (*(volatile int *)(x) = (y)) + +/* 019 */ +extern __inline__ void md_lock_kernel (void) +{ +#if __SMP__ + lock_kernel(); + syscall_count++; +#endif +} + +extern __inline__ void md_unlock_kernel (void) +{ +#if __SMP__ + syscall_count--; + unlock_kernel(); +#endif +} +/* 020 */ + +#define md__init +#define md__initdata +#define md__initfunc(__arginit) __arginit + +/* 021 */ + +/* 022 */ + +struct md_list_head { + struct md_list_head *next, *prev; +}; + +#define MD_LIST_HEAD(name) \ + struct md_list_head name = { &name, &name } + +#define MD_INIT_LIST_HEAD(ptr) do { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} while (0) + +static __inline__ void md__list_add(struct md_list_head * new, + struct md_list_head * prev, + struct md_list_head * next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +static __inline__ void 
md_list_add(struct md_list_head *new,
+					struct md_list_head *head)
+{
+	md__list_add(new, head, head->next);
+}
+
+static __inline__ void md__list_del(struct md_list_head * prev,
+				struct md_list_head * next)
+{
+	next->prev = prev;
+	prev->next = next;
+}
+
+static __inline__ void md_list_del(struct md_list_head *entry)
+{
+	md__list_del(entry->prev, entry->next);
+}
+
+static __inline__ int md_list_empty(struct md_list_head *head)
+{
+	return head->next == head;
+}
+
+#define md_list_entry(ptr, type, member) \
+	((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+
+/* 023 */
+
+static __inline__ signed long md_schedule_timeout(signed long timeout)
+{
+	current->timeout = jiffies + timeout;
+	schedule();
+	return 0;
+}
+
+/* 024 */
+#define md_need_resched(tsk)	(need_resched)
+
+/* 025 */
+typedef struct { int gcc_is_buggy; } md_spinlock_t;
+#define MD_SPIN_LOCK_UNLOCKED	(md_spinlock_t) { 0 }
+
+#define md_spin_lock_irq	cli
+#define md_spin_unlock_irq	sti
+#define md_spin_unlock_irqrestore(x,flags)	restore_flags(flags)
+#define md_spin_lock_irqsave(x,flags)	do { save_flags(flags); cli(); } while (0)
+
+/* END */
+
+#else
+
+#include
+#include
+
+/* 000 */
+#define md__get_free_pages(x,y) __get_free_pages(x,y)
+
+#ifdef __i386__
+/* 001 */
+extern __inline__ int md_cpu_has_mmx(void)
+{
+	return boot_cpu_data.x86_capability & X86_FEATURE_MMX;
+}
+#endif
+
+/* 002 */
+#define md_clear_page(page)	clear_page(page)
+
+/* 003 */
+#define MD_EXPORT_SYMBOL(x) EXPORT_SYMBOL(x)
+
+/* 004 */
+#define md_copy_to_user(x,y,z) copy_to_user(x,y,z)
+
+/* 005 */
+#define md_copy_from_user(x,y,z) copy_from_user(x,y,z)
+
+/* 006 */
+#define md_put_user put_user
+
+/* 007 */
+extern inline int md_capable_admin(void)
+{
+	return capable(CAP_SYS_ADMIN);
+}
+
+/* 008 */
+#define MD_FILE_TO_INODE(file) ((file)->f_dentry->d_inode)
+
+/* 009 */
+extern inline void md_flush_signals (void)
+{
+	spin_lock(&current->sigmask_lock);
+	flush_signals(current);
+	spin_unlock(&current->sigmask_lock);
+}
+
+/* 010 */
+extern inline void md_init_signals (void)
+{
+	current->exit_signal = SIGCHLD;
+	siginitsetinv(&current->blocked, sigmask(SIGKILL));
+}
+
+/* 011 */
+#define md_signal_pending signal_pending
+
+/* 012 */
+extern inline void md_set_global_readahead(int * table)
+{
+	max_readahead[MD_MAJOR] = table;
+}
+
+/* 013 */
+#define md_mdelay(x) mdelay(x)
+
+/* 014 */
+#define MD_SYS_DOWN SYS_DOWN
+#define MD_SYS_HALT SYS_HALT
+#define MD_SYS_POWER_OFF SYS_POWER_OFF
+
+/* 015 */
+#define md_register_reboot_notifier register_reboot_notifier
+
+/* 016 */
+#define md_test_and_set_bit test_and_set_bit
+
+/* 017 */
+#define md_test_and_clear_bit test_and_clear_bit
+
+/* 018 */
+#define md_atomic_read atomic_read
+#define md_atomic_set atomic_set
+
+/* 019 */
+#define md_lock_kernel lock_kernel
+#define md_unlock_kernel unlock_kernel
+
+/* 020 */
+
+#include <linux/init.h>
+
+#define md__init __init
+#define md__initdata __initdata
+#define md__initfunc(__arginit) __initfunc(__arginit)
+
+/* 021 */
+
+
+/* 022 */
+
+#define md_list_head list_head
+#define MD_LIST_HEAD(name) LIST_HEAD(name)
+#define MD_INIT_LIST_HEAD(ptr) INIT_LIST_HEAD(ptr)
+#define md_list_add list_add
+#define md_list_del list_del
+#define md_list_empty list_empty
+
+#define md_list_entry(ptr, type, member) list_entry(ptr, type, member)
+
+/* 023 */
+
+#define md_schedule_timeout schedule_timeout
+
+/* 024 */
+#define md_need_resched(tsk) ((tsk)->need_resched)
+
+/* 025 */
+#define md_spinlock_t spinlock_t
+#define MD_SPIN_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED
+
+#define md_spin_lock_irq spin_lock_irq
+#define md_spin_unlock_irq spin_unlock_irq
+#define md_spin_unlock_irqrestore spin_unlock_irqrestore
+#define md_spin_lock_irqsave spin_lock_irqsave
+
+/* END */
+
+#endif
+
+#endif /* _MD_COMPATIBLE_H */
+
diff -urN 2.2.18/include/linux/raid/md_k.h 2.2.18aa1/include/linux/raid/md_k.h
--- 2.2.18/include/linux/raid/md_k.h	Thu Jan 1 01:00:00 1970
+++ 2.2.18aa1/include/linux/raid/md_k.h	Mon Dec 11 17:20:54 2000
@@ -0,0 +1,338 @@
+/*
+  md_k.h : kernel internal structure of the Linux MD driver
+          Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2, or (at your option)
+  any later version.
+
+  You should have received a copy of the GNU General Public License
+  (for example /usr/src/linux/COPYING); if not, write to the Free
+  Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#ifndef _MD_K_H
+#define _MD_K_H
+
+#define MD_RESERVED       0UL
+#define LINEAR            1UL
+#define STRIPED           2UL
+#define RAID0             STRIPED
+#define RAID1             3UL
+#define RAID5             4UL
+#define TRANSLUCENT       5UL
+#define HSM               6UL
+#define MAX_PERSONALITY   7UL
+
+extern inline int pers_to_level (int pers)
+{
+	switch (pers) {
+		case HSM:		return -3;
+		case TRANSLUCENT:	return -2;
+		case LINEAR:		return -1;
+		case RAID0:		return 0;
+		case RAID1:		return 1;
+		case RAID5:		return 5;
+	}
+	panic("pers_to_level()");
+}
+
+extern inline int level_to_pers (int level)
+{
+	switch (level) {
+		case -3: return HSM;
+		case -2: return TRANSLUCENT;
+		case -1: return LINEAR;
+		case 0: return RAID0;
+		case 1: return RAID1;
+		case 4:
+		case 5: return RAID5;
+	}
+	return MD_RESERVED;
+}
+
+typedef struct mddev_s mddev_t;
+typedef struct mdk_rdev_s mdk_rdev_t;
+
+#if (MINORBITS != 8)
+#error MD doesnt handle bigger kdev yet
+#endif
+
+#define MAX_REAL     12		/* Max number of disks per md dev */
+#define MAX_MD_DEVS  (1<<MINORBITS)
+
+extern inline int disk_faulty(mdp_disk_t * d)
+{
+	return d->state & (1 << MD_DISK_FAULTY);
+}
+
+extern inline int disk_active(mdp_disk_t * d)
+{
+	return d->state & (1 << MD_DISK_ACTIVE);
+}
+
+extern inline int disk_sync(mdp_disk_t * d)
+{
+	return d->state & (1 << MD_DISK_SYNC);
+}
+
+extern inline int disk_spare(mdp_disk_t * d)
+{
+	return !disk_sync(d) && !disk_active(d) && !disk_faulty(d);
+}
+
+extern inline int disk_removed(mdp_disk_t * d)
+{
+	return d->state & (1 << MD_DISK_REMOVED);
+}
+
+extern inline void mark_disk_faulty(mdp_disk_t * d)
+{
+	d->state |= (1 << MD_DISK_FAULTY);
+}
+
+extern inline void mark_disk_active(mdp_disk_t * d)
+{
+	d->state |= (1 << MD_DISK_ACTIVE);
+}
+
+extern inline void mark_disk_sync(mdp_disk_t * d)
+{
+	d->state |= (1 << MD_DISK_SYNC);
+}
+
+extern inline void mark_disk_spare(mdp_disk_t * d)
+{
+	d->state = 0;
+}
+
+extern inline void mark_disk_removed(mdp_disk_t * d)
+{
+	d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED);
+}
+
+extern inline void mark_disk_inactive(mdp_disk_t * d)
+{
+	d->state &= ~(1 << MD_DISK_ACTIVE);
+}
+
+extern inline void mark_disk_nonsync(mdp_disk_t * d)
+{
+	d->state &= ~(1 << MD_DISK_SYNC);
+}
+
+/*
+ * MD's 'extended' device
+ */
+struct mdk_rdev_s
+{
+	struct md_list_head same_set;	/* RAID devices within the same set */
+	struct md_list_head all;	/* all RAID devices */
+	struct md_list_head pending;	/* undetected RAID devices */
+
+	kdev_t dev;			/* Device number */
+	kdev_t old_dev;			/* "" when it was last imported */
+	int size;			/* Device size (in blocks) */
+	mddev_t *mddev;			/* RAID array if running */
+	unsigned
long last_events; /* IO event timestamp */ + + struct inode *inode; /* Lock inode */ + struct file filp; /* Lock file */ + + mdp_super_t *sb; + int sb_offset; + + int faulty; /* if faulty do not issue IO requests */ + int desc_nr; /* descriptor index in the superblock */ +}; + + +/* + * disk operations in a working array: + */ +#define DISKOP_SPARE_INACTIVE 0 +#define DISKOP_SPARE_WRITE 1 +#define DISKOP_SPARE_ACTIVE 2 +#define DISKOP_HOT_REMOVE_DISK 3 +#define DISKOP_HOT_ADD_DISK 4 + +typedef struct mdk_personality_s mdk_personality_t; + +struct mddev_s +{ + void *private; + mdk_personality_t *pers; + int __minor; + mdp_super_t *sb; + int nb_dev; + struct md_list_head disks; + int sb_dirty; + mdu_param_t param; + int ro; + unsigned int curr_resync; + unsigned long resync_start; + char *name; + int recovery_running; + struct semaphore reconfig_sem; + struct semaphore recovery_sem; + struct semaphore resync_sem; + struct md_list_head all_mddevs; +}; + +struct mdk_personality_s +{ + char *name; + int (*map)(mddev_t *mddev, kdev_t dev, kdev_t *rdev, + unsigned long *rsector, unsigned long size); + int (*make_request)(mddev_t *mddev, int rw, struct buffer_head * bh); + void (*end_request)(struct buffer_head * bh, int uptodate); + int (*run)(mddev_t *mddev); + int (*stop)(mddev_t *mddev); + int (*status)(char *page, mddev_t *mddev); + int (*ioctl)(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg); + int max_invalid_dev; + int (*error_handler)(mddev_t *mddev, kdev_t dev); + +/* + * Some personalities (RAID-1, RAID-5) can have disks hot-added and + * hot-removed. Hot removal is different from failure. (failure marks + * a disk inactive, but the disk is still part of the array) The interface + * to such operations is the 'pers->diskop()' function, can be NULL. + * + * the diskop function can change the pointer pointing to the incoming + * descriptor, but must do so very carefully. (currently only + * SPARE_ACTIVE expects such a change) + */ + int (*diskop) (mddev_t *mddev, mdp_disk_t **descriptor, int state); + + int (*stop_resync)(mddev_t *mddev); + int (*restart_resync)(mddev_t *mddev); +}; + + +/* + * Currently we index md_array directly, based on the minor + * number. This will have to change to dynamic allocation + * once we start supporting partitioning of md devices. + */ +extern inline int mdidx (mddev_t * mddev) +{ + return mddev->__minor; +} + +extern inline kdev_t mddev_to_kdev(mddev_t * mddev) +{ + return MKDEV(MD_MAJOR, mdidx(mddev)); +} + +extern mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev); +extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr); + +/* + * iterates through some rdev ringlist. It's safe to remove the + * current 'rdev'. Dont touch 'tmp' though. + */ +#define ITERATE_RDEV_GENERIC(head,field,rdev,tmp) \ + \ + for (tmp = head.next; \ + rdev = md_list_entry(tmp, mdk_rdev_t, field), \ + tmp = tmp->next, tmp->prev != &head \ + ; ) +/* + * iterates through the 'same array disks' ringlist + */ +#define ITERATE_RDEV(mddev,rdev,tmp) \ + ITERATE_RDEV_GENERIC((mddev)->disks,same_set,rdev,tmp) + +/* + * Same as above, but assumes that the device has rdev->desc_nr numbered + * from 0 to mddev->nb_dev, and iterates through rdevs in ascending order. 
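
For instance, walking the members of one array with the iterator above -- a sketch, not from the patch; the loop pre-fetches 'tmp' precisely so that the body may safely remove the current 'rdev':

	static int count_faulty_disks (mddev_t *mddev)
	{
		mdk_rdev_t *rdev;
		struct md_list_head *tmp;
		int faulty = 0;

		ITERATE_RDEV(mddev,rdev,tmp)
			if (rdev->faulty)
				faulty++;
		return faulty;
	}
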
+ */ +#define ITERATE_RDEV_ORDERED(mddev,rdev,i) \ + for (i = 0; rdev = find_rdev_nr(mddev, i), i < mddev->nb_dev; i++) + + +/* + * Iterates through all 'RAID managed disks' + */ +#define ITERATE_RDEV_ALL(rdev,tmp) \ + ITERATE_RDEV_GENERIC(all_raid_disks,all,rdev,tmp) + +/* + * Iterates through 'pending RAID disks' + */ +#define ITERATE_RDEV_PENDING(rdev,tmp) \ + ITERATE_RDEV_GENERIC(pending_raid_disks,pending,rdev,tmp) + +/* + * iterates through all used mddevs in the system. + */ +#define ITERATE_MDDEV(mddev,tmp) \ + \ + for (tmp = all_mddevs.next; \ + mddev = md_list_entry(tmp, mddev_t, all_mddevs), \ + tmp = tmp->next, tmp->prev != &all_mddevs \ + ; ) + +extern inline int lock_mddev (mddev_t * mddev) +{ + return down_interruptible(&mddev->reconfig_sem); +} + +extern inline void unlock_mddev (mddev_t * mddev) +{ + up(&mddev->reconfig_sem); +} + +#define xchg_values(x,y) do { __typeof__(x) __tmp = x; \ + x = y; y = __tmp; } while (0) + +typedef struct mdk_thread_s { + void (*run) (void *data); + void *data; + struct wait_queue *wqueue; + unsigned long flags; + struct semaphore *sem; + struct task_struct *tsk; + const char *name; +} mdk_thread_t; + +#define THREAD_WAKEUP 0 + +typedef struct dev_name_s { + struct md_list_head list; + kdev_t dev; + char name [MAX_DISKNAME_LEN]; +} dev_name_t; + +#endif _MD_K_H + diff -urN 2.2.18/include/linux/raid/md_p.h 2.2.18aa1/include/linux/raid/md_p.h --- 2.2.18/include/linux/raid/md_p.h Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/include/linux/raid/md_p.h Mon Dec 11 17:20:54 2000 @@ -0,0 +1,161 @@ +/* + md_p.h : physical layout of Linux RAID devices + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _MD_P_H +#define _MD_P_H + +/* + * RAID superblock. + * + * The RAID superblock maintains some statistics on each RAID configuration. + * Each real device in the RAID set contains it near the end of the device. + * Some of the ideas are copied from the ext2fs implementation. + * + * We currently use 4096 bytes as follows: + * + * word offset function + * + * 0 - 31 Constant generic RAID device information. + * 32 - 63 Generic state information. + * 64 - 127 Personality specific information. + * 128 - 511 12 32-words descriptors of the disks in the raid set. + * 512 - 911 Reserved. + * 912 - 1023 Disk specific descriptor. + */ + +/* + * If x is the real device size in bytes, we return an apparent size of: + * + * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES + * + * and place the 4kB superblock at offset y. 
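
A quick worked example of that formula, using the 64kB reservation defined just below (128 sectors of 512 bytes): for a 10000-sector partition,

	y = (10000 & ~127) - 128 = 9984 - 128 = 9856

so the array sees an apparent size of 9856 sectors, the 4kB superblock occupies sectors 9856-9863, the rest of the 128-sector reserved area is unused, and the 16 sectors above the last 128-sector boundary are simply ignored.
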
+ */
+#define MD_RESERVED_BYTES		(64 * 1024)
+#define MD_RESERVED_SECTORS		(MD_RESERVED_BYTES / 512)
+#define MD_RESERVED_BLOCKS		(MD_RESERVED_BYTES / BLOCK_SIZE)
+
+#define MD_NEW_SIZE_SECTORS(x)		((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
+#define MD_NEW_SIZE_BLOCKS(x)		((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
+
+#define MD_SB_BYTES			4096
+#define MD_SB_WORDS			(MD_SB_BYTES / 4)
+#define MD_SB_BLOCKS			(MD_SB_BYTES / BLOCK_SIZE)
+#define MD_SB_SECTORS			(MD_SB_BYTES / 512)
+
+/*
+ * The following are counted in 32-bit words
+ */
+#define MD_SB_GENERIC_OFFSET		0
+#define MD_SB_PERSONALITY_OFFSET	64
+#define MD_SB_DISKS_OFFSET		128
+#define MD_SB_DESCRIPTOR_OFFSET		992
+
+#define MD_SB_GENERIC_CONSTANT_WORDS	32
+#define MD_SB_GENERIC_STATE_WORDS	32
+#define MD_SB_GENERIC_WORDS		(MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
+#define MD_SB_PERSONALITY_WORDS		64
+#define MD_SB_DISKS_WORDS		384
+#define MD_SB_DESCRIPTOR_WORDS		32
+#define MD_SB_RESERVED_WORDS		(1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
+#define MD_SB_EQUAL_WORDS		(MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
+#define MD_SB_DISKS			(MD_SB_DISKS_WORDS / MD_SB_DESCRIPTOR_WORDS)
+
+/*
+ * Device "operational" state bits
+ */
+#define MD_DISK_FAULTY		0 /* disk is faulty / operational */
+#define MD_DISK_ACTIVE		1 /* disk is running or spare disk */
+#define MD_DISK_SYNC		2 /* disk is in sync with the raid set */
+#define MD_DISK_REMOVED		3 /* disk has been removed from the raid set */
+
+typedef struct mdp_device_descriptor_s {
+	__u32 number;		/* 0 Device number in the entire set */
+	__u32 major;		/* 1 Device major number */
+	__u32 minor;		/* 2 Device minor number */
+	__u32 raid_disk;	/* 3 The role of the device in the raid set */
+	__u32 state;		/* 4 Operational state */
+	__u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
+} mdp_disk_t;
+
+#define MD_SB_MAGIC		0xa92b4efc
+
+/*
+ * Superblock state bits
+ */
+#define MD_SB_CLEAN		0
+#define MD_SB_ERRORS		1
+
+typedef struct mdp_superblock_s {
+	/*
+	 * Constant generic information
+	 */
+	__u32 md_magic;		/*  0 MD identifier */
+	__u32 major_version;	/*  1 major version to which the set conforms */
+	__u32 minor_version;	/*  2 minor version ... */
+	__u32 patch_version;	/*  3 patchlevel version ... */
+	__u32 gvalid_words;	/*  4 Number of used words in this section */
+	__u32 set_uuid0;	/*  5 Raid set identifier */
+	__u32 ctime;		/*  6 Creation time */
+	__u32 level;		/*  7 Raid personality */
+	__u32 size;		/*  8 Apparent size of each individual disk */
+	__u32 nr_disks;		/*  9 total disks in the raid set */
+	__u32 raid_disks;	/* 10 disks in a fully functional raid set */
+	__u32 md_minor;		/* 11 preferred MD minor device number */
+	__u32 not_persistent;	/* 12 does it have a persistent superblock */
+	__u32 set_uuid1;	/* 13 Raid set identifier #2 */
+	__u32 set_uuid2;	/* 14 Raid set identifier #3 */
+	__u32 set_uuid3;	/* 15 Raid set identifier #4 */
+	__u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16];
+
+	/*
+	 * Generic state information
+	 */
+	__u32 utime;		/* 0 Superblock update time */
+	__u32 state;		/* 1 State bits (clean, ...) */
+	__u32 active_disks;	/* 2 Number of currently active disks */
+	__u32 working_disks;	/* 3 Number of working disks */
+	__u32 failed_disks;	/* 4 Number of failed disks */
+	__u32 spare_disks;	/* 5 Number of spare disks */
+	__u32 sb_csum;		/* 6 checksum of the whole superblock */
+	__u64 events;		/* 7 number of superblock updates (64-bit!)
*/ + __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 9]; + + /* + * Personality information + */ + __u32 layout; /* 0 the array's physical layout */ + __u32 chunk_size; /* 1 chunk size in bytes */ + __u32 root_pv; /* 2 LV root PV */ + __u32 root_block; /* 3 LV root block */ + __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4]; + + /* + * Disks information + */ + mdp_disk_t disks[MD_SB_DISKS]; + + /* + * Reserved + */ + __u32 reserved[MD_SB_RESERVED_WORDS]; + + /* + * Active descriptor + */ + mdp_disk_t this_disk; + +} mdp_super_t; + +#endif _MD_P_H + diff -urN 2.2.18/include/linux/raid/md_u.h 2.2.18aa1/include/linux/raid/md_u.h --- 2.2.18/include/linux/raid/md_u.h Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/include/linux/raid/md_u.h Mon Dec 11 17:20:54 2000 @@ -0,0 +1,115 @@ +/* + md_u.h : user <=> kernel API between Linux raidtools and RAID drivers + Copyright (C) 1998 Ingo Molnar + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _MD_U_H +#define _MD_U_H + +/* ioctls */ + +/* status */ +#define RAID_VERSION _IOR (MD_MAJOR, 0x10, mdu_version_t) +#define GET_ARRAY_INFO _IOR (MD_MAJOR, 0x11, mdu_array_info_t) +#define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t) +#define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13) + +/* configuration */ +#define CLEAR_ARRAY _IO (MD_MAJOR, 0x20) +#define ADD_NEW_DISK _IOW (MD_MAJOR, 0x21, mdu_disk_info_t) +#define HOT_REMOVE_DISK _IO (MD_MAJOR, 0x22) +#define SET_ARRAY_INFO _IOW (MD_MAJOR, 0x23, mdu_array_info_t) +#define SET_DISK_INFO _IO (MD_MAJOR, 0x24) +#define WRITE_RAID_INFO _IO (MD_MAJOR, 0x25) +#define UNPROTECT_ARRAY _IO (MD_MAJOR, 0x26) +#define PROTECT_ARRAY _IO (MD_MAJOR, 0x27) +#define HOT_ADD_DISK _IO (MD_MAJOR, 0x28) +#define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29) + +/* usage */ +#define RUN_ARRAY _IOW (MD_MAJOR, 0x30, mdu_param_t) +#define START_ARRAY _IO (MD_MAJOR, 0x31) +#define STOP_ARRAY _IO (MD_MAJOR, 0x32) +#define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33) +#define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34) + +typedef struct mdu_version_s { + int major; + int minor; + int patchlevel; +} mdu_version_t; + +typedef struct mdu_array_info_s { + /* + * Generic constant information + */ + int major_version; + int minor_version; + int patch_version; + int ctime; + int level; + int size; + int nr_disks; + int raid_disks; + int md_minor; + int not_persistent; + + /* + * Generic state information + */ + int utime; /* 0 Superblock update time */ + int state; /* 1 State bits (clean, ...) 
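
These status ioctls are the half of the API that raidtools calls against an open array device; a minimal userspace sketch (illustrative only: the header set and the /dev/md0 path are assumptions, and error handling is collapsed into perror):

	#include <stdio.h>
	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <linux/raid/md_u.h>	/* plus md_p.h/major.h for MD_MAJOR */

	int main (void)
	{
		mdu_array_info_t info;
		int fd = open ("/dev/md0", O_RDONLY);

		if (fd < 0 || ioctl (fd, GET_ARRAY_INFO, &info) < 0) {
			perror ("GET_ARRAY_INFO");
			return 1;
		}
		printf ("raid level %d: %d/%d disks active, %d spare\n",
			info.level, info.active_disks, info.raid_disks,
			info.spare_disks);
		return 0;
	}
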
*/ + int active_disks; /* 2 Number of currently active disks */ + int working_disks; /* 3 Number of working disks */ + int failed_disks; /* 4 Number of failed disks */ + int spare_disks; /* 5 Number of spare disks */ + + /* + * Personality information + */ + int layout; /* 0 the array's physical layout */ + int chunk_size; /* 1 chunk size in bytes */ + +} mdu_array_info_t; + +typedef struct mdu_disk_info_s { + /* + * configuration/status of one particular disk + */ + int number; + int major; + int minor; + int raid_disk; + int state; + +} mdu_disk_info_t; + +typedef struct mdu_start_info_s { + /* + * configuration/status of one particular disk + */ + int major; + int minor; + int raid_disk; + int state; + +} mdu_start_info_t; + +typedef struct mdu_param_s +{ + int personality; /* 1,2,3,4 */ + int chunk_size; /* in bytes */ + int max_fault; /* unused for now */ +} mdu_param_t; + +#endif _MD_U_H + diff -urN 2.2.18/include/linux/raid/raid0.h 2.2.18aa1/include/linux/raid/raid0.h --- 2.2.18/include/linux/raid/raid0.h Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/include/linux/raid/raid0.h Mon Dec 11 17:46:34 2000 @@ -0,0 +1,33 @@ +#ifndef _RAID0_H +#define _RAID0_H + +#include + +struct strip_zone +{ + int zone_offset; /* Zone offset in md_dev */ + int dev_offset; /* Zone offset in real dev */ + int size; /* Zone size */ + int nb_dev; /* # of devices attached to the zone */ + mdk_rdev_t *dev[MAX_REAL]; /* Devices attached to the zone */ +}; + +struct raid0_hash +{ + struct strip_zone *zone0, *zone1; +}; + +struct raid0_private_data +{ + struct raid0_hash *hash_table; /* Dynamically allocated */ + struct strip_zone *strip_zone; /* This one too */ + int nr_strip_zones; + struct strip_zone *smallest; + int nr_zones; +}; + +typedef struct raid0_private_data raid0_conf_t; + +#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private) + +#endif diff -urN 2.2.18/include/linux/raid/raid1.h 2.2.18aa1/include/linux/raid/raid1.h --- 2.2.18/include/linux/raid/raid1.h Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/include/linux/raid/raid1.h Mon Dec 11 17:46:35 2000 @@ -0,0 +1,64 @@ +#ifndef _RAID1_H +#define _RAID1_H + +#include + +struct mirror_info { + int number; + int raid_disk; + kdev_t dev; + int next; + int sect_limit; + + /* + * State bits: + */ + int operational; + int write_only; + int spare; + + int used_slot; +}; + +struct raid1_private_data { + mddev_t *mddev; + struct mirror_info mirrors[MD_SB_DISKS]; + int nr_disks; + int raid_disks; + int working_disks; + int last_used; + unsigned long next_sect; + int sect_count; + mdk_thread_t *thread, *resync_thread; + int resync_mirrors; + struct mirror_info *spare; +}; + +typedef struct raid1_private_data raid1_conf_t; + +/* + * this is the only point in the RAID code where we violate + * C type safety. mddev->private is an 'opaque' pointer. + */ +#define mddev_to_conf(mddev) ((raid1_conf_t *) mddev->private) + +/* + * this is our 'private' 'collective' RAID1 buffer head. 
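
The opaque-pointer convention is what every personality's mddev_to_conf() wrapper encodes; as a sketch (an invented helper, not from the patch), a RAID1 method recovers its private state like this:

	static int raid1_missing_mirrors (mddev_t *mddev)
	{
		raid1_conf_t *conf = mddev_to_conf(mddev);

		/* conf now points at the raid1_private_data above */
		return conf->raid_disks - conf->working_disks;
	}
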
+ * it contains information about what kind of IO operations were started + * for this RAID1 operation, and about their status: + */ + +struct raid1_bh { + atomic_t remaining; /* 'have we finished' count, + * used from IRQ handlers + */ + int cmd; + unsigned long state; + mddev_t *mddev; + struct buffer_head *master_bh; + struct buffer_head *mirror_bh [MD_SB_DISKS]; + struct buffer_head bh_req; + struct buffer_head *next_retry; +}; + +#endif diff -urN 2.2.18/include/linux/raid/raid5.h 2.2.18aa1/include/linux/raid/raid5.h --- 2.2.18/include/linux/raid/raid5.h Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/include/linux/raid/raid5.h Mon Dec 11 17:46:37 2000 @@ -0,0 +1,113 @@ +#ifndef _RAID5_H +#define _RAID5_H + +#include +#include + +struct disk_info { + kdev_t dev; + int operational; + int number; + int raid_disk; + int write_only; + int spare; + int used_slot; +}; + +struct stripe_head { + md_spinlock_t stripe_lock; + struct stripe_head *hash_next, **hash_pprev; /* hash pointers */ + struct stripe_head *free_next; /* pool of free sh's */ + struct buffer_head *buffer_pool; /* pool of free buffers */ + struct buffer_head *bh_pool; /* pool of free bh's */ + struct raid5_private_data *raid_conf; + struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */ + struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */ + struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */ + struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */ + int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */ + int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */ + unsigned long sector; /* sector of this row */ + int size; /* buffers size */ + int pd_idx; /* parity disk index */ + atomic_t nr_pending; /* nr of pending cmds */ + unsigned long state; /* state flags */ + int cmd; /* stripe cmd */ + int count; /* nr of waiters */ + int write_method; /* reconstruct-write / read-modify-write */ + int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */ + struct wait_queue *wait; /* processes waiting for this stripe */ +}; + +/* + * Phase + */ +#define PHASE_BEGIN 0 +#define PHASE_READ_OLD 1 +#define PHASE_WRITE 2 +#define PHASE_READ 3 +#define PHASE_COMPLETE 4 + +/* + * Write method + */ +#define METHOD_NONE 0 +#define RECONSTRUCT_WRITE 1 +#define READ_MODIFY_WRITE 2 + +/* + * Stripe state + */ +#define STRIPE_LOCKED 0 +#define STRIPE_ERROR 1 + +/* + * Stripe commands + */ +#define STRIPE_NONE 0 +#define STRIPE_WRITE 1 +#define STRIPE_READ 2 + +struct raid5_private_data { + struct stripe_head **stripe_hashtbl; + mddev_t *mddev; + mdk_thread_t *thread, *resync_thread; + struct disk_info disks[MD_SB_DISKS]; + struct disk_info *spare; + int buffer_size; + int chunk_size, level, algorithm; + int raid_disks, working_disks, failed_disks; + int sector_count; + unsigned long next_sector; + atomic_t nr_handle; + struct stripe_head *next_free_stripe; + int nr_stripes; + int resync_parity; + int max_nr_stripes; + int clock; + int nr_hashed_stripes; + int nr_locked_stripes; + int nr_pending_stripes; + int nr_cached_stripes; + + /* + * Free stripes pool + */ + int nr_free_sh; + struct stripe_head *free_sh_list; + struct wait_queue *wait_for_stripe; +}; + +typedef struct raid5_private_data raid5_conf_t; + +#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private) + +/* + * Our supported algorithms + */ +#define ALGORITHM_LEFT_ASYMMETRIC 0 +#define ALGORITHM_RIGHT_ASYMMETRIC 1 +#define 
ALGORITHM_LEFT_SYMMETRIC 2 +#define ALGORITHM_RIGHT_SYMMETRIC 3 + +#endif diff -urN 2.2.18/include/linux/raid/translucent.h 2.2.18aa1/include/linux/raid/translucent.h --- 2.2.18/include/linux/raid/translucent.h Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/include/linux/raid/translucent.h Mon Dec 11 17:20:54 2000 @@ -0,0 +1,23 @@ +#ifndef _TRANSLUCENT_H +#define _TRANSLUCENT_H + +#include + +typedef struct dev_info dev_info_t; + +struct dev_info { + kdev_t dev; + int size; +}; + +struct translucent_private_data +{ + dev_info_t disks[MD_SB_DISKS]; +}; + + +typedef struct translucent_private_data translucent_conf_t; + +#define mddev_to_conf(mddev) ((translucent_conf_t *) mddev->private) + +#endif diff -urN 2.2.18/include/linux/raid/xor.h 2.2.18aa1/include/linux/raid/xor.h --- 2.2.18/include/linux/raid/xor.h Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/include/linux/raid/xor.h Mon Dec 11 17:33:12 2000 @@ -0,0 +1,12 @@ +#ifndef _XOR_H +#define _XOR_H + +#include + +#define MAX_XOR_BLOCKS 5 + +extern void calibrate_xor_block(void); +extern void (*xor_block)(unsigned int count, + struct buffer_head **bh_ptr); + +#endif diff -urN 2.2.18/include/linux/raid0.h 2.2.18aa1/include/linux/raid0.h --- 2.2.18/include/linux/raid0.h Tue May 25 00:49:32 1999 +++ 2.2.18aa1/include/linux/raid0.h Thu Jan 1 01:00:00 1970 @@ -1,27 +0,0 @@ -#ifndef _RAID0_H -#define _RAID0_H - -struct strip_zone -{ - int zone_offset; /* Zone offset in md_dev */ - int dev_offset; /* Zone offset in real dev */ - int size; /* Zone size */ - int nb_dev; /* Number of devices attached to the zone */ - struct real_dev *dev[MAX_REAL]; /* Devices attached to the zone */ -}; - -struct raid0_hash -{ - struct strip_zone *zone0, *zone1; -}; - -struct raid0_data -{ - struct raid0_hash *hash_table; /* Dynamically allocated */ - struct strip_zone *strip_zone; /* This one too */ - int nr_strip_zones; - struct strip_zone *smallest; - int nr_zones; -}; - -#endif diff -urN 2.2.18/include/linux/raid1.h 2.2.18aa1/include/linux/raid1.h --- 2.2.18/include/linux/raid1.h Wed Nov 29 19:55:36 2000 +++ 2.2.18aa1/include/linux/raid1.h Thu Jan 1 01:00:00 1970 @@ -1,49 +0,0 @@ -#ifndef _RAID1_H -#define _RAID1_H - -#include - -struct mirror_info { - int number; - int raid_disk; - kdev_t dev; - int next; - int sect_limit; - - /* - * State bits: - */ - int operational; - int write_only; - int spare; -}; - -struct raid1_data { - struct md_dev *mddev; - struct mirror_info mirrors[MD_SB_DISKS]; /* RAID1 devices, 2 to MD_SB_DISKS */ - int raid_disks; - int working_disks; /* Number of working disks */ - int last_used; - unsigned long next_sect; - int sect_count; - int resync_running; -}; - -/* - * this is our 'private' 'collective' RAID1 buffer head. 
- * it contains information about what kind of IO operations were started - * for this RAID5 operation, and about their status: - */ - -struct raid1_bh { - unsigned int remaining; - int cmd; - unsigned long state; - struct md_dev *mddev; - struct buffer_head *master_bh; - struct buffer_head *mirror_bh [MD_SB_DISKS]; - struct buffer_head bh_req; - struct buffer_head *next_retry; -}; - -#endif diff -urN 2.2.18/include/linux/raid5.h 2.2.18aa1/include/linux/raid5.h --- 2.2.18/include/linux/raid5.h Wed Nov 29 19:55:37 2000 +++ 2.2.18aa1/include/linux/raid5.h Thu Jan 1 01:00:00 1970 @@ -1,110 +0,0 @@ -#ifndef _RAID5_H -#define _RAID5_H - -#ifdef __KERNEL__ -#include -#include - -struct disk_info { - kdev_t dev; - int operational; - int number; - int raid_disk; - int write_only; - int spare; -}; - -struct stripe_head { - struct stripe_head *hash_next, **hash_pprev; /* hash pointers */ - struct stripe_head *free_next; /* pool of free sh's */ - struct buffer_head *buffer_pool; /* pool of free buffers */ - struct buffer_head *bh_pool; /* pool of free bh's */ - struct raid5_data *raid_conf; - struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */ - struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */ - struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */ - struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */ - int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */ - int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */ - unsigned long sector; /* sector of this row */ - int size; /* buffers size */ - int pd_idx; /* parity disk index */ - int nr_pending; /* nr of pending cmds */ - unsigned long state; /* state flags */ - int cmd; /* stripe cmd */ - int count; /* nr of waiters */ - int write_method; /* reconstruct-write / read-modify-write */ - int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */ - struct wait_queue *wait; /* processes waiting for this stripe */ -}; - -/* - * Phase - */ -#define PHASE_BEGIN 0 -#define PHASE_READ_OLD 1 -#define PHASE_WRITE 2 -#define PHASE_READ 3 -#define PHASE_COMPLETE 4 - -/* - * Write method - */ -#define METHOD_NONE 0 -#define RECONSTRUCT_WRITE 1 -#define READ_MODIFY_WRITE 2 - -/* - * Stripe state - */ -#define STRIPE_LOCKED 0 -#define STRIPE_ERROR 1 - -/* - * Stripe commands - */ -#define STRIPE_NONE 0 -#define STRIPE_WRITE 1 -#define STRIPE_READ 2 - -struct raid5_data { - struct stripe_head **stripe_hashtbl; - struct md_dev *mddev; - struct md_thread *thread, *resync_thread; - struct disk_info disks[MD_SB_DISKS]; - struct disk_info *spare; - int buffer_size; - int chunk_size, level, algorithm; - int raid_disks, working_disks, failed_disks; - int sector_count; - unsigned long next_sector; - atomic_t nr_handle; - struct stripe_head *next_free_stripe; - int nr_stripes; - int resync_parity; - int max_nr_stripes; - int clock; - int nr_hashed_stripes; - int nr_locked_stripes; - int nr_pending_stripes; - int nr_cached_stripes; - - /* - * Free stripes pool - */ - int nr_free_sh; - struct stripe_head *free_sh_list; - struct wait_queue *wait_for_stripe; -}; - -#endif - -/* - * Our supported algorithms - */ -#define ALGORITHM_LEFT_ASYMMETRIC 0 -#define ALGORITHM_RIGHT_ASYMMETRIC 1 -#define ALGORITHM_LEFT_SYMMETRIC 2 -#define ALGORITHM_RIGHT_SYMMETRIC 3 - -#endif diff -urN 2.2.18/include/linux/raw.h 2.2.18aa1/include/linux/raw.h --- 2.2.18/include/linux/raw.h Thu Jan 1 01:00:00 1970 +++ 
2.2.18aa1/include/linux/raw.h Mon Dec 11 17:36:43 2000 @@ -0,0 +1,23 @@ +#ifndef __LINUX_RAW_H +#define __LINUX_RAW_H + +#include + +#define RAW_SETBIND _IO( 0xac, 0 ) +#define RAW_GETBIND _IO( 0xac, 1 ) + +struct raw_config_request +{ + int raw_minor; + __u64 block_major; + __u64 block_minor; +}; + +#ifdef __KERNEL__ + +/* drivers/char/raw.c */ +extern void raw_init(void); + +#endif /* __KERNEL__ */ + +#endif /* __LINUX_RAW_H */ diff -urN 2.2.18/include/linux/sched.h 2.2.18aa1/include/linux/sched.h --- 2.2.18/include/linux/sched.h Mon Dec 11 16:58:06 2000 +++ 2.2.18aa1/include/linux/sched.h Mon Dec 11 19:23:35 2000 @@ -79,6 +79,7 @@ #define TASK_ZOMBIE 4 #define TASK_STOPPED 8 #define TASK_SWAPPING 16 +#define TASK_EXCLUSIVE 32 /* * Scheduling policies @@ -180,7 +181,11 @@ atomic_t count; int map_count; /* number of VMAs */ struct semaphore mmap_sem; +#ifdef __alpha__ + unsigned long context[NR_CPUS]; +#else unsigned long context; +#endif unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; @@ -196,12 +201,18 @@ void * segments; }; +#ifdef __alpha__ +#define CONTEXT_INIT { 0, } +#else +#define CONTEXT_INIT 0 +#endif + #define INIT_MM { \ &init_mmap, NULL, NULL, \ swapper_pg_dir, \ ATOMIC_INIT(1), 1, \ MUTEX, \ - 0, \ + CONTEXT_INIT, \ 0, 0, 0, 0, \ 0, 0, 0, \ 0, 0, 0, 0, \ @@ -243,6 +254,7 @@ long counter; long priority; cycles_t avg_slice; + int counter_refresh; /* SMP and runqueue state */ int has_cpu; int processor; @@ -300,7 +312,7 @@ int keep_capabilities:1; struct user_struct *user; /* limits */ - struct rlimit rlim[RLIM_NLIMITS]; + struct rlimit rlim[RLIM_NLIMITS]; unsigned short used_math; char comm[16]; /* file system info */ @@ -313,10 +325,13 @@ struct thread_struct tss; /* filesystem information */ struct fs_struct *fs; + void *fs_transaction; /* open file information */ struct files_struct *files; /* memory management info */ struct mm_struct *mm; + struct list_head local_pages; int allocation_order, nr_local_pages; + int fs_locks; /* signal handlers */ spinlock_t sigmask_lock; /* Protects signal and blocked */ @@ -349,6 +364,7 @@ #define PF_SIGNALED 0x00000400 /* killed by a signal */ #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_VFORK 0x00001000 /* Wake up parent in mm_release */ +#define PF_FREE_PAGES 0x00002000 /* The current-> */ #define PF_USEDFPU 0x00100000 /* task used FPU this quantum (SMP) */ #define PF_DTRACE 0x00200000 /* delayed trace (used on m68k, i386) */ @@ -367,7 +383,7 @@ */ #define INIT_TASK \ /* state etc */ { 0,0,0,KERNEL_DS,&default_exec_domain,0, \ -/* counter */ DEF_PRIORITY,DEF_PRIORITY,0, \ +/* counter */ DEF_PRIORITY,DEF_PRIORITY,0,0, \ /* SMP */ 0,0,0,-1, \ /* schedlink */ &init_task,&init_task, &init_task, &init_task, \ /* binfmt */ NULL, \ @@ -396,8 +412,9 @@ /* ipc */ NULL, NULL, \ /* tss */ INIT_TSS, \ /* fs */ &init_fs, \ +/* trans */ NULL, \ /* files */ &init_files, \ -/* mm */ &init_mm, \ +/* mm */ &init_mm, { &init_task.local_pages, &init_task.local_pages}, 0, 0, 0, \ /* signals */ SPIN_LOCK_UNLOCKED, &init_signals, {{0}}, {{0}}, NULL, &init_task.sigqueue, 0, 0, \ /* exec cts */ 0,0, \ /* oom */ 0, \ diff -urN 2.2.18/include/linux/shm.h 2.2.18aa1/include/linux/shm.h --- 2.2.18/include/linux/shm.h Tue Nov 14 03:37:16 2000 +++ 2.2.18aa1/include/linux/shm.h Mon Dec 11 17:41:24 2000 @@ -7,7 +7,7 @@ struct shmid_ds { struct ipc_perm shm_perm; /* operation perms */ - int shm_segsz; /* size of segment (bytes) */ + unsigned int 
shm_segsz; /* size of segment (bytes) */ __kernel_time_t shm_atime; /* last attach time */ __kernel_time_t shm_dtime; /* last detach time */ __kernel_time_t shm_ctime; /* last change time */ @@ -68,7 +68,7 @@ #define SHM_DEST 01000 /* segment will be destroyed on last detach */ #define SHM_LOCKED 02000 /* segment will not be swapped */ -asmlinkage int sys_shmget (key_t key, int size, int flag); +asmlinkage int sys_shmget (key_t key, unsigned int size, int flag); asmlinkage int sys_shmat (int shmid, char *shmaddr, int shmflg, unsigned long *addr); asmlinkage int sys_shmdt (char *shmaddr); asmlinkage int sys_shmctl (int shmid, int cmd, struct shmid_ds *buf); diff -urN 2.2.18/include/linux/smb_fs.h 2.2.18aa1/include/linux/smb_fs.h --- 2.2.18/include/linux/smb_fs.h Mon Dec 11 16:58:06 2000 +++ 2.2.18aa1/include/linux/smb_fs.h Mon Dec 11 19:27:04 2000 @@ -121,8 +121,8 @@ void smb_close_dentry(struct dentry *); int smb_close_fileid(struct dentry *, __u16); int smb_open(struct dentry *, int); -int smb_proc_read(struct dentry *, off_t, int, char *); -int smb_proc_write(struct dentry *, off_t, int, const char *); +int smb_proc_read(struct dentry *, loff_t, int, char *); +int smb_proc_write(struct dentry *, loff_t, int, const char *); int smb_proc_create(struct dentry *, __u16, time_t, __u16 *); int smb_proc_mv(struct dentry *, struct dentry *); int smb_proc_mkdir(struct dentry *); diff -urN 2.2.18/include/linux/smp_lock.h 2.2.18aa1/include/linux/smp_lock.h --- 2.2.18/include/linux/smp_lock.h Tue Nov 14 23:08:24 2000 +++ 2.2.18aa1/include/linux/smp_lock.h Mon Dec 11 19:23:35 2000 @@ -8,6 +8,10 @@ #define release_kernel_lock(task, cpu) do { } while(0) #define reacquire_kernel_lock(task) do { } while(0) +#define DECLARE_LOCAL_LOCK_DEPTH(x) +#define release_kernel_lock_save(x) do {} while(0) +#define reacquire_kernel_lock_restore(x) do {} while(0) + #else #include diff -urN 2.2.18/include/linux/swap.h 2.2.18aa1/include/linux/swap.h --- 2.2.18/include/linux/swap.h Tue Nov 14 23:08:24 2000 +++ 2.2.18aa1/include/linux/swap.h Mon Dec 11 19:23:36 2000 @@ -114,7 +114,7 @@ extern unsigned int nr_swapfiles; extern struct swap_info_struct swap_info[]; void si_swapinfo(struct sysinfo *); -unsigned long get_swap_page(void); +extern unsigned long get_swap_page(void); extern void FASTCALL(swap_free(unsigned long)); struct swap_list_t { int head; /* head of priority-ordered swapfile list */ @@ -147,7 +147,7 @@ extern inline unsigned long in_swap_cache(struct page *page) { if (PageSwapCache(page)) - return page->offset; + return pgoff2ulong(page->index); return 0; } @@ -164,7 +164,7 @@ return 1; count = atomic_read(&page->count); if (PageSwapCache(page)) - count += swap_count(page->offset) - 2; + count += swap_count(pgoff2ulong(page->index)) - 2; if (PageFreeAfter(page)) count--; return count > 1; diff -urN 2.2.18/include/linux/sysctl.h 2.2.18aa1/include/linux/sysctl.h --- 2.2.18/include/linux/sysctl.h Mon Dec 11 16:58:06 2000 +++ 2.2.18aa1/include/linux/sysctl.h Mon Dec 11 17:20:54 2000 @@ -123,7 +123,8 @@ VM_PAGECACHE=7, /* struct: Set cache memory thresholds */ VM_PAGERDAEMON=8, /* struct: Control kswapd behaviour */ VM_PGT_CACHE=9, /* struct: Set page table cache parameters */ - VM_PAGE_CLUSTER=10 /* int: set number of pages to swap together */ + VM_PAGE_CLUSTER=10, /* int: set number of pages to swap together */ + VM_HEAP_STACK_GAP=11, /* int: page gap between heap and stack */ }; @@ -435,6 +436,7 @@ enum { DEV_CDROM=1, DEV_HWMON=2, + DEV_MD=3, DEV_MAC_HID=5 }; @@ -446,6 +448,11 @@ DEV_CDROM_DEBUG=4, 
DEV_CDROM_LOCK=5, DEV_CDROM_CHECK_MEDIA=6 +}; + +/* /proc/sys/dev/md */ +enum { + DEV_MD_SPEED_LIMIT=1 }; /* /proc/sys/dev/mac_hid */ diff -urN 2.2.18/include/linux/time.h 2.2.18aa1/include/linux/time.h --- 2.2.18/include/linux/time.h Tue Nov 14 03:36:24 2000 +++ 2.2.18aa1/include/linux/time.h Mon Dec 11 17:29:53 2000 @@ -46,10 +46,53 @@ value->tv_sec = jiffies / HZ; } +static __inline__ int +timespec_before(struct timespec a, struct timespec b) +{ + if (a.tv_sec == b.tv_sec) + return a.tv_nsec < b.tv_nsec; + return a.tv_sec < b.tv_sec; +} + +/* computes `a - b' and write the result in `result', assumes `a >= b' */ +static inline void +timespec_less(struct timespec a, struct timespec b, struct timespec * result) +{ + if (a.tv_nsec < b.tv_nsec) + { + a.tv_sec--; + a.tv_nsec += 1000000000; + } + + result->tv_sec = a.tv_sec - b.tv_sec; + result->tv_nsec = a.tv_nsec - b.tv_nsec; +} + struct timeval { time_t tv_sec; /* seconds */ suseconds_t tv_usec; /* microseconds */ }; + +/* computes `a - b' and write the result in `result', assumes `a >= b' */ +static inline void +timeval_less(struct timeval a, struct timeval b, struct timeval * result) +{ + if (a.tv_usec < b.tv_usec) + { + a.tv_sec--; + a.tv_usec += 1000000; + } + + result->tv_sec = a.tv_sec - b.tv_sec; + result->tv_usec = a.tv_usec - b.tv_usec; +} + +static __inline__ void +timeval_to_timespec(struct timeval tv, struct timespec * ts) +{ + ts->tv_sec = tv.tv_sec; + ts->tv_nsec = (long) tv.tv_usec * 1000; +} struct timezone { int tz_minuteswest; /* minutes west of Greenwich */ diff -urN 2.2.18/include/linux/tty.h 2.2.18aa1/include/linux/tty.h --- 2.2.18/include/linux/tty.h Tue Nov 14 23:09:54 2000 +++ 2.2.18aa1/include/linux/tty.h Mon Dec 11 19:23:35 2000 @@ -397,6 +397,7 @@ /* tty_ioctl.c */ extern int n_tty_ioctl(struct tty_struct * tty, struct file * file, unsigned int cmd, unsigned long arg); +extern void change_termios(struct tty_struct * tty, struct termios * new_termios); /* serial.c */ diff -urN 2.2.18/include/linux/ufs_fs_i.h 2.2.18aa1/include/linux/ufs_fs_i.h --- 2.2.18/include/linux/ufs_fs_i.h Tue Feb 1 18:24:19 2000 +++ 2.2.18aa1/include/linux/ufs_fs_i.h Mon Dec 11 17:20:52 2000 @@ -18,7 +18,6 @@ __u32 i_data[15]; __u8 i_symlink[4*15]; } i_u1; - __u64 i_size; __u32 i_flags; __u32 i_gen; __u32 i_shadow; diff -urN 2.2.18/include/linux/wrapper.h 2.2.18aa1/include/linux/wrapper.h --- 2.2.18/include/linux/wrapper.h Tue Feb 1 18:24:19 2000 +++ 2.2.18aa1/include/linux/wrapper.h Mon Dec 11 17:20:45 2000 @@ -33,6 +33,14 @@ #define vma_get_end(v) v->vm_end #define vma_get_page_prot(v) v->vm_page_prot +/* + * mem_map_reserve()/unreserve() are going to be obsoleted by + * setting the VM_RESERVED in vma->vm_flags. + * + * Instead of marking the pages as reserved, just mark the vma as reserved + * this will improve performance (it's zero cost unlike the PG_reserved check) + * and it will be trivial for not physically contigous mappings too. 
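
To make the comparison concrete, a sketch of a device mmap() method written the anticipated way (not from the patch: VM_RESERVED is the flag the comment proposes, and DEMO_PHYS is an invented physical base address):

	static int demo_mmap (struct file *file, struct vm_area_struct *vma)
	{
		unsigned long size = vma->vm_end - vma->vm_start;

		/* one flag on the vma replaces a mem_map_reserve() per mapped page */
		vma->vm_flags |= VM_RESERVED;
		return remap_page_range (vma->vm_start, DEMO_PHYS, size,
					 vma->vm_page_prot);
	}
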
+ */ #define mem_map_reserve(p) set_bit(PG_reserved, &mem_map[p].flags) #define mem_map_unreserve(p) clear_bit(PG_reserved, &mem_map[p].flags) #define mem_map_inc_count(p) atomic_inc(&(mem_map[p].count)) diff -urN 2.2.18/init/main.c 2.2.18aa1/init/main.c --- 2.2.18/init/main.c Mon Dec 11 16:58:06 2000 +++ 2.2.18aa1/init/main.c Mon Dec 11 17:20:54 2000 @@ -19,9 +19,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -80,7 +82,6 @@ extern int bdflush(void *); extern int kupdate(void *); extern int kswapd(void *); -extern int kpiod(void *); extern void kswapd_setup(void); extern unsigned long init_IRQ( unsigned long); extern void init_modules(void); @@ -549,7 +550,7 @@ #ifdef CONFIG_BLK_DEV_FD { "fd", 0x0200 }, #endif -#ifdef CONFIG_MD_BOOT +#if CONFIG_MD_BOOT || CONFIG_AUTODETECT_RAID { "md", 0x0900 }, #endif #ifdef CONFIG_BLK_DEV_XD @@ -1057,6 +1058,9 @@ #ifdef CONFIG_MD_BOOT { "md=", md_setup}, #endif +#if CONFIG_BLK_DEV_MD + { "raid=", raid_setup}, +#endif #ifdef CONFIG_ADBMOUSE { "adb_buttons=", adb_mouse_setup }, #endif @@ -1441,6 +1445,7 @@ #ifdef CONFIG_ARCH_S390 ccwcache_init(); #endif + kiobuf_init(); signals_init(); inode_init(); file_table_init(); @@ -1580,7 +1585,6 @@ kernel_thread(kupdate, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND); /* Start the background pageout daemon. */ kswapd_setup(); - kernel_thread(kpiod, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND); kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND); #if CONFIG_AP1000 @@ -1635,6 +1639,9 @@ while (pid != wait(&i)); if (MAJOR(real_root_dev) != RAMDISK_MAJOR || MINOR(real_root_dev) != 0) { +#ifdef CONFIG_BLK_DEV_MD + autodetect_raid(); +#endif error = change_root(real_root_dev,"/initrd"); if (error) printk(KERN_ERR "Change root to /initrd: " diff -urN 2.2.18/ipc/shm.c 2.2.18aa1/ipc/shm.c --- 2.2.18/ipc/shm.c Tue Jun 13 03:48:15 2000 +++ 2.2.18aa1/ipc/shm.c Mon Dec 11 17:20:52 2000 @@ -4,6 +4,7 @@ * Many improvements/fixes by Bruno Haible. * Replaced `struct shm_desc' by `struct vm_area_struct', July 1994. * Fixed the shm swap deallocation (shm_unuse()), August 1998 Andrea Arcangeli. + * BIGMEM support, Andrea Arcangeli */ #include @@ -13,6 +14,8 @@ #include #include #include +#include +#include #include #include @@ -20,7 +23,7 @@ extern int ipcperms (struct ipc_perm *ipcp, short shmflg); extern unsigned long get_swap_page (void); static int findkey (key_t key); -static int newseg (key_t key, int shmflg, int size); +static int newseg (key_t key, int shmflg, unsigned int size); static int shm_map (struct vm_area_struct *shmd); static void killseg (int id); static void shm_open (struct vm_area_struct *shmd); @@ -75,7 +78,7 @@ /* * allocate new shmid_kernel and pgtable. protected by shm_segs[id] = NOID. 
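
(The numpages computation in newseg() below is the usual round-up idiom: with 4kB pages, PAGE_SHIFT = 12, a 1-byte segment still costs one page and a 4097-byte segment costs two, since (1 + 4095) >> 12 = 1 and (4097 + 4095) >> 12 = 2.)
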
*/ -static int newseg (key_t key, int shmflg, int size) +static int newseg (key_t key, int shmflg, unsigned int size) { struct shmid_kernel *shp; int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT; @@ -136,9 +139,9 @@ return (unsigned int) shp->u.shm_perm.seq * SHMMNI + id; } -int shmmax = SHMMAX; +unsigned int shmmax = SHMMAX; -asmlinkage int sys_shmget (key_t key, int size, int shmflg) +asmlinkage int sys_shmget (key_t key, unsigned int size, int shmflg) { struct shmid_kernel *shp; int err, id = 0; @@ -648,21 +651,29 @@ pte = __pte(shp->shm_pages[idx]); if (!pte_present(pte)) { - unsigned long page = get_free_page(GFP_USER); + unsigned long page = __get_free_page(GFP_BIGUSER); if (!page) return -1; + clear_bigpage(page); pte = __pte(shp->shm_pages[idx]); if (pte_present(pte)) { free_page (page); /* doesn't sleep */ goto done; } if (!pte_none(pte)) { + struct page * page_map; + + page = prepare_bigmem_shm_swapin(page); + if (!page) + return -1; rw_swap_page_nocache(READ, pte_val(pte), (char *)page); pte = __pte(shp->shm_pages[idx]); if (pte_present(pte)) { free_page (page); /* doesn't sleep */ goto done; } + page_map = replace_with_bigmem(&mem_map[MAP_NR(page)]); + page = page_address(page_map); swap_free(pte_val(pte)); shm_swp--; } @@ -679,7 +690,7 @@ } /* - * Goes through counter = (shm_rss >> prio) present shm pages. + * Goes through counter = (shm_rss / prio) present shm pages. */ static unsigned long swap_id = 0; /* currently being swapped */ static unsigned long swap_idx = 0; /* next to swap */ @@ -692,8 +703,9 @@ unsigned long id, idx; int loop = 0; int counter; + struct page * page_map; - counter = shm_rss >> prio; + counter = shm_rss / prio; if (!counter || !(swap_nr = get_swap_page())) return 0; @@ -720,7 +732,10 @@ page = __pte(shp->shm_pages[idx]); if (!pte_present(page)) goto check_table; - if ((gfp_mask & __GFP_DMA) && !PageDMA(&mem_map[MAP_NR(pte_page(page))])) + page_map = &mem_map[MAP_NR(pte_page(page))]; + if ((gfp_mask & __GFP_DMA) && !PageDMA(page_map)) + goto check_table; + if (!(gfp_mask & __GFP_BIGMEM) && PageBIGMEM(page_map)) goto check_table; swap_attempts++; @@ -729,11 +744,13 @@ swap_free (swap_nr); return 0; } - if (atomic_read(&mem_map[MAP_NR(pte_page(page))].count) != 1) + if (atomic_read(&page_map->count) != 1) + goto check_table; + if (!(page_map = prepare_bigmem_swapout(page_map))) goto check_table; shp->shm_pages[idx] = swap_nr; - rw_swap_page_nocache (WRITE, swap_nr, (char *) pte_page(page)); - free_page(pte_page(page)); + rw_swap_page_nocache (WRITE, swap_nr, (char *) page_address(page_map)); + __free_page(page_map); swap_successes++; shm_swp++; shm_rss--; diff -urN 2.2.18/ipc/util.c 2.2.18aa1/ipc/util.c --- 2.2.18/ipc/util.c Mon Jan 17 16:44:49 2000 +++ 2.2.18aa1/ipc/util.c Mon Dec 11 17:20:48 2000 @@ -99,7 +99,7 @@ return -ENOSYS; } -asmlinkage int sys_shmget (key_t key, int size, int flag) +asmlinkage int sys_shmget (key_t key, unsigned int size, int flag) { return -ENOSYS; } diff -urN 2.2.18/kernel/exit.c 2.2.18aa1/kernel/exit.c --- 2.2.18/kernel/exit.c Sun Apr 2 21:07:50 2000 +++ 2.2.18aa1/kernel/exit.c Mon Dec 11 17:20:46 2000 @@ -56,6 +56,17 @@ current->cmin_flt += p->min_flt + p->cmin_flt; current->cmaj_flt += p->maj_flt + p->cmaj_flt; current->cnswap += p->nswap + p->cnswap; + /* + * Potentially available timeslices are retrieved + * here - this way the parent does not get penalized + * for creating too many processes. 
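
In round numbers (an illustration, not text from the patch): a parent holding 60 ticks forks, keeping 30 and handing 30 to the child; if the child exits with 25 ticks unused and its counter was never refreshed in between, the parent gets 30 + 25 = 55 back. Because the child's 30 came out of the parent's own budget, the pair can never finish with more than the original 60, which is exactly the point the parenthetical below makes.
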
+ * + * (this cannot be used to artificially 'generate' + * timeslices, because any timeslice recovered here + * was given away by the parent in the first place.) + */ + if (!p->counter_refresh) + current->counter += p->counter; free_task_struct(p); } else { printk("task releasing itself\n"); @@ -150,6 +161,7 @@ p->exit_signal = SIGCHLD; p->self_exec_id++; p->p_opptr = child_reaper; /* init */ + p->counter_refresh = 1; if (p->pdeath_signal) send_sig(p->pdeath_signal, p, 0); } } diff -urN 2.2.18/kernel/fork.c 2.2.18aa1/kernel/fork.c --- 2.2.18/kernel/fork.c Mon Jan 17 16:44:50 2000 +++ 2.2.18aa1/kernel/fork.c Mon Dec 11 17:20:52 2000 @@ -665,6 +665,8 @@ p->lock_depth = -1; /* -1 = no lock */ p->start_time = jiffies; + INIT_LIST_HEAD(&p->local_pages); + retval = -ENOMEM; /* copy all the process information */ if (copy_files(clone_flags, p)) @@ -698,6 +700,8 @@ */ current->counter >>= 1; p->counter = current->counter; + /* Tell the parent if it can get back its timeslice when child exits */ + p->counter_refresh = 0; /* * Ok, add it to the run-queues and make it diff -urN 2.2.18/kernel/ksyms.c 2.2.18aa1/kernel/ksyms.c --- 2.2.18/kernel/ksyms.c Mon Dec 11 16:58:06 2000 +++ 2.2.18aa1/kernel/ksyms.c Mon Dec 11 17:20:55 2000 @@ -37,6 +37,7 @@ #include #include #include +#include #if defined(CONFIG_PROC_FS) #include @@ -84,8 +85,14 @@ #endif EXPORT_SYMBOL(get_options); +#ifdef CONFIG_BLK_DEV_LVM_MODULE + extern void (*lvm_hd_name_ptr) ( char*, int); + EXPORT_SYMBOL(lvm_hd_name_ptr); +#endif + /* process memory management */ EXPORT_SYMBOL(do_mmap); +EXPORT_SYMBOL(do_mmap_pgoff); EXPORT_SYMBOL(do_munmap); EXPORT_SYMBOL(exit_mm); EXPORT_SYMBOL(exit_files); @@ -110,7 +117,9 @@ EXPORT_SYMBOL(vfree); EXPORT_SYMBOL(mem_map); EXPORT_SYMBOL(remap_page_range); +EXPORT_SYMBOL(zap_page_range); EXPORT_SYMBOL(max_mapnr); +EXPORT_SYMBOL(num_physpages); EXPORT_SYMBOL(high_memory); EXPORT_SYMBOL(update_vm_cache); EXPORT_SYMBOL(update_vm_cache_conditional); @@ -159,6 +168,7 @@ EXPORT_SYMBOL(invalidate_inodes); EXPORT_SYMBOL(invalidate_inode_pages); EXPORT_SYMBOL(truncate_inode_pages); +EXPORT_SYMBOL(sync_buffers); EXPORT_SYMBOL(fsync_dev); EXPORT_SYMBOL(vfs_permission); EXPORT_SYMBOL(permission); @@ -196,6 +206,7 @@ EXPORT_SYMBOL(find_inode_number); EXPORT_SYMBOL(is_subdir); EXPORT_SYMBOL(get_unused_fd); +EXPORT_SYMBOL(put_unused_fd); EXPORT_SYMBOL(vfs_rmdir); EXPORT_SYMBOL(vfs_unlink); EXPORT_SYMBOL(vfs_rename); @@ -253,6 +264,14 @@ EXPORT_SYMBOL(max_segments); EXPORT_SYMBOL(max_readahead); +/* kiobuf support */ +EXPORT_SYMBOL(map_user_kiobuf); +EXPORT_SYMBOL(unmap_kiobuf); +EXPORT_SYMBOL(alloc_kiovec); +EXPORT_SYMBOL(free_kiovec); +EXPORT_SYMBOL(expand_kiobuf); +EXPORT_SYMBOL(brw_kiovec); + /* tty routines */ EXPORT_SYMBOL(tty_hangup); EXPORT_SYMBOL(tty_wait_until_sent); @@ -260,6 +279,7 @@ EXPORT_SYMBOL(tty_hung_up_p); EXPORT_SYMBOL(tty_flip_buffer_push); EXPORT_SYMBOL(tty_get_baud_rate); +EXPORT_SYMBOL(change_termios); EXPORT_SYMBOL(do_SAK); EXPORT_SYMBOL(console_print); @@ -342,6 +362,7 @@ /* process management */ EXPORT_SYMBOL(__wake_up); +EXPORT_SYMBOL(wake_up_process); EXPORT_SYMBOL(sleep_on); EXPORT_SYMBOL(sleep_on_timeout); EXPORT_SYMBOL(interruptible_sleep_on); diff -urN 2.2.18/kernel/sched.c 2.2.18aa1/kernel/sched.c --- 2.2.18/kernel/sched.c Mon Dec 11 16:58:06 2000 +++ 2.2.18aa1/kernel/sched.c Mon Dec 11 17:20:47 2000 @@ -212,101 +212,89 @@ } /* - * If there is a dependency between p1 and p2, - * don't be too eager to go into the slow schedule. 
- * In particular, if p1 and p2 both want the kernel - * lock, there is no point in trying to make them - * extremely parallel.. - * - * (No lock - lock_depth < 0) - * - * There are two additional metrics here: - * - * first, a 'cutoff' interval, currently 0-200 usecs on - * x86 CPUs, depending on the size of the 'SMP-local cache'. - * If the current process has longer average timeslices than - * this, then we utilize the idle CPU. - * - * second, if the wakeup comes from a process context, - * then the two processes are 'related'. (they form a - * 'gang') - * - * An idle CPU is almost always a bad thing, thus we skip - * the idle-CPU utilization only if both these conditions - * are true. (ie. a 'process-gang' rescheduling with rather - * high frequency should stay on the same CPU). - * - * [We can switch to something more finegrained in 2.3.] - * - * do not 'guess' if the to-be-scheduled task is RT. + * This is ugly, but reschedule_idle() is very timing-critical. + * We enter with the runqueue spinlock held, but we might end + * up unlocking it early, so the caller must not unlock the + * runqueue, it's always done by reschedule_idle(). */ -#define related(p1,p2) (((p1)->lock_depth >= 0) && (p2)->lock_depth >= 0) && \ - (((p2)->policy == SCHED_OTHER) && ((p1)->avg_slice < cacheflush_time)) - -static inline void reschedule_idle_slow(struct task_struct * p) +static inline void reschedule_idle(struct task_struct * p, unsigned long flags) { #ifdef __SMP__ -/* - * (see reschedule_idle() for an explanation first ...) - * - * Pass #2 - * - * We try to find another (idle) CPU for this woken-up process. - * - * On SMP, we mostly try to see if the CPU the task used - * to run on is idle.. but we will use another idle CPU too, - * at this point we already know that this CPU is not - * willing to reschedule in the near future. - * - * An idle CPU is definitely wasted, especially if this CPU is - * running long-timeslice processes. The following algorithm is - * pretty good at finding the best idle CPU to send this process - * to. - * - * [We can try to preempt low-priority processes on other CPUs in - * 2.3. Also we can try to use the avg_slice value to predict - * 'likely reschedule' events even on other CPUs.] - */ int this_cpu = smp_processor_id(), target_cpu; - struct task_struct *tsk, *target_tsk; - int cpu, best_cpu, weight, best_weight, i; - unsigned long flags; - - best_weight = 0; /* prevents negative weight */ - - spin_lock_irqsave(&runqueue_lock, flags); + struct task_struct *tsk; + int cpu, best_cpu, i; /* * shortcut if the woken up task's last CPU is * idle now. */ best_cpu = p->processor; - target_tsk = idle_task(best_cpu); - if (cpu_curr(best_cpu) == target_tsk) + tsk = idle_task(best_cpu); + if (cpu_curr(best_cpu) == tsk) goto send_now; - target_tsk = NULL; - for (i = 0; i < smp_num_cpus; i++) { + /* + * We know that the preferred CPU has a cache-affine current + * process, lets try to find a new idle CPU for the woken-up + * process: + */ + for (i = smp_num_cpus - 1; i >= 0; i--) { cpu = cpu_logical_map(i); + if (cpu == best_cpu) + continue; tsk = cpu_curr(cpu); - if (related(tsk, p)) - goto out_no_target; - weight = preemption_goodness(tsk, p, cpu); - if (weight > best_weight) { - best_weight = weight; - target_tsk = tsk; - } + /* + * We use the last available idle CPU. This creates + * a priority list between idle CPUs, but this is not + * a problem. + */ + if (tsk == idle_task(cpu)) + goto send_now; } /* - * found any suitable CPU? 
+ * No CPU is idle, but maybe this process has enough priority
+ * to preempt its preferred CPU.
  */
-	if (!target_tsk)
-		goto out_no_target;
+	tsk = cpu_curr(best_cpu);
+	if (preemption_goodness(tsk, p, best_cpu) > 0)
+		goto send_now;
+
+	/*
+	 * We will get here often - or in the high CPU contention
+	 * case. No CPU is idle and this process is either lowprio or
+	 * the preferred CPU is highprio. Try to preempt some other CPU
+	 * only if it's RT or if it's interactive and the preferred
+	 * cpu won't reschedule shortly.
+	 */
+	if ((p->avg_slice < cacheflush_time && cpu_curr(best_cpu)->avg_slice > cacheflush_time) ||
+			((p->policy & ~SCHED_YIELD) != SCHED_OTHER))
+	{
+		int weight, best_weight = 0;
+		struct task_struct * best_tsk = NULL;
+
+		for (i = smp_num_cpus - 1; i >= 0; i--) {
+			cpu = cpu_logical_map(i);
+			if (cpu == best_cpu)
+				continue;
+			tsk = cpu_curr(cpu);
+			weight = preemption_goodness(tsk, p, cpu);
+			if (weight > best_weight) {
+				best_weight = weight;
+				best_tsk = tsk;
+			}
+		}
+
+		if ((tsk = best_tsk))
+			goto send_now;
+	}
+
+	spin_unlock_irqrestore(&runqueue_lock, flags);
+	return;
 send_now:
-	target_cpu = target_tsk->processor;
-	target_tsk->need_resched = 1;
+	target_cpu = tsk->processor;
+	tsk->need_resched = 1;
 	spin_unlock_irqrestore(&runqueue_lock, flags);
 	/*
 	 * the APIC stuff can go outside of the lock because
@@ -315,9 +303,6 @@
 	if (target_cpu != this_cpu)
 		smp_send_reschedule(target_cpu);
 	return;
-out_no_target:
-	spin_unlock_irqrestore(&runqueue_lock, flags);
-	return;
 #else /* UP */
 	int this_cpu = smp_processor_id();
 	struct task_struct *tsk;
@@ -325,38 +310,10 @@
 	tsk = current;
 	if (preemption_goodness(tsk, p, this_cpu) > 0)
 		tsk->need_resched = 1;
+	spin_unlock_irqrestore(&runqueue_lock, flags);
 #endif
 }
 
-static void reschedule_idle(struct task_struct * p)
-{
-#ifdef __SMP__
-	int cpu = smp_processor_id();
-	/*
-	 * ("wakeup()" should not be called before we've initialized
-	 * SMP completely.
-	 * Basically a not-yet initialized SMP subsystem can be
-	 * considered as a not-yet working scheduler, simply dont use
-	 * it before it's up and running ...)
-	 *
-	 * SMP rescheduling is done in 2 passes:
-	 *  - pass #1: faster: 'quick decisions'
-	 *  - pass #2: slower: 'lets try and find a suitable CPU'
-	 */
-
-	/*
-	 * Pass #1. (subtle. We might be in the middle of __switch_to, so
-	 * to preserve scheduling atomicity we have to use cpu_curr)
-	 */
-	if ((p->processor == cpu) && related(cpu_curr(cpu), p))
-		return;
-#endif /* __SMP__ */
-	/*
-	 * Pass #2
-	 */
-	reschedule_idle_slow(p);
-}
-
 /*
  * Careful!
* @@ -453,9 +410,8 @@ if (p->next_run) goto out; add_to_runqueue(p); - spin_unlock_irqrestore(&runqueue_lock, flags); + reschedule_idle(p, flags); // spin_unlocks runqueue - reschedule_idle(p); return; out: spin_unlock_irqrestore(&runqueue_lock, flags); @@ -668,9 +624,13 @@ { #ifdef __SMP__ if ((prev->state == TASK_RUNNING) && - (prev != idle_task(smp_processor_id()))) - reschedule_idle(prev); - wmb(); + (prev != idle_task(smp_processor_id()))) { + unsigned long flags; + + spin_lock_irqsave(&runqueue_lock, flags); + reschedule_idle(prev, flags); // spin_unlocks runqueue + } + mb(); prev->has_cpu = 0; #endif /* __SMP__ */ } @@ -727,7 +687,7 @@ goto move_rr_last; move_rr_back: - switch (prev->state) { + switch (prev->state & ~TASK_EXCLUSIVE) { case TASK_INTERRUPTIBLE: if (signal_pending(prev)) { prev->state = TASK_RUNNING; @@ -841,8 +801,10 @@ struct task_struct *p; spin_unlock_irq(&runqueue_lock); read_lock(&tasklist_lock); - for_each_task(p) + for_each_task(p) { p->counter = (p->counter >> 1) + p->priority; + p->counter_refresh = 1; + } read_unlock(&tasklist_lock); spin_lock_irq(&runqueue_lock); goto repeat_schedule; @@ -892,6 +854,7 @@ { struct task_struct *p; struct wait_queue *head, *next; + int wake_one = 0; if (!q) goto out; @@ -910,6 +873,11 @@ p = next->task; next = next->next; if (p->state & mode) { + if (p->state & TASK_EXCLUSIVE) { + if (wake_one) + continue; + wake_one = 1; + } /* * We can drop the read-lock early if this * is the only/last process. @@ -1219,7 +1187,7 @@ read_lock(&tasklist_lock); for_each_task(p) { if ((p->state == TASK_RUNNING || - p->state == TASK_UNINTERRUPTIBLE || + p->state & TASK_UNINTERRUPTIBLE || p->state == TASK_SWAPPING)) nr += FIXED_1; } @@ -1920,6 +1888,7 @@ { struct timespec t; unsigned long expire; + struct timeval before, after; if(copy_from_user(&t, rqtp, sizeof(struct timespec))) return -EFAULT; @@ -1952,11 +1921,20 @@ expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); current->state = TASK_INTERRUPTIBLE; + get_fast_time(&before); expire = schedule_timeout(expire); + get_fast_time(&after); if (expire) { if (rmtp) { - jiffies_to_timespec(expire, &t); + struct timespec elapsed; + + timeval_less(after, before, &after); + timeval_to_timespec(after, &elapsed); + if (timespec_before(elapsed, t)) + timespec_less(t, elapsed, &t); + else + t.tv_nsec = t.tv_sec = 0; if (copy_to_user(rmtp, &t, sizeof(struct timespec))) return -EFAULT; } diff -urN 2.2.18/kernel/signal.c 2.2.18aa1/kernel/signal.c --- 2.2.18/kernel/signal.c Thu May 4 13:00:40 2000 +++ 2.2.18aa1/kernel/signal.c Mon Dec 11 17:20:54 2000 @@ -394,7 +394,7 @@ out: spin_unlock_irqrestore(&t->sigmask_lock, flags); - if (t->state == TASK_INTERRUPTIBLE && signal_pending(t)) + if (t->state & TASK_INTERRUPTIBLE && signal_pending(t)) wake_up_process(t); out_nolock: @@ -622,6 +622,7 @@ EXPORT_SYMBOL(dequeue_signal); EXPORT_SYMBOL(flush_signals); +EXPORT_SYMBOL(flush_signal_handlers); EXPORT_SYMBOL(force_sig); EXPORT_SYMBOL(force_sig_info); EXPORT_SYMBOL(kill_pg); diff -urN 2.2.18/kernel/sysctl.c 2.2.18aa1/kernel/sysctl.c --- 2.2.18/kernel/sysctl.c Mon Dec 11 16:58:07 2000 +++ 2.2.18aa1/kernel/sysctl.c Mon Dec 11 17:20:46 2000 @@ -272,6 +272,8 @@ &pgt_cache_water, 2*sizeof(int), 0600, NULL, &proc_dointvec}, {VM_PAGE_CLUSTER, "page-cluster", &page_cluster, sizeof(int), 0600, NULL, &proc_dointvec}, + {VM_HEAP_STACK_GAP, "heap-stack-gap", + &heap_stack_gap, sizeof(int), 0644, NULL, &proc_dointvec}, {0} }; diff -urN 2.2.18/lib/vsprintf.c 2.2.18aa1/lib/vsprintf.c --- 2.2.18/lib/vsprintf.c Mon Jan 17 
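The TASK_EXCLUSIVE test added to the wakeup loop above gives the kernel "wake one" semantics: a queue full of exclusive sleepers (for example the accept() waiters made exclusive in the tcp.c hunk further down) no longer stampedes on every event; exactly one of them is woken, while ordinary sleepers still all wake. A userspace model of the loop, with an illustrative flag value:

#include <stdio.h>

#define TASK_RUNNING        0
#define TASK_INTERRUPTIBLE  1
#define TASK_EXCLUSIVE      4    /* illustrative; the real bit differs */

struct task { int state; const char *name; };

/* Mirrors the patched __wake_up() loop: wake every matching sleeper,
 * but at most one that sleeps exclusively. */
static void wake_up_model(struct task *q, int n, int mode)
{
        int wake_one = 0;
        int i;

        for (i = 0; i < n; i++) {
                struct task *p = &q[i];

                if (!(p->state & mode))
                        continue;
                if (p->state & TASK_EXCLUSIVE) {
                        if (wake_one)
                                continue;   /* one exclusive wakeup done */
                        wake_one = 1;
                }
                p->state = TASK_RUNNING;
                printf("woke %s\n", p->name);
        }
}

int main(void)
{
        struct task q[] = {
                { TASK_INTERRUPTIBLE,                  "poller"  },
                { TASK_INTERRUPTIBLE | TASK_EXCLUSIVE, "worker1" },
                { TASK_INTERRUPTIBLE | TASK_EXCLUSIVE, "worker2" },
        };

        /* wakes poller and worker1; worker2 keeps sleeping */
        wake_up_model(q, 3, TASK_INTERRUPTIBLE);
        return 0;
}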
16:44:50 2000 +++ 2.2.18aa1/lib/vsprintf.c Mon Dec 11 17:20:52 2000 @@ -67,10 +67,106 @@ #define LARGE 64 /* use 'ABCDEF' instead of 'abcdef' */ #define do_div(n,base) ({ \ -int __res; \ -__res = ((unsigned long) n) % (unsigned) base; \ -n = ((unsigned long) n) / (unsigned) base; \ -__res; }) + int __res; \ + __res = ((unsigned long) n) % (unsigned) base; \ + n = ((unsigned long) n) / (unsigned) base; \ + __res; }) + +#if BITS_PER_LONG < 64 + +/* Note: do_ldiv assumes that unsigned long long is a 64 bit long + * and unsigned long is at least a 32 bits long. + */ +#define do_ldiv(n, base) \ +({ \ + unsigned long long value = n; \ + unsigned long long leftover; \ + unsigned long temp; \ + unsigned long result_div1, result_div2, result_div3, result_mod; \ +\ + temp = value >> 32; \ + result_div1 = temp/(base); \ + result_mod = temp%(base); \ +\ + temp = (result_mod << 24) | ((value >> 8) & 0xFFFFFF); \ + result_div2 = temp/(base); \ + result_mod = temp%(base); \ +\ + temp = (result_mod << 8) | (value & 0xFF); \ + result_div3 = temp/(base); \ + result_mod = temp%(base);\ +\ + leftover = ((unsigned long long)result_div1 << 32) | \ + ((unsigned long long)result_div2 << 8) | (result_div3); \ +\ + n = leftover; \ + result_mod; \ +}) + + +static char * lnumber(char * str, long long num, int base, int size, + int precision, int type) +{ + char c,sign,tmp[66]; + const char *digits="0123456789abcdef"; + int i; + + if (type & LARGE) + digits = "0123456789ABCDEF"; + if (type & LEFT) + type &= ~ZEROPAD; + if (base < 2 || base > 36) + return 0; + c = (type & ZEROPAD) ? '0' : ' '; + sign = 0; + if (type & SIGN) { + if (num < 0) { + sign = '-'; + num = -num; + size--; + } else if (type & PLUS) { + sign = '+'; + size--; + } else if (type & SPACE) { + sign = ' '; + size--; + } + } + if (type & SPECIAL) { + if (base == 16) + size -= 2; + } + i = 0; + if (num == 0) + tmp[i++]='0'; + else while (num != 0) + tmp[i++] = digits[do_ldiv(num,base)]; + if (i > precision) + precision = i; + size -= precision; + if (!(type&(ZEROPAD+LEFT))) + while(size-->0) + *str++ = ' '; + if (sign) + *str++ = sign; + if (type & SPECIAL) { + if (base==16) { + *str++ = '0'; + *str++ = digits[33]; + } + } + if (!(type & LEFT)) + while (size-- > 0) + *str++ = c; + while (i < precision--) + *str++ = '0'; + while (i-- > 0) + *str++ = tmp[i]; + while (size-- > 0) + *str++ = ' '; + return str; +} +#endif static char * number(char * str, long num, int base, int size, int precision ,int type) @@ -207,7 +303,10 @@ /* get the conversion qualifier */ qualifier = -1; if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L') { - qualifier = *fmt; + if (*fmt == 'l' && qualifier == 'l') + qualifier = 'L'; + else + qualifier = *fmt; ++fmt; } @@ -290,7 +389,22 @@ --fmt; continue; } - if (qualifier == 'l') + if (qualifier == 'L') { + +#if BITS_PER_LONG < 64 + /* 64-bit printout in 32-bit systems !! + Needed at some point for 64-bit file offsets and + mmap() reporting functions. 
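The do_ldiv() macro above lets lnumber() print a 64-bit value on 32-bit systems using only 32-bit divisions, by doing schoolbook long division in three chunks (top 32 bits, next 24, last 8) and carrying the remainder between chunks; the OR-composition of the quotient is safe because each partial quotient fits its bit field for any base up to 36. A standalone recast of the macro as a function (fixed-width types substituted for the kernel's longs), checked against native 64-bit division:

#include <stdio.h>
#include <stdint.h>

/* Same chunking as the patch's do_ldiv(): divide the top 32 bits,
 * carry the remainder into the next 24 bits, then into the last 8. */
static unsigned int do_ldiv(uint64_t *n, unsigned int base)
{
        uint64_t value = *n;
        uint32_t temp, d1, d2, d3, mod;

        temp = value >> 32;
        d1 = temp / base; mod = temp % base;

        temp = (mod << 24) | ((value >> 8) & 0xFFFFFF);
        d2 = temp / base; mod = temp % base;

        temp = (mod << 8) | (value & 0xFF);
        d3 = temp / base; mod = temp % base;

        /* d2 < 2^24 and d3 < 2^8, so the fields cannot overlap */
        *n = ((uint64_t)d1 << 32) | ((uint64_t)d2 << 8) | d3;
        return mod;
}

int main(void)
{
        uint64_t v = 0x123456789ABCDEF0ULL;
        unsigned int bases[] = { 2, 8, 10, 16 };
        int i;

        for (i = 0; i < 4; i++) {
                uint64_t q = v;
                unsigned int r = do_ldiv(&q, bases[i]);
                printf("base %2u: %s\n", bases[i],
                       (q == v / bases[i] && r == v % bases[i]) ? "ok" : "BUG");
        }
        return 0;
}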
*/ + unsigned long long lnum; + lnum = va_arg(args, unsigned long long); + str = lnumber(str, lnum, base, field_width, + precision, flags); + continue; +#else + num = va_arg(args, unsigned long); /* 64-bit longs..*/ +#endif + } else if (qualifier == 'l') num = va_arg(args, unsigned long); else if (qualifier == 'h') { num = (unsigned short) va_arg(args, int); diff -urN 2.2.18/mm/Makefile 2.2.18aa1/mm/Makefile --- 2.2.18/mm/Makefile Mon Jan 18 02:27:01 1999 +++ 2.2.18aa1/mm/Makefile Mon Dec 11 17:20:48 2000 @@ -12,4 +12,8 @@ vmalloc.o slab.o \ swap.o vmscan.o page_io.o page_alloc.o swap_state.o swapfile.o +ifeq ($(CONFIG_BIGMEM),y) +O_OBJS += bigmem.o +endif + include $(TOPDIR)/Rules.make diff -urN 2.2.18/mm/bigmem.c 2.2.18aa1/mm/bigmem.c --- 2.2.18/mm/bigmem.c Thu Jan 1 01:00:00 1970 +++ 2.2.18aa1/mm/bigmem.c Mon Dec 11 17:20:48 2000 @@ -0,0 +1,88 @@ +/* + * BIGMEM common code and variables. + * + * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de + * Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de + */ + +#include +#include +#include + +unsigned long bigmem_mapnr; +int nr_free_bigpages = 0; + +struct page * prepare_bigmem_swapout(struct page * page) +{ + /* a bigmem page can't be swapped out directly, + otherwise the b_data buffer addresses would break + the lowlevel device drivers. */ + if (PageBIGMEM(page)) + { + unsigned long regular_page; + unsigned long vaddr; + + regular_page = __get_free_page(GFP_ATOMIC); + if (!regular_page) + return NULL; + + vaddr = kmap(page_address(page), KM_READ); + copy_page(regular_page, vaddr); + kunmap(vaddr, KM_READ); + + /* ok, we can just forget about our bigmem page since + we stored its data into the new regular_page. */ + __free_page(page); + + page = MAP_NR(regular_page) + mem_map; + } + return page; +} + +struct page * replace_with_bigmem(struct page * page) +{ + if (!PageBIGMEM(page) && nr_free_bigpages) + { + unsigned long kaddr; + + kaddr = __get_free_page(GFP_ATOMIC|GFP_BIGMEM); + if (kaddr) + { + struct page * bigmem_page; + + bigmem_page = MAP_NR(kaddr) + mem_map; + if (PageBIGMEM(bigmem_page)) + { + unsigned long vaddr; + + vaddr = kmap(kaddr, KM_WRITE); + copy_page(vaddr, page_address(page)); + kunmap(vaddr, KM_WRITE); + + if (bigmem_page->inode) + panic("replace page cache with bigmem"); + + /* We can just forget the old page since + we stored its data into the new + bigmem_page. */ + __free_page(page); + + page = bigmem_page; + } else + __free_page(bigmem_page); + } + } + return page; +} + +unsigned long prepare_bigmem_shm_swapin(unsigned long page) +{ + if (!PageBIGMEM(&mem_map[MAP_NR(page)])) + return page; + + free_page(page); + + /* no need to clear the page since it will be rewritten by the swapin.
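prepare_bigmem_swapout() above is a bounce-buffer scheme: a page that lives outside the kernel's permanent mapping is copied down into a normally mapped page before being handed to the block layer, which would otherwise compute bogus b_data addresses from it. A userspace sketch of the idea, with a made-up addressability limit standing in for the bigmem boundary:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE    4096UL
#define DRIVER_LIMIT 0x40000000UL  /* made-up: highest address drivers reach */

/* Return a buffer the "driver" may address: the original page if it is
 * low enough, else a freshly allocated low copy (the bounce page). */
static void *prepare_for_io(void *page)
{
        void *bounce;

        if ((unsigned long)page < DRIVER_LIMIT)
                return page;             /* usable as-is */

        bounce = malloc(PAGE_SIZE);      /* kernel: __get_free_page(GFP_ATOMIC) */
        if (!bounce)
                return NULL;             /* kernel caller retries later */

        memcpy(bounce, page, PAGE_SIZE); /* kernel: kmap() + copy_page() + kunmap() */
        free(page);                      /* kernel: __free_page(page) */
        return bounce;
}

int main(void)
{
        void *page = malloc(PAGE_SIZE);
        void *io = prepare_for_io(page);

        printf("io buffer at %p\n", io);
        free(io);
        return 0;
}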
*/ + return __get_free_page(GFP_ATOMIC); +} diff -urN 2.2.18/mm/filemap.c 2.2.18aa1/mm/filemap.c --- 2.2.18/mm/filemap.c Mon Dec 11 16:58:07 2000 +++ 2.2.18aa1/mm/filemap.c Mon Dec 11 17:20:52 2000 @@ -19,8 +19,8 @@ #include #include #include -#include #include +#include #include #include @@ -36,25 +36,6 @@ unsigned int page_hash_bits, page_hash_mask; struct page **page_hash_table; -/* - * Define a request structure for outstanding page write requests - * to the background page io daemon - */ - -struct pio_request -{ - struct pio_request * next; - struct file * file; - unsigned long offset; - unsigned long page; -}; -static struct pio_request *pio_first = NULL, **pio_last = &pio_first; -static kmem_cache_t *pio_request_cache; -static struct wait_queue *pio_wait = NULL; - -static inline void -make_pio_request(struct file *, unsigned long, unsigned long); - static inline int sync_page(struct page *page) { struct inode *inode = page->inode; @@ -97,7 +78,7 @@ * Truncate the page cache at a set offset, removing the pages * that are beyond that offset (and zeroing out partial pages). */ -void truncate_inode_pages(struct inode * inode, unsigned long start) +void truncate_inode_pages(struct inode * inode, loff_t start) { struct page ** p; struct page * page; @@ -105,10 +86,10 @@ repeat: p = &inode->i_pages; while ((page = *p) != NULL) { - unsigned long offset = page->offset; + loff_t loffset = pgoff2loff(page->index); /* page wholly truncated - free it */ - if (offset >= start) { + if (loffset >= start) { if (PageLocked(page)) { wait_on_page(page); goto repeat; @@ -124,9 +105,10 @@ continue; } p = &page->next; - offset = start - offset; + loffset = start - loffset; /* partial truncate, clear end of page */ - if (offset < PAGE_CACHE_SIZE) { + if (loffset < PAGE_CACHE_SIZE) { + unsigned int offset = loffset; /* truncate ok */ unsigned long address = page_address(page); memset((void *) (offset + address), 0, PAGE_CACHE_SIZE - offset); flush_page_to_ram(address); @@ -147,26 +129,39 @@ int shrink_mmap(int priority, int gfp_mask) { static unsigned long clock = 0; +#ifndef CONFIG_BIGMEM unsigned long limit = num_physpages; +#else + unsigned long limit = bigmem_mapnr; +#endif struct page * page; int count; - int nr_dirty = 0; - + /* Make sure we scan all pages twice at priority 0. */ - count = (limit << 1) >> priority; + count = limit / priority; refresh_clock: page = mem_map + clock; do { int referenced; + if (current->need_resched) { + current->state = TASK_RUNNING; + schedule(); + goto refresh_clock; + } + /* This works even in the presence of PageSkip because * the first two entries at the beginning of a hole will * be marked, not just the first. */ page++; clock++; +#ifndef CONFIG_BIGMEM if (clock >= max_mapnr) { +#else + if (clock >= bigmem_mapnr) { +#endif clock = 0; page = mem_map; } @@ -176,6 +171,8 @@ clock = page - mem_map; } + count--; + /* We can't free pages unless there's just one user */ if (atomic_read(&page->count) != 1) continue; @@ -185,8 +182,15 @@ if (PageLocked(page)) continue; - if ((gfp_mask & __GFP_DMA) && !PageDMA(page)) + if (!(gfp_mask & __GFP_BIGMEM) && PageBIGMEM(page)) { + count++; continue; + } + + if ((gfp_mask & __GFP_DMA) && !PageDMA(page)) { + count++; + continue; + } /* * Is it a page swap page? If so, we want to @@ -194,7 +198,8 @@ * were to be marked referenced.. 
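The filemap.c hunks beginning here swap byte offsets (page->offset, an unsigned long) for page indices (page->index, a pgoff_t) converted through loff2pgoff()/pgoff2loff(), which is what keeps 32-bit machines from overflowing at 4GB of file offset. The helper definitions live in a header this diff does not show; the following is a plausible minimal reconstruction, stated purely as an assumption consistent with how the helpers are used:

#include <stdio.h>

typedef long long loff_t;        /* 64-bit byte offset, as in the kernel */
typedef unsigned long pgoff_t;   /* page-sized index into an object */

#define PAGE_CACHE_SHIFT 12      /* 4kB page cache pages assumed here */

static inline pgoff_t loff2pgoff(loff_t off)
{
        return (pgoff_t)(off >> PAGE_CACHE_SHIFT);
}

static inline loff_t pgoff2loff(pgoff_t pgoff)
{
        return (loff_t)pgoff << PAGE_CACHE_SHIFT;
}

/* The swap code stores raw swap entries in page->index, hence the
 * bit-preserving casts used throughout the patch: */
static inline unsigned long pgoff2ulong(pgoff_t p) { return (unsigned long)p; }
static inline pgoff_t ulong2pgoff(unsigned long u) { return (pgoff_t)u; }

int main(void)
{
        loff_t five_gb = 5LL << 30;

        /* the byte offset needs 64 bits, the page index does not */
        printf("page index of 5GB: %lu\n", loff2pgoff(five_gb));
        return 0;
}

The identity casts pgoff2ulong()/ulong2pgoff() look redundant, but they keep callers honest in case pgoff_t is later widened beyond unsigned long.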
*/ if (PageSwapCache(page)) { - if (referenced && swap_count(page->offset) != 1) + if (referenced && + swap_count(pgoff2ulong(page->index)) != 1) continue; delete_from_swap_cache(page); return 1; @@ -205,14 +210,6 @@ /* Is it a buffer page? */ if (page->buffers) { - /* - * Wait for async IO to complete - * at each 64 buffers - */ - - int wait = ((gfp_mask & __GFP_IO) - && (!(nr_dirty++ % 64))); - if (buffer_under_min()) continue; /* @@ -220,10 +217,8 @@ * throttling. */ - if (!try_to_free_buffers(page, wait)) { - if(--count < 0) break; + if (!try_to_free_buffers(page, gfp_mask)) goto refresh_clock; - } return 1; } @@ -234,8 +229,7 @@ remove_inode_page(page); return 1; } - - } while (--count > 0); + } while (count > 0); return 0; } @@ -260,11 +254,12 @@ * memory maps. --sct */ -void update_vm_cache_conditional(struct inode * inode, unsigned long pos, const char * buf, int count, unsigned long source_address) +void update_vm_cache_conditional(struct inode * inode, loff_t pos, const char * buf, int count, unsigned long source_address) { unsigned long offset, len; + pgoff_t pgoff = loff2pgoff(pos); - offset = (pos & ~PAGE_CACHE_MASK); + offset = ((unsigned long)pos & ~PAGE_CACHE_MASK); pos = pos & PAGE_CACHE_MASK; len = PAGE_CACHE_SIZE - offset; do { @@ -272,7 +267,7 @@ if (len > count) len = count; - page = find_page(inode, pos); + page = find_page(inode, pgoff); if (page) { char *dest = (char*) (offset + page_address(page)); @@ -291,19 +286,20 @@ } while (count); } -void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count) +void update_vm_cache(struct inode * inode, loff_t pos, const char * buf, int count) { update_vm_cache_conditional(inode, pos, buf, count, 0); } static inline void add_to_page_cache(struct page * page, - struct inode * inode, unsigned long offset, - struct page **hash) + struct inode * inode, + pgoff_t pgoff, + struct page **hash) { atomic_inc(&page->count); - page->flags = (page->flags & ~((1 << PG_uptodate) | (1 << PG_error))) | (1 << PG_referenced); - page->offset = offset; + page->flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced)); + page->index = pgoff; add_page_to_inode_queue(inode, page); __add_page_to_hash_queue(page, hash); } @@ -314,29 +310,32 @@ * this is all overlapped with the IO on the previous page finishing anyway) */ static unsigned long try_to_read_ahead(struct file * file, - unsigned long offset, unsigned long page_cache) + pgoff_t pgoff, unsigned long page_cache) { struct inode *inode = file->f_dentry->d_inode; - struct page * page; - struct page ** hash; + pgoff_t pg_size; - offset &= PAGE_CACHE_MASK; - switch (page_cache) { - case 0: + /* Calculate file size in 'pages' -- if even one byte (according to + the 'i_size') exceeds the final page-size block, round up. */ + pg_size = loff2pgoff(inode->i_size+(PAGE_SIZE-1)); + + if (!page_cache) { page_cache = page_cache_alloc(); if (!page_cache) - break; - default: - if (offset >= inode->i_size) - break; - hash = page_hash(inode, offset); - page = __find_page(inode, offset, *hash); + return 0; /* Can't allocate! */ + } + /* Ok, we have a page, make sure it is in the page cache */ + if (pgoff2ulong(pgoff) < pgoff2ulong(pg_size)) { + struct page * page; + struct page ** hash; + hash = page_hash(inode, pgoff); + page = __find_page(inode, pgoff, *hash); if (!page) { /* * Ok, add the new page to the hash-queues... 
*/ page = page_cache_entry(page_cache); - add_to_page_cache(page, inode, offset, hash); + add_to_page_cache(page, inode, pgoff, hash); inode->i_op->readpage(file, page); page_cache = 0; } @@ -359,13 +358,13 @@ wait.task = tsk; add_wait_queue(&page->wait, &wait); -repeat: - tsk->state = TASK_UNINTERRUPTIBLE; - sync_page(page); - if (PageLocked(page)) { + do { + set_current_state(TASK_UNINTERRUPTIBLE); + sync_page(page); + if (!PageLocked(page)) + break; schedule(); - goto repeat; - } + } while (PageLocked(page)); tsk->state = TASK_RUNNING; remove_wait_queue(&page->wait, &wait); } @@ -390,11 +389,11 @@ #define PROFILE_MAXREADCOUNT 1000 -static unsigned long total_reada; -static unsigned long total_async; -static unsigned long total_ramax; -static unsigned long total_ralen; -static unsigned long total_rawin; +static u_long total_reada; +static u_long total_async; +static u_long total_ramax; +static u_long total_ralen; +static u_long total_rawin; static void profile_readahead(int async, struct file *filp) { @@ -502,13 +501,13 @@ static inline unsigned long generic_file_readahead(int reada_ok, struct file * filp, struct inode * inode, - unsigned long ppos, struct page * page, unsigned long page_cache) + loff_t ppos, struct page * page, unsigned long page_cache) { - unsigned long max_ahead, ahead; - unsigned long raend; + loff_t max_ahead, ahead; + loff_t raend; int max_readahead = get_max_readahead(inode); - raend = filp->f_raend & PAGE_CACHE_MASK; + raend = filp->f_raend & PAGE_CACHE_MASK_loff; max_ahead = 0; /* @@ -566,7 +565,7 @@ ahead = 0; while (ahead < max_ahead) { ahead += PAGE_CACHE_SIZE; - page_cache = try_to_read_ahead(filp, raend + ahead, + page_cache = try_to_read_ahead(filp, loff2pgoff(raend + ahead), page_cache); } /* @@ -581,9 +580,11 @@ * accessed sequentially. */ if (ahead) { +#if 0 if (reada_ok == 2) { run_task_queue(&tq_disk); } +#endif filp->f_ralen += ahead; filp->f_rawin += filp->f_ralen; @@ -633,14 +634,14 @@ struct dentry *dentry = filp->f_dentry; struct inode *inode = dentry->d_inode; unsigned long page_cache; - size_t pos, pgpos; + loff_t pos, posp; int reada_ok; int max_readahead = get_max_readahead(inode); page_cache = 0; pos = *ppos; - pgpos = pos & PAGE_CACHE_MASK; + posp = pos & PAGE_CACHE_MASK_loff; /* * If the current position is outside the previous read-ahead window, * we reset the current read-ahead context and set read ahead max to zero @@ -648,7 +649,7 @@ * otherwise, we assume that the file accesses are sequential enough to * continue read-ahead. */ - if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) { + if (posp > filp->f_raend || posp + filp->f_rawin < filp->f_raend) { reada_ok = 0; filp->f_raend = 0; filp->f_ralen = 0; @@ -664,12 +665,12 @@ * Then, at least MIN_READAHEAD if read ahead is ok, * and at most MAX_READAHEAD in all cases. */ - if (pos + desc->count <= (PAGE_CACHE_SIZE >> 1)) { + if (pos + desc->count <= (loff_t)(PAGE_CACHE_SIZE >> 1)) { filp->f_ramax = 0; } else { - unsigned long needed; + loff_t needed; - needed = ((pos + desc->count) & PAGE_CACHE_MASK) - pgpos; + needed = ((pos + desc->count) & PAGE_CACHE_MASK) - posp; if (filp->f_ramax < needed) filp->f_ramax = needed; @@ -682,6 +683,7 @@ for (;;) { struct page *page, **hash; + pgoff_t pgoff; if (pos >= inode->i_size) break; @@ -689,8 +691,9 @@ /* * Try to find the data in the page cache.. 
*/ - hash = page_hash(inode, pos & PAGE_CACHE_MASK); - page = __find_page(inode, pos & PAGE_CACHE_MASK, *hash); + pgoff = loff2pgoff(pos); + hash = page_hash(inode, pgoff); + page = __find_page(inode, pgoff, *hash); if (!page) goto no_cached_page; @@ -703,7 +706,7 @@ * the page has been rewritten. */ if (PageUptodate(page) || PageLocked(page)) - page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache); + page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK_loff, page, page_cache); else if (reada_ok && filp->f_ramax > MIN_READAHEAD) filp->f_ramax = MIN_READAHEAD; @@ -728,8 +731,8 @@ flush_dcache_page(page_address(page)); offset = pos & ~PAGE_CACHE_MASK; - nr = PAGE_CACHE_SIZE - offset; - if (nr > inode->i_size - pos) + nr = PAGE_CACHE_SIZE - offset; /* small value */ + if ((loff_t)nr > (inode->i_size - pos)) nr = inode->i_size - pos; /* @@ -769,7 +772,7 @@ */ page = page_cache_entry(page_cache); page_cache = 0; - add_to_page_cache(page, inode, pos & PAGE_CACHE_MASK, hash); + add_to_page_cache(page, inode, pgoff, hash); /* * Error handling is tricky. If we get a read error, @@ -850,10 +853,26 @@ ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos) { ssize_t retval; + struct inode *inode = filp->f_dentry->d_inode; + + if (((ssize_t) count) < 0) + return -EINVAL; retval = -EFAULT; if (access_ok(VERIFY_WRITE, buf, count)) { retval = 0; + + /* L-F-S spec 2.2.1.25: */ + if (!(filp->f_flags & O_LARGEFILE) && + S_ISREG(inode->i_mode) && + (*ppos < inode->i_size) && count) { + if (*ppos >= 0x7fffffff) /* pos@2G forbidden */ + return -EOVERFLOW; + if (*ppos + count > 0x7fffffff) + /* Read only until end of allowed region */ + count = 0x7fffffff - *ppos; + } + if (count) { read_descriptor_t desc; @@ -881,12 +900,12 @@ if (size > count) size = count; - down(&inode->i_sem); + fs_down(&inode->i_sem); old_fs = get_fs(); set_fs(KERNEL_DS); written = file->f_op->write(file, area, size, &file->f_pos); set_fs(old_fs); - up(&inode->i_sem); + fs_up(&inode->i_sem); if (written < 0) { desc->error = written; written = 0; @@ -902,6 +921,9 @@ struct file * in_file, * out_file; struct inode * in_inode, * out_inode; + if (((ssize_t) count) < 0) + return -EINVAL; + lock_kernel(); /* @@ -995,20 +1017,25 @@ struct file * file = area->vm_file; struct dentry * dentry = file->f_dentry; struct inode * inode = dentry->d_inode; - unsigned long offset, reada, i; + loff_t offset; + pgoff_t pgoff, reada; + int i; struct page * page, **hash; unsigned long old_page, new_page; new_page = 0; - offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset; + offset = ((loff_t)((address & PAGE_MASK) - area->vm_start) + + area->vm_offset); + if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm) goto no_page; /* * Do we have something in the page cache already? */ - hash = page_hash(inode, offset); - page = __find_page(inode, offset, *hash); + pgoff = loff2pgoff(offset); + hash = page_hash(inode, pgoff); + page = __find_page(inode, pgoff, *hash); if (!page) goto no_cached_page; @@ -1059,11 +1086,12 @@ /* * Try to read in an entire cluster at once. 
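The generic_file_read() hunk above enforces the LFS rule it cites: without O_LARGEFILE, a read may not start at or past 2^31-1 (-EOVERFLOW), and a read that would cross that boundary is shortened to end exactly there. The arithmetic in isolation:

#include <stdio.h>

#define MAX_NON_LFS 0x7fffffffLL   /* 2GB - 1, the pre-LFS file limit */

/* Returns the clamped byte count, or -1 to signal -EOVERFLOW. */
static long long clamp_non_largefile(long long pos, long long count)
{
        if (pos >= MAX_NON_LFS)
                return -1;                  /* pos at/past 2GB: forbidden */
        if (pos + count > MAX_NON_LFS)
                count = MAX_NON_LFS - pos;  /* stop at the 2GB boundary */
        return count;
}

int main(void)
{
        printf("%lld\n", clamp_non_largefile(0x7ffffff0LL, 64)); /* 15 */
        printf("%lld\n", clamp_non_largefile(0x80000000LL, 64)); /* -1 */
        return 0;
}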
*/ - reada = offset; - reada >>= PAGE_CACHE_SHIFT + page_cluster; - reada <<= PAGE_CACHE_SHIFT + page_cluster; + reada = loff2pgoff(offset); + /* Mask lowest 'page_cluster' worth of the lowest bits */ + reada = ulong2pgoff(pgoff2ulong(reada) & ((~(0UL)) << page_cluster)); - for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_CACHE_SIZE) + for (i = 1 << page_cluster; i > 0; + --i, reada = ulong2pgoff(pgoff2ulong(reada)+1)) new_page = try_to_read_ahead(file, reada, new_page); if (!new_page) @@ -1077,7 +1105,7 @@ * cache.. The page we just got may be useful if we * can't share, so don't get rid of it here. */ - page = find_page(inode, offset); + page = find_page(inode, pgoff); if (page) goto found_page; @@ -1086,7 +1114,7 @@ */ page = page_cache_entry(new_page); new_page = 0; - add_to_page_cache(page, inode, offset, hash); + add_to_page_cache(page, inode, pgoff, hash); if (inode->i_op->readpage(file, page) != 0) goto failure; @@ -1135,10 +1163,10 @@ * if the disk is full. */ static inline int do_write_page(struct inode * inode, struct file * file, - const char * page, unsigned long offset) + const char * page, loff_t offset) { int retval; - unsigned long size; + loff_t size; loff_t loff = offset; mm_segment_t old_fs; @@ -1162,9 +1190,8 @@ } static int filemap_write_page(struct vm_area_struct * vma, - unsigned long offset, - unsigned long page, - int wait) + loff_t offset, + unsigned long page) { int result; struct file * file; @@ -1182,20 +1209,9 @@ * and file could be released ... increment the count to be safe. */ file->f_count++; - - /* - * If this is a swapping operation rather than msync(), then - * leave the actual IO, and the restoration of the file count, - * to the kpiod thread. Just queue the request for now. - */ - if (!wait) { - make_pio_request(file, offset, page); - return 0; - } - - down(&inode->i_sem); + fs_down(&inode->i_sem); result = do_write_page(inode, file, (const char *) page, offset); - up(&inode->i_sem); + fs_up(&inode->i_sem); fput(file); return result; } @@ -1208,7 +1224,7 @@ */ int filemap_swapout(struct vm_area_struct * vma, struct page * page) { - return filemap_write_page(vma, page->offset, page_address(page), 0); + return filemap_write_page(vma, pgoff2loff(page->index), page_address(page)); } static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma, @@ -1245,7 +1261,7 @@ return 0; } } - error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page, 1); + error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page); page_cache_free(page); return error; } @@ -1417,9 +1433,9 @@ if (file) { struct dentry * dentry = file->f_dentry; struct inode * inode = dentry->d_inode; - down(&inode->i_sem); + fs_down(&inode->i_sem); error = file_fsync(file, dentry); - up(&inode->i_sem); + fs_up(&inode->i_sem); } } return error; @@ -1572,7 +1588,7 @@ { struct dentry *dentry = file->f_dentry; struct inode *inode = dentry->d_inode; - unsigned long pos = *ppos; + loff_t pos = *ppos; unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur; struct page *page, **hash; unsigned long page_cache = 0; @@ -1588,6 +1604,9 @@ return error; } + if (((ssize_t) count) < 0) + return -EINVAL; + sync = file->f_flags & O_SYNC; written = 0; @@ -1598,31 +1617,39 @@ * Check whether we've reached the file size limit. 
*/ status = -EFBIG; - if (pos >= limit) { + if (limit != RLIM_INFINITY && pos >= limit) { send_sig(SIGXFSZ, current, 0); goto out; } + /* L-F-S */ + if (!(file->f_flags & O_LARGEFILE) && + S_ISREG(inode->i_mode) && count) { + if (pos >= 0x7fffffff) /* pos@2G forbidden */ + goto out; + + if (pos + count > 0x7fffffff) + count = 0x7fffffff - pos; + } + status = 0; /* * Check whether to truncate the write, * and send the signal if we do. */ - if (count > limit - pos) { - send_sig(SIGXFSZ, current, 0); + if (limit != RLIM_INFINITY && count > limit - pos) count = limit - pos; - } while (count) { - unsigned long bytes, pgpos, offset; + unsigned long bytes, offset; + pgoff_t pgpos = loff2pgoff(pos); char * dest; /* * Try to find the page in the cache. If it isn't there, * allocate a free page. */ - offset = (pos & ~PAGE_CACHE_MASK); - pgpos = pos & PAGE_CACHE_MASK; + offset = ((unsigned long)pos & ~PAGE_CACHE_MASK); bytes = PAGE_CACHE_SIZE - offset; if (bytes > count) bytes = count; @@ -1700,15 +1727,14 @@ * Note: we don't have to worry about races here, as the caller * is holding the inode semaphore. */ -unsigned long get_cached_page(struct inode * inode, unsigned long offset, - int new) +unsigned long get_cached_page(struct inode * inode, pgoff_t pgoff, int new) { struct page * page; struct page ** hash; unsigned long page_cache = 0; - hash = page_hash(inode, offset); - page = __find_page(inode, offset, *hash); + hash = page_hash(inode, pgoff); + page = __find_page(inode, pgoff, *hash); if (!page) { if (!new) goto out; @@ -1717,7 +1743,7 @@ goto out; clear_page(page_cache); page = page_cache_entry(page_cache); - add_to_page_cache(page, inode, offset, hash); + add_to_page_cache(page, inode, pgoff, hash); } if (atomic_read(&page->count) != 2) printk(KERN_ERR "get_cached_page: page count=%d\n", @@ -1746,130 +1772,6 @@ clear_bit(PG_locked, &page->flags); wake_up(&page->wait); page_cache_release(page); -} - - -/* Add request for page IO to the queue */ - -static inline void put_pio_request(struct pio_request *p) -{ - *pio_last = p; - p->next = NULL; - pio_last = &p->next; -} - -/* Take the first page IO request off the queue */ - -static inline struct pio_request * get_pio_request(void) -{ - struct pio_request * p = pio_first; - pio_first = p->next; - if (!pio_first) - pio_last = &pio_first; - return p; -} - -/* Make a new page IO request and queue it to the kpiod thread */ - -static inline void make_pio_request(struct file *file, - unsigned long offset, - unsigned long page) -{ - struct pio_request *p; - - atomic_inc(&page_cache_entry(page)->count); - - /* - * We need to allocate without causing any recursive IO in the - * current thread's context. We might currently be swapping out - * as a result of an allocation made while holding a critical - * filesystem lock. To avoid deadlock, we *MUST* not reenter - * the filesystem in this thread. - * - * We can wait for kswapd to free memory, or we can try to free - * pages without actually performing further IO, without fear of - * deadlock. --sct - */ - - while ((p = kmem_cache_alloc(pio_request_cache, GFP_BUFFER)) == NULL) { - if (try_to_free_pages(__GFP_WAIT)) - continue; - current->state = TASK_INTERRUPTIBLE; - schedule_timeout(HZ/10); - } - - p->file = file; - p->offset = offset; - p->page = page; - - put_pio_request(p); - wake_up(&pio_wait); -} - - -/* - * This is the only thread which is allowed to write out filemap pages - * while swapping. - * - * To avoid deadlock, it is important that we never reenter this thread. 
- * Although recursive memory allocations within this thread may result - * in more page swapping, that swapping will always be done by queuing - * another IO request to the same thread: we will never actually start - * that IO request until we have finished with the current one, and so - * we will not deadlock. - */ - -int kpiod(void * unused) -{ - struct task_struct *tsk = current; - struct wait_queue wait = { tsk, }; - struct inode * inode; - struct dentry * dentry; - struct pio_request * p; - - tsk->session = 1; - tsk->pgrp = 1; - strcpy(tsk->comm, "kpiod"); - sigfillset(&tsk->blocked); - init_waitqueue(&pio_wait); - /* - * Mark this task as a memory allocator - we don't want to get caught - * up in the regular mm freeing frenzy if we have to allocate memory - * in order to write stuff out. - */ - tsk->flags |= PF_MEMALLOC; - - lock_kernel(); - - pio_request_cache = kmem_cache_create("pio_request", - sizeof(struct pio_request), - 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); - if (!pio_request_cache) - panic ("Could not create pio_request slab cache"); - - while (1) { - tsk->state = TASK_INTERRUPTIBLE; - add_wait_queue(&pio_wait, &wait); - if (!pio_first) - schedule(); - remove_wait_queue(&pio_wait, &wait); - tsk->state = TASK_RUNNING; - - while (pio_first) { - p = get_pio_request(); - dentry = p->file->f_dentry; - inode = dentry->d_inode; - - down(&inode->i_sem); - do_write_page(inode, p->file, - (const char *) p->page, p->offset); - up(&inode->i_sem); - fput(p->file); - page_cache_free(p->page); - kmem_cache_free(pio_request_cache, p); - } - } } void __init page_cache_init(unsigned long memory_size) diff -urN 2.2.18/mm/memory.c 2.2.18aa1/mm/memory.c --- 2.2.18/mm/memory.c Mon Dec 11 16:58:07 2000 +++ 2.2.18aa1/mm/memory.c Mon Dec 11 17:20:52 2000 @@ -31,12 +31,18 @@ /* * 05.04.94 - Multi-page memory management added for v1.1. * Idea by Alex Bligh (alex@cconcepts.co.uk) + * + * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG + * (Gerhard.Wichert@pdb.siemens.de) */ #include #include #include #include +#include +#include +#include #include #include @@ -53,10 +59,10 @@ static inline void copy_cow_page(unsigned long from, unsigned long to) { if (from == ZERO_PAGE(to)) { - clear_page(to); + clear_bigpage(to); return; } - copy_page(to, from); + copy_bigpage(to, from); } mem_map_t * mem_map = NULL; @@ -397,6 +403,223 @@ } } + +/* + * Do a quick page-table lookup for a single page. + */ +static unsigned long get_page(unsigned long address, int write) +{ + pgd_t *pgd; + pmd_t *pmd; + + pgd = pgd_offset(current->mm, address); + pmd = pmd_offset(pgd, address); + if (pmd) { + pte_t * pte = pte_offset(pmd, address); + if (pte && pte_present(*pte)) { + if (!write || + (pte_write(*pte) && pte_dirty(*pte))) + return pte_page(*pte); + } + } + + return 0; +} + +/* + * Given a physical address, is there a useful struct page pointing to it? + */ + +static struct page * get_page_map(unsigned long page) +{ + struct page *map; + + if (MAP_NR(page) >= max_mapnr) + return 0; + if (page == ZERO_PAGE(page)) + return 0; + map = mem_map + MAP_NR(page); + if (PageReserved(map)) + return 0; + return map; +} + +/* + * Force in an entire range of pages from the current process's user VA, + * and pin and lock the pages for IO. + */ + +#define dprintk(x...) 
+int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len) +{ + unsigned long ptr, end; + int err; + struct mm_struct * mm; + struct vm_area_struct * vma, * prev_vma; + unsigned long page; + struct page * map; + int doublepage = 0; + int repeat = 0; + int i; + int write = (rw == READ); /* if we read from disk + it means we write + to memory */ + + /* Make sure the iobuf is not already mapped somewhere. */ + if (iobuf->nr_pages) + return -EINVAL; + + mm = current->mm; + dprintk ("map_user_kiobuf: begin\n"); + + ptr = va & PAGE_MASK; + end = (va + len + PAGE_SIZE - 1) & PAGE_MASK; + err = expand_kiobuf(iobuf, (end - ptr) >> PAGE_SHIFT); + if (err) + return err; + + repeat: + down(&mm->mmap_sem); + + err = -EFAULT; + iobuf->locked = write; + iobuf->offset = va & ~PAGE_MASK; + iobuf->length = len; + vma = NULL; + + i = 0; + + /* + * First of all, try to fault in all of the necessary pages + */ + while (ptr < end) { + if (!vma || ptr >= vma->vm_end) { + vma = find_vma_prev(mm, ptr, &prev_vma); + if (!vma) + goto out; + if (vma->vm_start > ptr) { + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out; + if (expand_stack(vma, ptr, prev_vma)) + goto out; + } + err = -EACCES; + if (write) { + if (!(vma->vm_flags & VM_WRITE)) + goto out; + } else { + if (!(vma->vm_flags & VM_READ)) + goto out; + } + err = -EFAULT; + } + while (!(page = get_page(ptr, write))) { + int ret; + + ret = handle_mm_fault(current, vma, ptr, write); + if (ret <= 0) { + if (!ret) + goto out; + else { + err = -ENOMEM; + goto out; + } + } + } + map = get_page_map(page); + if (map) { + if (write) { + /* + * Lock down the pages only if we're going + * to write to memory. If we're reading + * from memory we're free to go ahead + * only after pinning the page on the + * physical side. + */ + if (PageLocked(map)) + goto retry; + set_bit(PG_locked, &map->flags); + } + flush_dcache_page(page_address(map)); + atomic_inc(&map->count); + } + dprintk ("Installing page %p %p: %d\n", (void *)page, map, i); + iobuf->pagelist[i] = page; + iobuf->maplist[i] = map; + iobuf->nr_pages = ++i; + + ptr += PAGE_SIZE; + } + + up(&mm->mmap_sem); + dprintk ("map_user_kiobuf: end OK\n"); + return 0; + + out: + up(&mm->mmap_sem); + unmap_kiobuf(iobuf); + dprintk ("map_user_kiobuf: end %d\n", err); + return err; + + + retry: + + /* + * Undo the locking so far, wait on the page we got to, and try again. + */ + up(&mm->mmap_sem); + unmap_kiobuf(iobuf); + ptr = va & PAGE_MASK; + + /* + * Did the release also unlock the page we got stuck on? + */ + if (!PageLocked(map)) { + /* If so, we may well have the page mapped twice in the + * IO address range. Bad news. Of course, it _might_ + * just be a coincidence, but if it happens more than + * once, chances are we have a double-mapped page. */ + if (++doublepage >= 3) { + return -EINVAL; + } + } + + /* + * Try again... + */ + wait_on_page(map); + if (++repeat < 16) + goto repeat; + return -EAGAIN; +} + + +/* + * Unmap all of the pages referenced by a kiobuf. We release the pages, + * and unlock them if they were locked. 
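map_user_kiobuf() above pins a user buffer page by page, faulting pages in through handle_mm_fault() where needed, so a driver can do I/O straight into user memory without an intermediate kernel copy. Roughly how a driver would consume the pair: the struct kiobuf fields used below match the hunk, but this is a non-compilable sketch, and the alloc/free helpers are hypothetical stand-ins since the kiobuf allocation API is not part of this hunk.

/* #include <linux/iobuf.h>  -- assumed home of the kiobuf declarations */

static int pin_and_do_io(unsigned long uaddr, size_t len)
{
        struct kiobuf *iobuf = alloc_my_kiobuf();   /* hypothetical helper */
        int err, i;

        if (!iobuf)
                return -ENOMEM;

        /* Fault in, look up and pin every page of [uaddr, uaddr+len);
         * READ means device-to-memory, so the pages also get locked. */
        err = map_user_kiobuf(READ, iobuf, uaddr, len);
        if (err)
                goto out;

        for (i = 0; i < iobuf->nr_pages; i++) {
                /* iobuf->pagelist[i]: physical page address for the
                 * hardware; iobuf->maplist[i]: its struct page, if any */
        }

        unmap_kiobuf(iobuf);    /* drop the references and page locks */
out:
        free_my_kiobuf(iobuf);                      /* hypothetical helper */
        return err;
}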
+ */ + +void unmap_kiobuf (struct kiobuf *iobuf) +{ + int i; + struct page *map; + + for (i = 0; i < iobuf->nr_pages; i++) { + map = iobuf->maplist[i]; + + if (map) { + if (iobuf->locked) { + clear_bit(PG_locked, &map->flags); + wake_up(&map->wait); + } + __free_page(map); + } + } + + iobuf->nr_pages = 0; + iobuf->locked = 0; +} + static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pgprot_t prot) { @@ -613,7 +836,7 @@ struct page * page_map; pte = *page_table; - new_page = __get_free_page(GFP_USER); + new_page = __get_free_page(GFP_BIGUSER); /* Did swap_out() unmapped the protected page while we slept? */ if (pte_val(*page_table) != pte_val(pte)) goto end_wp_page; @@ -639,7 +862,7 @@ case 2: if (!PageSwapCache(page_map)) break; - if (swap_count(page_map->offset) != 1) + if (swap_count(pgoff2ulong(page_map->index)) != 1) break; delete_from_swap_cache(page_map); /* FallThrough */ @@ -763,7 +986,7 @@ * between the file and the memory map for a potential last * incomplete page. Ugly, but necessary. */ -void vmtruncate(struct inode * inode, unsigned long offset) +void vmtruncate(struct inode * inode, loff_t offset) { truncate_inode_pages(inode, offset); if (inode->i_mmap) @@ -810,10 +1033,10 @@ { pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); if (write_access) { - unsigned long page = __get_free_page(GFP_USER); + unsigned long page = __get_free_page(GFP_BIGUSER); if (!page) return -1; - clear_page(page); + clear_bigpage(page); entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); vma->vm_mm->rss++; tsk->min_flt++; diff -urN 2.2.18/mm/mmap.c 2.2.18aa1/mm/mmap.c --- 2.2.18/mm/mmap.c Mon Dec 11 16:58:07 2000 +++ 2.2.18aa1/mm/mmap.c Mon Dec 11 17:20:52 2000 @@ -40,6 +40,7 @@ kmem_cache_t *vm_area_cachep; int sysctl_overcommit_memory; +int heap_stack_gap = 1; /* Check that a process has enough memory to allocate a * new virtual mapping. @@ -66,7 +67,6 @@ free += page_cache_size; free += nr_free_pages; free += nr_swap_pages; - free -= (page_cache.min_percent + buffer_mem.min_percent + 2)*num_physpages/100; return free > pages; } @@ -169,11 +169,25 @@ #undef _trans } -unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, unsigned long off) +unsigned long do_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long offset) +{ + unsigned long ret = -EINVAL; + if ((offset + PAGE_ALIGN(len)) < offset) + goto out; + if (!(offset & ~PAGE_MASK)) + ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); +out: + return ret; +} + +unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, unsigned long pg_off) { struct mm_struct * mm = current->mm; struct vm_area_struct * vma; + loff_t off = (loff_t)pg_off << PAGE_SHIFT; int error; if (file && (!file->f_op || !file->f_op->mmap)) @@ -371,9 +385,14 @@ for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) { /* At this point: (!vmm || addr < vmm->vm_end). */ + unsigned long __heap_stack_gap = 0; if (TASK_SIZE - len < addr) return 0; - if (!vmm || addr + len <= vmm->vm_start) + if (!vmm) + return addr; + if (vmm->vm_flags & VM_GROWSDOWN) + __heap_stack_gap = heap_stack_gap << PAGE_SHIFT; + if (addr + len + __heap_stack_gap <= vmm->vm_start) return addr; addr = vmm->vm_end; } @@ -842,7 +861,8 @@ * the offsets must be contiguous.. 
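The get_unmapped_area() change above implements the heap-stack-gap sysctl added in kernel/sysctl.c earlier in this patch: a candidate mapping is rejected unless it leaves heap_stack_gap whole pages below any VM_GROWSDOWN vma, so the heap can no longer grow flush into the stack. The test in isolation:

#include <stdio.h>

#define PAGE_SHIFT 12

/* Would a mapping [addr, addr+len) fit below a stack vma starting at
 * vm_start, keeping gap_pages untouchable pages in between? */
static int fits_below_stack(unsigned long addr, unsigned long len,
                            unsigned long vm_start, int gap_pages)
{
        unsigned long gap = (unsigned long)gap_pages << PAGE_SHIFT;

        return addr + len + gap <= vm_start;
}

int main(void)
{
        /* stack vma at 0xbf000000, default gap of one page */
        printf("%d\n", fits_below_stack(0xbeffe000UL, 0x1000, 0xbf000000UL, 1));
        printf("%d\n", fits_below_stack(0xbefff000UL, 0x1000, 0xbf000000UL, 1));
        return 0;
}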
*/ if ((mpnt->vm_file != NULL) || (mpnt->vm_flags & VM_SHM)) { - unsigned long off = prev->vm_offset+prev->vm_end-prev->vm_start; + loff_t off = (prev->vm_offset + + (loff_t)(prev->vm_end - prev->vm_start)); if (off != mpnt->vm_offset) continue; } diff -urN 2.2.18/mm/mremap.c 2.2.18aa1/mm/mremap.c --- 2.2.18/mm/mremap.c Mon Jan 17 16:44:50 2000 +++ 2.2.18aa1/mm/mremap.c Mon Dec 11 17:20:46 2000 @@ -127,7 +127,7 @@ new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); if (new_vma) { - unsigned long new_addr = get_unmapped_area(addr, new_len); + unsigned long new_addr = get_unmapped_area(0, new_len); if (new_addr && !move_page_tables(current->mm, new_addr, addr, old_len)) { *new_vma = *vma; diff -urN 2.2.18/mm/page_alloc.c 2.2.18aa1/mm/page_alloc.c --- 2.2.18/mm/page_alloc.c Tue Sep 5 02:28:50 2000 +++ 2.2.18aa1/mm/page_alloc.c Mon Dec 11 17:20:52 2000 @@ -3,6 +3,7 @@ * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * Swap reorganised 29.12.95, Stephen Tweedie + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 */ #include @@ -13,6 +14,7 @@ #include #include #include +#include /* export bigmem vars */ #include #include /* for copy_to/from_user */ @@ -35,7 +37,11 @@ #else #define NR_MEM_LISTS 10 #endif +#ifndef CONFIG_BIGMEM #define NR_MEM_TYPES 2 /* GFP_DMA vs not for now. */ +#else +#define NR_MEM_TYPES 3 +#endif /* The start of this MUST match the start of "struct page" */ struct free_area_struct { @@ -93,34 +99,81 @@ */ spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED; +#define list(x) (mem_map+(x)) +#define __free_pages_ok(map_nr, mask, area, index) \ + nr_free_pages -= (mask); \ + while ((mask) + (1 << (NR_MEM_LISTS-1))) { \ + if (!test_and_change_bit((index), (area)->map)) \ + break; \ + (area)->count--; \ + remove_mem_queue(list((map_nr) ^ -(mask))); \ + (mask) <<= 1; \ + (area)++; \ + (index) >>= 1; \ + (map_nr) &= (mask); \ + } \ + add_mem_queue(area, list(map_nr)); + +static void free_local_pages(struct page * page) { + unsigned long order = page->index; + unsigned int type = PageDMA(page) ? 
1 : 0; + struct free_area_struct *area; + unsigned long map_nr = page - mem_map; + unsigned long mask = (~0UL) << order; + unsigned long index = map_nr >> (1 + order); + +#ifdef CONFIG_BIGMEM + if (PageBIGMEM(page)) { + nr_free_bigpages -= mask; + type = 2; + } +#endif + area = free_area[type] + order; + __free_pages_ok(map_nr, mask, area, index); +} + static inline void free_pages_ok(unsigned long map_nr, unsigned long order, unsigned type) { - struct free_area_struct *area = free_area[type] + order; - unsigned long index = map_nr >> (1 + order); - unsigned long mask = (~0UL) << order; + struct free_area_struct *area; + unsigned long index; + unsigned long mask; unsigned long flags; + struct page * page; - spin_lock_irqsave(&page_alloc_lock, flags); - -#define list(x) (mem_map+(x)) + if (current->flags & PF_FREE_PAGES) + goto local_freelist; + back_local_freelist: + index = map_nr >> (1 + order); + mask = (~0UL) << order; map_nr &= mask; - nr_free_pages -= mask; - while (mask + (1 << (NR_MEM_LISTS-1))) { - if (!test_and_change_bit(index, area->map)) - break; - area->count--; - remove_mem_queue(list(map_nr ^ -mask)); - mask <<= 1; - area++; - index >>= 1; - map_nr &= mask; + + spin_lock_irqsave(&page_alloc_lock, flags); +#ifdef CONFIG_BIGMEM + if (map_nr >= bigmem_mapnr) { + nr_free_bigpages -= mask; + type = 2; } - add_mem_queue(area, list(map_nr)); +#endif + area = free_area[type] + order; + __free_pages_ok(map_nr, mask, area, index); + spin_unlock_irqrestore(&page_alloc_lock, flags); + return; -#undef list + local_freelist: + /* + * This is a little subtle: if the allocation order + * wanted is greater than zero we'd better take all the pages + * local since we must deal with fragmentation too and we + * can't rely on the nr_local_pages information.
+ */ + if (current->nr_local_pages && !current->allocation_order) + goto back_local_freelist; - spin_unlock_irqrestore(&page_alloc_lock, flags); + page = mem_map + map_nr; + list_add((struct list_head *) page, &current->local_pages); + page->index = order; + current->nr_local_pages++; } void __free_pages(struct page *page, unsigned long order) @@ -148,6 +201,17 @@ #define MARK_USED(index, order, area) \ change_bit((index) >> (1+(order)), (area)->map) #define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT)) +#ifdef CONFIG_BIGMEM +#define UPDATE_NR_FREE_BIGPAGES(map_nr, order) \ + do \ + { \ + if ((map_nr) >= bigmem_mapnr) \ + nr_free_bigpages -= 1 << (order); \ + } \ + while (0) +#else +#define UPDATE_NR_FREE_BIGPAGES(map_nr, order) do { } while (0) +#endif #define RMQUEUE_TYPE(order, type) \ do { struct free_area_struct * area = free_area[type]+order; \ unsigned long new_order = order; \ @@ -158,6 +222,7 @@ map_nr = ret - mem_map; \ MARK_USED(map_nr, new_order, area); \ nr_free_pages -= 1 << order; \ + UPDATE_NR_FREE_BIGPAGES(map_nr, order); \ area->count--; \ EXPAND(ret, map_nr, order, new_order, area); \ spin_unlock_irqrestore(&page_alloc_lock, flags); \ @@ -179,13 +244,32 @@ atomic_set(&map->count, 1); \ } while (0) +static void refile_local_pages(void) +{ + if (current->nr_local_pages) { + struct page * page; + struct list_head * entry; + int nr_pages = current->nr_local_pages; + + while ((entry = current->local_pages.next) != &current->local_pages) { + list_del(entry); + page = (struct page *) entry; + free_local_pages(page); + if (!nr_pages--) + panic("__get_free_pages local_pages list corrupted I"); + } + if (nr_pages) + panic("__get_free_pages local_pages list corrupted II"); + current->nr_local_pages = 0; + } +} + unsigned long __get_free_pages(int gfp_mask, unsigned long order) { unsigned long flags; - static atomic_t free_before_allocate = ATOMIC_INIT(0); if (order >= NR_MEM_LISTS) - goto nopage; + goto out; #ifdef ATOMIC_MEMORY_DEBUGGING if ((gfp_mask & __GFP_WAIT) && in_interrupt()) { @@ -194,26 +278,25 @@ printk("gfp called nonatomically from interrupt %p\n", __builtin_return_address(0)); } - goto nopage; + goto out; } #endif /* + * Acquire lock before reading nr_free_pages to make sure it + * won't change from under us. + */ + spin_lock_irqsave(&page_alloc_lock, flags); + + /* * If this is a recursive call, we'd better * do our best to just allocate things without * further thought. */ if (!(current->flags & PF_MEMALLOC)) { - int freed; extern struct wait_queue * kswapd_wait; - /* Somebody needs to free pages so we free some of our own. */ - if (atomic_read(&free_before_allocate)) { - current->flags |= PF_MEMALLOC; - try_to_free_pages(gfp_mask); - current->flags &= ~PF_MEMALLOC; - } - +#ifndef CONFIG_BIGMEM if (nr_free_pages > freepages.low) goto ok_to_allocate; @@ -223,35 +306,88 @@ /* Do we have to block or can we proceed? */ if (nr_free_pages > freepages.min) goto ok_to_allocate; +#else + if (gfp_mask & __GFP_BIGMEM) { + if (nr_free_pages > freepages.low) + goto ok_to_allocate; + + /* + * Wake kswapd only if the normal classzone + * is low on memory, otherwise waking up kswapd would + * be useless. + */ + if (nr_free_pages-nr_free_bigpages <= freepages.low && + waitqueue_active(&kswapd_wait)) + wake_up_interruptible(&kswapd_wait); + + /* Do we have to block or can we proceed?
*/ + if (nr_free_pages > freepages.min) + goto ok_to_allocate; + } else { + if (nr_free_pages-nr_free_bigpages > freepages.low) + goto ok_to_allocate; + + if (waitqueue_active(&kswapd_wait)) + wake_up_interruptible(&kswapd_wait); + + /* Do we have to block or can we proceed? */ + if (nr_free_pages-nr_free_bigpages > freepages.min) + goto ok_to_allocate; + } +#endif + if (gfp_mask & __GFP_WAIT) { + int freed; + /* + * If the task is ok to sleep, it's also fine + * to release irqs here. + */ + spin_unlock_irq(&page_alloc_lock); + + current->flags |= PF_MEMALLOC|PF_FREE_PAGES; + current->allocation_order = order; + freed = try_to_free_pages(gfp_mask); + current->flags &= ~(PF_MEMALLOC|PF_FREE_PAGES); + + spin_lock_irq(&page_alloc_lock); + refile_local_pages(); + + /* + * Re-check we're still low on memory after we blocked + * for some time. Somebody may have released lots of + * memory from under us while we were trying to free + * the pages. We check against pages_high to be sure + * to succeed only if lots of memory has been released. + */ +#ifndef CONFIG_BIGMEM + if (nr_free_pages > freepages.high) + goto ok_to_allocate; +#else + if (gfp_mask & __GFP_BIGMEM) { + if (nr_free_pages > freepages.high) + goto ok_to_allocate; + } else { + if (nr_free_pages-nr_free_bigpages > freepages.high) + goto ok_to_allocate; + } +#endif - current->flags |= PF_MEMALLOC; - atomic_inc(&free_before_allocate); - freed = try_to_free_pages(gfp_mask); - atomic_dec(&free_before_allocate); - current->flags &= ~PF_MEMALLOC; - - /* - * Re-check we're still low on memory after we blocked - * for some time. Somebody may have released lots of - * memory from under us while we was trying to free - * the pages. We check against pages_high to be sure - * to succeed only if lots of memory is been released. - */ - if (nr_free_pages > freepages.high) - goto ok_to_allocate; - - if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH))) - goto nopage; + if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH))) + goto nopage; + } } ok_to_allocate: - spin_lock_irqsave(&page_alloc_lock, flags); /* if it's not a dma request, try non-dma first */ - if (!(gfp_mask & __GFP_DMA)) + if (!(gfp_mask & __GFP_DMA)) { +#ifdef CONFIG_BIGMEM + if (gfp_mask & __GFP_BIGMEM) + RMQUEUE_TYPE(order, 2); +#endif RMQUEUE_TYPE(order, 0); + } RMQUEUE_TYPE(order, 1); + nopage: spin_unlock_irqrestore(&page_alloc_lock, flags); - -nopage: + out: return 0; } @@ -266,7 +402,9 @@ unsigned type; spin_lock_irqsave(&page_alloc_lock, flags); - printk("Free pages: %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10)); + printk("Free pages: %6dkB (%6dkB BigMem)\n ( ", + nr_free_pages<<(PAGE_SHIFT-10), + nr_free_bigpages<<(PAGE_SHIFT-10)); printk("Free: %d (%d %d %d)\n", nr_free_pages, freepages.min, @@ -274,7 +412,19 @@ freepages.high); for (type = 0; type < NR_MEM_TYPES; type++) { unsigned long total = 0; +#ifdef CONFIG_BIGMEM + switch (type) + { + case 0: + case 1: +#endif printk("%sDMA: ", type ? "" : "Non"); +#ifdef CONFIG_BIGMEM + break; + case 2: + printk("BIGMEM: "); + } +#endif for (order=0 ; order < NR_MEM_LISTS; order++) { unsigned long nr = free_area[type][order].count; @@ -426,6 +576,8 @@ * this process.
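The PF_FREE_PAGES machinery in the hunks above gives a task that is inside try_to_free_pages() a private free list: pages it frees are parked on current->local_pages rather than the buddy lists, and refile_local_pages() returns them to the allocator while the page_alloc_lock is held, so another CPU cannot steal them before the reclaiming task's own allocation proceeds. A deliberately simplified userspace model of the round trip (the kernel refiles everything and then allocates under the same lock; here the task just keeps one page directly):

#include <stdio.h>
#include <stdlib.h>

struct page { struct page *next; };

/* Per-task state, standing in for current->flags and ->local_pages. */
static int reclaiming;                  /* models PF_FREE_PAGES */
static struct page *local_pages;        /* models current->local_pages */

static void buddy_free(struct page *p) { free(p); }  /* the global pool */

static void free_page_model(struct page *p)
{
        if (reclaiming) {               /* park it on the private list */
                p->next = local_pages;
                local_pages = p;
        } else {
                buddy_free(p);
        }
}

/* After reclaim: keep one page for ourselves, refile the rest. */
static struct page *finish_reclaim(void)
{
        struct page *mine = local_pages;

        if (mine)
                local_pages = mine->next;
        while (local_pages) {           /* models refile_local_pages() */
                struct page *p = local_pages;

                local_pages = p->next;
                buddy_free(p);
        }
        reclaiming = 0;
        return mine;                    /* cannot have been stolen */
}

int main(void)
{
        struct page *p;

        reclaiming = 1;
        free_page_model(malloc(sizeof(struct page)));
        free_page_model(malloc(sizeof(struct page)));
        p = finish_reclaim();
        printf("got %p\n", (void *)p);
        free(p);
        return 0;
}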
*/ delete_from_swap_cache(page_map); + page_map = replace_with_bigmem(page_map); + page = page_address(page_map); set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)))); return 1; } diff -urN 2.2.18/mm/page_io.c 2.2.18aa1/mm/page_io.c --- 2.2.18/mm/page_io.c Tue Jun 13 03:48:15 2000 +++ 2.2.18aa1/mm/page_io.c Mon Dec 11 17:20:52 2000 @@ -112,7 +112,7 @@ * as if it were: we are not allowed to manipulate the inode * hashing for locked pages. */ - if (page->offset != entry) { + if (pgoff2ulong(page->index) != entry) { printk ("swap entry mismatch"); return; } @@ -265,8 +265,8 @@ printk("VM: swap page is not in swap cache\n"); return; } - if (page->offset != entry) { - printk ("swap entry mismatch"); + if (pgoff2ulong(page->index) != entry) { + printk ("VM: swap entry mismatch"); return; } rw_swap_page_base(rw, entry, page, wait); @@ -291,12 +291,12 @@ printk ("VM: read_swap_page: page already in page cache!\n"); return; } - page->inode = &swapper_inode; - page->offset = entry; + page->inode = &swapper_inode; + page->index = ulong2pgoff(entry); atomic_inc(&page->count); /* Protect from shrink_mmap() */ rw_swap_page(rw, entry, buffer, 1); atomic_dec(&page->count); - page->inode = 0; + page->inode = 0; clear_bit(PG_swap_cache, &page->flags); } diff -urN 2.2.18/mm/swap.c 2.2.18aa1/mm/swap.c --- 2.2.18/mm/swap.c Mon Jan 18 02:27:01 1999 +++ 2.2.18aa1/mm/swap.c Mon Dec 11 17:20:43 2000 @@ -47,13 +47,13 @@ atomic_t nr_async_pages = ATOMIC_INIT(0); buffer_mem_t buffer_mem = { - 2, /* minimum percent buffer */ + 4, /* minimum percent buffer */ 10, /* borrow percent buffer */ 60 /* maximum percent buffer */ }; buffer_mem_t page_cache = { - 2, /* minimum percent page cache */ + 4, /* minimum percent page cache */ 15, /* borrow percent page cache */ 75 /* maximum */ }; diff -urN 2.2.18/mm/swap_state.c 2.2.18aa1/mm/swap_state.c --- 2.2.18/mm/swap_state.c Mon Jan 17 16:44:50 2000 +++ 2.2.18aa1/mm/swap_state.c Mon Dec 11 17:20:52 2000 @@ -54,7 +54,7 @@ if (PageTestandSetSwapCache(page)) { printk(KERN_ERR "swap_cache: replacing non-empty entry %08lx " "on page %08lx\n", - page->offset, page_address(page)); + pgoff2ulong(page->index), page_address(page)); return 0; } if (page->inode) { @@ -63,9 +63,10 @@ return 0; } atomic_inc(&page->count); + page->flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced)); page->inode = &swapper_inode; - page->offset = entry; - add_page_to_hash_queue(page, &swapper_inode, entry); + page->index = ulong2pgoff(entry); + add_page_to_hash_queue(page, &swapper_inode, ulong2pgoff(entry)); add_page_to_inode_queue(&swapper_inode, page); return 1; } @@ -203,7 +204,7 @@ */ void delete_from_swap_cache(struct page *page) { - long entry = page->offset; + long entry = pgoff2ulong(page->index); #ifdef SWAP_CACHE_INFO swap_cache_del_total++; @@ -251,7 +252,7 @@ swap_cache_find_total++; #endif while (1) { - found = find_page(&swapper_inode, entry); + found = find_page(&swapper_inode, ulong2pgoff(entry)); if (!found) return 0; if (found->inode != &swapper_inode || !PageSwapCache(found)) diff -urN 2.2.18/mm/vmalloc.c 2.2.18aa1/mm/vmalloc.c --- 2.2.18/mm/vmalloc.c Mon Dec 11 16:58:07 2000 +++ 2.2.18aa1/mm/vmalloc.c Mon Dec 11 17:20:48 2000 @@ -2,6 +2,7 @@ * linux/mm/vmalloc.c * * Copyright (C) 1993 Linus Torvalds + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 */ #include @@ -95,7 +96,7 @@ unsigned long page; if (!pte_none(*pte)) printk("alloc_area_pte: page already exists\n"); - page = __get_free_page(GFP_KERNEL); + page = 
__get_free_page(GFP_KERNEL|GFP_BIGMEM); if (!page) return -ENOMEM; set_pte(pte, mk_pte(page, PAGE_KERNEL)); diff -urN 2.2.18/mm/vmscan.c 2.2.18aa1/mm/vmscan.c --- 2.2.18/mm/vmscan.c Tue Sep 5 02:28:50 2000 +++ 2.2.18aa1/mm/vmscan.c Mon Dec 11 17:20:52 2000 @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -60,7 +61,8 @@ if (PageReserved(page_map) || PageLocked(page_map) - || ((gfp_mask & __GFP_DMA) && !PageDMA(page_map))) + || ((gfp_mask & __GFP_DMA) && !PageDMA(page_map)) + || (!(gfp_mask & __GFP_BIGMEM) && PageBIGMEM(page_map))) return 0; /* @@ -72,7 +74,7 @@ * memory, and we should just continue our scan. */ if (PageSwapCache(page_map)) { - entry = page_map->offset; + entry = pgoff2ulong(page_map->index); swap_duplicate(entry); set_pte(page_table, __pte(entry)); drop_pte: @@ -96,6 +98,9 @@ * some real work in the future in "shrink_mmap()". */ if (!pte_dirty(pte)) { + if (page_map->inode && pgcache_under_min()) + /* unmapping this page would be useless */ + return 0; flush_cache_page(vma, address); pte_clear(page_table); goto drop_pte; @@ -106,7 +111,7 @@ * we cannot do I/O! Avoid recursing on FS * locks etc. */ - if (!(gfp_mask & __GFP_IO)) + if (!(gfp_mask & __GFP_IO) || current->fs_locks) return 0; /* @@ -151,6 +156,9 @@ if (!entry) return 0; /* No swap space left */ + if (!(page_map = prepare_bigmem_swapout(page_map))) + goto out_swap_free; + vma->vm_mm->rss--; tsk->nswap++; set_pte(page_table, __pte(entry)); @@ -162,10 +170,14 @@ set_bit(PG_locked, &page_map->flags); /* OK, do a physical asynchronous write to swap. */ - rw_swap_page(WRITE, entry, (char *) page, 0); + rw_swap_page(WRITE, entry, (char *) page_address(page_map), 0); __free_page(page_map); return 1; + + out_swap_free: + swap_free(entry); + return 0; } /* @@ -208,6 +220,8 @@ result = try_to_swap_out(tsk, vma, address, pte, gfp_mask); if (result) return result; + if (current->need_resched) + return 2; address += PAGE_SIZE; pte++; } while (address < end); @@ -251,7 +265,7 @@ unsigned long end; /* Don't swap out areas which are locked down */ - if (vma->vm_flags & VM_LOCKED) + if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) return 0; pgdir = pgd_offset(tsk->mm, address); @@ -327,7 +341,7 @@ * Think of swap_cnt as a "shadow rss" - it tells us which process * we want to page out (always try largest first). */ - counter = nr_tasks / (priority+1); + counter = nr_tasks / priority; if (counter < 1) counter = 1; @@ -361,8 +375,13 @@ goto out; } - if (swap_out_process(pbest, gfp_mask)) + switch (swap_out_process(pbest, gfp_mask)) { + case 1: return 1; + case 2: + current->state = TASK_RUNNING; + schedule(); + } } out: return 0; @@ -377,11 +396,9 @@ * cluster them so that we get good swap-out behaviour. See * the "free_memory()" macro for details. */ -static int do_try_to_free_pages(unsigned int gfp_mask) +int try_to_free_pages(unsigned int gfp_mask) { int priority; - int ret = 0; - int swapcount; int count = SWAP_CLUSTER_MAX; lock_kernel(); @@ -389,41 +406,34 @@ /* Always trim SLAB caches when memory gets low. */ kmem_cache_reap(gfp_mask); - priority = 6; + priority = 5; do { while (shrink_mmap(priority, gfp_mask)) { - ret = 1; if (!--count) goto done; } /* Try to get rid of some shared memory pages.. */ - if (gfp_mask & __GFP_IO) { + if (gfp_mask & __GFP_IO && !current->fs_locks) { while (shm_swap(priority, gfp_mask)) { - ret = 1; if (!--count) goto done; } } /* Then, try to page stuff out.. 
*/ - swapcount = count; while (swap_out(priority, gfp_mask)) { - ret = 1; - if (!--swapcount) - break; + if (!--count) + goto done; } shrink_dcache_memory(priority, gfp_mask); - } while (--priority >= 0); + } while (--priority > 0); done: unlock_kernel(); - if (!ret) - printk("VM: do_try_to_free_pages failed for %s...\n", - current->comm); /* Return success if we freed a page. */ - return ret; + return priority > 0; } /* @@ -497,9 +507,13 @@ */ interruptible_sleep_on(&kswapd_wait); - while (nr_free_pages < freepages.high) + /* + * In 2.2.x-bigmem kswapd is critical to provide GFP_ATOMIC + * allocations (not GFP_BIGMEM ones). + */ + while (nr_free_pages - nr_free_bigpages < freepages.high) { - if (do_try_to_free_pages(GFP_KSWAPD)) + if (try_to_free_pages(GFP_KSWAPD)) { if (tsk->need_resched) schedule(); @@ -510,17 +524,3 @@ } } } - -/* - * Called by non-kswapd processes when kswapd really cannot - * keep up with the demand for free memory. - */ -int try_to_free_pages(unsigned int gfp_mask) -{ - int retval = 1; - - if (gfp_mask & __GFP_WAIT) - retval = do_try_to_free_pages(gfp_mask); - return retval; -} - diff -urN 2.2.18/net/core/sock.c 2.2.18aa1/net/core/sock.c --- 2.2.18/net/core/sock.c Mon Dec 11 16:58:08 2000 +++ 2.2.18aa1/net/core/sock.c Mon Dec 11 17:20:44 2000 @@ -1019,7 +1019,7 @@ { if (!sk->dead) { wake_up_interruptible(sk->sleep); - sock_wake_async(sk->socket,0); + sock_wake_async(sk->socket,0,POLL_ERR); } } @@ -1027,7 +1027,7 @@ { if(!sk->dead) { wake_up_interruptible(sk->sleep); - sock_wake_async(sk->socket,1); + sock_wake_async(sk->socket,1,POLL_IN); } } @@ -1042,7 +1042,7 @@ /* Should agree with poll, otherwise some programs break */ if (sock_writeable(sk)) - sock_wake_async(sk->socket, 2); + sock_wake_async(sk->socket, 2, POLL_OUT); } } diff -urN 2.2.18/net/ipv4/tcp.c 2.2.18aa1/net/ipv4/tcp.c --- 2.2.18/net/ipv4/tcp.c Mon Dec 11 16:58:08 2000 +++ 2.2.18aa1/net/ipv4/tcp.c Mon Dec 11 17:20:52 2000 @@ -416,6 +416,8 @@ #include #include #include +#include +#include #include #include @@ -615,7 +617,7 @@ wake_up_interruptible(sk->sleep); if (sock_wspace(sk) >= tcp_min_write_space(sk)) - sock_wake_async(sk->socket, 2); + sock_wake_async(sk->socket, 2, POLL_OUT); } @@ -767,6 +769,7 @@ int iovlen, flags; int mss_now; int err, copied; + DECLARE_LOCAL_LOCK_DEPTH(lock_depth); lock_sock(sk); @@ -848,10 +851,13 @@ skb->csum = csum_partial(skb->data, skb->len, 0); } else { + release_kernel_lock_save(lock_depth); skb->csum = csum_and_copy_from_user( from, skb_put(skb, copy), copy, skb->csum, &err); + conditional_schedule(); + reacquire_kernel_lock_restore(lock_depth); } /* @@ -966,8 +972,11 @@ * Reserve header space and checksum the data. 
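The reworked try_to_free_pages() above runs the reclaim passes at decreasing priority, 5 down to 1, where a lower priority makes each pass scan more, and it reports success only if the full cluster was freed before priority reached zero. A skeleton of that control flow with stub reclaimers:

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32

/* Stand-ins: each returns nonzero while a single call still reclaims. */
static int shrink_caches_once(int priority) { return priority > 2; }
static int swap_out_once(int priority)      { return priority != 0; }

static int try_to_free_pages_model(void)
{
        int count = SWAP_CLUSTER_MAX;
        int priority = 5;

        do {
                /* lower priority == wider scan: later rounds try harder */
                while (shrink_caches_once(priority)) {
                        if (!--count)
                                goto done;
                }
                while (swap_out_once(priority)) {
                        if (!--count)
                                goto done;
                }
        } while (--priority > 0);
done:
        return priority > 0;    /* freed the full cluster in time? */
}

int main(void)
{
        printf("success=%d\n", try_to_free_pages_model());
        return 0;
}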
diff -urN 2.2.18/net/ipv4/tcp_input.c 2.2.18aa1/net/ipv4/tcp_input.c
--- 2.2.18/net/ipv4/tcp_input.c	Mon Dec 11 16:58:08 2000
+++ 2.2.18aa1/net/ipv4/tcp_input.c	Mon Dec 11 17:20:46 2000
@@ -97,6 +97,7 @@
  */
 static void tcp_delack_estimator(struct tcp_opt *tp)
 {
+	tcp_exit_quickack_mode(tp);
 	if(tp->ato == 0) {
 		tp->lrcvtime = tcp_time_stamp;

@@ -115,10 +116,7 @@
 			if(m > tp->rto)
 				tp->ato = tp->rto;
 			else {
-				/* This funny shift makes sure we
-				 * clear the "quick ack mode" bit.
-				 */
-				tp->ato = ((tp->ato << 1) >> 2) + m;
+				tp->ato = (tp->ato >> 1) + m;
 			}
 		}
 	}
@@ -1187,7 +1185,7 @@

 	if (!sk->dead) {
 		sk->state_change(sk);
-		sock_wake_async(sk->socket, 1);
+		sock_wake_async(sk->socket, 1, POLL_HUP);
 	}
 }

@@ -1681,6 +1679,7 @@
 			kill_proc(sk->proc, SIGURG, 1);
 		else
 			kill_pg(-sk->proc, SIGURG, 1);
+		sock_wake_async(sk->socket, 3, POLL_PRI);
 	}

 	/* We may be adding urgent data when the last byte read was
@@ -2225,7 +2224,7 @@
 			if(!sk->dead) {
 				sk->state_change(sk);
-				sock_wake_async(sk->socket, 0);
+				sock_wake_async(sk->socket, 0, POLL_OUT);
 			}
 		} else {
 			if(th->syn && !th->rst) {
diff -urN 2.2.18/net/ipv4/tcp_ipv4.c 2.2.18aa1/net/ipv4/tcp_ipv4.c
--- 2.2.18/net/ipv4/tcp_ipv4.c	Mon Dec 11 16:58:08 2000
+++ 2.2.18aa1/net/ipv4/tcp_ipv4.c	Mon Dec 11 17:20:46 2000
@@ -1402,6 +1402,7 @@
 	newtp->snd_una = req->snt_isn + 1;
 	newtp->srtt = 0;
 	newtp->ato = 0;
+	tcp_enter_quickack_mode(newtp);
 	newtp->snd_wl1 = req->rcv_isn;
 	newtp->snd_wl2 = req->snt_isn;

@@ -1956,6 +1957,7 @@
 	skb_queue_head_init(&tp->out_of_order_queue);
 	tcp_init_xmit_timers(sk);
+	tcp_enter_quickack_mode(tp);

 	tp->rto = TCP_TIMEOUT_INIT;		/*TCP_WRITE_TIME*/
 	tp->mdev = TCP_TIMEOUT_INIT;
 	tp->mss_clamp = ~0;
diff -urN 2.2.18/net/ipv4/tcp_output.c 2.2.18aa1/net/ipv4/tcp_output.c
--- 2.2.18/net/ipv4/tcp_output.c	Mon Dec 11 16:58:08 2000
+++ 2.2.18aa1/net/ipv4/tcp_output.c	Mon Dec 11 17:20:46 2000
@@ -1042,6 +1042,17 @@
 		timeout = (tp->ato << 1) >> 1;
 		if (timeout > max_timeout)
 			timeout = max_timeout;
+		if (!timeout)
+		{
+			timeout = tp->rto;
+			if ((signed) timeout <= 0)
+			{
+				printk(KERN_ERR
+					"tcp_send_delayed_ack: rto %ld!\n", timeout);
+				timeout = 1;
+			}
+			timeout = min(timeout, max_timeout);
+		}
 		timeout += jiffies;

 		/* Use new timeout only if there wasn't a older one earlier.
 		 */
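Together, the three TCP diffs above make a connection start out in quick-ack
mode (tcp_enter_quickack_mode() at socket and child-socket creation, left
again by tcp_delack_estimator() once real delayed-ack timing data exists), so
the opening ACKs of slow start are not held back.  Because tp->ato can now
legitimately be zero, tcp_send_delayed_ack() needs the fallback added above;
isolated as a plain function it behaves as follows (a sketch with simplified
names, jiffies arithmetic left to the caller):

/* Sketch of the timeout selection now done in tcp_send_delayed_ack(). */
long delack_timeout(long ato, long rto, long max_timeout)
{
	long timeout = (ato << 1) >> 1;

	if (timeout > max_timeout)
		timeout = max_timeout;
	if (!timeout) {
		/* quick-ack mode: ato == 0, fall back to the RTO */
		timeout = rto;
		if (timeout <= 0)	/* guard against a bogus rto */
			timeout = 1;
		if (timeout > max_timeout)
			timeout = max_timeout;
	}
	return timeout;
}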
diff -urN 2.2.18/net/ipv4/tcp_timer.c 2.2.18aa1/net/ipv4/tcp_timer.c
--- 2.2.18/net/ipv4/tcp_timer.c	Tue Jun 13 03:48:15 2000
+++ 2.2.18aa1/net/ipv4/tcp_timer.c	Mon Dec 11 17:20:46 2000
@@ -195,7 +195,21 @@
 		if (!atomic_read(&sk->sock_readers))
 			tcp_send_ack(sk);
 		else
-			tcp_send_delayed_ack(&(sk->tp_pinfo.af_tcp), HZ/10);
+		{
+			struct tcp_opt * tp = &(sk->tp_pinfo.af_tcp);
+			int rto;
+
+			rto = tp->rto;
+			if (rto <= 0)
+			{
+				printk(KERN_ERR
+					"tcp_delack_timer: rto %d!\n", rto);
+				rto = 1;
+			}
+			rto = min(rto, HZ/10);
+			tp->delack_timer.expires = rto + jiffies;
+			add_timer(&tp->delack_timer);
+		}
 	}
 }

diff -urN 2.2.18/net/socket.c 2.2.18aa1/net/socket.c
--- 2.2.18/net/socket.c	Mon Dec 11 16:58:08 2000
+++ 2.2.18aa1/net/socket.c	Mon Dec 11 17:20:44 2000
@@ -546,7 +546,7 @@
 	return 0;
 }

-int sock_wake_async(struct socket *sock, int how)
+int sock_wake_async(struct socket *sock, int how, int band)
 {
 	if (!sock || !sock->fasync_list)
 		return -1;
@@ -563,8 +563,10 @@
 		/* fall through */
 	case 0:
 	call_kill:
-		kill_fasync(sock->fasync_list, SIGIO);
+		kill_fasync(sock->fasync_list, SIGIO, band);
 		break;
+	case 3:
+		kill_fasync(sock->fasync_list, SIGURG, band);
 	}
 	return 0;
 }
diff -urN 2.2.18/net/unix/af_unix.c 2.2.18aa1/net/unix/af_unix.c
--- 2.2.18/net/unix/af_unix.c	Mon Dec 11 16:58:09 2000
+++ 2.2.18aa1/net/unix/af_unix.c	Mon Dec 11 17:20:44 2000
@@ -1525,7 +1525,7 @@
 		return;
 	wake_up_interruptible(sk->sleep);
 	if (sk->sndbuf - (int)atomic_read(&sk->wmem_alloc) >= MIN_WRITE_SPACE)
-		sock_wake_async(sk->socket, 2);
+		sock_wake_async(sk->socket, 2, POLL_OUT);
 }

 #ifdef CONFIG_PROC_FS
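The point of threading a band argument from sock_wake_async() down to
kill_fasync() throughout this patch is that the POLL_* code can now reach
user space.  A process that requests a queued real-time signal with
fcntl(F_SETSIG) then receives the event type in siginfo instead of a bare
SIGIO.  A hedged user-space sketch (the descriptor and the choice of SIGRTMIN
are placeholders for illustration, not part of the patch):

#define _GNU_SOURCE			/* for F_SETSIG */
#include <fcntl.h>
#include <signal.h>
#include <string.h>
#include <unistd.h>

static void io_handler(int sig, siginfo_t *si, void *ctx)
{
	(void)sig; (void)ctx;
	/* si->si_code carries the band passed to kill_fasync() */
	if (si->si_code == POLL_IN)
		write(2, "readable\n", 9);	/* async-signal-safe */
	else if (si->si_code == POLL_OUT)
		write(2, "writable\n", 9);
}

int main(void)
{
	struct sigaction sa;
	int fd = 0;			/* placeholder: a connected socket */

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = io_handler;
	sa.sa_flags = SA_SIGINFO;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGRTMIN, &sa, NULL);

	fcntl(fd, F_SETOWN, getpid());
	fcntl(fd, F_SETSIG, SIGRTMIN);	/* queued signals with siginfo */
	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC | O_NONBLOCK);

	for (;;)
		pause();
}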