diff -urN 2.3.99-pre6/fs/buffer.c 2.3.99-pre6-VM-2/fs/buffer.c --- 2.3.99-pre6/fs/buffer.c Thu Apr 27 17:56:42 2000 +++ 2.3.99-pre6-VM-2/fs/buffer.c Fri Apr 28 17:01:57 2000 @@ -2104,7 +2104,8 @@ spin_unlock(&free_list[isize].lock); page->buffers = bh; - lru_cache_add(page); + page->flags &= ~(1 << PG_referenced); + lru_cache_add(page, LRU_NORMAL_CACHE); atomic_inc(&buffermem_pages); return 1; diff -urN 2.3.99-pre6/fs/dcache.c 2.3.99-pre6-VM-2/fs/dcache.c --- 2.3.99-pre6/fs/dcache.c Thu Apr 27 17:56:42 2000 +++ 2.3.99-pre6-VM-2/fs/dcache.c Fri Apr 28 16:44:32 2000 @@ -513,19 +513,20 @@ */ int shrink_dcache_memory(int priority, unsigned int gfp_mask, zone_t * zone) { - int count = 0; - lock_kernel(); - if (priority) - count = dentry_stat.nr_unused / priority; - prune_dcache(count); - unlock_kernel(); - /* FIXME: kmem_cache_shrink here should tell us - the number of pages freed, and it should - work in a __GFP_DMA/__GFP_HIGHMEM behaviour - to free only the interesting pages in - function of the needs of the current allocation. */ - kmem_cache_shrink(dentry_cache); - + if (gfp_mask & __GFP_IO) { + int count = 0; + lock_kernel(); + if (priority) + count = dentry_stat.nr_unused / priority; + prune_dcache(count); + unlock_kernel(); + /* FIXME: kmem_cache_shrink here should tell us + the number of pages freed, and it should + work in a __GFP_DMA/__GFP_HIGHMEM behaviour + to free only the interesting pages in + function of the needs of the current allocation. */ + kmem_cache_shrink(dentry_cache); + } return 0; } diff -urN 2.3.99-pre6/fs/inode.c 2.3.99-pre6-VM-2/fs/inode.c --- 2.3.99-pre6/fs/inode.c Thu Apr 27 17:56:42 2000 +++ 2.3.99-pre6-VM-2/fs/inode.c Fri Apr 28 16:44:32 2000 @@ -452,18 +452,19 @@ int shrink_icache_memory(int priority, int gfp_mask, zone_t *zone) { - int count = 0; + if (gfp_mask & __GFP_IO) { + int count = 0; - if (priority) - count = inodes_stat.nr_unused / priority; - prune_icache(count); - /* FIXME: kmem_cache_shrink here should tell us - the number of pages freed, and it should - work in a __GFP_DMA/__GFP_HIGHMEM behaviour - to free only the interesting pages in - function of the needs of the current allocation. */ - kmem_cache_shrink(inode_cachep); - + if (priority) + count = inodes_stat.nr_unused / priority; + prune_icache(count); + /* FIXME: kmem_cache_shrink here should tell us + the number of pages freed, and it should + work in a __GFP_DMA/__GFP_HIGHMEM behaviour + to free only the interesting pages in + function of the needs of the current allocation. 
*/ + kmem_cache_shrink(inode_cachep); + } return 0; } diff -urN 2.3.99-pre6/include/linux/cache.h 2.3.99-pre6-VM-2/include/linux/cache.h --- 2.3.99-pre6/include/linux/cache.h Sun Apr 23 22:42:11 2000 +++ 2.3.99-pre6-VM-2/include/linux/cache.h Fri Apr 28 18:36:38 2000 @@ -1,6 +1,7 @@ #ifndef __LINUX_CACHE_H #define __LINUX_CACHE_H +#include #include #ifndef L1_CACHE_ALIGN @@ -13,6 +14,14 @@ #ifndef ____cacheline_aligned #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) +#endif + +#ifndef ____cacheline_aligned_in_smp +#ifdef CONFIG_SMP +#define ____cacheline_aligned_in_smp ____cacheline_aligned +#else +#define ____cacheline_aligned_in_smp +#endif /* CONFIG_SMP */ #endif #ifndef __cacheline_aligned diff -urN 2.3.99-pre6/include/linux/mm.h 2.3.99-pre6-VM-2/include/linux/mm.h --- 2.3.99-pre6/include/linux/mm.h Thu Apr 27 17:56:45 2000 +++ 2.3.99-pre6-VM-2/include/linux/mm.h Fri Apr 28 18:36:38 2000 @@ -15,7 +15,6 @@ extern unsigned long num_physpages; extern void * high_memory; extern int page_cluster; -extern struct list_head lru_cache; #include #include @@ -146,6 +145,7 @@ unsigned long index; struct page *next_hash; atomic_t count; + int map_count; unsigned long flags; /* atomic flags, some possibly updated asynchronously */ struct list_head lru; wait_queue_head_t wait; @@ -308,21 +308,21 @@ * can allocate highmem pages, the *get*page*() variants return * virtual kernel addresses to the allocated page(s). */ -extern struct page * FASTCALL(__alloc_pages(zonelist_t *zonelist, unsigned long order)); +extern struct page * FASTCALL(__alloc_pages(gfpmask_zone_t *, unsigned long order)); extern struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order); #ifndef CONFIG_DISCONTIGMEM extern inline struct page * alloc_pages(int gfp_mask, unsigned long order) { /* temporary check. */ - if (contig_page_data.node_zonelists[gfp_mask].gfp_mask != (gfp_mask)) + if (contig_page_data.node_gfpmask_zone[gfp_mask].gfp_mask != (gfp_mask)) BUG(); /* * Gets optimized away by the compiler. 
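[editor's note] The ____cacheline_aligned_in_smp helper added to include/linux/cache.h above is what later keeps the per-node lru_cache spinlock on its own cache line, so the lock does not false-share with the hot counters next to it, while costing nothing on UP builds. A minimal userspace sketch of the effect, assuming a 64-byte line instead of the real SMP_CACHE_BYTES and a plain int standing in for spinlock_t:

#include <stddef.h>
#include <stdio.h>

#define SMP_CACHE_BYTES 64              /* assumed line size, demo only */
#define CONFIG_SMP 1

#ifndef ____cacheline_aligned
#define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES)))
#endif

#ifdef CONFIG_SMP
#define ____cacheline_aligned_in_smp ____cacheline_aligned
#else
#define ____cacheline_aligned_in_smp
#endif

struct demo_lru {
        unsigned long nr_cache_pages;   /* frequently touched counters ...  */
        unsigned long nr_map_pages;
        int lock ____cacheline_aligned_in_smp;  /* ... lock on its own line */
};

int main(void)
{
        /* With CONFIG_SMP the lock lands at offset SMP_CACHE_BYTES; without
         * it the padding disappears and the struct shrinks. */
        printf("sizeof=%zu lock_offset=%zu\n",
               sizeof(struct demo_lru), offsetof(struct demo_lru, lock));
        return 0;
}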
*/ if (order >= MAX_ORDER) return NULL; - return __alloc_pages(contig_page_data.node_zonelists+(gfp_mask), order); + return __alloc_pages(contig_page_data.node_gfpmask_zone+gfp_mask, order); } #else /* !CONFIG_DISCONTIGMEM */ extern struct page * alloc_pages(int gfp_mask, unsigned long order); @@ -454,7 +454,7 @@ /* filemap.c */ extern void remove_inode_page(struct page *); extern unsigned long page_unuse(struct page *); -extern int shrink_mmap(int, int, zone_t *); +extern int shrink_mmap(int, zone_t *); extern void truncate_inode_pages(struct address_space *, loff_t); /* generic vm_area_ops exported for stackable file systems */ @@ -535,10 +535,8 @@ extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr); -#define buffer_under_min() (atomic_read(&buffermem_pages) * 100 < \ - buffer_mem.min_percent * num_physpages) -#define pgcache_under_min() (atomic_read(&page_cache_size) * 100 < \ - page_cache.min_percent * num_physpages) +#define lru_cache_under_min(lru_pages) ((lru_pages) * 100 < \ + lru_cache_mem.min_percent * num_physpages) #define vmlist_access_lock(mm) spin_lock(&mm->page_table_lock) #define vmlist_access_unlock(mm) spin_unlock(&mm->page_table_lock) diff -urN 2.3.99-pre6/include/linux/mmzone.h 2.3.99-pre6-VM-2/include/linux/mmzone.h --- 2.3.99-pre6/include/linux/mmzone.h Thu Apr 27 17:56:45 2000 +++ 2.3.99-pre6-VM-2/include/linux/mmzone.h Fri Apr 28 18:36:38 2000 @@ -21,16 +21,26 @@ struct pglist_data; +/* + * Memory balancing internally to the node can work correctly only on + * classzone basis while handling overlapped classzones. + */ typedef struct zone_struct { /* * Commonly accessed fields: */ - spinlock_t lock; unsigned long offset; unsigned long free_pages; - char low_on_memory; - char zone_wake_kswapd; + + /* + * Memory balancing is all classzone based, all the below + * fields refer to the classzone. The classzone includes + * the current zone plus all the lower zones in the MM. + */ + unsigned long classzone_free_pages; unsigned long pages_min, pages_low, pages_high; + int nr_zone; + char zone_wake_kswapd; /* * free areas of different sizes @@ -57,27 +67,35 @@ #define MAX_NR_ZONES 3 /* - * One allocation request operates on a zonelist. A zonelist - * is a list of zones, the first one is the 'goal' of the - * allocation, the other zones are fallback zones, in decreasing - * priority. - * - * Right now a zonelist takes up less than a cacheline. We never - * modify it apart from boot-up, and only a few indices are used, - * so despite the zonelist table being relatively big, the cache - * footprint of this construct is very small. + * The pgdat->node_gfpmask_zone[] array tell us which classzone + * we should allocate from given a certain gfpmask. It translates + * the gfpmask to a classzone. 
*/ -typedef struct zonelist_struct { - zone_t * zones [MAX_NR_ZONES+1]; // NULL delimited +typedef struct gfpmask_zone_s { + zone_t * classzone; int gfp_mask; -} zonelist_t; +} gfpmask_zone_t; #define NR_GFPINDEX 0x100 +#define LRU_SWAP_CACHE 0 +#define LRU_NORMAL_CACHE 1 +#define NR_LRU_CACHE 2 +/* the lru cache should be per-node */ +typedef struct lru_cache_s { + struct list_head heads[NR_LRU_CACHE]; + unsigned long nr_cache_pages; /* pages in the lrus */ + unsigned long nr_map_pages; /* pages temporarly out of the lru */ + /* keep lock in a separate cacheline to avoid ping pong in SMP */ + spinlock_t lock ____cacheline_aligned_in_smp; +} lru_cache_t; + struct bootmem_data; typedef struct pglist_data { + int nr_zones; zone_t node_zones[MAX_NR_ZONES]; - zonelist_t node_zonelists[NR_GFPINDEX]; + gfpmask_zone_t node_gfpmask_zone[NR_GFPINDEX]; + lru_cache_t lru_cache; struct page *node_mem_map; unsigned long *valid_addr_bitmap; struct bootmem_data *bdata; @@ -86,14 +104,14 @@ unsigned long node_size; int node_id; struct pglist_data *node_next; + spinlock_t freelist_lock; } pg_data_t; extern int numnodes; extern pg_data_t *pgdat_list; #define memclass(pgzone, tzone) (((pgzone)->zone_pgdat == (tzone)->zone_pgdat) \ - && (((pgzone) - (pgzone)->zone_pgdat->node_zones) <= \ - ((tzone) - (pgzone)->zone_pgdat->node_zones))) + && ((pgzone) <= (tzone))) /* * The following two are not meant for general usage. They are here as diff -urN 2.3.99-pre6/include/linux/sched.h 2.3.99-pre6-VM-2/include/linux/sched.h --- 2.3.99-pre6/include/linux/sched.h Thu Apr 27 17:56:45 2000 +++ 2.3.99-pre6-VM-2/include/linux/sched.h Fri Apr 28 18:36:38 2000 @@ -309,8 +309,8 @@ long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS]; /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap; + int low_on_memory:1; int swappable:1; - int hog:1; /* process credentials */ uid_t uid,euid,suid,fsuid; gid_t gid,egid,sgid,fsgid; diff -urN 2.3.99-pre6/include/linux/swap.h 2.3.99-pre6-VM-2/include/linux/swap.h --- 2.3.99-pre6/include/linux/swap.h Thu Apr 27 17:56:45 2000 +++ 2.3.99-pre6-VM-2/include/linux/swap.h Fri Apr 28 18:36:38 2000 @@ -87,7 +87,6 @@ /* linux/mm/vmscan.c */ extern int try_to_free_pages(unsigned int gfp_mask, zone_t *zone); -extern int swap_out(unsigned int gfp_mask, int priority); /* linux/mm/page_io.c */ extern void rw_swap_page(int, struct page *, int); @@ -160,27 +159,80 @@ return count > 1; } -extern spinlock_t pagemap_lru_lock; +/* + * Helper macros for lru_cache handling. 
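[editor's note] Where the old code built a NULL-terminated fallback zonelist per gfp_mask, the gfpmask_zone_t above reduces that to a single classzone, because a classzone already implies every lower zone. Below is a small userspace model of roughly what build_gfpmask_zone() computes; the classzone is kept as a zone index rather than a zone_t pointer, the gfp bit values are invented for the demo, and empty zones are ignored:

#include <stdio.h>

#define ZONE_DMA        0
#define ZONE_NORMAL     1
#define ZONE_HIGHMEM    2

/* invented bit values, demo only */
#define __GFP_DMA       0x01
#define __GFP_HIGHMEM   0x02
#define NR_GFPINDEX     0x100

typedef struct gfpmask_zone_s {
        int classzone;          /* zone index; the kernel stores a zone_t * */
        int gfp_mask;
} gfpmask_zone_t;

static gfpmask_zone_t node_gfpmask_zone[NR_GFPINDEX];

/* Every gfp_mask maps to exactly one classzone: the highest zone the caller
 * is allowed to use.  The classzone implicitly contains all lower zones,
 * so no fallback list is needed. */
static void build_gfpmask_zone(void)
{
        int i, k;

        for (i = 0; i < NR_GFPINDEX; i++) {
                k = ZONE_NORMAL;
                if (i & __GFP_HIGHMEM)
                        k = ZONE_HIGHMEM;
                if (i & __GFP_DMA)
                        k = ZONE_DMA;
                node_gfpmask_zone[i].gfp_mask = i;
                node_gfpmask_zone[i].classzone = k;
        }
}

int main(void)
{
        build_gfpmask_zone();
        printf("plain mask   -> classzone %d\n", node_gfpmask_zone[0].classzone);
        printf("DMA mask     -> classzone %d\n",
               node_gfpmask_zone[__GFP_DMA].classzone);
        printf("HIGHMEM mask -> classzone %d\n",
               node_gfpmask_zone[__GFP_HIGHMEM].classzone);
        return 0;
}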
+ */ +#define lru_cache_add(page, lru_type) \ +do { \ + lru_cache_t * this_lru = &(page)->zone->zone_pgdat->lru_cache; \ + if ((page)->map_count) \ + BUG(); \ + spin_lock(&this_lru->lock); \ + list_add(&(page)->lru, &this_lru->heads[lru_type]); \ + this_lru->nr_cache_pages++; \ + spin_unlock(&this_lru->lock); \ +} while (0) + +/* needs the lock on the page to be sure the page is in the lru list */ +#define lru_cache_map(page) \ +do { \ + lru_cache_t * this_lru = &(page)->zone->zone_pgdat->lru_cache; \ + if (!PageLocked(page)) \ + BUG(); \ + spin_lock(&this_lru->lock); \ + if (!(page)->map_count++) { \ + list_del(&(page)->lru); \ + this_lru->nr_cache_pages--; \ + this_lru->nr_map_pages++; \ + } \ + spin_unlock(&this_lru->lock); \ +} while (0) + +#define lru_cache_dup_map(page) \ +do { \ + lru_cache_t * this_lru = &(page)->zone->zone_pgdat->lru_cache; \ + if ((page)->map_count <= 0) \ + BUG(); \ + spin_lock(&this_lru->lock); \ + (page)->map_count++; \ + spin_unlock(&this_lru->lock); \ +} while (0) /* - * Helper macros for lru_pages handling. + * The page isn't in the lru list anymore so don't need the page lock. + * Don't discriminate between lru and put all pages that were mapped + * in the normal lru. */ -#define lru_cache_add(page) \ -do { \ - spin_lock(&pagemap_lru_lock); \ - list_add(&(page)->lru, &lru_cache); \ - nr_lru_pages++; \ - spin_unlock(&pagemap_lru_lock); \ +#define lru_cache_unmap(page) \ +do { \ + lru_cache_t * this_lru = &(page)->zone->zone_pgdat->lru_cache; \ + if ((page)->map_count <= 0) \ + BUG(); \ + spin_lock(&this_lru->lock); \ + if (!--(page)->map_count) { \ + list_add(&(page)->lru, &this_lru->heads[LRU_NORMAL_CACHE]); \ + this_lru->nr_cache_pages++; \ + this_lru->nr_map_pages--; \ + } \ + spin_unlock(&this_lru->lock); \ } while (0) -#define lru_cache_del(page) \ -do { \ - if (!PageLocked(page)) \ - BUG(); \ - spin_lock(&pagemap_lru_lock); \ - list_del(&(page)->lru); \ - nr_lru_pages--; \ - spin_unlock(&pagemap_lru_lock); \ +/* + * The map_count BUG() check will trigger if NFS will try to unlink + * from the pagecache mapped pages. That insn't safe. If that will become + * safe just change the below code to set map_count to 0. 
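[editor's note] These helpers (together with lru_cache_del() just below) give page->map_count a simple lifecycle: a page enters one of the per-node LRU lists with map_count 0, the first mapping pulls it off the list under the page lock, further mappings only bump the count (lru_cache_dup_map), and the last unmap puts it back on the normal LRU where shrink_mmap() can see it again. A simplified single-list userspace model of that lifecycle, with no page lock, no per-node data and invented names:

#include <assert.h>
#include <stdio.h>

/* Minimal stand-ins for the kernel structures the macros operate on. */
struct list { struct list *prev, *next; };

static void list_init(struct list *h) { h->prev = h->next = h; }

static void list_add(struct list *n, struct list *h)
{
        n->next = h->next;
        n->prev = h;
        h->next->prev = n;
        h->next = n;
}

static void list_del(struct list *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
}

struct page { struct list lru; int map_count; };

static struct list lru_head;    /* one list instead of the per-node pair */
static unsigned long nr_cache_pages, nr_map_pages;

static void lru_cache_add(struct page *p)
{
        assert(p->map_count == 0);      /* only unmapped pages may be added */
        list_add(&p->lru, &lru_head);
        nr_cache_pages++;
}

static void lru_cache_map(struct page *p)   /* caller holds the page lock */
{
        if (!p->map_count++) {  /* first mapping: hide from the scanner */
                list_del(&p->lru);
                nr_cache_pages--;
                nr_map_pages++;
        }                       /* later mappings act like lru_cache_dup_map() */
}

static void lru_cache_unmap(struct page *p)
{
        assert(p->map_count > 0);
        if (!--p->map_count) {  /* last unmap: back on the normal LRU */
                list_add(&p->lru, &lru_head);
                nr_cache_pages++;
                nr_map_pages--;
        }
}

int main(void)
{
        struct page p = { .map_count = 0 };

        list_init(&lru_head);
        lru_cache_add(&p);      /* enters the page cache, scanner can see it */
        lru_cache_map(&p);      /* first pte maps it, scanner loses sight    */
        lru_cache_map(&p);      /* fork()-style second mapping               */
        lru_cache_unmap(&p);
        lru_cache_unmap(&p);    /* last unmap, visible to shrink_mmap again  */
        printf("on lru: %lu, mapped-only: %lu\n", nr_cache_pages, nr_map_pages);
        return 0;
}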
+ */ +#define lru_cache_del(page) \ +do { \ + lru_cache_t * this_lru = &(page)->zone->zone_pgdat->lru_cache; \ + if (!PageLocked(page)) \ + BUG(); \ + if ((page)->map_count) \ + BUG(); \ + spin_lock(&this_lru->lock); \ + list_del(&(page)->lru); \ + this_lru->nr_cache_pages--; \ + spin_unlock(&this_lru->lock); \ } while (0) extern spinlock_t swaplock; diff -urN 2.3.99-pre6/include/linux/swapctl.h 2.3.99-pre6-VM-2/include/linux/swapctl.h --- 2.3.99-pre6/include/linux/swapctl.h Tue Apr 25 16:32:34 2000 +++ 2.3.99-pre6-VM-2/include/linux/swapctl.h Fri Apr 28 18:36:45 2000 @@ -11,8 +11,7 @@ unsigned int max_percent; } buffer_mem_v1; typedef buffer_mem_v1 buffer_mem_t; -extern buffer_mem_t buffer_mem; -extern buffer_mem_t page_cache; +extern buffer_mem_t lru_cache_mem; typedef struct freepages_v1 { diff -urN 2.3.99-pre6/include/linux/sysctl.h 2.3.99-pre6-VM-2/include/linux/sysctl.h --- 2.3.99-pre6/include/linux/sysctl.h Thu Apr 27 17:56:45 2000 +++ 2.3.99-pre6-VM-2/include/linux/sysctl.h Fri Apr 28 16:44:32 2000 @@ -119,15 +119,18 @@ enum { VM_SWAPCTL=1, /* struct: Set vm swapping control */ - VM_SWAPOUT=2, /* int: Linear or sqrt() swapout for hogs */ + VM_SWAPOUT=2, /* int: Background pageout interval */ VM_FREEPG=3, /* struct: Set free page thresholds */ VM_BDFLUSH=4, /* struct: Control buffer cache flushing */ VM_OVERCOMMIT_MEMORY=5, /* Turn off the virtual memory safety limit */ +#if 0 /* obsolete but don't reuse */ VM_BUFFERMEM=6, /* struct: Set buffer memory thresholds */ VM_PAGECACHE=7, /* struct: Set cache memory thresholds */ +#endif VM_PAGERDAEMON=8, /* struct: Control kswapd behaviour */ VM_PGT_CACHE=9, /* struct: Set page table cache parameters */ - VM_PAGE_CLUSTER=10 /* int: set number of pages to swap together */ + VM_PAGE_CLUSTER=10, /* int: set number of pages to swap together */ + VM_LRU_CACHE=11, /* struct: Set lru cache memory thresholds */ }; diff -urN 2.3.99-pre6/ipc/shm.c 2.3.99-pre6-VM-2/ipc/shm.c --- 2.3.99-pre6/ipc/shm.c Thu Apr 27 17:56:45 2000 +++ 2.3.99-pre6-VM-2/ipc/shm.c Fri Apr 28 16:44:32 2000 @@ -132,7 +132,7 @@ static int sysvipc_shm_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data); #endif -static void zshm_swap (int prio, int gfp_mask, zone_t *zone); +static void zshm_swap (int prio, zone_t *zone); static void zmap_unuse(swp_entry_t entry, struct page *page); static void shmzero_open(struct vm_area_struct *shmd); static void shmzero_close(struct vm_area_struct *shmd); @@ -1439,7 +1439,7 @@ if (!pte_present(page)) return RETRY; page_map = pte_page(page); - if (zone && (!memclass(page_map->zone, zone))) + if (!memclass(page_map->zone, zone)) return RETRY; if (shp->id != zero_id) swap_attempts++; @@ -1496,17 +1496,21 @@ struct shmid_kernel *shp; swp_entry_t swap_entry; unsigned long id, idx; - int loop = 0; + int loop; int counter; struct page * page_map; - zshm_swap(prio, gfp_mask, zone); + if (!(gfp_mask & __GFP_IO)) + return 0; + + zshm_swap(prio, zone); counter = shm_rss >> prio; if (!counter) return 0; if (shm_swap_preop(&swap_entry)) return 0; + loop = 0; shm_lockall(); check_id: shp = shm_get(swap_id); @@ -1819,7 +1823,7 @@ spin_unlock(&zmap_list_lock); } -static void zshm_swap (int prio, int gfp_mask, zone_t *zone) +static void zshm_swap (int prio, zone_t *zone) { struct shmid_kernel *shp; swp_entry_t swap_entry; diff -urN 2.3.99-pre6/kernel/sysctl.c 2.3.99-pre6-VM-2/kernel/sysctl.c --- 2.3.99-pre6/kernel/sysctl.c Thu Apr 27 17:56:45 2000 +++ 2.3.99-pre6-VM-2/kernel/sysctl.c Fri Apr 28 16:44:32 2000 @@ -233,16 +233,14 
@@ &bdflush_min, &bdflush_max}, {VM_OVERCOMMIT_MEMORY, "overcommit_memory", &sysctl_overcommit_memory, sizeof(sysctl_overcommit_memory), 0644, NULL, &proc_dointvec}, - {VM_BUFFERMEM, "buffermem", - &buffer_mem, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec}, - {VM_PAGECACHE, "pagecache", - &page_cache, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec}, {VM_PAGERDAEMON, "kswapd", &pager_daemon, sizeof(pager_daemon_t), 0644, NULL, &proc_dointvec}, {VM_PGT_CACHE, "pagetable_cache", &pgt_cache_water, 2*sizeof(int), 0644, NULL, &proc_dointvec}, {VM_PAGE_CLUSTER, "page-cluster", &page_cluster, sizeof(int), 0644, NULL, &proc_dointvec}, + {VM_LRU_CACHE, "lru_cache", + &lru_cache_mem, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec}, {0} }; diff -urN 2.3.99-pre6/mm/filemap.c 2.3.99-pre6-VM-2/mm/filemap.c --- 2.3.99-pre6/mm/filemap.c Thu Apr 27 17:56:45 2000 +++ 2.3.99-pre6-VM-2/mm/filemap.c Fri Apr 28 18:18:29 2000 @@ -44,14 +44,8 @@ atomic_t page_cache_size = ATOMIC_INIT(0); unsigned int page_hash_bits; struct page **page_hash_table; -struct list_head lru_cache; spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED; -/* - * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with - * the pagemap_lru_lock held. - */ -spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED; #define CLUSTER_PAGES (1 << page_cluster) #define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster) @@ -97,9 +91,6 @@ if (!PageLocked(page)) PAGE_BUG(page); - /* Initiate completion of any async operations */ - sync_page(page); - spin_lock(&pagecache_lock); remove_page_from_inode_queue(page); remove_page_from_hash_queue(page); @@ -111,19 +102,32 @@ { struct list_head *head, *curr; struct page * page; + LIST_HEAD(dispose); - repeat: head = &inode->i_mapping->pages; + repeat: spin_lock(&pagecache_lock); curr = head->next; while (curr != head) { page = list_entry(curr, struct page, list); - curr = curr->next; /* We cannot invalidate a locked page */ - if (TryLockPage(page)) + if (TryLockPage(page)) { + list_del(curr); list_add(curr, &dispose); + continue; + } + /* + * Avoid dropping mapped pages from the cache. We can't + * rely on the page->map_count for this check. We know + * shrink_mmap won't increase the page->count from under + * us since we hold the page lock. + */ + if (page_count(page) != 1) { + UnlockPage(page); + list_del(curr); list_add(curr, &dispose); continue; + } spin_unlock(&pagecache_lock); lru_cache_del(page); @@ -132,6 +136,7 @@ page_cache_release(page); goto repeat; } + list_splice(&dispose, head); spin_unlock(&pagecache_lock); } @@ -145,33 +150,28 @@ struct page * page; unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); unsigned long start; + LIST_HEAD(dispose); start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -repeat: head = &mapping->pages; +repeat: spin_lock(&pagecache_lock); curr = head->next; while (curr != head) { unsigned long offset; page = list_entry(curr, struct page, list); - curr = curr->next; offset = page->index; /* page wholly truncated - free it */ if (offset >= start) { - if (TryLockPage(page)) { - spin_unlock(&pagecache_lock); - get_page(page); - wait_on_page(page); - put_page(page); - goto repeat; - } get_page(page); spin_unlock(&pagecache_lock); + lock_page(page); + if (!page->buffers || block_flushpage(page, 0)) lru_cache_del(page); @@ -198,24 +198,21 @@ */ goto repeat; } + + list_del(curr); list_add(curr, &dispose); /* - * there is only one partial page possible. 
+ * there is only one partial page possible and it's the + * one preceeding the first wholly truncated page. */ - if (!partial) - continue; - - /* and it's the one preceeding the first wholly truncated page */ - if ((offset + 1) != start) + if (!partial || (offset + 1) != start) continue; /* partial truncate, clear end of page */ - if (TryLockPage(page)) { - spin_unlock(&pagecache_lock); - goto repeat; - } get_page(page); spin_unlock(&pagecache_lock); + lock_page(page); + memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); if (page->buffers) block_flushpage(page, partial); @@ -228,74 +225,68 @@ */ UnlockPage(page); page_cache_release(page); - get_page(page); - wait_on_page(page); - put_page(page); goto repeat; } + list_splice(&dispose, head); spin_unlock(&pagecache_lock); } -int shrink_mmap(int priority, int gfp_mask, zone_t *zone) +static int FASTCALL(__shrink_mmap(int priority, zone_t *zone, + unsigned long * __count, + lru_cache_t * this_lru, + int lru_type)); +static int __shrink_mmap(int priority, zone_t *zone, + unsigned long * __count, + lru_cache_t * this_lru, + int lru_type) { - int ret = 0, loop = 0, count; + int ret = 0; + unsigned long count = *__count; LIST_HEAD(young); LIST_HEAD(old); LIST_HEAD(forget); struct list_head * page_lru, * dispose; - struct page * page = NULL; - struct zone_struct * p_zone; - int maxloop = 256 >> priority; - - if (!zone) - BUG(); + struct page * page; + spinlock_t * lru_lock = &this_lru->lock; + struct list_head * lru_head = &this_lru->heads[lru_type]; + + spin_lock(lru_lock); - count = nr_lru_pages >> priority; - if (!count) - return ret; - - spin_lock(&pagemap_lru_lock); -again: - /* we need pagemap_lru_lock for list_del() ... subtle code below */ - while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) { + while (count > 0 && (page_lru = lru_head->prev) != lru_head) { page = list_entry(page_lru, struct page, lru); list_del(page_lru); - p_zone = page->zone; - /* - * These two tests are there to make sure we don't free too - * many pages from the "wrong" zone. We free some anyway, - * they are the least recently used pages in the system. - * When we don't free them, leave them in &old. - */ dispose = &old; - if (p_zone != zone && (loop > (maxloop / 4) || - p_zone->free_pages > p_zone->pages_high)) + /* don't account passes over not DMA pages */ + if (!memclass(page->zone, zone)) goto dispose_continue; - /* The page is in use, or was used very recently, put it in - * &young to make sure that we won't try to free it the next - * time */ - dispose = &young; + count--; + dispose = lru_head; if (test_and_clear_bit(PG_referenced, &page->flags)) + /* Roll the page at the top of the lru list, + * we could also be more aggressive putting + * the page in the young-dispose-list, so + * avoiding to free young pages in each pass. + */ goto dispose_continue; - count--; + dispose = &young; + + /* avoid unscalable SMP locking */ if (!page->buffers && page_count(page) > 1) goto dispose_continue; - /* Page not used -> free it; if that fails -> &old */ - dispose = &old; if (TryLockPage(page)) goto dispose_continue; - /* Release the pagemap_lru lock even if the page is not yet + /* Release the lru_cache lock even if the page is not yet queued in any lru queue since we have just locked down the page so nobody else may SMP race with us running a lru_cache_del() (lru_cache_del() always run with the page locked down ;). 
*/ - spin_unlock(&pagemap_lru_lock); + spin_unlock(lru_lock); /* avoid freeing the page while it's locked */ get_page(page); @@ -332,12 +323,17 @@ if (PageSwapCache(page)) { spin_unlock(&pagecache_lock); __delete_from_swap_cache(page); + /* + * We hold the lock on the page so we don't + * need to do an atomic clear_bit(). + */ + page->flags &= ~(1UL << PG_swap_entry); goto made_inode_progress; } /* is it a page-cache page? */ if (page->mapping) { - if (!PageDirty(page) && !pgcache_under_min()) { + if (!PageDirty(page)) { remove_page_from_inode_queue(page); remove_page_from_hash_queue(page); page->mapping = NULL; @@ -353,13 +349,12 @@ cache_unlock_continue: spin_unlock(&pagecache_lock); unlock_continue: - spin_lock(&pagemap_lru_lock); + spin_lock(lru_lock); UnlockPage(page); put_page(page); list_add(page_lru, dispose); continue; - /* we're holding pagemap_lru_lock, so we can just loop again */ dispose_continue: list_add(page_lru, dispose); } @@ -371,24 +366,41 @@ UnlockPage(page); put_page(page); ret = 1; - spin_lock(&pagemap_lru_lock); - /* nr_lru_pages needs the spinlock */ - nr_lru_pages--; - - loop++; - /* wrong zone? not looped too often? roll again... */ - if (page->zone != zone && loop < maxloop) - goto again; + spin_lock(lru_lock); + /* nr_pages needs the spinlock */ + this_lru->nr_cache_pages--; out: - list_splice(&young, &lru_cache); - list_splice(&old, lru_cache.prev); + list_splice(&young, lru_head); + list_splice(&old, lru_head->prev); - spin_unlock(&pagemap_lru_lock); + spin_unlock(lru_lock); + *__count = count; return ret; } +int shrink_mmap(int priority, zone_t *zone) +{ + lru_cache_t * this_lru; + unsigned long count; + int i; + + this_lru = &zone->zone_pgdat->lru_cache; + + count = this_lru->nr_cache_pages; + if (lru_cache_under_min(count)) + return 0; + + count /= priority + 1; + + for (i = 0; i < NR_LRU_CACHE; i++) + if (__shrink_mmap(priority, zone, &count, + this_lru, i)) + return 1; + return 0; +} + static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page) { goto inside; @@ -498,9 +510,9 @@ * This adds a page to the page cache, starting out as locked, * owned by us, referenced, but not uptodate and with no errors. 
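[editor's note] The new shrink_mmap() above no longer takes a gfp_mask: it derives its scan budget from the per-node LRU size and the priority, refuses to shrink at all once the LRU is below the lru_cache_mem.min_percent floor, and then sweeps the swap-cache LRU before the normal one. A small sketch of just the budget arithmetic, with an invented machine size:

#include <stdio.h>

static unsigned long num_physpages = 32768;     /* 128MB of 4k pages, invented */
static unsigned int min_percent = 2;            /* lru_cache_mem.min_percent   */

static int lru_cache_under_min(unsigned long lru_pages)
{
        return lru_pages * 100 < min_percent * num_physpages;
}

int main(void)
{
        unsigned long nr_cache_pages = 8192;    /* per-node LRU size, invented */
        int priority;

        if (lru_cache_under_min(nr_cache_pages)) {
                printf("LRU below %u%% of RAM, nothing to shrink\n", min_percent);
                return 0;
        }
        /* The budget shrink_mmap() hands to __shrink_mmap(): lower priority
         * values scan a larger share of the LRU. */
        for (priority = 6; priority >= 0; priority--)
                printf("priority %d -> scan up to %lu pages\n",
                       priority, nr_cache_pages / (priority + 1));
        return 0;
}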
*/ -static inline void __add_to_page_cache(struct page * page, +static inline void ____add_to_page_cache(struct page * page, struct address_space *mapping, unsigned long offset, - struct page **hash) + struct page **hash, int lru_type) { struct page *alias; unsigned long flags; @@ -508,18 +520,25 @@ if (PageLocked(page)) BUG(); - flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty)); - page->flags = flags | (1 << PG_locked) | (1 << PG_referenced); + flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced)); + page->flags = flags | (1 << PG_locked); get_page(page); page->index = offset; add_page_to_inode_queue(mapping, page); __add_page_to_hash_queue(page, hash); - lru_cache_add(page); + lru_cache_add(page, lru_type); alias = __find_page_nolock(mapping, offset, *hash); if (alias != page) BUG(); } +static inline void __add_to_page_cache(struct page * page, + struct address_space *mapping, unsigned long offset, + struct page **hash) +{ + ____add_to_page_cache(page, mapping, offset, hash, LRU_NORMAL_CACHE); +} + void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset) { spin_lock(&pagecache_lock); @@ -527,6 +546,14 @@ spin_unlock(&pagecache_lock); } +void __add_to_swap_cache(struct page * page, struct address_space * mapping, unsigned long offset) +{ + spin_lock(&pagecache_lock); + ____add_to_page_cache(page, mapping, offset, + page_hash(mapping, offset), LRU_SWAP_CACHE); + spin_unlock(&pagecache_lock); +} + static int add_to_page_cache_unique(struct page * page, struct address_space *mapping, unsigned long offset, struct page **hash) @@ -1447,6 +1474,14 @@ } flush_page_to_ram(old_page); + /* + * lock_page() is necessary to synchronize with shrink_mmap. + * We must make sure the page is in the lru list while + * we list_del(&(page)->lru). + */ + lock_page(old_page); + lru_cache_map(old_page); + UnlockPage(old_page); return old_page; no_cached_page: diff -urN 2.3.99-pre6/mm/highmem.c 2.3.99-pre6-VM-2/mm/highmem.c --- 2.3.99-pre6/mm/highmem.c Mon Apr 3 03:21:59 2000 +++ 2.3.99-pre6-VM-2/mm/highmem.c Fri Apr 28 16:44:32 2000 @@ -75,7 +75,7 @@ /* Preserve the caching of the swap_entry. */ highpage->index = page->index; - highpage->mapping = page->mapping; + highpage->flags = page->flags; /* * We can just forget the old page since diff -urN 2.3.99-pre6/mm/memory.c 2.3.99-pre6-VM-2/mm/memory.c --- 2.3.99-pre6/mm/memory.c Thu Apr 27 17:56:45 2000 +++ 2.3.99-pre6-VM-2/mm/memory.c Fri Apr 28 16:46:27 2000 @@ -211,6 +211,7 @@ do { pte_t pte = *src_pte; unsigned long page_nr; + struct page * page; /* copy_one_pte */ @@ -235,8 +236,11 @@ /* If it's a shared mapping, mark it clean in the child */ if (vma->vm_flags & VM_SHARED) pte = pte_mkclean(pte); + page = &mem_map[page_nr]; + if (page->mapping) + lru_cache_dup_map(page); + get_page(page); set_pte(dst_pte, pte_mkold(pte)); - get_page(mem_map + page_nr); cont_copy_pte_range: address += PAGE_SIZE; if (address >= end) @@ -837,6 +841,7 @@ */ switch (page_count(old_page)) { case 2: + case 3: /* * Lock the page so that no one can look it up from * the swap cache, grab a reference and start using it. @@ -848,6 +853,7 @@ UnlockPage(old_page); break; } + lru_cache_unmap(old_page); delete_from_swap_cache_nolock(old_page); UnlockPage(old_page); /* FallThrough */ @@ -874,12 +880,26 @@ if (PageReserved(old_page)) ++mm->rss; break_cow(vma, old_page, new_page, address, page_table); + if (old_page->mapping) + lru_cache_unmap(old_page); /* Free the old page.. 
*/ new_page = old_page; } spin_unlock(&mm->page_table_lock); - __free_page(new_page); + /* + * We're releasing a page, it can be an anonymous + * page as well. Since we don't hold any lock (except + * the mmap_sem semaphore) the other user of the anonymous + * page may have released it from under us and now we + * could be the only owner of the page, thus put_page_testzero() can + * return 1, and we have to clear the swap-entry + * bitflag in such case. + */ + if (put_page_testzero(new_page)) { + new_page->flags &= ~(1UL << PG_swap_entry); + __free_pages_ok(new_page, 0); + } return 1; bad_wp_page: @@ -1068,8 +1088,10 @@ page = replace_with_highmem(page); pte = mk_pte(page, vma->vm_page_prot); pte = pte_mkwrite(pte_mkdirty(pte)); - } else + } else { + lru_cache_map(page); UnlockPage(page); + } set_pte(page_table, pte); /* No need to invalidate - it was non-present before */ diff -urN 2.3.99-pre6/mm/numa.c 2.3.99-pre6-VM-2/mm/numa.c --- 2.3.99-pre6/mm/numa.c Tue Apr 18 16:11:42 2000 +++ 2.3.99-pre6-VM-2/mm/numa.c Fri Apr 28 16:44:32 2000 @@ -33,7 +33,7 @@ struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order) { - return __alloc_pages(NODE_DATA(nid)->node_zonelists + gfp_mask, order); + return __alloc_pages(NODE_DATA(nid)->node_gfpmask_zone + gfp_mask, order); } #ifdef CONFIG_DISCONTIGMEM diff -urN 2.3.99-pre6/mm/page_alloc.c 2.3.99-pre6-VM-2/mm/page_alloc.c --- 2.3.99-pre6/mm/page_alloc.c Thu Apr 27 17:56:45 2000 +++ 2.3.99-pre6-VM-2/mm/page_alloc.c Fri Apr 28 18:46:47 2000 @@ -25,7 +25,6 @@ #endif int nr_swap_pages = 0; -int nr_lru_pages = 0; pg_data_t *pgdat_list = (pg_data_t *)0; static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; @@ -87,6 +86,8 @@ free_area_t *area; struct page *base; zone_t *zone; + spinlock_t * freelist_lock; + pg_data_t * pgdat; /* * Subtle. 
We do not want to test this in the inlined part of @@ -110,6 +111,8 @@ BUG(); if (PageDecrAfter(page)) BUG(); + if (PageSwapEntry(page)) + BUG(); zone = page->zone; @@ -122,10 +125,25 @@ area = zone->free_area + order; - spin_lock_irqsave(&zone->lock, flags); + pgdat = zone->zone_pgdat; + freelist_lock = &pgdat->freelist_lock; + spin_lock_irqsave(freelist_lock, flags); zone->free_pages -= mask; + /* update the classzone */ + { + int nr_zone = zone->nr_zone; + register zone_t * z = zone; + do { + z->classzone_free_pages -= mask; + if (z->zone_wake_kswapd && + z->classzone_free_pages > z->pages_high) + z->zone_wake_kswapd = 0; + z++; + } while (++nr_zone < pgdat->nr_zones); + } + while (mask + (1 << (MAX_ORDER-1))) { struct page *buddy1, *buddy2; @@ -153,13 +171,7 @@ page_idx &= mask; } memlist_add_head(&(base + page_idx)->list, &area->free_list); - - spin_unlock_irqrestore(&zone->lock, flags); - - if (zone->free_pages > zone->pages_high) { - zone->zone_wake_kswapd = 0; - zone->low_on_memory = 0; - } + spin_unlock_irqrestore(freelist_lock, flags); } #define MARK_USED(index, order, area) \ @@ -186,16 +198,15 @@ return page; } -static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order)); -static struct page * rmqueue(zone_t *zone, unsigned long order) +static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order, unsigned long flags)); +static struct page * rmqueue(zone_t *zone, unsigned long order, unsigned long flags) { free_area_t * area = zone->free_area + order; unsigned long curr_order = order; struct list_head *head, *curr; - unsigned long flags; struct page *page; + pg_data_t * pgdat; - spin_lock_irqsave(&zone->lock, flags); do { head = &area->free_list; curr = memlist_next(head); @@ -209,10 +220,21 @@ memlist_del(curr); index = (page - mem_map) - zone->offset; MARK_USED(index, curr_order, area); - zone->free_pages -= 1 << order; + + zone->free_pages -= 1UL << order; + pgdat = zone->zone_pgdat; + /* update the classzone */ + { + int nr_zone = zone->nr_zone; + register zone_t * z = zone; + do { + z->classzone_free_pages -= 1UL<nr_zones); + } page = expand(zone, page, index, order, curr_order, area); - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock_irqrestore(&pgdat->freelist_lock, flags); set_page_count(page, 1); if (BAD_RANGE(zone,page)) @@ -222,59 +244,23 @@ curr_order++; area++; } while (curr_order < MAX_ORDER); - spin_unlock_irqrestore(&zone->lock, flags); return NULL; } -static int zone_balance_memory(zonelist_t *zonelist) -{ - int tried = 0, freed = 0; - zone_t **zone; - int gfp_mask = zonelist->gfp_mask; - extern wait_queue_head_t kswapd_wait; - - zone = zonelist->zones; - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; - if (z->free_pages > z->pages_low) - continue; - - z->zone_wake_kswapd = 1; - wake_up_interruptible(&kswapd_wait); - - /* Are we reaching the critical stage? */ - if (!z->low_on_memory) { - /* Not yet critical, so let kswapd handle it.. */ - if (z->free_pages > z->pages_min) - continue; - z->low_on_memory = 1; - } - /* - * In the atomic allocation case we only 'kick' the - * state machine, but do not try to free pages - * ourselves. 
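[editor's note] Both __free_pages_ok() and rmqueue() now update classzone_free_pages for the zone they touched and for every higher zone, since a higher zone's classzone includes all the zones below it; that is what lets __alloc_pages() balance on a single per-classzone counter. A userspace model of just that bookkeeping, with the freelist_lock and the buddy handling omitted:

#include <assert.h>
#include <stdio.h>

#define NR_ZONES 3
static const char *zone_names[NR_ZONES] = { "DMA", "Normal", "HighMem" };

static unsigned long zone_free[NR_ZONES];
static unsigned long classzone_free[NR_ZONES];

/* The walk both __free_pages_ok() and rmqueue() do under freelist_lock:
 * pages freed into (or taken from) zone z change the classzone counter of
 * z and of every higher zone, because those classzones contain zone z. */
static void account(int z, long delta)
{
        int i;

        zone_free[z] += delta;
        for (i = z; i < NR_ZONES; i++)
                classzone_free[i] += delta;
}

/* classzone_free[i] must always equal the sum of zone_free[0..i] */
static void check(void)
{
        unsigned long sum = 0;
        int i;

        for (i = 0; i < NR_ZONES; i++) {
                sum += zone_free[i];
                assert(classzone_free[i] == sum);
        }
}

int main(void)
{
        int i;

        account(0, 100);        /* free 100 pages into DMA         */
        account(1, 500);        /* free 500 pages into Normal      */
        account(2, 900);        /* free 900 pages into HighMem     */
        account(1, -8);         /* an order-3 GFP_KERNEL rmqueue() */
        check();
        for (i = 0; i < NR_ZONES; i++)
                printf("%-8s free=%4lu classzone_free=%4lu\n",
                       zone_names[i], zone_free[i], classzone_free[i]);
        return 0;
}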
- */ - tried = 1; - freed |= try_to_free_pages(gfp_mask, z); - } - if (tried && !freed) { - if (!(gfp_mask & __GFP_HIGH)) - return 0; - } - return 1; -} - /* * This is the 'heart' of the zoned buddy allocator: */ -struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order) +struct page * __alloc_pages(gfpmask_zone_t * gfpmask_zone, unsigned long order) { - zone_t **zone = zonelist->zones; - int gfp_mask = zonelist->gfp_mask; - static int low_on_memory; + zone_t * classzone = gfpmask_zone->classzone; + pg_data_t * pgdat = classzone->zone_pgdat; + int freed; + spinlock_t * freelist_lock = &pgdat->freelist_lock; + long flags; + unsigned long classzone_free_pages; + + spin_lock_irqsave(freelist_lock, flags); /* * If this is a recursive call, we'd better @@ -284,58 +270,70 @@ if (current->flags & PF_MEMALLOC) goto allocate_ok; - /* If we're a memory hog, unmap some pages */ - if (current->hog && low_on_memory && - (gfp_mask & __GFP_WAIT)) - swap_out(4, gfp_mask); + /* classzone based memory balancing */ + classzone_free_pages = classzone->classzone_free_pages; + if (!current->low_on_memory && + classzone_free_pages > classzone->pages_low) { + int nr_zone; + zone_t * z; + + allocate_ok: + z = classzone; + for (nr_zone = classzone->nr_zone; + nr_zone >= 0; + nr_zone--, z--) { + if (z->free_pages >= (1UL << order)) { + struct page *page = rmqueue(z, order, flags); + if (page) + return page; + } + } + } else { + extern wait_queue_head_t kswapd_wait; - /* - * (If anyone calls gfp from interrupts nonatomically then it - * will sooner or later tripped up by a schedule().) - * - * We are falling back to lower-level zones if allocation - * in a higher zone fails. - */ - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; - if (!z->size) - BUG(); + if (classzone_free_pages > classzone->pages_low) { + if (current->low_on_memory) + current->low_on_memory = 0; + goto allocate_ok; + } - /* Are we supposed to free memory? Don't make it worse.. */ - if (!z->zone_wake_kswapd && z->free_pages > z->pages_low) { - struct page *page = rmqueue(z, order); - low_on_memory = 0; - if (page) - return page; + if (!classzone->zone_wake_kswapd) { + classzone->zone_wake_kswapd = 1; + wake_up_interruptible(&kswapd_wait); } - } - low_on_memory = 1; - /* - * Ok, no obvious zones were available, start - * balancing things a bit.. - */ - if (zone_balance_memory(zonelist)) { - zone = zonelist->zones; -allocate_ok: - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; - if (z->free_pages) { - struct page *page = rmqueue(z, order); - if (page) - return page; - } + /* Are we reaching the critical stage? */ + if (!current->low_on_memory) { + /* Not yet critical, so let kswapd handle it.. */ + if (classzone_free_pages > classzone->pages_min) + goto allocate_ok; + current->low_on_memory = 1; + } + + spin_unlock_irqrestore(freelist_lock, flags); + freed = try_to_free_pages(gfpmask_zone->gfp_mask, classzone); + spin_lock_irq(freelist_lock); + + if (freed || gfpmask_zone->gfp_mask & __GFP_HIGH) + goto allocate_ok; + + /* + * Re-check we're low on memory keeping the spinlock held + * before failing. Somebody may have released + * lots of memory from under us while we was trying + * to free the pages. We check against pages_high + * to be sure to succeed only if lots of memory is been + * released. 
+ */ + classzone_free_pages = classzone->classzone_free_pages; + if (classzone_free_pages > classzone->pages_high) { + if (current->low_on_memory) + current->low_on_memory = 0; + goto allocate_ok; } } + spin_unlock_irqrestore(freelist_lock, flags); return NULL; - -/* - * The main chunk of the balancing code is in this offline branch: - */ } /* @@ -344,13 +342,14 @@ unsigned int nr_free_pages (void) { unsigned int sum; - zone_t *zone; int i; sum = 0; - for (i = 0; i < NUMNODES; i++) - for (zone = NODE_DATA(i)->node_zones; zone < NODE_DATA(i)->node_zones + MAX_NR_ZONES; zone++) - sum += zone->free_pages; + for (i = 0; i < NUMNODES; i++) { + pg_data_t * pgdat = NODE_DATA(i); + zone_t * node_zones = pgdat->node_zones; + sum += node_zones[pgdat->nr_zones-1].classzone_free_pages; + } return sum; } @@ -359,14 +358,16 @@ */ unsigned int nr_free_buffer_pages (void) { - unsigned int sum; - zone_t *zone; + unsigned int sum = 0; int i; - sum = nr_lru_pages; - for (i = 0; i < NUMNODES; i++) - for (zone = NODE_DATA(i)->node_zones; zone <= NODE_DATA(i)->node_zones+ZONE_NORMAL; zone++) - sum += zone->free_pages; + for (i = 0; i < NUMNODES; i++) { + pg_data_t * pgdat = NODE_DATA(i); + zone_t * node_zones = pgdat->node_zones; + int higher_zone = pgdat->nr_zones-1; + sum += pgdat->lru_cache.nr_cache_pages; + sum += node_zones[higher_zone <= ZONE_NORMAL ? higher_zone : ZONE_NORMAL].classzone_free_pages; + } return sum; } @@ -389,30 +390,33 @@ */ void show_free_areas_core(int nid) { - unsigned long order; + unsigned long order, flags; unsigned type; + pg_data_t * pgdat = NODE_DATA(nid); + spinlock_t * freelist_lock = &pgdat->freelist_lock; printk("Free pages: %6dkB (%6dkB HighMem)\n", nr_free_pages() << (PAGE_SHIFT-10), nr_free_highpages() << (PAGE_SHIFT-10)); - printk("( Free: %d, lru_cache: %d (%d %d %d) )\n", + printk("( Free: %d, cache: %lu map: %lu (%d %d %d) )\n", nr_free_pages(), - nr_lru_pages, + NODE_DATA(nid)->lru_cache.nr_cache_pages, + NODE_DATA(nid)->lru_cache.nr_map_pages, freepages.min, freepages.low, freepages.high); + spin_lock_irqsave(freelist_lock, flags); for (type = 0; type < MAX_NR_ZONES; type++) { struct list_head *head, *curr; - zone_t *zone = NODE_DATA(nid)->node_zones + type; - unsigned long nr, total, flags; + zone_t *zone = pgdat->node_zones + type; + unsigned long nr, total; printk(" %s: ", zone->name); total = 0; if (zone->size) { - spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { head = &(zone->free_area + order)->free_list; curr = head; @@ -427,10 +431,15 @@ printk("%lu*%lukB ", nr, (PAGE_SIZE>>10) << order); } - spin_unlock_irqrestore(&zone->lock, flags); + if (total != zone->free_pages) + printk("error %lu ", + zone->free_pages * (PAGE_SIZE>>10)); } - printk("= %lukB)\n", total * (PAGE_SIZE>>10)); + printk("= %lukB, ", total * (PAGE_SIZE>>10)); + printk("classzone = %lukB\n", + zone->classzone_free_pages * (PAGE_SIZE>>10)); } + spin_unlock_irqrestore(freelist_lock, flags); #ifdef SWAP_CACHE_INFO show_swap_cache_info(); @@ -445,18 +454,17 @@ /* * Builds allocation fallback zone lists. 
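[editor's note] Stripped of the actual rmqueue() calls, the freelist_lock and the PF_MEMALLOC shortcut, the rewritten __alloc_pages() is a small ladder over the classzone watermarks plus the new per-task low_on_memory bit. A hedged userspace rendering of that ladder; the watermarks follow the usual mask/2*mask/3*mask pattern, try_ok stands in for try_to_free_pages() making progress and gfp_high for __GFP_HIGH:

#include <stdio.h>

struct classzone {
        unsigned long free, min, low, high;  /* classzone_free_pages, watermarks */
        int wake_kswapd;
};

struct task { int low_on_memory; };

enum outcome { ALLOC_FAST, ALLOC_KSWAPD_KICKED, ALLOC_AFTER_RECLAIM, ALLOC_FAILED };

static enum outcome alloc_decision(struct classzone *cz, struct task *p,
                                   int try_ok, int gfp_high)
{
        if (cz->free > cz->low) {
                p->low_on_memory = 0;           /* plenty free: fast path */
                return ALLOC_FAST;
        }
        if (!cz->wake_kswapd)
                cz->wake_kswapd = 1;            /* ask kswapd for help */
        if (!p->low_on_memory) {
                if (cz->free > cz->min)
                        return ALLOC_KSWAPD_KICKED;
                p->low_on_memory = 1;           /* below pages_min: help out */
        }
        /* the kernel drops the lock, runs try_to_free_pages(), relocks and
         * rereads classzone_free_pages; the model just reuses cz->free */
        if (try_ok || gfp_high)
                return ALLOC_AFTER_RECLAIM;
        if (cz->free > cz->high) {      /* somebody freed a lot meanwhile */
                p->low_on_memory = 0;
                return ALLOC_AFTER_RECLAIM;
        }
        return ALLOC_FAILED;
}

int main(void)
{
        struct classzone cz = { .free = 900, .min = 128, .low = 256, .high = 384 };
        struct task p = { 0 };

        printf("free=900 -> %d\n", alloc_decision(&cz, &p, 1, 0));
        cz.free = 200;          /* between pages_min and pages_low */
        printf("free=200 -> %d, kswapd woken=%d\n",
               alloc_decision(&cz, &p, 1, 0), cz.wake_kswapd);
        cz.free = 100;          /* below pages_min */
        printf("free=100 -> %d, low_on_memory=%d\n",
               alloc_decision(&cz, &p, 1, 0), p.low_on_memory);
        return 0;
}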
*/ -static inline void build_zonelists(pg_data_t *pgdat) +static inline void build_gfpmask_zone(pg_data_t *pgdat) { int i, j, k; for (i = 0; i < NR_GFPINDEX; i++) { - zonelist_t *zonelist; + gfpmask_zone_t * gfpmask_zone; zone_t *zone; - zonelist = pgdat->node_zonelists + i; - memset(zonelist, 0, sizeof(*zonelist)); + gfpmask_zone = pgdat->node_gfpmask_zone + i; - zonelist->gfp_mask = i; + gfpmask_zone->gfp_mask = i; j = 0; k = ZONE_NORMAL; if (i & __GFP_HIGHMEM) @@ -476,21 +484,35 @@ #ifndef CONFIG_HIGHMEM BUG(); #endif - zonelist->zones[j++] = zone; + gfpmask_zone->classzone = zone; } + break; case ZONE_NORMAL: zone = pgdat->node_zones + ZONE_NORMAL; if (zone->size) - zonelist->zones[j++] = zone; + gfpmask_zone->classzone = zone; + break; case ZONE_DMA: zone = pgdat->node_zones + ZONE_DMA; if (zone->size) - zonelist->zones[j++] = zone; + gfpmask_zone->classzone = zone; + break; } - zonelist->zones[j++] = NULL; } } +static void __init lru_cache_init(pg_data_t * pgdat) +{ + int i; + lru_cache_t * this_lru = &pgdat->lru_cache; + + for (i = 0; i < NR_LRU_CACHE; i++) + INIT_LIST_HEAD(&this_lru->heads[i]); + this_lru->nr_cache_pages = 0; + this_lru->nr_map_pages = 0; + spin_lock_init(&this_lru->lock); +} + #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) /* @@ -507,9 +529,8 @@ unsigned long i, j; unsigned long map_size; unsigned long totalpages, offset, realtotalpages; - unsigned int cumulative = 0; - pgdat->node_next = pgdat_list; + pgdat->node_next = NULL; pgdat_list = pgdat; totalpages = 0; @@ -539,7 +560,6 @@ freepages.min += i; freepages.low += i * 2; freepages.high += i * 3; - memlist_init(&lru_cache); /* * Some architectures (with lots of mem and discontinous memory @@ -556,6 +576,8 @@ pgdat->node_size = totalpages; pgdat->node_start_paddr = zone_start_paddr; pgdat->node_start_mapnr = (lmem_map - mem_map); + pgdat->nr_zones = 0; + spin_lock_init(&pgdat->freelist_lock); /* * Initially all pages are reserved - free ones are freed @@ -582,14 +604,15 @@ printk("zone(%lu): %lu pages.\n", j, size); zone->size = size; zone->name = zone_names[j]; - zone->lock = SPIN_LOCK_UNLOCKED; zone->zone_pgdat = pgdat; + zone->nr_zone = j; zone->free_pages = 0; + zone->zone_wake_kswapd = 0; if (!size) continue; + pgdat->nr_zones = j+1; zone->offset = offset; - cumulative += size; mask = (realsize / zone_balance_ratio[j]); if (mask < zone_balance_min[j]) mask = zone_balance_min[j]; @@ -598,8 +621,6 @@ zone->pages_min = mask; zone->pages_low = mask*2; zone->pages_high = mask*3; - zone->low_on_memory = 0; - zone->zone_wake_kswapd = 0; zone->zone_mem_map = mem_map + offset; zone->zone_start_mapnr = offset; zone->zone_start_paddr = zone_start_paddr; @@ -628,7 +649,8 @@ (unsigned int *) alloc_bootmem_node(nid, bitmap_size); } } - build_zonelists(pgdat); + build_gfpmask_zone(pgdat); + lru_cache_init(pgdat); } void __init free_area_init(unsigned long *zones_size) diff -urN 2.3.99-pre6/mm/swap.c 2.3.99-pre6-VM-2/mm/swap.c --- 2.3.99-pre6/mm/swap.c Wed Dec 8 00:05:28 1999 +++ 2.3.99-pre6-VM-2/mm/swap.c Fri Apr 28 16:44:32 2000 @@ -46,13 +46,7 @@ out, so that we don't try to swap TOO many pages out at once */ atomic_t nr_async_pages = ATOMIC_INIT(0); -buffer_mem_t buffer_mem = { - 2, /* minimum percent buffer */ - 10, /* borrow percent buffer */ - 60 /* maximum percent buffer */ -}; - -buffer_mem_t page_cache = { +buffer_mem_t lru_cache_mem = { 2, /* minimum percent page cache */ 15, /* borrow percent page cache */ 75 /* maximum */ diff -urN 2.3.99-pre6/mm/swap_state.c 
2.3.99-pre6-VM-2/mm/swap_state.c --- 2.3.99-pre6/mm/swap_state.c Thu Apr 27 17:56:45 2000 +++ 2.3.99-pre6-VM-2/mm/swap_state.c Fri Apr 28 16:44:32 2000 @@ -45,6 +45,9 @@ } #endif +extern void __add_to_swap_cache(struct page *, struct address_space *, + unsigned long); + void add_to_swap_cache(struct page *page, swp_entry_t entry) { #ifdef SWAP_CACHE_INFO @@ -54,7 +57,7 @@ BUG(); if (page->mapping) BUG(); - add_to_page_cache(page, &swapper_space, entry.val); + __add_to_swap_cache(page, &swapper_space, entry.val); } static inline void remove_from_swap_cache(struct page *page) @@ -121,6 +124,8 @@ */ void free_page_and_swap_cache(struct page *page) { + if (page->mapping) + lru_cache_unmap(page); /* * If we are the only user, then try to free up the swap cache. */ @@ -131,9 +136,14 @@ UnlockPage(page); } - ClearPageSwapEntry(page); - - __free_page(page); + /* + * Only the last unmap have to lose the swap entry + * information that we have cached into page->index. + */ + if (put_page_testzero(page)) { + page->flags &= ~(1UL << PG_swap_entry); + __free_pages_ok(page, 0); + } } diff -urN 2.3.99-pre6/mm/swapfile.c 2.3.99-pre6-VM-2/mm/swapfile.c --- 2.3.99-pre6/mm/swapfile.c Thu Apr 27 17:56:45 2000 +++ 2.3.99-pre6-VM-2/mm/swapfile.c Fri Apr 28 16:44:32 2000 @@ -212,22 +212,22 @@ /* We have the old entry in the page offset still */ if (!page->index) - goto new_swap_entry; + goto null_swap_entry; entry.val = page->index; type = SWP_TYPE(entry); if (type >= nr_swapfiles) - goto new_swap_entry; + goto bad_nofile; + swap_list_lock(); p = type + swap_info; if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK) - goto new_swap_entry; + goto unlock_list; offset = SWP_OFFSET(entry); if (offset >= p->max) - goto new_swap_entry; + goto bad_offset; /* Has it been re-used for something else? */ - swap_list_lock(); swap_device_lock(p); if (p->swap_map[offset]) - goto unlock_new_swap_entry; + goto unlock; /* We're cool, we can just use the old one */ p->swap_map[offset] = 1; @@ -236,11 +236,24 @@ swap_list_unlock(); return entry; -unlock_new_swap_entry: +unlock: swap_device_unlock(p); +unlock_list: swap_list_unlock(); +clear_swap_entry: + ClearPageSwapEntry(page); new_swap_entry: return get_swap_page(); + +null_swap_entry: + printk(KERN_WARNING __FUNCTION__ " null swap entry\n"); + goto clear_swap_entry; +bad_nofile: + printk(KERN_WARNING __FUNCTION__ " nonexistent swap file\n"); + goto clear_swap_entry; +bad_offset: + printk(KERN_WARNING __FUNCTION__ " bad offset\n"); + goto unlock_list; } /* @@ -263,9 +276,13 @@ /* If this entry is swap-cached, then page must already hold the right address for any copies in physical memory */ - if (pte_page(pte) != page) + if (pte_page(pte) != page) { + if (page->index == entry.val) + ClearPageSwapEntry(page); return; + } /* We will be removing the swap cache in a moment, so... */ + lru_cache_unmap(page); set_pte(dir, pte_mkdirty(pte)); return; } @@ -358,10 +375,20 @@ */ if (!mm) return; + /* + * Avoid the vmas to go away from under us + * and also avoids the task to play with + * pagetables while we're running. If the + * vmlist_modify_lock wouldn't acquire the + * mm->page_table_lock spinlock we should + * acquire it by hand. + */ + vmlist_access_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) { pgd_t * pgd = pgd_offset(mm, vma->vm_start); unuse_vma(vma, pgd, entry, page); } + vmlist_access_unlock(mm); return; } @@ -418,8 +445,10 @@ shm_unuse(entry, page); /* Now get rid of the extra reference to the temporary page we've been using. 
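[editor's note] The reworked acquire_swap_entry() above tries to give a page back the swap slot it used last time, cached in page->index while PG_swap_entry is set, and only falls back to get_swap_page() after checking that the old device, offset and map slot are still usable. A toy userspace model of that preference, with a flat eight-slot swap map, no locking, and invented names:

#include <stdio.h>

#define SWAP_MAP_SIZE 8

static unsigned char swap_map[SWAP_MAP_SIZE];   /* 0 == slot free */

struct page { unsigned long index; int swap_entry_cached; };  /* PG_swap_entry */

/* Stand-in for get_swap_page(): hand out any free slot. */
static unsigned long new_swap_entry(void)
{
        unsigned long off;

        for (off = 1; off < SWAP_MAP_SIZE; off++)
                if (!swap_map[off]) {
                        swap_map[off] = 1;
                        return off;
                }
        return 0;       /* swap full */
}

/* Prefer the slot the page used last time so the copy already on disk can
 * be reused; drop the stale hint and fall back if the slot is gone or busy. */
static unsigned long acquire_swap_entry(struct page *page)
{
        unsigned long off = page->index;

        if (page->swap_entry_cached && off && off < SWAP_MAP_SIZE &&
            !swap_map[off]) {
                swap_map[off] = 1;      /* re-grab the old slot */
                return off;
        }
        page->swap_entry_cached = 0;
        return new_swap_entry();
}

int main(void)
{
        struct page p = { .index = 3, .swap_entry_cached = 1 };

        /* old slot still free: the page gets entry 3 back */
        printf("first swapout : slot %lu\n", acquire_swap_entry(&p));
        /* slot 3 is busy now (pretend another page owns it), so the next
         * attempt has to allocate a fresh slot instead */
        printf("second swapout: slot %lu\n", acquire_swap_entry(&p));
        return 0;
}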
*/ - if (PageSwapCache(page)) + if (PageSwapCache(page)) { delete_from_swap_cache(page); + ClearPageSwapEntry(page); + } __free_page(page); /* * Check for and clear any overflowed swap map counts. diff -urN 2.3.99-pre6/mm/vmscan.c 2.3.99-pre6-VM-2/mm/vmscan.c --- 2.3.99-pre6/mm/vmscan.c Thu Apr 27 17:56:45 2000 +++ 2.3.99-pre6-VM-2/mm/vmscan.c Fri Apr 28 18:18:44 2000 @@ -34,7 +34,7 @@ * using a process that no longer actually exists (it might * have died while we slept). */ -static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask) +static int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask) { pte_t pte; swp_entry_t entry; @@ -48,7 +48,6 @@ if ((page-mem_map >= max_mapnr) || PageReserved(page)) goto out_failed; - mm->swap_cnt--; /* Don't look at this pte if it's been accessed recently. */ if (pte_young(pte)) { /* @@ -78,6 +77,8 @@ drop_pte: vma->vm_mm->rss--; flush_tlb_page(vma, address); + if (page->mapping) + lru_cache_unmap(page); __free_page(page); goto out_failed; } @@ -141,6 +142,8 @@ if (file) fput(file); if (!error) goto out_free_success; + if (page->mapping) + lru_cache_unmap(page); __free_page(page); return error; } @@ -195,7 +198,7 @@ * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de */ -static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask) +static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask) { pte_t * pte; unsigned long pmd_end; @@ -217,18 +220,16 @@ do { int result; vma->vm_mm->swap_address = address + PAGE_SIZE; - result = try_to_swap_out(mm, vma, address, pte, gfp_mask); + result = try_to_swap_out(vma, address, pte, gfp_mask); if (result) return result; - if (!mm->swap_cnt) - return 0; address += PAGE_SIZE; pte++; } while (address && (address < end)); return 0; } -static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask) +static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask) { pmd_t * pmd; unsigned long pgd_end; @@ -248,18 +249,16 @@ end = pgd_end; do { - int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask); + int result = swap_out_pmd(vma, pmd, address, end, gfp_mask); if (result) return result; - if (!mm->swap_cnt) - return 0; address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); return 0; } -static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask) +static int swap_out_vma(struct vm_area_struct * vma, unsigned long address, int gfp_mask) { pgd_t *pgdir; unsigned long end; @@ -274,11 +273,9 @@ if (address >= end) BUG(); do { - int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask); + int result = swap_out_pgd(vma, pgdir, address, end, gfp_mask); if (result) return result; - if (!mm->swap_cnt) - return 0; address = (address + PGDIR_SIZE) & PGDIR_MASK; pgdir++; } while (address && (address < end)); @@ -306,7 +303,7 @@ address = vma->vm_start; for (;;) { - int result = swap_out_vma(mm, vma, address, gfp_mask); + int result = swap_out_vma(vma, address, gfp_mask); if (result) return result; vma = vma->vm_next; @@ -328,7 +325,7 @@ * N.B. This function returns only 0 or 1. 
Return values != 1 from * the lower level routines result in continued processing. */ -int swap_out(unsigned int priority, int gfp_mask) +static int swap_out(unsigned int priority, int gfp_mask) { struct task_struct * p; int counter; @@ -363,7 +360,6 @@ p = init_task.next_task; for (; p != &init_task; p = p->next_task) { struct mm_struct *mm = p->mm; - p->hog = 0; if (!p->swappable || !mm) continue; if (mm->rss <= 0) @@ -377,26 +373,9 @@ pid = p->pid; } } - if (assign == 1) { - /* we just assigned swap_cnt, normalise values */ - assign = 2; - p = init_task.next_task; - for (; p != &init_task; p = p->next_task) { - int i = 0; - struct mm_struct *mm = p->mm; - if (!p->swappable || !mm || mm->rss <= 0) - continue; - /* small processes are swapped out less */ - while ((mm->swap_cnt << 2 * (i + 1) < max_cnt)) - i++; - mm->swap_cnt >>= i; - mm->swap_cnt += i; /* if swap_cnt reaches 0 */ - /* we're big -> hog treatment */ - if (!i) - p->hog = 1; - } - } read_unlock(&tasklist_lock); + if (assign == 1) + assign = 2; if (!best) { if (!assign) { assign = 1; @@ -437,39 +416,35 @@ { int priority; int count = SWAP_CLUSTER_MAX; - int ret; /* Always trim SLAB caches when memory gets low. */ kmem_cache_reap(gfp_mask); priority = 6; do { - while ((ret = shrink_mmap(priority, gfp_mask, zone))) { + while (shrink_mmap(priority, zone)) { if (!--count) goto done; } + /* + * don't be too light against the d/i cache since + * shrink_mmap() almost never fail when there's + * really plenty of memory free. + */ + count -= shrink_dcache_memory(priority, gfp_mask, zone); + count -= shrink_icache_memory(priority, gfp_mask, zone); + if (count <= 0) + goto done; + /* Try to get rid of some shared memory pages.. */ - if (gfp_mask & __GFP_IO) { - /* - * don't be too light against the d/i cache since - * shrink_mmap() almost never fail when there's - * really plenty of memory free. - */ - count -= shrink_dcache_memory(priority, gfp_mask, zone); - count -= shrink_icache_memory(priority, gfp_mask, zone); - if (count <= 0) + while (shm_swap(priority, gfp_mask, zone)) { + if (!--count) goto done; - while (shm_swap(priority, gfp_mask, zone)) { - if (!--count) - goto done; - } } - /* Then, try to page stuff out.. - * We use swapcount here because this doesn't actually - * free pages */ + /* Then, try to page stuff out.. 
*/ while (swap_out(priority, gfp_mask)) { if (!--count) goto done; @@ -482,6 +457,70 @@ DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); +static int kswapd_work_pgdat(pg_data_t * pgdat) +{ + int worked = 0, i; + zone_t * zone; + + for (i = pgdat->nr_zones-1; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (current->need_resched) + schedule(); + if (!zone->zone_wake_kswapd) + continue; + if (!do_try_to_free_pages(GFP_KSWAPD, zone)) { + zone->zone_wake_kswapd = 0; + continue; + } + worked = 1; + } + + return worked; +} + +static void kswapd_work(void) +{ + int worked; + pg_data_t * pgdat; + + do { + worked = 0; + pgdat = pgdat_list; + do + worked |= kswapd_work_pgdat(pgdat); + while ((pgdat = pgdat->node_next)); + } while (worked); +} + +static int kswapd_can_sleep_pgdat(pg_data_t * pgdat) +{ + zone_t * zone; + int i; + + for (i = pgdat->nr_zones-1; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (!zone->zone_wake_kswapd) + continue; + return 0; + } + + return 1; +} + +static int kswapd_can_sleep(void) +{ + pg_data_t * pgdat; + + pgdat = pgdat_list; + do { + if (kswapd_can_sleep_pgdat(pgdat)) + continue; + return 0; + } while ((pgdat = pgdat->node_next)); + + return 1; +} + /* * The background pageout daemon, started as a kernel thread * from the init process. @@ -497,15 +536,14 @@ */ int kswapd(void *unused) { - int i; struct task_struct *tsk = current; - pg_data_t *pgdat; - zone_t *zone; + wait_queue_t wait; tsk->session = 1; tsk->pgrp = 1; strcpy(tsk->comm, "kswapd"); sigfillset(&tsk->blocked); + init_waitqueue_entry(&wait, tsk); /* * Tell the memory management that we're a "memory allocator", @@ -527,21 +565,17 @@ * the processes needing more memory will wake us * up on a more timely basis. */ - pgdat = pgdat_list; - while (pgdat) { - for (i = 0; i < MAX_NR_ZONES; i++) { - zone = pgdat->node_zones + i; - if (tsk->need_resched) - schedule(); - if ((!zone->size) || (!zone->zone_wake_kswapd)) - continue; - do_try_to_free_pages(GFP_KSWAPD, zone); - } - pgdat = pgdat->node_next; - } + kswapd_work(); run_task_queue(&tq_disk); - tsk->state = TASK_INTERRUPTIBLE; - interruptible_sleep_on(&kswapd_wait); + + __set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&kswapd_wait, &wait); + + if (kswapd_can_sleep()) + schedule(); + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&kswapd_wait, &wait); } }
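[editor's note] The new kswapd splits its job in two: kswapd_work() keeps sweeping every node and zone until a whole pass finds no zone_wake_kswapd flag set, and the main loop only re-checks those flags after queueing itself on kswapd_wait, so a wakeup arriving in between is not lost. A userspace model of the sweep-until-quiet part only, with a single node and canned reclaim results:

#include <stdio.h>

#define NR_ZONES 3

struct node {
        int zone_wake_kswapd[NR_ZONES];
        struct node *next;
};

/* Canned reclaim: make progress twice, then report failure so the flags
 * get cleared, like do_try_to_free_pages() running out of work. */
static int try_to_free(struct node *n, int zone)
{
        static int budget = 2;

        (void)n;
        (void)zone;
        return budget-- > 0;
}

/* Same shape as kswapd_work(): sweep every node and zone, clear the wakeup
 * flag of zones where reclaim no longer makes progress, and stop only when
 * a full pass found nothing left to do. */
static void kswapd_work(struct node *nodes)
{
        struct node *n;
        int worked, i;

        do {
                worked = 0;
                for (n = nodes; n; n = n->next)
                        for (i = NR_ZONES - 1; i >= 0; i--) {
                                if (!n->zone_wake_kswapd[i])
                                        continue;
                                if (!try_to_free(n, i)) {
                                        n->zone_wake_kswapd[i] = 0;
                                        continue;
                                }
                                worked = 1;
                        }
        } while (worked);
}

int main(void)
{
        struct node node0 = { .zone_wake_kswapd = { 1, 1, 0 }, .next = NULL };

        kswapd_work(&node0);
        printf("wake flags after the sweep: %d %d %d\n",
               node0.zone_wake_kswapd[0], node0.zone_wake_kswapd[1],
               node0.zone_wake_kswapd[2]);
        return 0;
}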