diff -urN 2.3.99-pre6-pre5/fs/buffer.c 2.3.99-pre6-pre5-VM-1/fs/buffer.c
--- 2.3.99-pre6-pre5/fs/buffer.c	Sat Apr 22 18:11:25 2000
+++ 2.3.99-pre6-pre5-VM-1/fs/buffer.c	Mon Apr 24 02:26:58 2000
@@ -2106,6 +2106,7 @@
 	spin_unlock(&free_list[isize].lock);
 
 	page->buffers = bh;
+	page->flags &= ~(1 << PG_referenced);
 	lru_cache_add(page);
 	atomic_inc(&buffermem_pages);
 	return 1;
diff -urN 2.3.99-pre6-pre5/fs/dcache.c 2.3.99-pre6-pre5-VM-1/fs/dcache.c
--- 2.3.99-pre6-pre5/fs/dcache.c	Sat Apr 22 18:11:25 2000
+++ 2.3.99-pre6-pre5-VM-1/fs/dcache.c	Mon Apr 24 02:59:44 2000
@@ -511,19 +511,20 @@
  */
 int shrink_dcache_memory(int priority, unsigned int gfp_mask, zone_t * zone)
 {
-	int count = 0;
-	lock_kernel();
-	if (priority)
-		count = dentry_stat.nr_unused / priority;
-	prune_dcache(count);
-	unlock_kernel();
-	/* FIXME: kmem_cache_shrink here should tell us
-	   the number of pages freed, and it should
-	   work in a __GFP_DMA/__GFP_HIGHMEM behaviour
-	   to free only the interesting pages in
-	   function of the needs of the current allocation. */
-	kmem_cache_shrink(dentry_cache);
-
+	if (gfp_mask & __GFP_IO) {
+		int count = 0;
+		lock_kernel();
+		if (priority)
+			count = dentry_stat.nr_unused / priority;
+		prune_dcache(count);
+		unlock_kernel();
+		/* FIXME: kmem_cache_shrink here should tell us
+		   the number of pages freed, and it should
+		   work in a __GFP_DMA/__GFP_HIGHMEM behaviour
+		   to free only the interesting pages in
+		   function of the needs of the current allocation. */
+		kmem_cache_shrink(dentry_cache);
+	}
 	return 0;
 }
 
diff -urN 2.3.99-pre6-pre5/fs/inode.c 2.3.99-pre6-pre5-VM-1/fs/inode.c
--- 2.3.99-pre6-pre5/fs/inode.c	Sat Apr 22 18:11:25 2000
+++ 2.3.99-pre6-pre5-VM-1/fs/inode.c	Mon Apr 24 02:59:18 2000
@@ -449,18 +449,19 @@
 
 int shrink_icache_memory(int priority, int gfp_mask, zone_t *zone)
 {
-	int count = 0;
+	if (gfp_mask & __GFP_IO) {
+		int count = 0;
 
-	if (priority)
-		count = inodes_stat.nr_unused / priority;
-	prune_icache(count);
-	/* FIXME: kmem_cache_shrink here should tell us
-	   the number of pages freed, and it should
-	   work in a __GFP_DMA/__GFP_HIGHMEM behaviour
-	   to free only the interesting pages in
-	   function of the needs of the current allocation. */
-	kmem_cache_shrink(inode_cachep);
-
+		if (priority)
+			count = inodes_stat.nr_unused / priority;
+		prune_icache(count);
+		/* FIXME: kmem_cache_shrink here should tell us
+		   the number of pages freed, and it should
+		   work in a __GFP_DMA/__GFP_HIGHMEM behaviour
+		   to free only the interesting pages in
+		   function of the needs of the current allocation. */
+		kmem_cache_shrink(inode_cachep);
+	}
 	return 0;
 }
diff -urN 2.3.99-pre6-pre5/include/linux/cache.h 2.3.99-pre6-pre5-VM-1/include/linux/cache.h
--- 2.3.99-pre6-pre5/include/linux/cache.h	Sun Apr 23 22:42:11 2000
+++ 2.3.99-pre6-pre5-VM-1/include/linux/cache.h	Tue Apr 25 17:27:43 2000
@@ -1,6 +1,7 @@
 #ifndef __LINUX_CACHE_H
 #define __LINUX_CACHE_H
 
+#include <linux/config.h>
 #include <asm/cache.h>
 
 #ifndef L1_CACHE_ALIGN
@@ -13,6 +14,14 @@
 
 #ifndef ____cacheline_aligned
 #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES)))
+#endif
+
+#ifndef ____cacheline_aligned_in_smp
+#ifdef CONFIG_SMP
+#define ____cacheline_aligned_in_smp ____cacheline_aligned
+#else
+#define ____cacheline_aligned_in_smp
+#endif /* CONFIG_SMP */
 #endif
 
 #ifndef __cacheline_aligned
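The new ____cacheline_aligned_in_smp annotation pads a structure member to SMP_CACHE_BYTES only on SMP builds, so UP kernels don't pay the space cost. A minimal sketch of the intended use (illustration only, not part of the patch; the structure below is made up):

/*
 * Illustration only: on SMP the contended lock gets its own cache line,
 * while the read-mostly fields stay tightly packed; on UP the annotation
 * expands to nothing and the structure keeps its natural layout.
 */
struct example_counters {
	unsigned long hits;		/* read-mostly */
	unsigned long misses;		/* read-mostly */
	spinlock_t lock ____cacheline_aligned_in_smp;	/* written by all CPUs */
};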
diff -urN 2.3.99-pre6-pre5/include/linux/mm.h 2.3.99-pre6-pre5-VM-1/include/linux/mm.h
--- 2.3.99-pre6-pre5/include/linux/mm.h	Sat Apr 22 18:11:26 2000
+++ 2.3.99-pre6-pre5-VM-1/include/linux/mm.h	Tue Apr 25 17:27:43 2000
@@ -307,21 +307,21 @@
  * can allocate highmem pages, the *get*page*() variants return
  * virtual kernel addresses to the allocated page(s).
  */
-extern struct page * FASTCALL(__alloc_pages(zonelist_t *zonelist, unsigned long order));
+extern struct page * FASTCALL(__alloc_pages(gfpmask_zone_t *, unsigned long order));
 extern struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order);
 
 #ifndef CONFIG_DISCONTIGMEM
 extern inline struct page * alloc_pages(int gfp_mask, unsigned long order)
 {
 	/* temporary check. */
-	if (contig_page_data.node_zonelists[gfp_mask].gfp_mask != (gfp_mask))
+	if (contig_page_data.node_gfpmask_zone[gfp_mask].gfp_mask != (gfp_mask))
 		BUG();
 	/*
 	 * Gets optimized away by the compiler.
 	 */
 	if (order >= MAX_ORDER)
 		return NULL;
-	return __alloc_pages(contig_page_data.node_zonelists+(gfp_mask), order);
+	return __alloc_pages(contig_page_data.node_gfpmask_zone+gfp_mask, order);
 }
 #else /* !CONFIG_DISCONTIGMEM */
 extern struct page * alloc_pages(int gfp_mask, unsigned long order);
@@ -452,7 +452,7 @@
 /* filemap.c */
 extern void remove_inode_page(struct page *);
 extern unsigned long page_unuse(struct page *);
-extern int shrink_mmap(int, int, zone_t *);
+extern int shrink_mmap(int, zone_t *);
 extern void truncate_inode_pages(struct address_space *, loff_t);
 
 /* generic vm_area_ops exported for stackable file systems */
@@ -533,10 +533,8 @@
 
 extern struct vm_area_struct *find_extend_vma(struct task_struct *tsk, unsigned long addr);
 
-#define buffer_under_min()	(atomic_read(&buffermem_pages) * 100 < \
-				buffer_mem.min_percent * num_physpages)
-#define pgcache_under_min()	(atomic_read(&page_cache_size) * 100 < \
-				page_cache.min_percent * num_physpages)
+#define lru_cache_under_min(lru_pages)	((lru_pages) * 100 < \
+				lru_cache_mem.min_percent * num_physpages)
 
 #define vmlist_access_lock(mm)		spin_lock(&mm->page_table_lock)
 #define vmlist_access_unlock(mm)	spin_unlock(&mm->page_table_lock)
diff -urN 2.3.99-pre6-pre5/include/linux/mmzone.h 2.3.99-pre6-pre5-VM-1/include/linux/mmzone.h
--- 2.3.99-pre6-pre5/include/linux/mmzone.h	Sun Apr 23 22:42:11 2000
+++ 2.3.99-pre6-pre5-VM-1/include/linux/mmzone.h	Tue Apr 25 17:27:43 2000
@@ -21,17 +21,28 @@
 
 struct pglist_data;
 
+/*
+ * Memory balancing can work correctly only on a classzone basis.
+ * A strict zone based memory balancing can't work correctly since
+ * all the kernel allocations care only about classzones and not about
+ * zones.
+ */
 typedef struct zone_struct {
 	/*
 	 * Commonly accessed fields:
 	 */
-	spinlock_t		lock;
 	unsigned long		offset;
 	unsigned long		free_pages;
-	char			low_on_memory;
-	char			zone_wake_kswapd;
+
+	/*
+	 * Memory balancing is all classzone based, all the below
+	 * fields refer to the classzone. The classzone includes
+	 * the current zone plus all the lower zones in the MM.
+	 */
+	unsigned long		classzone_free_pages;
 	unsigned long		pages_min, pages_low, pages_high;
-	struct list_head	lru_cache;
+	int			nr_zone;
+	char			zone_wake_kswapd;
 
 	/*
 	 * free areas of different sizes
@@ -58,27 +69,30 @@
 
 #define MAX_NR_ZONES		3
 
 /*
- * One allocation request operates on a zonelist. A zonelist
- * is a list of zones, the first one is the 'goal' of the
- * allocation, the other zones are fallback zones, in decreasing
- * priority.
- *
- * Right now a zonelist takes up less than a cacheline. We never
- * modify it apart from boot-up, and only a few indices are used,
- * so despite the zonelist table being relatively big, the cache
- * footprint of this construct is very small.
+ * The pgdat->node_gfpmask_zone[] array tells us which classzone
+ * we should allocate from given a certain gfpmask. It translates
+ * the gfpmask to a classzone.
  */
-typedef struct zonelist_struct {
-	zone_t	* zones [MAX_NR_ZONES+1]; // NULL delimited
+typedef struct gfpmask_zone_s {
+	zone_t * classzone;
 	int	gfp_mask;
-} zonelist_t;
+} gfpmask_zone_t;
 
 #define NR_GFPINDEX		0x100
 
+/* the lru cache should be per-node */
+typedef struct lru_cache_s {
+	struct list_head head;
+	int nr_pages;
+	/* keep the lock in a separate cacheline to avoid ping pong in SMP */
+	spinlock_t lock ____cacheline_aligned_in_smp;
+} lru_cache_t;
+
 struct bootmem_data;
 typedef struct pglist_data {
+	int nr_zones;
 	zone_t node_zones[MAX_NR_ZONES];
-	zonelist_t node_zonelists[NR_GFPINDEX];
+	gfpmask_zone_t node_gfpmask_zone[NR_GFPINDEX];
 	struct page *node_mem_map;
 	unsigned long *valid_addr_bitmap;
 	struct bootmem_data *bdata;
@@ -87,14 +101,15 @@
 	unsigned long node_size;
 	int node_id;
 	struct pglist_data *node_next;
+	spinlock_t freelist_lock;
 } pg_data_t;
 
 extern int numnodes;
 extern pg_data_t *pgdat_list;
+extern lru_cache_t lru_cache;
 
 #define memclass(pgzone, tzone)	(((pgzone)->zone_pgdat == (tzone)->zone_pgdat) \
-			&& (((pgzone) - (pgzone)->zone_pgdat->node_zones) <= \
-			    ((tzone) - (pgzone)->zone_pgdat->node_zones)))
+			&& ((pgzone) <= (tzone)))
 
 /*
  * The following two are not meant for general usage. They are here as
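A classzone is a zone together with every lower zone of the same node, which is what gfp_mask-driven allocations actually care about: a page freed into ZONE_DMA is also usable by the Normal and HighMem classzones. The toy model below (illustration only, plain userspace C, not patch code) shows how classzone_free_pages stays in sync by walking from the affected zone upwards, the same way the loops added to free_pages_ok() and rmqueue() do:

/* Toy model of the classzone accounting, for illustration only. */
#include <stdio.h>

#define NR_ZONES 3
static const char *zone_name[NR_ZONES] = { "DMA", "Normal", "HighMem" };
static unsigned long classzone_free[NR_ZONES];

/* A page freed into zone 'z' is visible to classzone 'z' and to every
 * higher classzone (Normal includes DMA, HighMem includes both). */
static void model_free_pages(int z, unsigned long nr)
{
	for (int i = z; i < NR_ZONES; i++)
		classzone_free[i] += nr;
}

/* 'z' is the zone the pages are physically taken from; the counters of
 * that classzone and of every higher one shrink accordingly. */
static void model_alloc_pages(int z, unsigned long nr)
{
	for (int i = z; i < NR_ZONES; i++)
		classzone_free[i] -= nr;
}

int main(void)
{
	model_free_pages(0, 100);	/* 100 pages freed into ZONE_DMA */
	model_free_pages(1, 400);	/* 400 pages freed into ZONE_NORMAL */
	model_alloc_pages(1, 50);	/* a GFP_KERNEL-style allocation */
	for (int i = 0; i < NR_ZONES; i++)
		printf("%-8s classzone_free_pages = %lu\n",
		       zone_name[i], classzone_free[i]);
	return 0;
}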
diff -urN 2.3.99-pre6-pre5/include/linux/sched.h 2.3.99-pre6-pre5-VM-1/include/linux/sched.h
--- 2.3.99-pre6-pre5/include/linux/sched.h	Sat Apr 22 18:11:26 2000
+++ 2.3.99-pre6-pre5-VM-1/include/linux/sched.h	Tue Apr 25 17:27:43 2000
@@ -308,6 +308,7 @@
 	long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS];
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 	unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
+	int low_on_memory:1;
 	int swappable:1;
 /* process credentials */
 	uid_t uid,euid,suid,fsuid;
diff -urN 2.3.99-pre6-pre5/include/linux/swap.h 2.3.99-pre6-pre5-VM-1/include/linux/swap.h
--- 2.3.99-pre6-pre5/include/linux/swap.h	Sun Apr 23 22:42:13 2000
+++ 2.3.99-pre6-pre5-VM-1/include/linux/swap.h	Tue Apr 25 17:27:43 2000
@@ -159,27 +159,25 @@
 	return count > 1;
 }
 
-extern spinlock_t pagemap_lru_lock;
-
 /*
- * Helper macros for lru_pages handling.
+ * Helper macros for lru_cache handling.
  */
-#define	lru_cache_add(page)			\
-do {						\
-	spin_lock(&pagemap_lru_lock);		\
-	list_add(&(page)->lru, &page->zone->lru_cache);	\
-	nr_lru_pages++;				\
-	spin_unlock(&pagemap_lru_lock);		\
+#define	lru_cache_add(page)			\
+do {						\
+	spin_lock(&lru_cache.lock);		\
+	list_add(&(page)->lru, &lru_cache.head);	\
+	lru_cache.nr_pages++;			\
+	spin_unlock(&lru_cache.lock);		\
 } while (0)
 
 #define	lru_cache_del(page)			\
 do {						\
 	if (!PageLocked(page))			\
 		BUG();				\
-	spin_lock(&pagemap_lru_lock);		\
+	spin_lock(&lru_cache.lock);		\
 	list_del(&(page)->lru);			\
-	nr_lru_pages--;				\
-	spin_unlock(&pagemap_lru_lock);		\
+	lru_cache.nr_pages--;			\
+	spin_unlock(&lru_cache.lock);		\
 } while (0)
 
 extern spinlock_t swaplock;
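With the per-zone LRU lists replaced by one global lru_cache, the locking discipline lives entirely in these two macros: additions and deletions serialize on lru_cache.lock, and lru_cache_del() additionally insists the page is locked so it cannot race with shrink_mmap(). A hypothetical caller (not in the patch) would look like:

/* Hypothetical helper, for illustration only. */
static void example_forget_page(struct page *page)
{
	if (TryLockPage(page))
		return;			/* somebody else is working on it */
	lru_cache_del(page);		/* takes lru_cache.lock, BUG()s if the page is unlocked */
	/* ... drop the page from the hash/inode queues here ... */
	UnlockPage(page);
}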
diff -urN 2.3.99-pre6-pre5/include/linux/swapctl.h 2.3.99-pre6-pre5-VM-1/include/linux/swapctl.h
--- 2.3.99-pre6-pre5/include/linux/swapctl.h	Tue Apr 25 16:32:34 2000
+++ 2.3.99-pre6-pre5-VM-1/include/linux/swapctl.h	Tue Apr 25 17:27:49 2000
@@ -11,8 +11,7 @@
 	unsigned int	max_percent;
 } buffer_mem_v1;
 typedef buffer_mem_v1 buffer_mem_t;
-extern buffer_mem_t buffer_mem;
-extern buffer_mem_t page_cache;
+extern buffer_mem_t lru_cache_mem;
 
 typedef struct freepages_v1
 {
diff -urN 2.3.99-pre6-pre5/include/linux/sysctl.h 2.3.99-pre6-pre5-VM-1/include/linux/sysctl.h
--- 2.3.99-pre6-pre5/include/linux/sysctl.h	Sat Mar 11 20:02:33 2000
+++ 2.3.99-pre6-pre5-VM-1/include/linux/sysctl.h	Tue Apr 25 16:46:02 2000
@@ -123,11 +123,14 @@
 	VM_FREEPG=3,		/* struct: Set free page thresholds */
 	VM_BDFLUSH=4,		/* struct: Control buffer cache flushing */
 	VM_OVERCOMMIT_MEMORY=5,	/* Turn off the virtual memory safety limit */
+#if 0 /* obsolete but don't reuse */
 	VM_BUFFERMEM=6,		/* struct: Set buffer memory thresholds */
 	VM_PAGECACHE=7,		/* struct: Set cache memory thresholds */
+#endif
 	VM_PAGERDAEMON=8,	/* struct: Control kswapd behaviour */
 	VM_PGT_CACHE=9,		/* struct: Set page table cache parameters */
-	VM_PAGE_CLUSTER=10	/* int: set number of pages to swap together */
+	VM_PAGE_CLUSTER=10,	/* int: set number of pages to swap together */
+	VM_LRU_CACHE=11,	/* struct: Set lru cache memory thresholds */
 };
 
diff -urN 2.3.99-pre6-pre5/ipc/shm.c 2.3.99-pre6-pre5-VM-1/ipc/shm.c
--- 2.3.99-pre6-pre5/ipc/shm.c	Sat Apr 22 18:11:27 2000
+++ 2.3.99-pre6-pre5-VM-1/ipc/shm.c	Mon Apr 24 03:27:28 2000
@@ -132,7 +132,7 @@
 static int sysvipc_shm_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data);
 #endif
 
-static void zshm_swap (int prio, int gfp_mask, zone_t *zone);
+static void zshm_swap (int prio, zone_t *zone);
 static void zmap_unuse(swp_entry_t entry, struct page *page);
 static void shmzero_open(struct vm_area_struct *shmd);
 static void shmzero_close(struct vm_area_struct *shmd);
@@ -1438,7 +1438,7 @@
 	if (!pte_present(page))
 		return RETRY;
 	page_map = pte_page(page);
-	if (zone && (!memclass(page_map->zone, zone)))
+	if (!memclass(page_map->zone, zone))
 		return RETRY;
 	if (shp->id != zero_id)
 		swap_attempts++;
@@ -1495,17 +1495,21 @@
 	struct shmid_kernel *shp;
 	swp_entry_t swap_entry;
 	unsigned long id, idx;
-	int loop = 0;
+	int loop;
 	int counter;
 	struct page * page_map;
 
-	zshm_swap(prio, gfp_mask, zone);
+	if (!(gfp_mask & __GFP_IO))
+		return 0;
+
+	zshm_swap(prio, zone);
 	counter = shm_rss >> prio;
 	if (!counter)
 		return 0;
 	if (shm_swap_preop(&swap_entry))
 		return 0;
 
+	loop = 0;
 	shm_lockall();
 check_id:
 	shp = shm_get(swap_id);
@@ -1817,7 +1821,7 @@
 	spin_unlock(&zmap_list_lock);
 }
 
-static void zshm_swap (int prio, int gfp_mask, zone_t *zone)
+static void zshm_swap (int prio, zone_t *zone)
 {
 	struct shmid_kernel *shp;
 	swp_entry_t swap_entry;
diff -urN 2.3.99-pre6-pre5/kernel/sysctl.c 2.3.99-pre6-pre5-VM-1/kernel/sysctl.c
--- 2.3.99-pre6-pre5/kernel/sysctl.c	Sat Apr 22 18:11:27 2000
+++ 2.3.99-pre6-pre5-VM-1/kernel/sysctl.c	Tue Apr 25 16:40:47 2000
@@ -233,16 +233,14 @@
 	 &bdflush_min, &bdflush_max},
 	{VM_OVERCOMMIT_MEMORY, "overcommit_memory", &sysctl_overcommit_memory,
 	 sizeof(sysctl_overcommit_memory), 0644, NULL, &proc_dointvec},
-	{VM_BUFFERMEM, "buffermem",
-	 &buffer_mem, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec},
-	{VM_PAGECACHE, "pagecache",
-	 &page_cache, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec},
 	{VM_PAGERDAEMON, "kswapd",
 	 &pager_daemon, sizeof(pager_daemon_t), 0644, NULL, &proc_dointvec},
 	{VM_PGT_CACHE, "pagetable_cache",
 	 &pgt_cache_water, 2*sizeof(int), 0644, NULL, &proc_dointvec},
 	{VM_PAGE_CLUSTER, "page-cluster",
 	 &page_cluster, sizeof(int), 0644, NULL, &proc_dointvec},
+	{VM_LRU_CACHE, "lru_cache",
+	 &lru_cache_mem, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec},
 	{0}
 };
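The buffermem and pagecache sysctls are gone; the single lru_cache entry exports the new lru_cache_mem thresholds (min/borrow/max percent, same buffer_mem_t layout). Assuming the usual procfs mapping, the tunable should show up as /proc/sys/vm/lru_cache; a small userspace check (illustration only) could read it back:

/* Illustration only: reading the new tunable from userspace, assuming
 * the patched kernel exposes it as /proc/sys/vm/lru_cache. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/lru_cache", "r");
	int min_percent, borrow_percent, max_percent;

	if (!f) {
		perror("lru_cache");
		return 1;
	}
	if (fscanf(f, "%d %d %d", &min_percent, &borrow_percent,
		   &max_percent) == 3)
		printf("lru_cache thresholds: min %d%% borrow %d%% max %d%%\n",
		       min_percent, borrow_percent, max_percent);
	fclose(f);
	return 0;
}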
diff -urN 2.3.99-pre6-pre5/mm/filemap.c 2.3.99-pre6-pre5-VM-1/mm/filemap.c
--- 2.3.99-pre6-pre5/mm/filemap.c	Tue Apr 18 16:11:42 2000
+++ 2.3.99-pre6-pre5-VM-1/mm/filemap.c	Tue Apr 25 16:53:07 2000
@@ -46,11 +46,6 @@
 struct page **page_hash_table;
 
 spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
-/*
- * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
- * the pagemap_lru_lock held.
- */
-spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
 
 #define CLUSTER_PAGES		(1 << page_cluster)
 #define CLUSTER_OFFSET(x)	(((x) >> page_cluster) << page_cluster)
@@ -213,7 +208,7 @@
 	spin_unlock(&pagecache_lock);
 }
 
-int shrink_mmap(int priority, int gfp_mask, zone_t *zone)
+int shrink_mmap(int priority, zone_t *zone)
 {
 	int ret = 0, count;
 	LIST_HEAD(young);
@@ -221,19 +216,31 @@
 	LIST_HEAD(forget);
 	struct list_head * page_lru, * dispose;
 	struct page * page;
+	spinlock_t * lru_lock = &lru_cache.lock;
+	struct list_head * lru_head = &lru_cache.head;
 
 	if (!zone)
 		BUG();
 
-	count = nr_lru_pages / (priority+1);
+	if (lru_cache_under_min(lru_cache.nr_pages))
+		return 0;
+
+	count = lru_cache.nr_pages / (priority+1);
 
-	spin_lock(&pagemap_lru_lock);
+	spin_lock(lru_lock);
 
-	while (count > 0 && (page_lru = zone->lru_cache.prev) != &zone->lru_cache) {
+	while (count > 0 && (page_lru = lru_head->prev) != lru_head) {
 		page = list_entry(page_lru, struct page, lru);
 		list_del(page_lru);
 
-		dispose = &zone->lru_cache;
+		dispose = &old;
+		/* don't account passes over not DMA pages */
+		if (!memclass(page->zone, zone))
+			goto dispose_continue;
+
+		count--;
+
+		dispose = lru_head;
 		if (test_and_clear_bit(PG_referenced, &page->flags))
 			/* Roll the page at the top of the lru list,
 			 * we could also be more aggressive putting
@@ -242,13 +249,6 @@
 			 */
 			goto dispose_continue;
 
-		dispose = &old;
-		/* don't account passes over not DMA pages */
-		if (zone && (!memclass(page->zone, zone)))
-			goto dispose_continue;
-
-		count--;
-
 		dispose = &young;
 
 		/* avoid unscalable SMP locking */
@@ -258,12 +258,12 @@
 		if (TryLockPage(page))
 			goto dispose_continue;
 
-		/* Release the pagemap_lru lock even if the page is not yet
+		/* Release the lru_cache lock even if the page is not yet
 		   queued in any lru queue since we have just locked down
 		   the page so nobody else may SMP race with us running
 		   a lru_cache_del() (lru_cache_del() always run with
 		   the page locked down ;). */
-		spin_unlock(&pagemap_lru_lock);
+		spin_unlock(lru_lock);
 
 		/* avoid freeing the page while it's locked */
 		get_page(page);
@@ -300,12 +300,17 @@
 		if (PageSwapCache(page)) {
 			spin_unlock(&pagecache_lock);
 			__delete_from_swap_cache(page);
+			/*
+			 * We hold the lock on the page so we don't
+			 * need to do an atomic clear_bit().
+			 */
+			page->flags &= ~(1UL << PG_swap_entry);
 			goto made_inode_progress;
 		}
 
 		/* is it a page-cache page? */
 		if (page->mapping) {
-			if (!PageDirty(page) && !pgcache_under_min()) {
+			if (!PageDirty(page)) {
 				remove_page_from_inode_queue(page);
 				remove_page_from_hash_queue(page);
 				page->mapping = NULL;
@@ -321,7 +326,7 @@
 cache_unlock_continue:
 		spin_unlock(&pagecache_lock);
 unlock_continue:
-		spin_lock(&pagemap_lru_lock);
+		spin_lock(lru_lock);
 		UnlockPage(page);
 		put_page(page);
 		list_add(page_lru, dispose);
@@ -338,15 +343,15 @@
 		UnlockPage(page);
 		put_page(page);
 		ret = 1;
-		spin_lock(&pagemap_lru_lock);
-		/* nr_lru_pages needs the spinlock */
-		nr_lru_pages--;
+		spin_lock(lru_lock);
+		/* nr_pages needs the spinlock */
+		lru_cache.nr_pages--;
 
 out:
-	list_splice(&young, &zone->lru_cache);
-	list_splice(&old, zone->lru_cache.prev);
+	list_splice(&young, lru_head);
+	list_splice(&old, lru_head->prev);
 
-	spin_unlock(&pagemap_lru_lock);
+	spin_unlock(lru_lock);
 
 	return ret;
 }
@@ -467,8 +472,8 @@
 	struct page *alias;
 	unsigned long flags;
 
-	flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty));
-	page->flags = flags | (1 << PG_locked) | (1 << PG_referenced);
+	flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced));
+	page->flags = flags | (1 << PG_locked);
 	get_page(page);
 	page->index = offset;
 	add_page_to_inode_queue(mapping, page);
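The reordered shrink_mmap() loop changes how the scan budget is spent: pages that don't belong to the classzone being balanced are parked on the local old list without being charged to count, referenced pages are rotated back to the LRU head, and only the remainder become reclaim candidates. A compact decision model (illustration only; the real function also deals with locking, buffers and the page/swap cache):

/* Toy decision model of the reordered shrink_mmap() scan. */
enum disposal { DISPOSE_OLD, DISPOSE_HEAD, DISPOSE_YOUNG };

struct scan_state { int count; };

static enum disposal classify(struct scan_state *s,
			      int in_target_classzone, int referenced)
{
	/* Pages outside the classzone under pressure are skipped and,
	 * unlike before, no longer consume the scan budget. */
	if (!in_target_classzone)
		return DISPOSE_OLD;

	s->count--;			/* charged to the scan budget */

	if (referenced)
		return DISPOSE_HEAD;	/* rotate back to the head of the LRU */

	return DISPOSE_YOUNG;		/* candidate for freeing */
}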
diff -urN 2.3.99-pre6-pre5/mm/highmem.c 2.3.99-pre6-pre5-VM-1/mm/highmem.c
--- 2.3.99-pre6-pre5/mm/highmem.c	Mon Apr  3 03:21:59 2000
+++ 2.3.99-pre6-pre5-VM-1/mm/highmem.c	Mon Apr 24 02:26:50 2000
@@ -75,7 +75,7 @@
 
 	/* Preserve the caching of the swap_entry. */
 	highpage->index = page->index;
-	highpage->mapping = page->mapping;
+	highpage->flags = page->flags;
 
 	/*
 	 * We can just forget the old page since
diff -urN 2.3.99-pre6-pre5/mm/memory.c 2.3.99-pre6-pre5-VM-1/mm/memory.c
--- 2.3.99-pre6-pre5/mm/memory.c	Tue Apr 18 16:11:42 2000
+++ 2.3.99-pre6-pre5-VM-1/mm/memory.c	Mon Apr 24 02:26:51 2000
@@ -837,6 +837,7 @@
 	 */
 	switch (page_count(old_page)) {
 	case 2:
+	case 3:
 		/*
 		 * Lock the page so that no one can look it up from
 		 * the swap cache, grab a reference and start using it.
@@ -879,7 +880,19 @@
 		new_page = old_page;
 	}
 	spin_unlock(&tsk->mm->page_table_lock);
-	__free_page(new_page);
+	/*
+	 * We're releasing a page, and it may be an anonymous page.
+	 * Since we don't hold any lock (except the mmap_sem
+	 * semaphore), the other user of an anonymous page may have
+	 * released it from under us, leaving us as the only owner
+	 * of the page.  put_page_testzero() can then return 1, and
+	 * in that case we have to clear the swap-entry bitflag
+	 * ourselves.
+	 */
+	if (put_page_testzero(new_page)) {
+		new_page->flags &= ~(1UL << PG_swap_entry);
+		__free_pages_ok(new_page, 0);
+	}
 	return 1;
 
 bad_wp_page:
diff -urN 2.3.99-pre6-pre5/mm/numa.c 2.3.99-pre6-pre5-VM-1/mm/numa.c
--- 2.3.99-pre6-pre5/mm/numa.c	Tue Apr 18 16:11:42 2000
+++ 2.3.99-pre6-pre5-VM-1/mm/numa.c	Tue Apr 25 16:31:29 2000
@@ -33,7 +33,7 @@
 
 struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order)
 {
-	return __alloc_pages(NODE_DATA(nid)->node_zonelists + gfp_mask, order);
+	return __alloc_pages(NODE_DATA(nid)->node_gfpmask_zone + gfp_mask, order);
 }
 
 #ifdef CONFIG_DISCONTIGMEM
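Both do_wp_page() above and free_page_and_swap_cache() later in the patch now release a possibly shared anonymous page with the same sequence: drop the reference, and only if it was the last one clear the cached swap entry before giving the page back to the buddy allocator. Expressed as a hypothetical helper (not something the patch adds):

/*
 * Hypothetical helper, for illustration only: release a page that may
 * carry a cached swap entry in page->index, forgetting the entry only
 * when the last reference goes away.
 */
static inline void example_free_page_forget_swap_entry(struct page *page)
{
	if (put_page_testzero(page)) {
		/* last user: the cached swap entry is now stale */
		page->flags &= ~(1UL << PG_swap_entry);
		__free_pages_ok(page, 0);
	}
}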
diff -urN 2.3.99-pre6-pre5/mm/page_alloc.c 2.3.99-pre6-pre5-VM-1/mm/page_alloc.c
--- 2.3.99-pre6-pre5/mm/page_alloc.c	Tue Apr 18 16:11:42 2000
+++ 2.3.99-pre6-pre5-VM-1/mm/page_alloc.c	Tue Apr 25 16:45:03 2000
@@ -25,8 +25,8 @@
 #endif
 
 int nr_swap_pages = 0;
-int nr_lru_pages;
 pg_data_t *pgdat_list = (pg_data_t *)0;
+lru_cache_t lru_cache = { LIST_HEAD_INIT(lru_cache.head), 0, SPIN_LOCK_UNLOCKED, };
 
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
 static int zone_balance_ratio[MAX_NR_ZONES] = { 128, 128, 128, };
@@ -87,6 +87,8 @@
 	free_area_t *area;
 	struct page *base;
 	zone_t *zone;
+	spinlock_t * freelist_lock;
+	pg_data_t * pgdat;
 
 	/*
 	 * Subtle. We do not want to test this in the inlined part of
@@ -110,6 +112,8 @@
 		BUG();
 	if (PageDecrAfter(page))
 		BUG();
+	if (PageSwapEntry(page))
+		BUG();
 
 	zone = page->zone;
 
@@ -122,10 +126,25 @@
 
 	area = zone->free_area + order;
 
-	spin_lock_irqsave(&zone->lock, flags);
+	pgdat = zone->zone_pgdat;
+	freelist_lock = &pgdat->freelist_lock;
+	spin_lock_irqsave(freelist_lock, flags);
 
 	zone->free_pages -= mask;
 
+	/* update the classzone */
+	{
+		int nr_zone = zone->nr_zone;
+		register zone_t * z = zone;
+		do {
+			z->classzone_free_pages -= mask;
+			if (z->zone_wake_kswapd &&
+			    z->classzone_free_pages > z->pages_high)
+				z->zone_wake_kswapd = 0;
+			z++;
+		} while (++nr_zone < pgdat->nr_zones);
+	}
+
 	while (mask + (1 << (MAX_ORDER-1))) {
 		struct page *buddy1, *buddy2;
 
@@ -153,13 +172,7 @@
 		page_idx &= mask;
 	}
 	memlist_add_head(&(base + page_idx)->list, &area->free_list);
-
-	spin_unlock_irqrestore(&zone->lock, flags);
-
-	if (zone->free_pages > zone->pages_high) {
-		zone->zone_wake_kswapd = 0;
-		zone->low_on_memory = 0;
-	}
+	spin_unlock_irqrestore(freelist_lock, flags);
 }
 
 #define MARK_USED(index, order, area) \
@@ -186,16 +199,15 @@
 	return page;
 }
 
-static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order));
-static struct page * rmqueue(zone_t *zone, unsigned long order)
+static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order, unsigned long flags));
+static struct page * rmqueue(zone_t *zone, unsigned long order, unsigned long flags)
 {
 	free_area_t * area = zone->free_area + order;
 	unsigned long curr_order = order;
 	struct list_head *head, *curr;
-	unsigned long flags;
 	struct page *page;
+	pg_data_t * pgdat;
 
-	spin_lock_irqsave(&zone->lock, flags);
 	do {
 		head = &area->free_list;
 		curr = memlist_next(head);
@@ -209,10 +221,21 @@
 			memlist_del(curr);
 			index = (page - mem_map) - zone->offset;
 			MARK_USED(index, curr_order, area);
-			zone->free_pages -= 1 << order;
+
+			zone->free_pages -= 1UL << order;
+			pgdat = zone->zone_pgdat;
+			/* update the classzone */
+			{
+				int nr_zone = zone->nr_zone;
+				register zone_t * z = zone;
+				do {
+					z->classzone_free_pages -= 1UL << order;
+					z++;
+				} while (++nr_zone < pgdat->nr_zones);
+			}
 
 			page = expand(zone, page, index, order, curr_order, area);
-			spin_unlock_irqrestore(&zone->lock, flags);
+			spin_unlock_irqrestore(&pgdat->freelist_lock, flags);
 
 			set_page_count(page, 1);
 			if (BAD_RANGE(zone,page))
@@ -222,57 +245,23 @@
 		curr_order++;
 		area++;
 	} while (curr_order < MAX_ORDER);
-	spin_unlock_irqrestore(&zone->lock, flags);
 
 	return NULL;
 }
 
-static int zone_balance_memory(zonelist_t *zonelist)
-{
-	int tried = 0, freed = 0;
-	zone_t **zone;
-	int gfp_mask = zonelist->gfp_mask;
-	extern wait_queue_head_t kswapd_wait;
-
-	zone = zonelist->zones;
-	for (;;) {
-		zone_t *z = *(zone++);
-		if (!z)
-			break;
-		if (z->free_pages > z->pages_low)
-			continue;
-
-		z->zone_wake_kswapd = 1;
-		wake_up_interruptible(&kswapd_wait);
-
-		/* Are we reaching the critical stage? */
-		if (!z->low_on_memory) {
-			/* Not yet critical, so let kswapd handle it.. */
-			if (z->free_pages > z->pages_min)
-				continue;
-			z->low_on_memory = 1;
-		}
-		/*
-		 * In the atomic allocation case we only 'kick' the
-		 * state machine, but do not try to free pages
-		 * ourselves.
-		 */
-		tried = 1;
-		freed |= try_to_free_pages(gfp_mask, z);
-	}
-	if (tried && !freed) {
-		if (!(gfp_mask & __GFP_HIGH))
-			return 0;
-	}
-	return 1;
-}
-
 /*
  * This is the 'heart' of the zoned buddy allocator:
  */
-struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
+struct page * __alloc_pages(gfpmask_zone_t * gfpmask_zone, unsigned long order)
 {
-	zone_t **zone = zonelist->zones;
+	zone_t * classzone = gfpmask_zone->classzone;
+	pg_data_t * pgdat = classzone->zone_pgdat;
+	int freed;
+	spinlock_t * freelist_lock = &pgdat->freelist_lock;
+	long flags;
+	unsigned long classzone_free_pages;
+
+	spin_lock_irqsave(freelist_lock, flags);
 
 	/*
 	 * If this is a recursive call, we'd better
@@ -282,51 +271,70 @@
 	if (current->flags & PF_MEMALLOC)
 		goto allocate_ok;
 
-	/*
-	 * (If anyone calls gfp from interrupts nonatomically then it
-	 * will sooner or later tripped up by a schedule().)
-	 *
-	 * We are falling back to lower-level zones if allocation
-	 * in a higher zone fails.
-	 */
-	for (;;) {
-		zone_t *z = *(zone++);
-		if (!z)
-			break;
-		if (!z->size)
-			BUG();
-
-		/* Are we supposed to free memory? Don't make it worse.. */
-		if (!z->zone_wake_kswapd && z->free_pages > z->pages_low) {
-			struct page *page = rmqueue(z, order);
-			if (page)
-				return page;
-		}
-	}
-
-	/*
-	 * Ok, no obvious zones were available, start
-	 * balancing things a bit..
-	 */
-	if (zone_balance_memory(zonelist)) {
-		zone = zonelist->zones;
-allocate_ok:
-		for (;;) {
-			zone_t *z = *(zone++);
-			if (!z)
-				break;
-			if (z->free_pages) {
-				struct page *page = rmqueue(z, order);
+	/* classzone based memory balancing */
+	classzone_free_pages = classzone->classzone_free_pages;
+	if (!current->low_on_memory &&
+	    classzone_free_pages > classzone->pages_low) {
+		int nr_zone;
+		zone_t * z;
+
+	allocate_ok:
+		z = classzone;
+		for (nr_zone = classzone->nr_zone;
+		     nr_zone >= 0;
+		     nr_zone--, z--) {
+			if (z->free_pages >= (1UL << order)) {
+				struct page *page = rmqueue(z, order, flags);
 				if (page)
 					return page;
 			}
 		}
+	} else {
+		extern wait_queue_head_t kswapd_wait;
+
+		if (classzone_free_pages > classzone->pages_low) {
+			if (current->low_on_memory)
+				current->low_on_memory = 0;
+			goto allocate_ok;
+		}
+
+		if (!classzone->zone_wake_kswapd) {
+			classzone->zone_wake_kswapd = 1;
+			wake_up_interruptible(&kswapd_wait);
+		}
+
+		/* Are we reaching the critical stage? */
+		if (!current->low_on_memory) {
+			/* Not yet critical, so let kswapd handle it.. */
+			if (classzone_free_pages > classzone->pages_min)
+				goto allocate_ok;
+			current->low_on_memory = 1;
+		}
+
+		spin_unlock_irqrestore(freelist_lock, flags);
+		freed = try_to_free_pages(gfpmask_zone->gfp_mask, classzone);
+		spin_lock_irq(freelist_lock);
+
+		if (freed || gfpmask_zone->gfp_mask & __GFP_HIGH)
+			goto allocate_ok;
+
+		/*
+		 * Re-check that we are low on memory, with the spinlock
+		 * held, before failing.  Somebody may have released lots
+		 * of memory from under us while we were trying to free
+		 * pages.  We check against pages_high so that we succeed
+		 * only if plenty of memory has really been released.
+		 */
+		classzone_free_pages = classzone->classzone_free_pages;
+		if (classzone_free_pages > classzone->pages_high) {
+			if (current->low_on_memory)
+				current->low_on_memory = 0;
+			goto allocate_ok;
+		}
 	}
+	spin_unlock_irqrestore(freelist_lock, flags);
 	return NULL;
-
-/*
- * The main chunk of the balancing code is in this offline branch:
- */
 }
@@ -335,13 +343,14 @@
 unsigned int nr_free_pages (void)
 {
 	unsigned int sum;
-	zone_t *zone;
 	int i;
 
 	sum = 0;
-	for (i = 0; i < NUMNODES; i++)
-		for (zone = NODE_DATA(i)->node_zones; zone < NODE_DATA(i)->node_zones + MAX_NR_ZONES; zone++)
-			sum += zone->free_pages;
+	for (i = 0; i < NUMNODES; i++) {
+		pg_data_t * pgdat = NODE_DATA(i);
+		zone_t * node_zones = pgdat->node_zones;
+		sum += node_zones[pgdat->nr_zones-1].classzone_free_pages;
+	}
 	return sum;
 }
 
@@ -351,13 +360,15 @@
 unsigned int nr_free_buffer_pages (void)
 {
 	unsigned int sum;
-	zone_t *zone;
 	int i;
 
-	sum = nr_lru_pages;
-	for (i = 0; i < NUMNODES; i++)
-		for (zone = NODE_DATA(i)->node_zones; zone <= NODE_DATA(i)->node_zones+ZONE_NORMAL; zone++)
-			sum += zone->free_pages;
+	sum = lru_cache.nr_pages;
+	for (i = 0; i < NUMNODES; i++) {
+		pg_data_t * pgdat = NODE_DATA(i);
+		zone_t * node_zones = pgdat->node_zones;
+		int higher_zone = pgdat->nr_zones-1;
+		sum += node_zones[higher_zone <= ZONE_NORMAL ? higher_zone : ZONE_NORMAL].classzone_free_pages;
+	}
 	return sum;
 }
 
@@ -389,21 +400,23 @@
 
 	printk("( Free: %d, lru_cache: %d (%d %d %d) )\n",
 		nr_free_pages(),
-		nr_lru_pages,
+		lru_cache.nr_pages,
 		freepages.min,
 		freepages.low,
 		freepages.high);
 
 	for (type = 0; type < MAX_NR_ZONES; type++) {
 		struct list_head *head, *curr;
-		zone_t *zone = NODE_DATA(nid)->node_zones + type;
+		pg_data_t * pgdat = NODE_DATA(nid);
+		zone_t *zone = pgdat->node_zones + type;
+		spinlock_t * freelist_lock = &pgdat->freelist_lock;
 		unsigned long nr, total, flags;
 
 		printk("  %s: ", zone->name);
 
 		total = 0;
 		if (zone->size) {
-			spin_lock_irqsave(&zone->lock, flags);
+			spin_lock_irqsave(freelist_lock, flags);
 			for (order = 0; order < MAX_ORDER; order++) {
 				head = &(zone->free_area + order)->free_list;
 				curr = head;
@@ -418,7 +431,7 @@
 				printk("%lu*%lukB ", nr,
 						(PAGE_SIZE>>10) << order);
 			}
-			spin_unlock_irqrestore(&zone->lock, flags);
+			spin_unlock_irqrestore(freelist_lock, flags);
 		}
 		printk("= %lukB)\n", total * (PAGE_SIZE>>10));
 	}
@@ -436,18 +449,17 @@
 /*
  * Builds allocation fallback zone lists.
  */
-static inline void build_zonelists(pg_data_t *pgdat)
+static inline void build_gfpmask_zone(pg_data_t *pgdat)
 {
 	int i, j, k;
 
 	for (i = 0; i < NR_GFPINDEX; i++) {
-		zonelist_t *zonelist;
+		gfpmask_zone_t * gfpmask_zone;
 		zone_t *zone;
 
-		zonelist = pgdat->node_zonelists + i;
-		memset(zonelist, 0, sizeof(*zonelist));
+		gfpmask_zone = pgdat->node_gfpmask_zone + i;
 
-		zonelist->gfp_mask = i;
+		gfpmask_zone->gfp_mask = i;
 		j = 0;
 		k = ZONE_NORMAL;
 		if (i & __GFP_HIGHMEM)
@@ -467,18 +479,20 @@
 #ifndef CONFIG_HIGHMEM
 				BUG();
 #endif
-				zonelist->zones[j++] = zone;
+				gfpmask_zone->classzone = zone;
 			}
+			break;
 		case ZONE_NORMAL:
 			zone = pgdat->node_zones + ZONE_NORMAL;
 			if (zone->size)
-				zonelist->zones[j++] = zone;
+				gfpmask_zone->classzone = zone;
+			break;
 		case ZONE_DMA:
 			zone = pgdat->node_zones + ZONE_DMA;
 			if (zone->size)
-				zonelist->zones[j++] = zone;
+				gfpmask_zone->classzone = zone;
+			break;
 		}
-		zonelist->zones[j++] = NULL;
 	}
 }
 
@@ -498,9 +512,8 @@
 	unsigned long i, j;
 	unsigned long map_size;
 	unsigned long totalpages, offset, realtotalpages;
-	unsigned int cumulative = 0;
 
-	pgdat->node_next = pgdat_list;
+	pgdat->node_next = NULL;
 	pgdat_list = pgdat;
 
 	totalpages = 0;
@@ -546,6 +559,8 @@
 	pgdat->node_size = totalpages;
 	pgdat->node_start_paddr = zone_start_paddr;
 	pgdat->node_start_mapnr = (lmem_map - mem_map);
+	pgdat->nr_zones = 0;
+	spin_lock_init(&pgdat->freelist_lock);
 
 	/*
 	 * Initially all pages are reserved - free ones are freed
@@ -572,14 +587,15 @@
 		printk("zone(%lu): %lu pages.\n", j, size);
 		zone->size = size;
 		zone->name = zone_names[j];
-		zone->lock = SPIN_LOCK_UNLOCKED;
 		zone->zone_pgdat = pgdat;
+		zone->nr_zone = j;
 		zone->free_pages = 0;
+		zone->zone_wake_kswapd = 0;
 		if (!size)
 			continue;
 
+		pgdat->nr_zones = j+1;
 		zone->offset = offset;
-		cumulative += size;
 		mask = (realsize / zone_balance_ratio[j]);
 		if (mask < zone_balance_min[j])
 			mask = zone_balance_min[j];
@@ -588,8 +604,6 @@
 		zone->pages_min = mask;
 		zone->pages_low = mask*2;
 		zone->pages_high = mask*3;
-		zone->low_on_memory = 0;
-		zone->zone_wake_kswapd = 0;
 		zone->zone_mem_map = mem_map + offset;
 		zone->zone_start_mapnr = offset;
 		zone->zone_start_paddr = zone_start_paddr;
@@ -609,7 +623,6 @@
 			unsigned long bitmap_size;
 
 			memlist_init(&zone->free_area[i].free_list);
-			memlist_init(&zone->lru_cache);
 			mask += mask;
 			size = (size + ~mask) & mask;
 			bitmap_size = size >> i;
@@ -619,7 +632,7 @@
 				  (unsigned int *) alloc_bootmem_node(nid, bitmap_size);
 		}
 	}
-	build_zonelists(pgdat);
+	build_gfpmask_zone(pgdat);
 }
 
 void __init free_area_init(unsigned long *zones_size)
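The rewritten __alloc_pages() no longer walks a NULL-terminated zonelist: the gfp mask selects a single classzone at boot time, and the allocator walks from that zone down towards ZONE_DMA, relying on a node's zones being laid out contiguously in node_zones[] (the same assumption that lets memclass() become a plain pointer comparison). A simplified sketch of the fast path, with the freelist lock, PF_MEMALLOC, the per-task low_on_memory latch and the balancing slow path all omitted (illustration only):

/* Simplified sketch, illustration only: which zone would the
 * allocation be served from? */
static zone_t *classzone_pick_zone(gfpmask_zone_t *gz, unsigned long order)
{
	zone_t *classzone = gz->classzone;
	zone_t *z = classzone;
	int nr_zone;

	if (classzone->classzone_free_pages <= classzone->pages_low)
		return NULL;		/* take the balancing slow path */

	/* walk from the classzone down towards ZONE_DMA */
	for (nr_zone = classzone->nr_zone; nr_zone >= 0; nr_zone--, z--)
		if (z->free_pages >= (1UL << order))
			return z;	/* rmqueue() would pull from here */

	return NULL;
}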
diff -urN 2.3.99-pre6-pre5/mm/swap.c 2.3.99-pre6-pre5-VM-1/mm/swap.c
--- 2.3.99-pre6-pre5/mm/swap.c	Wed Dec  8 00:05:28 1999
+++ 2.3.99-pre6-pre5-VM-1/mm/swap.c	Tue Apr 25 16:41:41 2000
@@ -46,13 +46,7 @@
 out, so that we don't try to swap TOO many pages out at once */
 atomic_t nr_async_pages = ATOMIC_INIT(0);
 
-buffer_mem_t buffer_mem = {
-	2,	/* minimum percent buffer */
-	10,	/* borrow percent buffer */
-	60	/* maximum percent buffer */
-};
-
-buffer_mem_t page_cache = {
+buffer_mem_t lru_cache_mem = {
 	2,	/* minimum percent page cache */
 	15,	/* borrow percent page cache */
 	75	/* maximum */
diff -urN 2.3.99-pre6-pre5/mm/swap_state.c 2.3.99-pre6-pre5-VM-1/mm/swap_state.c
--- 2.3.99-pre6-pre5/mm/swap_state.c	Tue Apr 18 16:11:42 2000
+++ 2.3.99-pre6-pre5-VM-1/mm/swap_state.c	Mon Apr 24 02:26:51 2000
@@ -126,9 +126,14 @@
 		UnlockPage(page);
 	}
 
-	ClearPageSwapEntry(page);
-
-	__free_page(page);
+	/*
+	 * Only the last unmap has to lose the swap entry
+	 * information that we have cached in page->index.
+	 */
+	if (put_page_testzero(page)) {
+		page->flags &= ~(1UL << PG_swap_entry);
+		__free_pages_ok(page, 0);
+	}
 }
diff -urN 2.3.99-pre6-pre5/mm/swapfile.c 2.3.99-pre6-pre5-VM-1/mm/swapfile.c
--- 2.3.99-pre6-pre5/mm/swapfile.c	Sat Apr 22 18:11:27 2000
+++ 2.3.99-pre6-pre5-VM-1/mm/swapfile.c	Mon Apr 24 02:26:51 2000
@@ -212,22 +212,22 @@
 
 	/* We have the old entry in the page offset still */
 	if (!page->index)
-		goto new_swap_entry;
+		goto null_swap_entry;
 	entry.val = page->index;
 	type = SWP_TYPE(entry);
 	if (type >= nr_swapfiles)
-		goto new_swap_entry;
+		goto bad_nofile;
+	swap_list_lock();
 	p = type + swap_info;
 	if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK)
-		goto new_swap_entry;
+		goto unlock_list;
 	offset = SWP_OFFSET(entry);
 	if (offset >= p->max)
-		goto new_swap_entry;
+		goto bad_offset;
 	/* Has it been re-used for something else? */
-	swap_list_lock();
 	swap_device_lock(p);
 	if (p->swap_map[offset])
-		goto unlock_new_swap_entry;
+		goto unlock;
 
 	/* We're cool, we can just use the old one */
 	p->swap_map[offset] = 1;
@@ -236,11 +236,24 @@
 	swap_list_unlock();
 	return entry;
 
-unlock_new_swap_entry:
+unlock:
 	swap_device_unlock(p);
+unlock_list:
 	swap_list_unlock();
+clear_swap_entry:
+	ClearPageSwapEntry(page);
 new_swap_entry:
 	return get_swap_page();
+
+null_swap_entry:
+	printk(KERN_WARNING __FUNCTION__ " null swap entry\n");
+	goto clear_swap_entry;
+bad_nofile:
+	printk(KERN_WARNING __FUNCTION__ " nonexistent swap file\n");
+	goto clear_swap_entry;
+bad_offset:
+	printk(KERN_WARNING __FUNCTION__ " bad offset\n");
+	goto unlock_list;
 }
 
 /*
@@ -263,8 +276,11 @@
 	/* If this entry is swap-cached, then page must already
            hold the right address for any copies in physical
            memory */
-	if (pte_page(pte) != page)
+	if (pte_page(pte) != page) {
+		if (page->index == entry.val)
+			ClearPageSwapEntry(page);
 		return;
+	}
 	/* We will be removing the swap cache in a moment, so... */
 	set_pte(dir, pte_mkdirty(pte));
 	return;
@@ -418,8 +434,10 @@
 		shm_unuse(entry, page);
 		/* Now get rid of the extra reference to the temporary
 		   page we've been using. */
-		if (PageSwapCache(page))
+		if (PageSwapCache(page)) {
 			delete_from_swap_cache(page);
+			ClearPageSwapEntry(page);
+		}
 		__free_page(page);
 		/*
 		 * Check for and clear any overflowed swap map counts.
diff -urN 2.3.99-pre6-pre5/mm/vmscan.c 2.3.99-pre6-pre5-VM-1/mm/vmscan.c
--- 2.3.99-pre6-pre5/mm/vmscan.c	Tue Apr 18 16:11:42 2000
+++ 2.3.99-pre6-pre5-VM-1/mm/vmscan.c	Tue Apr 25 16:01:39 2000
@@ -418,27 +418,26 @@
 
 	priority = 6;
 	do {
-		while (shrink_mmap(priority, gfp_mask, zone)) {
+		while (shrink_mmap(priority, zone)) {
 			if (!--count)
 				goto done;
 		}
 
+		/*
+		 * don't be too light against the d/i cache since
+		 * shrink_mmap() almost never fails when there's
+		 * really plenty of memory free.
+		 */
+		count -= shrink_dcache_memory(priority, gfp_mask, zone);
+		count -= shrink_icache_memory(priority, gfp_mask, zone);
+		if (count <= 0)
+			goto done;
+
 		/* Try to get rid of some shared memory pages.. */
-		if (gfp_mask & __GFP_IO) {
-			/*
-			 * don't be too light against the d/i cache since
-			 * shrink_mmap() almost never fail when there's
-			 * really plenty of memory free.
-			 */
-			count -= shrink_dcache_memory(priority, gfp_mask, zone);
-			count -= shrink_icache_memory(priority, gfp_mask, zone);
-			if (count <= 0)
+		while (shm_swap(priority, gfp_mask, zone)) {
+			if (!--count)
 				goto done;
-			while (shm_swap(priority, gfp_mask, zone)) {
-				if (!--count)
-					goto done;
-			}
 		}
 
 		/* Then, try to page stuff out.. */
@@ -454,6 +453,70 @@
 
 DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
 
+static int kswapd_work_pgdat(pg_data_t * pgdat)
+{
+	int worked = 0, i;
+	zone_t * zone;
+
+	for (i = pgdat->nr_zones-1; i >= 0; i--) {
+		zone = pgdat->node_zones + i;
+		if (current->need_resched)
+			schedule();
+		if (!zone->zone_wake_kswapd)
+			continue;
+		if (!do_try_to_free_pages(GFP_KSWAPD, zone)) {
+			zone->zone_wake_kswapd = 0;
+			continue;
+		}
+		worked = 1;
+	}
+
+	return worked;
+}
+
+static void kswapd_work(void)
+{
+	int worked;
+	pg_data_t * pgdat;
+
+	do {
+		worked = 0;
+		pgdat = pgdat_list;
+		do
+			worked |= kswapd_work_pgdat(pgdat);
+		while ((pgdat = pgdat->node_next));
+	} while (worked);
+}
+
+static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
+{
+	zone_t * zone;
+	int i;
+
+	for (i = pgdat->nr_zones-1; i >= 0; i--) {
+		zone = pgdat->node_zones + i;
+		if (!zone->zone_wake_kswapd)
+			continue;
+		return 0;
+	}
+
+	return 1;
+}
+
+static int kswapd_can_sleep(void)
+{
+	pg_data_t * pgdat;
+
+	pgdat = pgdat_list;
+	do {
+		if (kswapd_can_sleep_pgdat(pgdat))
+			continue;
+		return 0;
+	} while ((pgdat = pgdat->node_next));
+
+	return 1;
+}
+
 /*
  * The background pageout daemon, started as a kernel thread
  * from the init process.
@@ -469,15 +532,14 @@
  */
 int kswapd(void *unused)
 {
-	int i;
 	struct task_struct *tsk = current;
-	pg_data_t *pgdat;
-	zone_t *zone;
+	wait_queue_t wait;
 
 	tsk->session = 1;
 	tsk->pgrp = 1;
 	strcpy(tsk->comm, "kswapd");
 	sigfillset(&tsk->blocked);
+	init_waitqueue_entry(&wait, tsk);
 
 	/*
 	 * Tell the memory management that we're a "memory allocator",
@@ -499,21 +561,17 @@
 		 * the processes needing more memory will wake us
 		 * up on a more timely basis.
 		 */
-		pgdat = pgdat_list;
-		while (pgdat) {
-			for (i = 0; i < MAX_NR_ZONES; i++) {
-				zone = pgdat->node_zones + i;
-				if (tsk->need_resched)
-					schedule();
-				if ((!zone->size) || (!zone->zone_wake_kswapd))
-					continue;
-				do_try_to_free_pages(GFP_KSWAPD, zone);
-			}
-			pgdat = pgdat->node_next;
-		}
+		kswapd_work();
 		run_task_queue(&tq_disk);
-		tsk->state = TASK_INTERRUPTIBLE;
-		interruptible_sleep_on(&kswapd_wait);
+
+		__set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue(&kswapd_wait, &wait);
+
+		if (kswapd_can_sleep())
+			schedule();
+
+		__set_current_state(TASK_RUNNING);
+		remove_wait_queue(&kswapd_wait, &wait);
 	}
 }
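The open-coded wait replaces interruptible_sleep_on(), closing the classic lost-wakeup window: kswapd registers on kswapd_wait and marks itself TASK_INTERRUPTIBLE before re-checking zone_wake_kswapd, so a wake_up() that arrives in between simply makes the subsequent schedule() return at once. The general pattern, as a sketch only (the wait queue and predicate below are illustrative):

/* Sketch of the sleep/wakeup pattern the new kswapd loop follows. */
DECLARE_WAIT_QUEUE_HEAD(example_wait);

static void wait_for_work(int (*no_work_pending)(void))
{
	wait_queue_t wait;

	init_waitqueue_entry(&wait, current);

	/* go TASK_INTERRUPTIBLE and queue ourselves *before* testing
	 * the condition, so a wakeup in between is not lost */
	__set_current_state(TASK_INTERRUPTIBLE);
	add_wait_queue(&example_wait, &wait);

	if (no_work_pending())
		schedule();	/* a concurrent wake_up() just makes this return */

	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&example_wait, &wait);
}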