diff -prauN linux-2.5.71/arch/i386/Kconfig wli-2.5.71-7/arch/i386/Kconfig --- linux-2.5.71/arch/i386/Kconfig 2003-06-14 12:17:58.000000000 -0700 +++ wli-2.5.71-7/arch/i386/Kconfig 2003-06-14 20:49:04.000000000 -0700 @@ -709,6 +709,15 @@ config HIGHPTE low memory. Setting this option will put user-space page table entries in high memory. +config HIGHPMD + bool "Allocate 2nd-level pagetables from highmem" + depends on HIGHMEM64G + help + The VM uses one pmd entry for each pagetable page of physical + memory allocated. For systems with extreme amounts of highmem, + this cannot be tolerated. Setting this option will put + userspace 2nd-level pagetables in highmem. + config MATH_EMULATION bool "Math emulation" ---help--- diff -prauN linux-2.5.71/arch/i386/kernel/vm86.c wli-2.5.71-7/arch/i386/kernel/vm86.c --- linux-2.5.71/arch/i386/kernel/vm86.c 2003-06-14 12:17:58.000000000 -0700 +++ wli-2.5.71-7/arch/i386/kernel/vm86.c 2003-06-14 20:49:04.000000000 -0700 @@ -144,12 +144,14 @@ static void mark_screen_rdonly(struct ta pgd_clear(pgd); goto out; } - pmd = pmd_offset(pgd, 0xA0000); - if (pmd_none(*pmd)) + pmd = pmd_offset_map(pgd, 0xA0000); + if (pmd_none(*pmd)) { + pmd_unmap(pmd); goto out; - if (pmd_bad(*pmd)) { + } else if (pmd_bad(*pmd)) { pmd_ERROR(*pmd); pmd_clear(pmd); + pmd_unmap(pmd); goto out; } pte = mapped = pte_offset_map(pmd, 0xA0000); @@ -159,6 +161,7 @@ static void mark_screen_rdonly(struct ta pte++; } pte_unmap(mapped); + pmd_unmap(pmd); out: spin_unlock(&tsk->mm->page_table_lock); preempt_enable(); diff -prauN linux-2.5.71/arch/i386/mm/fault.c wli-2.5.71-7/arch/i386/mm/fault.c --- linux-2.5.71/arch/i386/mm/fault.c 2003-06-14 12:17:56.000000000 -0700 +++ wli-2.5.71-7/arch/i386/mm/fault.c 2003-06-14 20:49:04.000000000 -0700 @@ -330,8 +330,8 @@ vmalloc_fault: * and redundant with the set_pmd() on non-PAE. 
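For reference, the vm86.c conversion above (and the rest of this patch) follows one walk pattern: with pagetables allowed in highmem, pmd_offset_map() may be a kmap_atomic(), so every successful mapping must be balanced by pmd_unmap() on every exit path, with any pte mapping nested inside it. A minimal sketch of that pattern under mm->page_table_lock; the function and variable names here are illustrative only, not new API added by the patch:

#include <linux/mm.h>
#include <asm/pgtable.h>

/* Illustrative only: caller holds mm->page_table_lock. */
static struct page *example_lookup(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd = pgd_offset(mm, addr);
	pmd_t *pmd;
	pte_t *pte, entry;

	if (pgd_none(*pgd) || pgd_bad(*pgd))
		return NULL;
	pmd = pmd_offset_map(pgd, addr);	/* may kmap_atomic(..., KM_PMD0) on PAE */
	if (pmd_none(*pmd) || pmd_bad(*pmd)) {
		pmd_unmap(pmd);			/* balance the mapping on every path */
		return NULL;
	}
	pte = pte_offset_map(pmd, addr);	/* KM_PTE0 nests inside KM_PMD0 */
	entry = *pte;
	pte_unmap(pte);
	pmd_unmap(pmd);
	return pte_present(entry) ? pte_page(entry) : NULL;
}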
*/ - pmd = pmd_offset(pgd, address); - pmd_k = pmd_offset(pgd_k, address); + pmd = pmd_offset_kernel(pgd, address); + pmd_k = pmd_offset_kernel(pgd_k, address); if (!pmd_present(*pmd_k)) goto no_context; set_pmd(pmd, *pmd_k); diff -prauN linux-2.5.71/arch/i386/mm/hugetlbpage.c wli-2.5.71-7/arch/i386/mm/hugetlbpage.c --- linux-2.5.71/arch/i386/mm/hugetlbpage.c 2003-06-14 12:18:34.000000000 -0700 +++ wli-2.5.71-7/arch/i386/mm/hugetlbpage.c 2003-06-14 20:49:04.000000000 -0700 @@ -57,8 +57,8 @@ static pte_t *huge_pte_alloc(struct mm_s pmd_t *pmd = NULL; pgd = pgd_offset(mm, addr); - pmd = pmd_alloc(mm, pgd, addr); - return (pte_t *) pmd; + pmd = pmd_alloc_map(mm, pgd, addr); + return (pte_t *)pmd; } static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) @@ -67,8 +67,8 @@ static pte_t *huge_pte_offset(struct mm_ pmd_t *pmd = NULL; pgd = pgd_offset(mm, addr); - pmd = pmd_offset(pgd, addr); - return (pte_t *) pmd; + pmd = pmd_offset_map(pgd, addr); + return (pte_t *)pmd; } static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page, pte_t * page_table, int write_access) @@ -115,6 +115,8 @@ int copy_hugetlb_page_range(struct mm_st ptepage = pte_page(entry); get_page(ptepage); set_pte(dst_pte, entry); + pmd_unmap(dst_pte); + pmd_unmap_nested(src_pte); dst->rss += (HPAGE_SIZE / PAGE_SIZE); addr += HPAGE_SIZE; } @@ -152,6 +154,7 @@ follow_hugetlb_page(struct mm_struct *mm get_page(page); pages[i] = page; + pmd_unmap(pte); } if (vmas) @@ -241,6 +244,7 @@ follow_huge_pmd(struct mm_struct *mm, un page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); get_page(page); } + pmd_unmap(pmd); return page; } #endif @@ -284,6 +288,7 @@ void unmap_hugepage_range(struct vm_area page = pte_page(*pte); huge_page_release(page); pte_clear(pte); + pmd_unmap(pte); } mm->rss -= (end - start) >> PAGE_SHIFT; flush_tlb_range(vma, start, end); @@ -328,16 +333,19 @@ int hugetlb_prefault(struct address_spac page = alloc_hugetlb_page(); if (!page) { ret = -ENOMEM; + pmd_unmap(pte); goto out; } ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); unlock_page(page); if (ret) { free_huge_page(page); + pmd_unmap(pte); goto out; } } set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE); + pmd_unmap(pte); } out: spin_unlock(&mm->page_table_lock); diff -prauN linux-2.5.71/arch/i386/mm/init.c wli-2.5.71-7/arch/i386/mm/init.c --- linux-2.5.71/arch/i386/mm/init.c 2003-06-14 12:18:29.000000000 -0700 +++ wli-2.5.71-7/arch/i386/mm/init.c 2003-06-15 00:40:56.000000000 -0700 @@ -58,10 +58,10 @@ static pmd_t * __init one_md_table_init( #ifdef CONFIG_X86_PAE pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - if (pmd_table != pmd_offset(pgd, 0)) + if (pmd_table != pmd_offset_kernel(pgd, 0)) BUG(); #else - pmd_table = pmd_offset(pgd, 0); + pmd_table = pmd_offset_kernel(pgd, 0); #endif return pmd_table; @@ -112,7 +112,7 @@ static void __init page_table_range_init if (pgd_none(*pgd)) one_md_table_init(pgd); - pmd = pmd_offset(pgd, vaddr); + pmd = pmd_offset_kernel(pgd, vaddr); for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { if (pmd_none(*pmd)) one_page_table_init(pmd); @@ -193,7 +193,7 @@ pte_t *kmap_pte; pgprot_t kmap_prot; #define kmap_get_fixmap_pte(vaddr) \ - pte_offset_kernel(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)) + pte_offset_kernel(pmd_offset_kernel(pgd_offset_k(vaddr), (vaddr)), (vaddr)) void __init kmap_init(void) { @@ -217,7 +217,7 @@ void __init permanent_kmaps_init(pgd_t * 
page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); pgd = swapper_pg_dir + pgd_index(vaddr); - pmd = pmd_offset(pgd, vaddr); + pmd = pmd_offset_kernel(pgd, vaddr); pte = pte_offset_kernel(pmd, vaddr); pkmap_page_table = pte; } @@ -462,7 +462,7 @@ void __init mem_init(void) /* this will put all low memory onto the freelists */ totalram_pages += __free_all_bootmem(); - + tlb_init(); reservedpages = 0; for (tmp = 0; tmp < max_low_pfn; tmp++) /* @@ -505,20 +505,19 @@ void __init mem_init(void) #endif } -#ifdef CONFIG_X86_PAE -struct kmem_cache_s *pae_pgd_cachep; +kmem_cache_t *pgd_cache; void __init pgtable_cache_init(void) { - /* - * PAE pgds must be 16-byte aligned: - */ - pae_pgd_cachep = kmem_cache_create("pae_pgd", 32, 0, - SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, NULL, NULL); - if (!pae_pgd_cachep) - panic("init_pae(): Cannot alloc pae_pgd SLAB cache"); + pgd_cache = kmem_cache_create("pgd", + PTRS_PER_PGD*sizeof(pgd_t), + 0, + SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, + pgd_ctor, + PTRS_PER_PMD == 1 ? pgd_dtor : NULL); + if (!pgd_cache) + panic("pagetable_cache_init(): Cannot create pgd cache"); } -#endif /* * This function cannot be __init, since exceptions don't work in that diff -prauN linux-2.5.71/arch/i386/mm/ioremap.c wli-2.5.71-7/arch/i386/mm/ioremap.c --- linux-2.5.71/arch/i386/mm/ioremap.c 2003-06-14 12:18:04.000000000 -0700 +++ wli-2.5.71-7/arch/i386/mm/ioremap.c 2003-06-14 20:49:04.000000000 -0700 @@ -82,7 +82,7 @@ static int remap_area_pages(unsigned lon spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_kernel(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.5.71/arch/i386/mm/pageattr.c wli-2.5.71-7/arch/i386/mm/pageattr.c --- linux-2.5.71/arch/i386/mm/pageattr.c 2003-06-14 12:18:08.000000000 -0700 +++ wli-2.5.71-7/arch/i386/mm/pageattr.c 2003-06-15 00:09:34.000000000 -0700 @@ -19,7 +19,7 @@ static inline pte_t *lookup_address(unsi pmd_t *pmd; if (pgd_none(*pgd)) return NULL; - pmd = pmd_offset(pgd, address); + pmd = pmd_offset_kernel(pgd, address); if (pmd_none(*pmd)) return NULL; if (pmd_large(*pmd)) @@ -58,19 +58,22 @@ static void flush_kernel_map(void *dummy static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) { + struct page *page; + unsigned long flags; + set_pte_atomic(kpte, pte); /* change init_mm */ -#ifndef CONFIG_X86_PAE - { - struct list_head *l; - spin_lock(&mmlist_lock); - list_for_each(l, &init_mm.mmlist) { - struct mm_struct *mm = list_entry(l, struct mm_struct, mmlist); - pmd_t *pmd = pmd_offset(pgd_offset(mm, address), address); - set_pte_atomic((pte_t *)pmd, pte); - } - spin_unlock(&mmlist_lock); + if (PTRS_PER_PMD > 1) + return; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pmd_t *pmd; + pgd = page_address(page) + pgd_index(address); + pmd = pmd_offset_kernel(pgd, address); + set_pte_atomic((pte_t *)pmd, pte); } -#endif + spin_unlock_irqrestore(&pgd_lock, flags); } /* @@ -80,7 +83,7 @@ static void set_pmd_pte(pte_t *kpte, uns static inline void revert_page(struct page *kpte_page, unsigned long address) { pte_t *linear = (pte_t *) - pmd_offset(pgd_offset(&init_mm, address), address); + pmd_offset_kernel(pgd_offset_k(address), address); set_pmd_pte(linear, address, pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); diff -prauN linux-2.5.71/arch/i386/mm/pgtable.c wli-2.5.71-7/arch/i386/mm/pgtable.c --- linux-2.5.71/arch/i386/mm/pgtable.c 
2003-06-14 12:18:52.000000000 -0700 +++ wli-2.5.71-7/arch/i386/mm/pgtable.c 2003-06-15 00:31:34.000000000 -0700 @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -69,7 +70,7 @@ static void set_pte_pfn(unsigned long va BUG(); return; } - pmd = pmd_offset(pgd, vaddr); + pmd = pmd_offset_kernel(pgd, vaddr); if (pmd_none(*pmd)) { BUG(); return; @@ -109,7 +110,7 @@ void set_pmd_pfn(unsigned long vaddr, un printk ("set_pmd_pfn: pgd_none\n"); return; /* BUG(); */ } - pmd = pmd_offset(pgd, vaddr); + pmd = pmd_offset_kernel(pgd, vaddr); set_pmd(pmd, pfn_pmd(pfn, flags)); /* * It's enough to flush this one mapping. @@ -137,75 +138,142 @@ pte_t *pte_alloc_one_kernel(struct mm_st return pte; } -struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) +void tlb_init(void) { - struct page *pte; + int cpu; + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + int zone; + struct mmu_gather *tlb = &per_cpu(mmu_gathers, cpu); + for (zone = 0; zone < MAX_ZONE_ID; ++zone) { + INIT_LIST_HEAD(&tlb->active_list[zone]); + INIT_LIST_HEAD(&tlb->ready_list[zone]); + } + } +} -#ifdef CONFIG_HIGHPTE - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT, 0); -#else - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0); -#endif - if (pte) - clear_highpage(pte); - return pte; +static inline struct page *pte_alloc_fresh(int gfp_mask) +{ + struct page *page = alloc_page(gfp_mask); + if (page) { + clear_highpage(page); + if (TestSetPagePTE(page)) + BUG(); + } + return page; } -#ifdef CONFIG_X86_PAE +static inline int zone_high(struct zone *zone) +{ + if (!zone) + return 1; + else + return zone - zone->zone_pgdat->node_zones >= ZONE_HIGHMEM; +} -pgd_t *pgd_alloc(struct mm_struct *mm) +static inline struct page *pte_alloc_ready(int gfp_flags) { - int i; - pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL); + struct mmu_gather *tlb = &per_cpu(mmu_gathers, get_cpu()); + struct page *page = NULL; - if (pgd) { - for (i = 0; i < USER_PTRS_PER_PGD; i++) { - unsigned long pmd = __get_free_page(GFP_KERNEL); - if (!pmd) - goto out_oom; - clear_page(pmd); - set_pgd(pgd + i, __pgd(1 + __pa(pmd))); + if (tlb->nr_pte_ready) { + int z; + for (z = MAX_ZONE_ID - 1; z >= 0; --z) { + struct zone *zone = zone_table[z]; + if (!(gfp_flags & __GFP_HIGHMEM) && zone_high(zone)) + continue; + if (!list_empty(&tlb->ready_list[z])) + break; } - memcpy(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + page = list_entry(tlb->ready_list[z].next, struct page, list); + if (TestSetPagePTE(page)) + BUG(); + list_del(&page->list); + tlb->ready_count[z]--; + tlb->nr_pte_ready--; } - return pgd; -out_oom: - for (i--; i >= 0; i--) - free_page((unsigned long)__va(pgd_val(pgd[i])-1)); - kmem_cache_free(pae_pgd_cachep, pgd); - return NULL; + put_cpu(); + return page; } -void pgd_free(pgd_t *pgd) +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { - int i; + struct page *page = pte_alloc_ready(GFP_PTE); + return page ? page : pte_alloc_fresh(GFP_PTE); +} - for (i = 0; i < USER_PTRS_PER_PGD; i++) - free_page((unsigned long)__va(pgd_val(pgd[i])-1)); - kmem_cache_free(pae_pgd_cachep, pgd); +static inline struct page *__pmd_alloc_one(void) +{ + struct page *page = pte_alloc_ready(GFP_PMD); + return page ? 
page : pte_alloc_fresh(GFP_PMD); } -#else +LIST_HEAD(pgd_list); +spinlock_t pgd_lock = SPIN_LOCK_UNLOCKED; -pgd_t *pgd_alloc(struct mm_struct *mm) +void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) { - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); + unsigned long flags; + + if (PTRS_PER_PMD == 1) + spin_lock_irqsave(&pgd_lock, flags); - if (pgd) { - memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); - memcpy(pgd + USER_PTRS_PER_PGD, + memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + (PTRS_PER_PGD - USER_PTRS_PER_PGD)*sizeof(pgd_t)); + + if (PTRS_PER_PMD > 1) + return; + + list_add(&virt_to_page(pgd)->lru, &pgd_list); + spin_unlock_irqrestore(&pgd_lock, flags); + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); +} + +void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) +{ + unsigned long flags; + + spin_lock_irqsave(&pgd_lock, flags); + list_del(&virt_to_page(pgd)->lru); + spin_unlock_irqrestore(&pgd_lock, flags); +} + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + int i; + pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); + + if (PTRS_PER_PMD == 1 || !pgd) + return pgd; + + for (i = 0; i < USER_PTRS_PER_PGD; i++) { + struct page *pmd = __pmd_alloc_one(); + if (!pmd) + goto out_oom; + set_pgd(&pgd[i], __pgd(1ULL | (u64)page_to_pfn(pmd) << PAGE_SHIFT)); } + return pgd; + + /* + * This looks unusual. pte_free() is actually a convenient wrapper + * for queueing up preconstructed pmd and/or pte pages. The cases + * fall through to just queueing them in the per-cpu lists. + */ +out_oom: + for (i--; i >= 0; i--) + pte_free(pgd_page(pgd[i])); + kmem_cache_free(pgd_cache, pgd); + return NULL; } + void pgd_free(pgd_t *pgd) { - free_page((unsigned long)pgd); + if (PTRS_PER_PMD > 1) { + int i; + for (i = 0; i < USER_PTRS_PER_PGD; i++) + pte_free(pgd_page(pgd[i])); + } + kmem_cache_free(pgd_cache, pgd); } - -#endif /* CONFIG_X86_PAE */ - diff -prauN linux-2.5.71/drivers/char/drm/drm_memory.h wli-2.5.71-7/drivers/char/drm/drm_memory.h --- linux-2.5.71/drivers/char/drm/drm_memory.h 2003-06-14 12:17:59.000000000 -0700 +++ wli-2.5.71-7/drivers/char/drm/drm_memory.h 2003-06-14 20:54:58.000000000 -0700 @@ -123,7 +123,7 @@ static inline unsigned long drm_follow_page (void *vaddr) { pgd_t *pgd = pgd_offset_k((unsigned long) vaddr); - pmd_t *pmd = pmd_offset(pgd, (unsigned long) vaddr); + pmd_t *pmd = pmd_offset_kernel(pgd, (unsigned long)vaddr); pte_t *ptep = pte_offset_kernel(pmd, (unsigned long) vaddr); return pte_pfn(*ptep) << PAGE_SHIFT; } diff -prauN linux-2.5.71/fs/exec.c wli-2.5.71-7/fs/exec.c --- linux-2.5.71/fs/exec.c 2003-06-14 12:18:07.000000000 -0700 +++ wli-2.5.71-7/fs/exec.c 2003-06-14 20:49:04.000000000 -0700 @@ -304,10 +304,10 @@ void put_dirty_page(struct task_struct * if (!pte_chain) goto out_sig; spin_lock(&tsk->mm->page_table_lock); - pmd = pmd_alloc(tsk->mm, pgd, address); + pmd = pmd_alloc_map(tsk->mm, pgd, address); if (!pmd) goto out; - pte = pte_alloc_map(tsk->mm, pmd, address); + pte = pte_alloc_map(tsk->mm, &pmd, address); if (!pte) goto out; if (!pte_none(*pte)) { @@ -319,6 +319,7 @@ void put_dirty_page(struct task_struct * set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot)))); pte_chain = page_add_rmap(page, pte, pte_chain); pte_unmap(pte); + pmd_unmap(pmd); tsk->mm->rss++; spin_unlock(&tsk->mm->page_table_lock); @@ -326,6 +327,8 @@ void put_dirty_page(struct task_struct * pte_chain_free(pte_chain); return; out: + if (pmd) + pmd_unmap(pmd); 
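The put_dirty_page() conversion here shows the new allocation calling convention: pmd_alloc_map() returns a mapped pmd, and pte_alloc_map() now takes a pmd_t ** because it may drop mm->page_table_lock, unmap the pmd, and remap it (or leave *pmd NULL and unmapped on failure). A condensed sketch of the convention, with made-up names, assuming only the interfaces introduced in this patch:

/* Illustrative only: how callers are expected to use the new interfaces. */
static int example_map_one(struct mm_struct *mm, unsigned long addr, pte_t entry)
{
	pgd_t *pgd = pgd_offset(mm, addr);
	pmd_t *pmd;
	pte_t *pte;

	spin_lock(&mm->page_table_lock);
	pmd = pmd_alloc_map(mm, pgd, addr);
	if (!pmd)
		goto out_fail;
	pte = pte_alloc_map(mm, &pmd, addr);	/* may unmap/remap pmd internally */
	if (!pte)
		goto out_fail;			/* on failure pmd is NULL and already unmapped */
	set_pte(pte, entry);
	pte_unmap(pte);
	pmd_unmap(pmd);
	spin_unlock(&mm->page_table_lock);
	return 0;

out_fail:
	/*
	 * Here pmd is NULL on both failure paths, but real callers such as
	 * put_dirty_page() can reach their error label with the pmd still
	 * mapped, hence the guarded unmap.
	 */
	if (pmd)
		pmd_unmap(pmd);
	spin_unlock(&mm->page_table_lock);
	return -ENOMEM;
}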
spin_unlock(&tsk->mm->page_table_lock); out_sig: __free_page(page); diff -prauN linux-2.5.71/fs/proc/proc_misc.c wli-2.5.71-7/fs/proc/proc_misc.c --- linux-2.5.71/fs/proc/proc_misc.c 2003-06-14 12:17:58.000000000 -0700 +++ wli-2.5.71-7/fs/proc/proc_misc.c 2003-06-14 21:09:31.000000000 -0700 @@ -169,7 +169,7 @@ static int uptime_read_proc(char *page, #endif return proc_calc_metrics(page, start, off, count, eof, len); } - +extern unsigned long nr_deferred_pages(void); static int meminfo_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -217,6 +217,7 @@ static int meminfo_read_proc(char *page, "SwapFree: %8lu kB\n" "Dirty: %8lu kB\n" "Writeback: %8lu kB\n" + "Deferred: %8lu kB\n" "Mapped: %8lu kB\n" "Slab: %8lu kB\n" "Committed_AS: %8u kB\n" @@ -239,6 +240,7 @@ static int meminfo_read_proc(char *page, K(i.freeswap), K(ps.nr_dirty), K(ps.nr_writeback), + K(nr_deferred_pages()), K(ps.nr_mapped), K(ps.nr_slab), K(committed), @@ -514,11 +516,10 @@ static int ds1286_read_proc(char *page, static int locks_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - int len; - lock_kernel(); - len = get_locks_status(page, start, off, count); - unlock_kernel(); - if (len < count) *eof = 1; + int len = get_locks_status(page, start, off, count); + + if (len < count) + *eof = 1; return len; } diff -prauN linux-2.5.71/fs/proc/root.c wli-2.5.71-7/fs/proc/root.c --- linux-2.5.71/fs/proc/root.c 2003-06-14 12:18:29.000000000 -0700 +++ wli-2.5.71-7/fs/proc/root.c 2003-06-14 21:08:19.000000000 -0700 @@ -81,11 +81,13 @@ void __init proc_root_init(void) static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry) { - if (dir->i_ino == PROC_ROOT_INO) { /* check for safety... */ - lock_kernel(); + /* + * nr_threads is actually protected by the tasklist_lock; + * however, it's conventional to do reads, especially for + * reporting, without any locking whatsoever. + */ + if (dir->i_ino == PROC_ROOT_INO) /* check for safety... */ dir->i_nlink = proc_root.nlink + nr_threads; - unlock_kernel(); - } if (!proc_lookup(dir, dentry)) { return NULL; diff -prauN linux-2.5.71/include/asm-i386/kmap_types.h wli-2.5.71-7/include/asm-i386/kmap_types.h --- linux-2.5.71/include/asm-i386/kmap_types.h 2003-06-14 12:18:28.000000000 -0700 +++ wli-2.5.71-7/include/asm-i386/kmap_types.h 2003-06-14 20:49:04.000000000 -0700 @@ -17,14 +17,16 @@ D(3) KM_USER0, D(4) KM_USER1, D(5) KM_BIO_SRC_IRQ, D(6) KM_BIO_DST_IRQ, -D(7) KM_PTE0, -D(8) KM_PTE1, -D(9) KM_PTE2, -D(10) KM_IRQ0, -D(11) KM_IRQ1, -D(12) KM_SOFTIRQ0, -D(13) KM_SOFTIRQ1, -D(14) KM_TYPE_NR +D(7) KM_PMD0, +D(8) KM_PMD1, +D(9) KM_PTE0, +D(10) KM_PTE1, +D(11) KM_PTE2, +D(12) KM_IRQ0, +D(13) KM_IRQ1, +D(14) KM_SOFTIRQ0, +D(15) KM_SOFTIRQ1, +D(16) KM_TYPE_NR }; #undef D diff -prauN linux-2.5.71/include/asm-i386/pgalloc.h wli-2.5.71-7/include/asm-i386/pgalloc.h --- linux-2.5.71/include/asm-i386/pgalloc.h 2003-06-14 12:17:57.000000000 -0700 +++ wli-2.5.71-7/include/asm-i386/pgalloc.h 2003-06-14 23:29:51.000000000 -0700 @@ -31,14 +31,6 @@ static inline void pte_free_kernel(pte_t free_page((unsigned long)pte); } -static inline void pte_free(struct page *pte) -{ - __free_page(pte); -} - - -#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) - /* * allocating and freeing a pmd is trivial: the 1-entry pmd is * inside the pgd, so has no extra memory associated with it. 
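The two new kmap slots (KM_PMD0, KM_PMD1) added in kmap_types.h above exist because the fork path in mm/memory.c (converted later in this patch) can hold four atomic kmaps at once: both pmds and both ptes. A sketch of the nesting, using the same map/unmap ordering the patch uses in copy_page_range(); the function name is illustrative, not code from the patch:

#include <asm/pgtable.h>

/* Illustrative nesting on the copy (fork) path with highmem pagetables:
 *   KM_PMD0 = destination pmd, KM_PMD1 = source pmd,
 *   KM_PTE0 = destination pte, KM_PTE1 = source pte.
 * The real code holds both mms' page_table_locks while these are mapped.
 */
static void example_copy_nesting(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr)
{
	pmd_t *dst_pmd, *src_pmd;
	pte_t *dst_pte, *src_pte;

	dst_pmd = pmd_offset_map(dst_pgd, addr);		/* KM_PMD0 */
	src_pmd = pmd_offset_map_nested(src_pgd, addr);		/* KM_PMD1 */
	dst_pte = pte_offset_map(dst_pmd, addr);		/* KM_PTE0 */
	src_pte = pte_offset_map_nested(src_pmd, addr);		/* KM_PTE1 */

	/* ... copy *src_pte into *dst_pte here ... */

	pte_unmap_nested(src_pte);
	pte_unmap(dst_pte);
	pmd_unmap_nested(src_pmd);
	pmd_unmap(dst_pmd);
}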
@@ -46,10 +38,29 @@ static inline void pte_free(struct page */ #define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one_kernel(mm, addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(x) do { } while (0) #define __pmd_free_tlb(tlb,x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() #define check_pgt_cache() do { } while (0) +#include + +static inline void pte_free(struct page *page) +{ + struct mmu_gather *tlb = &per_cpu(mmu_gathers, get_cpu()); + tlb_remove_page(tlb, page); + put_cpu(); +} + +static inline void pte_free_tlb(struct mmu_gather *tlb, struct page *page) +{ + tlb_remove_page(tlb, page); +} + +static inline void pmd_free_tlb(struct mmu_gather *tlb, struct page *page) +{ +} + #endif /* _I386_PGALLOC_H */ diff -prauN linux-2.5.71/include/asm-i386/pgtable-2level.h wli-2.5.71-7/include/asm-i386/pgtable-2level.h --- linux-2.5.71/include/asm-i386/pgtable-2level.h 2003-06-14 12:18:22.000000000 -0700 +++ wli-2.5.71-7/include/asm-i386/pgtable-2level.h 2003-06-14 20:52:52.000000000 -0700 @@ -48,13 +48,15 @@ static inline int pgd_present(pgd_t pgd) #define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval) #define set_pgd(pgdptr, pgdval) (*(pgdptr) = pgdval) -#define pgd_page(pgd) \ -((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) + +#define pmd_offset_map(pgd, addr) ({ (pmd_t *)(pgd); }) +#define pmd_offset_map_nested(pgd, addr) pmd_offset_map(pgd, addr) +#define pmd_offset_kernel(pgd, addr) pmd_offset_map(pgd, addr) + +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) -static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) -{ - return (pmd_t *) dir; -} #define ptep_get_and_clear(xp) __pte(xchg(&(xp)->pte_low, 0)) #define pte_same(a, b) ((a).pte_low == (b).pte_low) #define pte_page(x) pfn_to_page(pte_pfn(x)) diff -prauN linux-2.5.71/include/asm-i386/pgtable-3level.h wli-2.5.71-7/include/asm-i386/pgtable-3level.h --- linux-2.5.71/include/asm-i386/pgtable-3level.h 2003-06-14 12:18:30.000000000 -0700 +++ wli-2.5.71-7/include/asm-i386/pgtable-3level.h 2003-06-15 00:18:28.000000000 -0700 @@ -64,12 +64,25 @@ static inline void set_pte(pte_t *ptep, */ static inline void pgd_clear (pgd_t * pgd) { } -#define pgd_page(pgd) \ -((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) + +static inline unsigned long pgd_pfn(pgd_t pgd) +{ + return pgd_val(pgd) >> PAGE_SHIFT; +} + +#define pmd_offset_kernel(pgd, addr) \ + ((pmd_t *)__va(pgd_val(*(pgd)) & PAGE_MASK) + pmd_index(addr)) /* Find an entry in the second-level page table.. 
*/ -#define pmd_offset(dir, address) ((pmd_t *) pgd_page(*(dir)) + \ - pmd_index(address)) +#define __pmd_offset(pgd, addr, type) \ + ((pmd_t *)kmap_atomic(pgd_page(*(pgd)), type) + pmd_index(addr)) + +#define pmd_offset_map(pgd, addr) __pmd_offset(pgd, addr, KM_PMD0) +#define pmd_offset_map_nested(pgd, addr) __pmd_offset(pgd, addr, KM_PMD1) + +#define pmd_unmap(pmd) kunmap_atomic(pmd, KM_PMD0); +#define pmd_unmap_nested(pmd) kunmap_atomic(pmd, KM_PMD1); static inline pte_t ptep_get_and_clear(pte_t *ptep) { @@ -123,6 +136,4 @@ static inline pmd_t pfn_pmd(unsigned lon #define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) }) #define PTE_FILE_MAX_BITS 32 -extern struct kmem_cache_s *pae_pgd_cachep; - #endif /* _I386_PGTABLE_3LEVEL_H */ diff -prauN linux-2.5.71/include/asm-i386/pgtable.h wli-2.5.71-7/include/asm-i386/pgtable.h --- linux-2.5.71/include/asm-i386/pgtable.h 2003-06-14 12:18:29.000000000 -0700 +++ wli-2.5.71-7/include/asm-i386/pgtable.h 2003-06-15 00:23:20.000000000 -0700 @@ -16,6 +16,9 @@ #include #include #include +#include +#include +#include #ifndef _I386_BITOPS_H #include @@ -31,33 +34,26 @@ extern void paging_init(void); extern unsigned long empty_zero_page[1024]; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) -#endif /* !__ASSEMBLY__ */ +extern kmem_cache_t *pgd_cache; +extern struct list_head pgd_list; +extern spinlock_t pgd_lock; +void pgtable_cache_init(void); +void pgd_ctor(void *, kmem_cache_t *, unsigned long); +void pgd_dtor(void *, kmem_cache_t *, unsigned long); /* * The Linux x86 paging architecture is 'compile-time dual-mode', it * implements both the traditional 2-level x86 page tables and the * newer 3-level PAE-mode page tables. */ -#ifndef __ASSEMBLY__ #ifdef CONFIG_X86_PAE # include - -/* - * Need to initialise the X86 PAE caches - */ -extern void pgtable_cache_init(void); - #else # include - -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - -#endif #endif +#endif /* !__ASSEMBLY__ */ + #define PMD_SIZE (1UL << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) #define PGDIR_SIZE (1UL << PGDIR_SHIFT) @@ -294,32 +290,25 @@ static inline pte_t pte_modify(pte_t pte #define pte_offset_kernel(dir, address) \ ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address)) -#if defined(CONFIG_HIGHPTE) -#define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) + pte_index(address)) -#define pte_offset_map_nested(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE1) + pte_index(address)) -#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) -#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) -#else -#define pte_offset_map(dir, address) \ - ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address)) -#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) -#define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) -#endif +#define __pte_offset(pmd, addr, type) \ + ((pte_t *)kmap_atomic(pmd_page(*pmd), type) + pte_index(addr)) -#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM4G) -typedef u32 pte_addr_t; -#endif +#define pte_offset_map(pmd, addr) __pte_offset(pmd, addr, KM_PTE0) +#define pte_offset_map_nested(pmd, addr) __pte_offset(pmd, addr, KM_PTE1) +#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) +#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) + +#ifdef CONFIG_HIGHPTE -#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM64G) +#ifdef CONFIG_HIGHMEM64G typedef u64 pte_addr_t; -#endif +#else /* CONFIG_HIGHMEM4G 
*/ +typedef u32 pte_addr_t; +#endif /* CONFIG_HIGHMEM4G */ -#if !defined(CONFIG_HIGHPTE) +#else /* !CONFIG_HIGHPTE */ typedef pte_t *pte_addr_t; -#endif +#endif /* !CONFIG_HIGHPTE */ /* * The i386 doesn't have any external MMU info: the kernel page diff -prauN linux-2.5.71/include/asm-i386/tlb.h wli-2.5.71-7/include/asm-i386/tlb.h --- linux-2.5.71/include/asm-i386/tlb.h 2003-06-14 12:18:06.000000000 -0700 +++ wli-2.5.71-7/include/asm-i386/tlb.h 2003-06-14 23:19:55.000000000 -0700 @@ -1,10 +1,53 @@ #ifndef _I386_TLB_H #define _I386_TLB_H - /* - * x86 doesn't need any special per-pte or - * per-vma handling.. + * include/asm-i386/tlb.h + * (C) June 2003 William Irwin, IBM + * Routines for pagetable cacheing and release. */ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_HIGHPTE +#define GFP_PTE (GFP_KERNEL|__GFP_REPEAT|__GFP_HIGHMEM) +#else +#define GFP_PTE (GFP_KERNEL|__GFP_REPEAT) +#endif + +#ifdef CONFIG_HIGHPMD +#define GFP_PMD (GFP_KERNEL|__GFP_REPEAT|__GFP_HIGHMEM) +#else +#define GFP_PMD (GFP_KERNEL|__GFP_REPEAT) +#endif + +#define PG_PTE PG_arch_1 +#define NR_PTE 128 +#define NR_NONPTE 512 +#define MAX_ZONE_ID (MAX_NUMNODES * MAX_NR_ZONES) + +#define PagePTE(page) test_bit(PG_PTE, &(page)->flags) +#define SetPagePTE(page) set_bit(PG_PTE, &(page)->flags) +#define ClearPagePTE(page) clear_bit(PG_PTE, &(page)->flags) +#define TestSetPagePTE(page) test_and_set_bit(PG_PTE, &(page)->flags) +#define TestClearPagePTE(page) test_and_clear_bit(PG_PTE, &(page)->flags) +#define PageZoneID(page) ((page)->flags >> ZONE_SHIFT) + +struct mmu_gather { + struct mm_struct *mm; + int nr_pte_active, nr_pte_ready, nr_nonpte, need_flush, fullmm, freed; + struct list_head active_list[MAX_ZONE_ID], ready_list[MAX_ZONE_ID]; + int active_count[MAX_ZONE_ID], ready_count[MAX_ZONE_ID]; + struct page *nonpte[NR_NONPTE]; +}; + +DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); + #define tlb_start_vma(tlb, vma) do { } while (0) #define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) @@ -15,6 +58,108 @@ */ #define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) -#include +void tlb_init(void); -#endif +static inline +struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, unsigned int flush) +{ + struct mmu_gather *tlb = &per_cpu(mmu_gathers, get_cpu()); + tlb->mm = mm; + tlb->fullmm = flush; + put_cpu(); + return tlb; +} + +static inline +void tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *pte, unsigned long addr) +{ + tlb->need_flush = 1; +} + +static inline +void tlb_flush_ready(struct mmu_gather *tlb) +{ + int zone = 0; + while (tlb->nr_pte_ready >= NR_PTE) { + if (!list_empty(&tlb->ready_list[zone])) { + struct page *head = list_entry(tlb->ready_list[zone].next, struct page, list); + list_del_init(&head->list); + list_splice_init(&tlb->ready_list[zone], &head->list); + head->private = tlb->ready_count[zone]; + tlb->nr_pte_ready -= tlb->ready_count[zone]; + tlb->ready_count[zone] = 0; + free_pages_bulk(zone_table[zone], head, 0); + } + ++zone; + } +} + +static inline +void tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) +{ + int zone; + + if (!tlb->need_flush && tlb->nr_nonpte < NR_NONPTE) + return; + + tlb->need_flush = 0; + tlb_flush(tlb); + if (tlb->nr_nonpte) { + free_pages_and_swap_cache(tlb->nonpte, tlb->nr_nonpte); + tlb->nr_nonpte = 0; + } + + for (zone = 0; zone < MAX_ZONE_ID; ++zone) { + if (!tlb->active_count[zone]) + continue; + + list_splice_init(&tlb->active_list[zone], 
&tlb->ready_list[zone]); + tlb->ready_count[zone] += tlb->active_count[zone]; + tlb->active_count[zone] = 0; + } + tlb->nr_pte_ready += tlb->nr_pte_active; + tlb->nr_pte_active = 0; + if (tlb->nr_pte_ready >= NR_PTE) + tlb_flush_ready(tlb); +} + +static inline +void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) +{ + if (tlb->mm->rss >= tlb->freed) + tlb->mm->rss -= tlb->freed; + else + tlb->mm->rss = 0; + tlb_flush_mmu(tlb, start, end); +} + +static inline +void tlb_remove_nonpte_page(struct mmu_gather *tlb, struct page *page) +{ + tlb->nonpte[tlb->nr_nonpte] = page; + tlb->nr_nonpte++; + if (tlb->nr_nonpte >= NR_NONPTE) + tlb_flush_mmu(tlb, 0, 0); +} + +static inline +void tlb_remove_pte_page(struct mmu_gather *tlb, struct page *page) +{ + int zone = PageZoneID(page); + ClearPagePTE(page); + tlb->nr_pte_active++; + tlb->active_count[zone]++; + list_add(&page->list, &tlb->active_list[zone]); +} + +static inline +void tlb_remove_page(struct mmu_gather *tlb, struct page *page) +{ + tlb->need_flush = 1; + if (PagePTE(page)) + tlb_remove_pte_page(tlb, page); + else + tlb_remove_nonpte_page(tlb, page); +} + +#endif /* _I386_TLB_H */ diff -prauN linux-2.5.71/include/linux/gfp.h wli-2.5.71-7/include/linux/gfp.h --- linux-2.5.71/include/linux/gfp.h 2003-06-14 12:18:04.000000000 -0700 +++ wli-2.5.71-7/include/linux/gfp.h 2003-06-14 23:11:54.000000000 -0700 @@ -76,6 +76,7 @@ static inline struct page * alloc_pages_ extern unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order)); extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask)); +void free_pages_bulk(struct zone *zone, struct page *page, unsigned int order); #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask),0) diff -prauN linux-2.5.71/include/linux/mm.h wli-2.5.71-7/include/linux/mm.h --- linux-2.5.71/include/linux/mm.h 2003-06-14 12:17:57.000000000 -0700 +++ wli-2.5.71-7/include/linux/mm.h 2003-06-14 23:11:08.000000000 -0700 @@ -339,9 +339,14 @@ static inline void set_page_zone(struct page->flags |= zone_num << ZONE_SHIFT; } -static inline void * lowmem_page_address(struct page *page) +#ifndef CONFIG_DISCONTIGMEM +/* The array of struct pages - for discontigmem use pgdat->lmem_map */ +extern struct page *mem_map; +#endif + +static inline void *lowmem_page_address(struct page *page) { - return __va( ( (page - page_zone(page)->zone_mem_map) + page_zone(page)->zone_start_pfn) << PAGE_SHIFT); + return __va(page_to_pfn(page) << PAGE_SHIFT); } #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) @@ -395,11 +400,6 @@ static inline int page_mapped(struct pag #define VM_FAULT_MINOR 1 #define VM_FAULT_MAJOR 2 -#ifndef CONFIG_DISCONTIGMEM -/* The array of struct pages - for discontigmem use pgdat->lmem_map */ -extern struct page *mem_map; -#endif - extern void show_free_areas(void); struct page *shmem_nopage(struct vm_area_struct * vma, @@ -423,8 +423,9 @@ int zeromap_page_range(struct vm_area_st extern int vmtruncate(struct inode * inode, loff_t offset); extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); +pmd_t *FASTCALL(__pmd_alloc_kernel(struct mm_struct *mm, pgd_t *pmd, unsigned long address)); extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); -extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); +pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t **pmd, unsigned long address)); extern int install_page(struct 
mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot); extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); extern int make_pages_present(unsigned long addr, unsigned long end); @@ -485,12 +486,11 @@ static inline int set_page_dirty(struct * inlining and the symmetry break with pte_alloc_map() that does all * of this out-of-line. */ -static inline pmd_t *pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) -{ - if (pgd_none(*pgd)) - return __pmd_alloc(mm, pgd, address); - return pmd_offset(pgd, address); -} +#define pmd_alloc_map(mm, pgd, addr) \ + (pgd_none(*(pgd))? __pmd_alloc(mm,pgd,addr): pmd_offset_map(pgd,addr)) + +#define pmd_alloc_kernel(mm, pgd, addr) \ + (pgd_none(*(pgd))? __pmd_alloc_kernel(mm,pgd,addr): pmd_offset_kernel(pgd,addr)) extern void free_area_init(unsigned long * zones_size); extern void free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, diff -prauN linux-2.5.71/include/linux/mmzone.h wli-2.5.71-7/include/linux/mmzone.h --- linux-2.5.71/include/linux/mmzone.h 2003-06-14 12:18:22.000000000 -0700 +++ wli-2.5.71-7/include/linux/mmzone.h 2003-06-14 19:29:33.000000000 -0700 @@ -26,8 +26,8 @@ #endif struct free_area { - struct list_head free_list; - unsigned long *map; + struct list_head free_list, deferred_pages; + unsigned long *map, globally_free, active, locally_free; }; struct pglist_data; diff -prauN linux-2.5.71/mm/fremap.c wli-2.5.71-7/mm/fremap.c --- linux-2.5.71/mm/fremap.c 2003-06-14 12:17:56.000000000 -0700 +++ wli-2.5.71-7/mm/fremap.c 2003-06-14 20:49:04.000000000 -0700 @@ -67,11 +67,11 @@ int install_page(struct mm_struct *mm, s pgd = pgd_offset(mm, addr); spin_lock(&mm->page_table_lock); - pmd = pmd_alloc(mm, pgd, addr); + pmd = pmd_alloc_map(mm, pgd, addr); if (!pmd) goto err_unlock; - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map(mm, &pmd, addr); if (!pte) goto err_unlock; @@ -82,6 +82,7 @@ int install_page(struct mm_struct *mm, s set_pte(pte, mk_pte(page, prot)); pte_chain = page_add_rmap(page, pte, pte_chain); pte_unmap(pte); + pmd_unmap(pmd); if (flush) flush_tlb_page(vma, addr); update_mmu_cache(vma, addr, *pte); diff -prauN linux-2.5.71/mm/memory.c wli-2.5.71-7/mm/memory.c --- linux-2.5.71/mm/memory.c 2003-06-14 12:18:09.000000000 -0700 +++ wli-2.5.71-7/mm/memory.c 2003-06-14 23:24:55.000000000 -0700 @@ -103,7 +103,8 @@ static inline void free_one_pmd(struct m static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir) { int j; - pmd_t * pmd; + pmd_t *pmd; + struct page *page; if (pgd_none(*dir)) return; @@ -112,11 +113,13 @@ static inline void free_one_pgd(struct m pgd_clear(dir); return; } - pmd = pmd_offset(dir, 0); + page = pgd_page(*dir); + pmd = pmd_offset_map(dir, 0); pgd_clear(dir); for (j = 0; j < PTRS_PER_PMD ; j++) free_one_pmd(tlb, pmd+j); - pmd_free_tlb(tlb, pmd); + pmd_unmap(pmd); + pmd_free_tlb(tlb, page); } /* @@ -136,30 +139,40 @@ void clear_page_tables(struct mmu_gather } while (--nr); } -pte_t * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +/* + * error return happens with pmd unmapped + */ +pte_t *pte_alloc_map(struct mm_struct *mm, pmd_t **pmd, unsigned long address) { - if (!pmd_present(*pmd)) { + if (!pmd_present(**pmd)) { + pgd_t *pgd; struct page *new; + pmd_unmap(*pmd); spin_unlock(&mm->page_table_lock); new = pte_alloc_one(mm, address); spin_lock(&mm->page_table_lock); - if (!new) + if (!new) { + *pmd = NULL; return NULL; + } + + pgd = 
pgd_offset(mm, address); + *pmd = pmd_offset_map(pgd, address); /* * Because we dropped the lock, we should re-check the * entry, as somebody else could have populated it.. */ - if (pmd_present(*pmd)) { + if (pmd_present(**pmd)) { pte_free(new); goto out; } pgtable_add_rmap(new, mm, address); - pmd_populate(mm, pmd, new); + pmd_populate(mm, *pmd, new); } out: - return pte_offset_map(pmd, address); + return pte_offset_map(*pmd, address); } pte_t * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address) @@ -244,10 +257,10 @@ skip_copy_pmd_range: address = (address continue; } - src_pmd = pmd_offset(src_pgd, address); - dst_pmd = pmd_alloc(dst, dst_pgd, address); + dst_pmd = pmd_alloc_map(dst, dst_pgd, address); if (!dst_pmd) goto nomem; + src_pmd = pmd_offset_map_nested(src_pgd, address); do { pte_t * src_pte, * dst_pte; @@ -261,15 +274,20 @@ skip_copy_pmd_range: address = (address pmd_clear(src_pmd); skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; - if (address >= end) + if (address >= end) { + pmd_unmap(dst_pmd); + pmd_unmap_nested(src_pmd); goto out; + } goto cont_copy_pmd_range; } - dst_pte = pte_alloc_map(dst, dst_pmd, address); + pmd_unmap_nested(src_pmd); + dst_pte = pte_alloc_map(dst, &dst_pmd, address); if (!dst_pte) goto nomem; spin_lock(&src->page_table_lock); + src_pmd = pmd_offset_map_nested(src_pgd, address); src_pte = pte_offset_map_nested(src_pmd, address); do { pte_t pte = *src_pte; @@ -336,6 +354,8 @@ skip_copy_pte_range: */ pte_unmap_nested(src_pte); pte_unmap(dst_pte); + pmd_unmap_nested(src_pmd); + pmd_unmap(dst_pmd); spin_unlock(&src->page_table_lock); spin_unlock(&dst->page_table_lock); pte_chain = pte_chain_alloc(GFP_KERNEL); @@ -343,12 +363,16 @@ skip_copy_pte_range: if (!pte_chain) goto nomem; spin_lock(&src->page_table_lock); + dst_pmd = pmd_offset_map(dst_pgd, address); + src_pmd = pmd_offset_map_nested(src_pgd, address); dst_pte = pte_offset_map(dst_pmd, address); src_pte = pte_offset_map_nested(src_pmd, address); cont_copy_pte_range_noset: address += PAGE_SIZE; if (address >= end) { + pmd_unmap(dst_pmd); + pmd_unmap_nested(src_pmd); pte_unmap_nested(src_pte); pte_unmap(dst_pte); goto out_unlock; @@ -364,6 +388,8 @@ cont_copy_pmd_range: src_pmd++; dst_pmd++; } while ((unsigned long)src_pmd & PMD_TABLE_MASK); + pmd_unmap_nested(src_pmd-1); + pmd_unmap(dst_pmd-1); } out_unlock: spin_unlock(&src->page_table_lock); @@ -439,7 +465,7 @@ zap_pmd_range(struct mmu_gather *tlb, pg pgd_clear(dir); return; } - pmd = pmd_offset(dir, address); + pmd = pmd_offset_map(dir, address); end = address + size; if (end > ((address + PGDIR_SIZE) & PGDIR_MASK)) end = ((address + PGDIR_SIZE) & PGDIR_MASK); @@ -448,6 +474,7 @@ zap_pmd_range(struct mmu_gather *tlb, pg address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address < end); + pmd_unmap(pmd - 1); } void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, @@ -629,20 +656,24 @@ follow_page(struct mm_struct *mm, unsign if (pgd_none(*pgd) || pgd_bad(*pgd)) goto out; - pmd = pmd_offset(pgd, address); + pmd = pmd_offset_map(pgd, address); if (pmd_none(*pmd)) - goto out; - if (pmd_huge(*pmd)) - return follow_huge_pmd(mm, address, pmd, write); + goto out_unmap; + if (pmd_huge(*pmd)) { + struct page *page = follow_huge_pmd(mm, address, pmd, write); + pmd_unmap(pmd); + return page; + } if (pmd_bad(*pmd)) - goto out; + goto out_unmap; ptep = pte_offset_map(pmd, address); if (!ptep) - goto out; + goto out_unmap; pte = *ptep; pte_unmap(ptep); + pmd_unmap(pmd); if (pte_present(pte)) { if 
(!write || (pte_write(pte) && pte_dirty(pte))) { pfn = pte_pfn(pte); @@ -653,6 +684,9 @@ follow_page(struct mm_struct *mm, unsign out: return NULL; +out_unmap: + pmd_unmap(pmd); + goto out; } /* @@ -711,7 +745,7 @@ int get_user_pages(struct task_struct *t pgd = pgd_offset_k(pg); if (!pgd) return i ? : -EFAULT; - pmd = pmd_offset(pgd, pg); + pmd = pmd_offset_kernel(pgd, pg); if (!pmd) return i ? : -EFAULT; pte = pte_offset_kernel(pmd, pg); @@ -813,7 +847,7 @@ static inline int zeromap_pmd_range(stru if (end > PGDIR_SIZE) end = PGDIR_SIZE; do { - pte_t * pte = pte_alloc_map(mm, pmd, address); + pte_t *pte = pte_alloc_map(mm, &pmd, address); if (!pte) return -ENOMEM; zeromap_pte_range(pte, address, end - address, prot); @@ -839,13 +873,14 @@ int zeromap_page_range(struct vm_area_st spin_lock(&mm->page_table_lock); do { - pmd_t *pmd = pmd_alloc(mm, dir, address); + pmd_t *pmd = pmd_alloc_map(mm, dir, address); error = -ENOMEM; if (!pmd) break; error = zeromap_pmd_range(mm, pmd, address, end - address, prot); if (error) break; + pmd_unmap(pmd - 1); address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); @@ -892,7 +927,7 @@ static inline int remap_pmd_range(struct end = PGDIR_SIZE; phys_addr -= address; do { - pte_t * pte = pte_alloc_map(mm, pmd, base + address); + pte_t *pte = pte_alloc_map(mm, &pmd, base + address); if (!pte) return -ENOMEM; remap_pte_range(pte, base + address, end - address, address + phys_addr, prot); @@ -920,13 +955,14 @@ int remap_page_range(struct vm_area_stru spin_lock(&mm->page_table_lock); do { - pmd_t *pmd = pmd_alloc(mm, dir, from); + pmd_t *pmd = pmd_alloc_map(mm, dir, from); error = -ENOMEM; if (!pmd) break; error = remap_pmd_range(mm, pmd, from, end - from, phys_addr + from, prot); if (error) break; + pmd_unmap(pmd); from = (from + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (from && (from < end)); @@ -996,6 +1032,7 @@ static int do_wp_page(struct mm_struct * * data, but for the moment just pretend this is OOM. */ pte_unmap(page_table); + pmd_unmap(pmd); printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", address); goto oom; @@ -1010,11 +1047,13 @@ static int do_wp_page(struct mm_struct * establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); pte_unmap(page_table); + pmd_unmap(pmd); ret = VM_FAULT_MINOR; goto out; } } pte_unmap(page_table); + pmd_unmap(pmd); /* * Ok, we need to copy. Oh, well.. @@ -1034,6 +1073,7 @@ static int do_wp_page(struct mm_struct * * Re-check the pte - we dropped the lock */ spin_lock(&mm->page_table_lock); + pmd = pmd_offset_map(pgd_offset(mm, address), address); page_table = pte_offset_map(pmd, address); if (pte_same(*page_table, pte)) { if (PageReserved(old_page)) @@ -1047,6 +1087,7 @@ static int do_wp_page(struct mm_struct * new_page = old_page; } pte_unmap(page_table); + pmd_unmap(pmd); page_cache_release(new_page); page_cache_release(old_page); ret = VM_FAULT_MINOR; @@ -1180,6 +1221,7 @@ static int do_swap_page(struct mm_struct struct pte_chain *pte_chain = NULL; pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); page = lookup_swap_cache(entry); if (!page) { @@ -1191,12 +1233,14 @@ static int do_swap_page(struct mm_struct * we released the page table lock. 
*/ spin_lock(&mm->page_table_lock); + pmd = pmd_offset_map(pgd_offset(mm, address), address); page_table = pte_offset_map(pmd, address); if (pte_same(*page_table, orig_pte)) ret = VM_FAULT_OOM; else ret = VM_FAULT_MINOR; pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); goto out; } @@ -1219,9 +1263,11 @@ static int do_swap_page(struct mm_struct * released the page table lock. */ spin_lock(&mm->page_table_lock); + pmd = pmd_offset_map(pgd_offset(mm, address), address); page_table = pte_offset_map(pmd, address); if (!pte_same(*page_table, orig_pte)) { pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); unlock_page(page); page_cache_release(page); @@ -1247,6 +1293,7 @@ static int do_swap_page(struct mm_struct /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); + pmd_unmap(pmd); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); out: @@ -1272,11 +1319,13 @@ do_anonymous_page(struct mm_struct *mm, pte_chain = pte_chain_alloc(GFP_ATOMIC); if (!pte_chain) { pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) goto no_mem; spin_lock(&mm->page_table_lock); + pmd = pmd_offset_map(pgd_offset(mm, addr), addr); page_table = pte_offset_map(pmd, addr); } @@ -1287,6 +1336,7 @@ do_anonymous_page(struct mm_struct *mm, if (write_access) { /* Allocate our own private page. */ pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); page = alloc_page(GFP_HIGHUSER); @@ -1295,9 +1345,11 @@ do_anonymous_page(struct mm_struct *mm, clear_user_highpage(page, addr); spin_lock(&mm->page_table_lock); + pmd = pmd_offset_map(pgd_offset(mm, addr), addr); page_table = pte_offset_map(pmd, addr); if (!pte_none(*page_table)) { + pmd_unmap(pmd); pte_unmap(page_table); page_cache_release(page); spin_unlock(&mm->page_table_lock); @@ -1313,6 +1365,7 @@ do_anonymous_page(struct mm_struct *mm, set_pte(page_table, entry); /* ignores ZERO_PAGE */ pte_chain = page_add_rmap(page, page_table, pte_chain); + pmd_unmap(pmd); pte_unmap(page_table); /* No need to invalidate - it was non-present before */ @@ -1353,6 +1406,7 @@ do_no_page(struct mm_struct *mm, struct return do_anonymous_page(mm, vma, page_table, pmd, write_access, address); pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0); @@ -1383,6 +1437,7 @@ do_no_page(struct mm_struct *mm, struct } spin_lock(&mm->page_table_lock); + pmd = pmd_offset_map(pgd_offset(mm, address), address); page_table = pte_offset_map(pmd, address); /* @@ -1405,9 +1460,11 @@ do_no_page(struct mm_struct *mm, struct set_pte(page_table, entry); pte_chain = page_add_rmap(new_page, page_table, pte_chain); pte_unmap(page_table); + pmd_unmap(pmd); } else { /* One of our sibling threads was faster, back out. */ pte_unmap(page_table); + pmd_unmap(pmd); page_cache_release(new_page); spin_unlock(&mm->page_table_lock); ret = VM_FAULT_MINOR; @@ -1451,6 +1508,7 @@ static int do_file_page(struct mm_struct pgoff = pte_to_pgoff(*pte); pte_unmap(pte); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0); @@ -1537,10 +1595,10 @@ int handle_mm_fault(struct mm_struct *mm * and the SMP-safe atomic PTE updates. 
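Every fault handler converted above repeats the same idiom once it has to drop mm->page_table_lock to sleep (allocate a page, swap in, call ->nopage()): the atomic pmd/pte kmaps cannot be held across the unlock, so the handler re-walks from the pgd after relocking and revalidates the pte. A condensed sketch of that idiom, with an illustrative function name; it is not code from the patch:

/* Illustrative only: the drop/reacquire idiom used by the fault handlers.
 * Entered with mm->page_table_lock held and pmd/page_table mapped.
 */
static int example_refault(struct mm_struct *mm, unsigned long address,
			   pmd_t *pmd, pte_t *page_table, pte_t orig_pte)
{
	pte_unmap(page_table);
	pmd_unmap(pmd);				/* atomic kmaps can't be held across a sleep */
	spin_unlock(&mm->page_table_lock);

	/* ... sleeping work: allocate a page, swap in, call ->nopage(), ... */

	spin_lock(&mm->page_table_lock);
	pmd = pmd_offset_map(pgd_offset(mm, address), address);
	page_table = pte_offset_map(pmd, address);
	if (!pte_same(*page_table, orig_pte)) {
		/* raced: someone else serviced the fault, back out */
		pte_unmap(page_table);
		pmd_unmap(pmd);
		spin_unlock(&mm->page_table_lock);
		return VM_FAULT_MINOR;
	}
	/* ... install the new pte, then unmap in the same order ... */
	pte_unmap(page_table);
	pmd_unmap(pmd);
	spin_unlock(&mm->page_table_lock);
	return VM_FAULT_MINOR;
}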
*/ spin_lock(&mm->page_table_lock); - pmd = pmd_alloc(mm, pgd, address); + pmd = pmd_alloc_map(mm, pgd, address); if (pmd) { - pte_t * pte = pte_alloc_map(mm, pmd, address); + pte_t *pte = pte_alloc_map(mm, &pmd, address); if (pte) return handle_pte_fault(mm, vma, address, write_access, pte, pmd); } @@ -1577,7 +1635,30 @@ pmd_t *__pmd_alloc(struct mm_struct *mm, } pgd_populate(mm, pgd, new); out: - return pmd_offset(pgd, address); + return pmd_offset_map(pgd, address); +} + +pmd_t *__pmd_alloc_kernel(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + pmd_t *new; + + spin_unlock(&mm->page_table_lock); + new = pmd_alloc_one_kernel(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (pgd_present(*pgd)) { + pmd_free(new); + goto out; + } + pgd_populate(mm, pgd, new); +out: + return pmd_offset_kernel(pgd, address); } int make_pages_present(unsigned long addr, unsigned long end) @@ -1600,7 +1681,7 @@ int make_pages_present(unsigned long add /* * Map a vmalloc()-space virtual address to the physical page. */ -struct page * vmalloc_to_page(void * vmalloc_addr) +struct page *vmalloc_to_page(void *vmalloc_addr) { unsigned long addr = (unsigned long) vmalloc_addr; struct page *page = NULL; @@ -1609,7 +1690,7 @@ struct page * vmalloc_to_page(void * vma pte_t *ptep, pte; if (!pgd_none(*pgd)) { - pmd = pmd_offset(pgd, addr); + pmd = pmd_offset_map(pgd, addr); if (!pmd_none(*pmd)) { preempt_disable(); ptep = pte_offset_map(pmd, addr); @@ -1619,6 +1700,7 @@ struct page * vmalloc_to_page(void * vma pte_unmap(ptep); preempt_enable(); } + pmd_unmap(pmd); } return page; } diff -prauN linux-2.5.71/mm/mprotect.c wli-2.5.71-7/mm/mprotect.c --- linux-2.5.71/mm/mprotect.c 2003-06-14 12:18:08.000000000 -0700 +++ wli-2.5.71-7/mm/mprotect.c 2003-06-14 20:49:04.000000000 -0700 @@ -73,7 +73,7 @@ change_pmd_range(pgd_t *pgd, unsigned lo pgd_clear(pgd); return; } - pmd = pmd_offset(pgd, address); + pmd = pmd_offset_map(pgd, address); address &= ~PGDIR_MASK; end = address + size; if (end > PGDIR_SIZE) @@ -83,6 +83,7 @@ change_pmd_range(pgd_t *pgd, unsigned lo address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); + pmd_unmap(pmd - 1); } static void diff -prauN linux-2.5.71/mm/mremap.c wli-2.5.71-7/mm/mremap.c --- linux-2.5.71/mm/mremap.c 2003-06-14 12:18:22.000000000 -0700 +++ wli-2.5.71-7/mm/mremap.c 2003-06-14 20:49:04.000000000 -0700 @@ -22,10 +22,10 @@ #include #include -static pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr) +static pte_t *get_one_pte_map_nested(struct mm_struct *mm, + unsigned long addr, pmd_t **pmd) { pgd_t *pgd; - pmd_t *pmd; pte_t *pte = NULL; pgd = pgd_offset(mm, addr); @@ -37,19 +37,22 @@ static pte_t *get_one_pte_map_nested(str goto end; } - pmd = pmd_offset(pgd, addr); - if (pmd_none(*pmd)) + *pmd = pmd_offset_map_nested(pgd, addr); + if (pmd_none(**pmd)) + pmd_unmap_nested(*pmd); goto end; - if (pmd_bad(*pmd)) { - pmd_ERROR(*pmd); - pmd_clear(pmd); + if (pmd_bad(**pmd)) { + pmd_ERROR(**pmd); + pmd_clear(*pmd); + pmd_unmap_nested(*pmd); goto end; } - pte = pte_offset_map_nested(pmd, addr); + pte = pte_offset_map_nested(*pmd, addr); if (pte_none(*pte)) { pte_unmap_nested(pte); pte = NULL; + pmd_unmap_nested(*pmd); } end: return pte; @@ -60,24 +63,26 @@ static inline int page_table_present(str { pgd_t *pgd; pmd_t *pmd; + int ret; pgd = pgd_offset(mm, addr); if (pgd_none(*pgd)) 
return 0; - pmd = pmd_offset(pgd, addr); - return pmd_present(*pmd); + pmd = pmd_offset_map(pgd, addr); + ret = pmd_present(*pmd); + pmd_unmap(pmd); + return ret; } #else #define page_table_present(mm, addr) (1) #endif -static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr) +static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr, pmd_t **pmd) { - pmd_t *pmd; pte_t *pte = NULL; - pmd = pmd_alloc(mm, pgd_offset(mm, addr), addr); - if (pmd) + *pmd = pmd_alloc_map(mm, pgd_offset(mm, addr), addr); + if (*pmd) pte = pte_alloc_map(mm, pmd, addr); return pte; } @@ -116,6 +121,7 @@ move_one_page(struct vm_area_struct *vma struct mm_struct *mm = vma->vm_mm; int error = 0; pte_t *src, *dst; + pmd_t *src_pmd = NULL, *dst_pmd = NULL; struct pte_chain *pte_chain; pte_chain = pte_chain_alloc(GFP_KERNEL); @@ -124,7 +130,7 @@ move_one_page(struct vm_area_struct *vma goto out; } spin_lock(&mm->page_table_lock); - src = get_one_pte_map_nested(mm, old_addr); + src = get_one_pte_map_nested(mm, old_addr, &src_pmd); if (src) { /* * Look to see whether alloc_one_pte_map needs to perform a @@ -133,14 +139,29 @@ move_one_page(struct vm_area_struct *vma */ if (!page_table_present(mm, new_addr)) { pte_unmap_nested(src); + if (src_pmd) { + pmd_unmap_nested(src_pmd); + src_pmd = NULL; + } src = NULL; } - dst = alloc_one_pte_map(mm, new_addr); - if (src == NULL) - src = get_one_pte_map_nested(mm, old_addr); - error = copy_one_pte(mm, src, dst, &pte_chain); - pte_unmap_nested(src); - pte_unmap(dst); + + dst = alloc_one_pte_map(mm, new_addr, &dst_pmd); + if (!src) + src = get_one_pte_map_nested(mm, old_addr, &src_pmd); + + if (dst && src) + error = copy_one_pte(mm, src, dst, &pte_chain); + else + error = -ENOMEM; + if (src) + pte_unmap_nested(src); + if (dst) + pte_unmap(dst); + if (src_pmd) + pmd_unmap_nested(src_pmd); + if (dst_pmd) + pmd_unmap(dst_pmd); } flush_tlb_page(vma, old_addr); spin_unlock(&mm->page_table_lock); diff -prauN linux-2.5.71/mm/msync.c wli-2.5.71-7/mm/msync.c --- linux-2.5.71/mm/msync.c 2003-06-14 12:18:07.000000000 -0700 +++ wli-2.5.71-7/mm/msync.c 2003-06-14 20:49:04.000000000 -0700 @@ -82,7 +82,7 @@ static inline int filemap_sync_pmd_range pgd_clear(pgd); return 0; } - pmd = pmd_offset(pgd, address); + pmd = pmd_offset_map(pgd, address); if ((address & PGDIR_MASK) != (end & PGDIR_MASK)) end = (address & PGDIR_MASK) + PGDIR_SIZE; error = 0; @@ -91,6 +91,7 @@ static inline int filemap_sync_pmd_range address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); + pmd_unmap(pmd - 1); return error; } diff -prauN linux-2.5.71/mm/page_alloc.c wli-2.5.71-7/mm/page_alloc.c --- linux-2.5.71/mm/page_alloc.c 2003-06-14 12:17:57.000000000 -0700 +++ wli-2.5.71-7/mm/page_alloc.c 2003-06-14 23:12:11.000000000 -0700 @@ -163,7 +163,7 @@ static void destroy_compound_page(struct * -- wli */ -static inline void __free_pages_bulk (struct page *page, struct page *base, +static inline void buddy_free(struct page *page, struct page *base, struct zone *zone, struct free_area *area, unsigned long mask, unsigned int order) { @@ -176,7 +176,6 @@ static inline void __free_pages_bulk (st BUG(); index = page_idx >> (1 + order); - zone->free_pages -= mask; while (mask + (1 << (MAX_ORDER-1))) { struct page *buddy1, *buddy2; @@ -197,11 +196,39 @@ static inline void __free_pages_bulk (st BUG_ON(bad_range(zone, buddy2)); list_del(&buddy1->list); mask <<= 1; + area->globally_free--; area++; index >>= 1; page_idx &= mask; } list_add(&(base + page_idx)->list, 
&area->free_list); + area->globally_free++; +} + +static inline void __free_pages_bulk(struct page *page, struct page *base, + struct zone *zone, struct free_area *area, unsigned long mask, + unsigned int order) +{ + switch (area->active - area->locally_free) { + case 0: + if (!list_empty(&area->deferred_pages)) { + struct page *defer = list_entry(area->deferred_pages.next, struct page, list); + list_del(&defer->list); + area->locally_free--; + buddy_free(defer, base, zone, area, mask, order); + } + /* fall through */ + case 1: + buddy_free(page, base, zone, area, mask, order); + break; + default: + list_add(&page->list, &area->deferred_pages); + area->locally_free++; + break; + } + if (area->active) + area->active--; + zone->free_pages += 1 << order; } static inline void free_pages_check(const char *function, struct page *page) @@ -232,40 +259,78 @@ static inline void free_pages_check(cons * And clear the zone's pages_scanned counter, to hold off the "all pages are * pinned" detection logic. */ -static int -free_pages_bulk(struct zone *zone, int count, - struct list_head *list, unsigned int order) +void free_pages_bulk(struct zone *zone, struct page *page, unsigned int order) { - unsigned long mask, flags; + unsigned long mask, flags, count; struct free_area *area; - struct page *base, *page = NULL; - int ret = 0; + struct page *base, *save; + LIST_HEAD(tmp); + + count = page->private; mask = (~0UL) << order; base = zone->zone_mem_map; area = zone->free_area + order; spin_lock_irqsave(&zone->lock, flags); zone->all_unreclaimable = 0; zone->pages_scanned = 0; - while (!list_empty(list) && count--) { - page = list_entry(list->prev, struct page, list); - /* have to delete it as __free_pages_bulk list manipulates */ - list_del(&page->list); - __free_pages_bulk(page, base, zone, area, mask, order); - ret++; + + if (order || area->active - area->locally_free <= 2*count) { + list_splice(&page->list, &tmp); + list_add(&page->list, &tmp); + page->private = 0; + } + + if (order) { + list_for_each_entry_safe(page, save, &tmp, list) { + list_del(&page->list); + __free_pages_bulk(page, base, zone, area, mask, order); + } + } else if (area->active - area->locally_free <= 2*count) { + /* + * This is a somewhat ad hoc approach to dealing with + * the interaction of gang allocation and the deferred + * coalescing heuristics. 
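For order 0, the deferral above parks freed pages on the free_area->deferred_pages list (added in the mmzone.h hunk earlier) instead of merging them immediately: each batch is headed by a page linked into deferred_pages through its ->lru, the rest of the batch hangs off the head's ->list, and head->private holds the batch size, which is what locally_free accounts. A sketch that only walks the structure, to make the layout explicit; it is not code from the patch, and a real caller would hold zone->lock:

#include <linux/mmzone.h>
#include <linux/mm.h>

/* Illustrative only: walk the order-0 deferred batches of one free_area.
 * Caller must hold zone->lock.  Returns the number of deferred pages,
 * which should equal area->locally_free for order 0.
 */
static unsigned long example_count_deferred(struct free_area *area)
{
	unsigned long pages = 0;
	struct page *head;

	list_for_each_entry(head, &area->deferred_pages, lru)
		pages += head->private;	/* batch size, including the head itself */
	return pages;
}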
+ */ + if (area->active - area->locally_free < count) { + int local = 0; + + while (local < count && area->locally_free) { + struct page *follow, *head = + list_entry(area->deferred_pages.next, struct page, lru); + list_del(&head->lru); + list_for_each_entry_safe(follow, save, &head->list, list) { + list_del(&follow->list); + buddy_free(follow, base, zone, area, mask, 0); + } + local += head->private; + area->locally_free -= head->private; + head->private = 0; + buddy_free(head, base, zone, area, mask, 0); + } + } + list_for_each_entry_safe(page, save, &tmp, list) { + list_del(&page->list); + buddy_free(page, base, zone, area, mask, order); + } + } else { + area->locally_free += count; + list_add(&page->lru, &area->deferred_pages); + } + if (!order) { + zone->free_pages += count; + area->active -= min(area->active, count); } spin_unlock_irqrestore(&zone->lock, flags); - return ret; } void __free_pages_ok(struct page *page, unsigned int order) { - LIST_HEAD(list); - mod_page_state(pgfree, 1 << order); free_pages_check(__FUNCTION__, page); - list_add(&page->list, &list); - free_pages_bulk(page_zone(page), 1, &list, order); + page->private = 1; + INIT_LIST_HEAD(&page->list); + free_pages_bulk(page_zone(page), page, order); } #define MARK_USED(index, order, area) \ @@ -278,10 +343,10 @@ expand(struct zone *zone, struct page *p unsigned long size = 1 << high; while (high > low) { - BUG_ON(bad_range(zone, page)); area--; high--; size >>= 1; + area->globally_free++; list_add(&page->list, &area->free_list); MARK_USED(index, high, area); index += size; @@ -332,7 +397,7 @@ static void prep_new_page(struct page *p * Do the hard work of removing an element from the buddy allocator. * Call me with the zone->lock already held. */ -static struct page *__rmqueue(struct zone *zone, unsigned int order) +static struct page *buddy_alloc(struct zone *zone, unsigned int order) { struct free_area * area; unsigned int current_order; @@ -346,16 +411,144 @@ static struct page *__rmqueue(struct zon page = list_entry(area->free_list.next, struct page, list); list_del(&page->list); + area->globally_free--; index = page - zone->zone_mem_map; if (current_order != MAX_ORDER-1) MARK_USED(index, current_order, area); - zone->free_pages -= 1UL << order; return expand(zone, page, index, order, current_order, area); } return NULL; } +/* + * This is bad; some way to avoid putting singleton pages on the + * deferred lists should be worked out at some point. 
+ */ +static void split_pages(struct zone *zone, struct page *page, int page_order, int deferred_order) +{ + int split_order = deferred_order - 1; + unsigned long split_offset = 1UL << split_order; + struct page *split_page; + + while (split_order >= page_order) { + split_page = &page[split_offset]; + if (split_order) + list_add(&split_page->list, + &zone->free_area[split_order].deferred_pages); + else if (!zone->free_area[split_order].locally_free) { + INIT_LIST_HEAD(&split_page->list); + split_page->private = 1; + list_add(&split_page->lru, + &zone->free_area[split_order].deferred_pages); + } else { + struct page *head; + head = list_entry(zone->free_area[split_order].deferred_pages.next, struct page, lru); + head->private++; + list_add(&split_page->list, &head->list); + } + zone->free_area[split_order].locally_free++; + --split_order; + split_offset >>= 1; + } +} + +#define COALESCE_BATCH 256 +static inline struct page *steal_deferred_page(struct zone *zone, int order) +{ + struct page *page; + struct list_head *elem; + struct free_area *area = zone->free_area; + int found_order, k; + + if (zone->free_pages < (1 << order)) + return NULL; + + /* the range of found_order precludes order 0 */ + for (found_order = order + 1; found_order < MAX_ORDER; ++found_order) + if (!list_empty(&area[found_order].deferred_pages)) { + elem = area[found_order].deferred_pages.next; + page = list_entry(elem, struct page, list); + list_del(elem); + area[found_order].locally_free--; + split_pages(zone, page, order, found_order); + return page; + } + + for (found_order = order - 1; found_order >= 0; --found_order) { + for (k = 0; k < COALESCE_BATCH; ++k) { + unsigned long mask = (~0UL) << found_order; + if (list_empty(&area[found_order].deferred_pages)) + break; + elem = area[found_order].deferred_pages.next; + if (found_order) { + page = list_entry(elem, struct page, list); + list_del(elem); + area[found_order].locally_free--; + buddy_free(page, zone->zone_mem_map, zone, &area[found_order], mask, found_order); + } else { + LIST_HEAD(tmp); + struct page *save; + + page = list_entry(elem, struct page, lru); + list_del(elem); + area[found_order].locally_free -= page->private; + page->private = 0; + list_splice(&page->list, &tmp); + list_add(&page->list, &tmp); + list_for_each_entry_safe(page, save, &tmp, list) { + list_del(&page->list); + buddy_free(page, zone->zone_mem_map, zone, &area[found_order], mask, found_order); + } + } + } + page = buddy_alloc(zone, order); + if (page) + return page; + } + return buddy_alloc(zone, order); +} + +static inline int __rmqueue(struct zone *zone, unsigned int order, struct list_head *list) +{ + struct free_area *area = &zone->free_area[order]; + struct page *page; + int count; + + if (!list_empty(&area->deferred_pages)) { + if (order) { + page = list_entry(area->deferred_pages.next, struct page, list); + list_del(&page->list); + count = 1; + } else { + page = list_entry(area->deferred_pages.next, struct page, lru); + list_del(&page->lru); + count = page->private; + page->private = 0; + list_splice(&page->list, list); + } + + area->locally_free -= count; + area->active += count; + zone->free_pages -= count << order; + } else { + page = buddy_alloc(zone, order); + if (page) + count = 1; + else { + page = steal_deferred_page(zone, order); + if (page) + count = 1; + else + return 0; + } + area->active += count; + zone->free_pages -= count << order; + } + list_add(&page->list, list); + return count; +} + /* * Obtain a specified number of elements from the buddy allocator, all 
under * a single hold of the lock, for efficiency. Add them to the supplied list. @@ -365,17 +558,14 @@ static int rmqueue_bulk(struct zone *zon unsigned long count, struct list_head *list) { unsigned long flags; - int i; - int allocated = 0; - struct page *page; + int i, j, allocated = 0; spin_lock_irqsave(&zone->lock, flags); - for (i = 0; i < count; ++i) { - page = __rmqueue(zone, order); - if (page == NULL) + for (i = 0; i < count && allocated < count; ++i) { + j = __rmqueue(zone, order, list); + if (!j) break; - allocated++; - list_add_tail(&page->list, list); + allocated += j; } spin_unlock_irqrestore(&zone->lock, flags); return allocated; @@ -420,10 +610,14 @@ void drain_local_pages(void) pset = &zone->pageset[smp_processor_id()]; for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { struct per_cpu_pages *pcp; + struct page *page, *save; pcp = &pset->pcp[i]; - pcp->count -= free_pages_bulk(zone, pcp->count, - &pcp->list, 0); + list_for_each_entry_safe(page, save, &pcp->list, lru) { + list_del(&page->lru); + pcp->count -= page->private; + free_pages_bulk(zone, page, 0); + } } } local_irq_restore(flags); @@ -439,14 +633,27 @@ static void free_hot_cold_page(struct pa struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; + struct page *head; inc_page_state(pgfree); free_pages_check(__FUNCTION__, page); pcp = &zone->pageset[get_cpu()].pcp[cold]; local_irq_save(flags); - if (pcp->count >= pcp->high) - pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); - list_add(&page->list, &pcp->list); + while (pcp->count >= pcp->high) { + struct page *free = list_entry(pcp->list.prev, struct page, lru); + list_del(&free->lru); + pcp->count -= free->private; + free_pages_bulk(zone, free, 0); + } + head = list_entry(pcp->list.next, struct page, lru); + if (!list_empty(&pcp->list) && head->private < pcp->batch) { + list_add(&page->list, &head->list); + head->private++; + } else { + INIT_LIST_HEAD(&page->list); + list_add(&page->lru, &pcp->list); + page->private = 1; + } pcp->count++; local_irq_restore(flags); put_cpu(); @@ -471,31 +678,76 @@ void free_cold_page(struct page *page) static struct page *buffered_rmqueue(struct zone *zone, int order, int cold) { unsigned long flags; - struct page *page = NULL; + struct page *head, *page = NULL; + struct per_cpu_pages *pcp = NULL; if (order == 0) { - struct per_cpu_pages *pcp; - pcp = &zone->pageset[get_cpu()].pcp[cold]; local_irq_save(flags); - if (pcp->count <= pcp->low) - pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list); + if (pcp->count <= pcp->low) { + LIST_HEAD(tmp); + int k; + + k = rmqueue_bulk(zone, 0, pcp->batch, &tmp); + if (k) { + pcp->count += k; + head = list_entry(tmp.next, struct page, list); + list_del_init(&head->list); + head->private = k; + list_splice(&tmp, &head->list); + list_add(&head->lru, &pcp->list); + } + } if (pcp->count) { - page = list_entry(pcp->list.next, struct page, list); - list_del(&page->list); + head = list_entry(pcp->list.next, struct page, lru); + WARN_ON(!head->private); + if (head->private == 1) { + list_del(&head->lru); + page = head; + page->private = 0; + } else { + page = list_entry(head->list.next, struct page, list); + list_del(&page->list); + head->private--; + } pcp->count--; } local_irq_restore(flags); put_cpu(); } - if (page == NULL) { + if (!page) { + LIST_HEAD(tmp); + int count; + + if (!order) + pcp = &zone->pageset[get_cpu()].pcp[cold]; + spin_lock_irqsave(&zone->lock, flags); - page = __rmqueue(zone, order); - spin_unlock_irqrestore(&zone->lock, flags); +
count = __rmqueue(zone, order, &tmp); + spin_unlock(&zone->lock); + + if (!list_empty(&tmp)) + page = list_entry(tmp.next, struct page, list); + + if (!order && count > 1) { + struct page *head; + + list_del(&page->list); + pcp->count += count - 1; + head = list_entry(tmp.next, struct page, list); + list_del_init(&head->list); + head->private = count - 1; + list_splice(&tmp, &head->list); + list_add(&head->lru, &pcp->list); + } + + local_irq_restore(flags); + if (order && page) prep_compound_page(page, order); + else if (!order) + put_cpu(); } if (page != NULL) { @@ -809,6 +1061,17 @@ static void show_node(struct zone *zone) #define show_node(zone) do { } while (0) #endif +unsigned long nr_deferred_pages(void) +{ + struct zone *zone; + unsigned long order, pages = 0; + + for_each_zone(zone) + for (order = 0; order < MAX_ORDER; ++order) + pages += zone->free_area[order].locally_free << order; + return pages; +} + /* * Accumulate the page_state information across all CPUs. * The result is unavoidably approximate - it can change @@ -979,8 +1242,7 @@ void show_free_areas(void) } for_each_zone(zone) { - struct list_head *elem; - unsigned long nr, flags, order, total = 0; + unsigned long order, total = 0; show_node(zone); printk("%s: ", zone->name); @@ -989,16 +1251,20 @@ void show_free_areas(void) continue; } - spin_lock_irqsave(&zone->lock, flags); + printk("buddy: "); for (order = 0; order < MAX_ORDER; order++) { - nr = 0; - list_for_each(elem, &zone->free_area[order].free_list) - ++nr; - total += nr << order; - printk("%lu*%lukB ", nr, K(1UL) << order); + printk("%lu*%lukB ", zone->free_area[order].globally_free, K(1UL) << order); + total += zone->free_area[order].globally_free << order; } - spin_unlock_irqrestore(&zone->lock, flags); - printk("= %lukB\n", K(total)); + printk("\ndefer: "); + for (order = 0; order < MAX_ORDER; order++) { + printk("%lu*%lukB ", zone->free_area[order].locally_free, K(1UL) << order); + total += zone->free_area[order].locally_free << order; + } + printk("\nactive: "); + for (order = 0; order < MAX_ORDER; order++) + printk("%lu*%lukB ", zone->free_area[order].active, K(1UL) << order); + printk("\n= %lukB\n", K(total)); } show_swap_cache_info(); @@ -1294,8 +1560,11 @@ static void __init free_area_init_core(s for (i = 0; ; i++) { unsigned long bitmap_size; - + INIT_LIST_HEAD(&zone->free_area[i].deferred_pages); INIT_LIST_HEAD(&zone->free_area[i].free_list); + zone->free_area[i].globally_free = 0; + zone->free_area[i].locally_free = 0; + zone->free_area[i].active = 0; if (i == MAX_ORDER-1) { zone->free_area[i].map = NULL; break; @@ -1401,24 +1670,22 @@ static int frag_show(struct seq_file *m, pg_data_t *pgdat = (pg_data_t *)arg; struct zone *zone; struct zone *node_zones = pgdat->node_zones; - unsigned long flags; int order; for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { if (!zone->present_pages) continue; - spin_lock_irqsave(&zone->lock, flags); - seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order < MAX_ORDER; ++order) { - unsigned long nr_bufs = 0; - struct list_head *elem; - - list_for_each(elem, &(zone->free_area[order].free_list)) - ++nr_bufs; - seq_printf(m, "%6lu ", nr_bufs); - } - spin_unlock_irqrestore(&zone->lock, flags); + seq_printf(m, "Node %d, zone %8s\n", pgdat->node_id, zone->name); + seq_puts(m, "buddy: "); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6lu ", zone->free_area[order].globally_free); + seq_puts(m, "\ndefer: "); + for (order = 0; order < MAX_ORDER; ++order) + 
seq_printf(m, "%6lu ", zone->free_area[order].locally_free); + seq_puts(m, "\nactive: "); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6lu ", zone->free_area[order].active); seq_putc(m, '\n'); } return 0; diff -prauN linux-2.5.71/mm/swapfile.c wli-2.5.71-7/mm/swapfile.c --- linux-2.5.71/mm/swapfile.c 2003-06-14 12:18:00.000000000 -0700 +++ wli-2.5.71-7/mm/swapfile.c 2003-06-14 20:49:04.000000000 -0700 @@ -444,7 +444,7 @@ static int unuse_pgd(struct vm_area_stru pgd_clear(dir); return 0; } - pmd = pmd_offset(dir, address); + pmd = pmd_offset_map(dir, address); offset = address & PGDIR_MASK; address &= ~PGDIR_MASK; end = address + size; @@ -459,6 +459,7 @@ static int unuse_pgd(struct vm_area_stru address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); + pmd_unmap(pmd - 1); return 0; } diff -prauN linux-2.5.71/mm/vmalloc.c wli-2.5.71-7/mm/vmalloc.c --- linux-2.5.71/mm/vmalloc.c 2003-06-14 12:18:23.000000000 -0700 +++ wli-2.5.71-7/mm/vmalloc.c 2003-06-14 20:49:04.000000000 -0700 @@ -70,7 +70,7 @@ static void unmap_area_pmd(pgd_t *dir, u return; } - pmd = pmd_offset(dir, address); + pmd = pmd_offset_kernel(dir, address); address &= ~PGDIR_MASK; end = address + size; if (end > PGDIR_SIZE) @@ -159,7 +159,7 @@ int map_vm_area(struct vm_struct *area, dir = pgd_offset_k(address); spin_lock(&init_mm.page_table_lock); do { - pmd_t *pmd = pmd_alloc(&init_mm, dir, address); + pmd_t *pmd = pmd_alloc_kernel(&init_mm, dir, address); if (!pmd) { err = -ENOMEM; break; diff -prauN linux-2.5.71/net/core/flow.c wli-2.5.71-7/net/core/flow.c --- linux-2.5.71/net/core/flow.c 2003-06-14 12:17:56.000000000 -0700 +++ wli-2.5.71-7/net/core/flow.c 2003-06-14 20:00:58.000000000 -0700 @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include