diff -prauN linux-2.6.0-test11/Documentation/filesystems/Locking wli-2.6.0-test11-30/Documentation/filesystems/Locking --- linux-2.6.0-test11/Documentation/filesystems/Locking 2003-11-26 12:45:30.000000000 -0800 +++ wli-2.6.0-test11-30/Documentation/filesystems/Locking 2003-12-04 08:43:29.000000000 -0800 @@ -204,7 +204,7 @@ currently-in-progress I/O. If the filesystem is not called for "sync" and it determines that it would need to block against in-progress I/O to be able to start new I/O against the page the filesystem shoud redirty the page (usually with -__set_page_dirty_nobuffers()), then unlock the page and return zero. +set_page_dirty_nobuffers()), then unlock the page and return zero. This may also be done to avoid internal deadlocks, but rarely. If the filesytem is called for sync then it must wait on any @@ -420,7 +420,7 @@ transfer: no prototypes: void (*open)(struct vm_area_struct*); void (*close)(struct vm_area_struct*); - struct page *(*nopage)(struct vm_area_struct*, unsigned long, int); + struct page *(*nopage)(struct vm_area_struct*, unsigned long, int *); locking rules: BKL mmap_sem diff -prauN linux-2.6.0-test11/Documentation/vm/locking wli-2.6.0-test11-30/Documentation/vm/locking --- linux-2.6.0-test11/Documentation/vm/locking 2003-11-26 12:43:30.000000000 -0800 +++ wli-2.6.0-test11-30/Documentation/vm/locking 2003-12-04 05:51:48.000000000 -0800 @@ -66,7 +66,7 @@ in some cases it is not really needed. E expand_stack(), it is hard to come up with a destructive scenario without having the vmlist protection in this case. -The page_table_lock nests with the inode i_shared_sem and the kmem cache +The page_table_lock nests with the inode i_shared_lock and the kmem cache c_spinlock spinlocks. This is okay, since the kmem code asks for pages after dropping c_spinlock. The page_table_lock also nests with pagecache_lock and pagemap_lru_lock spinlocks, and no code asks for memory with these locks diff -prauN linux-2.6.0-test11/arch/alpha/kernel/process.c wli-2.6.0-test11-30/arch/alpha/kernel/process.c --- linux-2.6.0-test11/arch/alpha/kernel/process.c 2003-11-26 12:45:29.000000000 -0800 +++ wli-2.6.0-test11-30/arch/alpha/kernel/process.c 2003-12-04 08:35:58.000000000 -0800 @@ -513,11 +513,6 @@ thread_saved_pc(task_t *t) /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); -#define first_sched ((unsigned long) scheduling_functions_start_here) -#define last_sched ((unsigned long) scheduling_functions_end_here) - unsigned long get_wchan(struct task_struct *p) { @@ -536,7 +531,8 @@ get_wchan(struct task_struct *p) */ pc = thread_saved_pc(p); - if (pc >= first_sched && pc < last_sched) { + if (pc >= scheduling_functions_start_here && + pc < scheduling_functions_end_here) { schedule_frame = ((unsigned long *)p->thread_info->pcb.ksp)[6]; return ((unsigned long *)schedule_frame)[12]; } diff -prauN linux-2.6.0-test11/arch/alpha/kernel/semaphore.c wli-2.6.0-test11-30/arch/alpha/kernel/semaphore.c --- linux-2.6.0-test11/arch/alpha/kernel/semaphore.c 2003-11-26 12:44:12.000000000 -0800 +++ wli-2.6.0-test11-30/arch/alpha/kernel/semaphore.c 2003-12-04 08:35:58.000000000 -0800 @@ -7,6 +7,7 @@ #include #include +#include /* * Semaphores are implemented using a two-way counter: @@ -52,7 +53,7 @@ * Either form may be used in conjunction with "up()". 
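A change repeated across every architecture in this patch: the old scheme located scheduler code by declaring dummy functions scheduling_functions_start_here()/scheduling_functions_end_here() around it and relying on link order, while the new scheme tags each sleeping primitive __sched, collects them into a dedicated .sched.text section, and brackets that section with linker-script symbols, so get_wchan()'s "is this saved PC inside scheduler code?" test no longer depends on function layout. A minimal user-space sketch of the same bracketing trick, assuming GNU ld's automatic __start_/__stop_ symbols (which require a section name that is a C identifier, hence sched_text rather than .sched.text):

#include <stdio.h>

/*
 * GNU ld synthesizes __start_<name>/__stop_<name> for any section whose
 * name is a valid C identifier; the vmlinux.lds.S hunks in this patch
 * define the equivalent bracketing symbols for .sched.text by hand.
 */
extern char __start_sched_text[], __stop_sched_text[];

__attribute__((section("sched_text"), noinline))
static void fake_schedule(void)
{
	__asm__ volatile ("");	/* stand-in for schedule(), __down(), ... */
}

static int in_sched_text(unsigned long pc)
{
	return pc >= (unsigned long)__start_sched_text &&
	       pc <  (unsigned long)__stop_sched_text;
}

int main(void)
{
	printf("fake_schedule bracketed: %d\n",
	       in_sched_text((unsigned long)fake_schedule));
	printf("main bracketed:          %d\n",
	       in_sched_text((unsigned long)main));
	return 0;
}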
*/ -void +__sched void __down_failed(struct semaphore *sem) { DECLARE_WAITQUEUE(wait, current); @@ -103,7 +104,7 @@ __down_failed(struct semaphore *sem) #endif } -int +__sched int __down_failed_interruptible(struct semaphore *sem) { DECLARE_WAITQUEUE(wait, current); @@ -201,7 +202,7 @@ __up_wakeup(struct semaphore *sem) wake_up(&sem->wait); } -void +__sched void down(struct semaphore *sem) { #if WAITQUEUE_DEBUG @@ -215,7 +216,7 @@ down(struct semaphore *sem) __down(sem); } -int +__sched int down_interruptible(struct semaphore *sem) { #if WAITQUEUE_DEBUG diff -prauN linux-2.6.0-test11/arch/alpha/kernel/vmlinux.lds.S wli-2.6.0-test11-30/arch/alpha/kernel/vmlinux.lds.S --- linux-2.6.0-test11/arch/alpha/kernel/vmlinux.lds.S 2003-11-26 12:44:14.000000000 -0800 +++ wli-2.6.0-test11-30/arch/alpha/kernel/vmlinux.lds.S 2003-12-04 08:35:58.000000000 -0800 @@ -17,6 +17,9 @@ SECTIONS _text = .; /* Text and read-only data */ .text : { *(.text) + __scheduling_functions_start_here = .; + *(.sched.text) + __scheduling_functions_end_here = .; *(.fixup) *(.gnu.warning) } :kernel diff -prauN linux-2.6.0-test11/arch/alpha/mm/remap.c wli-2.6.0-test11-30/arch/alpha/mm/remap.c --- linux-2.6.0-test11/arch/alpha/mm/remap.c 2003-11-26 12:42:52.000000000 -0800 +++ wli-2.6.0-test11-30/arch/alpha/mm/remap.c 2003-12-03 18:20:41.000000000 -0800 @@ -73,7 +73,7 @@ __alpha_remap_area_pages(unsigned long a spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_kernel(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.6.0-test11/arch/arm/kernel/process.c wli-2.6.0-test11-30/arch/arm/kernel/process.c --- linux-2.6.0-test11/arch/arm/kernel/process.c 2003-11-26 12:45:33.000000000 -0800 +++ wli-2.6.0-test11-30/arch/arm/kernel/process.c 2003-12-04 08:35:58.000000000 -0800 @@ -415,11 +415,6 @@ pid_t kernel_thread(int (*fn)(void *), v /* * These bracket the sleeping functions.. 
*/ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); -#define first_sched ((unsigned long) scheduling_functions_start_here) -#define last_sched ((unsigned long) scheduling_functions_end_here) - unsigned long get_wchan(struct task_struct *p) { unsigned long fp, lr; @@ -434,7 +429,8 @@ unsigned long get_wchan(struct task_stru if (fp < stack_page || fp > 4092+stack_page) return 0; lr = pc_pointer (((unsigned long *)fp)[-1]); - if (lr < first_sched || lr > last_sched) + if (lr < scheduling_functions_start_here || + lr > scheduling_functions_end_here) return lr; fp = *(unsigned long *) (fp - 12); } while (count ++ < 16); diff -prauN linux-2.6.0-test11/arch/arm/kernel/semaphore.c wli-2.6.0-test11-30/arch/arm/kernel/semaphore.c --- linux-2.6.0-test11/arch/arm/kernel/semaphore.c 2003-11-26 12:43:40.000000000 -0800 +++ wli-2.6.0-test11-30/arch/arm/kernel/semaphore.c 2003-12-04 08:35:58.000000000 -0800 @@ -13,6 +13,7 @@ */ #include #include +#include #include @@ -54,7 +55,7 @@ void __up(struct semaphore *sem) static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED; -void __down(struct semaphore * sem) +__sched void __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -87,7 +88,7 @@ void __down(struct semaphore * sem) wake_up(&sem->wait); } -int __down_interruptible(struct semaphore * sem) +__sched int __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -176,7 +177,8 @@ int __down_trylock(struct semaphore * se * registers (r0 to r3 and lr), but not ip, as we use it as a return * value in some cases.. */ -asm(" .align 5 \n\ +asm(" .section .sched.text \n\ + .align 5 \n\ .globl __down_failed \n\ __down_failed: \n\ stmfd sp!, {r0 - r3, lr} \n\ diff -prauN linux-2.6.0-test11/arch/arm/kernel/vmlinux.lds.S wli-2.6.0-test11-30/arch/arm/kernel/vmlinux.lds.S --- linux-2.6.0-test11/arch/arm/kernel/vmlinux.lds.S 2003-11-26 12:43:34.000000000 -0800 +++ wli-2.6.0-test11-30/arch/arm/kernel/vmlinux.lds.S 2003-12-04 08:35:58.000000000 -0800 @@ -73,6 +73,9 @@ SECTIONS .text : { /* Real text segment */ _text = .; /* Text and read-only data */ *(.text) + __scheduling_functions_start_here = .; + *(.sched.text) + __scheduling_functions_end_here = .; *(.fixup) *(.gnu.warning) *(.rodata) diff -prauN linux-2.6.0-test11/arch/arm/mm/consistent.c wli-2.6.0-test11-30/arch/arm/mm/consistent.c --- linux-2.6.0-test11/arch/arm/mm/consistent.c 2003-11-26 12:45:10.000000000 -0800 +++ wli-2.6.0-test11-30/arch/arm/mm/consistent.c 2003-12-03 18:20:41.000000000 -0800 @@ -327,7 +327,7 @@ static int __init consistent_init(void) do { pgd = pgd_offset(&init_mm, CONSISTENT_BASE); - pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE); + pmd = pmd_alloc_kernel(&init_mm, pgd, CONSISTENT_BASE); if (!pmd) { printk(KERN_ERR "consistent_init: no pmd tables\n"); ret = -ENOMEM; diff -prauN linux-2.6.0-test11/arch/arm/mm/fault-armv.c wli-2.6.0-test11-30/arch/arm/mm/fault-armv.c --- linux-2.6.0-test11/arch/arm/mm/fault-armv.c 2003-11-26 12:43:25.000000000 -0800 +++ wli-2.6.0-test11-30/arch/arm/mm/fault-armv.c 2003-12-04 06:13:40.000000000 -0800 @@ -191,19 +191,22 @@ void __flush_dcache_page(struct page *pa __cpuc_flush_dcache_page(page_address(page)); - if (!page->mapping) + if (!page_mapping(page)) return; /* * With a VIVT cache, we need to also write back * and invalidate any user data. 
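The fault-armv.c hunks just below walk i_mmap_shared with list_for_each_rcu() instead of list_for_each() and skip any vma flagged VM_DEAD: once readers traverse the list without the sleeping i_shared lock, a vma that is mid-unlink can still be reachable, so the writer marks it dead and readers treat the flag as a tombstone. A toy of the reader side only, using a plain singly linked list and no real grace periods (the VM_DEAD bit value here is illustrative):

#include <stdio.h>

#define VM_DEAD 0x1UL	/* illustrative bit */

struct vma {
	unsigned long vm_flags;
	struct vma *next;
};

/*
 * The real walk runs list_for_each_rcu() under the RCU read lock, and
 * the unlinking writer sets VM_DEAD before the vma goes away, so a
 * concurrent reader never operates on a half-dead mapping.
 */
static unsigned long count_live(struct vma *head)
{
	unsigned long n = 0;
	struct vma *v;

	for (v = head; v; v = v->next) {
		if (v->vm_flags & VM_DEAD)
			continue;	/* tombstone: linked, logically gone */
		n++;
	}
	return n;
}

int main(void)
{
	struct vma c = { VM_DEAD, NULL };
	struct vma b = { 0, &c };
	struct vma a = { 0, &b };

	printf("live vmas: %lu\n", count_live(&a));	/* prints 2 */
	return 0;
}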
*/ - list_for_each(l, &page->mapping->i_mmap_shared) { + list_for_each_rcu(l, &page_mapping(page)->i_mmap_shared) { struct vm_area_struct *mpnt; unsigned long off; mpnt = list_entry(l, struct vm_area_struct, shared); + if (mpnt->vm_flags & VM_DEAD) + continue; + /* * If this VMA is not in our MM, we can ignore it. */ @@ -234,12 +237,15 @@ make_coherent(struct vm_area_struct *vma * space, then we need to handle them specially to maintain * cache coherency. */ - list_for_each(l, &page->mapping->i_mmap_shared) { + list_for_each_rcu(l, &page_mapping(page)->i_mmap_shared) { struct vm_area_struct *mpnt; unsigned long off; mpnt = list_entry(l, struct vm_area_struct, shared); + if (mpnt->vm_flags & VM_DEAD) + continue; + /* * If this VMA is not in our MM, we can ignore it. * Note that we intentionally don't mask out the VMA @@ -292,7 +298,7 @@ void update_mmu_cache(struct vm_area_str if (!pfn_valid(pfn)) return; page = pfn_to_page(pfn); - if (page->mapping) { + if (page_mapping(page)) { int dirty = test_and_clear_bit(PG_dcache_dirty, &page->flags); if (dirty) diff -prauN linux-2.6.0-test11/arch/arm/mm/ioremap.c wli-2.6.0-test11-30/arch/arm/mm/ioremap.c --- linux-2.6.0-test11/arch/arm/mm/ioremap.c 2003-11-26 12:44:33.000000000 -0800 +++ wli-2.6.0-test11-30/arch/arm/mm/ioremap.c 2003-12-03 18:20:41.000000000 -0800 @@ -95,7 +95,7 @@ static int remap_area_pages(unsigned lon spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_kernel(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.6.0-test11/arch/arm/mm/minicache.c wli-2.6.0-test11-30/arch/arm/mm/minicache.c --- linux-2.6.0-test11/arch/arm/mm/minicache.c 2003-11-26 12:44:52.000000000 -0800 +++ wli-2.6.0-test11-30/arch/arm/mm/minicache.c 2003-12-03 18:20:41.000000000 -0800 @@ -59,7 +59,7 @@ static int __init minicache_init(void) spin_lock(&init_mm.page_table_lock); pgd = pgd_offset_k(minicache_address); - pmd = pmd_alloc(&init_mm, pgd, minicache_address); + pmd = pmd_alloc_kernel(&init_mm, pgd, minicache_address); if (!pmd) BUG(); minicache_pte = pte_alloc_kernel(&init_mm, pmd, minicache_address); diff -prauN linux-2.6.0-test11/arch/arm/mm/mm-armv.c wli-2.6.0-test11-30/arch/arm/mm/mm-armv.c --- linux-2.6.0-test11/arch/arm/mm/mm-armv.c 2003-11-26 12:44:34.000000000 -0800 +++ wli-2.6.0-test11-30/arch/arm/mm/mm-armv.c 2003-12-03 18:20:41.000000000 -0800 @@ -132,7 +132,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm if (vectors_base() == 0) { /* - * This lock is here just to satisfy pmd_alloc and pte_lock + * This lock is here just to satisfy pmd_alloc_map() and pte_lock */ spin_lock(&mm->page_table_lock); @@ -140,20 +140,22 @@ pgd_t *get_pgd_slow(struct mm_struct *mm * On ARM, first page must always be allocated since it * contains the machine vectors. 
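The get_pgd_slow() conversion just below is typical of the whole series: pmd_alloc_map() hands back a temporarily mapped (potentially highmem) pmd page, so every exit path, including the success path, now needs a matching pmd_unmap()/pmd_unmap_nested(). The control flow is the usual goto unwind; a condensed sketch with hypothetical map_thing()/unmap_thing() standing in for the pmd/pte calls:

#include <stdlib.h>

struct thing { int payload; };

/* hypothetical stand-ins for pmd_alloc_map()/pte_alloc_map()/pmd_unmap() */
static struct thing *map_thing(void) { return malloc(sizeof(struct thing)); }
static void unmap_thing(struct thing *t) { free(t); }
static int use_thing(struct thing *t) { t->payload = 1; return 0; }

static int setup(void)
{
	struct thing *a, *b;
	int err = -1;

	a = map_thing();
	if (!a)
		goto no_a;
	b = map_thing();
	if (!b)
		goto no_b;	/* a must still be unmapped */

	err = use_thing(a) | use_thing(b);

	unmap_thing(b);	/* success unmaps too: these are short-lived
			   kmap-style references, not ownership */
no_b:
	unmap_thing(a);
no_a:
	return err;
}

int main(void)
{
	return setup();
}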
*/ - new_pmd = pmd_alloc(mm, new_pgd, 0); + new_pmd = pmd_alloc_map(mm, new_pgd, 0); if (!new_pmd) goto no_pmd; - new_pte = pte_alloc_map(mm, new_pmd, 0); - if (!new_pte) + new_pte = pte_alloc_map(mm, new_pgd, &new_pmd, 0); + if (!new_pte) { + pmd_unmap(new_pmd); goto no_pte; + } init_pmd = pmd_offset(init_pgd, 0); init_pte = pte_offset_map_nested(init_pmd, 0); set_pte(new_pte, *init_pte); pte_unmap_nested(init_pte); pte_unmap(new_pte); - + pmd_unmap(new_pmd); spin_unlock(&mm->page_table_lock); } diff -prauN linux-2.6.0-test11/arch/arm26/kernel/process.c wli-2.6.0-test11-30/arch/arm26/kernel/process.c --- linux-2.6.0-test11/arch/arm26/kernel/process.c 2003-11-26 12:45:31.000000000 -0800 +++ wli-2.6.0-test11-30/arch/arm26/kernel/process.c 2003-12-04 08:35:58.000000000 -0800 @@ -400,11 +400,6 @@ pid_t kernel_thread(int (*fn)(void *), v /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); -#define first_sched ((unsigned long) scheduling_functions_start_here) -#define last_sched ((unsigned long) scheduling_functions_end_here) - unsigned long get_wchan(struct task_struct *p) { unsigned long fp, lr; @@ -419,7 +414,8 @@ unsigned long get_wchan(struct task_stru if (fp < stack_page || fp > 4092+stack_page) return 0; lr = pc_pointer (((unsigned long *)fp)[-1]); - if (lr < first_sched || lr > last_sched) + if (lr < scheduling_functions_start_here || + lr > scheduling_functions_end_here) return lr; fp = *(unsigned long *) (fp - 12); } while (count ++ < 16); diff -prauN linux-2.6.0-test11/arch/arm26/kernel/semaphore.c wli-2.6.0-test11-30/arch/arm26/kernel/semaphore.c --- linux-2.6.0-test11/arch/arm26/kernel/semaphore.c 2003-11-26 12:43:27.000000000 -0800 +++ wli-2.6.0-test11-30/arch/arm26/kernel/semaphore.c 2003-12-04 08:35:58.000000000 -0800 @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -56,7 +57,7 @@ void __up(struct semaphore *sem) static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED; -void __down(struct semaphore * sem) +__sched void __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -89,7 +90,7 @@ void __down(struct semaphore * sem) wake_up(&sem->wait); } -int __down_interruptible(struct semaphore * sem) +__sched int __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -178,7 +179,8 @@ int __down_trylock(struct semaphore * se * registers (r0 to r3 and lr), but not ip, as we use it as a return * value in some cases.. 
*/ -asm(" .align 5 \n\ +asm(" .section .sched.text \n\ + .align 5 \n\ .globl __down_failed \n\ __down_failed: \n\ stmfd sp!, {r0 - r3, lr} \n\ diff -prauN linux-2.6.0-test11/arch/arm26/kernel/vmlinux-arm26-xip.lds.in wli-2.6.0-test11-30/arch/arm26/kernel/vmlinux-arm26-xip.lds.in --- linux-2.6.0-test11/arch/arm26/kernel/vmlinux-arm26-xip.lds.in 2003-11-26 12:45:25.000000000 -0800 +++ wli-2.6.0-test11-30/arch/arm26/kernel/vmlinux-arm26-xip.lds.in 2003-12-04 08:35:58.000000000 -0800 @@ -66,6 +66,9 @@ SECTIONS .text : { /* Real text segment */ _text = .; /* Text and read-only data */ *(.text) + __scheduling_functions_start_here = .; + *(.sched.text) + __scheduling_functions_end_here = .; *(.fixup) *(.gnu.warning) *(.rodata) diff -prauN linux-2.6.0-test11/arch/arm26/kernel/vmlinux-arm26.lds.in wli-2.6.0-test11-30/arch/arm26/kernel/vmlinux-arm26.lds.in --- linux-2.6.0-test11/arch/arm26/kernel/vmlinux-arm26.lds.in 2003-11-26 12:42:58.000000000 -0800 +++ wli-2.6.0-test11-30/arch/arm26/kernel/vmlinux-arm26.lds.in 2003-12-04 08:35:58.000000000 -0800 @@ -67,6 +67,9 @@ SECTIONS .text : { /* Real text segment */ _text = .; /* Text and read-only data */ *(.text) + __scheduling_functions_start_here = .; + *(.sched.text) + __scheduling_functions_end_here = .; *(.fixup) *(.gnu.warning) *(.rodata) diff -prauN linux-2.6.0-test11/arch/arm26/mm/mm-memc.c wli-2.6.0-test11-30/arch/arm26/mm/mm-memc.c --- linux-2.6.0-test11/arch/arm26/mm/mm-memc.c 2003-11-26 12:45:48.000000000 -0800 +++ wli-2.6.0-test11-30/arch/arm26/mm/mm-memc.c 2003-12-03 18:20:41.000000000 -0800 @@ -79,7 +79,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm goto no_pgd; /* - * This lock is here just to satisfy pmd_alloc and pte_lock + * This lock is here just to satisfy pmd_alloc_kernel() and pte_lock * FIXME: I bet we could avoid taking it pretty much altogether */ spin_lock(&mm->page_table_lock); @@ -88,7 +88,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm * On ARM, first page must always be allocated since it contains * the machine vectors. 
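Several flavors of pmd accessor now coexist in the patch: pmd_alloc_kernel()/pmd_offset_kernel() for init_mm, whose pmds always sit in lowmem, versus pmd_alloc_map()/pmd_offset_map() for user mms, whose pmds may live in highmem under CONFIG_HIGHPMD and must be kmapped for the duration of the walk. On non-PAE i386 the pmd level is folded away entirely, which a toy walk makes concrete (table sizes are illustrative and pointers replace physical addresses):

#include <stdio.h>
#include <stdlib.h>

#define PGDIR_SHIFT	22
#define PAGE_SHIFT	12
#define PTRS_PER_PGD	1024
#define PTRS_PER_PTE	1024

typedef struct { unsigned long *pte_page; } pgd_t;
typedef pgd_t pmd_t;	/* folded: one "pmd" per pgd slot */

static pmd_t *pmd_offset_toy(pgd_t *pgd, unsigned long addr)
{
	(void)addr;
	return (pmd_t *)pgd;	/* non-PAE i386 folds exactly like this */
}

static unsigned long *pte_offset_toy(pmd_t *pmd, unsigned long addr)
{
	return &pmd->pte_page[(addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)];
}

int main(void)
{
	pgd_t *pgd_table = calloc(PTRS_PER_PGD, sizeof(pgd_t));
	unsigned long addr = 0xA0000;	/* the VGA window vm86.c walks */
	pgd_t *pgd = &pgd_table[addr >> PGDIR_SHIFT];

	pgd->pte_page = calloc(PTRS_PER_PTE, sizeof(unsigned long));
	*pte_offset_toy(pmd_offset_toy(pgd, addr), addr) = 0x1234;
	printf("pte for %#lx = %#lx\n", addr,
	       *pte_offset_toy(pmd_offset_toy(pgd, addr), addr));
	return 0;
}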
*/ - new_pmd = pmd_alloc(mm, new_pgd, 0); + new_pmd = pmd_alloc_kernel(mm, new_pgd, 0); if (!new_pmd) goto no_pmd; diff -prauN linux-2.6.0-test11/arch/cris/arch-v10/vmlinux.lds.S wli-2.6.0-test11-30/arch/cris/arch-v10/vmlinux.lds.S --- linux-2.6.0-test11/arch/cris/arch-v10/vmlinux.lds.S 2003-11-26 12:43:40.000000000 -0800 +++ wli-2.6.0-test11-30/arch/cris/arch-v10/vmlinux.lds.S 2003-12-04 08:35:58.000000000 -0800 @@ -25,6 +25,9 @@ SECTIONS __stext = .; .text : { *(.text) + __scheduling_functions_start_here = .; + *(.sched.text) + __scheduling_functions_end_here = .; *(.fixup) *(.text.__*) } diff -prauN linux-2.6.0-test11/arch/cris/kernel/semaphore.c wli-2.6.0-test11-30/arch/cris/kernel/semaphore.c --- linux-2.6.0-test11/arch/cris/kernel/semaphore.c 2003-11-26 12:43:33.000000000 -0800 +++ wli-2.6.0-test11-30/arch/cris/kernel/semaphore.c 2003-12-04 08:35:58.000000000 -0800 @@ -4,6 +4,7 @@ */ #include +#include #include /* @@ -94,7 +95,7 @@ void __up(struct semaphore *sem) tsk->state = TASK_RUNNING; \ remove_wait_queue(&sem->wait, &wait); -void __down(struct semaphore * sem) +__sched void __down(struct semaphore * sem) { DOWN_VAR DOWN_HEAD(TASK_UNINTERRUPTIBLE) @@ -104,7 +105,7 @@ void __down(struct semaphore * sem) DOWN_TAIL(TASK_UNINTERRUPTIBLE) } -int __down_interruptible(struct semaphore * sem) +__sched int __down_interruptible(struct semaphore * sem) { int ret = 0; DOWN_VAR diff -prauN linux-2.6.0-test11/arch/cris/mm/ioremap.c wli-2.6.0-test11-30/arch/cris/mm/ioremap.c --- linux-2.6.0-test11/arch/cris/mm/ioremap.c 2003-11-26 12:42:58.000000000 -0800 +++ wli-2.6.0-test11-30/arch/cris/mm/ioremap.c 2003-12-03 18:20:41.000000000 -0800 @@ -78,7 +78,7 @@ static int remap_area_pages(unsigned lon spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_kernel(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.6.0-test11/arch/h8300/kernel/process.c wli-2.6.0-test11-30/arch/h8300/kernel/process.c --- linux-2.6.0-test11/arch/h8300/kernel/process.c 2003-11-26 12:42:47.000000000 -0800 +++ wli-2.6.0-test11-30/arch/h8300/kernel/process.c 2003-12-04 08:35:58.000000000 -0800 @@ -264,11 +264,6 @@ out: /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); -#define first_sched ((unsigned long) scheduling_functions_start_here) -#define last_sched ((unsigned long) scheduling_functions_end_here) - unsigned long thread_saved_pc(struct task_struct *tsk) { return ((struct pt_regs *)tsk->thread.esp0)->pc; @@ -289,8 +284,8 @@ unsigned long get_wchan(struct task_stru fp >= 8184+stack_page) return 0; pc = ((unsigned long *)fp)[1]; - /* FIXME: This depends on the order of these functions. 
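Each arch's get_wchan() (the h8300 copy is mid-hunk here) performs the same bounded walk: start from the sleeping task's saved frame pointer, read the return address stored next to each saved fp, stop at the first PC outside the scheduler bracket, and give up after 16 frames or as soon as fp leaves the stack page. The walk itself, run against the current thread with GCC builtins (assumes frame pointers, e.g. -fno-omit-frame-pointer, and may stop early depending on the C runtime's outermost frames):

#include <stdio.h>

__attribute__((noinline)) static void walk(void)
{
	void **fp = __builtin_frame_address(0);
	int count = 0;

	do {
		void *ret = fp[1];	/* i386: ((unsigned long *)ebp)[1] */
		printf("frame %d: fp=%p ret=%p\n", count, (void *)fp, ret);
		if ((void **)fp[0] <= fp)	/* crude bound, playing the
						   role of the kernel's
						   stack-page range check */
			break;
		fp = (void **)fp[0];
	} while (count++ < 3);	/* the kernel uses count++ < 16 */
}

__attribute__((noinline)) static void middle(void)
{
	walk();
}

int main(void)
{
	middle();
	return 0;
}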
*/ - if (pc < first_sched || pc >= last_sched) + if (pc < scheduling_functions_start_here || + pc >= scheduling_functions_end_here) return pc; fp = *(unsigned long *) fp; } while (count++ < 16); diff -prauN linux-2.6.0-test11/arch/h8300/kernel/semaphore.c wli-2.6.0-test11-30/arch/h8300/kernel/semaphore.c --- linux-2.6.0-test11/arch/h8300/kernel/semaphore.c 2003-11-26 12:44:23.000000000 -0800 +++ wli-2.6.0-test11-30/arch/h8300/kernel/semaphore.c 2003-12-04 08:35:58.000000000 -0800 @@ -5,6 +5,7 @@ #include #include +#include #include #ifndef CONFIG_RMW_INSNS @@ -95,7 +96,7 @@ void __up(struct semaphore *sem) current->state = TASK_RUNNING; \ remove_wait_queue(&sem->wait, &wait); -void __down(struct semaphore * sem) +__sched void __down(struct semaphore * sem) { DECLARE_WAITQUEUE(wait, current); @@ -106,7 +107,7 @@ void __down(struct semaphore * sem) DOWN_TAIL(TASK_UNINTERRUPTIBLE) } -int __down_interruptible(struct semaphore * sem) +__sched int __down_interruptible(struct semaphore * sem) { DECLARE_WAITQUEUE(wait, current); int ret = 0; diff -prauN linux-2.6.0-test11/arch/h8300/kernel/vmlinux.lds.S wli-2.6.0-test11-30/arch/h8300/kernel/vmlinux.lds.S --- linux-2.6.0-test11/arch/h8300/kernel/vmlinux.lds.S 2003-11-26 12:44:27.000000000 -0800 +++ wli-2.6.0-test11-30/arch/h8300/kernel/vmlinux.lds.S 2003-12-04 08:35:58.000000000 -0800 @@ -82,6 +82,9 @@ SECTIONS #endif __stext = . ; *(.text) + __scheduling_functions_start_here = .; + *(.sched.text) + __scheduling_functions_end_here = .; . = ALIGN(0x4) ; *(.exit.text) *(.text.*) diff -prauN linux-2.6.0-test11/arch/i386/Kconfig wli-2.6.0-test11-30/arch/i386/Kconfig --- linux-2.6.0-test11/arch/i386/Kconfig 2003-11-26 12:43:07.000000000 -0800 +++ wli-2.6.0-test11-30/arch/i386/Kconfig 2003-12-04 07:34:23.000000000 -0800 @@ -397,6 +397,11 @@ config X86_OOSTORE depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 default y +config X86_CMOV + bool + depends on M686 || MPENTIUMII || MPENTIUMIII || MPENTIUM4 || MK8 || MCRUSOE + default y + config HPET_TIMER bool "HPET Timer Support" help @@ -725,6 +730,26 @@ config HIGHPTE low memory. Setting this option will put user-space page table entries in high memory. +config HIGHPMD + bool "Allocate 2nd-level pagetables from highmem" + depends on HIGHMEM64G && HIGHPTE + help + The VM uses one lowmem-allocated pmd entry for each pagetable + page of physical memory allocated, and preallocates them all + for 12KB of per-process lowmem overhead. For systems with + extreme amounts of highmem, this cannot be tolerated. Setting + this option will put userspace 2nd-level pagetables in highmem. + +config 4K_STACK + bool "Use smaller 4k per-task stacks" + help + This option will shrink the kernel's per-task stack from 8k to + 4k. This will greatly increase your chance of overflowing it. + But, if you use the per-cpu interrupt stacks as well, your chances + go way down. Also try the CONFIG_X86_STACK_CHECK overflow + detection. It is much more reliable than the currently in-kernel + version. + config MATH_EMULATION bool "Math emulation" ---help--- @@ -1135,6 +1160,11 @@ config DEBUG_KERNEL config DEBUG_STACKOVERFLOW bool "Check for stack overflows" + help + Say Y here if you are hacking the kernel to trim stack usage + on 4KB stacks and are unafraid of frequent panics. If youre + using 8KB stacks, this is less interesting, but could point + out unusual broken codepaths. 
depends on DEBUG_KERNEL config DEBUG_SLAB @@ -1217,6 +1247,38 @@ config FRAME_POINTER If you don't debug the kernel, you can say N, but we may not be able to solve problems without frame pointers. +config X86_STACK_CHECK + bool "Detect stack overflows" + depends on FRAME_POINTER + help + Say Y here to have the kernel attempt to detect when the per-task + kernel stack overflows. This is much more robust checking than + the above overflow check, which will only occasionally detect + an overflow. The level of guarantee here is much greater. + + Some older versions of gcc don't handle the -p option correctly. + Kernprof is affected by the same problem, which is described here: + http://oss.sgi.com/projects/kernprof/faq.html#Q9 + + Basically, if you get oopses in __free_pages_ok during boot when + you have this turned on, you need to fix gcc. The Redhat 2.96 + version and gcc-3.x seem to work. + + If not debugging a stack overflow problem, say N + Say Y here if you are hacking the kernel to trim stack usage + on 4KB stacks and are unafraid of frequent panics. If youre + using 8KB stacks, this is less interesting, but could point + out unusual broken codepaths. + +config MMAP_TOPDOWN + bool "Top-down vma allocation" + help + Say Y here to have the kernel change its vma allocation policy + to allocate vma's from the top of the address space down, and + to shove the stack low so as to conserve virtualspace. This is + risky because various apps, including a number of versions of + ld.so, depend on the kernel's bottom-up behavior. + config X86_EXTRA_IRQS bool depends on X86_LOCAL_APIC || X86_VOYAGER diff -prauN linux-2.6.0-test11/arch/i386/Makefile wli-2.6.0-test11-30/arch/i386/Makefile --- linux-2.6.0-test11/arch/i386/Makefile 2003-11-26 12:43:35.000000000 -0800 +++ wli-2.6.0-test11-30/arch/i386/Makefile 2003-12-03 19:38:56.000000000 -0800 @@ -84,6 +84,10 @@ mcore-$(CONFIG_X86_ES7000) := mach-es700 # default subarch .h files mflags-y += -Iinclude/asm-i386/mach-default +ifdef CONFIG_X86_STACK_CHECK +CFLAGS += -p +endif + head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o libs-y += arch/i386/lib/ diff -prauN linux-2.6.0-test11/arch/i386/boot/compressed/misc.c wli-2.6.0-test11-30/arch/i386/boot/compressed/misc.c --- linux-2.6.0-test11/arch/i386/boot/compressed/misc.c 2003-11-26 12:44:26.000000000 -0800 +++ wli-2.6.0-test11-30/arch/i386/boot/compressed/misc.c 2003-12-03 19:38:56.000000000 -0800 @@ -379,3 +379,7 @@ asmlinkage int decompress_kernel(struct if (high_loaded) close_output_buffer_if_we_run_high(mv); return high_loaded; } + +/* We don't actually check for stack overflows this early. */ +__asm__(".globl mcount ; mcount: ret\n"); + diff -prauN linux-2.6.0-test11/arch/i386/kernel/apic.c wli-2.6.0-test11-30/arch/i386/kernel/apic.c --- linux-2.6.0-test11/arch/i386/kernel/apic.c 2003-11-26 12:46:07.000000000 -0800 +++ wli-2.6.0-test11-30/arch/i386/kernel/apic.c 2003-12-03 19:38:56.000000000 -0800 @@ -1080,7 +1080,8 @@ inline void smp_local_timer_interrupt(st * interrupt as well. Thus we cannot inline the local irq ... ] */ -void smp_apic_timer_interrupt(struct pt_regs regs) +struct pt_regs * IRQHANDLER(smp_apic_timer_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_apic_timer_interrupt(struct pt_regs* regs) { int cpu = smp_processor_id(); @@ -1100,14 +1101,16 @@ void smp_apic_timer_interrupt(struct pt_ * interrupt lock, which is the WrongThing (tm) to do. 
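From this point on, every x86 interrupt entry point changes signature from asmlinkage void f(struct pt_regs regs), the whole frame passed by value, to struct pt_regs *f(struct pt_regs *regs): the entry stubs hand the handler a pointer to the frame on the interrupted stack and use the returned pointer to find their way back after running on a per-CPU interrupt stack. A toy model of that contract, with pt_regs shrunk to two fields and a plain function call standing in for the entry.S stack switch:

#include <stdio.h>

struct pt_regs { unsigned long eip, esp; };	/* reduced for the toy */

/*
 * Handlers receive the interrupted context and must hand it back; in
 * the patch the return value is how RESTORE_FROM_IRQSTACK learns which
 * stack to switch %esp back to.
 */
static struct pt_regs *demo_interrupt(struct pt_regs *regs)
{
	printf("irq at eip=%#lx, handled on a borrowed stack\n", regs->eip);
	return regs;
}

static void trampoline(struct pt_regs *old)
{
	struct pt_regs *resume;

	/* entry.S: SWITCH_TO_IRQSTACK points %esp at the per-cpu stack */
	resume = demo_interrupt(old);
	/* entry.S: RESTORE_FROM_IRQSTACK does movl %eax,%esp */
	printf("resume at esp=%#lx\n", resume->esp);
}

int main(void)
{
	struct pt_regs r = { 0xc0100000UL, 0xc2000f80UL };

	trampoline(&r);
	return 0;
}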
*/ irq_enter(); - smp_local_timer_interrupt(®s); + smp_local_timer_interrupt(regs); irq_exit(); + return regs; } /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -asmlinkage void smp_spurious_interrupt(void) +struct pt_regs * IRQHANDLER(smp_spurious_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_spurious_interrupt(struct pt_regs* regs) { unsigned long v; @@ -1125,13 +1128,15 @@ asmlinkage void smp_spurious_interrupt(v printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never happen.\n", smp_processor_id()); irq_exit(); + return regs; } /* * This interrupt should never happen with our APIC/SMP architecture */ -asmlinkage void smp_error_interrupt(void) +struct pt_regs * IRQHANDLER(smp_error_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_error_interrupt(struct pt_regs* regs) { unsigned long v, v1; @@ -1156,6 +1161,7 @@ asmlinkage void smp_error_interrupt(void printk (KERN_INFO "APIC error on CPU%d: %02lx(%02lx)\n", smp_processor_id(), v , v1); irq_exit(); + return regs; } /* diff -prauN linux-2.6.0-test11/arch/i386/kernel/cpu/mcheck/p4.c wli-2.6.0-test11-30/arch/i386/kernel/cpu/mcheck/p4.c --- linux-2.6.0-test11/arch/i386/kernel/cpu/mcheck/p4.c 2003-11-26 12:44:32.000000000 -0800 +++ wli-2.6.0-test11-30/arch/i386/kernel/cpu/mcheck/p4.c 2003-12-03 19:38:56.000000000 -0800 @@ -61,11 +61,13 @@ static void intel_thermal_interrupt(stru /* Thermal interrupt handler for this CPU setup */ static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; -asmlinkage void smp_thermal_interrupt(struct pt_regs regs) +struct pt_regs * IRQHANDLER(smp_thermal_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_thermal_interrupt(struct pt_regs* regs) { irq_enter(); - vendor_thermal_interrupt(®s); + vendor_thermal_interrupt(regs); irq_exit(); + return regs; } /* P4/Xeon Thermal regulation detect and init */ diff -prauN linux-2.6.0-test11/arch/i386/kernel/entry.S wli-2.6.0-test11-30/arch/i386/kernel/entry.S --- linux-2.6.0-test11/arch/i386/kernel/entry.S 2003-11-26 12:43:26.000000000 -0800 +++ wli-2.6.0-test11-30/arch/i386/kernel/entry.S 2003-12-03 19:39:57.000000000 -0800 @@ -162,7 +162,7 @@ do_lcall: movl %eax,EFLAGS(%ebp) # movl %edx,EIP(%ebp) # Now we move them to their "normal" places movl %ecx,CS(%ebp) # - andl $-8192, %ebp # GET_THREAD_INFO + GET_THREAD_INFO_WITH_ESP(%ebp) # GET_THREAD_INFO movl TI_EXEC_DOMAIN(%ebp), %edx # Get the execution domain call *4(%edx) # Call the lcall7 handler for the domain addl $4, %esp @@ -396,17 +396,78 @@ ENTRY(irq_entries_start) vector=vector+1 .endr + +# lets play optimizing compiler... +#ifdef CONFIG_X86_CMOV +#define COND_MOVE cmovnz %esi,%esp; +#else +#define COND_MOVE \ + jz 1f; \ + mov %esi,%esp; \ +1: +#endif + +# These macros will switch you to, and from a per-cpu interrupt stack +# They take the pt_regs arg and move it from the normal place on the +# stack to %eax. Any handler function can retrieve it using regparm(1). +# The handlers are expected to return the stack to switch back to in +# the same register. +# +# This means that the irq handlers need to return their arg +# +# SWITCH_TO_IRQSTACK clobbers %ebx, %ecx, %edx, %esi +# old stack gets put in %eax + +.macro SWITCH_TO_IRQSTACK + GET_THREAD_INFO(%ebx); + movl TI_IRQ_STACK(%ebx),%ecx; + movl TI_TASK(%ebx),%edx; + movl %esp,%eax; + + # %ecx+THREAD_SIZE is next stack -4 keeps us in the right one + leal (THREAD_SIZE-4)(%ecx),%esi; + + # is there a valid irq_stack? 
+ testl %ecx,%ecx; + COND_MOVE; + + # update the task pointer in the irq stack + GET_THREAD_INFO(%esi); + movl %edx,TI_TASK(%esi); + + # update the preempt count in the irq stack + movl TI_PRE_COUNT(%ebx),%ecx; + movl %ecx,TI_PRE_COUNT(%esi); +.endm + +# copy flags from the irq stack back into the task's thread_info +# %esi is saved over the irq handler call and contains the irq stack's +# thread_info pointer +# %eax was returned from the handler, as described above +# %ebx contains the original thread_info pointer + +.macro RESTORE_FROM_IRQSTACK + movl %eax,%esp; + movl TI_FLAGS(%esi),%eax; + movl $0,TI_FLAGS(%esi); + LOCK orl %eax,TI_FLAGS(%ebx); +.endm + ALIGN common_interrupt: SAVE_ALL + SWITCH_TO_IRQSTACK call do_IRQ + RESTORE_FROM_IRQSTACK jmp ret_from_intr #define BUILD_INTERRUPT(name, nr) \ ENTRY(name) \ pushl $nr-256; \ SAVE_ALL \ - call smp_/**/name; \ + SWITCH_TO_IRQSTACK; \ + call smp_/**/name; \ + RESTORE_FROM_IRQSTACK; \ jmp ret_from_intr; /* The include is where all of the SMP etc. interrupts come from */ @@ -515,8 +576,8 @@ ENTRY(nmi) /* Do not access memory above the end of our stack page, * it might not exist. */ - andl $0x1fff,%eax - cmpl $0x1fec,%eax + andl $(THREAD_SIZE-1),%eax + cmpl $(THREAD_SIZE-20),%eax popl %eax jae nmi_stack_correct cmpl $sysenter_entry,12(%esp) @@ -606,6 +667,61 @@ ENTRY(spurious_interrupt_bug) pushl $do_spurious_interrupt_bug jmp error_code + +#ifdef CONFIG_X86_STACK_CHECK +.data + .globl stack_overflowed +stack_overflowed: + .long 0 +.text + +ENTRY(mcount) + push %eax + movl $(THREAD_SIZE - 1),%eax + andl %esp,%eax + cmpl $STACK_WARN,%eax /* more than half the stack is used*/ + jle 1f +2: + popl %eax + ret +1: + lock; btsl $0,stack_overflowed + jc 2b + + # switch to overflow stack + movl %esp,%eax + movl $(stack_overflow_stack + THREAD_SIZE - 4),%esp + + pushf + cli + pushl %eax + + # push eip then esp of error for stack_overflow_panic + pushl 4(%eax) + pushl %eax + + # update the task pointer and cpu in the overflow stack's thread_info. 
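GET_THREAD_INFO and GET_THREAD_INFO_WITH_ESP, used on both sides of this point, rely on thread_info and the kernel stack sharing one THREAD_SIZE-aligned block (union thread_union): any stack address masked with ~(THREAD_SIZE-1) lands on the thread_info at the block's base. The same alignment is what makes the mcount check below cheap. A user-space model (THREAD_SIZE value and the field set are illustrative):

#include <stdio.h>
#include <stdlib.h>

#define THREAD_SIZE 4096UL	/* the CONFIG_4K_STACK case */

struct thread_info { int cpu; int preempt_count; };	/* reduced */

union thread_union {
	struct thread_info thread_info;		/* at the block's base */
	unsigned long stack[THREAD_SIZE / sizeof(unsigned long)];
};

/* GET_THREAD_INFO_WITH_ESP: one mask recovers thread_info from any esp */
static struct thread_info *info_from_sp(unsigned long sp)
{
	return (struct thread_info *)(sp & ~(THREAD_SIZE - 1));
}

int main(void)
{
	union thread_union *tu = aligned_alloc(THREAD_SIZE, sizeof(*tu));
	unsigned long sp;

	tu->thread_info.cpu = 3;
	sp = (unsigned long)&tu->stack[400];	/* anywhere in the stack */
	printf("cpu from masked sp: %d\n", info_from_sp(sp)->cpu);
	free(tu);
	return 0;
}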
+ GET_THREAD_INFO_WITH_ESP(%eax) + movl TI_TASK(%eax),%ebx + movl %ebx,stack_overflow_stack+TI_TASK + movl TI_CPU(%eax),%ebx + movl %ebx,stack_overflow_stack+TI_CPU + + call stack_overflow + + # pop off call arguments + addl $8,%esp + + popl %eax + popf + movl %eax,%esp + popl %eax + movl $0,stack_overflowed + ret + +#warning stack check enabled +#endif + .data ENTRY(sys_call_table) .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ diff -prauN linux-2.6.0-test11/arch/i386/kernel/head.S wli-2.6.0-test11-30/arch/i386/kernel/head.S --- linux-2.6.0-test11/arch/i386/kernel/head.S 2003-11-26 12:42:58.000000000 -0800 +++ wli-2.6.0-test11-30/arch/i386/kernel/head.S 2003-12-03 19:38:56.000000000 -0800 @@ -16,6 +16,7 @@ #include #include #include +#include #define OLD_CL_MAGIC_ADDR 0x90020 #define OLD_CL_MAGIC 0xA33F @@ -325,7 +326,7 @@ rp_sidt: ret ENTRY(stack_start) - .long init_thread_union+8192 + .long init_thread_union+THREAD_SIZE .long __BOOT_DS /* This is the default interrupt "handler" :-) */ diff -prauN linux-2.6.0-test11/arch/i386/kernel/i386_ksyms.c wli-2.6.0-test11-30/arch/i386/kernel/i386_ksyms.c --- linux-2.6.0-test11/arch/i386/kernel/i386_ksyms.c 2003-11-26 12:46:01.000000000 -0800 +++ wli-2.6.0-test11-30/arch/i386/kernel/i386_ksyms.c 2003-12-03 19:39:57.000000000 -0800 @@ -172,8 +172,6 @@ EXPORT_SYMBOL(machine_id); EXPORT_SYMBOL(screen_info); #endif -EXPORT_SYMBOL(get_wchan); - EXPORT_SYMBOL(rtc_lock); EXPORT_SYMBOL_GPL(set_nmi_callback); @@ -195,14 +193,6 @@ EXPORT_SYMBOL(is_sony_vaio_laptop); EXPORT_SYMBOL(__PAGE_KERNEL); -#ifdef CONFIG_HIGHMEM -EXPORT_SYMBOL(kmap); -EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic); -EXPORT_SYMBOL(kmap_atomic_to_page); -#endif - #ifdef CONFIG_EDD_MODULE EXPORT_SYMBOL(edd); EXPORT_SYMBOL(eddnr); @@ -212,4 +202,9 @@ EXPORT_SYMBOL(eddnr); EXPORT_SYMBOL(ist_info); #endif +#ifdef CONFIG_X86_STACK_CHECK +void mcount(void); +EXPORT_SYMBOL(mcount); +#endif + EXPORT_SYMBOL(csum_partial); diff -prauN linux-2.6.0-test11/arch/i386/kernel/init_task.c wli-2.6.0-test11-30/arch/i386/kernel/init_task.c --- linux-2.6.0-test11/arch/i386/kernel/init_task.c 2003-11-26 12:45:45.000000000 -0800 +++ wli-2.6.0-test11-30/arch/i386/kernel/init_task.c 2003-12-03 19:38:56.000000000 -0800 @@ -17,6 +17,14 @@ struct mm_struct init_mm = INIT_MM(init_ EXPORT_SYMBOL(init_mm); +union thread_union init_irq_union + __attribute__((__section__(".data.init_task"))); + +#ifdef CONFIG_X86_STACK_CHECK +union thread_union stack_overflow_stack + __attribute__((__section__(".data.init_task"))); +#endif + /* * Initial thread structure. * diff -prauN linux-2.6.0-test11/arch/i386/kernel/irq.c wli-2.6.0-test11-30/arch/i386/kernel/irq.c --- linux-2.6.0-test11/arch/i386/kernel/irq.c 2003-11-26 12:42:56.000000000 -0800 +++ wli-2.6.0-test11-30/arch/i386/kernel/irq.c 2003-12-03 19:41:16.000000000 -0800 @@ -404,7 +404,8 @@ void enable_irq(unsigned int irq) * SMP cross-CPU interrupts have their own specific * handlers). */ -asmlinkage unsigned int do_IRQ(struct pt_regs regs) +struct pt_regs * IRQHANDLER(do_IRQ(struct pt_regs *regs)); +struct pt_regs * do_IRQ(struct pt_regs *regs) { /* * We ack quickly, we don't want the irq controller @@ -416,7 +417,7 @@ asmlinkage unsigned int do_IRQ(struct pt * 0 return value means that this irq is already being * handled by some other CPU. 
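The mcount hook above reduces to one mask and one compare per function entry: because the stack block is THREAD_SIZE-aligned and grows downward, esp & (THREAD_SIZE-1) is exactly the number of bytes left before the stack runs into thread_info. The softer CONFIG_DEBUG_STACKOVERFLOW test added to do_IRQ below (THREAD_SIZE/8 of headroom) is the same arithmetic. In C, with illustrative constants:

#include <stdio.h>

#define THREAD_SIZE	4096UL			/* CONFIG_4K_STACK */
#define STACK_WARN	(THREAD_SIZE / 2)	/* illustrative watermark */

/*
 * The stack block is THREAD_SIZE-aligned and grows down, so the masked
 * esp is the distance from the block's base: the room still available.
 */
static int stack_ok(unsigned long esp)
{
	unsigned long room = esp & (THREAD_SIZE - 1);
	return room > STACK_WARN;	/* mcount: cmpl $STACK_WARN; jle */
}

int main(void)
{
	unsigned long base = 0xc2000000UL;	/* aligned block base */

	printf("shallow stack ok: %d\n", stack_ok(base + THREAD_SIZE - 64));
	printf("deep stack ok:    %d\n", stack_ok(base + 128));
	return 0;
}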
(or is disabled) */ - int irq = regs.orig_eax & 0xff; /* high bits used in ret_from_ code */ + int irq = regs->orig_eax & 0xff; /* high bits used in ret_from_ code */ irq_desc_t *desc = irq_desc + irq; struct irqaction * action; unsigned int status; @@ -424,13 +425,17 @@ asmlinkage unsigned int do_IRQ(struct pt irq_enter(); #ifdef CONFIG_DEBUG_STACKOVERFLOW - /* Debugging check for stack overflow: is there less than 1KB free? */ + /* + * Debugging check for stack overflow: + * Is there less than THREAD_SIZE/8 free? + */ { + const long minstk = THREAD_SIZE >= 8192 ? THREAD_SIZE/8 : 256; long esp; __asm__ __volatile__("andl %%esp,%0" : - "=r" (esp) : "0" (8191)); - if (unlikely(esp < (sizeof(struct thread_info) + 1024))) { + "=r" (esp) : "0" (THREAD_SIZE - 1)); + if (unlikely(esp < (sizeof(struct thread_info) + minstk))) { printk("do_IRQ: stack overflow: %ld\n", esp - sizeof(struct thread_info)); dump_stack(); @@ -482,7 +487,7 @@ asmlinkage unsigned int do_IRQ(struct pt irqreturn_t action_ret; spin_unlock(&desc->lock); - action_ret = handle_IRQ_event(irq, ®s, action); + action_ret = handle_IRQ_event(irq, regs, action); spin_lock(&desc->lock); if (!noirqdebug) note_interrupt(irq, desc, action_ret); @@ -502,7 +507,7 @@ out: irq_exit(); - return 1; + return regs; } /** diff -prauN linux-2.6.0-test11/arch/i386/kernel/process.c wli-2.6.0-test11-30/arch/i386/kernel/process.c --- linux-2.6.0-test11/arch/i386/kernel/process.c 2003-11-26 12:42:37.000000000 -0800 +++ wli-2.6.0-test11-30/arch/i386/kernel/process.c 2003-12-04 08:37:28.000000000 -0800 @@ -213,7 +213,25 @@ static int __init idle_setup (char *str) __setup("idle=", idle_setup); -void show_regs(struct pt_regs * regs) +void stack_overflow(unsigned long esp, unsigned long eip) +{ + int panicing = ((esp&(THREAD_SIZE-1)) <= STACK_PANIC); + + printk( "esp: 0x%lx masked: 0x%lx STACK_PANIC:0x%lx %d %d\n", + esp, (esp&(THREAD_SIZE-1)), STACK_PANIC, (((esp&(THREAD_SIZE-1)) <= STACK_PANIC)), panicing ); + + if (panicing) + print_symbol("stack overflow from %s\n", eip); + else + print_symbol("excessive stack use from %s\n", eip); + printk("esp: %p\n", (void*)esp); + show_trace(NULL, (void*)esp); + + if (panicing) + panic("stack overflow\n"); +} + +asmlinkage void show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; @@ -501,7 +519,7 @@ struct task_struct * __switch_to(struct struct tss_struct *tss = init_tss + cpu; /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - + next_p->thread_info->irq_stack = prev_p->thread_info->irq_stack; __unlazy_fpu(prev_p); /* @@ -631,36 +649,34 @@ out: /* * These bracket the sleeping functions.. 
*/ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); -#define first_sched ((unsigned long) scheduling_functions_start_here) -#define last_sched ((unsigned long) scheduling_functions_end_here) +#define top_esp (THREAD_SIZE - sizeof(unsigned long)) +#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) -unsigned long get_wchan(struct task_struct *p) +unsigned long get_wchan(task_t *task) { unsigned long ebp, esp, eip; unsigned long stack_page; int count = 0; - if (!p || p == current || p->state == TASK_RUNNING) + if (!task || task == current || task->state == TASK_RUNNING) return 0; - stack_page = (unsigned long)p->thread_info; - esp = p->thread.esp; - if (!stack_page || esp < stack_page || esp > 8188+stack_page) + stack_page = (unsigned long)task->thread_info; + esp = task->thread.esp; + if (!stack_page || esp < stack_page || esp > top_esp + stack_page) return 0; /* include/asm-i386/system.h:switch_to() pushes ebp last. */ ebp = *(unsigned long *) esp; do { - if (ebp < stack_page || ebp > 8184+stack_page) + if (ebp < stack_page || ebp > top_ebp + stack_page) return 0; eip = *(unsigned long *) (ebp+4); - if (eip < first_sched || eip >= last_sched) + if (eip < scheduling_functions_start_here + || eip >= scheduling_functions_end_here) return eip; ebp = *(unsigned long *) ebp; } while (count++ < 16); return 0; } -#undef last_sched -#undef first_sched +EXPORT_SYMBOL(get_wchan); /* * sys_alloc_thread_area: get a yet unused TLS descriptor index. diff -prauN linux-2.6.0-test11/arch/i386/kernel/semaphore.c wli-2.6.0-test11-30/arch/i386/kernel/semaphore.c --- linux-2.6.0-test11/arch/i386/kernel/semaphore.c 2003-11-26 12:43:40.000000000 -0800 +++ wli-2.6.0-test11-30/arch/i386/kernel/semaphore.c 2003-12-04 08:35:58.000000000 -0800 @@ -15,6 +15,7 @@ #include #include #include +#include #include /* @@ -53,7 +54,7 @@ void __up(struct semaphore *sem) wake_up(&sem->wait); } -void __down(struct semaphore * sem) +__sched void __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -90,7 +91,7 @@ void __down(struct semaphore * sem) tsk->state = TASK_RUNNING; } -int __down_interruptible(struct semaphore * sem) +__sched int __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -187,7 +188,7 @@ int __down_trylock(struct semaphore * se * value.. 
*/ asm( -".text\n" +".section .sched.text\n" ".align 4\n" ".globl __down_failed\n" "__down_failed:\n\t" @@ -210,7 +211,7 @@ asm( ); asm( -".text\n" +".section .sched.text\n" ".align 4\n" ".globl __down_failed_interruptible\n" "__down_failed_interruptible:\n\t" @@ -231,7 +232,7 @@ asm( ); asm( -".text\n" +".section .sched.text\n" ".align 4\n" ".globl __down_failed_trylock\n" "__down_failed_trylock:\n\t" @@ -252,7 +253,7 @@ asm( ); asm( -".text\n" +".section .sched.text\n" ".align 4\n" ".globl __up_wakeup\n" "__up_wakeup:\n\t" @@ -271,7 +272,7 @@ asm( */ #if defined(CONFIG_SMP) asm( -".text\n" +".section .sched.text\n" ".align 4\n" ".globl __write_lock_failed\n" "__write_lock_failed:\n\t" @@ -285,7 +286,7 @@ asm( ); asm( -".text\n" +".section .sched.text\n" ".align 4\n" ".globl __read_lock_failed\n" "__read_lock_failed:\n\t" diff -prauN linux-2.6.0-test11/arch/i386/kernel/smp.c wli-2.6.0-test11-30/arch/i386/kernel/smp.c --- linux-2.6.0-test11/arch/i386/kernel/smp.c 2003-11-26 12:42:56.000000000 -0800 +++ wli-2.6.0-test11-30/arch/i386/kernel/smp.c 2003-12-03 19:38:56.000000000 -0800 @@ -308,7 +308,8 @@ static inline void leave_mm (unsigned lo * 2) Leave the mm if we are in the lazy tlb mode. */ -asmlinkage void smp_invalidate_interrupt (void) +struct pt_regs * IRQHANDLER(smp_invalidate_interrupt(struct pt_regs *regs)); +struct pt_regs * smp_invalidate_interrupt(struct pt_regs *regs) { unsigned long cpu; @@ -340,6 +341,7 @@ asmlinkage void smp_invalidate_interrupt smp_mb__after_clear_bit(); out: put_cpu_no_resched(); + return regs; } static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, @@ -576,12 +578,15 @@ void smp_send_stop(void) * all the work is done automatically when * we return from the interrupt. */ -asmlinkage void smp_reschedule_interrupt(void) +struct pt_regs *IRQHANDLER(smp_reschedule_interrupt(struct pt_regs *)); +struct pt_regs *smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); + return regs; } -asmlinkage void smp_call_function_interrupt(void) +struct pt_regs *IRQHANDLER(smp_call_function_interrupt(struct pt_regs *)); +struct pt_regs *smp_call_function_interrupt(struct pt_regs *regs) { void (*func) (void *info) = call_data->func; void *info = call_data->info; @@ -605,5 +610,6 @@ asmlinkage void smp_call_function_interr mb(); atomic_inc(&call_data->finished); } + return regs; } diff -prauN linux-2.6.0-test11/arch/i386/kernel/smpboot.c wli-2.6.0-test11-30/arch/i386/kernel/smpboot.c --- linux-2.6.0-test11/arch/i386/kernel/smpboot.c 2003-11-26 12:44:06.000000000 -0800 +++ wli-2.6.0-test11-30/arch/i386/kernel/smpboot.c 2003-12-03 19:38:56.000000000 -0800 @@ -71,6 +71,11 @@ static cpumask_t smp_commenced_mask; /* Per CPU bogomips and other parameters */ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; +/* Per CPU interrupt stacks */ +extern union thread_union init_irq_union; +union thread_union *irq_stacks[NR_CPUS] __cacheline_aligned = + { &init_irq_union, }; + /* Set when the idlers are all forked */ int smp_threads_ready; @@ -770,6 +775,24 @@ wakeup_secondary_cpu(int phys_apicid, un } #endif /* WAKE_SECONDARY_VIA_INIT */ +static void __init setup_irq_stack(task_t *task, int cpu) +{ + unsigned long stack; + + stack = __get_free_pages(GFP_KERNEL, THREAD_ORDER); + if (!task) + panic("Cannot allocate irq stack\n"); + irq_stacks[cpu] = (void *)stack; + memset(irq_stacks[cpu], 0, THREAD_SIZE); + irq_stacks[cpu]->thread_info.cpu = cpu; + irq_stacks[cpu]->thread_info.preempt_count = 1; + task->thread_info->irq_stack = &irq_stacks[cpu]->thread_info; 
+ /* + * If we want to make the irq stack more than one unit + * deep, we can chain them off the irq_stack pointer here. + */ +} + extern cpumask_t cpu_initialized; static int __init do_boot_cpu(int apicid) @@ -793,6 +816,7 @@ static int __init do_boot_cpu(int apicid idle = fork_by_hand(); if (IS_ERR(idle)) panic("failed fork for CPU %d", cpu); + setup_irq_stack(idle, cpu); wake_up_forked_process(idle); /* diff -prauN linux-2.6.0-test11/arch/i386/kernel/vm86.c wli-2.6.0-test11-30/arch/i386/kernel/vm86.c --- linux-2.6.0-test11/arch/i386/kernel/vm86.c 2003-11-26 12:42:58.000000000 -0800 +++ wli-2.6.0-test11-30/arch/i386/kernel/vm86.c 2003-12-03 19:11:55.000000000 -0800 @@ -134,16 +134,17 @@ struct pt_regs * save_v86_state(struct k return ret; } -static void mark_screen_rdonly(struct task_struct * tsk) +static void mark_screen_rdonly(task_t *task) { + struct mm_struct *mm = task->mm; pgd_t *pgd; pmd_t *pmd; pte_t *pte, *mapped; int i; preempt_disable(); - spin_lock(&tsk->mm->page_table_lock); - pgd = pgd_offset(tsk->mm, 0xA0000); + spin_lock(&mm->page_table_lock); + pgd = pgd_offset(mm, 0xA0000); if (pgd_none(*pgd)) goto out; if (pgd_bad(*pgd)) { @@ -151,23 +152,26 @@ static void mark_screen_rdonly(struct ta pgd_clear(pgd); goto out; } - pmd = pmd_offset(pgd, 0xA0000); - if (pmd_none(*pmd)) + pmd = pmd_offset_map(pgd, 0xA0000); + if (pmd_none(*pmd)) { + pmd_unmap(pmd); goto out; - if (pmd_bad(*pmd)) { + } else if (pmd_bad(*pmd)) { pmd_ERROR(*pmd); pmd_clear(pmd); + pmd_unmap(pmd); goto out; } pte = mapped = pte_offset_map(pmd, 0xA0000); for (i = 0; i < 32; i++) { if (pte_present(*pte)) - set_pte(pte, pte_wrprotect(*pte)); + vm_ptep_set_wrprotect(mm, pte); pte++; } pte_unmap(mapped); + pmd_unmap(pmd); out: - spin_unlock(&tsk->mm->page_table_lock); + spin_unlock(&mm->page_table_lock); preempt_enable(); flush_tlb(); } diff -prauN linux-2.6.0-test11/arch/i386/kernel/vmlinux.lds.S wli-2.6.0-test11-30/arch/i386/kernel/vmlinux.lds.S --- linux-2.6.0-test11/arch/i386/kernel/vmlinux.lds.S 2003-11-26 12:43:24.000000000 -0800 +++ wli-2.6.0-test11-30/arch/i386/kernel/vmlinux.lds.S 2003-12-04 08:35:58.000000000 -0800 @@ -15,6 +15,9 @@ SECTIONS _text = .; /* Text and read-only data */ .text : { *(.text) + __scheduling_functions_start_here = .; + *(.sched.text) + __scheduling_functions_end_here = .; *(.fixup) *(.gnu.warning) } = 0x9090 diff -prauN linux-2.6.0-test11/arch/i386/mm/discontig.c wli-2.6.0-test11-30/arch/i386/mm/discontig.c --- linux-2.6.0-test11/arch/i386/mm/discontig.c 2003-11-26 12:44:20.000000000 -0800 +++ wli-2.6.0-test11-30/arch/i386/mm/discontig.c 2003-12-04 07:27:23.000000000 -0800 @@ -72,8 +72,6 @@ extern unsigned long max_low_pfn; extern unsigned long totalram_pages; extern unsigned long totalhigh_pages; -#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) - unsigned long node_remap_start_pfn[MAX_NUMNODES]; unsigned long node_remap_size[MAX_NUMNODES]; unsigned long node_remap_offset[MAX_NUMNODES]; @@ -128,6 +126,48 @@ static void __init find_max_pfn_node(int BUG(); } +extern char __per_cpu_start[], __per_cpu_end[]; +unsigned long __per_cpu_offset[NR_CPUS]; + +#define PER_CPU_PAGES PFN_UP((unsigned long)(__per_cpu_end-__per_cpu_start)) +#define MEM_MAP_SIZE(n) PFN_UP((node_end_pfn[n]-node_start_pfn[n]+1)*sizeof(struct page)) + +static void __init allocate_per_cpu_pages(int cpu) +{ + int cpu_in_node, node = cpu_to_node(cpu); + unsigned long vaddr; + cpumask_t nodemask = node_to_cpumask(node); + + if (!PER_CPU_PAGES || node >= numnodes) + return; + + if (!node) { + vaddr = (unsigned 
long)alloc_bootmem(PER_CPU_PAGES*PAGE_SIZE); + __per_cpu_offset[cpu] = vaddr - (unsigned long)__per_cpu_start; + } else { + int k; + vaddr = (unsigned long)node_remap_start_vaddr[node]; + for (k = 0, cpu_in_node = 0; k < cpu; ++k) + if (cpu_isset(k, nodemask)) + ++cpu_in_node; + __per_cpu_offset[cpu] = vaddr + PAGE_SIZE*MEM_MAP_SIZE(node) + + PAGE_SIZE*PFN_UP(sizeof(pg_data_t)) + + PAGE_SIZE*PER_CPU_PAGES*cpu_in_node + - (unsigned long)__per_cpu_start; + } + memcpy(RELOC_HIDE((char *)__per_cpu_start, __per_cpu_offset[cpu]), + __per_cpu_start, + PER_CPU_PAGES*PAGE_SIZE); +} + +void __init setup_per_cpu_areas(void) +{ + int cpu; + for (cpu = 0; cpu < NR_CPUS; ++cpu) + allocate_per_cpu_pages(cpu); +} + + /* * Allocate memory for the pg_data_t via a crude pre-bootmem method * We ought to relocate these onto their own node later on during boot. @@ -205,13 +245,11 @@ static unsigned long calculate_numa_rema unsigned long size, reserve_pages = 0; for (nid = 1; nid < numnodes; nid++) { - /* calculate the size of the mem_map needed in bytes */ - size = (node_end_pfn[nid] - node_start_pfn[nid] + 1) - * sizeof(struct page) + sizeof(pg_data_t); - /* convert size to large (pmd size) pages, rounding up */ - size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; - /* now the roundup is correct, convert to PAGE_SIZE pages */ - size = size * PTRS_PER_PTE; + /* calculate the size of the mem_map needed in pages */ + size = MEM_MAP_SIZE(nid) + PFN_UP(sizeof(pg_data_t)) + + PER_CPU_PAGES*MAX_NODE_CPUS; + /* round up to nearest pmd boundary */ + size = (size + PTRS_PER_PTE - 1) & ~(PTRS_PER_PTE - 1); printk("Reserving %ld pages of KVA for lmem_map of node %d\n", size, nid); node_remap_size[nid] = size; diff -prauN linux-2.6.0-test11/arch/i386/mm/fault.c wli-2.6.0-test11-30/arch/i386/mm/fault.c --- linux-2.6.0-test11/arch/i386/mm/fault.c 2003-11-26 12:42:46.000000000 -0800 +++ wli-2.6.0-test11-30/arch/i386/mm/fault.c 2003-12-03 18:20:41.000000000 -0800 @@ -413,6 +413,13 @@ no_context: printk(" printing eip:\n"); printk("%08lx\n", regs->eip); asm("movl %%cr3,%0":"=r" (page)); +#ifdef CONFIG_HIGHPMD /* Oh boy. Error reporting is going to blow major goats. */ + printk(KERN_ALERT "%%cr3 = 0x%lx\n", page); + /* Mask off flag bits. It should end up 32B-aligned. */ + page &= ~(PTRS_PER_PGD*sizeof(pgd_t) - 1); + printk(KERN_ALERT "*pdpte = 0x%Lx\n", + pgd_val(((pgd_t *)__va(page))[address >> PGDIR_SHIFT])); +#else /* !CONFIG_HIGHPMD */ page = ((unsigned long *) __va(page))[address >> 22]; printk(KERN_ALERT "*pde = %08lx\n", page); /* @@ -428,7 +435,8 @@ no_context: page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; printk(KERN_ALERT "*pte = %08lx\n", page); } -#endif +#endif /* !CONFIG_HIGHPTE */ +#endif /* CONFIG_HIGHPMD */ die("Oops", regs, error_code); bust_spinlocks(0); do_exit(SIGKILL); @@ -496,8 +504,8 @@ vmalloc_fault: * and redundant with the set_pmd() on non-PAE. 
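allocate_per_cpu_pages() above places each cpu's copy of the per-cpu data image in node-local memory and records the delta from the master __per_cpu_start copy; every per-cpu access then becomes base-plus-offset, with RELOC_HIDE keeping the compiler from reasoning about the cross-object pointer arithmetic. A compressed user-space model using a single variable instead of a whole section (the pointer subtraction between unrelated objects is exactly the formally-undefined trick RELOC_HIDE exists to launder):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NR_CPUS 4

static long master_counter = 42;	/* stands in for the whole
					   __per_cpu_start..__per_cpu_end
					   image */
static unsigned long __per_cpu_offset[NR_CPUS];

#define per_cpu_toy(var, cpu) \
	(*(long *)((char *)&(var) + __per_cpu_offset[(cpu)]))

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		char *copy = malloc(sizeof(master_counter));

		memcpy(copy, &master_counter, sizeof(master_counter));
		__per_cpu_offset[cpu] = copy - (char *)&master_counter;
	}
	per_cpu_toy(master_counter, 2) += 8;
	printf("cpu2=%ld cpu0=%ld\n",
	       per_cpu_toy(master_counter, 2),
	       per_cpu_toy(master_counter, 0));
	return 0;
}

Keeping each cpu's copy on its own node is the point of the discontig.c placement; the offset indirection is identical either way, only the backing pages move.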
*/ - pmd = pmd_offset(pgd, address); - pmd_k = pmd_offset(pgd_k, address); + pmd = pmd_offset_kernel(pgd, address); + pmd_k = pmd_offset_kernel(pgd_k, address); if (!pmd_present(*pmd_k)) goto no_context; set_pmd(pmd, *pmd_k); diff -prauN linux-2.6.0-test11/arch/i386/mm/highmem.c wli-2.6.0-test11-30/arch/i386/mm/highmem.c --- linux-2.6.0-test11/arch/i386/mm/highmem.c 2003-11-26 12:44:16.000000000 -0800 +++ wli-2.6.0-test11-30/arch/i386/mm/highmem.c 2003-12-03 19:29:08.000000000 -0800 @@ -1,21 +1,5 @@ #include - -void *kmap(struct page *page) -{ - might_sleep(); - if (page < highmem_start_page) - return page_address(page); - return kmap_high(page); -} - -void kunmap(struct page *page) -{ - if (in_interrupt()) - BUG(); - if (page < highmem_start_page) - return; - kunmap_high(page); -} +#include /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because @@ -25,40 +9,40 @@ void kunmap(struct page *page) * However when holding an atomic kmap is is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page, enum km_type type, unsigned long vaddr) { enum fixed_addresses idx; - unsigned long vaddr; + unsigned long offset = KM_TYPE_NR*smp_processor_id(); + pte_t old_pte, pte, *kpte; - inc_preempt_count(); - if (page < highmem_start_page) - return page_address(page); - - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + idx = type + offset; + vaddr -= PAGE_SIZE*offset; + kpte = kmap_pte - idx; + old_pte = *kpte; #ifdef CONFIG_DEBUG_HIGHMEM - if (!pte_none(*(kmap_pte-idx))) - BUG(); + BUG_ON(!pte_none(old_pte)); #endif - set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); - __flush_tlb_one(vaddr); - - return (void*) vaddr; + pte = mk_pte(page, kmap_prot); + if (!pte_same(old_pte, pte)) { + set_pte(kpte, pte); + if (!pte_none(old_pte)) + __flush_tlb_one(vaddr); + } + return (void *)vaddr; } +EXPORT_SYMBOL(__kmap_atomic); -void kunmap_atomic(void *kvaddr, enum km_type type) -{ #ifdef CONFIG_DEBUG_HIGHMEM - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); +void __kunmap_atomic(void *kvaddr, enum km_type type, unsigned long vaddr) +{ + unsigned long offset = KM_TYPE_NR*smp_processor_id(); + unsigned long uvaddr = (unsigned long) kvaddr & PAGE_MASK; + enum fixed_addresses idx; - if (vaddr < FIXADDR_START) { // FIXME - dec_preempt_count(); - return; - } + idx = type + offset; + vaddr -= PAGE_SIZE*offset; - if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)) - BUG(); + BUG_ON(uvaddr != vaddr); /* * force other mappings to Oops if they'll try to access @@ -66,21 +50,6 @@ void kunmap_atomic(void *kvaddr, enum km */ pte_clear(kmap_pte-idx); __flush_tlb_one(vaddr); -#endif - - dec_preempt_count(); } - -struct page *kmap_atomic_to_page(void *ptr) -{ - unsigned long idx, vaddr = (unsigned long)ptr; - pte_t *pte; - - if (vaddr < FIXADDR_START) - return virt_to_page(ptr); - - idx = virt_to_fix(vaddr); - pte = kmap_pte - (idx - FIX_KMAP_BEGIN); - return pte_page(*pte); -} - +EXPORT_SYMBOL(__kunmap_atomic); +#endif diff -prauN linux-2.6.0-test11/arch/i386/mm/hugetlbpage.c wli-2.6.0-test11-30/arch/i386/mm/hugetlbpage.c --- linux-2.6.0-test11/arch/i386/mm/hugetlbpage.c 2003-11-26 12:45:35.000000000 -0800 +++ wli-2.6.0-test11-30/arch/i386/mm/hugetlbpage.c 2003-12-04 08:43:29.000000000 -0800 @@ -87,8 +87,8 @@ static pte_t *huge_pte_alloc(struct mm_s pmd_t *pmd = 
NULL; pgd = pgd_offset(mm, addr); - pmd = pmd_alloc(mm, pgd, addr); - return (pte_t *) pmd; + pmd = pmd_alloc_map(mm, pgd, addr); + return (pte_t *)pmd; } static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) @@ -97,11 +97,13 @@ static pte_t *huge_pte_offset(struct mm_ pmd_t *pmd = NULL; pgd = pgd_offset(mm, addr); - pmd = pmd_offset(pgd, addr); - return (pte_t *) pmd; + pmd = pmd_offset_map_nested(pgd, addr); + return (pte_t *)pmd; } -static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page, pte_t * page_table, int write_access) +static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, + struct page *page, pte_t * page_table, + unsigned long addr, int write_access) { pte_t entry; @@ -114,6 +116,7 @@ static void set_huge_pte(struct mm_struc entry = pte_mkyoung(entry); mk_pte_huge(entry); set_pte(page_table, entry); + vm_account_huge_inc(vma, *page_table, addr); } /* @@ -145,6 +148,8 @@ int copy_hugetlb_page_range(struct mm_st ptepage = pte_page(entry); get_page(ptepage); set_pte(dst_pte, entry); + pmd_unmap(dst_pte); + pmd_unmap_nested(src_pte); dst->rss += (HPAGE_SIZE / PAGE_SIZE); addr += HPAGE_SIZE; } @@ -182,6 +187,7 @@ follow_hugetlb_page(struct mm_struct *mm get_page(page); pages[i] = page; + pmd_unmap_nested(pte); } if (vmas) @@ -271,6 +277,7 @@ follow_huge_pmd(struct mm_struct *mm, un page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); get_page(page); } + pmd_unmap(pmd); return page; } #endif @@ -278,7 +285,7 @@ follow_huge_pmd(struct mm_struct *mm, un static void free_huge_page(struct page *page) { BUG_ON(page_count(page)); - BUG_ON(page->mapping); + BUG_ON(page_mapping(page)); INIT_LIST_HEAD(&page->list); @@ -314,6 +321,8 @@ void unmap_hugepage_range(struct vm_area page = pte_page(*pte); huge_page_release(page); pte_clear(pte); + vm_account_huge_dec(vma, *pte, address); + pmd_unmap_nested(pte); } mm->rss -= (end - start) >> PAGE_SHIFT; flush_tlb_range(vma, start, end); @@ -348,8 +357,10 @@ int hugetlb_prefault(struct address_spac ret = -ENOMEM; goto out; } - if (!pte_none(*pte)) + if (!pte_none(*pte)) { + pmd_unmap(pte); continue; + } idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); @@ -358,12 +369,14 @@ int hugetlb_prefault(struct address_spac /* charge the fs quota first */ if (hugetlb_get_quota(mapping)) { ret = -ENOMEM; + pmd_unmap(pte); goto out; } page = alloc_hugetlb_page(); if (!page) { hugetlb_put_quota(mapping); ret = -ENOMEM; + pmd_unmap(pte); goto out; } ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); @@ -371,10 +384,12 @@ int hugetlb_prefault(struct address_spac if (ret) { hugetlb_put_quota(mapping); free_huge_page(page); + pmd_unmap(pte); goto out; } } - set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE); + set_huge_pte(mm, vma, page, pte, addr, vma->vm_flags & VM_WRITE); + pmd_unmap(pte); } out: spin_unlock(&mm->page_table_lock); @@ -534,7 +549,7 @@ int is_hugepage_mem_enough(size_t size) * this far. 
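The i386/mm/highmem.c rewrite above leaves an atomic-kmap slot's pte installed at kunmap time (except under CONFIG_DEBUG_HIGHMEM, which clears it to trap use after unmap), so a later kmap_atomic() of the same page sees pte_same() succeed and skips both the set_pte() and the TLB flush. The bookkeeping reduces to a one-slot cache; here a bare pfn stands in for the pte:

#include <stdio.h>

static unsigned long slot_pfn;	/* 0 = empty slot; a pfn stands in
				   for the installed pte */
static int tlb_flushes;

static void kmap_atomic_toy(unsigned long pfn)
{
	if (slot_pfn != pfn) {		/* !pte_same(old_pte, pte) */
		if (slot_pfn)		/* !pte_none(old_pte) */
			tlb_flushes++;	/* __flush_tlb_one(vaddr) */
		slot_pfn = pfn;		/* set_pte(kpte, pte) */
	}
	/* kunmap leaves slot_pfn installed, enabling the reuse below */
}

int main(void)
{
	kmap_atomic_toy(100);	/* empty slot: install, no flush */
	kmap_atomic_toy(100);	/* same page again: nothing to do */
	kmap_atomic_toy(200);	/* different page: install + flush */
	printf("tlb flushes: %d (one saved by reuse)\n", tlb_flushes);
	return 0;
}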
*/ static struct page *hugetlb_nopage(struct vm_area_struct *vma, - unsigned long address, int unused) + unsigned long address, int *unused) { BUG(); return NULL; diff -prauN linux-2.6.0-test11/arch/i386/mm/init.c wli-2.6.0-test11-30/arch/i386/mm/init.c --- linux-2.6.0-test11/arch/i386/mm/init.c 2003-11-26 12:45:05.000000000 -0800 +++ wli-2.6.0-test11-30/arch/i386/mm/init.c 2003-12-03 18:30:38.000000000 -0800 @@ -57,10 +57,10 @@ static pmd_t * __init one_md_table_init( #ifdef CONFIG_X86_PAE pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - if (pmd_table != pmd_offset(pgd, 0)) + if (pmd_table != pmd_offset_kernel(pgd, 0)) BUG(); #else - pmd_table = pmd_offset(pgd, 0); + pmd_table = pmd_offset_kernel(pgd, 0); #endif return pmd_table; @@ -111,7 +111,7 @@ static void __init page_table_range_init if (pgd_none(*pgd)) one_md_table_init(pgd); - pmd = pmd_offset(pgd, vaddr); + pmd = pmd_offset_kernel(pgd, vaddr); for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { if (pmd_none(*pmd)) one_page_table_init(pmd); @@ -195,7 +195,7 @@ EXPORT_SYMBOL(kmap_prot); EXPORT_SYMBOL(kmap_pte); #define kmap_get_fixmap_pte(vaddr) \ - pte_offset_kernel(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)) + pte_offset_kernel(pmd_offset_kernel(pgd_offset_k(vaddr), (vaddr)), (vaddr)) void __init kmap_init(void) { @@ -219,7 +219,7 @@ void __init permanent_kmaps_init(pgd_t * page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); pgd = swapper_pg_dir + pgd_index(vaddr); - pmd = pmd_offset(pgd, vaddr); + pmd = pmd_offset_kernel(pgd, vaddr); pte = pte_offset_kernel(pmd, vaddr); pkmap_page_table = pte; } @@ -466,7 +466,7 @@ void __init mem_init(void) /* this will put all low memory onto the freelists */ totalram_pages += __free_all_bootmem(); - + tlb_init(); reservedpages = 0; for (tmp = 0; tmp < max_low_pfn; tmp++) /* @@ -514,20 +514,9 @@ void __init mem_init(void) } kmem_cache_t *pgd_cache; -kmem_cache_t *pmd_cache; void __init pgtable_cache_init(void) { - if (PTRS_PER_PMD > 1) { - pmd_cache = kmem_cache_create("pmd", - PTRS_PER_PMD*sizeof(pmd_t), - 0, - SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, - pmd_ctor, - NULL); - if (!pmd_cache) - panic("pgtable_cache_init(): cannot create pmd cache"); - } pgd_cache = kmem_cache_create("pgd", PTRS_PER_PGD*sizeof(pgd_t), 0, diff -prauN linux-2.6.0-test11/arch/i386/mm/ioremap.c wli-2.6.0-test11-30/arch/i386/mm/ioremap.c --- linux-2.6.0-test11/arch/i386/mm/ioremap.c 2003-11-26 12:43:26.000000000 -0800 +++ wli-2.6.0-test11-30/arch/i386/mm/ioremap.c 2003-12-03 18:20:41.000000000 -0800 @@ -82,7 +82,7 @@ static int remap_area_pages(unsigned lon spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_kernel(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.6.0-test11/arch/i386/mm/pageattr.c wli-2.6.0-test11-30/arch/i386/mm/pageattr.c --- linux-2.6.0-test11/arch/i386/mm/pageattr.c 2003-11-26 12:43:41.000000000 -0800 +++ wli-2.6.0-test11-30/arch/i386/mm/pageattr.c 2003-12-03 18:20:41.000000000 -0800 @@ -23,7 +23,7 @@ static inline pte_t *lookup_address(unsi pmd_t *pmd; if (pgd_none(*pgd)) return NULL; - pmd = pmd_offset(pgd, address); + pmd = pmd_offset_kernel(pgd, address); if (pmd_none(*pmd)) return NULL; if (pmd_large(*pmd)) @@ -79,7 +79,7 @@ static void set_pmd_pte(pte_t *kpte, uns pgd_t *pgd; pmd_t *pmd; pgd = (pgd_t *)page_address(page) + pgd_index(address); - pmd = pmd_offset(pgd, address); + pmd = 
pmd_offset_kernel(pgd, address); set_pte_atomic((pte_t *)pmd, pte); } spin_unlock_irqrestore(&pgd_lock, flags); @@ -92,7 +92,7 @@ static void set_pmd_pte(pte_t *kpte, uns static inline void revert_page(struct page *kpte_page, unsigned long address) { pte_t *linear = (pte_t *) - pmd_offset(pgd_offset(&init_mm, address), address); + pmd_offset_kernel(pgd_offset_k(address), address); set_pmd_pte(linear, address, pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); diff -prauN linux-2.6.0-test11/arch/i386/mm/pgtable.c wli-2.6.0-test11-30/arch/i386/mm/pgtable.c --- linux-2.6.0-test11/arch/i386/mm/pgtable.c 2003-11-26 12:46:12.000000000 -0800 +++ wli-2.6.0-test11-30/arch/i386/mm/pgtable.c 2003-12-04 07:36:00.000000000 -0800 @@ -70,7 +70,7 @@ static void set_pte_pfn(unsigned long va BUG(); return; } - pmd = pmd_offset(pgd, vaddr); + pmd = pmd_offset_kernel(pgd, vaddr); if (pmd_none(*pmd)) { BUG(); return; @@ -110,7 +110,7 @@ void set_pmd_pfn(unsigned long vaddr, un printk ("set_pmd_pfn: pgd_none\n"); return; /* BUG(); */ } - pmd = pmd_offset(pgd, vaddr); + pmd = pmd_offset_kernel(pgd, vaddr); set_pmd(pmd, pfn_pmd(pfn, flags)); /* * It's enough to flush this one mapping. @@ -138,23 +138,76 @@ pte_t *pte_alloc_one_kernel(struct mm_st return pte; } -struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) +void tlb_init(void) { - struct page *pte; + int cpu; + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + int zone; + struct mmu_gather *tlb = &per_cpu(mmu_gathers, cpu); + for (zone = 0; zone < MAX_ZONE_ID; ++zone) { + INIT_LIST_HEAD(&tlb->active_list[zone]); + INIT_LIST_HEAD(&tlb->ready_list[zone]); + } + } +} -#ifdef CONFIG_HIGHPTE - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT, 0); -#else - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0); -#endif - if (pte) - clear_highpage(pte); - return pte; +static inline struct page *pte_alloc_fresh(int gfp_mask) +{ + struct page *page = alloc_page(gfp_mask); + if (page) { + clear_highpage(page); + if (TestSetPagePTE(page)) + BUG(); + } + return page; +} + +static inline int zone_high(struct zone *zone) +{ + if (!zone) + return 1; + else + return zone - zone->zone_pgdat->node_zones >= ZONE_HIGHMEM; +} + +static inline struct page *pte_alloc_ready(int gfp_flags) +{ + struct mmu_gather *tlb = &per_cpu(mmu_gathers, get_cpu()); + unsigned long flags; + struct page *page = NULL; + + smp_local_irq_save(flags); + if (tlb->nr_pte_ready) { + int z; + for (z = MAX_ZONE_ID - 1; z >= 0; --z) { + struct zone *zone = zone_table[z]; + if (!(gfp_flags & __GFP_HIGHMEM) && zone_high(zone)) + continue; + if (!list_empty(&tlb->ready_list[z])) + break; + } + page = list_entry(tlb->ready_list[z].next, struct page, list); + if (TestSetPagePTE(page)) + BUG(); + list_del(&page->list); + tlb->ready_count[z]--; + tlb->nr_pte_ready--; + } + smp_local_irq_restore(flags); + put_cpu(); + return page; } -void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags) +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { - memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); + struct page *page = pte_alloc_ready(GFP_PTE); + return page ? page : pte_alloc_fresh(GFP_PTE); +} + +static inline struct page *__pmd_alloc_one(void) +{ + struct page *page = pte_alloc_ready(GFP_PMD); + return page ? 
page : pte_alloc_fresh(GFP_PMD); } /* @@ -212,16 +265,21 @@ pgd_t *pgd_alloc(struct mm_struct *mm) return pgd; for (i = 0; i < USER_PTRS_PER_PGD; ++i) { - pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + struct page *pmd = __pmd_alloc_one(); if (!pmd) goto out_oom; - set_pgd(&pgd[i], __pgd(1 + __pa((u64)((u32)pmd)))); + set_pgd(&pgd[i], __pgd(1ULL | (u64)page_to_pfn(pmd) << PAGE_SHIFT)); } return pgd; + /* + * This looks unusual. pte_free() is actually a convenient wrapper + * for queueing up preconstructed pmd and/or pte pages. The cases + * fall through to just queueing them in the per-cpu lists. + */ out_oom: for (i--; i >= 0; i--) - kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); + pte_free(pgd_page(pgd[i])); kmem_cache_free(pgd_cache, pgd); return NULL; } @@ -233,7 +291,127 @@ void pgd_free(pgd_t *pgd) /* in the PAE case user pgd entries are overwritten before usage */ if (PTRS_PER_PMD > 1) for (i = 0; i < USER_PTRS_PER_PGD; ++i) - kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); + pte_free(pgd_page(pgd[i])); /* in the non-PAE case, clear_page_tables() clears user pgd entries */ kmem_cache_free(pgd_cache, pgd); } + +static void shrink_cpu_pagetable_cache(void *__gfp_mask) +{ + int cpu, zone, high, gfp_mask = (int)__gfp_mask; + unsigned long flags; + struct mmu_gather *tlb; + + high = !!(gfp_mask & __GFP_HIGHMEM); + cpu = get_cpu(); + tlb = &per_cpu(mmu_gathers, cpu); + smp_local_irq_save(flags); + + if (tlb->nr_pte_active || tlb->nr_nonpte) + tlb_flush(tlb); + + if (tlb->nr_pte_active) { + for (zone = 0; zone < MAX_ZONE_ID; ++zone) { + if (!high && zone_high(zone_table[zone])) + continue; + if (!tlb->active_count[zone]) + continue; + + list_splice_init(&tlb->active_list[zone], &tlb->ready_list[zone]); + tlb->ready_count[zone] += tlb->active_count[zone]; + tlb->active_count[zone] = 0; + } + tlb->nr_pte_ready += tlb->nr_pte_active; + tlb->nr_pte_active = 0; + } + + for (zone = 0; zone < MAX_ZONE_ID; ++zone) { + struct page *head; + + if (list_empty(&tlb->ready_list[zone])) + continue; + if (!high && zone_high(zone_table[zone])) + continue; + + head = list_entry(tlb->ready_list[zone].next, struct page, list); + list_del_init(&head->list); + list_splice_init(&tlb->ready_list[zone], &head->list); + head->private = tlb->ready_count[zone]; + tlb->nr_pte_ready -= tlb->ready_count[zone]; + tlb->ready_count[zone] = 0; + free_pages_bulk(zone_table[zone], head, 0); + } + + smp_local_irq_restore(flags); + put_cpu(); +} + +void shrink_pagetable_cache(int gfp_mask) +{ + BUG_ON(irqs_disabled()); + + preempt_disable(); + + /* disables interrupts appropriately internally */ + shrink_cpu_pagetable_cache((void *)gfp_mask); + + smp_call_function(shrink_cpu_pagetable_cache, (void *)gfp_mask, 1, 1); + preempt_enable(); +} + +#define GLIBC_BUFFER (32*1024*1024) + +/* + * This is total crap; it needs to use the free area cache to mitigate + * catastrophic O(n) search with many vmas. 
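For concreteness, the placement policy implemented below, worked through under an assumed i386 TASK_SIZE of 0xC0000000: the first mapping in a fresh mm, with len = 1MB, lands at

	addr = TASK_SIZE - GLIBC_BUFFER - len
	     = 0xC0000000 - 0x02000000 - 0x00100000
	     = 0xBDF00000

i.e. just under a 32MB window kept free at the top of userspace; subsequent requests take the highest gap that fits, and the reserved window is only consumed when nothing else fits.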
+ */ +unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma, *prev; + + len = PAGE_ALIGN(len); + addr = PAGE_ALIGN(addr); + + if (len > TASK_SIZE) + return -ENOMEM; + + if (addr) { + struct vm_area_struct *vma; + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) + goto out; + } + + if (!mm->mmap) { + if (len > TASK_SIZE - GLIBC_BUFFER) + addr = TASK_SIZE - len; + else + addr = TASK_SIZE - GLIBC_BUFFER - len; + goto out; + } + + addr = -ENOMEM; + for (prev = NULL, vma = mm->mmap; vma; prev = vma, vma = vma->vm_next) { + unsigned long lo, hi; + lo = prev ? prev->vm_end : 0; + hi = vma->vm_start; + if (hi - lo >= len && (addr == -ENOMEM || addr < hi - len)) + addr = hi - len; + } + /* + * We're at the last one; let's try the top, but only if nothing + * else can be found (to respect GLIBC_BUFFER). + */ + if (prev && TASK_SIZE - prev->vm_end >= len) { + if (TASK_SIZE - GLIBC_BUFFER - prev->vm_end >= len) + addr = TASK_SIZE - GLIBC_BUFFER - len; + else if (addr == -ENOMEM) + addr = TASK_SIZE - len; + } +out: + return addr; +} diff -prauN linux-2.6.0-test11/arch/ia64/ia32/binfmt_elf32.c wli-2.6.0-test11-30/arch/ia64/ia32/binfmt_elf32.c --- linux-2.6.0-test11/arch/ia64/ia32/binfmt_elf32.c 2003-11-26 12:45:36.000000000 -0800 +++ wli-2.6.0-test11-30/arch/ia64/ia32/binfmt_elf32.c 2003-12-04 08:44:47.000000000 -0800 @@ -60,10 +60,12 @@ extern struct page *ia32_shared_page[]; extern unsigned long *ia32_gdt; struct page * -ia32_install_shared_page (struct vm_area_struct *vma, unsigned long address, int no_share) +ia32_install_shared_page (struct vm_area_struct *vma, unsigned long address, int *type) { struct page *pg = ia32_shared_page[smp_processor_id()]; get_page(pg); + if (type) + *type = VM_FAULT_MINOR; return pg; } @@ -202,7 +204,8 @@ ia32_setup_arg_pages (struct linux_binpr struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current, page, stack_base, PAGE_COPY); + put_dirty_page(current, mpnt, page, + stack_base, PAGE_COPY); } stack_base += PAGE_SIZE; } diff -prauN linux-2.6.0-test11/arch/ia64/kernel/process.c wli-2.6.0-test11-30/arch/ia64/kernel/process.c --- linux-2.6.0-test11/arch/ia64/kernel/process.c 2003-11-26 12:44:05.000000000 -0800 +++ wli-2.6.0-test11-30/arch/ia64/kernel/process.c 2003-12-04 08:35:58.000000000 -0800 @@ -638,11 +638,6 @@ get_wchan (struct task_struct *p) /* * These bracket the sleeping functions.. */ - extern void scheduling_functions_start_here(void); - extern void scheduling_functions_end_here(void); -# define first_sched ((unsigned long) scheduling_functions_start_here) -# define last_sched ((unsigned long) scheduling_functions_end_here) - /* * Note: p may not be a blocked task (it could be current or * another process running on some other CPU. 
Rather than @@ -656,12 +651,11 @@ get_wchan (struct task_struct *p) if (unw_unwind(&info) < 0) return 0; unw_get_ip(&info, &ip); - if (ip < first_sched || ip >= last_sched) + if (ip < scheduling_functions_start_here || + ip >= scheduling_functions_end_here) return ip; } while (count++ < 16); return 0; -# undef first_sched -# undef last_sched } void diff -prauN linux-2.6.0-test11/arch/ia64/kernel/semaphore.c wli-2.6.0-test11-30/arch/ia64/kernel/semaphore.c --- linux-2.6.0-test11/arch/ia64/kernel/semaphore.c 2003-11-26 12:43:40.000000000 -0800 +++ wli-2.6.0-test11-30/arch/ia64/kernel/semaphore.c 2003-12-04 08:35:58.000000000 -0800 @@ -24,6 +24,7 @@ * where we want to avoid any extra jumps and calls. */ #include +#include #include #include @@ -44,7 +45,7 @@ __up (struct semaphore *sem) wake_up(&sem->wait); } -void +__sched void __down (struct semaphore *sem) { struct task_struct *tsk = current; @@ -82,7 +83,7 @@ __down (struct semaphore *sem) tsk->state = TASK_RUNNING; } -int +__sched int __down_interruptible (struct semaphore * sem) { int retval = 0; diff -prauN linux-2.6.0-test11/arch/ia64/kernel/vmlinux.lds.S wli-2.6.0-test11-30/arch/ia64/kernel/vmlinux.lds.S --- linux-2.6.0-test11/arch/ia64/kernel/vmlinux.lds.S 2003-11-26 12:45:26.000000000 -0800 +++ wli-2.6.0-test11-30/arch/ia64/kernel/vmlinux.lds.S 2003-12-04 08:35:58.000000000 -0800 @@ -35,6 +35,9 @@ SECTIONS { *(.text.ivt) *(.text) + __scheduling_functions_start_here = .; + *(.sched.text) + __scheduling_functions_end_here = .; *(.gnu.linkonce.t*) } .text2 : AT(ADDR(.text2) - LOAD_OFFSET) diff -prauN linux-2.6.0-test11/arch/ia64/mm/hugetlbpage.c wli-2.6.0-test11-30/arch/ia64/mm/hugetlbpage.c --- linux-2.6.0-test11/arch/ia64/mm/hugetlbpage.c 2003-11-26 12:44:13.000000000 -0800 +++ wli-2.6.0-test11-30/arch/ia64/mm/hugetlbpage.c 2003-12-04 08:43:29.000000000 -0800 @@ -91,9 +91,9 @@ huge_pte_alloc (struct mm_struct *mm, un pte_t *pte = NULL; pgd = pgd_offset(mm, taddr); - pmd = pmd_alloc(mm, pgd, taddr); + pmd = pmd_alloc_map(mm, pgd, taddr); if (pmd) - pte = pte_alloc_map(mm, pmd, taddr); + pte = pte_alloc_map(mm, pgd, &pmd, taddr); return pte; } @@ -254,7 +254,7 @@ follow_huge_pmd(struct mm_struct *mm, un void free_huge_page(struct page *page) { BUG_ON(page_count(page)); - BUG_ON(page->mapping); + BUG_ON(page_mapping(page)); INIT_LIST_HEAD(&page->list); @@ -518,7 +518,7 @@ int is_hugepage_mem_enough(size_t size) return 1; } -static struct page *hugetlb_nopage(struct vm_area_struct * area, unsigned long address, int unused) +static struct page *hugetlb_nopage(struct vm_area_struct * area, unsigned long address, int *unused) { BUG(); return NULL; diff -prauN linux-2.6.0-test11/arch/ia64/mm/init.c wli-2.6.0-test11-30/arch/ia64/mm/init.c --- linux-2.6.0-test11/arch/ia64/mm/init.c 2003-11-26 12:43:41.000000000 -0800 +++ wli-2.6.0-test11-30/arch/ia64/mm/init.c 2003-12-03 18:20:41.000000000 -0800 @@ -234,10 +234,10 @@ put_kernel_page (struct page *page, unsi spin_lock(&init_mm.page_table_lock); { - pmd = pmd_alloc(&init_mm, pgd, address); + pmd = pmd_alloc_kernel(&init_mm, pgd, address); if (!pmd) goto out; - pte = pte_alloc_map(&init_mm, pmd, address); + pte = pte_alloc_map(&init_mm, pgd, &pmd, address); if (!pte) goto out; if (!pte_none(*pte)) { diff -prauN linux-2.6.0-test11/arch/m68k/kernel/head.S wli-2.6.0-test11-30/arch/m68k/kernel/head.S --- linux-2.6.0-test11/arch/m68k/kernel/head.S 2003-11-26 12:46:05.000000000 -0800 +++ wli-2.6.0-test11-30/arch/m68k/kernel/head.S 2003-12-03 18:20:41.000000000 -0800 @@ -110,7 +110,7 @@ * * These 
routines are used by other mmu routines to get a pointer into * a table, if necessary a new table is allocated. These routines are working - * basically like pmd_alloc() and pte_alloc() in . The root + * basically like pmd_alloc_map() and pte_alloc_map() in . The root * table needs of course only to be allocated once in mmu_get_root_table_entry, * so that here also some mmu specific initialization is done. The second page * at the start of the kernel (the first page is unmapped later) is used for diff -prauN linux-2.6.0-test11/arch/m68k/kernel/process.c wli-2.6.0-test11-30/arch/m68k/kernel/process.c --- linux-2.6.0-test11/arch/m68k/kernel/process.c 2003-11-26 12:46:09.000000000 -0800 +++ wli-2.6.0-test11-30/arch/m68k/kernel/process.c 2003-12-04 08:35:58.000000000 -0800 @@ -65,12 +65,10 @@ asmlinkage void ret_from_fork(void); */ unsigned long thread_saved_pc(struct task_struct *tsk) { - extern void scheduling_functions_start_here(void); - extern void scheduling_functions_end_here(void); struct switch_stack *sw = (struct switch_stack *)tsk->thread.ksp; /* Check whether the thread is blocked in resume() */ - if (sw->retpc > (unsigned long)scheduling_functions_start_here && - sw->retpc < (unsigned long)scheduling_functions_end_here) + if (sw->retpc > scheduling_functions_start_here && + sw->retpc < scheduling_functions_end_here) return ((unsigned long *)sw->a6)[1]; else return sw->retpc; @@ -387,11 +385,6 @@ out: /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); -#define first_sched ((unsigned long) scheduling_functions_start_here) -#define last_sched ((unsigned long) scheduling_functions_end_here) - unsigned long get_wchan(struct task_struct *p) { unsigned long fp, pc; @@ -407,8 +400,8 @@ unsigned long get_wchan(struct task_stru fp >= 8184+stack_page) return 0; pc = ((unsigned long *)fp)[1]; - /* FIXME: This depends on the order of these functions. 
*/ - if (pc < first_sched || pc >= last_sched) + if (pc < scheduling_functions_start_here || + pc >= scheduling_functions_end_here) return pc; fp = *(unsigned long *) fp; } while (count++ < 16); diff -prauN linux-2.6.0-test11/arch/m68k/kernel/semaphore.c wli-2.6.0-test11-30/arch/m68k/kernel/semaphore.c --- linux-2.6.0-test11/arch/m68k/kernel/semaphore.c 2003-11-26 12:44:56.000000000 -0800 +++ wli-2.6.0-test11-30/arch/m68k/kernel/semaphore.c 2003-12-04 08:35:58.000000000 -0800 @@ -5,6 +5,7 @@ #include #include +#include #include #ifndef CONFIG_RMW_INSNS @@ -95,7 +96,7 @@ void __up(struct semaphore *sem) current->state = TASK_RUNNING; \ remove_wait_queue(&sem->wait, &wait); -void __down(struct semaphore * sem) +__sched void __down(struct semaphore * sem) { DECLARE_WAITQUEUE(wait, current); @@ -106,7 +107,7 @@ void __down(struct semaphore * sem) DOWN_TAIL(TASK_UNINTERRUPTIBLE) } -int __down_interruptible(struct semaphore * sem) +__sched int __down_interruptible(struct semaphore * sem) { DECLARE_WAITQUEUE(wait, current); int ret = 0; diff -prauN linux-2.6.0-test11/arch/m68k/kernel/vmlinux-std.lds wli-2.6.0-test11-30/arch/m68k/kernel/vmlinux-std.lds --- linux-2.6.0-test11/arch/m68k/kernel/vmlinux-std.lds 2003-11-26 12:45:30.000000000 -0800 +++ wli-2.6.0-test11-30/arch/m68k/kernel/vmlinux-std.lds 2003-12-04 08:35:58.000000000 -0800 @@ -12,6 +12,9 @@ SECTIONS _text = .; /* Text and read-only data */ .text : { *(.text) + __scheduling_functions_start_here = .; + *(.sched.text) + __scheduling_functions_end_here = .; *(.fixup) *(.gnu.warning) } = 0x4e75 diff -prauN linux-2.6.0-test11/arch/m68k/kernel/vmlinux-sun3.lds wli-2.6.0-test11-30/arch/m68k/kernel/vmlinux-sun3.lds --- linux-2.6.0-test11/arch/m68k/kernel/vmlinux-sun3.lds 2003-11-26 12:42:40.000000000 -0800 +++ wli-2.6.0-test11-30/arch/m68k/kernel/vmlinux-sun3.lds 2003-12-04 08:35:58.000000000 -0800 @@ -13,6 +13,9 @@ SECTIONS .text : { *(.head) *(.text) + __scheduling_functions_start_here = .; + *(.sched.text) + __scheduling_functions_end_here = .; *(.fixup) *(.gnu.warning) } = 0x4e75 diff -prauN linux-2.6.0-test11/arch/m68k/mm/kmap.c wli-2.6.0-test11-30/arch/m68k/mm/kmap.c --- linux-2.6.0-test11/arch/m68k/mm/kmap.c 2003-11-26 12:45:49.000000000 -0800 +++ wli-2.6.0-test11-30/arch/m68k/mm/kmap.c 2003-12-03 18:20:41.000000000 -0800 @@ -189,7 +189,7 @@ void *__ioremap(unsigned long physaddr, printk ("\npa=%#lx va=%#lx ", physaddr, virtaddr); #endif pgd_dir = pgd_offset_k(virtaddr); - pmd_dir = pmd_alloc(&init_mm, pgd_dir, virtaddr); + pmd_dir = pmd_alloc_kernel(&init_mm, pgd_dir, virtaddr); if (!pmd_dir) { printk("ioremap: no mem for pmd_dir\n"); return NULL; diff -prauN linux-2.6.0-test11/arch/m68k/sun3x/dvma.c wli-2.6.0-test11-30/arch/m68k/sun3x/dvma.c --- linux-2.6.0-test11/arch/m68k/sun3x/dvma.c 2003-11-26 12:43:09.000000000 -0800 +++ wli-2.6.0-test11-30/arch/m68k/sun3x/dvma.c 2003-12-03 18:20:41.000000000 -0800 @@ -102,7 +102,7 @@ inline int dvma_map_cpu(unsigned long ka pmd_t *pmd; unsigned long end2; - if((pmd = pmd_alloc(&init_mm, pgd, vaddr)) == NULL) { + if((pmd = pmd_alloc_kernel(&init_mm, pgd, vaddr)) == NULL) { ret = -ENOMEM; goto out; } diff -prauN linux-2.6.0-test11/arch/m68knommu/kernel/process.c wli-2.6.0-test11-30/arch/m68knommu/kernel/process.c --- linux-2.6.0-test11/arch/m68knommu/kernel/process.c 2003-11-26 12:43:25.000000000 -0800 +++ wli-2.6.0-test11-30/arch/m68knommu/kernel/process.c 2003-12-04 08:35:58.000000000 -0800 @@ -406,11 +406,6 @@ out: /* * These bracket the sleeping functions.. 
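The per-arch extern declarations deleted below (and in every other process.c this series touches) are subsumed by the __scheduling_functions_{start,end}_here symbols that the vmlinux.lds hunks define around .sched.text. The shared header hunk is not part of this excerpt, so the following is only a sketch of the declarations the comparisons imply:

	/* assumed: linker-defined bounds of .sched.text (see the lds.S hunks) */
	extern char __scheduling_functions_start_here[];
	extern char __scheduling_functions_end_here[];
	#define scheduling_functions_start_here \
		((unsigned long)__scheduling_functions_start_here)
	#define scheduling_functions_end_here \
		((unsigned long)__scheduling_functions_end_here)

With that, the get_wchan() comparisons below operate directly on unsigned longs, with no per-arch casts.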
*/ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); -#define first_sched ((unsigned long) scheduling_functions_start_here) -#define last_sched ((unsigned long) scheduling_functions_end_here) - unsigned long get_wchan(struct task_struct *p) { unsigned long fp, pc; @@ -426,8 +421,8 @@ unsigned long get_wchan(struct task_stru fp >= 8184+stack_page) return 0; pc = ((unsigned long *)fp)[1]; - /* FIXME: This depends on the order of these functions. */ - if (pc < first_sched || pc >= last_sched) + if (pc < scheduling_functions_start_here || + pc >= scheduling_functions_end_here) return pc; fp = *(unsigned long *) fp; } while (count++ < 16); @@ -439,13 +434,11 @@ unsigned long get_wchan(struct task_stru */ unsigned long thread_saved_pc(struct task_struct *tsk) { - extern void scheduling_functions_start_here(void); - extern void scheduling_functions_end_here(void); struct switch_stack *sw = (struct switch_stack *)tsk->thread.ksp; /* Check whether the thread is blocked in resume() */ - if (sw->retpc > (unsigned long)scheduling_functions_start_here && - sw->retpc < (unsigned long)scheduling_functions_end_here) + if (sw->retpc > scheduling_functions_start_here && + sw->retpc < scheduling_functions_end_here) return ((unsigned long *)sw->a6)[1]; else return sw->retpc; diff -prauN linux-2.6.0-test11/arch/m68knommu/kernel/semaphore.c wli-2.6.0-test11-30/arch/m68knommu/kernel/semaphore.c --- linux-2.6.0-test11/arch/m68knommu/kernel/semaphore.c 2003-11-26 12:46:10.000000000 -0800 +++ wli-2.6.0-test11-30/arch/m68knommu/kernel/semaphore.c 2003-12-04 08:35:58.000000000 -0800 @@ -6,6 +6,7 @@ #include #include #include +#include #include #ifndef CONFIG_RMW_INSNS @@ -96,7 +97,7 @@ void __up(struct semaphore *sem) current->state = TASK_RUNNING; \ remove_wait_queue(&sem->wait, &wait); -void __down(struct semaphore * sem) +__sched void __down(struct semaphore * sem) { DECLARE_WAITQUEUE(wait, current); @@ -107,7 +108,7 @@ void __down(struct semaphore * sem) DOWN_TAIL(TASK_UNINTERRUPTIBLE) } -int __down_interruptible(struct semaphore * sem) +__sched int __down_interruptible(struct semaphore * sem) { DECLARE_WAITQUEUE(wait, current); int ret = 0; diff -prauN linux-2.6.0-test11/arch/m68knommu/kernel/vmlinux.lds.S wli-2.6.0-test11-30/arch/m68knommu/kernel/vmlinux.lds.S --- linux-2.6.0-test11/arch/m68knommu/kernel/vmlinux.lds.S 2003-11-26 12:43:40.000000000 -0800 +++ wli-2.6.0-test11-30/arch/m68knommu/kernel/vmlinux.lds.S 2003-12-04 08:35:58.000000000 -0800 @@ -191,6 +191,9 @@ SECTIONS { .text : { _stext = . ; *(.text) + __scheduling_functions_start_here = .; + *(.sched.text) + __scheduling_functions_end_here = .; *(.text.lock) . = ALIGN(16); /* Exception table */ diff -prauN linux-2.6.0-test11/arch/mips/kernel/process.c wli-2.6.0-test11-30/arch/mips/kernel/process.c --- linux-2.6.0-test11/arch/mips/kernel/process.c 2003-11-26 12:43:39.000000000 -0800 +++ wli-2.6.0-test11-30/arch/mips/kernel/process.c 2003-12-04 08:35:58.000000000 -0800 @@ -276,11 +276,6 @@ unsigned long thread_saved_pc(struct tas /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); -#define first_sched ((unsigned long) scheduling_functions_start_here) -#define last_sched ((unsigned long) scheduling_functions_end_here) - /* get_wchan - a maintenance nightmare^W^Wpain in the ass ... 
*/ unsigned long get_wchan(struct task_struct *p) { @@ -292,7 +287,8 @@ unsigned long get_wchan(struct task_stru if (!mips_frame_info_initialized) return 0; pc = thread_saved_pc(p); - if (pc < first_sched || pc >= last_sched) + if (pc < scheduling_functions_start_here || + pc >= scheduling_functions_end_here) goto out; if (pc >= (unsigned long) sleep_on_timeout) @@ -326,7 +322,8 @@ schedule_timeout_caller: */ pc = ((unsigned long *)frame)[schedule_timeout_frame.pc_offset]; - if (pc >= first_sched && pc < last_sched) { + if (pc >= scheduling_functions_start_here && + pc < scheduling_functions_end_here) { /* schedule_timeout called by [interruptible_]sleep_on_timeout */ frame = ((unsigned long *)frame)[schedule_timeout_frame.frame_offset]; pc = ((unsigned long *)frame)[sleep_on_timeout_frame.pc_offset]; diff -prauN linux-2.6.0-test11/arch/mips/kernel/semaphore.c wli-2.6.0-test11-30/arch/mips/kernel/semaphore.c --- linux-2.6.0-test11/arch/mips/kernel/semaphore.c 2003-11-26 12:44:34.000000000 -0800 +++ wli-2.6.0-test11-30/arch/mips/kernel/semaphore.c 2003-12-04 08:35:58.000000000 -0800 @@ -4,6 +4,7 @@ */ #include +#include #include /* @@ -94,7 +95,7 @@ void __up(struct semaphore *sem) tsk->state = TASK_RUNNING; \ remove_wait_queue(&sem->wait, &wait); -void __down(struct semaphore * sem) +__sched void __down(struct semaphore * sem) { DOWN_VAR DOWN_HEAD(TASK_UNINTERRUPTIBLE) @@ -104,7 +105,7 @@ void __down(struct semaphore * sem) DOWN_TAIL(TASK_UNINTERRUPTIBLE) } -int __down_interruptible(struct semaphore * sem) +__sched int __down_interruptible(struct semaphore * sem) { int ret = 0; DOWN_VAR diff -prauN linux-2.6.0-test11/arch/mips/kernel/vmlinux.lds.S wli-2.6.0-test11-30/arch/mips/kernel/vmlinux.lds.S --- linux-2.6.0-test11/arch/mips/kernel/vmlinux.lds.S 2003-11-26 12:43:33.000000000 -0800 +++ wli-2.6.0-test11-30/arch/mips/kernel/vmlinux.lds.S 2003-12-04 08:35:58.000000000 -0800 @@ -27,6 +27,9 @@ SECTIONS _text = .; /* Text and read-only data */ .text : { *(.text) + __scheduling_functions_start_here = .; + *(.sched.text) + __scheduling_functions_end_here = .; *(.fixup) *(.gnu.warning) } =0 diff -prauN linux-2.6.0-test11/arch/mips/mm/ioremap.c wli-2.6.0-test11-30/arch/mips/mm/ioremap.c --- linux-2.6.0-test11/arch/mips/mm/ioremap.c 2003-11-26 12:44:07.000000000 -0800 +++ wli-2.6.0-test11-30/arch/mips/mm/ioremap.c 2003-12-03 18:20:41.000000000 -0800 @@ -81,7 +81,7 @@ static int remap_area_pages(unsigned lon spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_kernel(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.6.0-test11/arch/parisc/kernel/cache.c wli-2.6.0-test11-30/arch/parisc/kernel/cache.c --- linux-2.6.0-test11/arch/parisc/kernel/cache.c 2003-11-26 12:43:47.000000000 -0800 +++ wli-2.6.0-test11-30/arch/parisc/kernel/cache.c 2003-12-04 06:13:40.000000000 -0800 @@ -68,7 +68,7 @@ update_mmu_cache(struct vm_area_struct * { struct page *page = pte_page(pte); - if (VALID_PAGE(page) && page->mapping && + if (VALID_PAGE(page) && page_mapping(page) && test_bit(PG_dcache_dirty, &page->flags)) { flush_kernel_dcache_page(page_address(page)); @@ -234,15 +234,17 @@ void __flush_dcache_page(struct page *pa flush_kernel_dcache_page(page_address(page)); - if (!page->mapping) + if (!page_mapping(page)) return; /* check shared list first if it's not empty...it's usually * the shortest */ - list_for_each(l, &page->mapping->i_mmap_shared) { + list_for_each_rcu(l, &page->mapping->i_mmap_shared) { struct vm_area_struct 
*mpnt; unsigned long off; mpnt = list_entry(l, struct vm_area_struct, shared); + if (mpnt->vm_flags & VM_DEAD) + continue; /* * If this VMA is not in our MM, we can ignore it. diff -prauN linux-2.6.0-test11/arch/parisc/kernel/pci-dma.c wli-2.6.0-test11-30/arch/parisc/kernel/pci-dma.c --- linux-2.6.0-test11/arch/parisc/kernel/pci-dma.c 2003-11-26 12:45:35.000000000 -0800 +++ wli-2.6.0-test11-30/arch/parisc/kernel/pci-dma.c 2003-12-03 18:20:41.000000000 -0800 @@ -133,7 +133,7 @@ static inline int map_uncached_pages(uns do { pmd_t *pmd; - pmd = pmd_alloc(NULL, dir, vaddr); + pmd = pmd_alloc_kernel(NULL, dir, vaddr); if (!pmd) return -ENOMEM; if (map_pmd_uncached(pmd, vaddr, end - vaddr, &paddr)) diff -prauN linux-2.6.0-test11/arch/parisc/kernel/semaphore.c wli-2.6.0-test11-30/arch/parisc/kernel/semaphore.c --- linux-2.6.0-test11/arch/parisc/kernel/semaphore.c 2003-11-26 12:44:59.000000000 -0800 +++ wli-2.6.0-test11-30/arch/parisc/kernel/semaphore.c 2003-12-04 08:35:58.000000000 -0800 @@ -5,6 +5,7 @@ #include #include #include +#include /* * Semaphores are complex as we wish to avoid using two variables. @@ -58,7 +59,7 @@ void __up(struct semaphore *sem) sem->count += (sem->count < 0) ? 1 : - 1; -void __down(struct semaphore * sem) +__sched void __down(struct semaphore * sem) { DOWN_HEAD @@ -74,7 +75,7 @@ void __down(struct semaphore * sem) UPDATE_COUNT } -int __down_interruptible(struct semaphore * sem) +__sched int __down_interruptible(struct semaphore * sem) { DOWN_HEAD diff -prauN linux-2.6.0-test11/arch/parisc/kernel/vmlinux.lds.S wli-2.6.0-test11-30/arch/parisc/kernel/vmlinux.lds.S --- linux-2.6.0-test11/arch/parisc/kernel/vmlinux.lds.S 2003-11-26 12:43:24.000000000 -0800 +++ wli-2.6.0-test11-30/arch/parisc/kernel/vmlinux.lds.S 2003-12-04 08:35:58.000000000 -0800 @@ -24,6 +24,9 @@ SECTIONS _text = .; /* Text and read-only data */ .text BLOCK(16) : { *(.text*) + __scheduling_functions_start_here = .; + *(.sched.text) + __scheduling_functions_end_here =.; *(.PARISC.unwind) *(.fixup) *(.lock.text) /* out-of-line lock text */ diff -prauN linux-2.6.0-test11/arch/parisc/mm/ioremap.c wli-2.6.0-test11-30/arch/parisc/mm/ioremap.c --- linux-2.6.0-test11/arch/parisc/mm/ioremap.c 2003-11-26 12:43:35.000000000 -0800 +++ wli-2.6.0-test11-30/arch/parisc/mm/ioremap.c 2003-12-03 18:20:41.000000000 -0800 @@ -77,7 +77,7 @@ static int remap_area_pages(unsigned lon spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(dir, address); + pmd = pmd_alloc_kernel(dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.6.0-test11/arch/ppc/kernel/process.c wli-2.6.0-test11-30/arch/ppc/kernel/process.c --- linux-2.6.0-test11/arch/ppc/kernel/process.c 2003-11-26 12:44:06.000000000 -0800 +++ wli-2.6.0-test11-30/arch/ppc/kernel/process.c 2003-12-04 08:35:58.000000000 -0800 @@ -650,11 +650,6 @@ void __init ll_puts(const char *s) /* * These bracket the sleeping functions.. 
*/ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); -#define first_sched ((unsigned long) scheduling_functions_start_here) -#define last_sched ((unsigned long) scheduling_functions_end_here) - unsigned long get_wchan(struct task_struct *p) { unsigned long ip, sp; @@ -669,7 +664,8 @@ unsigned long get_wchan(struct task_stru return 0; if (count > 0) { ip = *(unsigned long *)(sp + 4); - if (ip < first_sched || ip >= last_sched) + if (ip < scheduling_functions_start_here || + ip >= scheduling_functions_end_here) return ip; } } while (count++ < 16); diff -prauN linux-2.6.0-test11/arch/ppc/kernel/semaphore.c wli-2.6.0-test11-30/arch/ppc/kernel/semaphore.c --- linux-2.6.0-test11/arch/ppc/kernel/semaphore.c 2003-11-26 12:45:10.000000000 -0800 +++ wli-2.6.0-test11-30/arch/ppc/kernel/semaphore.c 2003-12-04 08:35:58.000000000 -0800 @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -69,7 +70,7 @@ void __up(struct semaphore *sem) * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. */ -void __down(struct semaphore *sem) +__sched void __down(struct semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -99,7 +100,7 @@ void __down(struct semaphore *sem) wake_up(&sem->wait); } -int __down_interruptible(struct semaphore * sem) +__sched int __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff -prauN linux-2.6.0-test11/arch/ppc/kernel/vmlinux.lds.S wli-2.6.0-test11-30/arch/ppc/kernel/vmlinux.lds.S --- linux-2.6.0-test11/arch/ppc/kernel/vmlinux.lds.S 2003-11-26 12:46:12.000000000 -0800 +++ wli-2.6.0-test11-30/arch/ppc/kernel/vmlinux.lds.S 2003-12-04 08:35:58.000000000 -0800 @@ -31,6 +31,9 @@ SECTIONS .text : { *(.text) + __scheduling_functions_start_here = .; + *(.sched.text) + __scheduling_functions_end_here = .; *(.fixup) *(.got1) __got2_start = .; diff -prauN linux-2.6.0-test11/arch/ppc/mm/init.c wli-2.6.0-test11-30/arch/ppc/mm/init.c --- linux-2.6.0-test11/arch/ppc/mm/init.c 2003-11-26 12:43:25.000000000 -0800 +++ wli-2.6.0-test11-30/arch/ppc/mm/init.c 2003-12-04 06:13:40.000000000 -0800 @@ -477,14 +477,14 @@ void __init mem_init(void) printk(KERN_INFO "AGP special page: 0x%08lx\n", agp_special_page); #endif - /* Make sure all our pagetable pages have page->mapping + /* Make sure all our pagetable pages have page_mapping(page) and page->index set correctly. */ for (addr = KERNELBASE; addr != 0; addr += PGDIR_SIZE) { struct page *pg; pmd_t *pmd = pmd_offset(pgd_offset_k(addr), addr); if (pmd_present(*pmd)) { pg = pmd_page(*pmd); - pg->mapping = (void *) &init_mm; + set_page_mapping(pg, &init_mm); pg->index = addr; } } diff -prauN linux-2.6.0-test11/arch/ppc64/kernel/process.c wli-2.6.0-test11-30/arch/ppc64/kernel/process.c --- linux-2.6.0-test11/arch/ppc64/kernel/process.c 2003-11-26 12:44:27.000000000 -0800 +++ wli-2.6.0-test11-30/arch/ppc64/kernel/process.c 2003-12-04 08:35:58.000000000 -0800 @@ -369,11 +369,6 @@ out: /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); -#define first_sched (*(unsigned long *)scheduling_functions_start_here) -#define last_sched (*(unsigned long *)scheduling_functions_end_here) - unsigned long get_wchan(struct task_struct *p) { unsigned long ip, sp; @@ -393,7 +388,8 @@ unsigned long get_wchan(struct task_stru * XXX we mask the upper 32 bits until procps * gets fixed. 
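On the page_mapping() conversions running through this series (e.g. the ppc mem_init() hunk above): raw page->mapping reads are funneled through an accessor, presumably so the field's encoding can change underneath its users. The actual definition is not in this excerpt; a plausible minimal shape, for orientation only:

	/* assumed shape of the accessor pair; the real encoding may differ */
	static inline struct address_space *page_mapping(struct page *page)
	{
		return page->mapping;
	}

	static inline void set_page_mapping(struct page *page, void *mapping)
	{
		page->mapping = mapping;
	}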
*/ - if (ip < first_sched || ip >= last_sched) + if (ip < scheduling_functions_start_here || + ip >= scheduling_functions_end_here) return (ip & 0xFFFFFFFF); } } while (count++ < 16); diff -prauN linux-2.6.0-test11/arch/ppc64/kernel/semaphore.c wli-2.6.0-test11-30/arch/ppc64/kernel/semaphore.c --- linux-2.6.0-test11/arch/ppc64/kernel/semaphore.c 2003-11-26 12:42:56.000000000 -0800 +++ wli-2.6.0-test11-30/arch/ppc64/kernel/semaphore.c 2003-12-04 08:35:58.000000000 -0800 @@ -17,6 +17,7 @@ */ #include +#include #include #include #include @@ -70,7 +71,7 @@ void __up(struct semaphore *sem) * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. */ -void __down(struct semaphore *sem) +__sched void __down(struct semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -99,7 +100,7 @@ void __down(struct semaphore *sem) wake_up(&sem->wait); } -int __down_interruptible(struct semaphore * sem) +__sched int __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff -prauN linux-2.6.0-test11/arch/ppc64/kernel/vmlinux.lds.S wli-2.6.0-test11-30/arch/ppc64/kernel/vmlinux.lds.S --- linux-2.6.0-test11/arch/ppc64/kernel/vmlinux.lds.S 2003-11-26 12:45:05.000000000 -0800 +++ wli-2.6.0-test11-30/arch/ppc64/kernel/vmlinux.lds.S 2003-12-04 08:35:58.000000000 -0800 @@ -33,6 +33,9 @@ SECTIONS .text : { *(.text) + __scheduling_functions_start_here = .; + *(.sched.text) + __scheduling_functions_end_here = .; *(.fixup) *(.got1) } diff -prauN linux-2.6.0-test11/arch/ppc64/mm/hugetlbpage.c wli-2.6.0-test11-30/arch/ppc64/mm/hugetlbpage.c --- linux-2.6.0-test11/arch/ppc64/mm/hugetlbpage.c 2003-11-26 12:44:12.000000000 -0800 +++ wli-2.6.0-test11-30/arch/ppc64/mm/hugetlbpage.c 2003-12-04 08:43:29.000000000 -0800 @@ -921,7 +921,7 @@ int is_hugepage_mem_enough(size_t size) * this far. 
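The third-argument change in these ->nopage signatures (int becomes int *) lets a handler report the fault type back to the VM: it may record VM_FAULT_MINOR (and presumably VM_FAULT_MAJOR when real I/O was needed) through the pointer, as the ia32_install_shared_page() hunk earlier does. A sketch of a conforming handler, with example_page standing in for wherever the page really comes from:

	static struct page *example_nopage(struct vm_area_struct *vma,
					   unsigned long address, int *type)
	{
		struct page *page = example_page;	/* hypothetical lookup */
		get_page(page);
		if (type)				/* callers may pass NULL */
			*type = VM_FAULT_MINOR;		/* already resident */
		return page;
	}

The hugetlb_nopage() instances themselves only change signature; they remain BUG() landing pads.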
*/ static struct page *hugetlb_nopage(struct vm_area_struct *vma, - unsigned long address, int unused) + unsigned long address, int *unused) { BUG(); return NULL; diff -prauN linux-2.6.0-test11/arch/ppc64/mm/init.c wli-2.6.0-test11-30/arch/ppc64/mm/init.c --- linux-2.6.0-test11/arch/ppc64/mm/init.c 2003-11-26 12:44:24.000000000 -0800 +++ wli-2.6.0-test11-30/arch/ppc64/mm/init.c 2003-12-03 18:20:41.000000000 -0800 @@ -205,7 +205,7 @@ static void map_io_page(unsigned long ea if (mem_init_done) { spin_lock(&ioremap_mm.page_table_lock); pgdp = pgd_offset_i(ea); - pmdp = pmd_alloc(&ioremap_mm, pgdp, ea); + pmdp = pmd_alloc_kernel(&ioremap_mm, pgdp, ea); ptep = pte_alloc_kernel(&ioremap_mm, pmdp, ea); pa = absolute_to_phys(pa); diff -prauN linux-2.6.0-test11/arch/s390/kernel/compat_exec.c wli-2.6.0-test11-30/arch/s390/kernel/compat_exec.c --- linux-2.6.0-test11/arch/s390/kernel/compat_exec.c 2003-11-26 12:45:29.000000000 -0800 +++ wli-2.6.0-test11-30/arch/s390/kernel/compat_exec.c 2003-12-03 19:11:55.000000000 -0800 @@ -81,7 +81,8 @@ int setup_arg_pages32(struct linux_binpr struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current,page,stack_base,PAGE_COPY); + put_dirty_page(current, mpnt, page, + stack_base, PAGE_COPY); } stack_base += PAGE_SIZE; } diff -prauN linux-2.6.0-test11/arch/s390/kernel/process.c wli-2.6.0-test11-30/arch/s390/kernel/process.c --- linux-2.6.0-test11/arch/s390/kernel/process.c 2003-11-26 12:45:08.000000000 -0800 +++ wli-2.6.0-test11-30/arch/s390/kernel/process.c 2003-12-04 08:35:58.000000000 -0800 @@ -371,11 +371,6 @@ void dump_thread(struct pt_regs * regs, /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); -#define first_sched ((unsigned long) scheduling_functions_start_here) -#define last_sched ((unsigned long) scheduling_functions_end_here) - unsigned long get_wchan(struct task_struct *p) { unsigned long r14, r15, bc; @@ -398,12 +393,10 @@ unsigned long get_wchan(struct task_stru #else r14 = *(unsigned long *) (bc+112); #endif - if (r14 < first_sched || r14 >= last_sched) + if (r14 < scheduling_functions_start_here || + r14 >= scheduling_functions_end_here) return r14; bc = (*(unsigned long *) bc) & PSW_ADDR_INSN; } while (count++ < 16); return 0; } -#undef last_sched -#undef first_sched - diff -prauN linux-2.6.0-test11/arch/s390/kernel/semaphore.c wli-2.6.0-test11-30/arch/s390/kernel/semaphore.c --- linux-2.6.0-test11/arch/s390/kernel/semaphore.c 2003-11-26 12:45:33.000000000 -0800 +++ wli-2.6.0-test11-30/arch/s390/kernel/semaphore.c 2003-12-04 08:35:58.000000000 -0800 @@ -11,6 +11,7 @@ */ #include #include +#include #include @@ -59,7 +60,7 @@ void __up(struct semaphore *sem) * count > 0: decrement count, wake up queue and exit. * count <= 0: set count to -1, go to sleep. */ -void __down(struct semaphore * sem) +__sched void __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -81,7 +82,7 @@ void __down(struct semaphore * sem) * count > 0: wake up queue and exit. * count <= 0: set count to 0, wake up queue and exit. 
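The __sched tag being added to these sleeping primitives is what routes them into the .sched.text section collected by the lds hunks (and skipped by get_wchan()). Its definition is not visible in this excerpt; presumably something along the lines of:

	/* assumed definition: place the function in .sched.text */
	#define __sched	__attribute__((__section__(".sched.text")))

which is why each converted file also grows an extra #include near the top.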
*/ -int __down_interruptible(struct semaphore * sem) +__sched int __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff -prauN linux-2.6.0-test11/arch/s390/kernel/vmlinux.lds.S wli-2.6.0-test11-30/arch/s390/kernel/vmlinux.lds.S --- linux-2.6.0-test11/arch/s390/kernel/vmlinux.lds.S 2003-11-26 12:43:35.000000000 -0800 +++ wli-2.6.0-test11-30/arch/s390/kernel/vmlinux.lds.S 2003-12-04 08:35:58.000000000 -0800 @@ -23,6 +23,9 @@ SECTIONS _text = .; /* Text and read-only data */ .text : { *(.text) + __scheduling_functions_start_here = .; + *(.sched.text) + __scheduling_functions_end_here = .; *(.fixup) *(.gnu.warning) } = 0x0700 diff -prauN linux-2.6.0-test11/arch/s390/mm/ioremap.c wli-2.6.0-test11-30/arch/s390/mm/ioremap.c --- linux-2.6.0-test11/arch/s390/mm/ioremap.c 2003-11-26 12:43:07.000000000 -0800 +++ wli-2.6.0-test11-30/arch/s390/mm/ioremap.c 2003-12-03 18:20:41.000000000 -0800 @@ -83,7 +83,7 @@ static int remap_area_pages(unsigned lon spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_kernel(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.6.0-test11/arch/sh/kernel/process.c wli-2.6.0-test11-30/arch/sh/kernel/process.c --- linux-2.6.0-test11/arch/sh/kernel/process.c 2003-11-26 12:44:56.000000000 -0800 +++ wli-2.6.0-test11-30/arch/sh/kernel/process.c 2003-12-04 08:35:58.000000000 -0800 @@ -375,11 +375,6 @@ out: /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); -#define first_sched ((unsigned long) scheduling_functions_start_here) -#define last_sched ((unsigned long) scheduling_functions_end_here) - unsigned long get_wchan(struct task_struct *p) { unsigned long schedule_frame; diff -prauN linux-2.6.0-test11/arch/sh/kernel/semaphore.c wli-2.6.0-test11-30/arch/sh/kernel/semaphore.c --- linux-2.6.0-test11/arch/sh/kernel/semaphore.c 2003-11-26 12:45:11.000000000 -0800 +++ wli-2.6.0-test11-30/arch/sh/kernel/semaphore.c 2003-12-04 08:35:58.000000000 -0800 @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -103,7 +104,7 @@ void __up(struct semaphore *sem) tsk->state = TASK_RUNNING; \ remove_wait_queue(&sem->wait, &wait); -void __down(struct semaphore * sem) +__sched void __down(struct semaphore * sem) { DOWN_VAR DOWN_HEAD(TASK_UNINTERRUPTIBLE) @@ -113,7 +114,7 @@ void __down(struct semaphore * sem) DOWN_TAIL(TASK_UNINTERRUPTIBLE) } -int __down_interruptible(struct semaphore * sem) +__sched int __down_interruptible(struct semaphore * sem) { int ret = 0; DOWN_VAR diff -prauN linux-2.6.0-test11/arch/sh/kernel/vmlinux.lds.S wli-2.6.0-test11-30/arch/sh/kernel/vmlinux.lds.S --- linux-2.6.0-test11/arch/sh/kernel/vmlinux.lds.S 2003-11-26 12:44:20.000000000 -0800 +++ wli-2.6.0-test11-30/arch/sh/kernel/vmlinux.lds.S 2003-12-04 08:35:58.000000000 -0800 @@ -22,6 +22,9 @@ SECTIONS } = 0 .text : { *(.text) + __scheduling_functions_start_here =.; + *(.sched.text) + __scheduling_functions_end_here = .; *(.fixup) *(.gnu.warning) } = 0x0009 diff -prauN linux-2.6.0-test11/arch/sh/mm/ioremap.c wli-2.6.0-test11-30/arch/sh/mm/ioremap.c --- linux-2.6.0-test11/arch/sh/mm/ioremap.c 2003-11-26 12:44:27.000000000 -0800 +++ wli-2.6.0-test11-30/arch/sh/mm/ioremap.c 2003-12-03 18:20:41.000000000 -0800 @@ -45,7 +45,7 @@ static inline void remap_area_pte(pte_t } while (address && (address < end)); } -static inline int remap_area_pmd(pmd_t * pmd, unsigned long 
address, +static inline int remap_area_pmd(pgd_t *pgd, pmd_t * pmd, unsigned long address, unsigned long size, unsigned long phys_addr, unsigned long flags) { unsigned long end; @@ -83,11 +83,11 @@ int remap_area_pages(unsigned long addre spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_map(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; - if (remap_area_pmd(pmd, address, end - address, + if (remap_area_pmd(dir, pmd, address, end - address, phys_addr + address, flags)) break; error = 0; diff -prauN linux-2.6.0-test11/arch/sparc/kernel/process.c wli-2.6.0-test11-30/arch/sparc/kernel/process.c --- linux-2.6.0-test11/arch/sparc/kernel/process.c 2003-11-26 12:43:07.000000000 -0800 +++ wli-2.6.0-test11-30/arch/sparc/kernel/process.c 2003-12-04 08:35:58.000000000 -0800 @@ -692,9 +692,6 @@ pid_t kernel_thread(int (*fn)(void *), v return retval; } -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); - unsigned long get_wchan(struct task_struct *task) { unsigned long pc, fp, bias = 0; @@ -715,8 +712,8 @@ unsigned long get_wchan(struct task_stru break; rw = (struct reg_window *) fp; pc = rw->ins[7]; - if (pc < ((unsigned long) scheduling_functions_start_here) || - pc >= ((unsigned long) scheduling_functions_end_here)) { + if (pc < scheduling_functions_start_here || + pc >= scheduling_functions_end_here) { ret = pc; goto out; } diff -prauN linux-2.6.0-test11/arch/sparc/kernel/semaphore.c wli-2.6.0-test11-30/arch/sparc/kernel/semaphore.c --- linux-2.6.0-test11/arch/sparc/kernel/semaphore.c 2003-11-26 12:45:44.000000000 -0800 +++ wli-2.6.0-test11-30/arch/sparc/kernel/semaphore.c 2003-12-04 08:35:58.000000000 -0800 @@ -4,6 +4,7 @@ #include #include +#include #include @@ -45,7 +46,7 @@ void __up(struct semaphore *sem) static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED; -void __down(struct semaphore * sem) +__sched void __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -78,7 +79,7 @@ void __down(struct semaphore * sem) wake_up(&sem->wait); } -int __down_interruptible(struct semaphore * sem) +__sched int __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff -prauN linux-2.6.0-test11/arch/sparc/kernel/vmlinux.lds.S wli-2.6.0-test11-30/arch/sparc/kernel/vmlinux.lds.S --- linux-2.6.0-test11/arch/sparc/kernel/vmlinux.lds.S 2003-11-26 12:42:56.000000000 -0800 +++ wli-2.6.0-test11-30/arch/sparc/kernel/vmlinux.lds.S 2003-12-04 08:35:58.000000000 -0800 @@ -12,6 +12,9 @@ SECTIONS .text 0xf0004000 : { *(.text) + __scheduling_functions_start_here = .; + *(.sched.text) + __scheduling_functions_end_here = .; *(.gnu.warning) } =0 _etext = .; diff -prauN linux-2.6.0-test11/arch/sparc/lib/rwsem.S wli-2.6.0-test11-30/arch/sparc/lib/rwsem.S --- linux-2.6.0-test11/arch/sparc/lib/rwsem.S 2003-11-26 12:44:41.000000000 -0800 +++ wli-2.6.0-test11-30/arch/sparc/lib/rwsem.S 2003-12-04 08:35:58.000000000 -0800 @@ -8,7 +8,7 @@ #include #include - .text + .section .sched.text .align 4 .globl ___down_read @@ -113,6 +113,7 @@ ___down_write: ba 2b restore %l5, %g0, %g5 + .text .globl ___up_read ___up_read: rd %psr, %g3 diff -prauN linux-2.6.0-test11/arch/sparc/mm/generic.c wli-2.6.0-test11-30/arch/sparc/mm/generic.c --- linux-2.6.0-test11/arch/sparc/mm/generic.c 2003-11-26 12:45:33.000000000 -0800 +++ wli-2.6.0-test11-30/arch/sparc/mm/generic.c 2003-12-03 18:20:41.000000000 -0800 @@ -67,7 +67,7 @@ static inline 
void io_remap_pte_range(pt } while (address < end); } -static inline int io_remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, +static inline int io_remap_pmd_range(pgd_t *pgd, pmd_t * pmd, unsigned long address, unsigned long size, unsigned long offset, pgprot_t prot, int space) { unsigned long end; @@ -78,7 +78,7 @@ static inline int io_remap_pmd_range(pmd end = PGDIR_SIZE; offset -= address; do { - pte_t * pte = pte_alloc_map(current->mm, pmd, address); + pte_t * pte = pte_alloc_map(current->mm, pgd, &pmd, address); if (!pte) return -ENOMEM; io_remap_pte_range(pte, address, end - address, address + offset, prot, space); @@ -103,11 +103,11 @@ int io_remap_page_range(struct vm_area_s spin_lock(&mm->page_table_lock); while (from < end) { - pmd_t *pmd = pmd_alloc(current->mm, dir, from); + pmd_t *pmd = pmd_alloc_map(current->mm, dir, from); error = -ENOMEM; if (!pmd) break; - error = io_remap_pmd_range(pmd, from, end - from, offset + from, prot, space); + error = io_remap_pmd_range(pgd, pmd, from, end - from, offset + from, prot, space); if (error) break; from = (from + PGDIR_SIZE) & PGDIR_MASK; diff -prauN linux-2.6.0-test11/arch/sparc/mm/srmmu.c wli-2.6.0-test11-30/arch/sparc/mm/srmmu.c --- linux-2.6.0-test11/arch/sparc/mm/srmmu.c 2003-11-26 12:44:23.000000000 -0800 +++ wli-2.6.0-test11-30/arch/sparc/mm/srmmu.c 2003-12-03 18:20:41.000000000 -0800 @@ -2180,7 +2180,7 @@ void __init ld_mmu_srmmu(void) BTFIXUPSET_CALL(pte_pfn, srmmu_pte_pfn, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(pmd_page, srmmu_pmd_page, BTFIXUPCALL_NORM); - BTFIXUPSET_CALL(pgd_page, srmmu_pgd_page, BTFIXUPCALL_NORM); + BTFIXUPSET_CALL(__pgd_page, srmmu_pgd_page, BTFIXUPCALL_NORM); BTFIXUPSET_SETHI(none_mask, 0xF0000000); @@ -2212,7 +2212,7 @@ void __init ld_mmu_srmmu(void) BTFIXUPSET_CALL(pte_alloc_one_kernel, srmmu_pte_alloc_one_kernel, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(pte_alloc_one, srmmu_pte_alloc_one, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(free_pmd_fast, srmmu_pmd_free, BTFIXUPCALL_NORM); - BTFIXUPSET_CALL(pmd_alloc_one, srmmu_pmd_alloc_one, BTFIXUPCALL_NORM); + BTFIXUPSET_CALL(__pmd_alloc_one, srmmu_pmd_alloc_one, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(free_pgd_fast, srmmu_free_pgd_fast, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(get_pgd_fast, srmmu_get_pgd_fast, BTFIXUPCALL_NORM); diff -prauN linux-2.6.0-test11/arch/sparc/mm/sun4c.c wli-2.6.0-test11-30/arch/sparc/mm/sun4c.c --- linux-2.6.0-test11/arch/sparc/mm/sun4c.c 2003-11-26 12:45:06.000000000 -0800 +++ wli-2.6.0-test11-30/arch/sparc/mm/sun4c.c 2003-12-03 18:20:41.000000000 -0800 @@ -2211,7 +2211,7 @@ void __init ld_mmu_sun4c(void) BTFIXUPSET_CALL(pte_alloc_one_kernel, sun4c_pte_alloc_one_kernel, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(pte_alloc_one, sun4c_pte_alloc_one, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(free_pmd_fast, sun4c_free_pmd_fast, BTFIXUPCALL_NOP); - BTFIXUPSET_CALL(pmd_alloc_one, sun4c_pmd_alloc_one, BTFIXUPCALL_RETO0); + BTFIXUPSET_CALL(__pmd_alloc_one, sun4c_pmd_alloc_one, BTFIXUPCALL_RETO0); BTFIXUPSET_CALL(free_pgd_fast, sun4c_free_pgd_fast, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(get_pgd_fast, sun4c_get_pgd_fast, BTFIXUPCALL_NORM); @@ -2252,5 +2252,5 @@ void __init ld_mmu_sun4c(void) /* These should _never_ get called with two level tables. 
*/ BTFIXUPSET_CALL(pgd_set, sun4c_pgd_set, BTFIXUPCALL_NOP); - BTFIXUPSET_CALL(pgd_page, sun4c_pgd_page, BTFIXUPCALL_RETO0); + BTFIXUPSET_CALL(__pgd_page, sun4c_pgd_page, BTFIXUPCALL_RETO0); } diff -prauN linux-2.6.0-test11/arch/sparc64/kernel/process.c wli-2.6.0-test11-30/arch/sparc64/kernel/process.c --- linux-2.6.0-test11/arch/sparc64/kernel/process.c 2003-11-26 12:43:41.000000000 -0800 +++ wli-2.6.0-test11-30/arch/sparc64/kernel/process.c 2003-12-04 08:35:58.000000000 -0800 @@ -824,9 +824,6 @@ out: return error; } -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); - unsigned long get_wchan(struct task_struct *task) { unsigned long pc, fp, bias = 0; @@ -850,8 +847,8 @@ unsigned long get_wchan(struct task_stru break; rw = (struct reg_window *) fp; pc = rw->ins[7]; - if (pc < ((unsigned long) scheduling_functions_start_here) || - pc >= ((unsigned long) scheduling_functions_end_here)) { + if (pc < scheduling_functions_start_here || + pc >= scheduling_functions_end_here) { ret = pc; goto out; } diff -prauN linux-2.6.0-test11/arch/sparc64/kernel/semaphore.c wli-2.6.0-test11-30/arch/sparc64/kernel/semaphore.c --- linux-2.6.0-test11/arch/sparc64/kernel/semaphore.c 2003-11-26 12:45:10.000000000 -0800 +++ wli-2.6.0-test11-30/arch/sparc64/kernel/semaphore.c 2003-12-04 08:35:58.000000000 -0800 @@ -8,6 +8,7 @@ #include #include +#include /* * Atomically update sem->count. @@ -90,7 +91,7 @@ void up(struct semaphore *sem) : "g5", "g7", "memory", "cc"); } -static void __down(struct semaphore * sem) +static __sched void __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -108,7 +109,7 @@ static void __down(struct semaphore * se wake_up(&sem->wait); } -void down(struct semaphore *sem) +__sched void down(struct semaphore *sem) { might_sleep(); /* This atomically does: @@ -192,7 +193,7 @@ int down_trylock(struct semaphore *sem) return ret; } -static int __down_interruptible(struct semaphore * sem) +static __sched int __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -216,7 +217,7 @@ static int __down_interruptible(struct s return retval; } -int down_interruptible(struct semaphore *sem) +__sched int down_interruptible(struct semaphore *sem) { int ret = 0; diff -prauN linux-2.6.0-test11/arch/sparc64/kernel/smp.c wli-2.6.0-test11-30/arch/sparc64/kernel/smp.c --- linux-2.6.0-test11/arch/sparc64/kernel/smp.c 2003-11-26 12:42:41.000000000 -0800 +++ wli-2.6.0-test11-30/arch/sparc64/kernel/smp.c 2003-12-04 06:13:40.000000000 -0800 @@ -675,9 +675,9 @@ static __inline__ void __local_flush_dca #if (L1DCACHE_SIZE > PAGE_SIZE) __flush_dcache_page(page->virtual, ((tlb_type == spitfire) && - page->mapping != NULL)); + page_mapping(page) != NULL)); #else - if (page->mapping != NULL && + if (page_mapping(page) != NULL && tlb_type == spitfire) __flush_icache_page(__pa(page->virtual)); #endif @@ -698,7 +698,7 @@ void smp_flush_dcache_page_impl(struct p if (tlb_type == spitfire) { data0 = ((u64)&xcall_flush_dcache_page_spitfire); - if (page->mapping != NULL) + if (page_mapping(page) != NULL) data0 |= ((u64)1 << 32); spitfire_xcall_deliver(data0, __pa(page->virtual), @@ -731,7 +731,7 @@ void flush_dcache_page_all(struct mm_str goto flush_self; if (tlb_type == spitfire) { data0 = ((u64)&xcall_flush_dcache_page_spitfire); - if (page->mapping != NULL) + if (page_mapping(page) != NULL) data0 |= ((u64)1 << 32); spitfire_xcall_deliver(data0, __pa(page->virtual), diff -prauN 
linux-2.6.0-test11/arch/sparc64/kernel/vmlinux.lds.S wli-2.6.0-test11-30/arch/sparc64/kernel/vmlinux.lds.S --- linux-2.6.0-test11/arch/sparc64/kernel/vmlinux.lds.S 2003-11-26 12:45:27.000000000 -0800 +++ wli-2.6.0-test11-30/arch/sparc64/kernel/vmlinux.lds.S 2003-12-04 08:35:58.000000000 -0800 @@ -15,6 +15,9 @@ SECTIONS .text 0x0000000000404000 : { *(.text) + __scheduling_functions_start_here = .; + *(.sched.text) + __scheduling_functions_end_here = .; *(.gnu.warning) } =0 _etext = .; diff -prauN linux-2.6.0-test11/arch/sparc64/lib/rwsem.c wli-2.6.0-test11-30/arch/sparc64/lib/rwsem.c --- linux-2.6.0-test11/arch/sparc64/lib/rwsem.c 2003-11-26 12:42:45.000000000 -0800 +++ wli-2.6.0-test11-30/arch/sparc64/lib/rwsem.c 2003-12-04 08:35:59.000000000 -0800 @@ -6,6 +6,7 @@ #include #include +#include #include extern struct rw_semaphore *FASTCALL(rwsem_down_read_failed(struct rw_semaphore *sem)); @@ -13,7 +14,7 @@ extern struct rw_semaphore *FASTCALL(rws extern struct rw_semaphore *FASTCALL(rwsem_wake(struct rw_semaphore *)); extern struct rw_semaphore *FASTCALL(rwsem_downgrade_wake(struct rw_semaphore *)); -void __down_read(struct rw_semaphore *sem) +__sched void __down_read(struct rw_semaphore *sem) { __asm__ __volatile__( "! beginning __down_read\n" @@ -72,7 +73,7 @@ int __down_read_trylock(struct rw_semaph } EXPORT_SYMBOL(__down_read_trylock); -void __down_write(struct rw_semaphore *sem) +__sched void __down_write(struct rw_semaphore *sem) { __asm__ __volatile__( "! beginning __down_write\n\t" diff -prauN linux-2.6.0-test11/arch/sparc64/mm/generic.c wli-2.6.0-test11-30/arch/sparc64/mm/generic.c --- linux-2.6.0-test11/arch/sparc64/mm/generic.c 2003-11-26 12:45:22.000000000 -0800 +++ wli-2.6.0-test11-30/arch/sparc64/mm/generic.c 2003-12-03 18:20:41.000000000 -0800 @@ -85,7 +85,7 @@ static inline void io_remap_pte_range(pt } while (address < end); } -static inline int io_remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, +static inline int io_remap_pmd_range(pgd_t *pgd, pmd_t * pmd, unsigned long address, unsigned long size, unsigned long offset, pgprot_t prot, int space) { unsigned long end; @@ -96,7 +96,7 @@ static inline int io_remap_pmd_range(pmd end = PGDIR_SIZE; offset -= address; do { - pte_t * pte = pte_alloc_map(current->mm, pmd, address); + pte_t * pte = pte_alloc_map(current->mm, pgd, &pmd, address); if (!pte) return -ENOMEM; io_remap_pte_range(pte, address, end - address, address + offset, prot, space); @@ -122,11 +122,11 @@ int io_remap_page_range(struct vm_area_s spin_lock(&mm->page_table_lock); while (from < end) { - pmd_t *pmd = pmd_alloc(current->mm, dir, from); + pmd_t *pmd = pmd_alloc_map(current->mm, dir, from); error = -ENOMEM; if (!pmd) break; - error = io_remap_pmd_range(pmd, from, end - from, offset + from, prot, space); + error = io_remap_pmd_range(pgd, pmd, from, end - from, offset + from, prot, space); if (error) break; from = (from + PGDIR_SIZE) & PGDIR_MASK; diff -prauN linux-2.6.0-test11/arch/sparc64/mm/hugetlbpage.c wli-2.6.0-test11-30/arch/sparc64/mm/hugetlbpage.c --- linux-2.6.0-test11/arch/sparc64/mm/hugetlbpage.c 2003-11-26 12:44:43.000000000 -0800 +++ wli-2.6.0-test11-30/arch/sparc64/mm/hugetlbpage.c 2003-12-04 08:44:05.000000000 -0800 @@ -89,9 +89,11 @@ static pte_t *huge_pte_alloc(struct mm_s pgd = pgd_offset(mm, addr); if (pgd) { - pmd = pmd_alloc(mm, pgd, addr); - if (pmd) - pte = pte_alloc_map(mm, pmd, addr); + pmd = pmd_alloc_map(mm, pgd, addr); + if (pmd) { + pte = pte_alloc_map(mm, pgd, &pmd, addr); + pmd_unmap(pmd); + } } return pte; 
} @@ -104,9 +106,11 @@ static pte_t *huge_pte_offset(struct mm_ pgd = pgd_offset(mm, addr); if (pgd) { - pmd = pmd_offset(pgd, addr); - if (pmd) + pmd = pmd_offset_map(pgd, addr); + if (pmd) { pte = pte_offset_map(pmd, addr); + pmd_unmap(pmd); + } } return pte; } @@ -248,7 +252,7 @@ struct page *follow_huge_pmd(struct mm_s static void free_huge_page(struct page *page) { BUG_ON(page_count(page)); - BUG_ON(page->mapping); + BUG_ON(page_mapping(page)); INIT_LIST_HEAD(&page->list); @@ -504,7 +508,7 @@ int is_hugepage_mem_enough(size_t size) * this far. */ static struct page *hugetlb_nopage(struct vm_area_struct *vma, - unsigned long address, int unused) + unsigned long address, int *unused) { BUG(); return NULL; diff -prauN linux-2.6.0-test11/arch/sparc64/mm/init.c wli-2.6.0-test11-30/arch/sparc64/mm/init.c --- linux-2.6.0-test11/arch/sparc64/mm/init.c 2003-11-26 12:43:35.000000000 -0800 +++ wli-2.6.0-test11-30/arch/sparc64/mm/init.c 2003-12-04 06:13:40.000000000 -0800 @@ -128,9 +128,9 @@ __inline__ void flush_dcache_page_impl(s #if (L1DCACHE_SIZE > PAGE_SIZE) __flush_dcache_page(page->virtual, ((tlb_type == spitfire) && - page->mapping != NULL)); + page_mapping(page) != NULL)); #else - if (page->mapping != NULL && + if (page_mapping(page) != NULL && tlb_type == spitfire) __flush_icache_page(__pa(page->virtual)); #endif @@ -192,7 +192,7 @@ void update_mmu_cache(struct vm_area_str pfn = pte_pfn(pte); if (pfn_valid(pfn) && - (page = pfn_to_page(pfn), page->mapping) && + (page = pfn_to_page(pfn), page_mapping(page)) && ((pg_flags = page->flags) & (1UL << PG_dcache_dirty))) { int cpu = ((pg_flags >> 24) & (NR_CPUS - 1UL)); @@ -216,9 +216,9 @@ void flush_dcache_page(struct page *page int dirty = test_bit(PG_dcache_dirty, &page->flags); int dirty_cpu = dcache_dirty_cpu(page); - if (page->mapping && - list_empty(&page->mapping->i_mmap) && - list_empty(&page->mapping->i_mmap_shared)) { + if (page_mapping(page) && + list_empty(&page_mapping(page)->i_mmap) && + list_empty(&page_mapping(page)->i_mmap_shared)) { if (dirty) { if (dirty_cpu == smp_processor_id()) return; @@ -226,7 +226,7 @@ void flush_dcache_page(struct page *page } set_dcache_dirty(page); } else { - /* We could delay the flush for the !page->mapping + /* We could delay the flush for the !page_mapping(page) * case too. But that case is for exec env/arg * pages and those are %99 certainly going to get * faulted into the tlb (and thus flushed) anyways. @@ -268,7 +268,7 @@ static inline void flush_cache_pte_range if (!pfn_valid(pfn)) continue; page = pfn_to_page(pfn); - if (PageReserved(page) || !page->mapping) + if (PageReserved(page) || !page_mapping(page)) continue; pgaddr = (unsigned long) page_address(page); uaddr = address + offset; diff -prauN linux-2.6.0-test11/arch/sparc64/mm/ultra.S wli-2.6.0-test11-30/arch/sparc64/mm/ultra.S --- linux-2.6.0-test11/arch/sparc64/mm/ultra.S 2003-11-26 12:44:56.000000000 -0800 +++ wli-2.6.0-test11-30/arch/sparc64/mm/ultra.S 2003-12-04 06:13:40.000000000 -0800 @@ -615,7 +615,7 @@ xcall_flush_dcache_page_cheetah: /* %g1 .globl xcall_flush_dcache_page_spitfire xcall_flush_dcache_page_spitfire: /* %g1 == physical page address %g7 == kernel page virtual address - %g5 == (page->mapping != NULL) */ + %g5 == (page_mapping(page) != NULL) */ #if (L1DCACHE_SIZE > PAGE_SIZE) srlx %g1, (13 - 2), %g1 ! Form tag comparitor sethi %hi(L1DCACHE_SIZE), %g3 ! 
D$ size == 16K diff -prauN linux-2.6.0-test11/arch/v850/kernel/process.c wli-2.6.0-test11-30/arch/v850/kernel/process.c --- linux-2.6.0-test11/arch/v850/kernel/process.c 2003-11-26 12:43:09.000000000 -0800 +++ wli-2.6.0-test11-30/arch/v850/kernel/process.c 2003-12-04 08:35:59.000000000 -0800 @@ -203,11 +203,6 @@ int sys_execve (char *name, char **argv, /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here (void); -extern void scheduling_functions_end_here (void); -#define first_sched ((unsigned long) scheduling_functions_start_here) -#define last_sched ((unsigned long) scheduling_functions_end_here) - unsigned long get_wchan (struct task_struct *p) { #if 0 /* Barf. Figure out the stack-layout later. XXX */ @@ -221,15 +216,16 @@ unsigned long get_wchan (struct task_str /* This quite disgusting function walks up the stack, following saved return address, until it something that's out of bounds - (as defined by `first_sched' and `last_sched'). It then - returns the last PC that was in-bounds. */ + (as defined by `scheduling_functions_start_here' and + `scheduling_functions_end_here'). It then returns the last + PC that was in-bounds. */ do { - if (fp < stack_page + sizeof (struct task_struct) || - fp >= 8184+stack_page) + if (fp < stack_page + sizeof(struct thread_info) || + fp >= THREAD_SIZE + stack_page - 8) return 0; pc = ((unsigned long *)fp)[1]; - /* FIXME: This depends on the order of these functions. */ - if (pc < first_sched || pc >= last_sched) + if (pc < scheduling_functions_start_here || + pc >= scheduling_functions_end_here) return pc; fp = *(unsigned long *) fp; } while (count++ < 16); diff -prauN linux-2.6.0-test11/arch/v850/kernel/semaphore.c wli-2.6.0-test11-30/arch/v850/kernel/semaphore.c --- linux-2.6.0-test11/arch/v850/kernel/semaphore.c 2003-11-26 12:45:11.000000000 -0800 +++ wli-2.6.0-test11-30/arch/v850/kernel/semaphore.c 2003-12-04 08:35:59.000000000 -0800 @@ -15,6 +15,7 @@ #include #include +#include #include @@ -56,7 +57,7 @@ void __up(struct semaphore *sem) static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED; -void __down(struct semaphore * sem) +__sched void __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -89,7 +90,7 @@ void __down(struct semaphore * sem) wake_up(&sem->wait); } -int __down_interruptible(struct semaphore * sem) +__sched int __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff -prauN linux-2.6.0-test11/arch/v850/kernel/vmlinux.lds.S wli-2.6.0-test11-30/arch/v850/kernel/vmlinux.lds.S --- linux-2.6.0-test11/arch/v850/kernel/vmlinux.lds.S 2003-11-26 12:44:32.000000000 -0800 +++ wli-2.6.0-test11-30/arch/v850/kernel/vmlinux.lds.S 2003-12-04 08:35:59.000000000 -0800 @@ -64,6 +64,9 @@ #define TEXT_CONTENTS \ __stext = . 
; \ *(.text) \ + __scheduling_functions_start_here = .; + *(.sched.text) + __scheduling_functions_end_here = .; *(.exit.text) /* 2.5 convention */ \ *(.text.exit) /* 2.4 convention */ \ *(.text.lock) \ diff -prauN linux-2.6.0-test11/arch/x86_64/ia32/ia32_binfmt.c wli-2.6.0-test11-30/arch/x86_64/ia32/ia32_binfmt.c --- linux-2.6.0-test11/arch/x86_64/ia32/ia32_binfmt.c 2003-11-26 12:42:52.000000000 -0800 +++ wli-2.6.0-test11-30/arch/x86_64/ia32/ia32_binfmt.c 2003-12-03 19:11:55.000000000 -0800 @@ -371,7 +371,8 @@ int setup_arg_pages(struct linux_binprm struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current,page,stack_base,PAGE_COPY_EXEC); + put_dirty_page(current, mpnt, page, + stack_base, PAGE_COPY_EXEC); } stack_base += PAGE_SIZE; } diff -prauN linux-2.6.0-test11/arch/x86_64/ia32/syscall32.c wli-2.6.0-test11-30/arch/x86_64/ia32/syscall32.c --- linux-2.6.0-test11/arch/x86_64/ia32/syscall32.c 2003-11-26 12:44:33.000000000 -0800 +++ wli-2.6.0-test11-30/arch/x86_64/ia32/syscall32.c 2003-12-03 18:20:41.000000000 -0800 @@ -29,12 +29,15 @@ char *syscall32_page; and let it be handled by generic VM */ int map_syscall32(struct mm_struct *mm, unsigned long address) { + pgd_t *pgd; + pmd_t *pmd; pte_t *pte; int err = 0; down_read(&mm->mmap_sem); spin_lock(&mm->page_table_lock); - pmd_t *pmd = pmd_alloc(mm, pgd_offset(mm, address), address); - if (pmd && (pte = pte_alloc_map(mm, pmd, address)) != NULL) { + pgd = pgd_offset(mm, address); + pmd = pmd_alloc_map(mm, pgd, address); + if (pmd && (pte = pte_alloc_map(mm, pgd, &pmd, address)) != NULL) { if (pte_none(*pte)) { set_pte(pte, mk_pte(virt_to_page(syscall32_page), diff -prauN linux-2.6.0-test11/arch/x86_64/kernel/process.c wli-2.6.0-test11-30/arch/x86_64/kernel/process.c --- linux-2.6.0-test11/arch/x86_64/kernel/process.c 2003-11-26 12:44:20.000000000 -0800 +++ wli-2.6.0-test11-30/arch/x86_64/kernel/process.c 2003-12-04 08:35:59.000000000 -0800 @@ -528,11 +528,6 @@ asmlinkage long sys_vfork(struct pt_regs /* * These bracket the sleeping functions.. 
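 * After this patch the brackets are the __scheduling_functions_start_here
 * and __scheduling_functions_end_here symbols that vmlinux.lds.S now emits
 * around .sched.text, so the per-arch first_sched/last_sched macros below
 * are removed.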
*/ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); -#define first_sched ((unsigned long) scheduling_functions_start_here) -#define last_sched ((unsigned long) scheduling_functions_end_here) - unsigned long get_wchan(struct task_struct *p) { u64 fp,rip; @@ -547,14 +542,13 @@ unsigned long get_wchan(struct task_stru if (fp < (unsigned long)p || fp > (unsigned long)p+THREAD_SIZE) return 0; rip = *(u64 *)(fp+8); - if (rip < first_sched || rip >= last_sched) + if (rip < scheduling_functions_start_here || + rip >= scheduling_functions_end_here) return rip; fp = *(u64 *)fp; } while (count++ < 16); return 0; } -#undef last_sched -#undef first_sched long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) { diff -prauN linux-2.6.0-test11/arch/x86_64/kernel/semaphore.c wli-2.6.0-test11-30/arch/x86_64/kernel/semaphore.c --- linux-2.6.0-test11/arch/x86_64/kernel/semaphore.c 2003-11-26 12:46:13.000000000 -0800 +++ wli-2.6.0-test11-30/arch/x86_64/kernel/semaphore.c 2003-12-04 08:35:59.000000000 -0800 @@ -14,6 +14,7 @@ */ #include #include +#include #include #include @@ -54,7 +55,7 @@ void __up(struct semaphore *sem) wake_up(&sem->wait); } -void __down(struct semaphore * sem) +__sched void __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -91,7 +92,7 @@ void __down(struct semaphore * sem) tsk->state = TASK_RUNNING; } -int __down_interruptible(struct semaphore * sem) +__sched int __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff -prauN linux-2.6.0-test11/arch/x86_64/kernel/vmlinux.lds.S wli-2.6.0-test11-30/arch/x86_64/kernel/vmlinux.lds.S --- linux-2.6.0-test11/arch/x86_64/kernel/vmlinux.lds.S 2003-11-26 12:42:55.000000000 -0800 +++ wli-2.6.0-test11-30/arch/x86_64/kernel/vmlinux.lds.S 2003-12-04 08:35:59.000000000 -0800 @@ -14,6 +14,9 @@ SECTIONS _text = .; /* Text and read-only data */ .text : { *(.text) + __scheduling_functions_start_here = .; + *(.sched.text) + __scheduling_functions_end_here = .; *(.fixup) *(.gnu.warning) } = 0x9090 diff -prauN linux-2.6.0-test11/arch/x86_64/lib/thunk.S wli-2.6.0-test11-30/arch/x86_64/lib/thunk.S --- linux-2.6.0-test11/arch/x86_64/lib/thunk.S 2003-11-26 12:43:52.000000000 -0800 +++ wli-2.6.0-test11-30/arch/x86_64/lib/thunk.S 2003-12-04 08:35:59.000000000 -0800 @@ -30,6 +30,7 @@ .endm + .section .sched.text #ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed @@ -53,7 +54,7 @@ restore_norax: #ifdef CONFIG_SMP /* Support for read/write spinlocks. 
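 * (The rwsem thunks above were moved into .sched.text so that get_wchan()
 * treats them as scheduling functions; the spinlock helpers below are
 * returned to plain .text.)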
*/ - + .text /* rax: pointer to rwlock_t */ ENTRY(__write_lock_failed) lock diff -prauN linux-2.6.0-test11/arch/x86_64/mm/init.c wli-2.6.0-test11-30/arch/x86_64/mm/init.c --- linux-2.6.0-test11/arch/x86_64/mm/init.c 2003-11-26 12:43:24.000000000 -0800 +++ wli-2.6.0-test11-30/arch/x86_64/mm/init.c 2003-12-04 08:30:37.000000000 -0800 @@ -510,34 +510,3 @@ void __init reserve_bootmem_generic(unsi reserve_bootmem(phys, len); #endif } - -int kern_addr_valid(unsigned long addr) -{ - unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; - pml4_t *pml4; - pgd_t *pgd; - pmd_t *pmd; - pte_t *pte; - - if (above != 0 && above != -1UL) - return 0; - - pml4 = pml4_offset_k(addr); - if (pml4_none(*pml4)) - return 0; - - pgd = pgd_offset_k(addr); - if (pgd_none(*pgd)) - return 0; - - pmd = pmd_offset(pgd, addr); - if (pmd_none(*pmd)) - return 0; - if (pmd_large(*pmd)) - return pfn_valid(pmd_pfn(*pmd)); - - pte = pte_offset_kernel(pmd, addr); - if (pte_none(*pte)) - return 0; - return pfn_valid(pte_pfn(*pte)); -} diff -prauN linux-2.6.0-test11/arch/x86_64/mm/ioremap.c wli-2.6.0-test11-30/arch/x86_64/mm/ioremap.c --- linux-2.6.0-test11/arch/x86_64/mm/ioremap.c 2003-11-26 12:45:38.000000000 -0800 +++ wli-2.6.0-test11-30/arch/x86_64/mm/ioremap.c 2003-12-03 18:20:41.000000000 -0800 @@ -82,7 +82,7 @@ static int remap_area_pages(unsigned lon spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_kernel(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.6.0-test11/drivers/char/agp/alpha-agp.c wli-2.6.0-test11-30/drivers/char/agp/alpha-agp.c --- linux-2.6.0-test11/drivers/char/agp/alpha-agp.c 2003-11-26 12:44:41.000000000 -0800 +++ wli-2.6.0-test11-30/drivers/char/agp/alpha-agp.c 2003-12-04 08:43:29.000000000 -0800 @@ -13,7 +13,7 @@ static struct page *alpha_core_agp_vm_nopage(struct vm_area_struct *vma, unsigned long address, - int write_access) + int *type) { alpha_agp_info *agp = agp_bridge->dev_private_data; dma_addr_t dma_addr; @@ -30,6 +30,8 @@ static struct page *alpha_core_agp_vm_no */ page = virt_to_page(__va(pa)); get_page(page); + if (type) + *type = VM_FAULT_MINOR; return page; } diff -prauN linux-2.6.0-test11/drivers/char/drm/drmP.h wli-2.6.0-test11-30/drivers/char/drm/drmP.h --- linux-2.6.0-test11/drivers/char/drm/drmP.h 2003-11-26 12:44:58.000000000 -0800 +++ wli-2.6.0-test11-30/drivers/char/drm/drmP.h 2003-12-04 08:43:29.000000000 -0800 @@ -760,16 +760,16 @@ extern int DRM(fasync)(int fd, stru /* Mapping support (drm_vm.h) */ extern struct page *DRM(vm_nopage)(struct vm_area_struct *vma, unsigned long address, - int write_access); + int *type); extern struct page *DRM(vm_shm_nopage)(struct vm_area_struct *vma, unsigned long address, - int write_access); + int *type); extern struct page *DRM(vm_dma_nopage)(struct vm_area_struct *vma, unsigned long address, - int write_access); + int *type); extern struct page *DRM(vm_sg_nopage)(struct vm_area_struct *vma, unsigned long address, - int write_access); + int *type); extern void DRM(vm_open)(struct vm_area_struct *vma); extern void DRM(vm_close)(struct vm_area_struct *vma); extern void DRM(vm_shm_close)(struct vm_area_struct *vma); diff -prauN linux-2.6.0-test11/drivers/char/drm/drm_memory.h wli-2.6.0-test11-30/drivers/char/drm/drm_memory.h --- linux-2.6.0-test11/drivers/char/drm/drm_memory.h 2003-11-26 12:43:09.000000000 -0800 +++ wli-2.6.0-test11-30/drivers/char/drm/drm_memory.h 2003-12-03 18:20:41.000000000 -0800 @@ -125,7 +125,7 @@ static inline unsigned long 
drm_follow_page (void *vaddr) { pgd_t *pgd = pgd_offset_k((unsigned long) vaddr); - pmd_t *pmd = pmd_offset(pgd, (unsigned long) vaddr); + pmd_t *pmd = pmd_offset_kernel(pgd, (unsigned long)vaddr); pte_t *ptep = pte_offset_kernel(pmd, (unsigned long) vaddr); return pte_pfn(*ptep) << PAGE_SHIFT; } diff -prauN linux-2.6.0-test11/drivers/char/drm/drm_vm.h wli-2.6.0-test11-30/drivers/char/drm/drm_vm.h --- linux-2.6.0-test11/drivers/char/drm/drm_vm.h 2003-11-26 12:46:08.000000000 -0800 +++ wli-2.6.0-test11-30/drivers/char/drm/drm_vm.h 2003-12-04 08:43:29.000000000 -0800 @@ -76,7 +76,7 @@ struct vm_operations_struct DRM(vm_sg_ */ struct page *DRM(vm_nopage)(struct vm_area_struct *vma, unsigned long address, - int write_access) + int *type) { #if __REALLY_HAVE_AGP drm_file_t *priv = vma->vm_file->private_data; @@ -133,6 +133,8 @@ struct page *DRM(vm_nopage)(struct vm_ar baddr, __va(agpmem->memory->memory[offset]), offset, atomic_read(&page->count)); + if (type) + *type = VM_FAULT_MINOR; return page; } vm_nopage_error: @@ -154,7 +156,7 @@ vm_nopage_error: */ struct page *DRM(vm_shm_nopage)(struct vm_area_struct *vma, unsigned long address, - int write_access) + int *type) { drm_map_t *map = (drm_map_t *)vma->vm_private_data; unsigned long offset; @@ -170,6 +172,8 @@ struct page *DRM(vm_shm_nopage)(struct v if (!page) return NOPAGE_OOM; get_page(page); + if (type) + *type = VM_FAULT_MINOR; DRM_DEBUG("shm_nopage 0x%lx\n", address); return page; @@ -268,7 +272,7 @@ void DRM(vm_shm_close)(struct vm_area_st */ struct page *DRM(vm_dma_nopage)(struct vm_area_struct *vma, unsigned long address, - int write_access) + int *type) { drm_file_t *priv = vma->vm_file->private_data; drm_device_t *dev = priv->dev; @@ -287,6 +291,8 @@ struct page *DRM(vm_dma_nopage)(struct v (offset & (~PAGE_MASK)))); get_page(page); + if (type) + *type = VM_FAULT_MINOR; DRM_DEBUG("dma_nopage 0x%lx (page %lu)\n", address, page_nr); return page; @@ -304,7 +310,7 @@ struct page *DRM(vm_dma_nopage)(struct v */ struct page *DRM(vm_sg_nopage)(struct vm_area_struct *vma, unsigned long address, - int write_access) + int *type) { drm_map_t *map = (drm_map_t *)vma->vm_private_data; drm_file_t *priv = vma->vm_file->private_data; @@ -325,6 +331,8 @@ struct page *DRM(vm_sg_nopage)(struct vm page_offset = (offset >> PAGE_SHIFT) + (map_offset >> PAGE_SHIFT); page = entry->pagelist[page_offset]; get_page(page); + if (type) + *type = VM_FAULT_MINOR; return page; } diff -prauN linux-2.6.0-test11/drivers/char/tty_io.c wli-2.6.0-test11-30/drivers/char/tty_io.c --- linux-2.6.0-test11/drivers/char/tty_io.c 2003-11-26 12:44:26.000000000 -0800 +++ wli-2.6.0-test11-30/drivers/char/tty_io.c 2003-12-03 18:50:57.000000000 -0800 @@ -479,8 +479,7 @@ void do_tty_hangup(void *data) read_lock(&tasklist_lock); if (tty->session > 0) { - struct list_head *l; - for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) { + for_each_task_pid(tty->session, PIDTYPE_SID, p, pid) { if (p->tty == tty) p->tty = NULL; if (!p->leader) @@ -560,8 +559,7 @@ EXPORT_SYMBOL(tty_hung_up_p); void disassociate_ctty(int on_exit) { struct tty_struct *tty; - struct task_struct *p; - struct list_head *l; + task_t *p; struct pid *pid; int tty_pgrp = -1; @@ -591,7 +589,7 @@ void disassociate_ctty(int on_exit) tty->pgrp = -1; read_lock(&tasklist_lock); - for_each_task_pid(current->session, PIDTYPE_SID, p, l, pid) + for_each_task_pid(current->session, PIDTYPE_SID, p, pid) p->tty = NULL; read_unlock(&tasklist_lock); unlock_kernel(); @@ -1214,15 +1212,14 @@ static void release_dev(struct file * fi 
* tty. */ if (tty_closing || o_tty_closing) { - struct task_struct *p; - struct list_head *l; + task_t *p; struct pid *pid; read_lock(&tasklist_lock); - for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) + for_each_task_pid(tty->session, PIDTYPE_SID, p, pid) p->tty = NULL; if (o_tty) - for_each_task_pid(o_tty->session, PIDTYPE_SID, p,l, pid) + for_each_task_pid(o_tty->session, PIDTYPE_SID, p, pid) p->tty = NULL; read_unlock(&tasklist_lock); } @@ -1539,7 +1536,6 @@ static int fionbio(struct file *file, in static int tiocsctty(struct tty_struct *tty, int arg) { - struct list_head *l; struct pid *pid; task_t *p; @@ -1563,7 +1559,7 @@ static int tiocsctty(struct tty_struct * */ read_lock(&tasklist_lock); - for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) + for_each_task_pid(tty->session, PIDTYPE_SID, p, pid) p->tty = NULL; read_unlock(&tasklist_lock); } else @@ -1869,8 +1865,7 @@ static void __do_SAK(void *arg) tty_hangup(tty); #else struct tty_struct *tty = arg; - struct task_struct *p; - struct list_head *l; + task_t *p; struct pid *pid; int session; int i; @@ -1884,7 +1879,7 @@ static void __do_SAK(void *arg) if (tty->driver->flush_buffer) tty->driver->flush_buffer(tty); read_lock(&tasklist_lock); - for_each_task_pid(session, PIDTYPE_SID, p, l, pid) { + for_each_task_pid(session, PIDTYPE_SID, p, pid) { if (p->tty == tty || session > 0) { printk(KERN_NOTICE "SAK: killed process %d" " (%s): p->session==tty->session\n", diff -prauN linux-2.6.0-test11/drivers/ieee1394/dma.c wli-2.6.0-test11-30/drivers/ieee1394/dma.c --- linux-2.6.0-test11/drivers/ieee1394/dma.c 2003-11-26 12:44:12.000000000 -0800 +++ wli-2.6.0-test11-30/drivers/ieee1394/dma.c 2003-12-04 08:43:29.000000000 -0800 @@ -187,7 +187,7 @@ void dma_region_sync(struct dma_region * /* nopage() handler for mmap access */ static struct page* -dma_region_pagefault(struct vm_area_struct *area, unsigned long address, int write_access) +dma_region_pagefault(struct vm_area_struct *area, unsigned long address, int *type) { unsigned long offset; unsigned long kernel_virt_addr; @@ -202,6 +202,8 @@ dma_region_pagefault(struct vm_area_stru (address > (unsigned long) area->vm_start + (PAGE_SIZE * dma->n_pages)) ) goto out; + if (type) + *type = VM_FAULT_MINOR; offset = address - area->vm_start; kernel_virt_addr = (unsigned long) dma->kvirt + offset; ret = vmalloc_to_page((void*) kernel_virt_addr); diff -prauN linux-2.6.0-test11/drivers/media/video/video-buf.c wli-2.6.0-test11-30/drivers/media/video/video-buf.c --- linux-2.6.0-test11/drivers/media/video/video-buf.c 2003-11-26 12:44:08.000000000 -0800 +++ wli-2.6.0-test11-30/drivers/media/video/video-buf.c 2003-12-04 08:43:29.000000000 -0800 @@ -1078,7 +1078,7 @@ videobuf_vm_close(struct vm_area_struct */ static struct page* videobuf_vm_nopage(struct vm_area_struct *vma, unsigned long vaddr, - int write_access) + int *type) { struct page *page; @@ -1090,6 +1090,8 @@ videobuf_vm_nopage(struct vm_area_struct if (!page) return NOPAGE_OOM; clear_user_page(page_address(page), vaddr, page); + if (type) + *type = VM_FAULT_MINOR; return page; } diff -prauN linux-2.6.0-test11/drivers/scsi/sg.c wli-2.6.0-test11-30/drivers/scsi/sg.c --- linux-2.6.0-test11/drivers/scsi/sg.c 2003-11-26 12:43:52.000000000 -0800 +++ wli-2.6.0-test11-30/drivers/scsi/sg.c 2003-12-04 08:43:29.000000000 -0800 @@ -1118,7 +1118,7 @@ sg_rb_correct4mmap(Sg_scatter_hold * rsv } static struct page * -sg_vma_nopage(struct vm_area_struct *vma, unsigned long addr, int unused) +sg_vma_nopage(struct vm_area_struct *vma, unsigned long 
addr, int *type) { Sg_fd *sfp; struct page *page = NOPAGE_SIGBUS; @@ -1158,6 +1158,8 @@ sg_vma_nopage(struct vm_area_struct *vma page = virt_to_page(page_ptr); get_page(page); /* increment page count */ } + if (type) + *type = VM_FAULT_MINOR; return page; } diff -prauN linux-2.6.0-test11/fs/adfs/inode.c wli-2.6.0-test11-30/fs/adfs/inode.c --- linux-2.6.0-test11/fs/adfs/inode.c 2003-11-26 12:43:31.000000000 -0800 +++ wli-2.6.0-test11-30/fs/adfs/inode.c 2003-12-04 06:13:40.000000000 -0800 @@ -63,7 +63,7 @@ static int adfs_readpage(struct file *fi static int adfs_prepare_write(struct file *file, struct page *page, unsigned int from, unsigned int to) { return cont_prepare_write(page, from, to, adfs_get_block, - &ADFS_I(page->mapping->host)->mmu_private); + &ADFS_I(page_mapping(page)->host)->mmu_private); } static sector_t _adfs_bmap(struct address_space *mapping, sector_t block) diff -prauN linux-2.6.0-test11/fs/affs/file.c wli-2.6.0-test11-30/fs/affs/file.c --- linux-2.6.0-test11/fs/affs/file.c 2003-11-26 12:44:02.000000000 -0800 +++ wli-2.6.0-test11-30/fs/affs/file.c 2003-12-04 06:13:40.000000000 -0800 @@ -418,7 +418,7 @@ static int affs_readpage(struct file *fi static int affs_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { return cont_prepare_write(page, from, to, affs_get_block, - &AFFS_I(page->mapping->host)->mmu_private); + &AFFS_I(page_mapping(page)->host)->mmu_private); } static sector_t _affs_bmap(struct address_space *mapping, sector_t block) { @@ -508,7 +508,7 @@ affs_file_write(struct file *file, const static int affs_do_readpage_ofs(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct super_block *sb = inode->i_sb; struct buffer_head *bh; char *data; @@ -616,7 +616,7 @@ out: static int affs_readpage_ofs(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; u32 to; int err; @@ -636,7 +636,7 @@ affs_readpage_ofs(struct file *file, str static int affs_prepare_write_ofs(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; u32 size, offset; u32 tmp; int err = 0; @@ -677,7 +677,7 @@ static int affs_prepare_write_ofs(struct static int affs_commit_write_ofs(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct super_block *sb = inode->i_sb; struct buffer_head *bh, *prev_bh; char *data; diff -prauN linux-2.6.0-test11/fs/affs/symlink.c wli-2.6.0-test11-30/fs/affs/symlink.c --- linux-2.6.0-test11/fs/affs/symlink.c 2003-11-26 12:43:24.000000000 -0800 +++ wli-2.6.0-test11-30/fs/affs/symlink.c 2003-12-04 06:13:40.000000000 -0800 @@ -20,7 +20,7 @@ static int affs_symlink_readpage(struct file *file, struct page *page) { struct buffer_head *bh; - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; char *link = kmap(page); struct slink_front *lf; int err; diff -prauN linux-2.6.0-test11/fs/afs/file.c wli-2.6.0-test11-30/fs/afs/file.c --- linux-2.6.0-test11/fs/afs/file.c 2003-11-26 12:44:58.000000000 -0800 +++ wli-2.6.0-test11-30/fs/afs/file.c 2003-12-04 06:31:49.000000000 -0800 @@ -51,7 +51,7 @@ struct file_operations afs_file_file_ope struct address_space_operations afs_fs_aops = { .readpage = 
afs_file_readpage, .sync_page = block_sync_page, - .set_page_dirty = __set_page_dirty_nobuffers, + .set_page_dirty = set_page_dirty_nobuffers, .releasepage = afs_file_releasepage, .invalidatepage = afs_file_invalidatepage, }; @@ -119,7 +119,7 @@ static int afs_file_readpage(struct file afs_vnode_t *vnode; int ret; - inode = page->mapping->host; + inode = page_mapping(page)->host; _enter("{%lu},{%lu}",inode->i_ino,page->index); @@ -242,7 +242,7 @@ static int afs_file_invalidatepage(struc BUG_ON(!PageLocked(page)); if (PagePrivate(page)) { #ifdef AFS_CACHING_SUPPORT - struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); + struct afs_vnode *vnode = AFS_FS_I(page_mapping(page)->host); cachefs_uncache_page(vnode->cache,page); #endif @@ -256,7 +256,7 @@ static int afs_file_invalidatepage(struc ret = 0; if (!PageWriteback(page)) - ret = page->mapping->a_ops->releasepage(page, 0); + ret = page_mapping(page)->a_ops->releasepage(page, 0); } } @@ -276,7 +276,7 @@ static int afs_file_releasepage(struct p if (PagePrivate(page)) { #ifdef AFS_CACHING_SUPPORT - struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); + struct afs_vnode *vnode = AFS_FS_I(page_mapping(page)->host); cachefs_uncache_page(vnode->cache,page); #endif diff -prauN linux-2.6.0-test11/fs/binfmt_elf.c wli-2.6.0-test11-30/fs/binfmt_elf.c --- linux-2.6.0-test11/fs/binfmt_elf.c 2003-11-26 12:43:51.000000000 -0800 +++ wli-2.6.0-test11-30/fs/binfmt_elf.c 2003-12-04 07:34:23.000000000 -0800 @@ -7,6 +7,7 @@ * Tools". * * Copyright 1993, 1994: Eric Youngdale (ericy@cais.com). + * Top-down vma allocation support, William Irwin, IBM, 2003 */ #include @@ -329,8 +330,13 @@ static unsigned long load_elf_interp(str if (retval < 0) goto out_close; +#ifndef CONFIG_MMAP_TOPDOWN eppnt = elf_phdata; for (i=0; i<interp_elf_ex->e_phnum; i++, eppnt++) { +#else + eppnt = &elf_phdata[interp_elf_ex->e_phnum - 1]; + for (i = interp_elf_ex->e_phnum - 1; i >= 0; --i, --eppnt) { +#endif if (eppnt->p_type == PT_LOAD) { int elf_type = MAP_PRIVATE | MAP_DENYWRITE; int elf_prot = 0; @@ -344,7 +350,8 @@ static unsigned long load_elf_interp(str if (interp_elf_ex->e_type == ET_EXEC || load_addr_set) elf_type |= MAP_FIXED; - map_addr = elf_map(interpreter, load_addr + vaddr, eppnt, elf_prot, elf_type); + map_addr = load_addr_set ? load_addr + vaddr : 0; + map_addr = elf_map(interpreter, map_addr, eppnt, elf_prot, elf_type); if (BAD_ADDR(map_addr)) goto out_close; diff -prauN linux-2.6.0-test11/fs/buffer.c wli-2.6.0-test11-30/fs/buffer.c --- linux-2.6.0-test11/fs/buffer.c 2003-11-26 12:44:23.000000000 -0800 +++ wli-2.6.0-test11-30/fs/buffer.c 2003-12-04 08:09:12.000000000 -0800 @@ -46,7 +46,7 @@ static void invalidate_bh_lrus(void); /* * Hashed waitqueue_head's for wait_on_buffer() */ -#define BH_WAIT_TABLE_ORDER 7 +#define BH_WAIT_TABLE_ORDER 12 static struct bh_wait_queue_head { wait_queue_head_t wqh; } ____cacheline_aligned_in_smp bh_wait_queue_heads[1<<BH_WAIT_TABLE_ORDER]; [...] bdevname(bh->b_bdev, b)); - set_bit(AS_EIO, &page->mapping->flags); + set_bit(AS_EIO, &page_mapping(page)->flags); clear_buffer_uptodate(bh); SetPageError(page); } @@ -790,7 +790,7 @@ void write_boundary_block(struct block_d void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) { struct address_space *mapping = inode->i_mapping; - struct address_space *buffer_mapping = bh->b_page->mapping; + struct address_space *buffer_mapping = page_mapping(bh->b_page); mark_buffer_dirty(bh); if (!mapping->assoc_mapping) { @@ -835,19 +835,10 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * * FIXME: may need to call ->reservepage here as well.
That's rather up to the * address_space though. - * - * For now, we treat swapper_space specially. It doesn't use the normal - * block a_ops. */ -int __set_page_dirty_buffers(struct page *page) +int set_page_dirty_buffers(struct page *page) { - struct address_space * const mapping = page->mapping; - int ret = 0; - - if (mapping == NULL) { - SetPageDirty(page); - goto out; - } + struct address_space * const mapping = page_mapping(page); spin_lock(&mapping->private_lock); if (page_has_buffers(page)) { @@ -865,21 +856,19 @@ int __set_page_dirty_buffers(struct page spin_unlock(&mapping->private_lock); if (!TestSetPageDirty(page)) { - spin_lock(&mapping->page_lock); - if (page->mapping) { /* Race with truncate? */ + mapping_wrlock(&mapping->page_lock); + if (page_mapping(page)) { /* Race with truncate? */ if (!mapping->backing_dev_info->memory_backed) inc_page_state(nr_dirty); list_del(&page->list); list_add(&page->list, &mapping->dirty_pages); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } - -out: - return ret; + return 0; } -EXPORT_SYMBOL(__set_page_dirty_buffers); +EXPORT_SYMBOL(set_page_dirty_buffers); /* * Write out and wait upon a list of buffers. @@ -1251,7 +1240,7 @@ __getblk_slow(struct block_device *bdev, * address_space's dirty_pages list and then attach the address_space's * inode to its superblock's dirty inode list. * - * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, + * mark_buffer_dirty() is atomic. It takes page_mapping(bh->b_page)->private_lock, * mapping->page_lock and the global inode_lock. */ void mark_buffer_dirty(struct buffer_head *bh) @@ -1259,7 +1248,7 @@ void mark_buffer_dirty(struct buffer_hea if (!buffer_uptodate(bh)) buffer_error(); if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh)) - __set_page_dirty_nobuffers(bh->b_page); + set_page_dirty_nobuffers(bh->b_page); } /* @@ -1287,7 +1276,7 @@ void __bforget(struct buffer_head *bh) { clear_buffer_dirty(bh); if (!list_empty(&bh->b_assoc_buffers)) { - struct address_space *buffer_mapping = bh->b_page->mapping; + struct address_space *buffer_mapping = page_mapping(bh->b_page); spin_lock(&buffer_mapping->private_lock); list_del_init(&bh->b_assoc_buffers); @@ -1574,7 +1563,7 @@ static inline void discard_buffer(struct */ int try_to_release_page(struct page *page, int gfp_mask) { - struct address_space * const mapping = page->mapping; + struct address_space * const mapping = page_mapping(page); if (!PageLocked(page)) BUG(); @@ -1640,7 +1629,7 @@ EXPORT_SYMBOL(block_invalidatepage); /* * We attach and possibly dirty the buffers atomically wrt - * __set_page_dirty_buffers() via private_lock. try_to_free_buffers + * set_page_dirty_buffers() via private_lock. try_to_free_buffers * is already excluded via the page lock. */ void create_empty_buffers(struct page *page, @@ -1657,7 +1646,7 @@ void create_empty_buffers(struct page *p } while (bh); tail->b_this_page = head; - spin_lock(&page->mapping->private_lock); + spin_lock(&page_mapping(page)->private_lock); if (PageUptodate(page) || PageDirty(page)) { bh = head; do { @@ -1669,7 +1658,7 @@ void create_empty_buffers(struct page *p } while (bh != head); } __set_page_buffers(page, head); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page_mapping(page)->private_lock); } EXPORT_SYMBOL(create_empty_buffers); @@ -1753,12 +1742,12 @@ static int __block_write_full_page(struc } /* - * Be very careful. 
We have no exclusion from __set_page_dirty_buffers + * Be very careful. We have no exclusion from set_page_dirty_buffers * here, and the (potentially unmapped) buffers may become dirty at * any time. If a buffer becomes dirty here after we've inspected it * then we just miss that fact, and the page stays dirty. * - * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; + * Buffers outside i_size may be dirtied by set_page_dirty_buffers; * handle that here by just cleaning them. */ @@ -1809,7 +1798,7 @@ static int __block_write_full_page(struc lock_buffer(bh); } else { if (test_set_buffer_locked(bh)) { - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); continue; } } @@ -2062,7 +2051,7 @@ static int __block_commit_write(struct i */ int block_read_full_page(struct page *page, get_block_t *get_block) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; sector_t iblock, lblock; struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; unsigned int blocksize; @@ -2202,7 +2191,7 @@ out: int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, loff_t *bytes) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); struct inode *inode = mapping->host; struct page *new_page; unsigned long pgpos; @@ -2284,7 +2273,7 @@ out: int block_prepare_write(struct page *page, unsigned from, unsigned to, get_block_t *get_block) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int err = __block_prepare_write(inode, page, from, to, get_block); if (err) ClearPageUptodate(page); @@ -2293,7 +2282,7 @@ int block_prepare_write(struct page *pag int block_commit_write(struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; __block_commit_write(inode,page,from,to); return 0; } @@ -2301,7 +2290,7 @@ int block_commit_write(struct page *page int generic_commit_write(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; __block_commit_write(inode,page,from,to); /* @@ -2322,7 +2311,7 @@ int generic_commit_write(struct file *fi int nobh_prepare_write(struct page *page, unsigned from, unsigned to, get_block_t *get_block) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; const unsigned blkbits = inode->i_blkbits; const unsigned blocksize = 1 << blkbits; struct buffer_head map_bh; @@ -2456,7 +2445,7 @@ EXPORT_SYMBOL(nobh_prepare_write); int nobh_commit_write(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; set_page_dirty(page); @@ -2590,7 +2579,7 @@ out: int block_write_full_page(struct page *page, get_block_t *get_block, struct writeback_control *wbc) { - struct inode * const inode = page->mapping->host; + struct inode * const inode = page_mapping(page)->host; loff_t i_size = i_size_read(inode); const unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; unsigned offset; @@ -2769,9 +2758,9 @@ void sync_dirty_buffer(struct buffer_hea static void check_ttfb_buffer(struct page *page, struct buffer_head *bh) { if (!buffer_uptodate(bh) && !buffer_req(bh)) { - 
if (PageUptodate(page) && page->mapping + if (PageUptodate(page) && page_mapping(page) && buffer_mapped(bh) /* discard_buffer */ - && S_ISBLK(page->mapping->host->i_mode)) + && S_ISBLK(page_mapping(page)->host->i_mode)) { buffer_error(); } @@ -2793,7 +2782,7 @@ static void check_ttfb_buffer(struct pag * * The same applies to regular filesystem pages: if all the buffers are * clean then we set the page clean and proceed. To do that, we require - * total exclusion from __set_page_dirty_buffers(). That is obtained with + * total exclusion from set_page_dirty_buffers(). That is obtained with * private_lock. * * try_to_free_buffers() is non-blocking. @@ -2815,7 +2804,7 @@ drop_buffers(struct page *page, struct b do { check_ttfb_buffer(page, bh); if (buffer_write_io_error(bh)) - set_bit(AS_EIO, &page->mapping->flags); + set_bit(AS_EIO, &page_mapping(page)->flags); if (buffer_busy(bh)) goto failed; if (!buffer_uptodate(bh) && !buffer_req(bh)) @@ -2842,7 +2831,7 @@ failed: int try_to_free_buffers(struct page *page) { - struct address_space * const mapping = page->mapping; + struct address_space * const mapping = page_mapping(page); struct buffer_head *buffers_to_free = NULL; int ret = 0; diff -prauN linux-2.6.0-test11/fs/cifs/file.c wli-2.6.0-test11-30/fs/cifs/file.c --- linux-2.6.0-test11/fs/cifs/file.c 2003-11-26 12:42:43.000000000 -0800 +++ wli-2.6.0-test11-30/fs/cifs/file.c 2003-12-04 06:25:26.000000000 -0800 @@ -593,14 +593,14 @@ cifs_write(struct file * file, const cha static int cifs_partialpagewrite(struct page *page,unsigned from, unsigned to) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT; char * write_data; int rc = -EFAULT; int bytes_written = 0; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct cifsInodeInfo *cifsInode; struct cifsFileInfo *open_file = NULL; struct list_head *tmp; @@ -725,7 +725,7 @@ cifs_commit_write(struct file *file, str { int xid; int rc = 0; - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; loff_t position = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; struct cifsFileInfo *open_file; struct cifs_sb_info *cifs_sb; @@ -779,7 +779,7 @@ cifs_sync_page(struct page *page) int rc = 0; cFYI(1,("sync page %p",page)); - mapping = page->mapping; + mapping = page_mapping(page); if (!mapping) return 0; inode = mapping->host; @@ -898,11 +898,11 @@ static void cifs_copy_cache_pages(struct if(list_empty(pages)) break; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); page = list_entry(pages->prev, struct page, list); list_del(&page->list); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { page_cache_release(page); @@ -962,14 +962,14 @@ cifs_readpages(struct file *file, struct pagevec_init(&lru_pvec, 0); for(i = 0;i<num_pages;) { - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); if(list_empty(page_list)) { - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); break; } page = list_entry(page_list->prev, struct page, list); offset = (loff_t)page->index << PAGE_CACHE_SHIFT; - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); /* for reads over a certain size could initiate async read ahead */ @@ -989,12 +989,12 @@ cifs_readpages(struct file *file, struct cFYI(1,("Read
error in readpages: %d",rc)); /* clean up remaing pages off list */ - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); while (!list_empty(page_list) && (i < num_pages)) { page = list_entry(page_list->prev, struct page, list); list_del(&page->list); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); break; } else if (bytes_read > 0) { pSMBr = (struct smb_com_read_rsp *)smb_read_data; diff -prauN linux-2.6.0-test11/fs/coda/symlink.c wli-2.6.0-test11-30/fs/coda/symlink.c --- linux-2.6.0-test11/fs/coda/symlink.c 2003-11-26 12:43:25.000000000 -0800 +++ wli-2.6.0-test11-30/fs/coda/symlink.c 2003-12-04 06:13:40.000000000 -0800 @@ -24,7 +24,7 @@ static int coda_symlink_filler(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int error; struct coda_inode_info *cii; unsigned int len = PAGE_SIZE; diff -prauN linux-2.6.0-test11/fs/cramfs/inode.c wli-2.6.0-test11-30/fs/cramfs/inode.c --- linux-2.6.0-test11/fs/cramfs/inode.c 2003-11-26 12:42:54.000000000 -0800 +++ wli-2.6.0-test11-30/fs/cramfs/inode.c 2003-12-04 06:13:40.000000000 -0800 @@ -410,7 +410,7 @@ static struct dentry * cramfs_lookup(str static int cramfs_readpage(struct file *file, struct page * page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; u32 maxblock, bytes_filled; void *pgdata; diff -prauN linux-2.6.0-test11/fs/dcache.c wli-2.6.0-test11-30/fs/dcache.c --- linux-2.6.0-test11/fs/dcache.c 2003-11-26 12:43:06.000000000 -0800 +++ wli-2.6.0-test11-30/fs/dcache.c 2003-12-04 08:30:37.000000000 -0800 @@ -1040,56 +1040,6 @@ struct dentry * __d_lookup(struct dentry return found; } -/** - * d_validate - verify dentry provided from insecure source - * @dentry: The dentry alleged to be valid child of @dparent - * @dparent: The parent dentry (known to be valid) - * @hash: Hash of the dentry - * @len: Length of the name - * - * An insecure source has sent us a dentry, here we verify it and dget() it. - * This is used by ncpfs in its readdir implementation. - * Zero is returned in the dentry is invalid. 
- */ - -int d_validate(struct dentry *dentry, struct dentry *dparent) -{ - unsigned long dent_addr = (unsigned long) dentry; - unsigned long min_addr = PAGE_OFFSET; - unsigned long align_mask = 0x0F; - struct hlist_head *base; - struct hlist_node *lhp; - - if (dent_addr < min_addr) - goto out; - if (dent_addr > (unsigned long)high_memory - sizeof(struct dentry)) - goto out; - if (dent_addr & align_mask) - goto out; - if ((!kern_addr_valid(dent_addr)) || (!kern_addr_valid(dent_addr -1 + - sizeof(struct dentry)))) - goto out; - - if (dentry->d_parent != dparent) - goto out; - - spin_lock(&dcache_lock); - base = d_hash(dparent, dentry->d_name.hash); - hlist_for_each(lhp,base) { - /* read_barrier_depends() not required for d_hash list - * as it is parsed under dcache_lock - */ - if (dentry == hlist_entry(lhp, struct dentry, d_hash)) { - __dget_locked(dentry); - spin_unlock(&dcache_lock); - return 1; - } - } - spin_unlock(&dcache_lock); -out: - return 0; -} - /* * When a file is deleted, we have two options: * - turn this dentry into a negative dentry @@ -1655,7 +1605,6 @@ EXPORT_SYMBOL(d_path); EXPORT_SYMBOL(d_prune_aliases); EXPORT_SYMBOL(d_rehash); EXPORT_SYMBOL(d_splice_alias); -EXPORT_SYMBOL(d_validate); EXPORT_SYMBOL(dget_locked); EXPORT_SYMBOL(dput); EXPORT_SYMBOL(find_inode_number); diff -prauN linux-2.6.0-test11/fs/efs/symlink.c wli-2.6.0-test11-30/fs/efs/symlink.c --- linux-2.6.0-test11/fs/efs/symlink.c 2003-11-26 12:44:25.000000000 -0800 +++ wli-2.6.0-test11-30/fs/efs/symlink.c 2003-12-04 06:13:40.000000000 -0800 @@ -16,7 +16,7 @@ static int efs_symlink_readpage(struct f { char *link = kmap(page); struct buffer_head * bh; - struct inode * inode = page->mapping->host; + struct inode * inode = page_mapping(page)->host; efs_block_t size = inode->i_size; int err; diff -prauN linux-2.6.0-test11/fs/exec.c wli-2.6.0-test11-30/fs/exec.c --- linux-2.6.0-test11/fs/exec.c 2003-11-26 12:43:36.000000000 -0800 +++ wli-2.6.0-test11-30/fs/exec.c 2003-12-04 07:19:00.000000000 -0800 @@ -44,7 +44,7 @@ #include #include #include -#include +#include #include #include @@ -189,6 +189,26 @@ static int count(char __user * __user * return i; } +static inline size_t exec_copy_from_user(struct page *page, + unsigned long offset, + const char __user *buf, + unsigned bytes) +{ + int left; + char *kaddr; + + kaddr = kmap_atomic(page, KM_USER0); + left = __copy_from_user(kaddr + offset, buf, bytes); + kunmap_atomic(kaddr, KM_USER0); + + if (left) { + kaddr = kmap(page); + left = __copy_from_user(kaddr + offset, buf, bytes); + kunmap(page); + } + return left; +} + /* * 'copy_strings()' copies argument/environment strings from user * memory to free pages in kernel mem. 
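 * (The rewrite below drops the cached kmap() of the most recently touched
 * page in favor of short-lived kmap_atomic() mappings, pre-faulting the
 * user buffer and falling back to a sleeping kmap() copy only when the
 * atomic copy comes up short.)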
These are in a format ready @@ -196,8 +216,6 @@ static int count(char __user * __user * */ int copy_strings(int argc,char __user * __user * argv, struct linux_binprm *bprm) { - struct page *kmapped_page = NULL; - char *kaddr = NULL; int ret; while (argc-- > 0) { @@ -224,6 +242,7 @@ int copy_strings(int argc,char __user * int i, new, err; int offset, bytes_to_copy; struct page *page; + char *kaddr = NULL; offset = pos % PAGE_SIZE; i = pos/PAGE_SIZE; @@ -239,22 +258,26 @@ int copy_strings(int argc,char __user * new = 1; } - if (page != kmapped_page) { - if (kmapped_page) - kunmap(kmapped_page); - kmapped_page = page; - kaddr = kmap(kmapped_page); - } + bytes_to_copy = PAGE_SIZE - offset; + + if ((new && offset) || bytes_to_copy > len) + kaddr = kmap_atomic(page, KM_USER0); + if (new && offset) memset(kaddr, 0, offset); - bytes_to_copy = PAGE_SIZE - offset; + if (bytes_to_copy > len) { bytes_to_copy = len; if (new) memset(kaddr+offset+len, 0, PAGE_SIZE-offset-len); } - err = copy_from_user(kaddr+offset, str, bytes_to_copy); + + if (kaddr) + kunmap_atomic(kaddr, KM_USER0); + + fault_in_pages_readable(str, bytes_to_copy); + err = exec_copy_from_user(page, offset, str, bytes_to_copy); if (err) { ret = -EFAULT; goto out; @@ -267,8 +290,6 @@ int copy_strings(int argc,char __user * } ret = 0; out: - if (kmapped_page) - kunmap(kmapped_page); return ret; } @@ -292,52 +313,48 @@ EXPORT_SYMBOL(copy_strings_kernel); * This routine is used to map in a page into an address space: needed by * execve() for the initial stack and environment pages. * - * tsk->mmap_sem is held for writing. + * The caller should hold task->mm->mmap_sem for writing. */ -void put_dirty_page(struct task_struct *tsk, struct page *page, - unsigned long address, pgprot_t prot) +void put_dirty_page(task_t *task, struct vm_area_struct *vma, + struct page *page, unsigned long address, pgprot_t prot) { - pgd_t * pgd; - pmd_t * pmd; - pte_t * pte; - struct pte_chain *pte_chain; + struct mm_struct *mm = task->mm; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; if (page_count(page) != 1) printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", page, address); - pgd = pgd_offset(tsk->mm, address); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto out_sig; - spin_lock(&tsk->mm->page_table_lock); - pmd = pmd_alloc(tsk->mm, pgd, address); + pgd = pgd_offset(mm, address); + spin_lock(&mm->page_table_lock); + pmd = pmd_alloc_map(mm, pgd, address); if (!pmd) goto out; - pte = pte_alloc_map(tsk->mm, pmd, address); + pte = pte_alloc_map(mm, pgd, &pmd, address); if (!pte) goto out; if (!pte_none(*pte)) { pte_unmap(pte); + pmd_unmap(pmd); goto out; } + mm->rss++; lru_cache_add_active(page); flush_dcache_page(page); - set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot)))); - pte_chain = page_add_rmap(page, pte, pte_chain); + vm_set_pte(vma, pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot))), address); + page_add_rmap(page, vma, address, 1); pte_unmap(pte); - tsk->mm->rss++; - spin_unlock(&tsk->mm->page_table_lock); + pmd_unmap(pmd); + spin_unlock(&mm->page_table_lock); /* no need for flush_tlb */ - pte_chain_free(pte_chain); return; out: - spin_unlock(&tsk->mm->page_table_lock); -out_sig: + spin_unlock(&mm->page_table_lock); __free_page(page); - force_sig(SIGKILL, tsk); - pte_chain_free(pte_chain); + force_sig(SIGKILL, task); return; } @@ -439,7 +456,7 @@ int setup_arg_pages(struct linux_binprm struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current, page, stack_base, + put_dirty_page(current, mpnt, 
page, stack_base, mpnt->vm_page_prot); } stack_base += PAGE_SIZE; @@ -695,8 +712,9 @@ static inline int de_thread(struct task_ __ptrace_link(current, parent); } - list_del(¤t->tasks); - list_add_tail(¤t->tasks, &init_task.tasks); + /* is this necessary? only if the tgid changes... */ + remove_task_list(current); + insert_task_list(current); current->exit_signal = SIGCHLD; state = leader->state; @@ -718,7 +736,7 @@ no_thread_group: spin_lock(&newsighand->siglock); if (current == oldsig->curr_target) - oldsig->curr_target = next_thread(current); + oldsig->curr_target = another_thread(current); if (newsig) current->signal = newsig; current->sighand = newsighand; diff -prauN linux-2.6.0-test11/fs/ext2/dir.c wli-2.6.0-test11-30/fs/ext2/dir.c --- linux-2.6.0-test11/fs/ext2/dir.c 2003-11-26 12:44:18.000000000 -0800 +++ wli-2.6.0-test11-30/fs/ext2/dir.c 2003-12-04 06:13:40.000000000 -0800 @@ -64,10 +64,10 @@ ext2_last_byte(struct inode *inode, unsi static int ext2_commit_chunk(struct page *page, unsigned from, unsigned to) { - struct inode *dir = page->mapping->host; + struct inode *dir = page_mapping(page)->host; int err = 0; dir->i_version++; - page->mapping->a_ops->commit_write(NULL, page, from, to); + page_mapping(page)->a_ops->commit_write(NULL, page, from, to); if (IS_DIRSYNC(dir)) err = write_one_page(page, 1); else @@ -77,7 +77,7 @@ static int ext2_commit_chunk(struct page static void ext2_check_page(struct page *page) { - struct inode *dir = page->mapping->host; + struct inode *dir = page_mapping(page)->host; struct super_block *sb = dir->i_sb; unsigned chunk_size = ext2_chunk_size(dir); char *kaddr = page_address(page); @@ -412,7 +412,7 @@ void ext2_set_link(struct inode *dir, st int err; lock_page(page); - err = page->mapping->a_ops->prepare_write(NULL, page, from, to); + err = page_mapping(page)->a_ops->prepare_write(NULL, page, from, to); if (err) BUG(); de->inode = cpu_to_le32(inode->i_ino); @@ -495,7 +495,7 @@ int ext2_add_link (struct dentry *dentry got_it: from = (char*)de - (char*)page_address(page); to = from + rec_len; - err = page->mapping->a_ops->prepare_write(NULL, page, from, to); + err = page_mapping(page)->a_ops->prepare_write(NULL, page, from, to); if (err) goto out_unlock; if (de->inode) { @@ -528,7 +528,7 @@ out_unlock: */ int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page ) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); struct inode *inode = mapping->host; char *kaddr = page_address(page); unsigned from = ((char*)dir - kaddr) & ~(ext2_chunk_size(inode)-1); diff -prauN linux-2.6.0-test11/fs/ext3/inode.c wli-2.6.0-test11-30/fs/ext3/inode.c --- linux-2.6.0-test11/fs/ext3/inode.c 2003-11-26 12:44:43.000000000 -0800 +++ wli-2.6.0-test11-30/fs/ext3/inode.c 2003-12-04 06:13:40.000000000 -0800 @@ -1078,7 +1078,7 @@ static int do_journal_get_write_access(h static int ext3_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int ret, needed_blocks = ext3_writepage_trans_blocks(inode); handle_t *handle; @@ -1133,7 +1133,7 @@ static int ext3_ordered_commit_write(str unsigned from, unsigned to) { handle_t *handle = ext3_journal_current_handle(); - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int ret = 0, ret2; ret = walk_page_buffers(handle, page_buffers(page), @@ -1162,7 +1162,7 @@ static int ext3_writeback_commit_write(s unsigned 
from, unsigned to) { handle_t *handle = ext3_journal_current_handle(); - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int ret = 0, ret2; loff_t new_i_size; @@ -1180,7 +1180,7 @@ static int ext3_journalled_commit_write( struct page *page, unsigned from, unsigned to) { handle_t *handle = ext3_journal_current_handle(); - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int ret = 0, ret2; int partial = 0; loff_t pos; @@ -1335,7 +1335,7 @@ static int journal_dirty_data_fn(handle_ static int ext3_ordered_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct buffer_head *page_bufs; handle_t *handle = NULL; int ret = 0; @@ -1395,7 +1395,7 @@ static int ext3_ordered_writepage(struct return ret; out_fail: - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); unlock_page(page); return ret; } @@ -1403,7 +1403,7 @@ out_fail: static int ext3_writeback_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; handle_t *handle = NULL; int ret = 0; int err; @@ -1424,7 +1424,7 @@ static int ext3_writeback_writepage(stru return ret; out_fail: - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); unlock_page(page); return ret; } @@ -1432,7 +1432,7 @@ out_fail: static int ext3_journalled_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; handle_t *handle = NULL; int ret = 0; int err; @@ -1480,7 +1480,7 @@ out: return ret; no_write: - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); out_unlock: unlock_page(page); goto out; @@ -1500,7 +1500,7 @@ ext3_readpages(struct file *file, struct static int ext3_invalidatepage(struct page *page, unsigned long offset) { - journal_t *journal = EXT3_JOURNAL(page->mapping->host); + journal_t *journal = EXT3_JOURNAL(page_mapping(page)->host); /* * If it's a full truncate we just forget about the pending dirtying @@ -1513,7 +1513,7 @@ static int ext3_invalidatepage(struct pa static int ext3_releasepage(struct page *page, int wait) { - journal_t *journal = EXT3_JOURNAL(page->mapping->host); + journal_t *journal = EXT3_JOURNAL(page_mapping(page)->host); WARN_ON(PageChecked(page)); return journal_try_to_free_buffers(journal, page, wait); @@ -1600,7 +1600,7 @@ out: static int ext3_journalled_set_page_dirty(struct page *page) { SetPageChecked(page); - return __set_page_dirty_nobuffers(page); + return set_page_dirty_nobuffers(page); } static struct address_space_operations ext3_ordered_aops = { diff -prauN linux-2.6.0-test11/fs/fat/inode.c wli-2.6.0-test11-30/fs/fat/inode.c --- linux-2.6.0-test11/fs/fat/inode.c 2003-11-26 12:43:50.000000000 -0800 +++ wli-2.6.0-test11-30/fs/fat/inode.c 2003-12-04 06:13:40.000000000 -0800 @@ -1100,7 +1100,7 @@ fat_prepare_write(struct file *file, str { kmap(page); return cont_prepare_write(page,from,to,fat_get_block, - &MSDOS_I(page->mapping->host)->mmu_private); + &MSDOS_I(page_mapping(page)->host)->mmu_private); } static int diff -prauN linux-2.6.0-test11/fs/fcntl.c wli-2.6.0-test11-30/fs/fcntl.c --- linux-2.6.0-test11/fs/fcntl.c 2003-11-26 12:44:43.000000000 -0800 +++ wli-2.6.0-test11-30/fs/fcntl.c 2003-12-03 18:50:57.000000000 -0800 @@ -488,9 +488,8 @@ void send_sigio(struct fown_struct *fown 
send_sigio_to_task(p, fown, fd, band); } } else { - struct list_head *l; struct pid *pidptr; - for_each_task_pid(-pid, PIDTYPE_PGID, p, l, pidptr) { + for_each_task_pid(-pid, PIDTYPE_PGID, p, pidptr) { send_sigio_to_task(p, fown, fd, band); } } @@ -525,9 +524,8 @@ int send_sigurg(struct fown_struct *fown send_sigurg_to_task(p, fown); } } else { - struct list_head *l; struct pid *pidptr; - for_each_task_pid(-pid, PIDTYPE_PGID, p, l, pidptr) { + for_each_task_pid(-pid, PIDTYPE_PGID, p, pidptr) { send_sigurg_to_task(p, fown); } } diff -prauN linux-2.6.0-test11/fs/freevxfs/vxfs_immed.c wli-2.6.0-test11-30/fs/freevxfs/vxfs_immed.c --- linux-2.6.0-test11/fs/freevxfs/vxfs_immed.c 2003-11-26 12:44:43.000000000 -0800 +++ wli-2.6.0-test11-30/fs/freevxfs/vxfs_immed.c 2003-12-04 06:13:40.000000000 -0800 @@ -122,7 +122,7 @@ vxfs_immed_follow_link(struct dentry *dp static int vxfs_immed_readpage(struct file *fp, struct page *pp) { - struct vxfs_inode_info *vip = VXFS_INO(pp->mapping->host); + struct vxfs_inode_info *vip = VXFS_INO(page_mapping(pp)->host); u_int64_t offset = pp->index << PAGE_CACHE_SHIFT; caddr_t kaddr; diff -prauN linux-2.6.0-test11/fs/fs-writeback.c wli-2.6.0-test11-30/fs/fs-writeback.c --- linux-2.6.0-test11/fs/fs-writeback.c 2003-11-26 12:46:09.000000000 -0800 +++ wli-2.6.0-test11-30/fs/fs-writeback.c 2003-12-03 19:34:55.000000000 -0800 @@ -152,10 +152,10 @@ __sync_single_inode(struct inode *inode, * read speculatively by this cpu before &= ~I_DIRTY -- mikulas */ - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); if (wait || !wbc->for_kupdate || list_empty(&mapping->io_pages)) list_splice_init(&mapping->dirty_pages, &mapping->io_pages); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); spin_unlock(&inode_lock); do_writepages(mapping, wbc); diff -prauN linux-2.6.0-test11/fs/hfs/inode.c wli-2.6.0-test11-30/fs/hfs/inode.c --- linux-2.6.0-test11/fs/hfs/inode.c 2003-11-26 12:46:03.000000000 -0800 +++ wli-2.6.0-test11-30/fs/hfs/inode.c 2003-12-04 06:13:40.000000000 -0800 @@ -240,7 +240,7 @@ static int hfs_readpage(struct file *fil static int hfs_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { return cont_prepare_write(page,from,to,hfs_get_block, - &HFS_I(page->mapping->host)->mmu_private); + &HFS_I(page_mapping(page)->host)->mmu_private); } static sector_t hfs_bmap(struct address_space *mapping, sector_t block) { diff -prauN linux-2.6.0-test11/fs/hpfs/file.c wli-2.6.0-test11-30/fs/hpfs/file.c --- linux-2.6.0-test11/fs/hpfs/file.c 2003-11-26 12:45:48.000000000 -0800 +++ wli-2.6.0-test11-30/fs/hpfs/file.c 2003-12-04 06:13:40.000000000 -0800 @@ -109,7 +109,7 @@ static int hpfs_readpage(struct file *fi static int hpfs_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { return cont_prepare_write(page,from,to,hpfs_get_block, - &hpfs_i(page->mapping->host)->mmu_private); + &hpfs_i(page_mapping(page)->host)->mmu_private); } static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block) { diff -prauN linux-2.6.0-test11/fs/hpfs/namei.c wli-2.6.0-test11-30/fs/hpfs/namei.c --- linux-2.6.0-test11/fs/hpfs/namei.c 2003-11-26 12:43:40.000000000 -0800 +++ wli-2.6.0-test11-30/fs/hpfs/namei.c 2003-12-04 06:13:40.000000000 -0800 @@ -452,7 +452,7 @@ int hpfs_rmdir(struct inode *dir, struct int hpfs_symlink_readpage(struct file *file, struct page *page) { char *link = kmap(page); - struct inode *i = page->mapping->host; + struct inode *i = page_mapping(page)->host; struct fnode *fnode; 
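/*
 * (page_mapping(page) here and throughout this series is the new accessor
 * for what used to be a bare page->mapping load; callers are converted
 * wholesale, presumably so the field itself can later be overloaded
 * without touching each filesystem again.)
 */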
struct buffer_head *bh; int err; diff -prauN linux-2.6.0-test11/fs/hugetlbfs/inode.c wli-2.6.0-test11-30/fs/hugetlbfs/inode.c --- linux-2.6.0-test11/fs/hugetlbfs/inode.c 2003-11-26 12:45:31.000000000 -0800 +++ wli-2.6.0-test11-30/fs/hugetlbfs/inode.c 2003-12-04 08:20:27.000000000 -0800 @@ -194,6 +194,7 @@ static void hugetlbfs_delete_inode(struc hlist_del_init(&inode->i_hash); list_del_init(&inode->i_list); + list_del_init(&inode->i_sb_list); inode->i_state |= I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); @@ -236,6 +237,7 @@ static void hugetlbfs_forget_inode(struc hlist_del_init(&inode->i_hash); out_truncate: list_del_init(&inode->i_list); + list_del_init(&inode->i_sb_list); inode->i_state |= I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); @@ -269,12 +271,15 @@ hugetlb_vmtruncate_list(struct list_head { struct vm_area_struct *vma; - list_for_each_entry(vma, list, shared) { + list_for_each_entry_rcu(vma, list, shared) { unsigned long h_vm_pgoff; unsigned long v_length; unsigned long h_length; unsigned long v_offset; + if (vma->vm_flags & VM_DEAD) + continue; + h_vm_pgoff = vma->vm_pgoff << (HPAGE_SHIFT - PAGE_SHIFT); v_length = vma->vm_end - vma->vm_start; h_length = v_length >> HPAGE_SHIFT; @@ -319,12 +324,12 @@ static int hugetlb_vmtruncate(struct ino pgoff = offset >> HPAGE_SHIFT; inode->i_size = offset; - down(&mapping->i_shared_sem); + rcu_read_lock(); /* mapping->i_shared_lock */ if (!list_empty(&mapping->i_mmap)) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); if (!list_empty(&mapping->i_mmap_shared)) hugetlb_vmtruncate_list(&mapping->i_mmap_shared, pgoff); - up(&mapping->i_shared_sem); + rcu_read_unlock(); /* mapping->i_shared_lock */ truncate_hugepages(mapping, offset); return 0; } diff -prauN linux-2.6.0-test11/fs/inode.c wli-2.6.0-test11-30/fs/inode.c --- linux-2.6.0-test11/fs/inode.c 2003-11-26 12:45:53.000000000 -0800 +++ wli-2.6.0-test11-30/fs/inode.c 2003-12-04 08:20:27.000000000 -0800 @@ -184,8 +184,8 @@ void inode_init_once(struct inode *inode INIT_LIST_HEAD(&inode->i_devices); sema_init(&inode->i_sem, 1); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); - spin_lock_init(&inode->i_data.page_lock); - init_MUTEX(&inode->i_data.i_shared_sem); + mapping_rwlock_init(&inode->i_data.page_lock); + spin_lock_init(&inode->i_data.i_shared_lock); atomic_set(&inode->i_data.truncate_count, 0); INIT_LIST_HEAD(&inode->i_data.private_list); spin_lock_init(&inode->i_data.private_lock); @@ -285,7 +285,7 @@ static void dispose_list(struct list_hea /* * Invalidate all inodes for a device. 
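 * With this patch invalidate_list() walks the new per-superblock s_inodes
 * list (linked through inode->i_sb_list) instead of scanning the global
 * inode_in_use/inode_unused lists and filtering on ->i_sb.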
*/ -static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose) +static int invalidate_list(struct list_head *head, struct list_head *dispose) { struct list_head *next; int busy = 0, count = 0; @@ -298,13 +298,12 @@ static int invalidate_list(struct list_h next = next->next; if (tmp == head) break; - inode = list_entry(tmp, struct inode, i_list); - if (inode->i_sb != sb) - continue; + inode = list_entry(tmp, struct inode, i_sb_list); invalidate_inode_buffers(inode); if (!atomic_read(&inode->i_count)) { hlist_del_init(&inode->i_hash); list_del(&inode->i_list); + list_del(&inode->i_sb_list); list_add(&inode->i_list, dispose); inode->i_state |= I_FREEING; count++; @@ -340,10 +339,7 @@ int invalidate_inodes(struct super_block down(&iprune_sem); spin_lock(&inode_lock); - busy = invalidate_list(&inode_in_use, sb, &throw_away); - busy |= invalidate_list(&inode_unused, sb, &throw_away); - busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); - busy |= invalidate_list(&sb->s_io, sb, &throw_away); + busy = invalidate_list(&sb->s_inodes, &throw_away); spin_unlock(&inode_lock); dispose_list(&throw_away); @@ -443,6 +439,7 @@ static void prune_icache(int nr_to_scan) continue; } hlist_del_init(&inode->i_hash); + list_del_init(&inode->i_sb_list); list_move(&inode->i_list, &freeable); inode->i_state |= I_FREEING; nr_pruned++; @@ -553,6 +550,7 @@ struct inode *new_inode(struct super_blo spin_lock(&inode_lock); inodes_stat.nr_inodes++; list_add(&inode->i_list, &inode_in_use); + list_add(&inode->i_sb_list, &sb->s_inodes); inode->i_ino = ++last_ino; inode->i_state = 0; spin_unlock(&inode_lock); @@ -601,6 +599,7 @@ static struct inode * get_new_inode(stru inodes_stat.nr_inodes++; list_add(&inode->i_list, &inode_in_use); + list_add(&inode->i_sb_list, &sb->s_inodes); hlist_add_head(&inode->i_hash, head); inode->i_state = I_LOCK|I_NEW; spin_unlock(&inode_lock); @@ -649,6 +648,7 @@ static struct inode * get_new_inode_fast inode->i_ino = ino; inodes_stat.nr_inodes++; list_add(&inode->i_list, &inode_in_use); + list_add(&inode->i_sb_list, &sb->s_inodes); hlist_add_head(&inode->i_hash, head); inode->i_state = I_LOCK|I_NEW; spin_unlock(&inode_lock); @@ -984,6 +984,7 @@ void generic_delete_inode(struct inode * struct super_operations *op = inode->i_sb->s_op; list_del_init(&inode->i_list); + list_del_init(&inode->i_sb_list); inode->i_state|=I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); @@ -1031,6 +1032,7 @@ static void generic_forget_inode(struct hlist_del_init(&inode->i_hash); } list_del_init(&inode->i_list); + list_del_init(&inode->i_sb_list); inode->i_state|=I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); @@ -1221,34 +1223,17 @@ int remove_inode_dquot_ref(struct inode void remove_dquot_ref(struct super_block *sb, int type) { struct inode *inode; - struct list_head *act_head; LIST_HEAD(tofree_head); if (!sb->dq_op) return; /* nothing to do */ spin_lock(&inode_lock); /* This lock is for inodes code */ /* We don't have to lock against quota code - test IS_QUOTAINIT is just for speedup... 
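The fs/inode.c hunks in this region all serve one conversion: each inode is additionally threaded onto its superblock's new s_inodes list via i_sb_list, maintained under inode_lock in step with the existing i_list. Assembled from the hunks above and below, the bookkeeping is symmetric:

	/* allocation side: new_inode(), get_new_inode(), ... */
	spin_lock(&inode_lock);
	list_add(&inode->i_list, &inode_in_use);	/* global state list */
	list_add(&inode->i_sb_list, &sb->s_inodes);	/* per-superblock membership */
	spin_unlock(&inode_lock);

	/* freeing side: generic_delete_inode(), prune_icache(), ... */
	list_del_init(&inode->i_list);
	list_del_init(&inode->i_sb_list);

With that invariant, invalidate_inodes() and remove_dquot_ref() can walk &sb->s_inodes once instead of filtering inode_in_use, inode_unused, s_dirty and s_io on inode->i_sb.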
*/ - - list_for_each(act_head, &inode_in_use) { - inode = list_entry(act_head, struct inode, i_list); - if (inode->i_sb == sb && IS_QUOTAINIT(inode)) - remove_inode_dquot_ref(inode, type, &tofree_head); - } - list_for_each(act_head, &inode_unused) { - inode = list_entry(act_head, struct inode, i_list); - if (inode->i_sb == sb && IS_QUOTAINIT(inode)) - remove_inode_dquot_ref(inode, type, &tofree_head); - } - list_for_each(act_head, &sb->s_dirty) { - inode = list_entry(act_head, struct inode, i_list); - if (IS_QUOTAINIT(inode)) - remove_inode_dquot_ref(inode, type, &tofree_head); - } - list_for_each(act_head, &sb->s_io) { - inode = list_entry(act_head, struct inode, i_list); + + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) if (IS_QUOTAINIT(inode)) remove_inode_dquot_ref(inode, type, &tofree_head); - } + spin_unlock(&inode_lock); put_dquot_list(&tofree_head); @@ -1260,7 +1245,7 @@ void remove_dquot_ref(struct super_block * Hashed waitqueues for wait_on_inode(). The table is pretty small - the * kernel doesn't lock many inodes at the same time. */ -#define I_WAIT_TABLE_ORDER 3 +#define I_WAIT_TABLE_ORDER 12 static struct i_wait_queue_head { wait_queue_head_t wqh; } ____cacheline_aligned_in_smp i_wait_queue_heads[1<<I_WAIT_TABLE_ORDER]; [...] diff -prauN linux-2.6.0-test11/fs/intermezzo/dir.c wli-2.6.0-test11-30/fs/intermezzo/dir.c [...] dd = presto_d2d(file->f_dentry); if (dd && dd->dd_fset) { - int (*cache_ioctl)(struct inode *, struct file *, unsigned int, unsigned long ) = filter_c2cdfops(dd->dd_fset->fset_cache->cache_filter)->ioctl; + int (*cache_ioctl)(struct inode *, struct file *, unsigned int, unsigned long); + cache_ioctl = filter_c2cdfops(dd->dd_fset->fset_cache->cache_filter)->ioctl; rc = -ENOTTY; if (cache_ioctl) rc = cache_ioctl(inode, file, cmd, arg); @@ -904,47 +906,49 @@ int presto_ioctl(struct inode *inode, st return -EPERM; } - memset(buf, 0, sizeof(buf)); - - if (izo_ioctl_getdata(buf, buf + 1024, (void *)arg)) { + /* allocate a zero'd buffer for data */ + PRESTO_ALLOC(buf, bufsz); + if (!buf) { + EXIT; + return -ENOMEM; + } + + if (izo_ioctl_getdata(buf, buf + bufsz, (void *)arg)) { CERROR("intermezzo ioctl: data error\n"); - return -EINVAL; + rc = -EINVAL; + goto done; } data = (struct izo_ioctl_data *)buf; switch(cmd) { case IZO_IOC_REINTKML: { - int rc; int cperr; rc = kml_reint_rec(file, data); - EXIT; cperr = copy_to_user((char *)arg, data, sizeof(*data)); if (cperr) { CERROR("WARNING: cperr %d\n", cperr); rc = -EFAULT; } - return rc; + goto done; } case IZO_IOC_GET_RCVD: { struct izo_rcvd_rec rec; struct presto_file_set *fset; - int rc; fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; - } + rc = -ENODEV; + goto done; + } + rc = izo_rcvd_get(&rec, fset, data->ioc_uuid); - if (rc < 0) { - EXIT; - return rc; - } + if (rc < 0) + goto done; - EXIT; - return copy_to_user((char *)arg, &rec, sizeof(rec))? -EFAULT : 0; + rc = copy_to_user((char *)arg, &rec, sizeof(rec))? -EFAULT : 0; + goto done; } case IZO_IOC_REPSTATUS: { @@ -953,12 +957,11 @@ int presto_ioctl(struct inode *inode, st struct izo_rcvd_rec rec; struct presto_file_set *fset; int minor; - int rc; fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } minor = presto_f2m(fset); @@ -967,13 +970,11 @@ int presto_ioctl(struct inode *inode, st rc = izo_repstatus(fset, client_kmlsize, lr_client, &rec); - if (rc < 0) { - EXIT; - return rc; - } + if (rc < 0) + goto done; - EXIT; - return copy_to_user((char *)arg, &rec, sizeof(rec))? -EFAULT : 0; + rc = copy_to_user((char *)arg, &rec, sizeof(rec))?
-EFAULT : 0; + goto done; } case IZO_IOC_GET_CHANNEL: { @@ -981,30 +982,28 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } data->ioc_dev = fset->fset_cache->cache_psdev->uc_minor; CDEBUG(D_PSDEV, "CHANNEL %d\n", data->ioc_dev); - EXIT; - return copy_to_user((char *)arg, data, sizeof(*data))? -EFAULT : 0; + rc = copy_to_user((char *)arg, data, sizeof(*data))? -EFAULT : 0; + goto done; } case IZO_IOC_SET_IOCTL_UID: izo_authorized_uid = data->ioc_uid; - EXIT; - return 0; + rc = 0; + goto done; case IZO_IOC_SET_PID: rc = izo_psdev_setpid(data->ioc_dev); - EXIT; - return rc; + goto done; case IZO_IOC_SET_CHANNEL: rc = izo_psdev_setchannel(file, data->ioc_dev); - EXIT; - return rc; + goto done; case IZO_IOC_GET_KML_SIZE: { struct presto_file_set *fset; @@ -1012,14 +1011,14 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } kmlsize = presto_kml_offset(fset) + fset->fset_kml_logical_off; - EXIT; - return copy_to_user((char *)arg, &kmlsize, sizeof(kmlsize))?-EFAULT : 0; + rc = copy_to_user((char *)arg, &kmlsize, sizeof(kmlsize))?-EFAULT : 0; + goto done; } case IZO_IOC_PURGE_FILE_DATA: { @@ -1027,37 +1026,37 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } rc = izo_purge_file(fset, data->ioc_inlbuf1); - EXIT; - return rc; + goto done; } case IZO_IOC_GET_FILEID: { rc = izo_get_fileid(file, data); - EXIT; if (rc) - return rc; - return copy_to_user((char *)arg, data, sizeof(*data))? -EFAULT : 0; + goto done; + + rc = copy_to_user((char *)arg, data, sizeof(*data))? -EFAULT : 0; + goto done; } case IZO_IOC_SET_FILEID: { rc = izo_set_fileid(file, data); - EXIT; if (rc) - return rc; - return copy_to_user((char *)arg, data, sizeof(*data))? -EFAULT : 0; + goto done; + + rc = copy_to_user((char *)arg, data, sizeof(*data))? 
-EFAULT : 0; + goto done; } case IZO_IOC_ADJUST_LML: { struct lento_vfs_context *info; info = (struct lento_vfs_context *)data->ioc_inlbuf1; rc = presto_adjust_lml(file, info); - EXIT; - return rc; + goto done; } case IZO_IOC_CONNECT: { @@ -1066,16 +1065,15 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } minor = presto_f2m(fset); rc = izo_upc_connect(minor, data->ioc_ino, data->ioc_generation, data->ioc_uuid, data->ioc_flags); - EXIT; - return rc; + goto done; } case IZO_IOC_GO_FETCH_KML: { @@ -1084,15 +1082,14 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } minor = presto_f2m(fset); rc = izo_upc_go_fetch_kml(minor, fset->fset_name, data->ioc_uuid, data->ioc_kmlsize); - EXIT; - return rc; + goto done; } case IZO_IOC_REVOKE_PERMIT: @@ -1100,26 +1097,23 @@ int presto_ioctl(struct inode *inode, st rc = izo_revoke_permit(file->f_dentry, data->ioc_uuid); else rc = izo_revoke_permit(file->f_dentry, NULL); - EXIT; - return rc; + goto done; case IZO_IOC_CLEAR_FSET: rc = izo_clear_fsetroot(file->f_dentry); - EXIT; - return rc; + goto done; case IZO_IOC_CLEAR_ALL_FSETS: { struct presto_file_set *fset; fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } rc = izo_clear_all_fsetroots(fset->fset_cache); - EXIT; - return rc; + goto done; } case IZO_IOC_SET_FSET: @@ -1129,9 +1123,7 @@ int presto_ioctl(struct inode *inode, st rc = presto_set_fsetroot_from_ioc(file->f_dentry, data->ioc_inlbuf1, data->ioc_flags); - EXIT; - return rc; - + goto done; case IZO_IOC_MARK: { int res = 0; /* resulting flags - returned to user */ @@ -1187,16 +1179,16 @@ int presto_ioctl(struct inode *inode, st } if (error) { - EXIT; - return error; + rc = error; + goto done; } data->ioc_mark_what = res; CDEBUG(D_DOWNCALL, "mark inode: %ld, and: %x, or: %x, what %x\n", file->f_dentry->d_inode->i_ino, data->ioc_and_flag, data->ioc_or_flag, data->ioc_mark_what); - EXIT; - return copy_to_user((char *)arg, data, sizeof(*data))? -EFAULT : 0; + rc = copy_to_user((char *)arg, data, sizeof(*data))? -EFAULT : 0; + goto done; } #if 0 case IZO_IOC_CLIENT_MAKE_BRANCH: { @@ -1205,16 +1197,15 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } minor = presto_f2m(fset); rc = izo_upc_client_make_branch(minor, fset->fset_name, data->ioc_inlbuf1, data->ioc_inlbuf2); - EXIT; - return rc; + goto done; } #endif case IZO_IOC_SERVER_MAKE_BRANCH: { @@ -1223,14 +1214,14 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } minor = presto_f2m(fset); izo_upc_server_make_branch(minor, data->ioc_inlbuf1); - EXIT; - return 0; + rc = 0; + goto done; } case IZO_IOC_SET_KMLSIZE: { struct presto_file_set *fset; @@ -1239,38 +1230,33 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } minor = presto_f2m(fset); rc = izo_upc_set_kmlsize(minor, fset->fset_name, data->ioc_uuid, data->ioc_kmlsize); - if (rc != 0) { - EXIT; - return rc; - } + if (rc != 0) + goto done; rc = izo_rcvd_get(&rec, fset, data->ioc_uuid); if (rc == -EINVAL) { /* We don't know anything about this uuid yet; no * worries. 
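One mechanical transformation runs through the whole presto_ioctl() diff: because the ioctl buffer is now heap-allocated, every early return after the allocation becomes "rc = ...; goto done" so a single exit path can release it. The target shape, pieced together from the hunks themselves:

	PRESTO_ALLOC(buf, bufsz);
	if (!buf) {
		EXIT;
		return -ENOMEM;		/* returning directly is safe only before the allocation */
	}
	...
	rc = izo_rcvd_get(&rec, fset, data->ioc_uuid);
	if (rc < 0)
		goto done;		/* no leak on any error path */
	...
	done:
		PRESTO_FREE(buf, bufsz);
		EXIT;
		return rc;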
*/ memset(&rec, 0, sizeof(rec)); - } else if (rc <= 0) { + } else if (rc <= 0) { /* do we really want to return 0 if rc == 0 here? */ CERROR("InterMezzo: error reading last_rcvd: %d\n", rc); - EXIT; - return rc; + goto done; } rec.lr_remote_offset = data->ioc_kmlsize; rc = izo_rcvd_write(fset, &rec); if (rc <= 0) { CERROR("InterMezzo: error writing last_rcvd: %d\n", rc); - EXIT; - return rc; + goto done; } - EXIT; - return rc; + goto done; } case IZO_IOC_BRANCH_UNDO: { struct presto_file_set *fset; @@ -1278,15 +1264,14 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } minor = presto_f2m(fset); rc = izo_upc_branch_undo(minor, fset->fset_name, data->ioc_inlbuf1); - EXIT; - return rc; + goto done; } case IZO_IOC_BRANCH_REDO: { struct presto_file_set *fset; @@ -1294,28 +1279,33 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } minor = presto_f2m(fset); rc = izo_upc_branch_redo(minor, fset->fset_name, data->ioc_inlbuf1); - EXIT; - return rc; + goto done; } case TCGETS: - EXIT; - return -EINVAL; + rc = -EINVAL; + goto done; default: EXIT; - return -EINVAL; - + rc = -EINVAL; + goto done; + } + + rc = 0; + + done: + PRESTO_FREE(buf, bufsz); EXIT; - return 0; + return rc; } struct file_operations presto_dir_fops = { diff -prauN linux-2.6.0-test11/fs/intermezzo/journal.c wli-2.6.0-test11-30/fs/intermezzo/journal.c --- linux-2.6.0-test11/fs/intermezzo/journal.c 2003-11-26 12:42:50.000000000 -0800 +++ wli-2.6.0-test11-30/fs/intermezzo/journal.c 2003-12-04 08:13:11.000000000 -0800 @@ -1235,12 +1235,16 @@ int presto_write_kml_logical_offset(stru return izo_rcvd_write(fset, &rec); } +/* we are called from presto_finish_kml_truncate, which is called */ +/* with fset->fset_kml.fd_lock held. 
Allocations must be GFP_ATOMIC */ struct file * presto_copy_kml_tail(struct presto_file_set *fset, unsigned long int start) { struct file *f; int len; loff_t read_off, write_off, bytes; + char* buf; + size_t bufsz; ENTRY; @@ -1254,21 +1258,31 @@ struct file * presto_copy_kml_tail(struc write_off = 0; read_off = start; bytes = fset->fset_kml.fd_offset - start; - while (bytes > 0) { - char buf[4096]; - int toread; - if (bytes > sizeof(buf)) - toread = sizeof(buf); - else - toread = bytes; + bufsz = bytes; + /* can't use PRESTO_ALLOC - allocation must be atomic */ + buf = kmalloc(bufsz, GFP_ATOMIC); + if (!buf) { + CERROR("IZO: out of memory at %s:%d (trying to " + "allocate %d)\n", __FILE__, __LINE__, + bufsz); + filp_close(f, NULL); + EXIT; + return ERR_PTR(-ENOMEM); + } + + presto_kmem_inc(buf, bufsz); + memset(buf, 0, bufsz); - len = presto_fread(fset->fset_kml.fd_file, buf, toread, + while (bytes > 0) { + len = presto_fread(fset->fset_kml.fd_file, buf, bufsz, &read_off); if (len <= 0) break; if (presto_fwrite(f, buf, len, &write_off) != len) { + kfree(buf); + presto_kmem_dec(buf, bufsz); filp_close(f, NULL); EXIT; return ERR_PTR(-EIO); @@ -1276,7 +1290,9 @@ struct file * presto_copy_kml_tail(struc bytes -= len; } - + + kfree(buf); + presto_kmem_dec(buf, bufsz); EXIT; return f; } @@ -1585,11 +1601,12 @@ int presto_get_fileid(int minor, struct { int opcode = KML_OPCODE_GET_FILEID; struct rec_info rec; - char *buffer, *path, *logrecord, record[4096]; /*include path*/ + char *buffer, *path, *logrecord, *record; /*include path*/ struct dentry *root; __u32 uid, gid, pathlen; int error, size; struct kml_suffix *suffix; + size_t record_size; ENTRY; @@ -1605,9 +1622,13 @@ int presto_get_fileid(int minor, struct size_round(le32_to_cpu(pathlen)) + sizeof(struct kml_suffix); + record_size = max(4096, size); + error = -ENOMEM; + PRESTO_ALLOC(record, record_size); + if (!record) + goto free_buffer; + CDEBUG(D_FILE, "kml size: %d\n", size); - if ( size > sizeof(record) ) - CERROR("InterMezzo: BUFFER OVERFLOW in %s!\n", __FUNCTION__); memset(&rec, 0, sizeof(rec)); rec.is_kml = 1; @@ -1628,6 +1649,9 @@ int presto_get_fileid(int minor, struct size_round(le32_to_cpu(pathlen)), path, fset->fset_name); + PRESTO_FREE(record, record_size); + + free_buffer: BUFF_FREE(buffer); EXIT; return error; diff -prauN linux-2.6.0-test11/fs/isofs/rock.c wli-2.6.0-test11-30/fs/isofs/rock.c --- linux-2.6.0-test11/fs/isofs/rock.c 2003-11-26 12:44:16.000000000 -0800 +++ wli-2.6.0-test11-30/fs/isofs/rock.c 2003-12-04 06:13:40.000000000 -0800 @@ -430,7 +430,7 @@ int parse_rock_ridge_inode(struct iso_di static int rock_ridge_symlink_readpage(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; char *link = kmap(page); unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); unsigned char bufbits = ISOFS_BUFFER_BITS(inode); diff -prauN linux-2.6.0-test11/fs/jbd/commit.c wli-2.6.0-test11-30/fs/jbd/commit.c --- linux-2.6.0-test11/fs/jbd/commit.c 2003-11-26 12:45:29.000000000 -0800 +++ wli-2.6.0-test11-30/fs/jbd/commit.c 2003-12-04 06:13:40.000000000 -0800 @@ -60,7 +60,7 @@ static void release_buffer_page(struct b page = bh->b_page; if (!page) goto nope; - if (page->mapping) + if (page_mapping(page)) goto nope; /* OK, it's a truncated page */ diff -prauN linux-2.6.0-test11/fs/jbd/journal.c wli-2.6.0-test11-30/fs/jbd/journal.c --- linux-2.6.0-test11/fs/jbd/journal.c 2003-11-26 12:43:09.000000000 -0800 +++ wli-2.6.0-test11-30/fs/jbd/journal.c 2003-12-04
06:13:40.000000000 -0800 @@ -1676,7 +1676,7 @@ repeat: } else { J_ASSERT_BH(bh, (atomic_read(&bh->b_count) > 0) || - (bh->b_page && bh->b_page->mapping)); + (bh->b_page && page_mapping(bh->b_page))); if (!new_jh) { jbd_unlock_bh_journal_head(bh); diff -prauN linux-2.6.0-test11/fs/jffs/inode-v23.c wli-2.6.0-test11-30/fs/jffs/inode-v23.c --- linux-2.6.0-test11/fs/jffs/inode-v23.c 2003-11-26 12:44:28.000000000 -0800 +++ wli-2.6.0-test11-30/fs/jffs/inode-v23.c 2003-12-04 06:13:40.000000000 -0800 @@ -743,7 +743,7 @@ jffs_do_readpage_nolock(struct file *fil void *buf; unsigned long read_len; int result; - struct inode *inode = (struct inode*)page->mapping->host; + struct inode *inode = (struct inode*)page_mapping(page)->host; struct jffs_file *f = (struct jffs_file *)inode->u.generic_ip; struct jffs_control *c = (struct jffs_control *)inode->i_sb->s_fs_info; int r; diff -prauN linux-2.6.0-test11/fs/jffs2/file.c wli-2.6.0-test11-30/fs/jffs2/file.c --- linux-2.6.0-test11/fs/jffs2/file.c 2003-11-26 12:46:03.000000000 -0800 +++ wli-2.6.0-test11-30/fs/jffs2/file.c 2003-12-04 06:13:40.000000000 -0800 @@ -107,18 +107,18 @@ int jffs2_do_readpage_unlock(struct inod int jffs2_readpage (struct file *filp, struct page *pg) { - struct jffs2_inode_info *f = JFFS2_INODE_INFO(pg->mapping->host); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(page_mapping(pg)->host); int ret; down(&f->sem); - ret = jffs2_do_readpage_unlock(pg->mapping->host, pg); + ret = jffs2_do_readpage_unlock(page_mapping(pg)->host, pg); up(&f->sem); return ret; } int jffs2_prepare_write (struct file *filp, struct page *pg, unsigned start, unsigned end) { - struct inode *inode = pg->mapping->host; + struct inode *inode = page_mapping(pg)->host; struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); uint32_t pageofs = pg->index << PAGE_CACHE_SHIFT; int ret = 0; @@ -203,7 +203,7 @@ int jffs2_commit_write (struct file *fil /* Actually commit the write from the page cache page we're looking at. * For now, we write the full page out each time. 
It sucks, but it's simple */ - struct inode *inode = pg->mapping->host; + struct inode *inode = page_mapping(pg)->host; struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); struct jffs2_raw_inode *ri; diff -prauN linux-2.6.0-test11/fs/libfs.c wli-2.6.0-test11-30/fs/libfs.c --- linux-2.6.0-test11/fs/libfs.c 2003-11-26 12:42:48.000000000 -0800 +++ wli-2.6.0-test11-30/fs/libfs.c 2003-12-04 06:13:40.000000000 -0800 @@ -328,7 +328,7 @@ int simple_prepare_write(struct file *fi int simple_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; /* diff -prauN linux-2.6.0-test11/fs/minix/dir.c wli-2.6.0-test11-30/fs/minix/dir.c --- linux-2.6.0-test11/fs/minix/dir.c 2003-11-26 12:45:06.000000000 -0800 +++ wli-2.6.0-test11-30/fs/minix/dir.c 2003-12-04 06:13:40.000000000 -0800 @@ -47,9 +47,9 @@ static inline unsigned long dir_pages(st static int dir_commit_chunk(struct page *page, unsigned from, unsigned to) { - struct inode *dir = (struct inode *)page->mapping->host; + struct inode *dir = (struct inode *)page_mapping(page)->host; int err = 0; - page->mapping->a_ops->commit_write(NULL, page, from, to); + page_mapping(page)->a_ops->commit_write(NULL, page, from, to); if (IS_DIRSYNC(dir)) err = write_one_page(page, 1); else @@ -240,7 +240,7 @@ int minix_add_link(struct dentry *dentry got_it: from = (char*)de - (char*)page_address(page); to = from + sbi->s_dirsize; - err = page->mapping->a_ops->prepare_write(NULL, page, from, to); + err = page_mapping(page)->a_ops->prepare_write(NULL, page, from, to); if (err) goto out_unlock; memcpy (de->name, name, namelen); @@ -260,7 +260,7 @@ out_unlock: int minix_delete_entry(struct minix_dir_entry *de, struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); struct inode *inode = (struct inode*)mapping->host; char *kaddr = page_address(page); unsigned from = (char*)de - kaddr; @@ -364,14 +364,14 @@ not_empty: void minix_set_link(struct minix_dir_entry *de, struct page *page, struct inode *inode) { - struct inode *dir = (struct inode*)page->mapping->host; + struct inode *dir = (struct inode*)page_mapping(page)->host; struct minix_sb_info *sbi = minix_sb(dir->i_sb); unsigned from = (char *)de-(char*)page_address(page); unsigned to = from + sbi->s_dirsize; int err; lock_page(page); - err = page->mapping->a_ops->prepare_write(NULL, page, from, to); + err = page_mapping(page)->a_ops->prepare_write(NULL, page, from, to); if (err == 0) { de->inode = inode->i_ino; err = dir_commit_chunk(page, from, to); diff -prauN linux-2.6.0-test11/fs/mpage.c wli-2.6.0-test11-30/fs/mpage.c --- linux-2.6.0-test11/fs/mpage.c 2003-11-26 12:43:25.000000000 -0800 +++ wli-2.6.0-test11-30/fs/mpage.c 2003-12-04 06:13:40.000000000 -0800 @@ -129,7 +129,7 @@ mpage_alloc(struct block_device *bdev, static void map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct buffer_head *page_bh, *head; int block = 0; @@ -209,7 +209,7 @@ static struct bio * do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, sector_t *last_block_in_bio, get_block_t get_block) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; const 
unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; @@ -388,8 +388,8 @@ static struct bio * mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block, sector_t *last_block_in_bio, int *ret, struct writeback_control *wbc) { - struct address_space *mapping = page->mapping; - struct inode *inode = page->mapping->host; + struct address_space *mapping = page_mapping(page); + struct inode *inode = page_mapping(page)->host; const unsigned blkbits = inode->i_blkbits; unsigned long end_index; const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; @@ -416,7 +416,7 @@ mpage_writepage(struct bio *bio, struct if (!buffer_mapped(bh)) { /* * unmapped dirty buffers are created by - * __set_page_dirty_buffers -> mmapped data + * set_page_dirty_buffers -> mmapped data */ if (buffer_dirty(bh)) goto confused; @@ -562,7 +562,7 @@ alloc_new: confused: if (bio) bio = mpage_bio_submit(WRITE, bio); - *ret = page->mapping->a_ops->writepage(page, wbc); + *ret = page_mapping(page)->a_ops->writepage(page, wbc); /* * The caller has a ref on the inode, so *mapping is stable */ @@ -635,7 +635,7 @@ mpage_writepages(struct address_space *m if (get_block == NULL) writepage = mapping->a_ops->writepage; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); while (!list_empty(&mapping->io_pages) && !done) { struct page *page = list_entry(mapping->io_pages.prev, struct page, list); @@ -655,12 +655,12 @@ mpage_writepages(struct address_space *m list_add(&page->list, &mapping->locked_pages); page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); /* * At this point we hold neither mapping->page_lock nor * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even + * invalidated (changing page_mapping(page) to NULL), or even * swizzled back from swapper_space to tmpfs file mapping. */ @@ -669,7 +669,7 @@ mpage_writepages(struct address_space *m if (wbc->sync_mode != WB_SYNC_NONE) wait_on_page_writeback(page); - if (page->mapping == mapping && !PageWriteback(page) && + if (page_mapping(page) == mapping && !PageWriteback(page) && test_clear_page_dirty(page)) { if (writepage) { ret = (*writepage)(page, wbc); @@ -695,12 +695,12 @@ mpage_writepages(struct address_space *m unlock_page(page); } page_cache_release(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); } /* * Leave any remaining dirty pages on ->io_pages */ - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); if (bio) mpage_bio_submit(WRITE, bio); return ret; diff -prauN linux-2.6.0-test11/fs/ncpfs/mmap.c wli-2.6.0-test11-30/fs/ncpfs/mmap.c --- linux-2.6.0-test11/fs/ncpfs/mmap.c 2003-11-26 12:44:47.000000000 -0800 +++ wli-2.6.0-test11-30/fs/ncpfs/mmap.c 2003-12-04 08:43:29.000000000 -0800 @@ -26,7 +26,7 @@ * Fill in the supplied page for mmap */ static struct page* ncp_file_mmap_nopage(struct vm_area_struct *area, - unsigned long address, int write_access) + unsigned long address, int *type) { struct file *file = area->vm_file; struct dentry *dentry = file->f_dentry; @@ -85,6 +85,15 @@ static struct page* ncp_file_mmap_nopage memset(pg_addr + already_read, 0, PAGE_SIZE - already_read); flush_dcache_page(page); kunmap(page); + + /* + * If I understand ncp_read_kernel() properly, the above always + * fetches from the network, here the analogue of disk. 
+ * -- wli + */ + if (type) + *type = VM_FAULT_MAJOR; + inc_page_state(pgmajfault); return page; } diff -prauN linux-2.6.0-test11/fs/ncpfs/symlink.c wli-2.6.0-test11-30/fs/ncpfs/symlink.c --- linux-2.6.0-test11/fs/ncpfs/symlink.c 2003-11-26 12:45:45.000000000 -0800 +++ wli-2.6.0-test11-30/fs/ncpfs/symlink.c 2003-12-04 06:13:40.000000000 -0800 @@ -43,7 +43,7 @@ static int ncp_symlink_readpage(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int error, length, len; char *link, *rawlink; char *buf = kmap(page); diff -prauN linux-2.6.0-test11/fs/nfs/file.c wli-2.6.0-test11-30/fs/nfs/file.c --- linux-2.6.0-test11/fs/nfs/file.c 2003-11-26 12:44:42.000000000 -0800 +++ wli-2.6.0-test11-30/fs/nfs/file.c 2003-12-04 06:13:40.000000000 -0800 @@ -216,7 +216,7 @@ static int nfs_commit_write(struct file struct address_space_operations nfs_file_aops = { .readpage = nfs_readpage, .readpages = nfs_readpages, - .set_page_dirty = __set_page_dirty_nobuffers, + .set_page_dirty = set_page_dirty_nobuffers, .writepage = nfs_writepage, .writepages = nfs_writepages, .prepare_write = nfs_prepare_write, diff -prauN linux-2.6.0-test11/fs/nfs/read.c wli-2.6.0-test11-30/fs/nfs/read.c --- linux-2.6.0-test11/fs/nfs/read.c 2003-11-26 12:42:38.000000000 -0800 +++ wli-2.6.0-test11-30/fs/nfs/read.c 2003-12-04 06:13:40.000000000 -0800 @@ -308,7 +308,7 @@ nfs_readpage_result(struct rpc_task *tas int nfs_readpage(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int error; dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", @@ -349,14 +349,14 @@ static int readpage_sync_filler(void *data, struct page *page) { struct nfs_readdesc *desc = (struct nfs_readdesc *)data; - return nfs_readpage_sync(desc->filp, page->mapping->host, page); + return nfs_readpage_sync(desc->filp, page_mapping(page)->host, page); } static int readpage_async_filler(void *data, struct page *page) { struct nfs_readdesc *desc = (struct nfs_readdesc *)data; - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct nfs_page *new; nfs_wb_page(inode, page); diff -prauN linux-2.6.0-test11/fs/nfs/write.c wli-2.6.0-test11-30/fs/nfs/write.c --- linux-2.6.0-test11/fs/nfs/write.c 2003-11-26 12:44:58.000000000 -0800 +++ wli-2.6.0-test11-30/fs/nfs/write.c 2003-12-04 06:13:40.000000000 -0800 @@ -224,7 +224,7 @@ nfs_writepage_async(struct file *file, s int nfs_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; unsigned long end_index; unsigned offset = PAGE_CACHE_SIZE; loff_t i_size = i_size_read(inode); @@ -629,7 +629,7 @@ nfs_strategy(struct inode *inode) int nfs_flush_incompatible(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct nfs_page *req; int status = 0; /* @@ -659,7 +659,7 @@ int nfs_updatepage(struct file *file, struct page *page, unsigned int offset, unsigned int count) { struct dentry *dentry = file->f_dentry; - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct nfs_page *req; loff_t end; int status = 0; diff -prauN linux-2.6.0-test11/fs/ntfs/aops.c wli-2.6.0-test11-30/fs/ntfs/aops.c --- linux-2.6.0-test11/fs/ntfs/aops.c 2003-11-26 12:45:07.000000000 -0800 +++ wli-2.6.0-test11-30/fs/ntfs/aops.c 2003-12-04 
06:13:40.000000000 -0800 @@ -55,7 +55,7 @@ static void ntfs_end_buffer_async_read(s int page_uptodate = 1; page = bh->b_page; - ni = NTFS_I(page->mapping->host); + ni = NTFS_I(page_mapping(page)->host); if (likely(uptodate)) { s64 file_ofs; @@ -176,7 +176,7 @@ static int ntfs_read_block(struct page * int i, nr; unsigned char blocksize_bits; - ni = NTFS_I(page->mapping->host); + ni = NTFS_I(page_mapping(page)->host); vol = ni->vol; blocksize_bits = VFS_I(ni)->i_blkbits; @@ -359,7 +359,7 @@ int ntfs_readpage(struct file *file, str return 0; } - ni = NTFS_I(page->mapping->host); + ni = NTFS_I(page_mapping(page)->host); if (NInoNonResident(ni)) { /* @@ -473,7 +473,7 @@ static int ntfs_write_block(struct page BOOL need_end_writeback; unsigned char blocksize_bits; - vi = page->mapping->host; + vi = page_mapping(page)->host; ni = NTFS_I(vi); vol = ni->vol; @@ -500,9 +500,9 @@ static int ntfs_write_block(struct page * buffer's dirty state as-is. */ // FIXME: Once Andrew's -EAGAIN patch goes in, remove the - // __set_page_dirty_nobuffers(page) and return -EAGAIN instead + // set_page_dirty_nobuffers(page) and return -EAGAIN instead // of zero. - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); unlock_page(page); return 0; } @@ -519,12 +519,12 @@ static int ntfs_write_block(struct page iblock = ni->initialized_size >> blocksize_bits; /* - * Be very careful. We have no exclusion from __set_page_dirty_buffers + * Be very careful. We have no exclusion from set_page_dirty_buffers * here, and the (potentially unmapped) buffers may become dirty at * any time. If a buffer becomes dirty here after we've inspected it * then we just miss that fact, and the page stays dirty. * - * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; + * Buffers outside i_size may be dirtied by set_page_dirty_buffers; * handle that here by just cleaning them. */ @@ -579,7 +579,7 @@ static int ntfs_write_block(struct page // Update initialized size in the attribute and // in the inode. // Again, for each page do: - // __set_page_dirty_buffers(); + // set_page_dirty_buffers(); // page_cache_release() // We don't need to wait on the writes. // Update iblock. @@ -734,9 +734,9 @@ lock_retry_remap: * leave its buffer's dirty state as-is. */ // FIXME: Once Andrew's -EAGAIN patch goes in, remove - // the __set_page_dirty_nobuffers(page) and set err to + // the set_page_dirty_nobuffers(page) and set err to // -EAGAIN instead of zero. - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); err = 0; } else SetPageError(page); @@ -805,7 +805,7 @@ static int ntfs_writepage(struct page *p BUG_ON(!PageLocked(page)); - vi = page->mapping->host; + vi = page_mapping(page)->host; /* Is the page fully outside i_size? (truncate in progress) */ if (unlikely(page->index >= (vi->i_size + PAGE_CACHE_SIZE - 1) >> @@ -987,9 +987,9 @@ err_out: * buffer's dirty state as-is. */ // FIXME: Once Andrew's -EAGAIN patch goes in, remove the - // __set_page_dirty_nobuffers(page) and set err to -EAGAIN + // set_page_dirty_nobuffers(page) and set err to -EAGAIN // instead of zero. 
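The __set_page_dirty_nobuffers() to set_page_dirty_nobuffers() renames here and in the other address_space users track a helper whose definition is outside this section. Assuming it keeps the stock 2.6 semantics, adapted to this patch's rwlock'd mapping->page_lock, the shape would be roughly:

	/* sketch under the above assumptions, not the patch's actual definition */
	int set_page_dirty_nobuffers(struct page *page)
	{
		struct address_space *mapping;

		if (TestSetPageDirty(page))
			return 0;			/* already dirty */
		mapping = page_mapping(page);
		if (mapping) {
			mapping_wrlock(&mapping->page_lock);
			if (page_mapping(page))		/* recheck under the lock */
				list_move(&page->list, &mapping->dirty_pages);
			mapping_wrunlock(&mapping->page_lock);
			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
		}
		return 1;
	}

That is, the page is redirtied and requeued on mapping->dirty_pages while buffer dirty state is left alone, which is the behaviour the surrounding NTFS comments ask for.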
- __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); err = 0; } else { ntfs_error(vi->i_sb, "Resident attribute write failed with " @@ -1024,7 +1024,7 @@ static int ntfs_prepare_nonresident_writ BOOL is_retry; unsigned char blocksize_bits; - vi = page->mapping->host; + vi = page_mapping(page)->host; ni = NTFS_I(vi); vol = ni->vol; @@ -1125,7 +1125,7 @@ static int ntfs_prepare_nonresident_writ // Update initialized size in the attribute and // in the inode. // Again, for each page do: - // __set_page_dirty_buffers(); + // set_page_dirty_buffers(); // page_cache_release() // We don't need to wait on the writes. // Update iblock. @@ -1361,7 +1361,7 @@ err_out: * ntfs_prepare_write - prepare a page for receiving data * * This is called from generic_file_write() with i_sem held on the inode - * (@page->mapping->host). The @page is locked and kmap()ped so page_address() + * (@page_mapping(page)->host). The @page is locked and kmap()ped so page_address() * can simply be used. The source data has not yet been copied into the @page. * * Need to extend the attribute/fill in holes if necessary, create blocks and @@ -1382,7 +1382,7 @@ err_out: static int ntfs_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *vi = page->mapping->host; + struct inode *vi = page_mapping(page)->host; ntfs_inode *ni = NTFS_I(vi); ntfs_debug("Entering for inode %li, attribute type 0x%x, page index " @@ -1491,7 +1491,7 @@ static int ntfs_commit_nonresident_write unsigned int block_start, block_end, blocksize; BOOL partial; - vi = page->mapping->host; + vi = page_mapping(page)->host; ntfs_debug("Entering for inode %li, attribute type 0x%x, page index " "0x%lx, from = %u, to = %u.", vi->i_ino, @@ -1547,7 +1547,7 @@ static int ntfs_commit_nonresident_write * ntfs_commit_write - commit the received data * * This is called from generic_file_write() with i_sem held on the inode - * (@page->mapping->host). The @page is locked and kmap()ped so page_address() + * (@page_mapping(page)->host). The @page is locked and kmap()ped so page_address() * can simply be used. The source data has already been copied into the @page. * * Need to mark modified blocks dirty so they get written out later when @@ -1585,7 +1585,7 @@ static int ntfs_commit_write(struct file u32 attr_len, bytes; int err; - vi = page->mapping->host; + vi = page_mapping(page)->host; ni = NTFS_I(vi); ntfs_debug("Entering for inode %li, attribute type 0x%x, page index " @@ -1758,7 +1758,7 @@ err_out: * Put the page on mapping->dirty_pages, but leave its * buffer's dirty state as-is. */ - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); err = 0; } else ntfs_error(vi->i_sb, "Page is not uptodate. Written " diff -prauN linux-2.6.0-test11/fs/ntfs/compress.c wli-2.6.0-test11-30/fs/ntfs/compress.c --- linux-2.6.0-test11/fs/ntfs/compress.c 2003-11-26 12:44:11.000000000 -0800 +++ wli-2.6.0-test11-30/fs/ntfs/compress.c 2003-12-04 06:13:40.000000000 -0800 @@ -209,7 +209,7 @@ return_error: /* Second stage: finalize completed pages. 
*/ if (nr_completed_pages > 0) { struct page *page = dest_pages[completed_pages[0]]; - ntfs_inode *ni = NTFS_I(page->mapping->host); + ntfs_inode *ni = NTFS_I(page_mapping(page)->host); for (i = 0; i < nr_completed_pages; i++) { int di = completed_pages[i]; @@ -467,7 +467,7 @@ return_overflow: */ int ntfs_read_compressed_block(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); ntfs_inode *ni = NTFS_I(mapping->host); ntfs_volume *vol = ni->vol; struct super_block *sb = vol->sb; diff -prauN linux-2.6.0-test11/fs/proc/array.c wli-2.6.0-test11-30/fs/proc/array.c --- linux-2.6.0-test11/fs/proc/array.c 2003-11-26 12:44:26.000000000 -0800 +++ wli-2.6.0-test11-30/fs/proc/array.c 2003-12-03 19:11:55.000000000 -0800 @@ -290,7 +290,7 @@ int proc_pid_status(struct task_struct * return buffer - orig; } -extern unsigned long task_vsize(struct mm_struct *); +unsigned long task_vsize(struct mm_struct *); int proc_pid_stat(struct task_struct *task, char * buffer) { unsigned long vsize, eip, esp, wchan; @@ -315,11 +315,9 @@ int proc_pid_stat(struct task_struct *ta } task_unlock(task); if (mm) { - down_read(&mm->mmap_sem); vsize = task_vsize(mm); eip = KSTK_EIP(task); esp = KSTK_ESP(task); - up_read(&mm->mmap_sem); } wchan = get_wchan(task); @@ -397,20 +395,20 @@ int proc_pid_stat(struct task_struct *ta return res; } -extern int task_statm(struct mm_struct *, int *, int *, int *, int *); +int task_statm(struct mm_struct *, int *, int *, int *, int *, int *, int *); int proc_pid_statm(struct task_struct *task, char *buffer) { - int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0; + int size, resident, shared, text, lib, data, dirty; struct mm_struct *mm = get_task_mm(task); - if (mm) { - down_read(&mm->mmap_sem); - size = task_statm(mm, &shared, &text, &data, &resident); - up_read(&mm->mmap_sem); - + if (!mm) + size = resident = shared = text = lib = data = dirty = 0; + else { + size = task_statm(mm, &shared, &text, &lib, &data, + &resident, &dirty); mmput(mm); } return sprintf(buffer,"%d %d %d %d %d %d %d\n", - size, resident, shared, text, lib, data, 0); + size, resident, shared, text, lib, data, dirty); } diff -prauN linux-2.6.0-test11/fs/proc/base.c wli-2.6.0-test11-30/fs/proc/base.c --- linux-2.6.0-test11/fs/proc/base.c 2003-11-26 12:44:31.000000000 -0800 +++ wli-2.6.0-test11-30/fs/proc/base.c 2003-12-04 08:15:58.000000000 -0800 @@ -716,8 +716,6 @@ static int proc_pid_readlink(struct dent struct dentry *de; struct vfsmount *mnt = NULL; - lock_kernel(); - if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE)) goto out; error = proc_check_root(inode); @@ -732,7 +730,6 @@ static int proc_pid_readlink(struct dent dput(de); mntput(mnt); out: - unlock_kernel(); return error; } @@ -1624,91 +1621,44 @@ out: } #define PROC_NUMBUF 10 -#define PROC_MAXPIDS 20 - -/* - * Get a few tgid's to return for filldir - we need to hold the - * tasklist lock while doing this, and we must release it before - * we actually do the filldir itself, so we use a temp buffer.. 
- */ -static int get_tgid_list(int index, unsigned int *tgids) -{ - struct task_struct *p; - int nr_tgids = 0; - - index--; - read_lock(&tasklist_lock); - for_each_process(p) { - int tgid = p->pid; - if (!pid_alive(p)) - continue; - if (--index >= 0) - continue; - tgids[nr_tgids] = tgid; - nr_tgids++; - if (nr_tgids >= PROC_MAXPIDS) - break; - } - read_unlock(&tasklist_lock); - return nr_tgids; -} /* * Get a few tid's to return for filldir - we need to hold the * tasklist lock while doing this, and we must release it before * we actually do the filldir itself, so we use a temp buffer.. + * + * Rewrite this flaming bag of shit pronto. */ -static int get_tid_list(int index, unsigned int *tids, struct inode *dir) -{ - struct task_struct *leader_task = proc_task(dir); - struct task_struct *task = leader_task; - int nr_tids = 0; - - index -= 2; - read_lock(&tasklist_lock); - do { - int tid = task->pid; - if (!pid_alive(task)) - continue; - if (--index >= 0) - continue; - tids[nr_tids] = tid; - nr_tids++; - if (nr_tids >= PROC_MAXPIDS) - break; - } while ((task = next_thread(task)) != leader_task); - read_unlock(&tasklist_lock); - return nr_tids; -} - /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int tgid_array[PROC_MAXPIDS]; char buf[PROC_NUMBUF]; + int tgid_array[PROC_MAXPIDS]; unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; - unsigned int nr_tgids, i; + int k, tgid, nr_tgids; if (!nr) { - ino_t ino = fake_ino(0,PROC_TGID_INO); + ino_t ino = fake_ino(0, PROC_TGID_INO); if (filldir(dirent, "self", 4, filp->f_pos, ino, DT_LNK) < 0) return 0; filp->f_pos++; - nr++; + nr = 1; } - nr_tgids = get_tgid_list(nr, tgid_array); - - for (i = 0; i < nr_tgids; i++) { - int tgid = tgid_array[i]; - ino_t ino = fake_ino(tgid,PROC_TGID_INO); - unsigned long j = PROC_NUMBUF; - - do buf[--j] = '0' + (tgid % 10); while (tgid/=10); - - if (filldir(dirent, buf+j, PROC_NUMBUF-j, filp->f_pos, ino, DT_DIR) < 0) + tgid = nr - 1; + nr_tgids = find_tgids_after(tgid, tgid_array); + for (k = 0; k < nr_tgids; ++k) { + ino_t ino; + unsigned long i, j = PROC_NUMBUF; + + tgid = tgid_array[k]; + ino = fake_ino(tgid, PROC_TGID_INO); + i = tgid; + do buf[--j] = '0' + (i % 10); while (i /= 10); + if (filldir(dirent, buf + j, PROC_NUMBUF - j, + filp->f_pos, ino, DT_DIR) < 0) break; - filp->f_pos++; + filp->f_pos = tgid + 1 + FIRST_PROCESS_ENTRY; } return 0; } @@ -1716,51 +1666,50 @@ int proc_pid_readdir(struct file * filp, /* for the /proc/TGID/task/ directories */ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int tid_array[PROC_MAXPIDS]; + int tid_array[PROC_MAXPIDS]; char buf[PROC_NUMBUF]; unsigned int nr_tids, i; struct dentry *dentry = filp->f_dentry; struct inode *inode = dentry->d_inode; int retval = -ENOENT; ino_t ino; - unsigned long pos = filp->f_pos; /* avoiding "long long" filp->f_pos */ if (!pid_alive(proc_task(inode))) goto out; retval = 0; - switch (pos) { + switch (filp->f_pos) { case 0: ino = inode->i_ino; - if (filldir(dirent, ".", 1, pos, ino, DT_DIR) < 0) + if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0) goto out; - pos++; + filp->f_pos++; /* fall through */ case 1: ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, pos, ino, DT_DIR) < 0) + if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0) goto out; - pos++; + filp->f_pos++; /* fall through */ } - nr_tids = get_tid_list(pos, tid_array, inode); + nr_tids = 
find_tids_after(proc_task(inode)->tgid, filp->f_pos - 2, tid_array); for (i = 0; i < nr_tids; i++) { - unsigned long j = PROC_NUMBUF; + unsigned long k, j = PROC_NUMBUF; int tid = tid_array[i]; - ino = fake_ino(tid,PROC_TID_INO); + ino = fake_ino(tid, PROC_TID_INO); + k = tid; do - buf[--j] = '0' + (tid % 10); - while (tid /= 10); + buf[--j] = '0' + (k % 10); + while (k /= 10); - if (filldir(dirent, buf+j, PROC_NUMBUF-j, pos, ino, DT_DIR) < 0) + if (filldir(dirent, buf+j, PROC_NUMBUF-j, filp->f_pos, ino, DT_DIR) < 0) break; - pos++; + filp->f_pos = tid + 2; } out: - filp->f_pos = pos; return retval; } diff -prauN linux-2.6.0-test11/fs/proc/kcore.c wli-2.6.0-test11-30/fs/proc/kcore.c --- linux-2.6.0-test11/fs/proc/kcore.c 2003-11-26 12:44:18.000000000 -0800 +++ wli-2.6.0-test11-30/fs/proc/kcore.c 2003-12-04 08:30:37.000000000 -0800 @@ -387,23 +387,17 @@ read_kcore(struct file *file, char __use } kfree(elf_buf); } else { - if (kern_addr_valid(start)) { - unsigned long n; + unsigned long n; - n = copy_to_user(buffer, (char *)start, tsz); - /* - * We cannot distingush between fault on source - * and fault on destination. When this happens - * we clear too and hope it will trigger the - * EFAULT again. - */ - if (n) { - if (clear_user(buffer + tsz - n, - tsz - n)) - return -EFAULT; - } - } else { - if (clear_user(buffer, tsz)) + n = copy_to_user(buffer, (char *)start, tsz); + /* + * We cannot distingush between fault on source + * and fault on destination. When this happens + * we clear too and hope it will trigger the + * EFAULT again. + */ + if (n) { + if (clear_user(buffer + tsz - n, tsz - n)) return -EFAULT; } } diff -prauN linux-2.6.0-test11/fs/proc/proc_misc.c wli-2.6.0-test11-30/fs/proc/proc_misc.c --- linux-2.6.0-test11/fs/proc/proc_misc.c 2003-11-26 12:43:07.000000000 -0800 +++ wli-2.6.0-test11-30/fs/proc/proc_misc.c 2003-12-04 07:13:42.000000000 -0800 @@ -200,6 +200,7 @@ static int meminfo_read_proc(char *page, "SwapFree: %8lu kB\n" "Dirty: %8lu kB\n" "Writeback: %8lu kB\n" + "Deferred: %8lu kB\n" "Mapped: %8lu kB\n" "Slab: %8lu kB\n" "Committed_AS: %8u kB\n" @@ -210,8 +211,8 @@ static int meminfo_read_proc(char *page, K(i.totalram), K(i.freeram), K(i.bufferram), - K(get_page_cache_size()-total_swapcache_pages-i.bufferram), - K(total_swapcache_pages), + K(get_page_cache_size() - i.bufferram), + K(ps.nr_swapcache), K(active), K(inactive), K(i.totalhigh), @@ -222,6 +223,7 @@ static int meminfo_read_proc(char *page, K(i.freeswap), K(ps.nr_dirty), K(ps.nr_writeback), + K(nr_deferred_pages()), K(ps.nr_mapped), K(ps.nr_slab), K(committed), diff -prauN linux-2.6.0-test11/fs/proc/root.c wli-2.6.0-test11-30/fs/proc/root.c --- linux-2.6.0-test11/fs/proc/root.c 2003-11-26 12:45:08.000000000 -0800 +++ wli-2.6.0-test11-30/fs/proc/root.c 2003-12-04 08:15:58.000000000 -0800 @@ -103,17 +103,12 @@ static int proc_root_readdir(struct file unsigned int nr = filp->f_pos; int ret; - lock_kernel(); - if (nr < FIRST_PROCESS_ENTRY) { int error = proc_readdir(filp, dirent, filldir); - if (error <= 0) { - unlock_kernel(); + if (error <= 0) return error; - } filp->f_pos = FIRST_PROCESS_ENTRY; } - unlock_kernel(); ret = proc_pid_readdir(filp, dirent, filldir); return ret; diff -prauN linux-2.6.0-test11/fs/proc/task_mmu.c wli-2.6.0-test11-30/fs/proc/task_mmu.c --- linux-2.6.0-test11/fs/proc/task_mmu.c 2003-11-26 12:43:07.000000000 -0800 +++ wli-2.6.0-test11-30/fs/proc/task_mmu.c 2003-12-03 19:11:55.000000000 -0800 @@ -5,27 +5,6 @@ char *task_mem(struct mm_struct *mm, char *buffer) { - unsigned long data = 0, 
stack = 0, exec = 0, lib = 0; - struct vm_area_struct *vma; - - down_read(&mm->mmap_sem); - for (vma = mm->mmap; vma; vma = vma->vm_next) { - unsigned long len = (vma->vm_end - vma->vm_start) >> 10; - if (!vma->vm_file) { - data += len; - if (vma->vm_flags & VM_GROWSDOWN) - stack += len; - continue; - } - if (vma->vm_flags & VM_WRITE) - continue; - if (vma->vm_flags & VM_EXEC) { - exec += len; - if (vma->vm_flags & VM_EXECUTABLE) - continue; - lib += len; - } - } buffer += sprintf(buffer, "VmSize:\t%8lu kB\n" "VmLck:\t%8lu kB\n" @@ -37,9 +16,10 @@ char *task_mem(struct mm_struct *mm, cha mm->total_vm << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), mm->rss << (PAGE_SHIFT-10), - data - stack, stack, - exec - lib, lib); - up_read(&mm->mmap_sem); + (mm->data - mm->stack) << (PAGE_SHIFT-10), + mm->stack << (PAGE_SHIFT-10), + mm->text << (PAGE_SHIFT-10), + mm->lib << (PAGE_SHIFT-10)); return buffer; } @@ -49,30 +29,15 @@ unsigned long task_vsize(struct mm_struc } int task_statm(struct mm_struct *mm, int *shared, int *text, - int *data, int *resident) + int *lib, int *data, int *resident, int *dirty) { - struct vm_area_struct *vma; - int size = 0; - + *shared = mm->shared; + *text = mm->text; + *lib = mm->lib; + *data = mm->data; + *dirty = mm->dirty; *resident = mm->rss; - for (vma = mm->mmap; vma; vma = vma->vm_next) { - int pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; - - size += pages; - if (is_vm_hugetlb_page(vma)) { - if (!(vma->vm_flags & VM_DONTCOPY)) - *shared += pages; - continue; - } - if (vma->vm_flags & VM_SHARED || !list_empty(&vma->shared)) - *shared += pages; - if (vma->vm_flags & VM_EXECUTABLE) - *text += pages; - else - *data += pages; - } - - return size; + return mm->total_vm; } static int show_map(struct seq_file *m, void *v) diff -prauN linux-2.6.0-test11/fs/proc/task_nommu.c wli-2.6.0-test11-30/fs/proc/task_nommu.c --- linux-2.6.0-test11/fs/proc/task_nommu.c 2003-11-26 12:44:22.000000000 -0800 +++ wli-2.6.0-test11-30/fs/proc/task_nommu.c 2003-12-03 19:11:55.000000000 -0800 @@ -67,19 +67,23 @@ unsigned long task_vsize(struct mm_struc struct mm_tblock_struct *tbp; unsigned long vsize = 0; + down_read(&mm->mmap_sem); for (tbp = &mm->context.tblock; tbp; tbp = tbp->next) { if (tbp->rblock) vsize += kobjsize(tbp->rblock->kblock); } - + up_read(&mm->mmap_sem); return vsize; } int task_statm(struct mm_struct *mm, int *shared, int *text, - int *data, int *resident) + int *lib, int *data, int *resident, int *dirty) { struct mm_tblock_struct *tbp; - int size = kobjsize(mm); + int size; + + down_read(&mm->mmap_sem); + size = kobjsize(mm); for (tbp = &mm->context.tblock; tbp; tbp = tbp->next) { if (tbp->next) @@ -92,8 +96,9 @@ int task_statm(struct mm_struct *mm, int size += (*text = mm->end_code - mm->start_code); size += (*data = mm->start_stack - mm->start_data); - + *shared = *lib = *dirty = 0; *resident = size; + up_read(&mm->mmap_sem); return size; } diff -prauN linux-2.6.0-test11/fs/qnx4/inode.c wli-2.6.0-test11-30/fs/qnx4/inode.c --- linux-2.6.0-test11/fs/qnx4/inode.c 2003-11-26 12:45:29.000000000 -0800 +++ wli-2.6.0-test11-30/fs/qnx4/inode.c 2003-12-04 06:13:40.000000000 -0800 @@ -434,7 +434,7 @@ static int qnx4_readpage(struct file *fi static int qnx4_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { - struct qnx4_inode_info *qnx4_inode = qnx4_i(page->mapping->host); + struct qnx4_inode_info *qnx4_inode = qnx4_i(page_mapping(page)->host); return cont_prepare_write(page, from, to, qnx4_get_block, &qnx4_inode->mmu_private); } diff 
-prauN linux-2.6.0-test11/fs/reiserfs/inode.c wli-2.6.0-test11-30/fs/reiserfs/inode.c --- linux-2.6.0-test11/fs/reiserfs/inode.c 2003-11-26 12:43:32.000000000 -0800 +++ wli-2.6.0-test11-30/fs/reiserfs/inode.c 2003-12-04 06:13:40.000000000 -0800 @@ -2050,7 +2050,7 @@ static void lock_buffer_for_writepage(st lock_buffer(bh); } else { if (test_set_buffer_locked(bh)) { - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); return; } } @@ -2069,7 +2069,7 @@ static void lock_buffer_for_writepage(st * code to handle reiserfs tails. */ static int reiserfs_write_full_page(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host ; + struct inode *inode = page_mapping(page)->host ; unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT ; int error = 0; unsigned long block ; @@ -2222,7 +2222,7 @@ static int reiserfs_readpage (struct fil static int reiserfs_writepage (struct page * page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host ; + struct inode *inode = page_mapping(page)->host ; reiserfs_wait_on_write_block(inode->i_sb) ; return reiserfs_write_full_page(page, wbc) ; } @@ -2230,7 +2230,7 @@ static int reiserfs_writepage (struct pa int reiserfs_prepare_write(struct file *f, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host ; + struct inode *inode = page_mapping(page)->host ; reiserfs_wait_on_write_block(inode->i_sb) ; fix_tail_page_for_writing(page) ; return block_prepare_write(page, from, to, reiserfs_get_block) ; @@ -2243,7 +2243,7 @@ static sector_t reiserfs_aop_bmap(struct static int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host ; + struct inode *inode = page_mapping(page)->host ; loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; int ret ; @@ -2345,7 +2345,7 @@ void i_attrs_to_sd_attrs( struct inode * */ static int reiserfs_releasepage(struct page *page, int unused_gfp_flags) { - struct inode *inode = page->mapping->host ; + struct inode *inode = page_mapping(page)->host ; struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ; struct buffer_head *head ; struct buffer_head *bh ; diff -prauN linux-2.6.0-test11/fs/reiserfs/tail_conversion.c wli-2.6.0-test11-30/fs/reiserfs/tail_conversion.c --- linux-2.6.0-test11/fs/reiserfs/tail_conversion.c 2003-11-26 12:46:12.000000000 -0800 +++ wli-2.6.0-test11-30/fs/reiserfs/tail_conversion.c 2003-12-04 06:26:23.000000000 -0800 @@ -149,7 +149,7 @@ void reiserfs_unmap_buffer(struct buffer interested in removing it from per-sb j_dirty_buffers list, to avoid BUG() on attempt to write not mapped buffer */ if ( !list_empty(&bh->b_assoc_buffers) && bh->b_page) { - struct inode *inode = bh->b_page->mapping->host; + struct inode *inode = page_mapping(bh->b_page)->host; struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); spin_lock(&j->j_dirty_buffers_lock); list_del_init(&bh->b_assoc_buffers); diff -prauN linux-2.6.0-test11/fs/romfs/inode.c wli-2.6.0-test11-30/fs/romfs/inode.c --- linux-2.6.0-test11/fs/romfs/inode.c 2003-11-26 12:44:44.000000000 -0800 +++ wli-2.6.0-test11-30/fs/romfs/inode.c 2003-12-04 06:13:40.000000000 -0800 @@ -414,7 +414,7 @@ out: unlock_kernel(); static int romfs_readpage(struct file *file, struct page * page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; unsigned long offset, avail, readlen; void *buf; int result = -EIO; diff -prauN linux-2.6.0-test11/fs/smbfs/file.c 
wli-2.6.0-test11-30/fs/smbfs/file.c --- linux-2.6.0-test11/fs/smbfs/file.c 2003-11-26 12:45:31.000000000 -0800 +++ wli-2.6.0-test11-30/fs/smbfs/file.c 2003-12-04 06:13:40.000000000 -0800 @@ -172,7 +172,7 @@ smb_writepage_sync(struct inode *inode, static int smb_writepage(struct page *page, struct writeback_control *wbc) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); struct inode *inode; unsigned long end_index; unsigned offset = PAGE_CACHE_SIZE; diff -prauN linux-2.6.0-test11/fs/super.c wli-2.6.0-test11-30/fs/super.c --- linux-2.6.0-test11/fs/super.c 2003-11-26 12:43:50.000000000 -0800 +++ wli-2.6.0-test11-30/fs/super.c 2003-12-04 08:19:38.000000000 -0800 @@ -66,6 +66,7 @@ static struct super_block *alloc_super(v INIT_LIST_HEAD(&s->s_files); INIT_LIST_HEAD(&s->s_instances); INIT_HLIST_HEAD(&s->s_anon); + INIT_LIST_HEAD(&s->s_inodes); init_rwsem(&s->s_umount); sema_init(&s->s_lock, 1); down_write(&s->s_umount); diff -prauN linux-2.6.0-test11/fs/sysv/dir.c wli-2.6.0-test11-30/fs/sysv/dir.c --- linux-2.6.0-test11/fs/sysv/dir.c 2003-11-26 12:43:27.000000000 -0800 +++ wli-2.6.0-test11-30/fs/sysv/dir.c 2003-12-04 06:13:40.000000000 -0800 @@ -39,10 +39,10 @@ static inline unsigned long dir_pages(st static int dir_commit_chunk(struct page *page, unsigned from, unsigned to) { - struct inode *dir = (struct inode *)page->mapping->host; + struct inode *dir = (struct inode *)page_mapping(page)->host; int err = 0; - page->mapping->a_ops->commit_write(NULL, page, from, to); + page_mapping(page)->a_ops->commit_write(NULL, page, from, to); if (IS_DIRSYNC(dir)) err = write_one_page(page, 1); else @@ -225,7 +225,7 @@ got_it: from = (char*)de - (char*)page_address(page); to = from + SYSV_DIRSIZE; lock_page(page); - err = page->mapping->a_ops->prepare_write(NULL, page, from, to); + err = page_mapping(page)->a_ops->prepare_write(NULL, page, from, to); if (err) goto out_unlock; memcpy (de->name, name, namelen); @@ -245,7 +245,7 @@ out_unlock: int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); struct inode *inode = (struct inode*)mapping->host; char *kaddr = (char*)page_address(page); unsigned from = (char*)de - kaddr; @@ -347,13 +347,13 @@ not_empty: void sysv_set_link(struct sysv_dir_entry *de, struct page *page, struct inode *inode) { - struct inode *dir = (struct inode*)page->mapping->host; + struct inode *dir = (struct inode*)page_mapping(page)->host; unsigned from = (char *)de-(char*)page_address(page); unsigned to = from + SYSV_DIRSIZE; int err; lock_page(page); - err = page->mapping->a_ops->prepare_write(NULL, page, from, to); + err = page_mapping(page)->a_ops->prepare_write(NULL, page, from, to); if (err) BUG(); de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); diff -prauN linux-2.6.0-test11/fs/udf/file.c wli-2.6.0-test11-30/fs/udf/file.c --- linux-2.6.0-test11/fs/udf/file.c 2003-11-26 12:44:25.000000000 -0800 +++ wli-2.6.0-test11-30/fs/udf/file.c 2003-12-04 06:13:40.000000000 -0800 @@ -46,7 +46,7 @@ static int udf_adinicb_readpage(struct file *file, struct page * page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; char *kaddr; if (!PageLocked(page)) @@ -64,7 +64,7 @@ static int udf_adinicb_readpage(struct f static int udf_adinicb_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = 
page_mapping(page)->host; char *kaddr; if (!PageLocked(page)) @@ -87,7 +87,7 @@ static int udf_adinicb_prepare_write(str static int udf_adinicb_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; char *kaddr = page_address(page); memcpy(UDF_I_DATA(inode) + UDF_I_LENEATTR(inode) + offset, diff -prauN linux-2.6.0-test11/fs/udf/symlink.c wli-2.6.0-test11-30/fs/udf/symlink.c --- linux-2.6.0-test11/fs/udf/symlink.c 2003-11-26 12:43:25.000000000 -0800 +++ wli-2.6.0-test11-30/fs/udf/symlink.c 2003-12-04 06:13:40.000000000 -0800 @@ -80,7 +80,7 @@ static void udf_pc_to_char(struct super_ static int udf_symlink_filler(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct buffer_head *bh = NULL; char *symlink; int err = -EIO; diff -prauN linux-2.6.0-test11/fs/xfs/linux/xfs_aops.c wli-2.6.0-test11-30/fs/xfs/linux/xfs_aops.c --- linux-2.6.0-test11/fs/xfs/linux/xfs_aops.c 2003-11-26 12:45:32.000000000 -0800 +++ wli-2.6.0-test11-30/fs/xfs/linux/xfs_aops.c 2003-12-04 06:13:40.000000000 -0800 @@ -236,7 +236,7 @@ probe_unwritten_page( if (PageWriteback(page)) goto out; - if (page->mapping && page_has_buffers(page)) { + if (page_mapping(page) && page_has_buffers(page)) { struct buffer_head *bh, *head; unsigned long p_offset = 0; @@ -284,7 +284,7 @@ probe_unmapped_page( if (PageWriteback(page)) goto out; - if (page->mapping && PageDirty(page)) { + if (page_mapping(page) && PageDirty(page)) { if (page_has_buffers(page)) { struct buffer_head *bh, *head; @@ -363,7 +363,7 @@ probe_delalloc_page( if (PageWriteback(page)) goto out; - if (page->mapping && page_has_buffers(page)) { + if (page_mapping(page) && page_has_buffers(page)) { struct buffer_head *bh, *head; int acceptable = 0; @@ -1079,7 +1079,7 @@ linvfs_writepage( int error; int need_trans; int delalloc, unmapped, unwritten; - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; /* * We need a transaction if: @@ -1159,7 +1159,7 @@ linvfs_release_page( struct page *page, int gfp_mask) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int delalloc, unmapped, unwritten; count_page_state(page, &delalloc, &unmapped, &unwritten); diff -prauN linux-2.6.0-test11/include/asm-alpha/mmzone.h wli-2.6.0-test11-30/include/asm-alpha/mmzone.h --- linux-2.6.0-test11/include/asm-alpha/mmzone.h 2003-11-26 12:45:33.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-alpha/mmzone.h 2003-12-04 08:30:37.000000000 -0800 @@ -72,10 +72,6 @@ PLAT_NODE_DATA_LOCALNR(unsigned long p, ((unsigned long)__va(NODE_DATA(kvaddr_to_nid(kaddr))->node_start_pfn \ << PAGE_SHIFT)) -#define kern_addr_valid(kaddr) \ - test_bit(local_mapnr(kaddr), \ - NODE_DATA(kvaddr_to_nid(kaddr))->valid_addr_bitmap) - #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define VALID_PAGE(page) (((page) - mem_map) < max_mapnr) diff -prauN linux-2.6.0-test11/include/asm-alpha/pgalloc.h wli-2.6.0-test11-30/include/asm-alpha/pgalloc.h --- linux-2.6.0-test11/include/asm-alpha/pgalloc.h 2003-11-26 12:42:47.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-alpha/pgalloc.h 2003-12-03 18:20:41.000000000 -0800 @@ -24,9 +24,9 @@ pmd_populate_kernel(struct mm_struct *mm } static inline void -pgd_populate(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmd) +pgd_populate(struct mm_struct *mm, pgd_t *pgd, struct page *pmd) 
{ - pgd_set(pgd, pmd); + pgd_set(pgd, page_address(pmd)); } extern pgd_t *pgd_alloc(struct mm_struct *mm); @@ -37,19 +37,29 @@ pgd_free(pgd_t *pgd) free_page((unsigned long)pgd); } -static inline pmd_t * +static inline struct page * pmd_alloc_one(struct mm_struct *mm, unsigned long address) { - pmd_t *ret = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); - if (ret) - clear_page(ret); - return ret; + struct page *page = alloc_page(GFP_KERNEL|__GFP_REPEAT); + if (page) + clear_highpage(page); + return page; +} + +static inline pmd_t * +pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) +{ + struct page *page = pmd_alloc_one(mm, addr); + if (page) + return page_address(page); + else + return NULL; } static inline void -pmd_free(pmd_t *pmd) +pmd_free(struct page *pmd) { - free_page((unsigned long)pmd); + __free_page(pmd); } extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr); diff -prauN linux-2.6.0-test11/include/asm-alpha/pgtable.h wli-2.6.0-test11-30/include/asm-alpha/pgtable.h --- linux-2.6.0-test11/include/asm-alpha/pgtable.h 2003-11-26 12:43:29.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-alpha/pgtable.h 2003-12-04 08:30:37.000000000 -0800 @@ -228,9 +228,11 @@ pmd_page_kernel(pmd_t pmd) #define pmd_page(pmd) (mem_map + ((pmd_val(pmd) & _PFN_MASK) >> 32)) #endif -extern inline unsigned long pgd_page(pgd_t pgd) +extern inline unsigned long __pgd_page(pgd_t pgd) { return PAGE_OFFSET + ((pgd_val(pgd) & _PFN_MASK) >> (32-PAGE_SHIFT)); } +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) + extern inline int pte_none(pte_t pte) { return !pte_val(pte); } extern inline int pte_present(pte_t pte) { return pte_val(pte) & _PAGE_VALID; } extern inline void pte_clear(pte_t *ptep) { pte_val(*ptep) = 0; } @@ -279,9 +281,15 @@ extern inline pte_t pte_mkyoung(pte_t pt /* Find an entry in the second-level page table.. */ extern inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) { - return (pmd_t *) pgd_page(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PAGE - 1)); + return (pmd_t *)__pgd_page(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PAGE - 1)); } +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table.. 
*/ extern inline pte_t * pte_offset_kernel(pmd_t * dir, unsigned long address) { @@ -323,10 +331,6 @@ extern inline pte_t mk_swap_pte(unsigned #define PTE_FILE_MAX_BITS 32 -#ifndef CONFIG_DISCONTIGMEM -#define kern_addr_valid(addr) (1) -#endif - #define io_remap_page_range(vma, start, busaddr, size, prot) \ remap_page_range(vma, start, virt_to_phys(__ioremap(busaddr, size)), size, prot) diff -prauN linux-2.6.0-test11/include/asm-arm/pgalloc.h wli-2.6.0-test11-30/include/asm-arm/pgalloc.h --- linux-2.6.0-test11/include/asm-arm/pgalloc.h 2003-11-26 12:45:38.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-arm/pgalloc.h 2003-12-03 18:20:41.000000000 -0800 @@ -17,7 +17,8 @@ /* * Since we have only two-level page tables, these are trivial */ -#define pmd_alloc_one(mm,addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm,addr) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm,addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(pmd) do { } while (0) #define pgd_populate(mm,pmd,pte) BUG() diff -prauN linux-2.6.0-test11/include/asm-arm/pgtable.h wli-2.6.0-test11-30/include/asm-arm/pgtable.h --- linux-2.6.0-test11/include/asm-arm/pgtable.h 2003-11-26 12:43:26.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-arm/pgtable.h 2003-12-04 08:30:37.000000000 -0800 @@ -317,6 +317,11 @@ static inline pte_t *pmd_page_kernel(pmd /* Find an entry in the second-level page table.. */ #define pmd_offset(dir, addr) ((pmd_t *)(dir)) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) /* Find an entry in the third-level page table.. */ #define __pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) @@ -339,10 +344,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(swp) ((pte_t) { (swp).val }) -/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ -/* FIXME: this is not correct */ -#define kern_addr_valid(addr) (1) - #include /* diff -prauN linux-2.6.0-test11/include/asm-arm26/pgalloc.h wli-2.6.0-test11-30/include/asm-arm26/pgalloc.h --- linux-2.6.0-test11/include/asm-arm26/pgalloc.h 2003-11-26 12:46:08.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-arm26/pgalloc.h 2003-12-03 18:20:41.000000000 -0800 @@ -55,7 +55,8 @@ pmd_populate_kernel(struct mm_struct *mm * is thrown away. It just can't be zero. -IM */ -#define pmd_alloc_one(mm,addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm,addr) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm,addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(pmd) do { } while (0) #define pgd_populate(mm,pmd,pte) BUG()
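The arm and arm26 stubs above follow the convention the alpha hunks established: pmd_alloc_one() now returns a struct page, so user pagetables can come from highmem where the architecture supports it, while pmd_alloc_one_kernel() keeps handing back a kernel-virtual pmd_t * for init_mm. A sketch of the intended pairing — the two helper functions are illustrative names, not code from the patch:

```c
/* User side: pmds travel as struct page and may be highmem. */
static int map_user_pmd(struct mm_struct *mm, pgd_t *pgd, unsigned long addr)
{
	struct page *pmd = pmd_alloc_one(mm, addr);
	if (!pmd)
		return -ENOMEM;
	pgd_populate(mm, pgd, pmd);	/* now takes a struct page * */
	return 0;
}

/* Kernel side: init_mm pagetables must stay permanently addressable. */
static int map_kernel_pmd(pgd_t *pgd, unsigned long addr)
{
	pmd_t *pmd = pmd_alloc_one_kernel(&init_mm, addr);
	if (!pmd)
		return -ENOMEM;
	pgd_set(pgd, pmd);		/* keeps the virtual-address form */
	return 0;
}
```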
diff -prauN linux-2.6.0-test11/include/asm-arm26/pgtable.h wli-2.6.0-test11-30/include/asm-arm26/pgtable.h --- linux-2.6.0-test11/include/asm-arm26/pgtable.h 2003-11-26 12:42:58.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-arm26/pgtable.h 2003-12-04 08:30:37.000000000 -0800 @@ -99,7 +99,7 @@ extern struct page *empty_zero_page; * on arm26 we have no 2nd level page table. we simulate this by removing the * PMD. * - * pgd_none is 0 to prevernt pmd_alloc() calling __pmd_alloc(). This causes it + * pgd_none is 0 to prevent pmd_alloc_map() calling __pmd_alloc(). This causes it * to return pmd_offset(pgd,addr) which is a pointer to the pgd (IOW, a no-op). * * however, to work this way, whilst we are allocating 32 pgds, containing 32 @@ -134,7 +134,7 @@ extern struct page *empty_zero_page; #define _PMD_PRESENT (0x01) -/* These definitions allow us to optimise out stuff like pmd_alloc() */ +/* These definitions allow us to optimise out stuff like pmd_alloc_map() */ #define pgd_none(pgd) (0) #define pgd_bad(pgd) (0) #define pgd_present(pgd) (1) @@ -188,6 +188,12 @@ extern struct page *empty_zero_page; #define pte_unmap(pte) do { } while (0) #define pte_unmap_nested(pte) do { } while (0) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + #define _PAGE_PRESENT 0x01 #define _PAGE_READONLY 0x02 @@ -265,10 +271,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(swp) ((pte_t) { (swp).val }) -/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ -/* FIXME: this is not correct */ -#define kern_addr_valid(addr) (1) - /* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. diff -prauN linux-2.6.0-test11/include/asm-arm26/rmap.h wli-2.6.0-test11-30/include/asm-arm26/rmap.h --- linux-2.6.0-test11/include/asm-arm26/rmap.h 2003-11-26 12:43:40.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-arm26/rmap.h 2003-12-04 06:13:40.000000000 -0800 @@ -14,14 +14,14 @@ static inline void pgtable_add_rmap(struct page *page, struct mm_struct * mm, unsigned long address) { - page->mapping = (void *)mm; + set_page_mapping(page, mm); page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); inc_page_state(nr_page_table_pages); } static inline void pgtable_remove_rmap(struct page *page) { - page->mapping = NULL; + set_page_mapping(page, NULL); page->index = 0; dec_page_state(nr_page_table_pages); } @@ -29,7 +29,7 @@ static inline void pgtable_remove_rmap(s static inline struct mm_struct * ptep_to_mm(pte_t * ptep) { struct page * page = virt_to_page(ptep); - return (struct mm_struct *)page->mapping; + return (struct mm_struct *)page_mapping(page); } /* The page table takes half of the page */ diff -prauN linux-2.6.0-test11/include/asm-cris/pgalloc.h wli-2.6.0-test11-30/include/asm-cris/pgalloc.h --- linux-2.6.0-test11/include/asm-cris/pgalloc.h 2003-11-26 12:45:26.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-cris/pgalloc.h 2003-12-03 18:20:41.000000000 -0800 @@ -57,7 +57,8 @@ extern inline void pte_free(struct page * the pgd will always be present..
*/ -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm, addr) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm, addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(x) do { } while (0) #define __pmd_free_tlb(tlb,x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() diff -prauN linux-2.6.0-test11/include/asm-cris/pgtable.h wli-2.6.0-test11-30/include/asm-cris/pgtable.h --- linux-2.6.0-test11/include/asm-cris/pgtable.h 2003-11-26 12:42:50.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-cris/pgtable.h 2003-12-04 08:30:37.000000000 -0800 @@ -281,6 +281,12 @@ extern inline pmd_t * pmd_offset(pgd_t * return (pmd_t *) dir; } +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table.. */ #define __pte_offset(address) \ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) @@ -325,8 +331,6 @@ extern inline void update_mmu_cache(stru #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define kern_addr_valid(addr) (1) - #include /* diff -prauN linux-2.6.0-test11/include/asm-generic/rmap.h wli-2.6.0-test11-30/include/asm-generic/rmap.h --- linux-2.6.0-test11/include/asm-generic/rmap.h 2003-11-26 12:44:29.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-generic/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,90 +0,0 @@ -#ifndef _GENERIC_RMAP_H -#define _GENERIC_RMAP_H -/* - * linux/include/asm-generic/rmap.h - * - * Architecture dependent parts of the reverse mapping code, - * this version should work for most architectures with a - * 'normal' page table layout. - * - * We use the struct page of the page table page to find out - * the process and full address of a page table entry: - * - page->mapping points to the process' mm_struct - * - page->index has the high bits of the address - * - the lower bits of the address are calculated from the - * offset of the page table entry within the page table page - * - * For CONFIG_HIGHPTE, we need to represent the address of a pte in a - * scalar pte_addr_t. The pfn of the pte's page is shifted left by PAGE_SIZE - * bits and is then ORed with the byte offset of the pte within its page. - * - * For CONFIG_HIGHMEM4G, the pte_addr_t is 32 bits. 20 for the pfn, 12 for - * the offset. - * - * For CONFIG_HIGHMEM64G, the pte_addr_t is 64 bits. 52 for the pfn, 12 for - * the offset. - */ -#include - -static inline void pgtable_add_rmap(struct page * page, struct mm_struct * mm, unsigned long address) -{ -#ifdef BROKEN_PPC_PTE_ALLOC_ONE - /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... 
;( */ - extern int mem_init_done; - - if (!mem_init_done) - return; -#endif - page->mapping = (void *)mm; - page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); - inc_page_state(nr_page_table_pages); -} - -static inline void pgtable_remove_rmap(struct page * page) -{ - page->mapping = NULL; - page->index = 0; - dec_page_state(nr_page_table_pages); -} - -static inline struct mm_struct * ptep_to_mm(pte_t * ptep) -{ - struct page * page = kmap_atomic_to_page(ptep); - return (struct mm_struct *) page->mapping; -} - -static inline unsigned long ptep_to_address(pte_t * ptep) -{ - struct page * page = kmap_atomic_to_page(ptep); - unsigned long low_bits; - low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE; - return page->index + low_bits; -} - -#ifdef CONFIG_HIGHPTE -static inline pte_addr_t ptep_to_paddr(pte_t *ptep) -{ - pte_addr_t paddr; - paddr = ((pte_addr_t)page_to_pfn(kmap_atomic_to_page(ptep))) << PAGE_SHIFT; - return paddr + (pte_addr_t)((unsigned long)ptep & ~PAGE_MASK); -} -#else -static inline pte_addr_t ptep_to_paddr(pte_t *ptep) -{ - return (pte_addr_t)ptep; -} -#endif - -#ifndef CONFIG_HIGHPTE -static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) -{ - return (pte_t *)pte_paddr; -} - -static inline void rmap_ptep_unmap(pte_t *pte) -{ - return; -} -#endif - -#endif /* _GENERIC_RMAP_H */ diff -prauN linux-2.6.0-test11/include/asm-h8300/pgtable.h wli-2.6.0-test11-30/include/asm-h8300/pgtable.h --- linux-2.6.0-test11/include/asm-h8300/pgtable.h 2003-11-26 12:43:39.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-h8300/pgtable.h 2003-12-04 08:30:37.000000000 -0800 @@ -13,11 +13,15 @@ typedef pte_t *pte_addr_t; #define pgd_none(pgd) (0) #define pgd_bad(pgd) (0) #define pgd_clear(pgdp) -#define kern_addr_valid(addr) (1) #define pmd_offset(a, b) ((void *)0) #define pmd_none(pmd) (1) #define pgd_offset_k(adrdress) ((pgd_t *)0) #define pte_offset_kernel(dir, address) ((pte_t *)0) +#define pmd_offset_kernel(a,b) pmd_offset(a,b) +#define pmd_offset_map(a,b) pmd_offset(a,b) +#define pmd_offset_map_nested(a,b) pmd_offset(a,b) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) #define PAGE_NONE __pgprot(0) /* these mean nothing to NO_MM */ #define PAGE_SHARED __pgprot(0) /* these mean nothing to NO_MM */ diff -prauN linux-2.6.0-test11/include/asm-i386/a.out.h wli-2.6.0-test11-30/include/asm-i386/a.out.h --- linux-2.6.0-test11/include/asm-i386/a.out.h 2003-11-26 12:43:37.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-i386/a.out.h 2003-12-04 07:37:51.000000000 -0800 @@ -19,7 +19,11 @@ struct exec #ifdef __KERNEL__ +#ifdef CONFIG_MMAP_TOPDOWN +#define STACK_TOP ((128 << 20) + (256 << 10)) +#else #define STACK_TOP TASK_SIZE +#endif #endif diff -prauN linux-2.6.0-test11/include/asm-i386/highmem.h wli-2.6.0-test11-30/include/asm-i386/highmem.h --- linux-2.6.0-test11/include/asm-i386/highmem.h 2003-11-26 12:42:57.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-i386/highmem.h 2003-12-03 19:25:21.000000000 -0800 @@ -41,9 +41,9 @@ extern void kmap_init(void); * chunk of RAM. 
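asm-generic/rmap.h is deleted outright above: the helpers that recovered an mm and virtual address from a pagetable page's mapping and index fields go away with the pte-chain scheme they served. Only arm26 keeps a private copy, and its version (earlier in this section) now stores through a set_page_mapping() accessor instead of assigning page->mapping directly. That accessor is defined elsewhere in the patch; the simplest definition consistent with the arm26 callers is:

```c
/*
 * Hypothetical minimal form. The value of the indirection is the same
 * as for page_mapping(): a pagetable page's "mapping" is really an
 * mm_struct, and every store now passes through one choke point where
 * the representation can be changed.
 */
static inline void set_page_mapping(struct page *page, void *mapping)
{
	page->mapping = (struct address_space *)mapping;
}
```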
*/ #if NR_CPUS <= 32 -#define PKMAP_BASE (0xff800000UL) +#define PKMAP_BASE (0xff400000UL) #else -#define PKMAP_BASE (0xff600000UL) +#define PKMAP_BASE (0xfe800000UL) #endif #ifdef CONFIG_X86_PAE #define LAST_PKMAP 512 @@ -54,14 +54,60 @@ extern void kmap_init(void); #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void * FASTCALL(kmap_high(struct page *page)); -extern void FASTCALL(kunmap_high(struct page *page)); +void *FASTCALL(kmap_high(struct page *page)); +void FASTCALL(kunmap_high(struct page *page)); -void *kmap(struct page *page); -void kunmap(struct page *page); -void *kmap_atomic(struct page *page, enum km_type type); -void kunmap_atomic(void *kvaddr, enum km_type type); -struct page *kmap_atomic_to_page(void *ptr); +void *FASTCALL(__kmap_atomic(struct page *page, enum km_type type, unsigned long vaddr)); + +static inline void *kmap(struct page *page) +{ + might_sleep(); + if (page < highmem_start_page) + return lowmem_page_address(page); + else + return kmap_high(page); +} + +static inline void kunmap(struct page *page) +{ + BUG_ON(in_interrupt()); + if (page >= highmem_start_page) + kunmap_high(page); +} + +static inline void *kmap_atomic(struct page *page, enum km_type type) +{ + inc_preempt_count(); + if (page < highmem_start_page) + return lowmem_page_address(page); + else + return __kmap_atomic(page, type, __fix_to_virt(FIX_KMAP_BEGIN + type)); +} + +#ifdef CONFIG_DEBUG_HIGHMEM +void FASTCALL(__kunmap_atomic(void *kvaddr, enum km_type type, unsigned long vaddr)); +#else +static inline void __kunmap_atomic(void *kvaddr, enum km_type idx, unsigned long vaddr) +{ +} +#endif + +static inline void kunmap_atomic(void *kvaddr, enum km_type type) +{ + if ((unsigned long)kvaddr >= FIXADDR_START) + __kunmap_atomic(kvaddr, type, __fix_to_virt(FIX_KMAP_BEGIN + type)); + dec_preempt_count(); +} + +static inline struct page *kmap_atomic_to_page(void *vaddr) +{ + if ((unsigned long)vaddr < FIXADDR_START) + return virt_to_page(vaddr); + else { + unsigned long idx = virt_to_fix((unsigned long)vaddr); + return pte_page(*(kmap_pte - (idx - FIX_KMAP_BEGIN))); + } +} #define flush_cache_kmaps() do { } while (0) diff -prauN linux-2.6.0-test11/include/asm-i386/kmap_types.h wli-2.6.0-test11-30/include/asm-i386/kmap_types.h --- linux-2.6.0-test11/include/asm-i386/kmap_types.h 2003-11-26 12:44:56.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-i386/kmap_types.h 2003-12-03 18:20:41.000000000 -0800 @@ -17,14 +17,16 @@ D(3) KM_USER0, D(4) KM_USER1, D(5) KM_BIO_SRC_IRQ, D(6) KM_BIO_DST_IRQ, -D(7) KM_PTE0, -D(8) KM_PTE1, -D(9) KM_PTE2, -D(10) KM_IRQ0, -D(11) KM_IRQ1, -D(12) KM_SOFTIRQ0, -D(13) KM_SOFTIRQ1, -D(14) KM_TYPE_NR +D(7) KM_PMD0, +D(8) KM_PMD1, +D(9) KM_PTE0, +D(10) KM_PTE1, +D(11) KM_PTE2, +D(12) KM_IRQ0, +D(13) KM_IRQ1, +D(14) KM_SOFTIRQ0, +D(15) KM_SOFTIRQ1, +D(16) KM_TYPE_NR }; #undef D diff -prauN linux-2.6.0-test11/include/asm-i386/linkage.h wli-2.6.0-test11-30/include/asm-i386/linkage.h --- linux-2.6.0-test11/include/asm-i386/linkage.h 2003-11-26 12:46:10.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-i386/linkage.h 2003-12-03 19:38:56.000000000 -0800 @@ -3,6 +3,7 @@ #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) #define FASTCALL(x) x __attribute__((regparm(3))) +#define IRQHANDLER(x) x __attribute__((regparm(1))) #ifdef CONFIG_X86_ALIGNMENT_16 #define __ALIGN .align 16,0x90 diff -prauN linux-2.6.0-test11/include/asm-i386/mmzone.h wli-2.6.0-test11-30/include/asm-i386/mmzone.h --- 
linux-2.6.0-test11/include/asm-i386/mmzone.h 2003-11-26 12:44:10.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-i386/mmzone.h 2003-12-04 08:30:37.000000000 -0800 @@ -62,13 +62,6 @@ extern struct pglist_data *node_data[]; (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \ }) -#define kern_addr_valid(kaddr) \ -({ \ - unsigned long __kaddr = (unsigned long)(kaddr); \ - pg_data_t *__pgdat = NODE_DATA(kvaddr_to_nid(__kaddr)); \ - test_bit(local_mapnr(__kaddr), __pgdat->valid_addr_bitmap); \ -}) - #define pfn_to_page(pfn) \ ({ \ unsigned long __pfn = pfn; \ diff -prauN linux-2.6.0-test11/include/asm-i386/numaq.h wli-2.6.0-test11-30/include/asm-i386/numaq.h --- linux-2.6.0-test11/include/asm-i386/numaq.h 2003-11-26 12:44:18.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-i386/numaq.h 2003-12-04 07:27:23.000000000 -0800 @@ -28,7 +28,8 @@ #ifdef CONFIG_X86_NUMAQ -extern int get_memcfg_numaq(void); +#define MAX_NODE_CPUS 4 +int get_memcfg_numaq(void); /* * SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the diff -prauN linux-2.6.0-test11/include/asm-i386/page.h wli-2.6.0-test11-30/include/asm-i386/page.h --- linux-2.6.0-test11/include/asm-i386/page.h 2003-11-26 12:43:09.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-i386/page.h 2003-12-03 19:38:56.000000000 -0800 @@ -3,7 +3,11 @@ /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 -#define PAGE_SIZE (1UL << PAGE_SHIFT) +#ifndef __ASSEMBLY__ +#define PAGE_SIZE (1UL << PAGE_SHIFT) +#else +#define PAGE_SIZE (1 << PAGE_SHIFT) +#endif #define PAGE_MASK (~(PAGE_SIZE-1)) #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) diff -prauN linux-2.6.0-test11/include/asm-i386/percpu.h wli-2.6.0-test11-30/include/asm-i386/percpu.h --- linux-2.6.0-test11/include/asm-i386/percpu.h 2003-11-26 12:45:37.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-i386/percpu.h 2003-12-04 07:27:23.000000000 -0800 @@ -3,4 +3,9 @@ #include +#ifdef CONFIG_NUMA +#undef __GENERIC_PER_CPU +void setup_per_cpu_areas(void); +#endif + #endif /* __ARCH_I386_PERCPU__ */ diff -prauN linux-2.6.0-test11/include/asm-i386/pgalloc.h wli-2.6.0-test11-30/include/asm-i386/pgalloc.h --- linux-2.6.0-test11/include/asm-i386/pgalloc.h 2003-11-26 12:42:55.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-i386/pgalloc.h 2003-12-03 18:30:38.000000000 -0800 @@ -31,25 +31,36 @@ static inline void pte_free_kernel(pte_t free_page((unsigned long)pte); } -static inline void pte_free(struct page *pte) -{ - __free_page(pte); -} - - -#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) - /* * allocating and freeing a pmd is trivial: the 1-entry pmd is * inside the pgd, so has no extra memory associated with it. * (In the PAE case we free the pmds as part of the pgd.) 
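The page.h hunk above is a small but necessary portability fix: the UL suffix is C, not assembler, so an assembly file including page.h could never use PAGE_SIZE in an immediate. Splitting the definition on __ASSEMBLY__ keeps the typed constant for C while giving .S files a plain number:

```c
/* C translation units still get the unsigned long constant: */
unsigned long page_mask = ~(PAGE_SIZE - 1);	/* (1UL << 12) */

/* whereas assembly can now write, e.g.:
 *	movl $PAGE_SIZE, %ecx
 * which would not assemble if PAGE_SIZE expanded to (1UL << 12). */
```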
*/ -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm, addr) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm, addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(x) do { } while (0) #define __pmd_free_tlb(tlb,x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() #define check_pgt_cache() do { } while (0) +#include + +static inline void pte_free(struct page *page) +{ + struct mmu_gather *tlb = &per_cpu(mmu_gathers, get_cpu()); + tlb_remove_page(tlb, page); + put_cpu(); +} + +static inline void pte_free_tlb(struct mmu_gather *tlb, struct page *page) +{ + tlb_remove_page(tlb, page); +} + +static inline void pmd_free_tlb(struct mmu_gather *tlb, struct page *page) +{ +} + #endif /* _I386_PGALLOC_H */ diff -prauN linux-2.6.0-test11/include/asm-i386/pgtable-2level.h wli-2.6.0-test11-30/include/asm-i386/pgtable-2level.h --- linux-2.6.0-test11/include/asm-i386/pgtable-2level.h 2003-11-26 12:44:17.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-i386/pgtable-2level.h 2003-12-03 18:20:41.000000000 -0800 @@ -48,13 +48,15 @@ static inline int pgd_present(pgd_t pgd) #define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval) #define set_pgd(pgdptr, pgdval) (*(pgdptr) = pgdval) -#define pgd_page(pgd) \ -((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) + +#define pmd_offset_map(pgd, addr) ({ (pmd_t *)(pgd); }) +#define pmd_offset_map_nested(pgd, addr) pmd_offset_map(pgd, addr) +#define pmd_offset_kernel(pgd, addr) pmd_offset_map(pgd, addr) + +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) -static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) -{ - return (pmd_t *) dir; -} #define ptep_get_and_clear(xp) __pte(xchg(&(xp)->pte_low, 0)) #define pte_same(a, b) ((a).pte_low == (b).pte_low) #define pte_page(x) pfn_to_page(pte_pfn(x)) diff -prauN linux-2.6.0-test11/include/asm-i386/pgtable-3level.h wli-2.6.0-test11-30/include/asm-i386/pgtable-3level.h --- linux-2.6.0-test11/include/asm-i386/pgtable-3level.h 2003-11-26 12:45:20.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-i386/pgtable-3level.h 2003-12-03 18:20:41.000000000 -0800 @@ -64,12 +64,32 @@ static inline void set_pte(pte_t *ptep, */ static inline void pgd_clear (pgd_t * pgd) { } -#define pgd_page(pgd) \ -((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +static inline unsigned long pgd_pfn(pgd_t pgd) +{ + return pgd_val(pgd) >> PAGE_SHIFT; +} + +#define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd)) + +#define pmd_offset_kernel(pgd, addr) \ + ((pmd_t *)__va(pgd_val(*(pgd)) & PAGE_MASK) + pmd_index(addr)) /* Find an entry in the second-level page table.. 
*/ -#define pmd_offset(dir, address) ((pmd_t *) pgd_page(*(dir)) + \ - pmd_index(address)) +#ifdef CONFIG_HIGHPMD +#define __pmd_offset(pgd, addr, type) \ + ((pmd_t *)kmap_atomic(pgd_page(*(pgd)), type) + pmd_index(addr)) +#define __pmd_unmap(pmd, type) kunmap_atomic(pmd, type) +#else +#define __pmd_offset(pgd, addr, type) \ + ((pmd_t *)__va(pgd_val(*(pgd)) & PAGE_MASK) + pmd_index(addr)) +#define __pmd_unmap(pmd, type) do { } while (0) +#endif + +#define pmd_offset_map(pgd, addr) __pmd_offset(pgd, addr, KM_PMD0) +#define pmd_offset_map_nested(pgd, addr) __pmd_offset(pgd, addr, KM_PMD1) + +#define pmd_unmap(pmd) __pmd_unmap(pmd, KM_PMD0) +#define pmd_unmap_nested(pmd) __pmd_unmap(pmd, KM_PMD1) static inline pte_t ptep_get_and_clear(pte_t *ptep) { diff -prauN linux-2.6.0-test11/include/asm-i386/pgtable.h wli-2.6.0-test11-30/include/asm-i386/pgtable.h --- linux-2.6.0-test11/include/asm-i386/pgtable.h 2003-11-26 12:44:59.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-i386/pgtable.h 2003-12-04 08:30:37.000000000 -0800 @@ -25,6 +25,10 @@ #include #include +#ifdef CONFIG_MMAP_TOPDOWN +#define HAVE_ARCH_UNMAPPED_AREA +#endif + /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. @@ -33,16 +37,17 @@ extern unsigned long empty_zero_page[1024]; extern pgd_t swapper_pg_dir[1024]; extern kmem_cache_t *pgd_cache; -extern kmem_cache_t *pmd_cache; extern spinlock_t pgd_lock; extern struct list_head pgd_list; -void pmd_ctor(void *, kmem_cache_t *, unsigned long); void pgd_ctor(void *, kmem_cache_t *, unsigned long); void pgd_dtor(void *, kmem_cache_t *, unsigned long); void pgtable_cache_init(void); void paging_init(void); +#define HAVE_ARCH_PAGETABLE_CACHE +void shrink_pagetable_cache(int gfp_mask); + #endif /* !__ASSEMBLY__ */ /* @@ -335,10 +340,6 @@ typedef pte_t *pte_addr_t; #endif /* !__ASSEMBLY__ */ -#ifndef CONFIG_DISCONTIGMEM -#define kern_addr_valid(addr) (1) -#endif /* !CONFIG_DISCONTIGMEM */ - #define io_remap_page_range remap_page_range #endif /* _I386_PGTABLE_H */
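With CONFIG_HIGHPMD the pmd page itself may live in highmem, so second-level lookups become a map/use/unmap bracket around an atomic kmap, with KM_PMD0 for ordinary lookups and KM_PMD1 when two pmds must be held at once (hence the new slots in kmap_types.h earlier). A sketch of the calling discipline generic code has to follow — pmd_present_at() is an illustrative name only:

```c
static inline int pmd_present_at(pgd_t *pgd, unsigned long addr)
{
	pmd_t *pmd = pmd_offset_map(pgd, addr);	/* kmap_atomic(..., KM_PMD0) under HIGHPMD */
	int ret = pmd_present(*pmd);

	pmd_unmap(pmd);		/* must release the slot before returning */
	return ret;
}
```

The _nested pair exists for copy_page_range()-style paths where source and destination pagetables are mapped simultaneously.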
diff -prauN linux-2.6.0-test11/include/asm-i386/processor.h wli-2.6.0-test11-30/include/asm-i386/processor.h --- linux-2.6.0-test11/include/asm-i386/processor.h 2003-11-26 12:42:55.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-i386/processor.h 2003-12-03 19:40:24.000000000 -0800 @@ -481,22 +481,29 @@ struct task_struct; struct mm_struct; /* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); +void release_thread(struct task_struct *); /* Prepare to copy thread state - unlazy all lazy status */ -extern void prepare_to_copy(struct task_struct *tsk); +void prepare_to_copy(struct task_struct *); /* * create a kernel thread without removing it from tasklists */ -extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); +int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); -extern unsigned long thread_saved_pc(struct task_struct *tsk); +unsigned long thread_saved_pc(struct task_struct *); void show_trace(struct task_struct *task, unsigned long *stack); -unsigned long get_wchan(struct task_struct *p); -#define KSTK_EIP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)->thread_info))[1019]) -#define KSTK_ESP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)->thread_info))[1022]) +unsigned long get_wchan(struct task_struct *task); + +#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long)) +#define task_pt_regs(task) \ +({ \ + unsigned long *__ptr = (unsigned long *)(task)->thread_info; \ + ((struct pt_regs *)&__ptr[THREAD_SIZE_LONGS] - 1); \ +}) +#define KSTK_EIP(task) (task_pt_regs(task)->eip) +#define KSTK_ESP(task) (task_pt_regs(task)->esp) struct microcode_header { unsigned int hdrver; diff -prauN linux-2.6.0-test11/include/asm-i386/rmap.h wli-2.6.0-test11-30/include/asm-i386/rmap.h --- linux-2.6.0-test11/include/asm-i386/rmap.h 2003-11-26 12:44:30.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-i386/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,21 +0,0 @@ -#ifndef _I386_RMAP_H -#define _I386_RMAP_H - -/* nothing to see, move along */ -#include - -#ifdef CONFIG_HIGHPTE -static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) -{ - unsigned long pfn = (unsigned long)(pte_paddr >> PAGE_SHIFT); - unsigned long off = ((unsigned long)pte_paddr) & ~PAGE_MASK; - return (pte_t *)((char *)kmap_atomic(pfn_to_page(pfn), KM_PTE2) + off); -} - -static inline void rmap_ptep_unmap(pte_t *pte) -{ - kunmap_atomic(pte, KM_PTE2); -} -#endif - -#endif diff -prauN linux-2.6.0-test11/include/asm-i386/srat.h wli-2.6.0-test11-30/include/asm-i386/srat.h --- linux-2.6.0-test11/include/asm-i386/srat.h 2003-11-26 12:43:27.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-i386/srat.h 2003-12-04 07:27:23.000000000 -0800 @@ -31,7 +31,8 @@ #error CONFIG_ACPI_SRAT not defined, and srat.h header has been included #endif -extern int get_memcfg_from_srat(void); -extern unsigned long *get_zholes_size(int); +#define MAX_NODE_CPUS 4 +int get_memcfg_from_srat(void); +unsigned long *get_zholes_size(int); #endif /* _ASM_SRAT_H_ */ diff -prauN linux-2.6.0-test11/include/asm-i386/system.h wli-2.6.0-test11-30/include/asm-i386/system.h --- linux-2.6.0-test11/include/asm-i386/system.h 2003-11-26 12:42:48.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-i386/system.h 2003-12-03 18:30:38.000000000 -0800 @@ -461,6 +461,18 @@ struct alt_instr { /* For spinlocks etc */ #define local_irq_save(x) __asm__ __volatile__("pushfl ; popl %0 ; cli":"=g" (x): /* no input */ :"memory") +#ifdef CONFIG_SMP +#define smp_local_irq_save(x) local_irq_save(x) +#define smp_local_irq_restore(x) local_irq_restore(x) +#define smp_local_irq_disable() local_irq_disable() +#define smp_local_irq_enable() local_irq_enable() +#else +#define smp_local_irq_save(x) do { (void)(x); } while (0) +#define smp_local_irq_restore(x) do { (void)(x); } while (0) +#define smp_local_irq_disable() do { } while (0) +#define smp_local_irq_enable() do { } while (0) +#endif /* CONFIG_SMP */ + /* * disable hlt during certain critical i/o operations */
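The smp_local_irq_*() wrappers fall through to the real IRQ primitives on SMP and compile away on UP. They are for per-CPU state that only an IPI could touch concurrently — on a UP kernel no such interleaving exists, so the interrupt-disable cost can be dropped; the (void)(x) keeps the flags variable "used" so no warning appears. The tlb.h code below is the intended customer:

```c
/* Usage pattern (illustrative function name, not from the patch): */
static void touch_this_cpus_gather(void)
{
	unsigned long flags;

	smp_local_irq_save(flags);	/* real cli only where an IPI could race */
	/* ... manipulate this CPU's mmu_gather lists ... */
	smp_local_irq_restore(flags);	/* no-op on UP kernels */
}
```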
diff -prauN linux-2.6.0-test11/include/asm-i386/thread_info.h wli-2.6.0-test11-30/include/asm-i386/thread_info.h --- linux-2.6.0-test11/include/asm-i386/thread_info.h 2003-11-26 12:43:06.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-i386/thread_info.h 2003-12-03 19:41:16.000000000 -0800 @@ -9,6 +9,8 @@ #ifdef __KERNEL__ +#include +#include #ifndef __ASSEMBLY__ #include #endif @@ -30,9 +32,11 @@ struct thread_info { __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: + 0 for interrupts: illegal 0-0xBFFFFFFF for user-thread 0-0xFFFFFFFF for kernel-thread */ + struct thread_info *irq_stack; /* pointer to cpu irq stack */ struct restart_block restart_block; __u8 supervisor_stack[0]; @@ -48,7 +52,8 @@ struct thread_info { #define TI_CPU 0x00000010 #define TI_PRE_COUNT 0x00000014 #define TI_ADDR_LIMIT 0x00000018 -#define TI_RESTART_BLOCK 0x000001C +#define TI_IRQ_STACK 0x0000001C +#define TI_RESTART_BLOCK 0x00000020 #endif @@ -59,46 +64,60 @@ struct thread_info { * * preempt_count needs to be 1 initially, until the scheduler is functional. */ +#ifdef CONFIG_4K_STACK +#define THREAD_ORDER 0 +#else +#define THREAD_ORDER 1 +#endif +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) +#define INIT_THREAD_SIZE THREAD_SIZE +#define STACK_WARN (THREAD_SIZE/4) +#define STACK_PANIC (THREAD_SIZE >= 8192 ? THREAD_SIZE/8 : 256) + #ifndef __ASSEMBLY__ -#define INIT_THREAD_INFO(tsk) \ -{ \ - .task = &tsk, \ - .exec_domain = &default_exec_domain, \ - .flags = 0, \ - .cpu = 0, \ - .preempt_count = 1, \ - .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ +#define INIT_THREAD_INFO(tsk) \ +{ \ + .task = &tsk, \ + .exec_domain = &default_exec_domain, \ + .flags = 0, \ + .cpu = 0, \ + .preempt_count = 1, \ + .addr_limit = KERNEL_DS, \ + .irq_stack = &init_irq_union.thread_info, \ + .restart_block = { \ + .fn = do_no_restart_syscall, \ + } \ } #define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) +/* thread information allocation */ +#define alloc_thread_info(task) ((struct thread_info *)kmalloc(THREAD_SIZE, GFP_KERNEL)) +#define free_thread_info(info) kfree(info) +#define get_thread_info(ti) get_task_struct((ti)->task) +#define put_thread_info(ti) put_task_struct((ti)->task) + /* how to get the thread information struct from C */ static inline struct thread_info *current_thread_info(void) { struct thread_info *ti; - __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~8191UL)); + __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~(THREAD_SIZE - 1))); return ti; } -/* thread information allocation */ -#define THREAD_SIZE (2*PAGE_SIZE) -#define alloc_thread_info(task) ((struct thread_info *)kmalloc(THREAD_SIZE, GFP_KERNEL)) -#define free_thread_info(info) kfree(info) -#define get_thread_info(ti) get_task_struct((ti)->task) -#define put_thread_info(ti) put_task_struct((ti)->task) - #else /* !__ASSEMBLY__ */ /* how to get the thread information struct from ASM */ #define GET_THREAD_INFO(reg) \ - movl $-8192, reg; \ + movl $-THREAD_SIZE, reg; \ andl %esp, reg +/* use this one if reg already contains %esp */ +#define GET_THREAD_INFO_WITH_ESP(reg) \ + andl $-THREAD_SIZE, reg + #endif /*
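Three related changes land in thread_info.h: the stack size becomes configurable (CONFIG_4K_STACK selects THREAD_ORDER 0, a single page, with STACK_WARN/STACK_PANIC watermarks to catch overruns), every hard-coded 8191/8192 becomes a THREAD_SIZE mask, and thread_info grows an irq_stack pointer (initialized from init_irq_union, which is defined elsewhere in the patch) so interrupts can run on dedicated per-CPU stacks rather than on whichever task stack they land on. The masking idiom works because stacks are THREAD_SIZE-sized and THREAD_SIZE-aligned:

```c
/* Restating the current_thread_info() idiom: round the stack pointer
 * down to a THREAD_SIZE boundary and the thread_info sits right there. */
static inline struct thread_info *thread_info_of(unsigned long esp)
{
	return (struct thread_info *)(esp & ~(THREAD_SIZE - 1));
}
```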
diff -prauN linux-2.6.0-test11/include/asm-i386/tlb.h wli-2.6.0-test11-30/include/asm-i386/tlb.h --- linux-2.6.0-test11/include/asm-i386/tlb.h 2003-11-26 12:43:30.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-i386/tlb.h 2003-12-03 18:30:38.000000000 -0800 @@ -1,10 +1,58 @@ #ifndef _I386_TLB_H #define _I386_TLB_H +/* + * include/asm-i386/tlb.h + * (C) June 2003 William Irwin, IBM + * Routines for pagetable caching and release. + */ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_HIGHPTE +#define GFP_PTE (GFP_KERNEL|__GFP_REPEAT|__GFP_HIGHMEM) +#else +#define GFP_PTE (GFP_KERNEL|__GFP_REPEAT) +#endif + +#ifdef CONFIG_HIGHPMD +#define GFP_PMD (GFP_KERNEL|__GFP_REPEAT|__GFP_HIGHMEM) +#else +#define GFP_PMD (GFP_KERNEL|__GFP_REPEAT) +#endif + +#define PG_PTE PG_arch_1 +#define NR_PTE 128 +#define FREE_PTE_NR NR_PTE +#define NR_NONPTE 512 +#define MAX_ZONE_ID (MAX_NUMNODES * MAX_NR_ZONES) + +#define PagePTE(page) test_bit(PG_PTE, &(page)->flags) +#define SetPagePTE(page) set_bit(PG_PTE, &(page)->flags) +#define ClearPagePTE(page) clear_bit(PG_PTE, &(page)->flags) +#define TestSetPagePTE(page) test_and_set_bit(PG_PTE, &(page)->flags) +#define TestClearPagePTE(page) test_and_clear_bit(PG_PTE, &(page)->flags) +#define PageZoneID(page) ((page)->flags >> ZONE_SHIFT) /* - * x86 doesn't need any special per-pte or - * per-vma handling.. + * vmscan.c does smp_call_function() to shoot down cached pagetables under + * memory pressure. */ +struct mmu_gather { + struct mm_struct *mm; + int nr_pte_active, nr_pte_ready, nr_nonpte, need_flush, fullmm, freed; + struct list_head active_list[MAX_ZONE_ID], ready_list[MAX_ZONE_ID]; + int active_count[MAX_ZONE_ID], ready_count[MAX_ZONE_ID]; + struct page *nonpte[NR_NONPTE]; +}; + +DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); + #define tlb_start_vma(tlb, vma) do { } while (0) #define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) @@ -15,6 +63,122 @@ */ #define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) -#include +void tlb_init(void); -#endif +static inline +struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, unsigned int flush) +{ + struct mmu_gather *tlb = &per_cpu(mmu_gathers, get_cpu()); + tlb->mm = mm; + tlb->fullmm = flush; + tlb->freed = 0; + put_cpu(); + return tlb; +} + +static inline +void tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *pte, unsigned long addr) +{ + tlb->need_flush = 1; +} + +static inline +void tlb_flush_ready(struct mmu_gather *tlb) +{ + int zone; + + for (zone = 0; tlb->nr_pte_ready >= NR_PTE && zone < MAX_ZONE_ID; ++zone) { + struct page *head; + + if (!tlb->ready_count[zone]) + continue; + + head = list_entry(tlb->ready_list[zone].next, struct page, list); + list_del_init(&head->list); + list_splice_init(&tlb->ready_list[zone], &head->list); + head->private = tlb->ready_count[zone]; + tlb->nr_pte_ready -= tlb->ready_count[zone]; + tlb->ready_count[zone] = 0; + free_pages_bulk(zone_table[zone], head, 0); + } +} + +static inline +void tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) +{ + int zone; + unsigned long flags; + + if (!tlb->need_flush && tlb->nr_nonpte < NR_NONPTE) + return; + + tlb->need_flush = 0; + tlb_flush(tlb); + + smp_local_irq_save(flags); + + if (tlb->nr_nonpte) { + free_pages_and_swap_cache(tlb->nonpte, tlb->nr_nonpte); + tlb->nr_nonpte = 0; + } + + for (zone = 0; zone < MAX_ZONE_ID; ++zone) { + if (!tlb->active_count[zone]) + continue; + + list_splice_init(&tlb->active_list[zone], &tlb->ready_list[zone]); + tlb->ready_count[zone] += tlb->active_count[zone]; + tlb->active_count[zone] = 0; + } + tlb->nr_pte_ready += tlb->nr_pte_active; + tlb->nr_pte_active = 0; + if (tlb->nr_pte_ready >= NR_PTE) +
tlb_flush_ready(tlb); + + smp_local_irq_restore(flags); +} + +static inline +void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) +{ + if (tlb->mm->rss >= tlb->freed) + tlb->mm->rss -= tlb->freed; + else + tlb->mm->rss = 0; + tlb_flush_mmu(tlb, start, end); +} + +static inline +void tlb_remove_nonpte_page(struct mmu_gather *tlb, struct page *page) +{ + tlb->nonpte[tlb->nr_nonpte] = page; + tlb->nr_nonpte++; + if (tlb->nr_nonpte >= NR_NONPTE) + tlb_flush_mmu(tlb, 0, 0); +} + +static inline +void tlb_remove_pte_page(struct mmu_gather *tlb, struct page *page) +{ + int zone = PageZoneID(page); + ClearPagePTE(page); + tlb->nr_pte_active++; + tlb->active_count[zone]++; + list_add(&page->list, &tlb->active_list[zone]); +} + +static inline +void tlb_remove_page(struct mmu_gather *tlb, struct page *page) +{ + unsigned long flags; + + smp_local_irq_save(flags); + tlb->need_flush = 1; + if (PagePTE(page)) + tlb_remove_pte_page(tlb, page); + else + tlb_remove_nonpte_page(tlb, page); + smp_local_irq_restore(flags); +} + +#endif /* _I386_TLB_H */ diff -prauN linux-2.6.0-test11/include/asm-ia64/pgalloc.h wli-2.6.0-test11-30/include/asm-ia64/pgalloc.h --- linux-2.6.0-test11/include/asm-ia64/pgalloc.h 2003-11-26 12:42:51.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-ia64/pgalloc.h 2003-12-03 18:20:41.000000000 -0800 @@ -71,9 +71,9 @@ pgd_free (pgd_t *pgd) } static inline void -pgd_populate (struct mm_struct *mm, pgd_t *pgd_entry, pmd_t *pmd) +pgd_populate (struct mm_struct *mm, pgd_t *pgd_entry, struct page *pmd) { - pgd_val(*pgd_entry) = __pa(pmd); + pgd_val(*pgd_entry) = __pa(page_address(pmd)); } @@ -90,8 +90,8 @@ pmd_alloc_one_fast (struct mm_struct *mm return (pmd_t *)ret; } -static inline pmd_t* -pmd_alloc_one (struct mm_struct *mm, unsigned long addr) +static inline pmd_t * +pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) { pmd_t *pmd = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); @@ -100,9 +100,16 @@ pmd_alloc_one (struct mm_struct *mm, uns return pmd; } +static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + pmd_t *pmd = pmd_alloc_one_kernel(mm, addr); + return pmd ? virt_to_page(pmd) : NULL; +} + static inline void -pmd_free (pmd_t *pmd) +pmd_free(struct page *page) { + pmd_t *pmd = page_address(page); *(unsigned long *)pmd = (unsigned long) pmd_quicklist; pmd_quicklist = (unsigned long *) pmd; ++pgtable_cache_size; diff -prauN linux-2.6.0-test11/include/asm-ia64/pgtable.h wli-2.6.0-test11-30/include/asm-ia64/pgtable.h --- linux-2.6.0-test11/include/asm-ia64/pgtable.h 2003-11-26 12:43:30.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-ia64/pgtable.h 2003-12-04 08:30:37.000000000 -0800 @@ -176,22 +176,6 @@ ia64_phys_addr_valid (unsigned long addr } /* - * kern_addr_valid(ADDR) tests if ADDR is pointing to valid kernel - * memory. For the return value to be meaningful, ADDR must be >= - * PAGE_OFFSET. This operation can be relatively expensive (e.g., - * require a hash-, or multi-level tree-lookup or something of that - * sort) but it guarantees to return TRUE only if accessing the page - * at that address does not cause an error. Note that there may be - * addresses for which kern_addr_valid() returns FALSE even though an - * access would not cause an error (e.g., this is typically true for - * memory mapped I/O regions. - * - * XXX Need to implement this for IA-64. - */ -#define kern_addr_valid(addr) (1) - - -/* * Now come the defines and routines to manage and access the three-level * page table. 
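That completes the core of the i386 pagetable cache: tlb_remove_page() routes anything flagged PagePTE onto a per-zone active list instead of freeing it, tlb_flush_mmu() moves active pages to the ready lists only after the TLB has really been flushed, and once NR_PTE ready pages accumulate tlb_flush_ready() strings each zone's batch together through page->list and returns it with free_pages_bulk() — per-zone because a page must go back to the buddy lists of the zone it came from. In outline, for a pagetable page freed at process teardown:

```c
/*
 * Outline of the flow defined above, not new code; 'ptepage' is a
 * pagetable page whose PG_PTE bit was set when it was allocated.
 */
static void teardown_example(struct mmu_gather *tlb, struct page *ptepage)
{
	tlb_remove_page(tlb, ptepage);	/* PagePTE -> active_list[zone] */
	tlb_flush_mmu(tlb, 0, 0);	/* flush the TLB, then active -> ready */
	/* when nr_pte_ready >= NR_PTE, tlb_flush_ready() frees a whole
	 * zone batch with one free_pages_bulk() call */
}
```

The smp_local_irq_save() fencing inside these paths pairs with the vmscan smp_call_function() shootdown mentioned in the header comment.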
*/ @@ -253,7 +237,8 @@ ia64_phys_addr_valid (unsigned long addr #define pgd_bad(pgd) (!ia64_phys_addr_valid(pgd_val(pgd))) #define pgd_present(pgd) (pgd_val(pgd) != 0UL) #define pgd_clear(pgdp) (pgd_val(*(pgdp)) = 0UL) -#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & _PFN_MASK)) +#define __pgd_page(pgd) ((unsigned long)__va(pgd_val(pgd) & _PFN_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) /* * The following have defined behavior only work if pte_present() is true. @@ -322,7 +307,13 @@ pgd_offset (struct mm_struct *mm, unsign /* Find an entry in the second-level page table.. */ #define pmd_offset(dir,addr) \ - ((pmd_t *) pgd_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) + ((pmd_t *)__pgd_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) + +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) /* * Find an entry in the third-level page table. This looks more complicated than it diff -prauN linux-2.6.0-test11/include/asm-m68k/motorola_pgalloc.h wli-2.6.0-test11-30/include/asm-m68k/motorola_pgalloc.h --- linux-2.6.0-test11/include/asm-m68k/motorola_pgalloc.h 2003-11-26 12:46:12.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-m68k/motorola_pgalloc.h 2003-12-03 18:20:41.000000000 -0800 @@ -63,19 +63,28 @@ static inline void __pte_free_tlb(struct } -static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) +static inline pmd_t *pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { return get_pointer_table(); } -static inline int pmd_free(pmd_t *pmd) +static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return free_pointer_table(pmd); + pmd_t *pmd = pmd_alloc_one_kernel(mm, addr); + if (pmd) + return virt_to_page(pmd); + else + return NULL; } -static inline int __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +static inline int pmd_free(struct page *pmd) { - return free_pointer_table(pmd); + return free_pointer_table(page_address(pmd)); +} + +static inline int __pmd_free_tlb(struct mmu_gather *tlb, struct page *pmd) +{ + return free_pointer_table(page_address(pmd)); } @@ -100,9 +109,9 @@ static inline void pmd_populate(struct m pmd_set(pmd, page_address(page)); } -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmd) +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, struct page *pmd) { - pgd_set(pgd, pmd); + pgd_set(pgd, page_address(pmd)); } #endif /* _MOTOROLA_PGALLOC_H */ diff -prauN linux-2.6.0-test11/include/asm-m68k/motorola_pgtable.h wli-2.6.0-test11-30/include/asm-m68k/motorola_pgtable.h --- linux-2.6.0-test11/include/asm-m68k/motorola_pgtable.h 2003-11-26 12:44:30.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-m68k/motorola_pgtable.h 2003-12-03 18:20:41.000000000 -0800 @@ -116,6 +116,7 @@ extern inline void pgd_set(pgd_t * pgdp, #define __pte_page(pte) ((unsigned long)__va(pte_val(pte) & PAGE_MASK)) #define __pmd_page(pmd) ((unsigned long)__va(pmd_val(pmd) & _TABLE_MASK)) #define __pgd_page(pgd) ((unsigned long)__va(pgd_val(pgd) & _TABLE_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) #define pte_none(pte) (!pte_val(pte)) @@ -204,6 +205,12 @@ extern inline pmd_t * pmd_offset(pgd_t * return (pmd_t *)__pgd_page(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PMD-1)); } +#define
pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table.. */ extern inline pte_t * pte_offset_kernel(pmd_t * pmdp, unsigned long address) { diff -prauN linux-2.6.0-test11/include/asm-m68k/pgtable.h wli-2.6.0-test11-30/include/asm-m68k/pgtable.h --- linux-2.6.0-test11/include/asm-m68k/pgtable.h 2003-11-26 12:45:28.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-m68k/pgtable.h 2003-12-04 08:30:37.000000000 -0800 @@ -136,8 +136,6 @@ extern inline void update_mmu_cache(stru #endif /* !__ASSEMBLY__ */ -#define kern_addr_valid(addr) (1) - #define io_remap_page_range remap_page_range /* MMU-specific headers */ diff -prauN linux-2.6.0-test11/include/asm-m68k/sun3_pgalloc.h wli-2.6.0-test11-30/include/asm-m68k/sun3_pgalloc.h --- linux-2.6.0-test11/include/asm-m68k/sun3_pgalloc.h 2003-11-26 12:43:40.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-m68k/sun3_pgalloc.h 2003-12-03 18:20:41.000000000 -0800 @@ -18,7 +18,8 @@ extern const char bad_pmd_string[]; -#define pmd_alloc_one(mm,address) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm,address) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm,address) ({ BUG(); ((pmd_t *)2); }) static inline void pte_free_kernel(pte_t * pte) diff -prauN linux-2.6.0-test11/include/asm-m68knommu/pgtable.h wli-2.6.0-test11-30/include/asm-m68knommu/pgtable.h --- linux-2.6.0-test11/include/asm-m68knommu/pgtable.h 2003-11-26 12:44:25.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-m68knommu/pgtable.h 2003-12-04 08:30:37.000000000 -0800 @@ -20,8 +20,12 @@ typedef pte_t *pte_addr_t; #define pgd_none(pgd) (0) #define pgd_bad(pgd) (0) #define pgd_clear(pgdp) -#define kern_addr_valid(addr) (1) -#define pmd_offset(a, b) ((void *)0) +#define pmd_offset(a, b) ((void *)0) +#define pmd_offset_kernel(a, b) pmd_offset(a, b) +#define pmd_offset_map(a, b) pmd_offset(a, b) +#define pmd_offset_map_nested(a, b) pmd_offset(a, b) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) #define PAGE_NONE __pgprot(0) #define PAGE_SHARED __pgprot(0) diff -prauN linux-2.6.0-test11/include/asm-mips/mmzone.h wli-2.6.0-test11-30/include/asm-mips/mmzone.h --- linux-2.6.0-test11/include/asm-mips/mmzone.h 2003-11-26 12:42:55.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-mips/mmzone.h 2003-12-04 08:30:37.000000000 -0800 @@ -75,10 +75,6 @@ extern plat_pg_data_t *plat_node_data[]; (((unsigned long)ADDR_TO_MAPBASE((kaddr)) - PAGE_OFFSET) / \ sizeof(struct page)))) -#define kern_addr_valid(addr) ((KVADDR_TO_NID((unsigned long)addr) > \ - -1) ? 0 : (test_bit(LOCAL_MAP_NR((addr)), \ - NODE_DATA(KVADDR_TO_NID((unsigned long)addr))->valid_addr_bitmap))) - #define pfn_to_page(pfn) (mem_map + (pfn)) #define page_to_pfn(page) \ ((((page)-(page)->zone->zone_mem_map) + (page)->zone->zone_start_pfn) \ diff -prauN linux-2.6.0-test11/include/asm-mips/pgalloc.h wli-2.6.0-test11-30/include/asm-mips/pgalloc.h --- linux-2.6.0-test11/include/asm-mips/pgalloc.h 2003-11-26 12:44:15.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-mips/pgalloc.h 2003-12-03 18:20:41.000000000 -0800 @@ -95,7 +95,8 @@ static inline void pte_free(struct page * allocating and freeing a pmd is trivial: the 1-entry pmd is * inside the pgd, so has no extra memory associated with it. 
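The mips stubs above repeat a block that nearly every lowmem-pagetable architecture in this patch receives verbatim: the three pmd_offset_*() forms collapse to the existing pmd_offset(), and the unmap side compiles away. Generic code can then always write the bracketed form and pay nothing where no kmap happens; the do { } while (0) spelling keeps the no-op demanding a trailing semicolon exactly like the real i386 HIGHPMD versions, so the same source compiles both ways:

```c
/* The shape generic walkers use everywhere (illustrative function): */
static void inspect_pmd(pgd_t *pgd, unsigned long addr)
{
	pmd_t *pmd = pmd_offset_map(pgd, addr);	/* plain pmd_offset() on these archs */
	/* ... read or update *pmd ... */
	pmd_unmap(pmd);				/* expands to do { } while (0) */
}
```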
*/ -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm, addr) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm, addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(x) do { } while (0) #endif diff -prauN linux-2.6.0-test11/include/asm-mips/pgtable-32.h wli-2.6.0-test11-30/include/asm-mips/pgtable-32.h --- linux-2.6.0-test11/include/asm-mips/pgtable-32.h 2003-11-26 12:44:14.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-mips/pgtable-32.h 2003-12-03 18:20:41.000000000 -0800 @@ -180,6 +180,12 @@ static inline pmd_t *pmd_offset(pgd_t *d return (pmd_t *) dir; } +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table.. */ #define __pte_offset(address) \ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) diff -prauN linux-2.6.0-test11/include/asm-mips/pgtable-64.h wli-2.6.0-test11-30/include/asm-mips/pgtable-64.h --- linux-2.6.0-test11/include/asm-mips/pgtable-64.h 2003-11-26 12:42:56.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-mips/pgtable-64.h 2003-12-03 18:20:41.000000000 -0800 @@ -161,10 +161,16 @@ static inline unsigned long pgd_page(pgd /* Find an entry in the second-level page table.. */ static inline pmd_t *pmd_offset(pgd_t * dir, unsigned long address) { - return (pmd_t *) pgd_page(*dir) + + return (pmd_t *)page_address(pgd_page(*dir)) + ((address >> PMD_SHIFT) & (PTRS_PER_PMD - 1)); } +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table.. 
*/ #define __pte_offset(address) \ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) diff -prauN linux-2.6.0-test11/include/asm-mips/pgtable.h wli-2.6.0-test11-30/include/asm-mips/pgtable.h --- linux-2.6.0-test11/include/asm-mips/pgtable.h 2003-11-26 12:43:28.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-mips/pgtable.h 2003-12-04 08:30:37.000000000 -0800 @@ -229,10 +229,6 @@ static inline void update_mmu_cache(stru __update_cache(vma, address, pte); } -#ifndef CONFIG_DISCONTIGMEM -#define kern_addr_valid(addr) (1) -#endif - #include /* diff -prauN linux-2.6.0-test11/include/asm-parisc/cacheflush.h wli-2.6.0-test11-30/include/asm-parisc/cacheflush.h --- linux-2.6.0-test11/include/asm-parisc/cacheflush.h 2003-11-26 12:45:45.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-parisc/cacheflush.h 2003-12-04 06:13:40.000000000 -0800 @@ -69,7 +69,7 @@ extern void __flush_dcache_page(struct p static inline void flush_dcache_page(struct page *page) { - if (page->mapping && list_empty(&page->mapping->i_mmap) && + if (page_mapping(page) && list_empty(&page_mapping(page)->i_mmap) && list_empty(&page->mapping->i_mmap_shared)) { set_bit(PG_dcache_dirty, &page->flags); } else { diff -prauN linux-2.6.0-test11/include/asm-parisc/pgalloc.h wli-2.6.0-test11-30/include/asm-parisc/pgalloc.h --- linux-2.6.0-test11/include/asm-parisc/pgalloc.h 2003-11-26 12:45:45.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-parisc/pgalloc.h 2003-12-03 18:20:41.000000000 -0800 @@ -27,12 +27,12 @@ static inline void pgd_free(pgd_t *pgd) /* Three Level Page Table Support for pmd's */ -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmd) +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, struct page *pmd) { - pgd_val(*pgd) = _PAGE_TABLE + __pa((unsigned long)pmd); + pgd_val(*pgd) = _PAGE_TABLE + __pa(page_address(pmd)); } -static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) +static inline pmd_t *pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) { pmd_t *pmd = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); if (pmd) @@ -40,9 +40,18 @@ static inline pmd_t *pmd_alloc_one(struc return pmd; } -static inline void pmd_free(pmd_t *pmd) +static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - free_page((unsigned long)pmd); + pmd_t *pmd = pmd_alloc_one_kernel(mm, addr); + if (pmd) + return virt_to_page(pmd); + else + return NULL; +} + +static inline void pmd_free(struct page *pmd) +{ + __free_page(pmd); } #else @@ -54,7 +63,8 @@ static inline void pmd_free(pmd_t *pmd) * inside the pgd, so has no extra memory associated with it. */ -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm, addr) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm, addr) pmd_alloc_one(mm, addr) #define pmd_free(x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() diff -prauN linux-2.6.0-test11/include/asm-parisc/pgtable.h wli-2.6.0-test11-30/include/asm-parisc/pgtable.h --- linux-2.6.0-test11/include/asm-parisc/pgtable.h 2003-11-26 12:45:29.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-parisc/pgtable.h 2003-12-04 08:30:37.000000000 -0800 @@ -14,21 +14,6 @@ #include #include -/* - * kern_addr_valid(ADDR) tests if ADDR is pointing to valid kernel - * memory. For the return value to be meaningful, ADDR must be >= - * PAGE_OFFSET. 
This operation can be relatively expensive (e.g., - * require a hash-, or multi-level tree-lookup or something of that - * sort) but it guarantees to return TRUE only if accessing the page - * at that address does not cause an error. Note that there may be - * addresses for which kern_addr_valid() returns FALSE even though an - * access would not cause an error (e.g., this is typically true for - * memory mapped I/O regions. - * - * XXX Need to implement this for parisc. - */ -#define kern_addr_valid(addr) (1) - /* Certain architectures need to do special things when PTEs * within a page table are directly modified. Thus, the following * hook is made available. @@ -242,7 +227,8 @@ extern unsigned long *empty_zero_page; #ifdef __LP64__ -#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define __pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) /* For 64 bit we have three level tables */ @@ -339,11 +325,17 @@ extern inline pte_t pte_modify(pte_t pte #ifdef __LP64__ #define pmd_offset(dir,address) \ -((pmd_t *) pgd_page(*(dir)) + (((address)>>PMD_SHIFT) & (PTRS_PER_PMD-1))) +((pmd_t *)__pgd_page(*(dir)) + (((address)>>PMD_SHIFT) & (PTRS_PER_PMD-1))) #else #define pmd_offset(dir,addr) ((pmd_t *) dir) #endif +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table.. */ #define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE-1)) #define pte_offset_kernel(pmd, address) \ diff -prauN linux-2.6.0-test11/include/asm-ppc/pgalloc.h wli-2.6.0-test11-30/include/asm-ppc/pgalloc.h --- linux-2.6.0-test11/include/asm-ppc/pgalloc.h 2003-11-26 12:42:43.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-ppc/pgalloc.h 2003-12-03 18:20:41.000000000 -0800 @@ -14,7 +14,8 @@ extern void pgd_free(pgd_t *pgd); * We don't have any real pmd's, and this code never triggers because * the pgd will always be present.. */ -#define pmd_alloc_one(mm,address) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm,address) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm,addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(x) do { } while (0) #define __pmd_free_tlb(tlb,x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() diff -prauN linux-2.6.0-test11/include/asm-ppc/pgtable.h wli-2.6.0-test11-30/include/asm-ppc/pgtable.h --- linux-2.6.0-test11/include/asm-ppc/pgtable.h 2003-11-26 12:43:25.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-ppc/pgtable.h 2003-12-04 08:30:37.000000000 -0800 @@ -426,8 +426,9 @@ static inline int pgd_bad(pgd_t pgd) { static inline int pgd_present(pgd_t pgd) { return 1; } #define pgd_clear(xp) do { } while (0) -#define pgd_page(pgd) \ +#define __pgd_page(pgd) \ ((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) /* * The following only work if pte_present() is true. @@ -575,6 +576,12 @@ static inline pmd_t * pmd_offset(pgd_t * return (pmd_t *) dir; } +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table.. 
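parisc and ppc above go through the same renaming already applied to alpha, ia64 and m68k: the old pgd_page(), which yielded the pmd table's kernel-virtual address, becomes __pgd_page(), and pgd_page() is redefined to return a struct page, matching pte_page() and pmd_page(). Walkers use the underscored form, while teardown and populate paths traffic in struct page to match the new pmd_free()/pgd_populate() signatures. Both halves in one place (walk() and drop() are illustrative names, following the ppc definitions above):

```c
/* address form for software pagetable walkers ... */
static pmd_t *walk(pgd_t *pgd, unsigned long addr)
{
	return (pmd_t *)__pgd_page(*pgd) + ((addr >> PMD_SHIFT) & (PTRS_PER_PMD - 1));
}

/* ... struct page form for freeing, matching pmd_free(struct page *) */
static void drop(pgd_t *pgd)
{
	pmd_free(pgd_page(*pgd));
}
```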
*/ #define pte_index(address) \ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) @@ -648,9 +655,6 @@ extern unsigned long kernel_map(unsigned extern void kernel_set_cachemode (unsigned long address, unsigned long size, unsigned int cmode); -/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ -#define kern_addr_valid(addr) (1) - #define io_remap_page_range remap_page_range /* diff -prauN linux-2.6.0-test11/include/asm-ppc64/mmzone.h wli-2.6.0-test11-30/include/asm-ppc64/mmzone.h --- linux-2.6.0-test11/include/asm-ppc64/mmzone.h 2003-11-26 12:45:40.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-ppc64/mmzone.h 2003-12-04 08:30:37.000000000 -0800 @@ -72,12 +72,6 @@ static inline int pa_to_nid(unsigned lon #define local_mapnr(kvaddr) \ ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) -#if 0 -/* XXX fix - Anton */ -#define kern_addr_valid(kaddr) test_bit(local_mapnr(kaddr), \ - NODE_DATA(kvaddr_to_nid(kaddr))->valid_addr_bitmap) -#endif - /* Written this way to avoid evaluating arguments twice */ #define discontigmem_pfn_to_page(pfn) \ ({ \ diff -prauN linux-2.6.0-test11/include/asm-ppc64/pgalloc.h wli-2.6.0-test11-30/include/asm-ppc64/pgalloc.h --- linux-2.6.0-test11/include/asm-ppc64/pgalloc.h 2003-11-26 12:42:54.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-ppc64/pgalloc.h 2003-12-03 18:20:41.000000000 -0800 @@ -26,18 +26,27 @@ pgd_free(pgd_t *pgd) kmem_cache_free(zero_cache, pgd); } -#define pgd_populate(MM, PGD, PMD) pgd_set(PGD, PMD) +#define pgd_populate(MM, PGD, PMD) pgd_set(PGD, page_address(PMD)) static inline pmd_t * -pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) { return kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT); } +static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + pmd_t *pmd = pmd_alloc_one_kernel(mm, addr); + if (pmd) + return virt_to_page(pmd); + else + return NULL; +} + static inline void -pmd_free(pmd_t *pmd) +pmd_free(struct page *pmd) { - kmem_cache_free(zero_cache, pmd); + kmem_cache_free(zero_cache, page_address(pmd)); } #define __pmd_free_tlb(tlb, pmd) pmd_free(pmd) diff -prauN linux-2.6.0-test11/include/asm-ppc64/pgtable.h wli-2.6.0-test11-30/include/asm-ppc64/pgtable.h --- linux-2.6.0-test11/include/asm-ppc64/pgtable.h 2003-11-26 12:45:39.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-ppc64/pgtable.h 2003-12-04 08:30:37.000000000 -0800 @@ -212,7 +212,8 @@ int hash_huge_page(struct mm_struct *mm, #define pgd_bad(pgd) ((pgd_val(pgd)) == 0) #define pgd_present(pgd) (pgd_val(pgd) != 0UL) #define pgd_clear(pgdp) (pgd_val(*(pgdp)) = 0UL) -#define pgd_page(pgd) (__bpn_to_ba(pgd_val(pgd))) +#define __pgd_page(pgd) (__bpn_to_ba(pgd_val(pgd))) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) /* * Find an entry in a page-table-directory. We combine the address region @@ -225,12 +226,18 @@ int hash_huge_page(struct mm_struct *mm, /* Find an entry in the second-level page table.. */ #define pmd_offset(dir,addr) \ - ((pmd_t *) pgd_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) + ((pmd_t *)__pgd_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) /* Find an entry in the third-level page table.. 
*/ #define pte_offset_kernel(dir,addr) \ ((pte_t *) pmd_page_kernel(*(dir)) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) #define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) #define pte_unmap(pte) do { } while(0) @@ -384,14 +391,6 @@ extern void update_mmu_cache(struct vm_a #define pgoff_to_pte(off) ((pte_t) {((off) << PTE_SHIFT)|_PAGE_FILE}) #define PTE_FILE_MAX_BITS (BITS_PER_LONG - PTE_SHIFT) -/* - * kern_addr_valid is intended to indicate whether an address is a valid - * kernel address. Most 32-bit archs define it as always true (like this) - * but most 64-bit archs actually perform a test. What should we do here? - * The only use is in fs/ncpfs/dir.c - */ -#define kern_addr_valid(addr) (1) - #define io_remap_page_range remap_page_range void pgtable_cache_init(void); diff -prauN linux-2.6.0-test11/include/asm-s390/pgalloc.h wli-2.6.0-test11-30/include/asm-s390/pgalloc.h --- linux-2.6.0-test11/include/asm-s390/pgalloc.h 2003-11-26 12:44:30.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-s390/pgalloc.h 2003-12-03 18:20:41.000000000 -0800 @@ -61,12 +61,13 @@ static inline void pgd_free(pgd_t *pgd) * We use pmd cache only on s390x, so these are dummy routines. This * code never triggers because the pgd will always be present. */ -#define pmd_alloc_one(mm,address) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm,address) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm,addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(x) do { } while (0) #define __pmd_free_tlb(tlb,x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() #else /* __s390x__ */ -static inline pmd_t * pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr) +static inline pmd_t * pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long vmaddr) { pmd_t *pmd; int i; @@ -79,16 +80,25 @@ static inline pmd_t * pmd_alloc_one(stru return pmd; } -static inline void pmd_free (pmd_t *pmd) +static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - free_pages((unsigned long) pmd, 2); + pmd_t *pmd = pmd_alloc_one_kernel(mm, addr); + if (pmd) + return virt_to_page(pmd); + else + return NULL; +} + +static inline void pmd_free(struct page *pmd) +{ + __free_pages(pmd, 2); } #define __pmd_free_tlb(tlb,pmd) pmd_free(pmd) -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmd) +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, struct page *pmd) { - pgd_val(*pgd) = _PGD_ENTRY | __pa(pmd); + pgd_val(*pgd) = _PGD_ENTRY | __pa(page_address(pmd)); } #endif /* __s390x__ */ diff -prauN linux-2.6.0-test11/include/asm-s390/pgtable.h wli-2.6.0-test11-30/include/asm-s390/pgtable.h --- linux-2.6.0-test11/include/asm-s390/pgtable.h 2003-11-26 12:45:11.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-s390/pgtable.h 2003-12-04 08:30:37.000000000 -0800 @@ -612,6 +612,7 @@ static inline pte_t mk_pte_phys(unsigned /* to find an entry in a page-table-directory */ #define pgd_index(address) ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) #define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address)) +#define pgd_page(pgd) virt_to_page(pgd_page_kernel(pgd)) /* to find an entry in a kernel page-table-directory */ 
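The pmd_offset_kernel()/pmd_offset_map()/pmd_unmap() stubs being added for each architecture above and below all collapse to plain pmd_offset(), but they give generic code a map/unmap discipline under which an architecture may later keep pmd tables in highmem. A minimal sketch of a page-table walk written against the new interface follows; the walker itself is the editor's illustration, assuming only the macros added by this patch, and is not code from the patch:

static pte_t *walk_to_pte(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd = pgd_offset(mm, addr);
	pmd_t *pmd;
	pte_t *pte = NULL;

	if (pgd_none(*pgd))
		return NULL;
	pmd = pmd_offset_map(pgd, addr);	/* may kmap the pmd table */
	if (pmd_present(*pmd))
		pte = pte_offset_map(pmd, addr);
	pmd_unmap(pmd);				/* a no-op on the arches here */
	return pte;				/* caller pte_unmap()s when done */
}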
#define pgd_offset_k(address) pgd_offset(&init_mm, address) @@ -633,6 +634,12 @@ extern inline pmd_t * pmd_offset(pgd_t * #endif /* __s390x__ */ +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table.. */ #define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE-1)) #define pte_offset_kernel(pmd, address) \ @@ -713,8 +720,6 @@ typedef pte_t *pte_addr_t; #endif /* !__ASSEMBLY__ */ -#define kern_addr_valid(addr) (1) - /* * No page table caches to initialise */ diff -prauN linux-2.6.0-test11/include/asm-sh/pgalloc.h wli-2.6.0-test11-30/include/asm-sh/pgalloc.h --- linux-2.6.0-test11/include/asm-sh/pgalloc.h 2003-11-26 12:44:10.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-sh/pgalloc.h 2003-12-04 06:13:40.000000000 -0800 @@ -94,7 +94,8 @@ static inline void pte_free(struct page * inside the pgd, so has no extra memory associated with it. */ -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one_kernel(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm, addr) ({ BUG(); ((struct page *)2); }) #define pmd_free(x) do { } while (0) #define __pmd_free_tlb(tlb,x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() @@ -115,8 +116,8 @@ static inline pte_t ptep_get_and_clear(p unsigned long pfn = pte_pfn(pte); if (pfn_valid(pfn)) { page = pfn_to_page(pfn); - if (!page->mapping - || list_empty(&page->mapping->i_mmap_shared)) + if (!page_mapping(page) + || list_empty(&page_mapping(page)->i_mmap_shared)) __clear_bit(PG_mapped, &page->flags); } } diff -prauN linux-2.6.0-test11/include/asm-sh/pgtable-2level.h wli-2.6.0-test11-30/include/asm-sh/pgtable-2level.h --- linux-2.6.0-test11/include/asm-sh/pgtable-2level.h 2003-11-26 12:45:46.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-sh/pgtable-2level.h 2003-12-03 18:20:41.000000000 -0800 @@ -48,14 +48,21 @@ static inline void pgd_clear (pgd_t * pg #define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval) #define set_pgd(pgdptr, pgdval) (*(pgdptr) = pgdval) -#define pgd_page(pgd) \ +#define __pgd_page(pgd) \ ((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) { return (pmd_t *) dir; } +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + #define pte_pfn(x) ((unsigned long)(((x).pte >> PAGE_SHIFT))) #define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define pfn_pmd(pfn, prot) __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) diff -prauN linux-2.6.0-test11/include/asm-sh/pgtable.h wli-2.6.0-test11-30/include/asm-sh/pgtable.h --- linux-2.6.0-test11/include/asm-sh/pgtable.h 2003-11-26 12:46:12.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-sh/pgtable.h 2003-12-04 08:30:37.000000000 -0800 @@ -267,8 +267,6 @@ typedef pte_t *pte_addr_t; #endif /* !__ASSEMBLY__ */ -#define kern_addr_valid(addr) (1) - #define io_remap_page_range remap_page_range /* diff -prauN linux-2.6.0-test11/include/asm-sparc/pgalloc.h wli-2.6.0-test11-30/include/asm-sparc/pgalloc.h --- 
linux-2.6.0-test11/include/asm-sparc/pgalloc.h 2003-11-26 12:42:56.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-sparc/pgalloc.h 2003-12-03 18:20:41.000000000 -0800 @@ -38,15 +38,24 @@ BTFIXUPDEF_CALL(void, free_pgd_fast, pgd BTFIXUPDEF_CALL(void, pgd_set, pgd_t *, pmd_t *) #define pgd_set(pgdp,pmdp) BTFIXUP_CALL(pgd_set)(pgdp,pmdp) -#define pgd_populate(MM, PGD, PMD) pgd_set(PGD, PMD) +#define pgd_populate(MM, PGD, PMD) pgd_set(PGD, page_address(PMD)) -BTFIXUPDEF_CALL(pmd_t *, pmd_alloc_one, struct mm_struct *, unsigned long) -#define pmd_alloc_one(mm, address) BTFIXUP_CALL(pmd_alloc_one)(mm, address) +BTFIXUPDEF_CALL(pmd_t *, __pmd_alloc_one, struct mm_struct *, unsigned long) +#define pmd_alloc_one_kernel(mm, address) BTFIXUP_CALL(__pmd_alloc_one)(mm, address) + +static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + pmd_t *pmd = pmd_alloc_one_kernel(mm, addr); + if (pmd) + return virt_to_page(pmd); + else + return NULL; +} BTFIXUPDEF_CALL(void, free_pmd_fast, pmd_t *) #define free_pmd_fast(pmd) BTFIXUP_CALL(free_pmd_fast)(pmd) -#define pmd_free(pmd) free_pmd_fast(pmd) +#define pmd_free(pmd) free_pmd_fast(page_address(pmd)) #define __pmd_free_tlb(tlb, pmd) pmd_free(pmd) BTFIXUPDEF_CALL(void, pmd_populate, pmd_t *, struct page *) diff -prauN linux-2.6.0-test11/include/asm-sparc/pgtable.h wli-2.6.0-test11-30/include/asm-sparc/pgtable.h --- linux-2.6.0-test11/include/asm-sparc/pgtable.h 2003-11-26 12:44:21.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-sparc/pgtable.h 2003-12-04 08:30:37.000000000 -0800 @@ -200,10 +200,11 @@ extern unsigned long empty_zero_page; /* */ BTFIXUPDEF_CALL_CONST(struct page *, pmd_page, pmd_t) -BTFIXUPDEF_CALL_CONST(unsigned long, pgd_page, pgd_t) +BTFIXUPDEF_CALL_CONST(unsigned long, __pgd_page, pgd_t) #define pmd_page(pmd) BTFIXUP_CALL(pmd_page)(pmd) -#define pgd_page(pgd) BTFIXUP_CALL(pgd_page)(pgd) +#define __pgd_page(pgd) BTFIXUP_CALL(__pgd_page)(pgd) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) BTFIXUPDEF_SETHI(none_mask) BTFIXUPDEF_CALL_CONST(int, pte_present, pte_t) @@ -350,6 +351,11 @@ extern __inline__ pte_t pte_modify(pte_t /* Find an entry in the second-level page table.. */ BTFIXUPDEF_CALL(pmd_t *, pmd_offset, pgd_t *, unsigned long) #define pmd_offset(dir,addr) BTFIXUP_CALL(pmd_offset)(dir,addr) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) /* Find an entry in the third-level page table.. 
*/ BTFIXUPDEF_CALL(pte_t *, pte_offset_kernel, pmd_t *, unsigned long) @@ -481,10 +487,6 @@ __get_iospace (unsigned long addr) extern unsigned long *sparc_valid_addr_bitmap; -/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ -#define kern_addr_valid(addr) \ - (test_bit(__pa((unsigned long)(addr))>>20, sparc_valid_addr_bitmap)) - extern int io_remap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long to, unsigned long size, pgprot_t prot, int space); diff -prauN linux-2.6.0-test11/include/asm-sparc64/pgalloc.h wli-2.6.0-test11-30/include/asm-sparc64/pgalloc.h --- linux-2.6.0-test11/include/asm-sparc64/pgalloc.h 2003-11-26 12:45:40.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-sparc64/pgalloc.h 2003-12-03 18:20:41.000000000 -0800 @@ -133,7 +133,7 @@ static __inline__ void free_pgd_slow(pgd #define DCACHE_COLOR(address) 0 #endif -#define pgd_populate(MM, PGD, PMD) pgd_set(PGD, PMD) +#define pgd_populate(MM, PGD, PMD) pgd_set(PGD, page_address(PMD)) static __inline__ pmd_t *pmd_alloc_one_fast(struct mm_struct *mm, unsigned long address) { @@ -154,7 +154,7 @@ static __inline__ pmd_t *pmd_alloc_one_f return (pmd_t *)ret; } -static __inline__ pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) +static __inline__ pmd_t *pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { pmd_t *pmd; @@ -167,6 +167,15 @@ static __inline__ pmd_t *pmd_alloc_one(s return pmd; } +static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + pmd_t *pmd = pmd_alloc_one_kernel(mm, addr); + if (pmd) + return virt_to_page(pmd); + else + return NULL; +} + static __inline__ void free_pmd_fast(pmd_t *pmd) { unsigned long color = DCACHE_COLOR((unsigned long)pmd); @@ -223,7 +232,7 @@ static __inline__ void free_pte_slow(pte #define pte_free_kernel(pte) free_pte_fast(pte) #define pte_free(pte) free_pte_fast(page_address(pte)) -#define pmd_free(pmd) free_pmd_fast(pmd) +#define pmd_free(pmd) free_pmd_fast(page_address(pmd)) #define pgd_free(pgd) free_pgd_fast(pgd) #define pgd_alloc(mm) get_pgd_fast() diff -prauN linux-2.6.0-test11/include/asm-sparc64/pgtable.h wli-2.6.0-test11-30/include/asm-sparc64/pgtable.h --- linux-2.6.0-test11/include/asm-sparc64/pgtable.h 2003-11-26 12:42:50.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-sparc64/pgtable.h 2003-12-04 08:30:37.000000000 -0800 @@ -236,7 +236,8 @@ static inline pte_t pte_modify(pte_t ori (pgd_val(*(pgdp)) = (__pa((unsigned long) (pmdp)) >> 11UL)) #define __pmd_page(pmd) ((unsigned long) __va((pmd_val(pmd)<<11UL))) #define pmd_page(pmd) virt_to_page((void *)__pmd_page(pmd)) -#define pgd_page(pgd) ((unsigned long) __va((pgd_val(pgd)<<11UL))) +#define __pgd_page(pgd) ((unsigned long) __va((pgd_val(pgd)<<11UL))) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) #define pte_none(pte) (!pte_val(pte)) #define pte_present(pte) (pte_val(pte) & _PAGE_PRESENT) #define pte_clear(pte) (pte_val(*(pte)) = 0UL) @@ -278,8 +279,13 @@ static inline pte_t pte_modify(pte_t ori #define pgd_offset_k(address) pgd_offset(&init_mm, address) /* Find an entry in the second-level page table.. 
*/ -#define pmd_offset(dir, address) ((pmd_t *) pgd_page(*(dir)) + \ +#define pmd_offset(dir, address) ((pmd_t *)__pgd_page(*(dir)) + \ ((address >> PMD_SHIFT) & (REAL_PTRS_PER_PMD-1))) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) /* Find an entry in the third-level page table.. */ #define pte_index(dir, address) ((pte_t *) __pmd_page(*(dir)) + \ @@ -359,10 +365,6 @@ __get_iospace (unsigned long addr) extern unsigned long *sparc64_valid_addr_bitmap; -/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ -#define kern_addr_valid(addr) \ - (test_bit(__pa((unsigned long)(addr))>>22, sparc64_valid_addr_bitmap)) - extern int io_remap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long offset, unsigned long size, pgprot_t prot, int space); diff -prauN linux-2.6.0-test11/include/asm-um/pgalloc.h wli-2.6.0-test11-30/include/asm-um/pgalloc.h --- linux-2.6.0-test11/include/asm-um/pgalloc.h 2003-11-26 12:45:26.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-um/pgalloc.h 2003-12-03 18:20:41.000000000 -0800 @@ -42,7 +42,8 @@ static inline void pte_free(struct page * inside the pgd, so has no extra memory associated with it. */ -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm, addr) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm, addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(x) do { } while (0) #define __pmd_free_tlb(tlb,x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() diff -prauN linux-2.6.0-test11/include/asm-um/pgtable.h wli-2.6.0-test11-30/include/asm-um/pgtable.h --- linux-2.6.0-test11/include/asm-um/pgtable.h 2003-11-26 12:46:12.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-um/pgtable.h 2003-12-04 08:30:37.000000000 -0800 @@ -373,6 +373,12 @@ static inline pmd_t * pmd_offset(pgd_t * return (pmd_t *) dir; } +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table..
*/ #define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) #define pte_offset_kernel(dir, address) \ @@ -408,8 +414,6 @@ typedef pte_t *pte_addr_t; ((swp_entry_t) { pte_val(pte_mkuptodate(pte)) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define kern_addr_valid(addr) (1) - #include #endif diff -prauN linux-2.6.0-test11/include/asm-v850/pgtable.h wli-2.6.0-test11-30/include/asm-v850/pgtable.h --- linux-2.6.0-test11/include/asm-v850/pgtable.h 2003-11-26 12:44:51.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-v850/pgtable.h 2003-12-04 08:30:37.000000000 -0800 @@ -13,9 +13,11 @@ typedef pte_t *pte_addr_t; #define pgd_clear(pgdp) ((void)0) #define pmd_offset(a, b) ((void *)0) - -#define kern_addr_valid(addr) (1) - +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) #define __swp_type(x) (0) #define __swp_offset(x) (0) diff -prauN linux-2.6.0-test11/include/asm-x86_64/pgalloc.h wli-2.6.0-test11-30/include/asm-x86_64/pgalloc.h --- linux-2.6.0-test11/include/asm-x86_64/pgalloc.h 2003-11-26 12:44:15.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-x86_64/pgalloc.h 2003-12-03 18:20:41.000000000 -0800 @@ -10,7 +10,7 @@ #define pmd_populate_kernel(mm, pmd, pte) \ set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte))) #define pgd_populate(mm, pgd, pmd) \ - set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pmd))) + set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(page_address(pmd)))) static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) { @@ -22,18 +22,25 @@ extern __inline__ pmd_t *get_pmd(void) return (pmd_t *)get_zeroed_page(GFP_KERNEL); } -extern __inline__ void pmd_free(pmd_t *pmd) +extern __inline__ void pmd_free(struct page *pmd) { - if ((unsigned long)pmd & (PAGE_SIZE-1)) - BUG(); - free_page((unsigned long)pmd); + __free_page(pmd); } -static inline pmd_t *pmd_alloc_one (struct mm_struct *mm, unsigned long addr) +static inline pmd_t *pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) { return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); } +static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + pmd_t *pmd = pmd_alloc_one_kernel(mm, addr); + if (pmd) + return virt_to_page(pmd); + else + return NULL; +} + static inline pgd_t *pgd_alloc (struct mm_struct *mm) { return (pgd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); diff -prauN linux-2.6.0-test11/include/asm-x86_64/pgtable.h wli-2.6.0-test11-30/include/asm-x86_64/pgtable.h --- linux-2.6.0-test11/include/asm-x86_64/pgtable.h 2003-11-26 12:44:57.000000000 -0800 +++ wli-2.6.0-test11-30/include/asm-x86_64/pgtable.h 2003-12-04 08:30:37.000000000 -0800 @@ -98,8 +98,9 @@ static inline void set_pml4(pml4_t *dst, pml4_val(*dst) = pml4_val(val); } -#define pgd_page(pgd) \ +#define __pgd_page(pgd) \ ((unsigned long) __va(pgd_val(pgd) & PHYSICAL_PAGE_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) #define ptep_get_and_clear(xp) __pte(xchg(&(xp)->pte, 0)) #define pte_same(a, b) ((a).pte == (b).pte) @@ -331,8 +332,13 @@ static inline pgd_t *current_pgd_offset_ #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) -#define pmd_offset(dir, address) ((pmd_t *) pgd_page(*(dir)) + \ +#define pmd_offset(dir, address) ((pmd_t *)__pgd_page(*(dir)) + \ pmd_index(address)) 
+#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) #define pmd_none(x) (!pmd_val(x)) #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) @@ -391,8 +397,6 @@ typedef pte_t *pte_addr_t; #endif /* !__ASSEMBLY__ */ -extern int kern_addr_valid(unsigned long addr); - #define io_remap_page_range remap_page_range #define HAVE_ARCH_UNMAPPED_AREA diff -prauN linux-2.6.0-test11/include/linux/dcache.h wli-2.6.0-test11-30/include/linux/dcache.h --- linux-2.6.0-test11/include/linux/dcache.h 2003-11-26 12:46:03.000000000 -0800 +++ wli-2.6.0-test11-30/include/linux/dcache.h 2003-12-04 08:30:38.000000000 -0800 @@ -247,9 +247,6 @@ extern void d_move(struct dentry *, stru extern struct dentry * d_lookup(struct dentry *, struct qstr *); extern struct dentry * __d_lookup(struct dentry *, struct qstr *); -/* validate "insecure" dentry pointer */ -extern int d_validate(struct dentry *, struct dentry *); - extern char * d_path(struct dentry *, struct vfsmount *, char *, int); /* Allocation counts.. */ diff -prauN linux-2.6.0-test11/include/linux/fs.h wli-2.6.0-test11-30/include/linux/fs.h --- linux-2.6.0-test11/include/linux/fs.h 2003-11-26 12:43:26.000000000 -0800 +++ wli-2.6.0-test11-30/include/linux/fs.h 2003-12-04 08:19:38.000000000 -0800 @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include struct iovec; @@ -315,11 +317,29 @@ struct address_space_operations { loff_t offset, unsigned long nr_segs); }; +#if NR_CPUS > 8 +typedef rwlock_t mapping_rwlock_t; +#define mapping_rdlock(lock) read_lock(lock) +#define mapping_rdunlock(lock) read_unlock(lock) +#define mapping_wrlock(lock) write_lock(lock) +#define mapping_wrunlock(lock) write_unlock(lock) +#define mapping_rwlock_init(lock) rwlock_init(lock) +#define MAPPING_RW_LOCK_UNLOCKED RW_LOCK_UNLOCKED +#else +typedef spinlock_t mapping_rwlock_t; +#define mapping_rdlock(lock) spin_lock(lock) +#define mapping_rdunlock(lock) spin_unlock(lock) +#define mapping_wrlock(lock) spin_lock(lock) +#define mapping_wrunlock(lock) spin_unlock(lock) +#define mapping_rwlock_init(lock) spin_lock_init(lock) +#define MAPPING_RW_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED +#endif + struct backing_dev_info; struct address_space { struct inode *host; /* owner: inode, block_device */ struct radix_tree_root page_tree; /* radix tree of all pages */ - spinlock_t page_lock; /* and spinlock protecting it */ + mapping_rwlock_t page_lock; /* and spinlock protecting it */ struct list_head clean_pages; /* list of clean pages */ struct list_head dirty_pages; /* list of dirty pages */ struct list_head locked_pages; /* list of locked pages */ @@ -328,7 +348,7 @@ struct address_space { struct address_space_operations *a_ops; /* methods */ struct list_head i_mmap; /* list of private mappings */ struct list_head i_mmap_shared; /* list of shared mappings */ - struct semaphore i_shared_sem; /* protect both above lists */ + spinlock_t i_shared_lock; /* protect both above lists */ atomic_t truncate_count; /* Cover race condition with truncate */ unsigned long dirtied_when; /* jiffies of first page dirtying */ unsigned long flags; /* error bits/gfp mask */ @@ -369,6 +389,7 @@ struct block_device { struct inode { struct hlist_node i_hash; struct list_head i_list; + struct list_head i_sb_list; struct list_head i_dentry; 
unsigned long i_ino; atomic_t i_count; @@ -687,6 +708,7 @@ struct super_block { atomic_t s_active; void *s_security; + struct list_head s_inodes; /* all inodes */ struct list_head s_dirty; /* dirty inodes */ struct list_head s_io; /* parked for writeback */ struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ diff -prauN linux-2.6.0-test11/include/linux/gfp.h wli-2.6.0-test11-30/include/linux/gfp.h --- linux-2.6.0-test11/include/linux/gfp.h 2003-11-26 12:43:26.000000000 -0800 +++ wli-2.6.0-test11-30/include/linux/gfp.h 2003-12-03 18:30:38.000000000 -0800 @@ -79,6 +79,7 @@ static inline struct page * alloc_pages_ extern unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order)); extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask)); +void free_pages_bulk(struct zone *zone, struct page *page, unsigned int order); #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask),0) diff -prauN linux-2.6.0-test11/include/linux/hugetlb.h wli-2.6.0-test11-30/include/linux/hugetlb.h --- linux-2.6.0-test11/include/linux/hugetlb.h 2003-11-26 12:44:08.000000000 -0800 +++ wli-2.6.0-test11-30/include/linux/hugetlb.h 2003-12-03 19:11:55.000000000 -0800 @@ -41,6 +41,11 @@ mark_mm_hugetlb(struct mm_struct *mm, st #define is_hugepage_only_range(addr, len) 0 #endif +#define vm_account_huge_inc(vma, pte, addr) \ + vm_account(vma, pte, addr, HPAGE_SIZE/PAGE_SIZE) +#define vm_account_huge_dec(vma, pte, addr) \ + vm_account(vma, pte, addr, -(HPAGE_SIZE/PAGE_SIZE)) + #else /* !CONFIG_HUGETLB_PAGE */ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) diff -prauN linux-2.6.0-test11/include/linux/init.h wli-2.6.0-test11-30/include/linux/init.h --- linux-2.6.0-test11/include/linux/init.h 2003-11-26 12:42:55.000000000 -0800 +++ wli-2.6.0-test11-30/include/linux/init.h 2003-12-04 08:35:59.000000000 -0800 @@ -46,6 +46,8 @@ #define __exitdata __attribute__ ((__section__(".exit.data"))) #define __exit_call __attribute_used__ __attribute__ ((__section__ (".exitcall.exit"))) +#define __sched __attribute__((__section__(".sched.text"))) + #ifdef MODULE #define __exit __attribute__ ((__section__(".exit.text"))) #else diff -prauN linux-2.6.0-test11/include/linux/init_task.h wli-2.6.0-test11-30/include/linux/init_task.h --- linux-2.6.0-test11/include/linux/init_task.h 2003-11-26 12:42:58.000000000 -0800 +++ wli-2.6.0-test11-30/include/linux/init_task.h 2003-12-03 19:01:47.000000000 -0800 @@ -56,6 +56,29 @@ .siglock = SPIN_LOCK_UNLOCKED, \ } +#define INIT_PID(tsk, type) \ +{ \ + .nr = 0, \ + .count = ATOMIC_INIT(1), \ + .task = &(tsk), \ + .task_list = { \ + .rb_node = NULL, \ + }, \ + .hash_chain = LIST_HEAD_INIT((tsk).pids[type].pid.hash_chain), \ +} + +#define INIT_PID_LINK(task, type) \ + { \ + .pid_chain = { \ + .rb_parent = NULL, \ + .rb_left = NULL, \ + .rb_right = NULL, \ + .rb_color = RB_BLACK, \ + }, \ + .pidptr = NULL, \ + .pid = INIT_PID(task, type), \ + } + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. 
Base=0, limit=0x1fffff (=2MB) @@ -75,7 +98,12 @@ .active_mm = &init_mm, \ .run_list = LIST_HEAD_INIT(tsk.run_list), \ .time_slice = HZ, \ - .tasks = LIST_HEAD_INIT(tsk.tasks), \ + .tasks = { \ + .rb_parent = NULL, \ + .rb_left = NULL, \ + .rb_right = NULL, \ + .rb_color = RB_BLACK, \ + }, \ .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ .real_parent = &tsk, \ @@ -108,6 +136,12 @@ .proc_lock = SPIN_LOCK_UNLOCKED, \ .switch_lock = SPIN_LOCK_UNLOCKED, \ .journal_info = NULL, \ + .pids = { \ + INIT_PID_LINK(tsk, 0), \ + INIT_PID_LINK(tsk, 1), \ + INIT_PID_LINK(tsk, 2), \ + INIT_PID_LINK(tsk, 3), \ + }, \ } diff -prauN linux-2.6.0-test11/include/linux/mm.h wli-2.6.0-test11-30/include/linux/mm.h --- linux-2.6.0-test11/include/linux/mm.h 2003-11-26 12:42:55.000000000 -0800 +++ wli-2.6.0-test11-30/include/linux/mm.h 2003-12-04 08:57:13.000000000 -0800 @@ -12,6 +12,7 @@ #include #include #include +#include #ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; @@ -77,6 +78,7 @@ struct vm_area_struct { units, *not* PAGE_CACHE_SIZE */ struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ + struct rcu_head rcu; }; /* @@ -111,6 +113,7 @@ struct vm_area_struct { #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ +#define VM_DEAD 0x01000000 /* vma is dead, don't touch */ #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS @@ -143,12 +146,10 @@ extern pgprot_t protection_map[16]; struct vm_operations_struct { void (*open)(struct vm_area_struct * area); void (*close)(struct vm_area_struct * area); - struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int unused); + struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int *type); int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock); }; -/* forward declaration; pte_chain is meant to be internal to rmap.c */ -struct pte_chain; struct mmu_gather; struct inode; @@ -172,15 +173,12 @@ struct page { updated asynchronously */ atomic_t count; /* Usage count, see below. */ struct list_head list; /* ->mapping has some page lists. */ - struct address_space *mapping; /* The inode (or ...) we belong to. */ + unsigned long __mapping; /* The inode (or ...) we belong to. */ unsigned long index; /* Our offset within mapping. */ struct list_head lru; /* Pageout list, eg. active_list; protected by zone->lru_lock !! */ - union { - struct pte_chain *chain;/* Reverse pte mapping pointer. - * protected by PG_chainlock */ - pte_addr_t direct; - } pte; + atomic_t mapcount; + struct rmap_chain *chain; unsigned long private; /* mapping-private opaque data */ /* @@ -375,13 +373,41 @@ void page_address_init(void); #endif /* + * On an anonymous page mapped into a user virtual memory area, + * page->__mapping points to the mm's struct anon, not to a struct + * address_space. + * + * Please note that, confusingly, page_mapping() refers to the inode + * struct address_space which maps the page from disk, whereas page_mapped() + * refers to whether it's mapped into a user virtual address space.
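+ * + * For example (an editor's illustration, not part of the patch): an + * anonymous page mapped by one process has page_mapping(page) == NULL + * but page_mapped(page) != 0, while a clean pagecache page that nobody + * has mapped has a non-NULL page_mapping() and a zero ->mapcount.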
+ */ +static inline struct address_space *page_mapping(struct page *page) +{ + if (PageAnon(page)) + return NULL; + else + return (struct address_space *)page->__mapping; +} + +struct anon; +static inline struct anon *page_anon(struct page *page) +{ + BUG_ON(!PageAnon(page)); + return (struct anon *)page->__mapping; +} + +static inline void set_page_mapping(struct page *page, void *ptr) +{ + page->__mapping = (unsigned long)ptr; +} + -/* - * Return true if this page is mapped into pagetables. Subtle: test pte.direct - * rather than pte.chain. Because sometimes pte.direct is 64-bit, and .chain - * is only 32-bit. - */ +/* + * Return true if this page is mapped into pagetables, i.e. whether its + * ->mapcount is nonzero. + */ static inline int page_mapped(struct page *page) { - return page->pte.direct != 0; + return atomic_read(&page->mapcount) != 0; } /* @@ -405,7 +431,7 @@ static inline int page_mapped(struct pag extern void show_free_areas(void); struct page *shmem_nopage(struct vm_area_struct * vma, - unsigned long address, int unused); + unsigned long address, int *type); struct file *shmem_file_setup(char * name, loff_t size, unsigned long flags); void shmem_lock(struct file * file, int lock); int shmem_zero_setup(struct vm_area_struct *); @@ -428,8 +454,9 @@ extern void invalidate_mmap_range(struct loff_t const holelen); extern int vmtruncate(struct inode * inode, loff_t offset); extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); +pmd_t *FASTCALL(__pmd_alloc_kernel(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); -extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); +pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pgd_t *pgd, pmd_t **pmd, unsigned long address)); extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot); extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot); extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); @@ -437,16 +464,19 @@ extern int make_pages_present(unsigned l extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); extern long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long nonblock); extern long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice); -void put_dirty_page(struct task_struct *tsk, struct page *page, - unsigned long address, pgprot_t prot); +void put_dirty_page(task_t *task, struct vm_area_struct *vma, + struct page *page, unsigned long address, pgprot_t prot); int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); -int __set_page_dirty_buffers(struct page *page); -int __set_page_dirty_nobuffers(struct page *page); +int set_page_dirty(struct page *page); +int set_page_dirty_buffers(struct page *page); +int set_page_dirty_nobuffers(struct page *page); int set_page_dirty_lock(struct page *page); +void free_vma(struct vm_area_struct *); + /* * Prototype to add a shrinker callback for ageable caches.
* @@ -471,33 +501,15 @@ extern struct shrinker *set_shrinker(int extern void remove_shrinker(struct shrinker *shrinker); /* - * If the mapping doesn't provide a set_page_dirty a_op, then - * just fall through and assume that it wants buffer_heads. - * FIXME: make the method unconditional. - */ -static inline int set_page_dirty(struct page *page) -{ - if (page->mapping) { - int (*spd)(struct page *); - - spd = page->mapping->a_ops->set_page_dirty; - if (spd) - return (*spd)(page); - } - return __set_page_dirty_buffers(page); -} - -/* * On a two-level page table, this ends up being trivial. Thus the * inlining and the symmetry break with pte_alloc_map() that does all * of this out-of-line. */ -static inline pmd_t *pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) -{ - if (pgd_none(*pgd)) - return __pmd_alloc(mm, pgd, address); - return pmd_offset(pgd, address); -} +#define pmd_alloc_map(mm, pgd, addr) \ + (pgd_none(*(pgd))? __pmd_alloc(mm,pgd,addr): pmd_offset_map(pgd,addr)) + +#define pmd_alloc_kernel(mm, pgd, addr) \ + (pgd_none(*(pgd))? __pmd_alloc_kernel(mm,pgd,addr): pmd_offset_kernel(pgd,addr)) extern void free_area_init(unsigned long * zones_size); extern void free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, @@ -563,7 +575,7 @@ extern unsigned long page_unuse(struct p extern void truncate_inode_pages(struct address_space *, loff_t); /* generic vm_area_ops exported for stackable file systems */ -extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int); +struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *); /* mm/page-writeback.c */ int write_one_page(struct page *page, int wait); @@ -621,6 +633,75 @@ kernel_map_pages(struct page *page, int { } #endif + +static inline void vm_account(struct vm_area_struct *vma, pte_t pte, + unsigned long addr, long adjustment) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long pfn; + struct page *page; + + if (!pte_present(pte)) + return; + + pfn = pte_pfn(pte); + if (!pfn_valid(pfn)) + goto out; + + page = pfn_to_page(pfn); + if (PageReserved(page)) + goto out; + + if (vma->vm_flags & VM_EXECUTABLE) + mm->text += adjustment; + else if (vma->vm_flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN))) { + mm->data += adjustment; + mm->stack += adjustment; + } else if (addr >= TASK_UNMAPPED_BASE) + mm->lib += adjustment; + else + mm->data += adjustment; + + if (page_mapping(page)) + mm->shared += adjustment; + +out: + if (pte_write(pte)) + mm->dirty += adjustment; +} + +#define vm_account_inc(vma, pte, addr) vm_account(vma, pte, addr, +1) +#define vm_account_dec(vma, pte, addr) vm_account(vma, pte, addr, -1) + +static inline void vm_ptep_set_wrprotect(struct mm_struct *mm, pte_t *pte) +{ + if (pte_write(*pte)) + mm->dirty--; + ptep_set_wrprotect(pte); +} + +static inline void vm_set_pte(struct vm_area_struct *vma, pte_t *dst, + pte_t val, unsigned long addr) +{ + vm_account_inc(vma, val, addr); + set_pte(dst, val); +} + +static inline pte_t vm_ptep_get_and_clear(struct vm_area_struct *vma, + pte_t *pte, unsigned long addr) +{ + pte_t val = ptep_get_and_clear(pte); + vm_account_dec(vma, val, addr); + return val; +} + +static inline void vm_pte_clear(struct vm_area_struct *vma, pte_t *pte, + unsigned long addr) +{ + pte_t val = *pte; + pte_clear(pte); + vm_account_dec(vma, val, addr); +} #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff -prauN linux-2.6.0-test11/include/linux/mmzone.h wli-2.6.0-test11-30/include/linux/mmzone.h --- linux-2.6.0-test11/include/linux/mmzone.h 
2003-11-26 12:44:20.000000000 -0800 +++ wli-2.6.0-test11-30/include/linux/mmzone.h 2003-12-04 08:30:38.000000000 -0800 @@ -21,8 +21,8 @@ #endif struct free_area { - struct list_head free_list; - unsigned long *map; + struct list_head free_list, deferred_pages; + unsigned long *map, globally_free, active, locally_free; }; struct pglist_data; @@ -195,7 +195,6 @@ typedef struct pglist_data { struct zonelist node_zonelists[MAX_NR_ZONES]; int nr_zones; struct page *node_mem_map; - unsigned long *valid_addr_bitmap; struct bootmem_data *bdata; unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ diff -prauN linux-2.6.0-test11/include/linux/page-flags.h wli-2.6.0-test11-30/include/linux/page-flags.h --- linux-2.6.0-test11/include/linux/page-flags.h 2003-11-26 12:44:52.000000000 -0800 +++ wli-2.6.0-test11-30/include/linux/page-flags.h 2003-12-04 07:55:42.000000000 -0800 @@ -69,12 +69,13 @@ #define PG_private 12 /* Has something at ->private */ #define PG_writeback 13 /* Page is under writeback */ #define PG_nosave 14 /* Used for system suspend/resume */ -#define PG_chainlock 15 /* lock bit for ->pte_chain */ +#define PG_rmaplock 15 /* lock bit for ->pte_chain */ -#define PG_direct 16 /* ->pte_chain points directly at pte */ -#define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ -#define PG_reclaim 18 /* To be reclaimed asap */ -#define PG_compound 19 /* Part of a compound page */ +#define PG_mappedtodisk 16 /* Has blocks allocated on-disk */ +#define PG_reclaim 17 /* To be reclaimed asap */ +#define PG_compound 18 /* Part of a compound page */ +#define PG_anon 19 /* Anonymous page */ +#define PG_swapcache 20 /* Swap page; swp_entry_t in ->private */ /* @@ -87,6 +88,7 @@ struct page_state { unsigned long nr_unstable; /* NFS unstable pages */ unsigned long nr_page_table_pages;/* Pages used for pagetables */ unsigned long nr_mapped; /* mapped into pagetables */ + unsigned long nr_swapcache; /* in swapcache */ unsigned long nr_slab; /* In slab */ #define GET_PAGE_STATE_LAST nr_slab @@ -116,6 +118,9 @@ struct page_state { unsigned long pageoutrun; /* kswapd's calls to page reclaim */ unsigned long allocstall; /* direct reclaim calls */ unsigned long pgrotated; /* pages rotated to tail of the LRU */ + unsigned long swapaddfail; /* swap addition failures */ + unsigned long unmapfail; /* unmapping failures */ + unsigned long unmapagain; /* unmapping needs retry */ } ____cacheline_aligned; DECLARE_PER_CPU(struct page_state, page_states); @@ -250,12 +255,6 @@ extern void get_full_page_state(struct p #define ClearPageNosave(page) clear_bit(PG_nosave, &(page)->flags) #define TestClearPageNosave(page) test_and_clear_bit(PG_nosave, &(page)->flags) -#define PageDirect(page) test_bit(PG_direct, &(page)->flags) -#define SetPageDirect(page) set_bit(PG_direct, &(page)->flags) -#define TestSetPageDirect(page) test_and_set_bit(PG_direct, &(page)->flags) -#define ClearPageDirect(page) clear_bit(PG_direct, &(page)->flags) -#define TestClearPageDirect(page) test_and_clear_bit(PG_direct, &(page)->flags) - #define PageMappedToDisk(page) test_bit(PG_mappedtodisk, &(page)->flags) #define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags) #define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags) @@ -269,15 +268,16 @@ extern void get_full_page_state(struct p #define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) #define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) -/* - * The PageSwapCache predicate doesn't 
use a PG_flag at this time, - * but it may again do so one day. - */ +#define PageAnon(page) test_bit(PG_anon, &(page)->flags) +#define SetPageAnon(page) set_bit(PG_anon, &(page)->flags) +#define ClearPageAnon(page) clear_bit(PG_anon, &(page)->flags) + #ifdef CONFIG_SWAP -extern struct address_space swapper_space; -#define PageSwapCache(page) ((page)->mapping == &swapper_space) +#define PageSwapCache(page) test_bit(PG_swapcache, &(page)->flags) +#define SetPageSwapCache(page) set_bit(PG_swapcache, &(page)->flags) +#define ClearPageSwapCache(page) clear_bit(PG_swapcache, &(page)->flags) #else -#define PageSwapCache(page) 0 +#define PageSwapCache(page) 0 #endif struct page; /* forward declaration */ diff -prauN linux-2.6.0-test11/include/linux/pagemap.h wli-2.6.0-test11-30/include/linux/pagemap.h --- linux-2.6.0-test11/include/linux/pagemap.h 2003-11-26 12:42:49.000000000 -0800 +++ wli-2.6.0-test11-30/include/linux/pagemap.h 2003-12-04 06:13:40.000000000 -0800 @@ -139,17 +139,6 @@ static inline unsigned long get_page_cac return atomic_read(&nr_pagecache); } -static inline void ___add_to_page_cache(struct page *page, - struct address_space *mapping, unsigned long index) -{ - list_add(&page->list, &mapping->clean_pages); - page->mapping = mapping; - page->index = index; - - mapping->nrpages++; - pagecache_acct(1); -} - extern void FASTCALL(__lock_page(struct page *page)); extern void FASTCALL(unlock_page(struct page *page)); diff -prauN linux-2.6.0-test11/include/linux/pagevec.h wli-2.6.0-test11-30/include/linux/pagevec.h --- linux-2.6.0-test11/include/linux/pagevec.h 2003-11-26 12:45:53.000000000 -0800 +++ wli-2.6.0-test11-30/include/linux/pagevec.h 2003-12-04 08:09:12.000000000 -0800 @@ -4,8 +4,15 @@ * In many places it is efficient to batch an operation up against multiple * pages. A pagevec is a multipage container which is used for that. */ +#include -#define PAGEVEC_SIZE 16 +#define __MIN_PVEC_SIZE 16 +#define __MAX_PVEC_SIZE 1024 +#define __PVEC_MIN(x,y) ((x) < (y) ? (x) : (y)) +#define __PVEC_MAX(x,y) ((x) > (y) ? (x) : (y)) +#define __PVEC_SIZE (4*NR_CPUS) +#define __PAGEVEC_SIZE __PVEC_MIN(__PVEC_SIZE, __MAX_PVEC_SIZE) +#define PAGEVEC_SIZE __PVEC_MAX(__PAGEVEC_SIZE, __MIN_PVEC_SIZE) struct page; struct address_space; diff -prauN linux-2.6.0-test11/include/linux/pid.h wli-2.6.0-test11-30/include/linux/pid.h --- linux-2.6.0-test11/include/linux/pid.h 2003-11-26 12:43:25.000000000 -0800 +++ wli-2.6.0-test11-30/include/linux/pid.h 2003-12-03 18:50:57.000000000 -0800 @@ -14,51 +14,76 @@ struct pid { int nr; atomic_t count; - struct task_struct *task; - struct list_head task_list; + task_t *task; + struct rb_root task_list; struct list_head hash_chain; }; struct pid_link { - struct list_head pid_chain; + struct rb_node pid_chain; struct pid *pidptr; struct pid pid; }; #define pid_task(elem, type) \ - list_entry(elem, struct task_struct, pids[type].pid_chain) + rb_entry(elem, task_t, pids[type].pid_chain) /* * attach_pid() and link_pid() must be called with the tasklist_lock * write-held. */ -extern int FASTCALL(attach_pid(struct task_struct *task, enum pid_type type, int nr)); +int FASTCALL(attach_pid(task_t *task, enum pid_type type, int nr)); -extern void FASTCALL(link_pid(struct task_struct *task, struct pid_link *link, struct pid *pid)); +void FASTCALL(link_pid(task_t *task, struct pid *pid, enum pid_type type)); /* * detach_pid() must be called with the tasklist_lock write-held. 
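* (Editor's note: with the rbtree-based pid links below, write-holding the tasklist_lock is also what keeps the rb_erase() implied here safe against the for_each_task_pid() walkers, which run with the lock read-held.)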
*/ -extern void FASTCALL(detach_pid(struct task_struct *task, enum pid_type)); +void FASTCALL(detach_pid(task_t *task, enum pid_type)); /* * look up a PID in the hash table. Must be called with the tasklist_lock * held. */ -extern struct pid *FASTCALL(find_pid(enum pid_type, int)); +struct pid *FASTCALL(find_pid(enum pid_type, int)); +int find_next_tgid(int); -extern int alloc_pidmap(void); -extern void FASTCALL(free_pidmap(int)); -extern void switch_exec_pids(struct task_struct *leader, struct task_struct *thread); +#define PROC_MAXPIDS 32 +int find_tgids_after(int tgid, int tgids[PROC_MAXPIDS]); +int find_tids_after(int tgid, int tid, int tids[PROC_MAXPIDS]); + +int alloc_pidmap(void); +void FASTCALL(free_pidmap(int)); +void switch_exec_pids(task_t *leader, task_t *thread); +void insert_task_list(task_t *); +void remove_task_list(task_t *); +task_t *first_task(void); + +#define __first_task_pid(pid, type) \ + pid_task(rb_first(&(pid)->task_list), type) +#define first_task_pid(task, type) \ + __first_task_pid((task)->pids[type].pidptr, type) +#define next_task_pid(task, type) \ +({ \ + struct rb_node *__node = rb_next(&(task)->pids[type].pid_chain);\ + __node ? pid_task(__node, type) : NULL; \ +}) +#define first_thread(task) first_task_pid(task, PIDTYPE_TGID) +#define next_thread(task) next_task_pid(task, PIDTYPE_TGID) +#define another_thread(task) \ +({ \ + task_t *__other = next_thread(task); \ + __other ? __other : first_task_pid(task, PIDTYPE_TGID); \ +}) + +#define __for_each_task_pid(type, task, pid) \ + for (task = __first_task_pid(pid, type); \ + task; \ + task = next_task_pid(task, type)) -#define for_each_task_pid(who, type, task, elem, pid) \ +#define for_each_task_pid(who, type, task, pid) \ if ((pid = find_pid(type, who))) \ - for (elem = pid->task_list.next, \ - prefetch(elem->next), \ - task = pid_task(elem, type); \ - elem != &pid->task_list; \ - elem = elem->next, prefetch(elem->next), \ - task = pid_task(elem, type)) + __for_each_task_pid(type, task, pid) #endif /* _LINUX_PID_H */ diff -prauN linux-2.6.0-test11/include/linux/rmap-locking.h wli-2.6.0-test11-30/include/linux/rmap-locking.h --- linux-2.6.0-test11/include/linux/rmap-locking.h 2003-11-26 12:42:59.000000000 -0800 +++ wli-2.6.0-test11-30/include/linux/rmap-locking.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,23 +0,0 @@ -/* - * include/linux/rmap-locking.h - * - * Locking primitives for exclusive access to a page's reverse-mapping - * pte chain. - */ - -#include - -struct pte_chain; -extern kmem_cache_t *pte_chain_cache; - -#define pte_chain_lock(page) bit_spin_lock(PG_chainlock, &page->flags) -#define pte_chain_unlock(page) bit_spin_unlock(PG_chainlock, &page->flags) - -struct pte_chain *pte_chain_alloc(int gfp_flags); -void __pte_chain_free(struct pte_chain *pte_chain); - -static inline void pte_chain_free(struct pte_chain *pte_chain) -{ - if (pte_chain) - __pte_chain_free(pte_chain); -} diff -prauN linux-2.6.0-test11/include/linux/rmap.h wli-2.6.0-test11-30/include/linux/rmap.h --- linux-2.6.0-test11/include/linux/rmap.h 1969-12-31 16:00:00.000000000 -0800 +++ wli-2.6.0-test11-30/include/linux/rmap.h 2003-12-04 07:55:53.000000000 -0800 @@ -0,0 +1,163 @@ +/* + * include/linux/rmap.h + * + * Locking primitives for exclusive access to a page's reverse-mapping + * pte chain. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct anon { + atomic_t count; + spinlock_t lock; + struct list_head list; + struct rcu_head rcu; +}; + +#ifdef CONFIG_MMU + +int FASTCALL(rmap_get_cpu(void)); +void FASTCALL(page_turn_rmap(struct page *, struct vm_area_struct *)); +void FASTCALL(page_move_rmap(struct page *, struct vm_area_struct *, unsigned long, unsigned long)); +void FASTCALL(add_rmap_address(struct page *, unsigned long)); +void FASTCALL(clear_page_chained(struct page *page)); + +/* + * Called from mm/vmscan.c to handle pageout + */ +int FASTCALL(page_referenced(struct page *)); +int FASTCALL(try_to_unmap(struct page *)); + +void init_rmap(void); +int exec_rmap(struct mm_struct *); +void dup_rmap(struct mm_struct *, struct mm_struct *); +void exit_rmap(struct mm_struct *); + +/* + * Return values of try_to_unmap(): + */ +#define SWAP_SUCCESS 0 +#define SWAP_AGAIN 1 +#define SWAP_FAIL 2 + +#else /* !CONFIG_MMU */ +#define page_referenced(page) TestClearPageReferenced(page) +#define init_rmap() do { } while (0) +#define exec_rmap(mm) ({ 0; }) +#define dup_rmap(new, old) ({ 0; }) +#define exit_rmap(mm) do { } while (0) +#define try_to_unmap(page) ({ SWAP_FAIL; }) +#endif /* CONFIG_MMU */ + +#define NOADDR (~0UL) + +static inline void rmap_lock(struct page *page) +{ + bit_spin_lock(PG_rmaplock, &page->flags); +} + +static inline void rmap_unlock(struct page *page) +{ + bit_spin_unlock(PG_rmaplock, &page->flags); +} + +#define NRSLOT ((L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(unsigned long)) + +struct rmap_chain { + unsigned long slot[NRSLOT]; /* first contains count, then */ + struct rmap_chain *next; /* user virtual addresses */ +}; + +static inline void page_dup_rmap(struct page *page) +{ + atomic_inc(&page->mapcount); +} + +static inline void clear_page_anon(struct page *page) +{ + set_page_mapping(page, NULL); + ClearPageAnon(page); +} + +/** + * page_remove_rmap - take down reverse mapping to a page + * @page: page to remove mapping from + * + * For general use: Remove the reverse mapping from the page. + * after that the caller can clear the page table entry and free + * the page. Caller needs to hold the mm->page_table_lock. + */ +static inline void page_remove_rmap(struct page *page) +{ + if (!atomic_dec_and_test(&page->mapcount)) + return; + + rmap_lock(page); + dec_page_state(nr_mapped); + if (PageAnon(page)) + clear_page_anon(page); + if (page->chain) + clear_page_chained(page); + rmap_unlock(page); +} + +static inline unsigned long vma_address(struct page *page, struct vm_area_struct *vma) +{ + unsigned long pgoff, address; + pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + if (address < vma->vm_start || address >= vma->vm_end) + return NOADDR; + else + return address; +} + +/** + * page_add_rmap - add reverse mapping entry to a page + * @page: the page to add the mapping to + * @vma: the vma into which this page is being mapped + * @address: the virtual address at which the page is being mapped + * @anon: is this an anonymous (not file-backed) page? + * + * Add a new pte reverse mapping to a page. + * The caller needs to hold the mm->page_table_lock. 
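+ * + * (Editor's note) The first anonymous mapping caches its user virtual + * address in page->index; any later mapping at a different address is + * recorded in the page's rmap_chain via add_rmap_address(), so that the + * unmap path can still find every pte by its address.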
+ */ +static inline void page_add_rmap(struct page *page, struct vm_area_struct *vma, + unsigned long address, int anon) +{ + if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) + return; + + address &= PAGE_MASK; + + rmap_lock(page); + + if (!page_mapped(page)) + inc_page_state(nr_mapped); + + atomic_inc(&page->mapcount); + + if (page->__mapping) { + if (anon) { + BUG_ON(!PageAnon(page)); + if (address != page->index) + add_rmap_address(page, address); + } else { + BUG_ON(PageAnon(page)); + if (address != vma_address(page, vma)) + add_rmap_address(page, address); + } + } else if (anon) { + SetPageAnon(page); + set_page_mapping(page, vma->vm_mm->anon); + page->index = address; + } + rmap_unlock(page); +} diff -prauN linux-2.6.0-test11/include/linux/sched.h wli-2.6.0-test11-30/include/linux/sched.h --- linux-2.6.0-test11/include/linux/sched.h 2003-11-26 12:42:58.000000000 -0800 +++ wli-2.6.0-test11-30/include/linux/sched.h 2003-12-04 08:57:22.000000000 -0800 @@ -27,8 +27,8 @@ #include #include #include -#include #include +#include struct exec_domain; @@ -145,34 +145,37 @@ extern rwlock_t tasklist_lock; extern spinlock_t mmlist_lock; typedef struct task_struct task_t; +#include -extern void sched_init(void); -extern void init_idle(task_t *idle, int cpu); +void sched_init(void); +void init_idle(task_t *idle, int cpu); -extern void show_state(void); -extern void show_regs(struct pt_regs *); +void show_state(void); +void show_regs(struct pt_regs *); /* * TASK is a pointer to the task whose backtrace we want to see (or NULL for current * task), SP is the stack pointer of the first frame that should be shown in the back * trace (or NULL if the entire call-chain of the task should be shown). */ -extern void show_stack(struct task_struct *task, unsigned long *sp); +void show_stack(task_t *task, unsigned long *sp); void io_schedule(void); long io_schedule_timeout(long timeout); -extern void cpu_init (void); -extern void trap_init(void); -extern void update_process_times(int user); -extern void update_one_process(struct task_struct *p, unsigned long user, +void cpu_init (void); +void trap_init(void); +void update_process_times(int user); +void update_one_process(task_t *p, unsigned long user, unsigned long system, int cpu); -extern void scheduler_tick(int user_tick, int system); +void scheduler_tick(int user_tick, int system); extern unsigned long cache_decay_ticks; +extern const unsigned long scheduling_functions_start_here; +extern const unsigned long scheduling_functions_end_here; #define MAX_SCHEDULE_TIMEOUT LONG_MAX -extern signed long FASTCALL(schedule_timeout(signed long timeout)); +signed long FASTCALL(schedule_timeout(signed long timeout)); asmlinkage void schedule(void); struct namespace; @@ -198,14 +201,16 @@ struct mm_struct { * together off init_mm.mmlist, and are protected * by mmlist_lock */ + struct anon *anon; /* set of forks between execs */ + struct list_head anon_list; /* chain of mm's against anon */ unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; unsigned long rss, total_vm, locked_vm; + unsigned long shared, text, lib, data, dirty, stack; unsigned long def_flags; cpumask_t cpu_vm_mask; - unsigned long swap_address; unsigned long saved_auxv[40]; /* for /proc/PID/auxv */ @@ -225,6 +230,7 @@ struct mm_struct { struct kioctx *ioctx_list; struct kioctx default_kioctx; + struct rcu_head rcu; }; extern int mmlist_nr; @@ -352,7 +358,7 @@ struct task_struct { cpumask_t 
cpus_allowed; unsigned int time_slice, first_time_slice; - struct list_head tasks; + struct rb_node tasks; struct list_head ptrace_children; struct list_head ptrace_list; @@ -553,7 +559,7 @@ static inline int kstack_end(void *addr) #endif extern union thread_union init_thread_union; -extern struct task_struct init_task; +extern task_t init_task; extern struct mm_struct init_mm; @@ -719,47 +725,61 @@ extern void wait_task_inactive(task_t * #define REMOVE_LINKS(p) do { \ if (thread_group_leader(p)) \ - list_del_init(&(p)->tasks); \ + remove_task_list(p); \ remove_parent(p); \ } while (0) #define SET_LINKS(p) do { \ if (thread_group_leader(p)) \ - list_add_tail(&(p)->tasks,&init_task.tasks); \ + insert_task_list(p); \ add_parent(p, (p)->parent); \ } while (0) -#define next_task(p) list_entry((p)->tasks.next, struct task_struct, tasks) -#define prev_task(p) list_entry((p)->tasks.prev, struct task_struct, tasks) +static inline task_t *next_task(task_t *task) +{ + struct rb_node *node = rb_next(&task->tasks); + return node ? rb_entry(node, task_t, tasks) : NULL; +} + +static inline task_t *prev_task(task_t *task) +{ + struct rb_node *node = rb_prev(&task->tasks); + return node ? rb_entry(node, task_t, tasks) : NULL; +} #define for_each_process(p) \ - for (p = &init_task ; (p = next_task(p)) != &init_task ; ) + for (p = first_task(); p; p = next_task(p)) /* * Careful: do_each_thread/while_each_thread is a double loop so * 'break' will not work as expected - use goto instead. */ #define do_each_thread(g, t) \ - for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do + for (g = first_task(), t = first_thread(g); \ + g; \ + g = next_task(g), t = g ? first_thread(g) : NULL) do #define while_each_thread(g, t) \ - while ((t = next_thread(t)) != g) + while ((t = next_thread(t))) -extern task_t * FASTCALL(next_thread(task_t *p)); - -#define thread_group_leader(p) (p->pid == p->tgid) - -static inline int thread_group_empty(task_t *p) +static inline int thread_group_leader(task_t *task) { - struct pid *pid = p->pids[PIDTYPE_TGID].pidptr; + return task->pid == task->tgid; +} - return pid->task_list.next->next == &pid->task_list; +/* a singleton's element has no successor */ +static inline int thread_group_empty(task_t *task) +{ + struct pid *pid = task->pids[PIDTYPE_TGID].pidptr; + return !rb_next(rb_first(&pid->task_list)); } -#define delay_group_leader(p) \ - (thread_group_leader(p) && !thread_group_empty(p)) +static inline int delay_group_leader(task_t *task) +{ + return thread_group_leader(task) && !thread_group_empty(task); +} -extern void unhash_process(struct task_struct *p); +void unhash_process(task_t *task); /* Protects ->fs, ->files, ->mm, and synchronises with wait4(). * Nests both inside and outside of read_lock(&tasklist_lock). 
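One consequence of keeping thread groups on rbtrees is that next_thread() no longer wraps around to the group leader: it returns NULL past the last thread, which is why while_each_thread() above loses its comparison against g. A minimal sketch of walking one thread group under the new scheme; the function is the editor's illustration, not part of the patch:

/* count the threads in a group; caller holds read_lock(&tasklist_lock) */
static int count_group_threads(task_t *leader)
{
	task_t *t;
	int n = 0;

	for (t = first_thread(leader); t; t = next_thread(t))
		n++;
	return n;
}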
diff -prauN linux-2.6.0-test11/include/linux/swap.h wli-2.6.0-test11-30/include/linux/swap.h --- linux-2.6.0-test11/include/linux/swap.h 2003-11-26 12:42:52.000000000 -0800 +++ wli-2.6.0-test11-30/include/linux/swap.h 2003-12-04 06:13:40.000000000 -0800 @@ -76,7 +76,6 @@ struct reclaim_state { #ifdef __KERNEL__ struct address_space; -struct pte_chain; struct sysinfo; struct writeback_control; struct zone; @@ -162,6 +161,7 @@ extern unsigned int nr_free_pages(void); extern unsigned int nr_free_pages_pgdat(pg_data_t *pgdat); extern unsigned int nr_free_buffer_pages(void); extern unsigned int nr_free_pagecache_pages(void); +unsigned long nr_deferred_pages(void); /* linux/mm/swap.c */ extern void FASTCALL(lru_cache_add(struct page *)); @@ -177,25 +177,8 @@ extern int try_to_free_pages(struct zone extern int shrink_all_memory(int); extern int vm_swappiness; -/* linux/mm/rmap.c */ -#ifdef CONFIG_MMU -int FASTCALL(page_referenced(struct page *)); -struct pte_chain *FASTCALL(page_add_rmap(struct page *, pte_t *, - struct pte_chain *)); -void FASTCALL(page_remove_rmap(struct page *, pte_t *)); -int FASTCALL(try_to_unmap(struct page *)); - /* linux/mm/shmem.c */ -extern int shmem_unuse(swp_entry_t entry, struct page *page); -#else -#define page_referenced(page) TestClearPageReferenced(page) -#define try_to_unmap(page) SWAP_FAIL -#endif /* CONFIG_MMU */ - -/* return values of try_to_unmap */ -#define SWAP_SUCCESS 0 -#define SWAP_AGAIN 1 -#define SWAP_FAIL 2 +int shmem_unuse(swp_entry_t entry, struct page *page); #ifdef CONFIG_SWAP /* linux/mm/page_io.c */ @@ -205,7 +188,6 @@ extern int rw_swap_page_sync(int, swp_en /* linux/mm/swap_state.c */ extern struct address_space swapper_space; -#define total_swapcache_pages swapper_space.nrpages extern void show_swap_cache_info(void); extern int add_to_swap(struct page *); extern void __delete_from_swap_cache(struct page *); @@ -244,7 +226,6 @@ extern spinlock_t swaplock; #else /* CONFIG_SWAP */ #define total_swap_pages 0 -#define total_swapcache_pages 0UL #define si_swapinfo(val) \ do { (val)->freeswap = (val)->totalswap = 0; } while (0) diff -prauN linux-2.6.0-test11/init/main.c wli-2.6.0-test11-30/init/main.c --- linux-2.6.0-test11/init/main.c 2003-11-26 12:43:09.000000000 -0800 +++ wli-2.6.0-test11-30/init/main.c 2003-12-04 06:38:56.000000000 -0800 @@ -80,7 +80,6 @@ extern void signals_init(void); extern void buffer_init(void); extern void pidhash_init(void); extern void pidmap_init(void); -extern void pte_chain_init(void); extern void radix_tree_init(void); extern void free_initmem(void); extern void populate_rootfs(void); @@ -442,7 +441,6 @@ asmlinkage void __init start_kernel(void calibrate_delay(); pidmap_init(); pgtable_cache_init(); - pte_chain_init(); fork_init(num_physpages); proc_caches_init(); buffer_init(); diff -prauN linux-2.6.0-test11/ipc/shm.c wli-2.6.0-test11-30/ipc/shm.c --- linux-2.6.0-test11/ipc/shm.c 2003-11-26 12:44:10.000000000 -0800 +++ wli-2.6.0-test11-30/ipc/shm.c 2003-12-03 19:34:55.000000000 -0800 @@ -380,9 +380,9 @@ static void shm_get_stat(unsigned long * if (is_file_hugepages(shp->shm_file)) { struct address_space *mapping = inode->i_mapping; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); *rss += (HPAGE_SIZE/PAGE_SIZE)*mapping->nrpages; - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); } else { struct shmem_inode_info *info = SHMEM_I(inode); spin_lock(&info->lock); diff -prauN linux-2.6.0-test11/kernel/capability.c wli-2.6.0-test11-30/kernel/capability.c --- 
linux-2.6.0-test11/kernel/capability.c 2003-11-26 12:44:43.000000000 -0800 +++ wli-2.6.0-test11-30/kernel/capability.c 2003-12-03 18:50:57.000000000 -0800 @@ -89,10 +89,9 @@ static inline void cap_set_pg(int pgrp, kernel_cap_t *permitted) { task_t *g, *target; - struct list_head *l; struct pid *pid; - for_each_task_pid(pgrp, PIDTYPE_PGID, g, l, pid) { + for_each_task_pid(pgrp, PIDTYPE_PGID, g, pid) { target = g; while_each_thread(g, target) security_capset_set(target, effective, inheritable, permitted); diff -prauN linux-2.6.0-test11/kernel/exit.c wli-2.6.0-test11-30/kernel/exit.c --- linux-2.6.0-test11/kernel/exit.c 2003-11-26 12:45:29.000000000 -0800 +++ wli-2.6.0-test11-30/kernel/exit.c 2003-12-03 18:52:15.000000000 -0800 @@ -112,13 +112,12 @@ void unhash_process(struct task_struct * */ int session_of_pgrp(int pgrp) { - struct task_struct *p; - struct list_head *l; + task_t *p; struct pid *pid; int sid = -1; read_lock(&tasklist_lock); - for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) + for_each_task_pid(pgrp, PIDTYPE_PGID, p, pid) if (p->session > 0) { sid = p->session; goto out; @@ -143,11 +142,10 @@ out: static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) { struct task_struct *p; - struct list_head *l; struct pid *pid; int ret = 1; - for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) { + for_each_task_pid(pgrp, PIDTYPE_PGID, p, pid) { if (p == ignored_task || p->state >= TASK_ZOMBIE || p->real_parent->pid == 1) @@ -176,10 +174,9 @@ static inline int has_stopped_jobs(int p { int retval = 0; struct task_struct *p; - struct list_head *l; struct pid *pid; - for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) { + for_each_task_pid(pgrp, PIDTYPE_PGID, p, pid) { if (p->state != TASK_STOPPED) continue; @@ -592,10 +589,11 @@ static inline void forget_original_paren * Send signals to all our closest relatives so that they know * to properly mourn us.. */ -static void exit_notify(struct task_struct *tsk) +static void exit_notify(task_t *tsk) { int state; - struct task_struct *t; + task_t *t; + struct pid *pid = tsk->pids[PIDTYPE_TGID].pidptr; if (signal_pending(tsk) && !tsk->signal->group_exit && !thread_group_empty(tsk)) { @@ -610,12 +608,15 @@ static void exit_notify(struct task_stru */ read_lock(&tasklist_lock); spin_lock_irq(&tsk->sighand->siglock); - for (t = next_thread(tsk); t != tsk; t = next_thread(t)) + __for_each_task_pid(PIDTYPE_TGID, t, pid) { + if (t == tsk) + continue; if (!signal_pending(t) && !(t->flags & PF_EXITING)) { recalc_sigpending_tsk(t); if (signal_pending(t)) signal_wake_up(t, 0); } + } spin_unlock_irq(&tsk->sighand->siglock); read_unlock(&tasklist_lock); } @@ -782,27 +783,6 @@ asmlinkage long sys_exit(int error_code) do_exit((error_code&0xff)<<8); } -task_t *next_thread(task_t *p) -{ - struct pid_link *link = p->pids + PIDTYPE_TGID; - struct list_head *tmp, *head = &link->pidptr->task_list; - -#ifdef CONFIG_SMP - if (!p->sighand) - BUG(); - if (!spin_is_locked(&p->sighand->siglock) && - !rwlock_is_locked(&tasklist_lock)) - BUG(); -#endif - tmp = link->pid_chain.next; - if (tmp == head) - tmp = head->next; - - return pid_task(tmp, PIDTYPE_TGID); -} - -EXPORT_SYMBOL(next_thread); - /* * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). 
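[Editor's sketch of the conversion pattern used throughout exit.c and signal.c in this patch; do_something() is a hypothetical stand-in. The old code walked the circular thread list and stopped on returning to the starting task; the new code enumerates the PIDTYPE_TGID set, which includes the starting task, so the self-skip becomes explicit:

	/* before: circular walk that never visits p itself */
	for (t = next_thread(p); t != p; t = next_thread(t))
		do_something(t);

	/* after: NULL-terminated walk of the tgid set, skipping p */
	struct pid *pid = p->pids[PIDTYPE_TGID].pidptr;
	__for_each_task_pid(PIDTYPE_TGID, t, pid) {
		if (t == p)
			continue;
		do_something(t);
	}
]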
@@ -1038,7 +1018,8 @@ static int wait_task_stopped(task_t *p, asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru) { DECLARE_WAITQUEUE(wait, current); - struct task_struct *tsk; + struct pid *tgrp_pid = current->pids[PIDTYPE_TGID].pidptr; + task_t *tsk; int flag, retval; if (options & ~(WNOHANG|WUNTRACED|__WNOTHREAD|__WCLONE|__WALL)) @@ -1049,14 +1030,14 @@ repeat: flag = 0; current->state = TASK_INTERRUPTIBLE; read_lock(&tasklist_lock); - tsk = current; - do { - struct task_struct *p; + __for_each_task_pid(PIDTYPE_TGID, tsk, tgrp_pid) { + task_t *p; struct list_head *_p; int ret; + BUG_ON(tsk->signal != current->signal); list_for_each(_p,&tsk->children) { - p = list_entry(_p,struct task_struct,sibling); + p = list_entry(_p, task_t, sibling); ret = eligible_child(pid, options, p); if (!ret) @@ -1096,10 +1077,7 @@ repeat: } if (options & __WNOTHREAD) break; - tsk = next_thread(tsk); - if (tsk->signal != current->signal) - BUG(); - } while (tsk != current); + } read_unlock(&tasklist_lock); if (flag) { retval = 0; diff -prauN linux-2.6.0-test11/kernel/fork.c wli-2.6.0-test11-30/kernel/fork.c --- linux-2.6.0-test11/kernel/fork.c 2003-11-26 12:42:58.000000000 -0800 +++ wli-2.6.0-test11-30/kernel/fork.c 2003-12-04 06:59:27.000000000 -0800 @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -290,9 +291,9 @@ static inline int dup_mmap(struct mm_str atomic_dec(&inode->i_writecount); /* insert tmp into the share list, just after mpnt */ - down(&inode->i_mapping->i_shared_sem); - list_add_tail(&tmp->shared, &mpnt->shared); - up(&inode->i_mapping->i_shared_sem); + spin_lock(&inode->i_mapping->i_shared_lock); + list_add_tail_rcu(&tmp->shared, &mpnt->shared); + spin_unlock(&inode->i_mapping->i_shared_lock); } /* @@ -346,8 +347,21 @@ static inline void mm_free_pgd(struct mm spinlock_t mmlist_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; int mmlist_nr; +/* SLAB cache for mm_struct structures (tsk->mm) */ +static kmem_cache_t *mm_cachep; + #define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) -#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) + +static void __free_mm(void *mm) +{ + kmem_cache_free(mm_cachep, mm); +} + +void free_mm(struct mm_struct *mm) +{ + INIT_RCU_HEAD(&mm->rcu); + call_rcu(&mm->rcu, __free_mm, mm); +} #include @@ -362,6 +376,7 @@ static struct mm_struct * mm_init(struct mm->ioctx_list = NULL; mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); mm->free_area_cache = TASK_UNMAPPED_BASE; + mm->shared = mm->text = mm->lib = mm->data = mm->dirty = mm->stack = 0; if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -379,11 +394,15 @@ struct mm_struct * mm_alloc(void) struct mm_struct * mm; mm = allocate_mm(); - if (mm) { - memset(mm, 0, sizeof(*mm)); - return mm_init(mm); + if (!mm) + return NULL; + memset(mm, 0, sizeof(*mm)); + if (exec_rmap(mm)) { + mm_free_pgd(mm); + free_mm(mm); + return NULL; } - return NULL; + return mm_init(mm); } /* @@ -410,6 +429,7 @@ void mmput(struct mm_struct *mm) spin_unlock(&mmlist_lock); exit_aio(mm); exit_mmap(mm); + exit_rmap(mm); mmdrop(mm); } } @@ -514,6 +534,8 @@ static int copy_mm(unsigned long clone_f if (!mm_init(mm)) goto fail_nomem; + dup_rmap(mm, oldmm); + if (init_new_context(tsk,mm)) goto fail_nocontext; @@ -1045,7 +1067,7 @@ struct task_struct *copy_process(unsigne if (p->pid) __get_cpu_var(process_counts)++; } else - link_pid(p, p->pids + PIDTYPE_TGID, &p->group_leader->pids[PIDTYPE_TGID].pid); + link_pid(p, 
&p->group_leader->pids[PIDTYPE_TGID].pid, PIDTYPE_TGID); nr_threads++; write_unlock_irq(&tasklist_lock); @@ -1188,8 +1210,7 @@ kmem_cache_t *fs_cachep; /* SLAB cache for vm_area_struct structures */ kmem_cache_t *vm_area_cachep; -/* SLAB cache for mm_struct structures (tsk->mm) */ -kmem_cache_t *mm_cachep; +void init_rmap(void); void __init proc_caches_init(void) { @@ -1228,4 +1249,6 @@ void __init proc_caches_init(void) SLAB_HWCACHE_ALIGN, NULL, NULL); if(!mm_cachep) panic("vma_init: Cannot alloc mm_struct SLAB cache"); + + init_rmap(); } diff -prauN linux-2.6.0-test11/kernel/pid.c wli-2.6.0-test11-30/kernel/pid.c --- linux-2.6.0-test11/kernel/pid.c 2003-11-26 12:44:21.000000000 -0800 +++ wli-2.6.0-test11-30/kernel/pid.c 2003-12-03 19:01:47.000000000 -0800 @@ -159,30 +159,111 @@ inline struct pid *find_pid(enum pid_typ return NULL; } -void link_pid(task_t *task, struct pid_link *link, struct pid *pid) +static struct rb_root tasklist_root = { .rb_node = &init_task.tasks }; + +task_t *first_task(void) +{ + struct rb_node *node = rb_first(&tasklist_root); + BUG_ON(!node); + return rb_entry(node, task_t, tasks); +} +EXPORT_SYMBOL(first_task); + +void insert_task_list(task_t *task) +{ + struct rb_node **node = &tasklist_root.rb_node, *parent = NULL; + task_t *candidate; + + while (*node) { + parent = *node; + candidate = rb_entry(parent, task_t, tasks); + if (candidate->tgid < task->tgid) + node = &parent->rb_right; + else if (candidate->tgid > task->tgid) + node = &parent->rb_left; + else /* already there? give up */ + return; + } + rb_link_node(&task->tasks, parent, node); + rb_insert_color(&task->tasks, &tasklist_root); +} + +void remove_task_list(task_t *task) +{ + /* pray this is not called on something in the == case above */ + rb_erase(&task->tasks, &tasklist_root); +} + +/* + * Chains must be kept sorted by the id of the type's predecessor. + * If there is no predecessor (PIDTYPE_PID), we expect a singleton + * and just use the type's own id, for uniformity. With the lists + * sorted this way we can perform efficient incremental enumeration. + */ +static void insert_pid_chain(task_t *task, struct pid *pid, enum pid_type type) +{ + struct pid_link *candidate, *link = &task->pids[type]; + struct rb_node **node, *parent = NULL; + int n = type ? (link - 1)->pid.nr : link->pid.nr; + + node = &pid->task_list.rb_node; + while (*node) { + int k; + parent = *node; + candidate = rb_entry(parent, struct pid_link, pid_chain); + k = type ?
(candidate - 1)->pid.nr : candidate->pid.nr; + + if (k <= n) + node = &parent->rb_right; + else /* if (k > n) */ + node = &parent->rb_left; + /* + * else + * something unexpected happened + * return; + */ + } + rb_link_node(&link->pid_chain, parent, node); + rb_insert_color(&link->pid_chain, &pid->task_list); +} + +static void remove_pid_chain(task_t *task, struct pid *pid, enum pid_type type) +{ + rb_erase(&task->pids[type].pid_chain, &pid->task_list); +} + +void link_pid(task_t *task, struct pid *pid, enum pid_type type) { atomic_inc(&pid->count); - list_add_tail(&link->pid_chain, &pid->task_list); - link->pidptr = pid; + insert_pid_chain(task, pid, type); + task->pids[type].pidptr = pid; } int attach_pid(task_t *task, enum pid_type type, int nr) { struct pid *pid = find_pid(type, nr); + struct pid_link *link = &task->pids[type]; if (pid) atomic_inc(&pid->count); else { - pid = &task->pids[type].pid; + struct list_head *bucket; + + pid = &link->pid; pid->nr = nr; atomic_set(&pid->count, 1); - INIT_LIST_HEAD(&pid->task_list); + pid->task_list.rb_node = NULL; pid->task = task; get_task_struct(task); - list_add(&pid->hash_chain, &pid_hash[type][pid_hashfn(nr)]); + bucket = &pid_hash[type][pid_hashfn(nr)]; + list_add_tail(&pid->hash_chain, bucket); } - list_add_tail(&task->pids[type].pid_chain, &pid->task_list); - task->pids[type].pidptr = pid; + link->pid_chain.rb_parent = link->pid_chain.rb_left + = link->pid_chain.rb_right = NULL; + link->pid_chain.rb_color = RB_BLACK; + link->pidptr = pid; + insert_pid_chain(task, pid, type); return 0; } @@ -193,7 +274,7 @@ static inline int __detach_pid(task_t *t struct pid *pid = link->pidptr; int nr; - list_del(&link->pid_chain); + remove_pid_chain(task, pid, type); if (!atomic_dec_and_test(&pid->count)) return 0; @@ -222,13 +303,87 @@ void detach_pid(task_t *task, enum pid_t free_pidmap(nr); } +/** + * find_tgids_after - Returns the tgids of tasks after tgid. + * @tgid: strict lower bound on tgids to return + * @tgids: buffer for return of tgids + * + * Returns the number of tgids returned in tgids + * The function works even if the input tgid value + * is not valid anymore. 
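+ *
+ * Example (editor's sketch, not part of the patch; use_tgid() is a
+ * hypothetical consumer): enumerating every thread group in
+ * PROC_MAXPIDS-sized batches, restarting from the last tgid seen:
+ *
+ *	int tgids[PROC_MAXPIDS], i, n, last = 0;
+ *	do {
+ *		n = find_tgids_after(last, tgids);
+ *		for (i = 0; i < n; i++)
+ *			use_tgid(tgids[i]);
+ *		if (n)
+ *			last = tgids[n - 1];
+ *	} while (n == PROC_MAXPIDS);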
+ */ +int find_tgids_after(int tgid, int tgids[PROC_MAXPIDS]) +{ + struct rb_node *node; + task_t *task = NULL, *lub = NULL; + int k; + + read_lock(&tasklist_lock); + node = tasklist_root.rb_node; + while (node) { + task = rb_entry(node, task_t, tasks); + if (task->tgid < tgid) + node = node->rb_right; + else if (task->tgid > tgid) { + node = node->rb_left; + lub = task; + } else { + struct rb_node *lub_node = rb_next(node); + if (lub_node) + lub = rb_entry(lub_node, task_t, tasks); + break; + } + } + + for (k = 0, task = lub; task && k < PROC_MAXPIDS; ++k, task = next_task(task)) + tgids[k] = task->tgid; + read_unlock(&tasklist_lock); + return k; +} +EXPORT_SYMBOL(find_tgids_after); + +int find_tids_after(int tgid, int tid, int tids[PROC_MAXPIDS]) +{ + struct pid *pid; + struct rb_node *node; + task_t *task = NULL, *lub = NULL; + int k = 0; + + read_lock(&tasklist_lock); + pid = find_pid(PIDTYPE_TGID, tgid); + if (!pid) + goto out; + node = pid->task_list.rb_node; + while (node) { + task = pid_task(node, PIDTYPE_TGID); + if (task->pid < tid) + node = node->rb_right; + else if (task->pid > tid) { + lub = task; + node = node->rb_left; + } else { + struct rb_node *lub_node = rb_next(node); + if (lub_node) + lub = pid_task(lub_node, PIDTYPE_TGID); + break; + } + } + + for (task = lub; task && k < PROC_MAXPIDS; ++k, task = next_task_pid(task, PIDTYPE_TGID)) + tids[k] = task->pid; +out: + read_unlock(&tasklist_lock); + return k; +} +EXPORT_SYMBOL(find_tids_after); + task_t *find_task_by_pid(int nr) { struct pid *pid = find_pid(PIDTYPE_PID, nr); if (!pid) return NULL; - return pid_task(pid->task_list.next, PIDTYPE_PID); + return __first_task_pid(pid, PIDTYPE_PID); } EXPORT_SYMBOL(find_task_by_pid); @@ -255,7 +410,7 @@ void switch_exec_pids(task_t *leader, ta attach_pid(thread, PIDTYPE_TGID, thread->tgid); attach_pid(thread, PIDTYPE_PGID, leader->__pgrp); attach_pid(thread, PIDTYPE_SID, thread->session); - list_add_tail(&thread->tasks, &init_task.tasks); + insert_task_list(thread); attach_pid(leader, PIDTYPE_PID, leader->pid); attach_pid(leader, PIDTYPE_TGID, leader->tgid); diff -prauN linux-2.6.0-test11/kernel/sched.c wli-2.6.0-test11-30/kernel/sched.c --- linux-2.6.0-test11/kernel/sched.c 2003-11-26 12:45:17.000000000 -0800 +++ wli-2.6.0-test11-30/kernel/sched.c 2003-12-04 08:35:59.000000000 -0800 @@ -221,6 +221,13 @@ static DEFINE_PER_CPU(struct runqueue, r #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +extern unsigned long __scheduling_functions_start_here; +extern unsigned long __scheduling_functions_end_here; +const unsigned long scheduling_functions_start_here = + (unsigned long)&__scheduling_functions_start_here; +const unsigned long scheduling_functions_end_here = + (unsigned long)&__scheduling_functions_end_here; + /* * Default context-switch locking: */ @@ -1463,12 +1470,10 @@ out: rebalance_tick(rq, 0); } -void scheduling_functions_start_here(void) { } - /* * schedule() is the main scheduler function. */ -asmlinkage void schedule(void) +asmlinkage __sched void schedule(void) { task_t *prev, *next; runqueue_t *rq; @@ -1611,7 +1616,7 @@ EXPORT_SYMBOL(schedule); * off of preempt_enable. Kernel preemptions off return from interrupt * occur there and call schedule directly. 
*/ -asmlinkage void preempt_schedule(void) +asmlinkage __sched void preempt_schedule(void) { struct thread_info *ti = current_thread_info(); @@ -1746,7 +1751,7 @@ void complete_all(struct completion *x) spin_unlock_irqrestore(&x->wait.lock, flags); } -void wait_for_completion(struct completion *x) +__sched void wait_for_completion(struct completion *x) { might_sleep(); spin_lock_irq(&x->wait.lock); @@ -1784,7 +1789,7 @@ EXPORT_SYMBOL(wait_for_completion); __remove_wait_queue(q, &wait); \ spin_unlock_irqrestore(&q->lock, flags); -void interruptible_sleep_on(wait_queue_head_t *q) +__sched void interruptible_sleep_on(wait_queue_head_t *q) { SLEEP_ON_VAR @@ -1797,7 +1802,7 @@ void interruptible_sleep_on(wait_queue_h EXPORT_SYMBOL(interruptible_sleep_on); -long interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) +__sched long interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) { SLEEP_ON_VAR @@ -1812,7 +1817,7 @@ long interruptible_sleep_on_timeout(wait EXPORT_SYMBOL(interruptible_sleep_on_timeout); -void sleep_on(wait_queue_head_t *q) +__sched void sleep_on(wait_queue_head_t *q) { SLEEP_ON_VAR @@ -1825,7 +1830,7 @@ void sleep_on(wait_queue_head_t *q) EXPORT_SYMBOL(sleep_on); -long sleep_on_timeout(wait_queue_head_t *q, long timeout) +__sched long sleep_on_timeout(wait_queue_head_t *q, long timeout) { SLEEP_ON_VAR @@ -1840,8 +1845,6 @@ long sleep_on_timeout(wait_queue_head_t EXPORT_SYMBOL(sleep_on_timeout); -void scheduling_functions_end_here(void) { } - void set_user_nice(task_t *p, long nice) { unsigned long flags; @@ -2291,7 +2294,7 @@ asmlinkage long sys_sched_yield(void) return 0; } -void __cond_resched(void) +__sched void __cond_resched(void) { set_current_state(TASK_RUNNING); schedule(); @@ -2305,7 +2308,7 @@ EXPORT_SYMBOL(__cond_resched); * this is a shortcut for kernel-space yielding - it marks the * thread runnable and calls sys_sched_yield(). */ -void yield(void) +__sched void yield(void) { set_current_state(TASK_RUNNING); sys_sched_yield(); @@ -2320,7 +2323,7 @@ EXPORT_SYMBOL(yield); * But don't do that if it is a deliberate, throttling IO wait (this task * has set its backing_dev_info: the queue against which it should throttle) */ -void io_schedule(void) +__sched void io_schedule(void) { struct runqueue *rq = this_rq(); @@ -2331,7 +2334,7 @@ void io_schedule(void) EXPORT_SYMBOL(io_schedule); -long io_schedule_timeout(long timeout) +__sched long io_schedule_timeout(long timeout) { struct runqueue *rq = this_rq(); long ret; @@ -2881,7 +2884,7 @@ EXPORT_SYMBOL(__might_sleep); * * Called inside preempt_disable(). 
*/ -void __preempt_spin_lock(spinlock_t *lock) +__sched void __preempt_spin_lock(spinlock_t *lock) { if (preempt_count() > 1) { _raw_spin_lock(lock); @@ -2897,7 +2900,7 @@ void __preempt_spin_lock(spinlock_t *loc EXPORT_SYMBOL(__preempt_spin_lock); -void __preempt_write_lock(rwlock_t *lock) +__sched void __preempt_write_lock(rwlock_t *lock) { if (preempt_count() > 1) { _raw_write_lock(lock); diff -prauN linux-2.6.0-test11/kernel/signal.c wli-2.6.0-test11-30/kernel/signal.c --- linux-2.6.0-test11/kernel/signal.c 2003-11-26 12:43:37.000000000 -0800 +++ wli-2.6.0-test11-30/kernel/signal.c 2003-12-03 18:50:57.000000000 -0800 @@ -353,7 +353,7 @@ void __exit_signal(struct task_struct *t spin_lock(&sighand->siglock); if (atomic_dec_and_test(&sig->count)) { if (tsk == sig->curr_target) - sig->curr_target = next_thread(tsk); + sig->curr_target = another_thread(tsk); tsk->signal = NULL; spin_unlock(&sighand->siglock); flush_sigqueue(&sig->shared_pending); @@ -368,7 +368,7 @@ void __exit_signal(struct task_struct *t sig->group_exit_task = NULL; } if (tsk == sig->curr_target) - sig->curr_target = next_thread(tsk); + sig->curr_target = another_thread(tsk); tsk->signal = NULL; spin_unlock(&sighand->siglock); } @@ -610,20 +610,18 @@ static void do_notify_parent_cldstop(str * actual continuing for SIGCONT, but not the actual stopping for stop * signals. The process stop is done as a signal action for SIG_DFL. */ -static void handle_stop_signal(int sig, struct task_struct *p) +static void handle_stop_signal(int sig, task_t *p) { - struct task_struct *t; + struct pid *pid = p->pids[PIDTYPE_TGID].pidptr; + task_t *t; if (sig_kernel_stop(sig)) { /* * This is a stop signal. Remove SIGCONT from all queues. */ rm_from_queue(sigmask(SIGCONT), &p->signal->shared_pending); - t = p; - do { + __for_each_task_pid(PIDTYPE_TGID, t, pid) rm_from_queue(sigmask(SIGCONT), &t->pending); - t = next_thread(t); - } while (t != p); } else if (sig == SIGCONT) { /* * Remove all stop signals from all queues, @@ -651,8 +649,7 @@ static void handle_stop_signal(int sig, p->group_leader->real_parent); } rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); - t = p; - do { + __for_each_task_pid(PIDTYPE_TGID, t, pid) { unsigned int state; rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); @@ -676,9 +673,7 @@ static void handle_stop_signal(int sig, state |= TASK_INTERRUPTIBLE; } wake_up_state(t, state); - - t = next_thread(t); - } while (t != p); + } } } @@ -843,7 +838,8 @@ force_sig_specific(int sig, struct task_ static inline void __group_complete_signal(int sig, struct task_struct *p, unsigned int mask) { - struct task_struct *t; + task_t *t; + struct pid *pid = p->pids[PIDTYPE_TGID].pidptr; /* * Now find a thread we can wake up to take the signal off the queue. @@ -870,7 +866,7 @@ __group_complete_signal(int sig, struct BUG_ON(t->tgid != p->tgid); while (!wants_signal(sig, t, mask)) { - t = next_thread(t); + t = another_thread(t); if (t == p->signal->curr_target) /* * No thread needs to be woken. 
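[Editor's sketch: another_thread() replaces next_thread() at the call sites above that only need to rotate curr_target to some other member of the thread group; its definition lies outside this section. Given the rb-tree pid chains introduced in kernel/pid.c, one plausible shape, shown purely to illustrate the wrap-around semantics the callers rely on, is:

	static inline task_t *another_thread(task_t *task)
	{
		struct pid *pid = task->pids[PIDTYPE_TGID].pidptr;
		struct rb_node *node = rb_next(&task->pids[PIDTYPE_TGID].pid_chain);

		if (!node)
			node = rb_first(&pid->task_list);
		return pid_task(node, PIDTYPE_TGID);
	}

Whole-group traversals, by contrast, become __for_each_task_pid() walks as in the surrounding hunks.]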
@@ -902,12 +898,10 @@ __group_complete_signal(int sig, struct p->signal->group_exit = 1; p->signal->group_exit_code = sig; p->signal->group_stop_count = 0; - t = p; - do { + __for_each_task_pid(PIDTYPE_TGID, t, pid) { sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); - t = next_thread(t); - } while (t != p); + } return; } @@ -925,12 +919,10 @@ __group_complete_signal(int sig, struct rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); p->signal->group_stop_count = 0; p->signal->group_exit_task = t; - t = p; - do { + __for_each_task_pid(PIDTYPE_TGID, t, pid) { p->signal->group_stop_count++; signal_wake_up(t, 0); - t = next_thread(t); - } while (t != p); + } wake_up_process(p->signal->group_exit_task); return; } @@ -993,16 +985,19 @@ __group_send_sig_info(int sig, struct si /* * Nuke all other threads in the group. */ -void zap_other_threads(struct task_struct *p) +void zap_other_threads(task_t *p) { - struct task_struct *t; + struct pid *pid = p->pids[PIDTYPE_TGID].pidptr; + task_t *t; p->signal->group_stop_count = 0; if (thread_group_empty(p)) return; - for (t = next_thread(p); t != p; t = next_thread(t)) { + __for_each_task_pid(PIDTYPE_TGID, t, pid) { + if (t == p) + continue; /* * Don't bother with already dead threads */ @@ -1051,15 +1046,14 @@ int group_send_sig_info(int sig, struct int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) { - struct task_struct *p; - struct list_head *l; + task_t *p; struct pid *pid; int err, retval = -ESRCH; if (pgrp <= 0) return -EINVAL; - for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) { + for_each_task_pid(pgrp, PIDTYPE_PGID, p, pid) { err = group_send_sig_info(sig, info, p); if (retval) retval = err; @@ -1091,15 +1085,14 @@ kill_sl_info(int sig, struct siginfo *in { int err, retval = -EINVAL; struct pid *pid; - struct list_head *l; - struct task_struct *p; + task_t *p; if (sid <= 0) goto out; retval = -ESRCH; read_lock(&tasklist_lock); - for_each_task_pid(sid, PIDTYPE_SID, p, l, pid) { + for_each_task_pid(sid, PIDTYPE_SID, p, pid) { if (!p->leader) continue; err = group_send_sig_info(sig, info, p); @@ -1372,25 +1365,23 @@ out: * Joy. Or not. Pthread wants us to wake up every thread * in our parent group. 
*/ -static inline void __wake_up_parent(struct task_struct *p, - struct task_struct *parent) +static inline void __wake_up_parent(task_t *p, task_t *parent) { - struct task_struct *tsk = parent; + task_t *task; + struct pid *pid = parent->pids[PIDTYPE_TGID].pidptr; /* * Fortunately this is not necessary for thread groups: */ - if (p->tgid == tsk->tgid) { - wake_up_interruptible(&tsk->wait_chldexit); + if (p->tgid == parent->tgid) { + wake_up_interruptible(&parent->wait_chldexit); return; } - do { - wake_up_interruptible(&tsk->wait_chldexit); - tsk = next_thread(tsk); - if (tsk->signal != parent->signal) - BUG(); - } while (tsk != parent); + __for_each_task_pid(PIDTYPE_TGID, task, pid) { + wake_up_interruptible(&task->wait_chldexit); + BUG_ON(task->signal != parent->signal); + } } /* @@ -1625,10 +1616,12 @@ do_signal_stop(int signr) } if (sig->group_stop_count == 0) { + struct pid *pid = current->pids[PIDTYPE_TGID].pidptr; sig->group_exit_code = signr; stop_count = 0; - for (t = next_thread(current); t != current; - t = next_thread(t)) + __for_each_task_pid(PIDTYPE_TGID, t, pid) { + if (t == current) + continue; /* * Setting state to TASK_STOPPED for a group * stop is always done with the siglock held, @@ -1638,9 +1631,9 @@ do_signal_stop(int signr) stop_count++; signal_wake_up(t, 0); } + } sig->group_stop_count = stop_count; - } - else { + } else { /* A race with another thread while unlocked. */ signr = sig->group_exit_code; stop_count = --sig->group_stop_count; @@ -2299,7 +2292,9 @@ do_sigaction(int sig, const struct k_sig * Now we must do this little unlock and relock * dance to maintain the lock hierarchy. */ - struct task_struct *t = current; + task_t *t = current; + struct pid *pid = t->pids[PIDTYPE_TGID].pidptr; + spin_unlock_irq(&t->sighand->siglock); read_lock(&tasklist_lock); spin_lock_irq(&t->sighand->siglock); @@ -2307,11 +2302,10 @@ do_sigaction(int sig, const struct k_sig sigdelsetmask(&k->sa.sa_mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); rm_from_queue(sigmask(sig), &t->signal->shared_pending); - do { + __for_each_task_pid(PIDTYPE_TGID, t, pid) { rm_from_queue(sigmask(sig), &t->pending); recalc_sigpending_tsk(t); - t = next_thread(t); - } while (t != current); + } spin_unlock_irq(&current->sighand->siglock); read_unlock(&tasklist_lock); return 0; diff -prauN linux-2.6.0-test11/kernel/sys.c wli-2.6.0-test11-30/kernel/sys.c --- linux-2.6.0-test11/kernel/sys.c 2003-11-26 12:42:58.000000000 -0800 +++ wli-2.6.0-test11-30/kernel/sys.c 2003-12-04 08:43:29.000000000 -0800 @@ -282,7 +282,6 @@ asmlinkage long sys_setpriority(int whic struct task_struct *g, *p; struct user_struct *user; struct pid *pid; - struct list_head *l; int error = -EINVAL; if (which > 2 || which < 0) @@ -307,7 +306,7 @@ asmlinkage long sys_setpriority(int whic case PRIO_PGRP: if (!who) who = process_group(current); - for_each_task_pid(who, PIDTYPE_PGID, p, l, pid) + for_each_task_pid(who, PIDTYPE_PGID, p, pid) error = set_one_prio(p, niceval, error); break; case PRIO_USER: @@ -339,8 +338,7 @@ out: */ asmlinkage long sys_getpriority(int which, int who) { - struct task_struct *g, *p; - struct list_head *l; + task_t *g, *p; struct pid *pid; struct user_struct *user; long niceval, retval = -ESRCH; @@ -363,7 +361,7 @@ asmlinkage long sys_getpriority(int whic case PRIO_PGRP: if (!who) who = process_group(current); - for_each_task_pid(who, PIDTYPE_PGID, p, l, pid) { + for_each_task_pid(who, PIDTYPE_PGID, p, pid) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; @@ -983,11 +981,10 @@ asmlinkage long
sys_setpgid(pid_t pid, p goto out; if (pgid != pid) { - struct task_struct *p; + task_t *p; struct pid *pid; - struct list_head *l; - for_each_task_pid(pgid, PIDTYPE_PGID, p, l, pid) + for_each_task_pid(pgid, PIDTYPE_PGID, p, pid) if (p->session == current->session) goto ok_pgid; goto out; @@ -1323,8 +1320,6 @@ asmlinkage long sys_setrlimit(unsigned i * either stopped or zombied. In the zombied case the task won't get * reaped till shortly after the call to getrusage(), in both cases the * task being examined is in a frozen state so the counters won't change. - * - * FIXME! Get the fault counts properly! */ int getrusage(struct task_struct *p, int who, struct rusage __user *ru) { diff -prauN linux-2.6.0-test11/kernel/timer.c wli-2.6.0-test11-30/kernel/timer.c --- linux-2.6.0-test11/kernel/timer.c 2003-11-26 12:45:25.000000000 -0800 +++ wli-2.6.0-test11-30/kernel/timer.c 2003-12-04 08:35:59.000000000 -0800 @@ -1000,7 +1000,7 @@ static void process_timeout(unsigned lon * * In all cases the return value is guaranteed to be non-negative. */ -signed long schedule_timeout(signed long timeout) +__sched signed long schedule_timeout(signed long timeout) { struct timer_list timer; unsigned long expire; @@ -1060,7 +1060,7 @@ asmlinkage long sys_gettid(void) return current->pid; } -static long nanosleep_restart(struct restart_block *restart) +static __sched long nanosleep_restart(struct restart_block *restart) { unsigned long expire = restart->arg0, now = jiffies; struct timespec *rmtp = (struct timespec *) restart->arg1; diff -prauN linux-2.6.0-test11/lib/rwsem.c wli-2.6.0-test11-30/lib/rwsem.c --- linux-2.6.0-test11/lib/rwsem.c 2003-11-26 12:42:52.000000000 -0800 +++ wli-2.6.0-test11-30/lib/rwsem.c 2003-12-04 08:35:59.000000000 -0800 @@ -5,6 +5,7 @@ */ #include #include +#include #include struct rwsem_waiter { @@ -162,7 +163,7 @@ static inline struct rw_semaphore *rwsem /* * wait for the read lock to be granted */ -struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem) +__sched struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem) { struct rwsem_waiter waiter; @@ -178,7 +179,7 @@ struct rw_semaphore *rwsem_down_read_fai /* * wait for the write lock to be granted */ -struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem) +__sched struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem) { struct rwsem_waiter waiter; diff -prauN linux-2.6.0-test11/mm/filemap.c wli-2.6.0-test11-30/mm/filemap.c --- linux-2.6.0-test11/mm/filemap.c 2003-11-26 12:43:33.000000000 -0800 +++ wli-2.6.0-test11-30/mm/filemap.c 2003-12-04 08:43:29.000000000 -0800 @@ -55,17 +55,17 @@ /* * Lock ordering: * - * ->i_shared_sem (vmtruncate) - * ->private_lock (__free_pte->__set_page_dirty_buffers) + * ->i_shared_lock (vmtruncate) + * ->private_lock (__free_pte->set_page_dirty_buffers) * ->swap_list_lock * ->swap_device_lock (exclusive_swap_page, others) * ->mapping->page_lock * * ->i_sem - * ->i_shared_sem (truncate->invalidate_mmap_range) + * ->i_shared_lock (truncate->invalidate_mmap_range) * * ->mmap_sem - * ->i_shared_sem (various places) + * ->i_shared_lock (various places) * * ->mmap_sem * ->lock_page (access_process_vm) @@ -73,6 +73,9 @@ * ->mmap_sem * ->i_sem (msync) * + * ->lock_page + * ->i_shared_lock (page_convert_anon) + * * ->inode_lock * ->sb_lock (fs/fs-writeback.c) * ->mapping->page_lock (__sync_single_inode) @@ -94,11 +97,11 @@ */ void __remove_from_page_cache(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space 
*mapping = page_mapping(page); radix_tree_delete(&mapping->page_tree, page->index); list_del(&page->list); - page->mapping = NULL; + set_page_mapping(page, NULL); mapping->nrpages--; pagecache_acct(-1); @@ -106,22 +109,24 @@ void __remove_from_page_cache(struct pag void remove_from_page_cache(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); - if (unlikely(!PageLocked(page))) - PAGE_BUG(page); + BUG_ON(!PageLocked(page)); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); __remove_from_page_cache(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); + page_cache_release(page); } static inline int sync_page(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); if (mapping && mapping->a_ops && mapping->a_ops->sync_page) return mapping->a_ops->sync_page(page); + if (PageSwapCache(page)) + blk_run_queues(); return 0; } @@ -145,9 +150,9 @@ static int __filemap_fdatawrite(struct a if (mapping->backing_dev_info->memory_backed) return 0; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); list_splice_init(&mapping->dirty_pages, &mapping->io_pages); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); ret = do_writepages(mapping, &wbc); return ret; } @@ -180,7 +185,7 @@ int filemap_fdatawait(struct address_spa restart: progress = 0; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); while (!list_empty(&mapping->locked_pages)) { struct page *page; @@ -194,7 +199,7 @@ restart: if (!PageWriteback(page)) { if (++progress > 32) { if (need_resched()) { - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); __cond_resched(); goto restart; } @@ -204,16 +209,16 @@ restart: progress = 0; page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); wait_on_page_writeback(page); if (PageError(page)) ret = -EIO; page_cache_release(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); /* Check for outstanding write errors */ if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) @@ -230,16 +235,9 @@ EXPORT_SYMBOL(filemap_fdatawait); * This adds a page to the page cache, starting out as locked, unreferenced, * not uptodate and with no errors. * - * This function is used for two things: adding newly allocated pagecache - * pages and for moving existing anon pages into swapcache. - * - * In the case of pagecache pages, the page is new, so we can just run - * SetPageLocked() against it. The other page state flags were set by - * rmqueue() - * - * In the case of swapcache, try_to_swap_out() has already locked the page, so - * SetPageLocked() is ugly-but-OK there too. The required page state has been - * set up by swap_out_add_to_swap_cache(). + * This function is used to add newly allocated pagecache pages; + * the page is new, so we can just run SetPageLocked() against it. + * The other page state flags were set by rmqueue(). * * This function does not add the page to the LRU. The caller must do that. 
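+ *
+ * Example (editor's sketch): a caller typically pairs a successful
+ * insertion with the LRU addition itself, e.g.
+ *
+ *	error = add_to_page_cache(page, mapping, offset);
+ *	if (!error)
+ *		lru_cache_add(page);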
*/ @@ -250,15 +248,19 @@ int add_to_page_cache(struct page *page, if (error == 0) { page_cache_get(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); if (!error) { SetPageLocked(page); - ___add_to_page_cache(page, mapping, offset); + list_add(&page->list, &mapping->clean_pages); + set_page_mapping(page, mapping); + page->index = offset; + mapping->nrpages++; + pagecache_acct(+1); } else { page_cache_release(page); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); radix_tree_preload_end(); } return error; @@ -394,11 +396,11 @@ struct page * find_get_page(struct addre * We scan the hash list read-only. Addition to and removal from * the hash-list needs a held write-lock. */ - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); page = radix_tree_lookup(&mapping->page_tree, offset); if (page) page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return page; } @@ -411,11 +413,11 @@ struct page *find_trylock_page(struct ad { struct page *page; - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); page = radix_tree_lookup(&mapping->page_tree, offset); if (page && TestSetPageLocked(page)) page = NULL; - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return page; } @@ -437,25 +439,25 @@ struct page *find_lock_page(struct addre { struct page *page; - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); repeat: page = radix_tree_lookup(&mapping->page_tree, offset); if (page) { page_cache_get(page); if (TestSetPageLocked(page)) { - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); lock_page(page); - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); /* Has the page been truncated while we slept? */ - if (page->mapping != mapping || page->index != offset) { + if (page_mapping(page) != mapping || page->index != offset) { unlock_page(page); page_cache_release(page); goto repeat; } } } - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return page; } @@ -529,12 +531,12 @@ unsigned int find_get_pages(struct addre unsigned int i; unsigned int ret; - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); ret = radix_tree_gang_lookup(&mapping->page_tree, (void **)pages, start, nr_pages); for (i = 0; i < ret; i++) page_cache_get(pages[i]); - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return ret; } @@ -664,8 +666,8 @@ page_not_up_to_date: /* Get exclusive access to the page ... */ lock_page(page); - /* Did it get unhashed before we got the lock? */ - if (!page->mapping) { + /* Did it get removed from the radix tree before we got the lock? */ + if (!page_mapping(page)) { unlock_page(page); page_cache_release(page); continue; @@ -990,7 +992,7 @@ static int page_cache_read(struct file * * it in the page cache, and handles the special cases reasonably without * having a lot of duplicated code. 
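+ *
+ * (Editor's note: ->nopage now takes an int * through which the
+ * handler reports VM_FAULT_MAJOR or VM_FAULT_MINOR; a sketch of how
+ * a fault handler might be invoked, assuming the do_no_page() side
+ * passes a pointer and reads it back:
+ *
+ *	int type = VM_FAULT_MINOR;
+ *	page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &type);
+ *
+ * filemap_nopage() below sets it to VM_FAULT_MAJOR only on the paths
+ * that also bump the pgmajfault counter.)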
*/ -struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int unused) +struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int *type) { int error; struct file *file = area->vm_file; @@ -999,7 +1001,7 @@ struct page * filemap_nopage(struct vm_a struct inode *inode = mapping->host; struct page *page; unsigned long size, pgoff, endoff; - int did_readaround = 0; + int did_readaround = 0, majmin = VM_FAULT_MINOR; pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; @@ -1048,6 +1050,14 @@ retry_find: if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS) goto no_cached_page; + /* + * To keep the pgmajfault counter straight, we need to + * check did_readaround, as this is an inner loop. + */ + if (!did_readaround) { + majmin = VM_FAULT_MAJOR; + inc_page_state(pgmajfault); + } did_readaround = 1; do_page_cache_readahead(mapping, file, pgoff & ~(MMAP_READAROUND-1), MMAP_READAROUND); @@ -1069,6 +1079,8 @@ success: * Found the page and have a reference on it. */ mark_page_accessed(page); + if (type) + *type = majmin; return page; outside_data_content: @@ -1104,11 +1116,14 @@ no_cached_page: return NULL; page_not_uptodate: - inc_page_state(pgmajfault); + if (!did_readaround) { + majmin = VM_FAULT_MAJOR; + inc_page_state(pgmajfault); + } lock_page(page); - /* Did it get unhashed while we waited for it? */ - if (!page->mapping) { + /* Did it get removed from the radix tree while we waited for it? */ + if (!page_mapping(page)) { unlock_page(page); page_cache_release(page); goto retry_all; @@ -1135,7 +1150,7 @@ page_not_uptodate: lock_page(page); /* Somebody truncated the page on us? */ - if (!page->mapping) { + if (!page_mapping(page)) { unlock_page(page); page_cache_release(page); goto retry_all; @@ -1216,8 +1231,8 @@ no_cached_page: page_not_uptodate: lock_page(page); - /* Did it get unhashed while we waited for it? */ - if (!page->mapping) { + /* Did it get removed from the radix tree while we waited for it? */ + if (!page_mapping(page)) { unlock_page(page); goto err; } @@ -1243,7 +1258,7 @@ page_not_uptodate: lock_page(page); /* Somebody truncated the page on us? */ - if (!page->mapping) { + if (!page_mapping(page)) { unlock_page(page); goto err; } @@ -1428,7 +1443,7 @@ retry: goto out; lock_page(page); - if (!page->mapping) { + if (!page_mapping(page)) { unlock_page(page); page_cache_release(page); goto retry; diff -prauN linux-2.6.0-test11/mm/fremap.c wli-2.6.0-test11-30/mm/fremap.c --- linux-2.6.0-test11/mm/fremap.c 2003-11-26 12:42:50.000000000 -0800 +++ wli-2.6.0-test11-30/mm/fremap.c 2003-12-04 08:28:46.000000000 -0800 @@ -12,13 +12,16 @@ #include #include #include -#include +#include #include #include #include #include +/* + * This is never done to an anonymous page so page->mapping is never altered. 
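+ *
+ * Returns nonzero when a present pte was cleared, in which case the
+ * caller is responsible for the TLB flush; see install_page() and
+ * install_file_pte() below.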
+ */ static inline int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { @@ -30,13 +33,13 @@ static inline int zap_pte(struct mm_stru unsigned long pfn = pte_pfn(pte); flush_cache_page(vma, addr); - pte = ptep_get_and_clear(ptep); + pte = vm_ptep_get_and_clear(vma, ptep, addr); if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); if (!PageReserved(page)) { if (pte_dirty(pte)) set_page_dirty(page); - page_remove_rmap(page, ptep); + page_remove_rmap(page); page_cache_release(page); mm->rss--; } @@ -45,7 +48,7 @@ static inline int zap_pte(struct mm_stru } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); - pte_clear(ptep); + vm_pte_clear(vma, ptep, addr); return 0; } } @@ -62,19 +65,18 @@ int install_page(struct mm_struct *mm, s pgd_t *pgd; pmd_t *pmd; pte_t pte_val; - struct pte_chain *pte_chain; - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto err; pgd = pgd_offset(mm, addr); + if (!rmap_get_cpu()) + goto err; spin_lock(&mm->page_table_lock); + put_cpu(); - pmd = pmd_alloc(mm, pgd, addr); + pmd = pmd_alloc_map(mm, pgd, addr); if (!pmd) goto err_unlock; - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map(mm, pgd, &pmd, addr); if (!pte) goto err_unlock; @@ -82,20 +84,20 @@ int install_page(struct mm_struct *mm, s mm->rss++; flush_icache_page(vma, page); - set_pte(pte, mk_pte(page, prot)); - pte_chain = page_add_rmap(page, pte, pte_chain); + vm_set_pte(vma, pte, mk_pte(page, prot), addr); + if (!PageReserved(page)) + page_add_rmap(page, vma, addr, 0); pte_val = *pte; pte_unmap(pte); + pmd_unmap(pmd); if (flush) flush_tlb_page(vma, addr); update_mmu_cache(vma, addr, pte_val); spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); return 0; err_unlock: spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); err: return err; } @@ -118,19 +120,22 @@ int install_file_pte(struct mm_struct *m pgd = pgd_offset(mm, addr); spin_lock(&mm->page_table_lock); - pmd = pmd_alloc(mm, pgd, addr); + pmd = pmd_alloc_map(mm, pgd, addr); if (!pmd) goto err_unlock; - pte = pte_alloc_map(mm, pmd, addr); - if (!pte) + pte = pte_alloc_map(mm, pgd, &pmd, addr); + if (!pte) { + pmd_unmap(pmd); goto err_unlock; + } flush = zap_pte(mm, vma, addr, pte); - set_pte(pte, pgoff_to_pte(pgoff)); + vm_set_pte(vma, pte, pgoff_to_pte(pgoff), addr); pte_val = *pte; pte_unmap(pte); + pmd_unmap(pmd); if (flush) flush_tlb_page(vma, addr); update_mmu_cache(vma, addr, pte_val); @@ -200,10 +205,22 @@ long sys_remap_file_pages(unsigned long vma->vm_ops && vma->vm_ops->populate && end > start && start >= vma->vm_start && end <= vma->vm_end) { + unsigned long index; + index = (start - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff; - /* Must set VM_NONLINEAR before any pages are populated. */ - if (pgoff != ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff) + /* + * Must set VM_NONLINEAR before any pages are populated. + * Take ->i_shared_lock to lock out invalidate_mmap_range(). + */ + if (pgoff != index && !(vma->vm_flags & VM_NONLINEAR)) { + struct file *file = vma->vm_file; + struct address_space *mapping; + + mapping = file->f_dentry->d_inode->i_mapping; + spin_lock(&mapping->i_shared_lock); vma->vm_flags |= VM_NONLINEAR; + spin_unlock(&mapping->i_shared_lock); + } /* ->populate can take a long time, so downgrade the lock. 
*/ downgrade_write(&mm->mmap_sem); diff -prauN linux-2.6.0-test11/mm/memory.c wli-2.6.0-test11-30/mm/memory.c --- linux-2.6.0-test11/mm/memory.c 2003-11-26 12:43:52.000000000 -0800 +++ wli-2.6.0-test11-30/mm/memory.c 2003-12-04 08:47:49.000000000 -0800 @@ -43,11 +43,10 @@ #include #include #include -#include +#include #include #include -#include #include #include #include @@ -103,7 +102,7 @@ static inline void free_one_pmd(struct m } page = pmd_page(*dir); pmd_clear(dir); - pgtable_remove_rmap(page); + dec_page_state(nr_page_table_pages); pte_free_tlb(tlb, page); } @@ -111,6 +110,7 @@ static inline void free_one_pgd(struct m { int j; pmd_t * pmd; + struct page *page; if (pgd_none(*dir)) return; @@ -119,11 +119,13 @@ static inline void free_one_pgd(struct m pgd_clear(dir); return; } - pmd = pmd_offset(dir, 0); + page = pgd_page(*dir); + pmd = pmd_offset_map(dir, 0); pgd_clear(dir); for (j = 0; j < PTRS_PER_PMD ; j++) free_one_pmd(tlb, pmd+j); - pmd_free_tlb(tlb, pmd); + pmd_unmap(pmd); + pmd_free_tlb(tlb, page); } /* @@ -143,30 +145,38 @@ void clear_page_tables(struct mmu_gather } while (--nr); } -pte_t * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +/* + * error return happens with pmd unmapped + */ +pte_t *pte_alloc_map(struct mm_struct *mm, pgd_t *pgd, pmd_t **pmd, unsigned long addr) { - if (!pmd_present(*pmd)) { + if (!pmd_present(**pmd)) { struct page *new; + pmd_unmap(*pmd); spin_unlock(&mm->page_table_lock); - new = pte_alloc_one(mm, address); + new = pte_alloc_one(mm, addr); spin_lock(&mm->page_table_lock); - if (!new) + if (!new) { + *pmd = NULL; return NULL; + } + + *pmd = pmd_offset_map(pgd, addr); /* * Because we dropped the lock, we should re-check the * entry, as somebody else could have populated it.. */ - if (pmd_present(*pmd)) { + if (pmd_present(**pmd)) { pte_free(new); goto out; } - pgtable_add_rmap(new, mm, address); - pmd_populate(mm, pmd, new); + inc_page_state(nr_page_table_pages); + pmd_populate(mm, *pmd, new); } out: - return pte_offset_map(pmd, address); + return pte_offset_map(*pmd, addr); } pte_t * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address) @@ -188,7 +198,7 @@ pte_t * pte_alloc_kernel(struct mm_struc pte_free_kernel(new); goto out; } - pgtable_add_rmap(virt_to_page(new), mm, address); + inc_page_state(nr_page_table_pages); pmd_populate_kernel(mm, pmd, new); } out: @@ -206,7 +216,7 @@ out: * variable count and make things faster. -jj * * dst->page_table_lock is held on entry and exit, - * but may be dropped within pmd_alloc() and pte_alloc_map(). + * but may be dropped within pmd_alloc_map() and pte_alloc_map(). 
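+ * (Editor's note: pte_alloc_map() now takes the pgd and a pmd_t **
+ * because the pmd mapping is dropped along with the lock and has to
+ * be re-established via pmd_offset_map() before the pte can be
+ * mapped; on failure it returns with the pmd unmapped and *pmd set
+ * to NULL, as the definition earlier in this file shows.)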
*/ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) @@ -215,20 +225,10 @@ int copy_page_range(struct mm_struct *ds unsigned long address = vma->vm_start; unsigned long end = vma->vm_end; unsigned long cow; - struct pte_chain *pte_chain = NULL; if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst, src, vma); - pte_chain = pte_chain_alloc(GFP_ATOMIC); - if (!pte_chain) { - spin_unlock(&dst->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - spin_lock(&dst->page_table_lock); - if (!pte_chain) - goto nomem; - } - cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; src_pgd = pgd_offset(src, address)-1; dst_pgd = pgd_offset(dst, address)-1; @@ -251,11 +251,10 @@ skip_copy_pmd_range: address = (address continue; } - src_pmd = pmd_offset(src_pgd, address); - dst_pmd = pmd_alloc(dst, dst_pgd, address); + dst_pmd = pmd_alloc_map(dst, dst_pgd, address); if (!dst_pmd) goto nomem; - + src_pmd = pmd_offset_map_nested(src_pgd, address); do { pte_t * src_pte, * dst_pte; @@ -268,15 +267,20 @@ skip_copy_pmd_range: address = (address pmd_clear(src_pmd); skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; - if (address >= end) + if (address >= end) { + pmd_unmap(dst_pmd); + pmd_unmap_nested(src_pmd); goto out; + } goto cont_copy_pmd_range; } - dst_pte = pte_alloc_map(dst, dst_pmd, address); + pmd_unmap_nested(src_pmd); + dst_pte = pte_alloc_map(dst, dst_pgd, &dst_pmd, address); if (!dst_pte) goto nomem; spin_lock(&src->page_table_lock); + src_pmd = pmd_offset_map_nested(src_pgd, address); src_pte = pte_offset_map_nested(src_pmd, address); do { pte_t pte = *src_pte; @@ -291,8 +295,7 @@ skip_copy_pte_range: if (!pte_present(pte)) { if (!pte_file(pte)) swap_duplicate(pte_to_swp_entry(pte)); - set_pte(dst_pte, pte); - goto cont_copy_pte_range_noset; + goto cont_copy_pte_range; } pfn = pte_pfn(pte); /* the pte points outside of valid memory, the @@ -300,13 +303,13 @@ skip_copy_pte_range: * and not mapped via rmap - duplicate the * mapping as is. */ - page = NULL; - if (pfn_valid(pfn)) - page = pfn_to_page(pfn); - - if (!page || PageReserved(page)) { - set_pte(dst_pte, pte); - goto cont_copy_pte_range_noset; + if (!pfn_valid(pfn)) { + page = NULL; + goto cont_copy_pte_range; + } else { + page = pfn_to_page(pfn); + if (PageReserved(page)) + goto cont_copy_pte_range; } /* @@ -314,7 +317,7 @@ skip_copy_pte_range: * in the parent and the child */ if (cow) { - ptep_set_wrprotect(src_pte); + vm_ptep_set_wrprotect(src, src_pte); pte = *src_pte; } @@ -327,35 +330,14 @@ skip_copy_pte_range: pte = pte_mkold(pte); get_page(page); dst->rss++; - - set_pte(dst_pte, pte); - pte_chain = page_add_rmap(page, dst_pte, - pte_chain); - if (pte_chain) - goto cont_copy_pte_range_noset; - pte_chain = pte_chain_alloc(GFP_ATOMIC); - if (pte_chain) - goto cont_copy_pte_range_noset; - - /* - * pte_chain allocation failed, and we need to - * run page reclaim. 
- */ - pte_unmap_nested(src_pte); - pte_unmap(dst_pte); - spin_unlock(&src->page_table_lock); - spin_unlock(&dst->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - spin_lock(&dst->page_table_lock); - if (!pte_chain) - goto nomem; - spin_lock(&src->page_table_lock); - dst_pte = pte_offset_map(dst_pmd, address); - src_pte = pte_offset_map_nested(src_pmd, - address); + page_dup_rmap(page); +cont_copy_pte_range: + vm_set_pte(vma, dst_pte, pte, address); cont_copy_pte_range_noset: address += PAGE_SIZE; if (address >= end) { + pmd_unmap(dst_pmd); + pmd_unmap_nested(src_pmd); pte_unmap_nested(src_pte); pte_unmap(dst_pte); goto out_unlock; @@ -371,19 +353,19 @@ cont_copy_pmd_range: src_pmd++; dst_pmd++; } while ((unsigned long)src_pmd & PMD_TABLE_MASK); + pmd_unmap_nested(src_pmd-1); + pmd_unmap(dst_pmd-1); } out_unlock: spin_unlock(&src->page_table_lock); out: - pte_chain_free(pte_chain); return 0; nomem: - pte_chain_free(pte_chain); return -ENOMEM; } static void -zap_pte_range(struct mmu_gather *tlb, pmd_t * pmd, +zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, unsigned long size) { unsigned long offset; @@ -408,32 +390,32 @@ zap_pte_range(struct mmu_gather *tlb, pm if (pte_present(pte)) { unsigned long pfn = pte_pfn(pte); - pte = ptep_get_and_clear(ptep); + pte = vm_ptep_get_and_clear(vma, ptep, address + offset); tlb_remove_tlb_entry(tlb, ptep, address+offset); if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); if (!PageReserved(page)) { if (pte_dirty(pte)) set_page_dirty(page); - if (page->mapping && pte_young(pte) && + if (page_mapping(page) && pte_young(pte) && !PageSwapCache(page)) mark_page_accessed(page); tlb->freed++; - page_remove_rmap(page, ptep); + page_remove_rmap(page); tlb_remove_page(tlb, page); } } } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); - pte_clear(ptep); + vm_pte_clear(vma, ptep, address); } } pte_unmap(ptep-1); } static void -zap_pmd_range(struct mmu_gather *tlb, pgd_t * dir, +zap_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pgd_t *dir, unsigned long address, unsigned long size) { pmd_t * pmd; @@ -446,15 +428,16 @@ zap_pmd_range(struct mmu_gather *tlb, pg pgd_clear(dir); return; } - pmd = pmd_offset(dir, address); + pmd = pmd_offset_map(dir, address); end = address + size; if (end > ((address + PGDIR_SIZE) & PGDIR_MASK)) end = ((address + PGDIR_SIZE) & PGDIR_MASK); do { - zap_pte_range(tlb, pmd, address, end - address); + zap_pte_range(tlb, vma, pmd, address, end - address); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address < end); + pmd_unmap(pmd - 1); } void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, @@ -472,7 +455,7 @@ void unmap_page_range(struct mmu_gather dir = pgd_offset(vma->vm_mm, address); tlb_start_vma(tlb, vma); do { - zap_pmd_range(tlb, dir, address, end - address); + zap_pmd_range(tlb, vma, dir, address, end - address); address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); @@ -636,20 +619,27 @@ follow_page(struct mm_struct *mm, unsign if (pgd_none(*pgd) || pgd_bad(*pgd)) goto out; - pmd = pmd_offset(pgd, address); + pmd = pmd_offset_map(pgd, address); if (pmd_none(*pmd)) - goto out; - if (pmd_huge(*pmd)) - return follow_huge_pmd(mm, address, pmd, write); - if (pmd_bad(*pmd)) - goto out; + goto out_unmap; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + goto out_unmap; + } + if (pmd_huge(*pmd)) { + struct page *page = follow_huge_pmd(mm, address, 
pmd, write); + pmd_unmap(pmd); + return page; + } ptep = pte_offset_map(pmd, address); if (!ptep) - goto out; + goto out_unmap; pte = *ptep; pte_unmap(ptep); + pmd_unmap(pmd); if (pte_present(pte)) { if (!write || (pte_write(pte) && pte_dirty(pte))) { pfn = pte_pfn(pte); @@ -664,6 +654,9 @@ follow_page(struct mm_struct *mm, unsign out: return NULL; +out_unmap: + pmd_unmap(pmd); + goto out; } /* @@ -722,7 +715,7 @@ int get_user_pages(struct task_struct *t pgd = pgd_offset_k(pg); if (!pgd) return i ? : -EFAULT; - pmd = pmd_offset(pgd, pg); + pmd = pmd_offset_kernel(pgd, pg); if (!pmd) return i ? : -EFAULT; pte = pte_offset_kernel(pmd, pg); @@ -798,8 +791,8 @@ out: EXPORT_SYMBOL(get_user_pages); -static void zeromap_pte_range(pte_t * pte, unsigned long address, - unsigned long size, pgprot_t prot) +static void zeromap_pte_range(struct vm_area_struct *vma, pte_t *pte, + unsigned long address, unsigned long size, pgprot_t prot) { unsigned long end; @@ -810,14 +803,14 @@ static void zeromap_pte_range(pte_t * pt do { pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot)); BUG_ON(!pte_none(*pte)); - set_pte(pte, zero_pte); + vm_set_pte(vma, pte, zero_pte, address); address += PAGE_SIZE; pte++; } while (address && (address < end)); } -static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, - unsigned long size, pgprot_t prot) +static inline int zeromap_pmd_range(struct vm_area_struct *vma, pgd_t *pgd, + pmd_t **pmd, unsigned long address, unsigned long size, pgprot_t prot) { unsigned long base, end; @@ -827,13 +820,13 @@ static inline int zeromap_pmd_range(stru if (end > PGDIR_SIZE) end = PGDIR_SIZE; do { - pte_t * pte = pte_alloc_map(mm, pmd, base + address); + pte_t *pte = pte_alloc_map(vma->vm_mm, pgd, pmd, base + address); if (!pte) return -ENOMEM; - zeromap_pte_range(pte, base + address, end - address, prot); + zeromap_pte_range(vma, pte, base + address, end - address, prot); pte_unmap(pte); address = (address + PMD_SIZE) & PMD_MASK; - pmd++; + (*pmd)++; } while (address && (address < end)); return 0; } @@ -853,13 +846,14 @@ int zeromap_page_range(struct vm_area_st spin_lock(&mm->page_table_lock); do { - pmd_t *pmd = pmd_alloc(mm, dir, address); + pmd_t *pmd = pmd_alloc_map(mm, dir, address); error = -ENOMEM; if (!pmd) break; - error = zeromap_pmd_range(mm, pmd, address, end - address, prot); + error = zeromap_pmd_range(vma, dir, &pmd, address, end - address, prot); if (error) break; + pmd_unmap(pmd - 1); address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); @@ -873,8 +867,9 @@ int zeromap_page_range(struct vm_area_st * mappings are removed. 
any references to nonexistent pages results * in null mappings (currently treated as "copy-on-access") */ -static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size, - unsigned long phys_addr, pgprot_t prot) +static inline void remap_pte_range(struct vm_area_struct *vma, pte_t *pte, + unsigned long address, unsigned long size, + unsigned long phys_addr, pgprot_t prot) { unsigned long end; unsigned long pfn; @@ -887,15 +882,16 @@ static inline void remap_pte_range(pte_t do { BUG_ON(!pte_none(*pte)); if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) - set_pte(pte, pfn_pte(pfn, prot)); + vm_set_pte(vma, pte, pfn_pte(pfn, prot), address); address += PAGE_SIZE; pfn++; pte++; } while (address && (address < end)); } -static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size, - unsigned long phys_addr, pgprot_t prot) +static inline int remap_pmd_range(struct vm_area_struct *vma, pgd_t *pgd, + pmd_t **pmd, unsigned long address, unsigned long size, + unsigned long phys_addr, pgprot_t prot) { unsigned long base, end; @@ -906,13 +902,13 @@ static inline int remap_pmd_range(struct end = PGDIR_SIZE; phys_addr -= address; do { - pte_t * pte = pte_alloc_map(mm, pmd, base + address); + pte_t *pte = pte_alloc_map(vma->vm_mm, pgd, pmd, base + address); if (!pte) return -ENOMEM; - remap_pte_range(pte, base + address, end - address, address + phys_addr, prot); + remap_pte_range(vma, pte, base + address, end - address, address + phys_addr, prot); pte_unmap(pte); address = (address + PMD_SIZE) & PMD_MASK; - pmd++; + (*pmd)++; } while (address && (address < end)); return 0; } @@ -934,13 +930,14 @@ int remap_page_range(struct vm_area_stru spin_lock(&mm->page_table_lock); do { - pmd_t *pmd = pmd_alloc(mm, dir, from); + pmd_t *pmd = pmd_alloc_map(mm, dir, from); error = -ENOMEM; if (!pmd) break; - error = remap_pmd_range(mm, pmd, from, end - from, phys_addr + from, prot); + error = remap_pmd_range(vma, dir, &pmd, from, end - from, phys_addr + from, prot); if (error) break; + pmd_unmap(pmd - 1); from = (from + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (from && (from < end)); @@ -959,9 +956,10 @@ EXPORT_SYMBOL(remap_page_range); * * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock */ -static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry) +static inline void establish_pte(struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pte_t entry) { - set_pte(page_table, entry); + vm_set_pte(vma, page_table, entry, address); flush_tlb_page(vma, address); update_mmu_cache(vma, address, entry); } @@ -969,8 +967,9 @@ static inline void establish_pte(struct /* * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock */ -static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, - pte_t *page_table) +static inline void break_cow(struct vm_area_struct *vma, + struct page *new_page, unsigned long address, + pte_t *page_table) { flush_cache_page(vma, address); establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); @@ -1001,7 +1000,6 @@ static int do_wp_page(struct mm_struct * { struct page *old_page, *new_page; unsigned long pfn = pte_pfn(pte); - struct pte_chain *pte_chain; if (unlikely(!pfn_valid(pfn))) { /* @@ -1010,6 +1008,7 @@ static int do_wp_page(struct mm_struct * * data, but for the moment just pretend this is OOM. 
*/ pte_unmap(page_table); + pmd_unmap(pmd); printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", address); spin_unlock(&mm->page_table_lock); @@ -1019,17 +1018,22 @@ do_wp_page if (!TestSetPageLocked(old_page)) { int reuse = can_share_swap_page(old_page); - unlock_page(old_page); - if (reuse) { + if (!reuse) + unlock_page(old_page); + else { flush_cache_page(vma, address); establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); + page_turn_rmap(old_page, vma); pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); + unlock_page(old_page); return VM_FAULT_MINOR; } } pte_unmap(page_table); + pmd_unmap(pmd); /* * Ok, we need to copy. Oh, well.. @@ -1037,9 +1041,6 @@ do_wp_page page_cache_get(old_page); spin_unlock(&mm->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto no_pte_chain; new_page = alloc_page(GFP_HIGHUSER); if (!new_page) goto no_new_page; @@ -1049,32 +1050,132 @@ do_wp_page * Re-check the pte - we dropped the lock */ spin_lock(&mm->page_table_lock); + pmd = pmd_offset_map(pgd_offset(mm, address), address); page_table = pte_offset_map(pmd, address); if (pte_same(*page_table, pte)) { if (PageReserved(old_page)) ++mm->rss; - page_remove_rmap(old_page, page_table); + else + page_remove_rmap(old_page); break_cow(vma, new_page, address, page_table); - pte_chain = page_add_rmap(new_page, page_table, pte_chain); + page_add_rmap(new_page, vma, address, 1); lru_cache_add_active(new_page); /* Free the old page.. */ new_page = old_page; } pte_unmap(page_table); + pmd_unmap(pmd); page_cache_release(new_page); page_cache_release(old_page); spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); return VM_FAULT_MINOR; no_new_page: - pte_chain_free(pte_chain); -no_pte_chain: page_cache_release(old_page); return VM_FAULT_OOM; } +static void +invalidate_mmap_nonlinear_range(struct vm_area_struct *vma, + const unsigned long pgoff, + const unsigned long len) +{ + unsigned long addr; + pgd_t *pgd; + struct mmu_gather *tlb; + + spin_lock(&vma->vm_mm->page_table_lock); + addr = vma->vm_start; + pgd = pgd_offset(vma->vm_mm, addr); + tlb = tlb_gather_mmu(vma->vm_mm, vma->vm_start); + + tlb_start_vma(tlb, vma); + while (1) { + pmd_t *pmd; + + if (pgd_none(*pgd)) { + /* skip_pgd advances addr; advancing here too skipped a pgd */ + goto skip_pgd; + } else if (pgd_bad(*pgd)) { + pgd_ERROR(*pgd); + pgd_clear(pgd); +skip_pgd: addr = (addr + PGDIR_SIZE) & PGDIR_MASK; + if (!addr || addr >= vma->vm_end) + break; + goto next_pgd; + } + + pmd = pmd_offset_map(pgd, addr); + do { + pte_t *pte; + + if (pmd_none(*pmd)) { + goto skip_pmd; + } else if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); +skip_pmd: addr = (addr + PMD_SIZE) & PMD_MASK; + if (!addr || addr >= vma->vm_end) { + pmd_unmap(pmd); + goto out; + } + goto next_pmd; + } + pte = pte_offset_map(pmd, addr); + do { + unsigned long pfn; + struct page *page; + + if (pte_none(*pte)) + goto next_pte; + if (!pte_present(*pte)) { + unsigned long index; + if (!pte_file(*pte)) + goto next_pte; + index = pte_to_pgoff(*pte); + if (index >= pgoff && + index - pgoff < len) + vm_pte_clear(vma, pte, addr); + goto next_pte; + } + pfn = pte_pfn(*pte); + if (!pfn_valid(pfn)) + goto next_pte; + page = pfn_to_page(pfn); + if (page->index < pgoff || + page->index - pgoff >= len) + goto next_pte; + tlb_remove_tlb_entry(tlb, pte, addr); + if (pte_dirty(*pte)) + set_page_dirty(page); + if 
(page_mapping(page) && + pte_young(*pte) && + !PageSwapCache(page)) + mark_page_accessed(page); + tlb->freed++; + page_remove_rmap(page); + tlb_remove_page(tlb, page); + vm_pte_clear(vma, pte, addr); +next_pte: addr += PAGE_SIZE; + if (addr >= vma->vm_end) { + pmd_unmap(pmd); + pte_unmap(pte); + goto out; + } + ++pte; + } while ((unsigned long)pte & PTE_TABLE_MASK); + pte_unmap(pte - 1); +next_pmd: ++pmd; + } while ((unsigned long)pmd & PMD_TABLE_MASK); + pmd_unmap(pmd - 1); +next_pgd: ++pgd; + } +out: tlb_end_vma(tlb, vma); + tlb_finish_mmu(tlb, vma->vm_start, vma->vm_end); + spin_unlock(&vma->vm_mm->page_table_lock); +} + /* * Helper function for invalidate_mmap_range(). * Both hba and hlen are page numbers in PAGE_SIZE units. @@ -1096,17 +1197,35 @@ invalidate_mmap_range_list(struct list_h hea = hba + hlen - 1; /* avoid overflow. */ if (hea < hba) hea = ULONG_MAX; - list_for_each(curr, head) { + list_for_each_rcu(curr, head) { + struct mmu_gather *tlb; + unsigned long start, end; + vp = list_entry(curr, struct vm_area_struct, shared); + + if (vp->vm_flags & VM_DEAD) + continue; + + if (unlikely(vp->vm_flags & VM_NONLINEAR)) { + invalidate_mmap_nonlinear_range(vp, hba, hlen); + continue; + } + vba = vp->vm_pgoff; vea = vba + ((vp->vm_end - vp->vm_start) >> PAGE_SHIFT) - 1; if (hea < vba || vea < hba) continue; /* Mapping disjoint from hole. */ zba = (hba <= vba) ? vba : hba; zea = (vea <= hea) ? vea : hea; - zap_page_range(vp, - ((zba - vba) << PAGE_SHIFT) + vp->vm_start, - (zea - zba + 1) << PAGE_SHIFT); + + start = vp->vm_start + ((zba - vba) << PAGE_SHIFT); + end = start + ((zea - zba + 1) << PAGE_SHIFT); + + spin_lock(&vp->vm_mm->page_table_lock); + tlb = tlb_gather_mmu(vp->vm_mm, 0); + unmap_page_range(tlb, vp, start, end); + tlb_finish_mmu(tlb, start, end); + spin_unlock(&vp->vm_mm->page_table_lock); } } @@ -1138,14 +1257,14 @@ void invalidate_mmap_range(struct addres if (holeend & ~(long long)ULONG_MAX) hlen = ULONG_MAX - hba + 1; } - down(&mapping->i_shared_sem); + rcu_read_lock(); /* Protect against page fault */ atomic_inc(&mapping->truncate_count); if (unlikely(!list_empty(&mapping->i_mmap))) invalidate_mmap_range_list(&mapping->i_mmap, hba, hlen); if (unlikely(!list_empty(&mapping->i_mmap_shared))) invalidate_mmap_range_list(&mapping->i_mmap_shared, hba, hlen); - up(&mapping->i_shared_sem); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(invalidate_mmap_range); @@ -1228,9 +1347,9 @@ static int do_swap_page(struct mm_struct swp_entry_t entry = pte_to_swp_entry(orig_pte); pte_t pte; int ret = VM_FAULT_MINOR; - struct pte_chain *pte_chain = NULL; pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); page = lookup_swap_cache(entry); if (!page) { @@ -1242,12 +1361,14 @@ static int do_swap_page(struct mm_struct * we released the page table lock. 
*/ spin_lock(&mm->page_table_lock); + pmd = pmd_offset_map(pgd_offset(mm, address), address); page_table = pte_offset_map(pmd, address); if (pte_same(*page_table, orig_pte)) ret = VM_FAULT_OOM; else ret = VM_FAULT_MINOR; pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); goto out; } @@ -1258,26 +1379,27 @@ static int do_swap_page(struct mm_struct } mark_page_accessed(page); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) { - ret = -ENOMEM; - goto out; - } lock_page(page); + if (!rmap_get_cpu()) { + ret = VM_FAULT_OOM; + goto outrel; + } + spin_lock(&mm->page_table_lock); + put_cpu(); + pmd = pmd_offset_map(pgd_offset(mm, address), address); + page_table = pte_offset_map(pmd, address); + /* * Back out if somebody else faulted in this pte while we * released the page table lock. */ - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, address); if (!pte_same(*page_table, orig_pte)) { pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); - unlock_page(page); - page_cache_release(page); ret = VM_FAULT_MINOR; - goto out; + goto outrel; } /* The page isn't present yet, go ahead with the fault. */ @@ -1290,19 +1412,23 @@ static int do_swap_page(struct mm_struct pte = mk_pte(page, vma->vm_page_prot); if (write_access && can_share_swap_page(page)) pte = pte_mkdirty(pte_mkwrite(pte)); - unlock_page(page); flush_icache_page(vma, page); - set_pte(page_table, pte); - pte_chain = page_add_rmap(page, page_table, pte_chain); + vm_set_pte(vma, page_table, pte, address); + page_add_rmap(page, vma, address, 1); + unlock_page(page); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); + pmd_unmap(pmd); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); out: - pte_chain_free(pte_chain); return ret; +outrel: + unlock_page(page); + page_cache_release(page); + goto out; } /* @@ -1317,20 +1443,8 @@ do_anonymous_page(struct mm_struct *mm, { pte_t entry; struct page * page = ZERO_PAGE(addr); - struct pte_chain *pte_chain; int ret; - pte_chain = pte_chain_alloc(GFP_ATOMIC); - if (!pte_chain) { - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto no_mem; - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, addr); - } - /* Read-only mapping of ZERO_PAGE. */ entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); @@ -1338,6 +1452,7 @@ do_anonymous_page(struct mm_struct *mm, if (write_access) { /* Allocate our own private page. 
*/ pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); page = alloc_page(GFP_HIGHUSER); @@ -1346,9 +1461,11 @@ do_anonymous_page(struct mm_struct *mm, clear_user_highpage(page, addr); spin_lock(&mm->page_table_lock); + pmd = pmd_offset_map(pgd_offset(mm, addr), addr); page_table = pte_offset_map(pmd, addr); if (!pte_none(*page_table)) { + pmd_unmap(pmd); pte_unmap(page_table); page_cache_release(page); spin_unlock(&mm->page_table_lock); @@ -1357,26 +1474,26 @@ do_anonymous_page(struct mm_struct *mm, } mm->rss++; entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); + } + + vm_set_pte(vma, page_table, entry, addr); + if (write_access) { + page_add_rmap(page, vma, addr, 1); lru_cache_add_active(page); mark_page_accessed(page); } - - set_pte(page_table, entry); - /* ignores ZERO_PAGE */ - pte_chain = page_add_rmap(page, page_table, pte_chain); + pmd_unmap(pmd); pte_unmap(page_table); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, entry); spin_unlock(&mm->page_table_lock); ret = VM_FAULT_MINOR; - goto out; - -no_mem: - ret = VM_FAULT_OOM; out: - pte_chain_free(pte_chain); return ret; +no_mem: + ret = VM_FAULT_OOM; + goto out; } /* @@ -1398,14 +1515,14 @@ do_no_page(struct mm_struct *mm, struct struct page * new_page; struct address_space *mapping = NULL; pte_t entry; - struct pte_chain *pte_chain; int sequence = 0; - int ret; + int ret = VM_FAULT_MINOR, anon = 0; if (!vma->vm_ops || !vma->vm_ops->nopage) return do_anonymous_page(mm, vma, page_table, pmd, write_access, address); pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); if (vma->vm_file) { @@ -1414,7 +1531,7 @@ do_no_page(struct mm_struct *mm, struct } smp_rmb(); /* Prevent CPU from reordering lock-free ->nopage() */ retry: - new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0); + new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); /* no page was available -- either SIGBUS or OOM */ if (new_page == NOPAGE_SIGBUS) @@ -1422,26 +1539,25 @@ retry: if (new_page == NOPAGE_OOM) return VM_FAULT_OOM; - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto oom; - /* * Should we do an early C-O-W break? */ if (write_access && !(vma->vm_flags & VM_SHARED)) { struct page * page = alloc_page(GFP_HIGHUSER); - if (!page) { - page_cache_release(new_page); + if (!page) goto oom; - } + /* start with refcount 1 */ copy_user_highpage(page, new_page, address); page_cache_release(new_page); - lru_cache_add_active(page); + anon = 1; new_page = page; } + if (!rmap_get_cpu()) + goto oom; spin_lock(&mm->page_table_lock); + put_cpu(); + /* * For a file-backed vma, someone could have truncated or otherwise * invalidated this page. 
If invalidate_mmap_range got called, @@ -1452,9 +1568,9 @@ retry: sequence = atomic_read(&mapping->truncate_count); spin_unlock(&mm->page_table_lock); page_cache_release(new_page); - pte_chain_free(pte_chain); goto retry; } + pmd = pmd_offset_map(pgd_offset(mm, address), address); page_table = pte_offset_map(pmd, address); /* @@ -1475,28 +1591,43 @@ retry: entry = mk_pte(new_page, vma->vm_page_prot); if (write_access) entry = pte_mkwrite(pte_mkdirty(entry)); - set_pte(page_table, entry); - pte_chain = page_add_rmap(new_page, page_table, pte_chain); + vm_set_pte(vma, page_table, entry, address); + + /* + * PG_locked not held for the anon case, but we have a + * unique reference, and ->__mapping is untouched when file-backed + */ + if (!PageReserved(new_page)) + page_add_rmap(new_page, vma, address, anon); + + /* kswapd can find us now, but we're already prepped */ + if (anon) + lru_cache_add_active(new_page); pte_unmap(page_table); + pmd_unmap(pmd); } else { /* One of our sibling threads was faster, back out. */ pte_unmap(page_table); + pmd_unmap(pmd); + /* + * In the anon case, we never hit the LRU, so we free instantly, + * where in mainline the LRU retains a reference. In the file- + * backed case, we merely release a reference acquired earlier. + */ page_cache_release(new_page); spin_unlock(&mm->page_table_lock); - ret = VM_FAULT_MINOR; goto out; } /* no need to invalidate: a not-present page shouldn't be cached */ update_mmu_cache(vma, address, entry); spin_unlock(&mm->page_table_lock); - ret = VM_FAULT_MAJOR; - goto out; -oom: - ret = VM_FAULT_OOM; out: - pte_chain_free(pte_chain); return ret; +oom: + page_cache_release(new_page); + ret = VM_FAULT_OOM; + goto out; } /* @@ -1517,13 +1648,14 @@ static int do_file_page(struct mm_struct */ if (!vma->vm_ops || !vma->vm_ops->populate || (write_access && !(vma->vm_flags & VM_SHARED))) { - pte_clear(pte); + vm_pte_clear(vma, pte, address); return do_no_page(mm, vma, address, write_access, pte, pmd); } pgoff = pte_to_pgoff(*pte); pte_unmap(pte); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0); @@ -1584,6 +1716,7 @@ static inline int handle_pte_fault(struc entry = pte_mkyoung(entry); establish_pte(vma, address, pte, entry); pte_unmap(pte); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); return VM_FAULT_MINOR; } @@ -1610,10 +1743,10 @@ int handle_mm_fault(struct mm_struct *mm * and the SMP-safe atomic PTE updates. */ spin_lock(&mm->page_table_lock); - pmd = pmd_alloc(mm, pgd, address); + pmd = pmd_alloc_map(mm, pgd, address); if (pmd) { - pte_t * pte = pte_alloc_map(mm, pmd, address); + pte_t *pte = pte_alloc_map(mm, pgd, &pmd, address); if (pte) return handle_pte_fault(mm, vma, address, write_access, pte, pmd); } @@ -1632,10 +1765,33 @@ int handle_mm_fault(struct mm_struct *mm */ pmd_t *__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { + struct page *page; + + spin_unlock(&mm->page_table_lock); + page = pmd_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + if (!page) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. 
+ */ + if (pgd_present(*pgd)) { + pmd_free(page); + goto out; + } + pgd_populate(mm, pgd, page); +out: + return pmd_offset_map(pgd, address); +} + +pmd_t *__pmd_alloc_kernel(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ pmd_t *new; spin_unlock(&mm->page_table_lock); - new = pmd_alloc_one(mm, address); + new = pmd_alloc_one_kernel(mm, address); spin_lock(&mm->page_table_lock); if (!new) return NULL; @@ -1645,12 +1801,12 @@ pmd_t *__pmd_alloc(struct mm_struct *mm, * entry, as somebody else could have populated it.. */ if (pgd_present(*pgd)) { - pmd_free(new); + pmd_free(virt_to_page(new)); goto out; } - pgd_populate(mm, pgd, new); + pgd_populate(mm, pgd, virt_to_page(new)); out: - return pmd_offset(pgd, address); + return pmd_offset_kernel(pgd, address); } int make_pages_present(unsigned long addr, unsigned long end) @@ -1684,7 +1840,7 @@ struct page * vmalloc_to_page(void * vma pte_t *ptep, pte; if (!pgd_none(*pgd)) { - pmd = pmd_offset(pgd, addr); + pmd = pmd_offset_map(pgd, addr); if (!pmd_none(*pmd)) { preempt_disable(); ptep = pte_offset_map(pmd, addr); @@ -1694,6 +1850,7 @@ struct page * vmalloc_to_page(void * vma pte_unmap(ptep); preempt_enable(); } + pmd_unmap(pmd); } return page; } diff -prauN linux-2.6.0-test11/mm/mincore.c wli-2.6.0-test11-30/mm/mincore.c --- linux-2.6.0-test11/mm/mincore.c 2003-11-26 12:45:52.000000000 -0800 +++ wli-2.6.0-test11-30/mm/mincore.c 2003-12-04 08:02:31.000000000 -0800 @@ -22,7 +22,7 @@ * and is up to date; i.e. that no page-in operation would be required * at this time if an application were to map and access this page. */ -static unsigned char mincore_page(struct vm_area_struct * vma, +static unsigned char mincore_linear_page(struct vm_area_struct *vma, unsigned long pgoff) { unsigned char present = 0; @@ -38,6 +38,67 @@ static unsigned char mincore_page(struct return present; } +static unsigned char mincore_nonlinear_page(struct vm_area_struct *vma, + unsigned long pgoff) +{ + unsigned char present = 0; + unsigned long vaddr; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + + spin_lock(&vma->vm_mm->page_table_lock); + vaddr = PAGE_SIZE*(pgoff - vma->vm_pgoff) + vma->vm_start; + pgd = pgd_offset(vma->vm_mm, vaddr); + if (pgd_none(*pgd)) + goto out; + else if (pgd_bad(*pgd)) { + pgd_ERROR(*pgd); + pgd_clear(pgd); + goto out; + } + pmd = pmd_offset_map(pgd, vaddr); + if (pmd_none(*pmd)) + goto out_unmap; + else if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + goto out_unmap; + } + + pte = pte_offset_map(pmd, vaddr); + + /* pte presence overrides the calculated offset */ + if (pte_present(*pte)) + present = 1; + + /* PTE_FILE ptes have the same file, but pgoff can differ */ + else if (pte_file(*pte)) + present = mincore_linear_page(vma, pte_to_pgoff(*pte)); + + /* matching offsets are faulted in if the pte isn't set */ + else + present = mincore_linear_page(vma, pgoff); + + pte_unmap(pte); +out_unmap: + pmd_unmap(pmd); +out: + spin_unlock(&vma->vm_mm->page_table_lock); + return present; +} + +static inline unsigned char mincore_page(struct vm_area_struct *vma, + unsigned long pgoff) +{ + unsigned char ret; + if (vma->vm_flags & VM_NONLINEAR) + ret = mincore_nonlinear_page(vma, pgoff); + else + ret = mincore_linear_page(vma, pgoff); + return ret; +} + static long mincore_vma(struct vm_area_struct * vma, unsigned long start, unsigned long end, unsigned char __user * vec) {
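The mm/memory.c and mm/mincore.c conversions above all enforce one discipline: with page tables now allocated from highmem, a pmd page is only transiently kmapped, so every pmd_offset_map()/pmd_offset_map_nested() must be paired with pmd_unmap()/pmd_unmap_nested() on every exit path, exactly as pte_offset_map() pairs with pte_unmap(). A minimal sketch of a walker written against that API (probe_pte_present() is a hypothetical helper used for illustration, not part of the patch):

	static int probe_pte_present(struct mm_struct *mm, unsigned long addr)
	{
		pgd_t *pgd;
		pmd_t *pmd;
		pte_t *pte;
		int ret = 0;

		spin_lock(&mm->page_table_lock);
		pgd = pgd_offset(mm, addr);
		if (pgd_none(*pgd) || pgd_bad(*pgd))
			goto out;
		pmd = pmd_offset_map(pgd, addr);	/* kmaps the pmd page */
		if (pmd_none(*pmd) || pmd_bad(*pmd))
			goto out_unmap;
		pte = pte_offset_map(pmd, addr);	/* kmaps the pte page */
		ret = pte_present(*pte);
		pte_unmap(pte);
	out_unmap:
		pmd_unmap(pmd);		/* required on every path that mapped it */
	out:
		spin_unlock(&mm->page_table_lock);
		return ret;
	}

diff -prauN linux-2.6.0-test11/mm/mmap.c wli-2.6.0-test11-30/mm/mmap.c --- linux-2.6.0-test11/mm/mmap.c 2003-11-26 12:44:31.000000000 -0800 +++ 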
wli-2.6.0-test11-30/mm/mmap.c 2003-12-04 05:58:54.000000000 -0800 @@ -58,8 +58,19 @@ EXPORT_SYMBOL(sysctl_overcommit_memory); EXPORT_SYMBOL(sysctl_overcommit_ratio); EXPORT_SYMBOL(vm_committed_space); +static void __free_vma(void *vma) +{ + kmem_cache_free(vm_area_cachep, vma); +} + +void free_vma(struct vm_area_struct *vma) +{ + INIT_LIST_HEAD(&vma->rcu.list); + call_rcu(&vma->rcu, __free_vma, vma); +} + /* - * Requires inode->i_mapping->i_shared_sem + * Requires inode->i_mapping->i_shared_lock */ static inline void __remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode) @@ -67,7 +78,8 @@ __remove_shared_vm_struct(struct vm_area if (inode) { if (vma->vm_flags & VM_DENYWRITE) atomic_inc(&inode->i_writecount); - list_del_init(&vma->shared); + vma->vm_flags |= VM_DEAD; + list_del_rcu(&vma->shared); } } @@ -81,9 +93,9 @@ static void remove_shared_vm_struct(stru if (file) { struct inode *inode = file->f_dentry->d_inode; - down(&inode->i_mapping->i_shared_sem); + spin_lock(&inode->i_mapping->i_shared_lock); __remove_shared_vm_struct(vma, inode); - up(&inode->i_mapping->i_shared_sem); + spin_unlock(&inode->i_mapping->i_shared_lock); } } @@ -241,9 +253,9 @@ static inline void __vma_link_file(struc atomic_dec(&inode->i_writecount); if (vma->vm_flags & VM_SHARED) - list_add_tail(&vma->shared, &mapping->i_mmap_shared); + list_add_tail_rcu(&vma->shared, &mapping->i_mmap_shared); else - list_add_tail(&vma->shared, &mapping->i_mmap); + list_add_tail_rcu(&vma->shared, &mapping->i_mmap); } } @@ -267,12 +279,12 @@ static void vma_link(struct mm_struct *m mapping = vma->vm_file->f_dentry->d_inode->i_mapping; if (mapping) - down(&mapping->i_shared_sem); + spin_lock(&mapping->i_shared_lock); spin_lock(&mm->page_table_lock); __vma_link(mm, vma, prev, rb_link, rb_parent); spin_unlock(&mm->page_table_lock); if (mapping) - up(&mapping->i_shared_sem); + spin_unlock(&mapping->i_shared_lock); mark_mm_hugetlb(mm, vma); mm->map_count++; @@ -282,7 +294,7 @@ static void vma_link(struct mm_struct *m /* * Insert vm structure into process list sorted by address and into the inode's * i_mmap ring. The caller should hold mm->page_table_lock and - * ->f_mappping->i_shared_sem if vm_file is non-NULL. + * ->f_mappping->i_shared_lock if vm_file is non-NULL. */ static void __insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) @@ -319,6 +331,28 @@ static inline int is_mergeable_vma(struc return 1; } +static void move_vma_start(struct vm_area_struct *vma, unsigned long addr) +{ + spinlock_t *lock = &vma->vm_mm->page_table_lock; + struct inode *inode = NULL; + + if (vma->vm_file) { + inode = vma->vm_file->f_dentry->d_inode; + spin_lock(&inode->i_mapping->i_shared_lock); + } + spin_lock(lock); + if (inode) + __remove_shared_vm_struct(vma, inode); + /* If no vm_file, perhaps we should always keep vm_pgoff at 0?? */ + vma->vm_pgoff += (long)(addr - vma->vm_start) >> PAGE_SHIFT; + vma->vm_start = addr; + if (inode) { + __vma_link_file(vma); + spin_unlock(&inode->i_mapping->i_shared_lock); + } + spin_unlock(lock); +} + /* * Return true if we can merge this (vm_flags,file,vm_pgoff,size) * in front of (at a lower virtual address and file offset than) the vma. @@ -371,9 +405,7 @@ static int vma_merge(struct mm_struct *m unsigned long end, unsigned long vm_flags, struct file *file, unsigned long pgoff) { - spinlock_t *lock = &mm->page_table_lock; struct inode *inode = file ? 
file->f_dentry->d_inode : NULL; - struct semaphore *i_shared_sem; /* * We later require that vma->vm_flags == vm_flags, so this tests @@ -382,8 +414,6 @@ static int vma_merge(struct mm_struct *m if (vm_flags & VM_SPECIAL) return 0; - i_shared_sem = file ? &inode->i_mapping->i_shared_sem : NULL; - if (!prev) { prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb); goto merge_next; @@ -396,11 +426,12 @@ static int vma_merge(struct mm_struct *m is_mergeable_vma(prev, file, vm_flags) && can_vma_merge_after(prev, vm_flags, file, pgoff)) { struct vm_area_struct *next; + spinlock_t *lock = &mm->page_table_lock; int need_up = 0; if (unlikely(file && prev->vm_next && prev->vm_next->vm_file == file)) { - down(i_shared_sem); + spin_lock(&inode->i_mapping->i_shared_lock); need_up = 1; } spin_lock(lock); @@ -418,7 +449,7 @@ static int vma_merge(struct mm_struct *m __remove_shared_vm_struct(next, inode); spin_unlock(lock); if (need_up) - up(i_shared_sem); + spin_unlock(&inode->i_mapping->i_shared_lock); if (file) fput(file); @@ -428,7 +459,7 @@ static int vma_merge(struct mm_struct *m } spin_unlock(lock); if (need_up) - up(i_shared_sem); + spin_unlock(&inode->i_mapping->i_shared_lock); return 1; } @@ -443,13 +474,10 @@ static int vma_merge(struct mm_struct *m return 0; if (end == prev->vm_start) { if (file) - down(i_shared_sem); - spin_lock(lock); - prev->vm_start = addr; - prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT; - spin_unlock(lock); + spin_lock(&inode->i_mapping->i_shared_lock); + move_vma_start(prev, addr); if (file) - up(i_shared_sem); + spin_unlock(&inode->i_mapping->i_shared_lock); return 1; } } @@ -676,7 +704,7 @@ munmap_back: atomic_inc(&inode->i_writecount); fput(file); } - kmem_cache_free(vm_area_cachep, vma); + free_vma(vma); } out: mm->total_vm += len >> PAGE_SHIFT; @@ -701,7 +729,7 @@ unmap_and_free_vma: /* Undo any partial mapping done by a device driver. */ zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start); free_vma: - kmem_cache_free(vm_area_cachep, vma); + free_vma(vma); unacct_error: if (charged) vm_unacct_memory(charged); @@ -1094,7 +1122,7 @@ static void unmap_vma(struct mm_struct * area->vm_ops->close(area); if (area->vm_file) fput(area->vm_file); - kmem_cache_free(vm_area_cachep, area); + free_vma(area); } /* @@ -1183,9 +1211,10 @@ int split_vma(struct mm_struct * mm, str INIT_LIST_HEAD(&new->shared); - if (new_below) + if (new_below) { new->vm_end = addr; - else { + move_vma_start(vma, addr); + } else { new->vm_start = addr; new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); } @@ -1200,7 +1229,7 @@ int split_vma(struct mm_struct * mm, str mapping = vma->vm_file->f_dentry->d_inode->i_mapping; if (mapping) - down(&mapping->i_shared_sem); + spin_lock(&mapping->i_shared_lock); spin_lock(&mm->page_table_lock); if (new_below) { @@ -1213,7 +1242,7 @@ int split_vma(struct mm_struct * mm, str spin_unlock(&mm->page_table_lock); if (mapping) - up(&mapping->i_shared_sem); + spin_unlock(&mapping->i_shared_lock); return 0; } @@ -1456,14 +1485,14 @@ void exit_mmap(struct mm_struct *mm) } if (vma->vm_file) fput(vma->vm_file); - kmem_cache_free(vm_area_cachep, vma); + free_vma(vma); vma = next; } } /* Insert vm structure into process list sorted by address * and into the inode's i_mmap ring. If vm_file is non-NULL - * then i_shared_sem is taken here. + * then i_shared_lock is taken here. 
*/ void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) { diff -prauN linux-2.6.0-test11/mm/mprotect.c wli-2.6.0-test11-30/mm/mprotect.c --- linux-2.6.0-test11/mm/mprotect.c 2003-11-26 12:43:38.000000000 -0800 +++ wli-2.6.0-test11-30/mm/mprotect.c 2003-12-04 05:51:48.000000000 -0800 @@ -24,11 +24,11 @@ #include static inline void -change_pte_range(pmd_t *pmd, unsigned long address, - unsigned long size, pgprot_t newprot) +change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long address, unsigned long size, pgprot_t newprot) { pte_t * pte; - unsigned long end; + unsigned long start, end; if (pmd_none(*pmd)) return; @@ -38,6 +38,7 @@ change_pte_range(pmd_t *pmd, unsigned lo return; } pte = pte_offset_map(pmd, address); + start = address & PMD_MASK; address &= ~PMD_MASK; end = address + size; if (end > PMD_SIZE) @@ -50,8 +51,8 @@ change_pte_range(pmd_t *pmd, unsigned lo * bits by wiping the pte and then setting the new pte * into place. */ - entry = ptep_get_and_clear(pte); - set_pte(pte, pte_modify(entry, newprot)); + entry = vm_ptep_get_and_clear(vma, pte, address + start); + vm_set_pte(vma, pte, pte_modify(entry, newprot), start + address); } address += PAGE_SIZE; pte++; @@ -60,11 +61,11 @@ change_pte_range(pmd_t *pmd, unsigned lo } static inline void -change_pmd_range(pgd_t *pgd, unsigned long address, - unsigned long size, pgprot_t newprot) +change_pmd_range(struct vm_area_struct *vma, pgd_t *pgd, + unsigned long address, unsigned long size, pgprot_t newprot) { pmd_t * pmd; - unsigned long end; + unsigned long start, end; if (pgd_none(*pgd)) return; @@ -73,16 +74,18 @@ change_pmd_range(pgd_t *pgd, unsigned lo pgd_clear(pgd); return; } - pmd = pmd_offset(pgd, address); + pmd = pmd_offset_map(pgd, address); + start = address & PGDIR_MASK; address &= ~PGDIR_MASK; end = address + size; if (end > PGDIR_SIZE) end = PGDIR_SIZE; do { - change_pte_range(pmd, address, end - address, newprot); + change_pte_range(vma, pmd, start + address, end - address, newprot); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); + pmd_unmap(pmd - 1); } static void @@ -98,7 +101,7 @@ change_protection(struct vm_area_struct BUG(); spin_lock(¤t->mm->page_table_lock); do { - change_pmd_range(dir, start, end - start, newprot); + change_pmd_range(vma, dir, start, end - start, newprot); start = (start + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (start && (start < end)); @@ -135,7 +138,7 @@ mprotect_attempt_merge(struct vm_area_st __vma_unlink(mm, vma, prev); spin_unlock(&mm->page_table_lock); - kmem_cache_free(vm_area_cachep, vma); + free_vma(vma); mm->map_count--; return 1; } @@ -322,7 +325,7 @@ sys_mprotect(unsigned long start, size_t __vma_unlink(prev->vm_mm, next, prev); spin_unlock(&prev->vm_mm->page_table_lock); - kmem_cache_free(vm_area_cachep, next); + free_vma(next); prev->vm_mm->map_count--; } out: diff -prauN linux-2.6.0-test11/mm/mremap.c wli-2.6.0-test11-30/mm/mremap.c --- linux-2.6.0-test11/mm/mremap.c 2003-11-26 12:44:19.000000000 -0800 +++ wli-2.6.0-test11-30/mm/mremap.c 2003-12-04 06:59:27.000000000 -0800 @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include @@ -38,7 +38,7 @@ static pte_t *get_one_pte_map_nested(str goto end; } - pmd = pmd_offset(pgd, addr); + pmd = pmd_offset_map_nested(pgd, addr); if (pmd_none(*pmd)) goto end; if (pmd_bad(*pmd)) { @@ -53,6 +53,7 @@ static pte_t *get_one_pte_map_nested(str pte = NULL; } end: + pmd_unmap_nested(pmd); return pte; } @@ -60,50 +61,51 @@ static inline int 
page_table_present(str { pgd_t *pgd; pmd_t *pmd; + int ret; pgd = pgd_offset(mm, addr); if (pgd_none(*pgd)) return 0; - pmd = pmd_offset(pgd, addr); - return pmd_present(*pmd); + pmd = pmd_offset_map(pgd, addr); + ret = pmd_present(*pmd); + pmd_unmap(pmd); + return ret != 0; } static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr) { + pgd_t *pgd; pmd_t *pmd; pte_t *pte = NULL; - pmd = pmd_alloc(mm, pgd_offset(mm, addr), addr); + pgd = pgd_offset(mm, addr); + pmd = pmd_alloc_map(mm, pgd, addr); if (pmd) - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map(mm, pgd, &pmd, addr); + pmd_unmap(pmd); return pte; } static int -copy_one_pte(struct mm_struct *mm, pte_t *src, pte_t *dst, - struct pte_chain **pte_chainp) +copy_one_pte(struct vm_area_struct *vma, pte_t *src, pte_t *dst, + unsigned long old_addr, unsigned long new_addr) { - int error = 0; - pte_t pte; - struct page *page = NULL; - - if (pte_present(*src)) - page = pte_page(*src); - if (!pte_none(*src)) { - if (page) - page_remove_rmap(page, src); - pte = ptep_get_and_clear(src); - if (!dst) { - /* No dest? We must put it back. */ - dst = src; - error++; + pte_t pte; + if (!dst) + return -1; + pte = vm_ptep_get_and_clear(vma, src, old_addr); + vm_set_pte(vma, dst, pte, new_addr); + if (pte_present(pte)) { + unsigned long pfn = pte_pfn(pte); + if (pfn_valid(pfn)) { + struct page *page = pfn_to_page(pfn); + if (!PageReserved(page)) + page_move_rmap(page, vma, old_addr, new_addr); + } } - set_pte(dst, pte); - if (page) - *pte_chainp = page_add_rmap(page, dst, *pte_chainp); } - return error; + return 0; } static int @@ -111,16 +113,16 @@ move_one_page(struct vm_area_struct *vma unsigned long new_addr) { struct mm_struct *mm = vma->vm_mm; - int error = 0; pte_t *src, *dst; - struct pte_chain *pte_chain; + int error = 0; - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) { + if (!rmap_get_cpu()) { error = -ENOMEM; goto out; } + spin_lock(&mm->page_table_lock); + put_cpu(); src = get_one_pte_map_nested(mm, old_addr); if (src) { /* @@ -135,13 +137,12 @@ move_one_page(struct vm_area_struct *vma dst = alloc_one_pte_map(mm, new_addr); if (src == NULL) src = get_one_pte_map_nested(mm, old_addr); - error = copy_one_pte(mm, src, dst, &pte_chain); + error = copy_one_pte(vma, src, dst, old_addr, new_addr); pte_unmap_nested(src); pte_unmap(dst); } flush_tlb_page(vma, old_addr); spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); out: return error; } @@ -210,7 +211,7 @@ static unsigned long move_vma(struct vm_ if (vma == next) vma = prev; mm->map_count--; - kmem_cache_free(vm_area_cachep, next); + free_vma(next); } } else if (next->vm_start == new_addr + new_len && can_vma_merge(next, vma->vm_flags) && @@ -286,7 +287,7 @@ static unsigned long move_vma(struct vm_ return new_addr; } if (allocated_vma) - kmem_cache_free(vm_area_cachep, new_vma); + free_vma(new_vma); out: return -ENOMEM; } diff -prauN linux-2.6.0-test11/mm/msync.c wli-2.6.0-test11-30/mm/msync.c --- linux-2.6.0-test11/mm/msync.c 2003-11-26 12:43:36.000000000 -0800 +++ wli-2.6.0-test11-30/mm/msync.c 2003-12-03 18:20:41.000000000 -0800 @@ -82,7 +82,7 @@ static inline int filemap_sync_pmd_range pgd_clear(pgd); return 0; } - pmd = pmd_offset(pgd, address); + pmd = pmd_offset_map(pgd, address); if ((address & PGDIR_MASK) != (end & PGDIR_MASK)) end = (address & PGDIR_MASK) + PGDIR_SIZE; error = 0; @@ -91,6 +91,7 @@ static inline int filemap_sync_pmd_range address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); + 
pmd_unmap(pmd - 1); return error; } diff -prauN linux-2.6.0-test11/mm/nommu.c wli-2.6.0-test11-30/mm/nommu.c --- linux-2.6.0-test11/mm/nommu.c 2003-11-26 12:45:28.000000000 -0800 +++ wli-2.6.0-test11-30/mm/nommu.c 2003-12-04 06:38:56.000000000 -0800 @@ -562,7 +562,3 @@ unsigned long get_unmapped_area(struct f { return -ENOMEM; } - -void pte_chain_init(void) -{ -} diff -prauN linux-2.6.0-test11/mm/page-writeback.c wli-2.6.0-test11-30/mm/page-writeback.c --- linux-2.6.0-test11/mm/page-writeback.c 2003-11-26 12:44:45.000000000 -0800 +++ wli-2.6.0-test11-30/mm/page-writeback.c 2003-12-04 06:13:40.000000000 -0800 @@ -457,7 +457,7 @@ int do_writepages(struct address_space * */ int write_one_page(struct page *page, int wait) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); int ret = 0; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, @@ -469,12 +469,12 @@ int write_one_page(struct page *page, in if (wait) wait_on_page_writeback(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); list_del(&page->list); if (test_clear_page_dirty(page)) { list_add(&page->list, &mapping->locked_pages); page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); ret = mapping->a_ops->writepage(page, &wbc); if (ret == 0 && wait) { wait_on_page_writeback(page); @@ -484,7 +484,7 @@ int write_one_page(struct page *page, in page_cache_release(page); } else { list_add(&page->list, &mapping->clean_pages); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); unlock_page(page); } return ret; @@ -496,31 +496,31 @@ EXPORT_SYMBOL(write_one_page); * and move it to the dirty_pages list. Also perform space reservation if * required. * - * __set_page_dirty_nobuffers() may return -ENOSPC. But if it does, the page + * set_page_dirty_nobuffers() may return -ENOSPC. But if it does, the page * is still safe, as long as it actually manages to find some blocks at * writeback time. * * This is also used when a single buffer is being dirtied: we want to set the * page dirty in that case, but not all the buffers. This is a "bottom-up" - * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. + * dirtying, whereas set_page_dirty_buffers() is a "top-down" dirtying. */ -int __set_page_dirty_nobuffers(struct page *page) +int set_page_dirty_nobuffers(struct page *page) { int ret = 0; if (!TestSetPageDirty(page)) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); if (mapping) { - spin_lock(&mapping->page_lock); - if (page->mapping) { /* Race with truncate? */ - BUG_ON(page->mapping != mapping); + mapping_wrlock(&mapping->page_lock); + if (page_mapping(page)) { /* Race with truncate? */ + BUG_ON(page_mapping(page) != mapping); if (!mapping->backing_dev_info->memory_backed) inc_page_state(nr_dirty); list_del(&page->list); list_add(&page->list, &mapping->dirty_pages); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); if (!PageSwapCache(page)) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -528,7 +528,28 @@ int __set_page_dirty_nobuffers(struct pa } return ret; } -EXPORT_SYMBOL(__set_page_dirty_nobuffers); +EXPORT_SYMBOL(set_page_dirty_nobuffers); + +/* + * If the mapping doesn't provide a set_page_dirty() a_op, then + * just fall through and assume that it wants bh's. 
+ */ +int set_page_dirty(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + int (*spd)(struct page *); + + if (!mapping) { + SetPageDirty(page); + return 0; + } + spd = mapping->a_ops->set_page_dirty; + if (spd) + return (*spd)(page); + else + return set_page_dirty_buffers(page); +} +EXPORT_SYMBOL(set_page_dirty); /* * set_page_dirty() is racy if the caller has no reference against @@ -558,7 +579,7 @@ EXPORT_SYMBOL(set_page_dirty_lock); int test_clear_page_dirty(struct page *page) { if (TestClearPageDirty(page)) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); if (mapping && !mapping->backing_dev_info->memory_backed) dec_page_state(nr_dirty); diff -prauN linux-2.6.0-test11/mm/page_alloc.c wli-2.6.0-test11-30/mm/page_alloc.c --- linux-2.6.0-test11/mm/page_alloc.c 2003-11-26 12:42:56.000000000 -0800 +++ wli-2.6.0-test11-30/mm/page_alloc.c 2003-12-04 08:30:38.000000000 -0800 @@ -59,7 +59,7 @@ int min_free_kbytes = 1024; /* * Temporary debugging check for pages not lying within a given zone. */ -static int bad_range(struct zone *zone, struct page *page) +static inline int bad_range(struct zone *zone, struct page *page) { if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) return 1; @@ -74,7 +74,7 @@ static void bad_page(const char *functio { printk("Bad page state at %s\n", function); printk("flags:0x%08lx mapping:%p mapped:%d count:%d\n", - page->flags, page->mapping, + page->flags, (void *)page->__mapping, page_mapped(page), page_count(page)); printk("Backtrace:\n"); dump_stack(); @@ -84,9 +84,12 @@ static void bad_page(const char *functio 1 << PG_lru | 1 << PG_active | 1 << PG_dirty | + 1 << PG_rmaplock | + 1 << PG_anon | + 1 << PG_swapcache | 1 << PG_writeback); set_page_count(page, 0); - page->mapping = NULL; + set_page_mapping(page, NULL); } #ifndef CONFIG_HUGETLB_PAGE @@ -168,7 +171,7 @@ static void destroy_compound_page(struct * -- wli */ -static inline void __free_pages_bulk (struct page *page, struct page *base, +static inline void buddy_free(struct page *page, struct page *base, struct zone *zone, struct free_area *area, unsigned long mask, unsigned int order) { @@ -181,7 +184,6 @@ static inline void __free_pages_bulk (st BUG(); index = page_idx >> (1 + order); - zone->free_pages -= mask; while (mask + (1 << (MAX_ORDER-1))) { struct page *buddy1, *buddy2; @@ -202,17 +204,45 @@ static inline void __free_pages_bulk (st BUG_ON(bad_range(zone, buddy2)); list_del(&buddy1->list); mask <<= 1; + area->globally_free--; area++; index >>= 1; page_idx &= mask; } list_add(&(base + page_idx)->list, &area->free_list); + area->globally_free++; +} + +static inline void __free_pages_bulk(struct page *page, struct page *base, + struct zone *zone, struct free_area *area, unsigned long mask, + unsigned int order) +{ + switch (area->active - area->locally_free) { + case 0: + if (!list_empty(&area->deferred_pages)) { + struct page *defer = list_entry(area->deferred_pages.next, struct page, list); + list_del(&defer->list); + area->locally_free--; + buddy_free(defer, base, zone, area, mask, order); + } + /* fall through */ + case 1: + buddy_free(page, base, zone, area, mask, order); + break; + default: + list_add(&page->list, &area->deferred_pages); + area->locally_free++; + break; + } + if (area->active) + area->active--; + zone->free_pages += 1 << order; } static inline void free_pages_check(const char *function, struct page *page) { if ( page_mapped(page) || - page->mapping != NULL || + page->__mapping != 
0 || page_count(page) != 0 || (page->flags & ( 1 << PG_lru | @@ -220,6 +250,8 @@ static inline void free_pages_check(cons 1 << PG_locked | 1 << PG_active | 1 << PG_reclaim | + 1 << PG_rmaplock | + 1 << PG_anon | 1 << PG_slab | 1 << PG_writeback ))) bad_page(function, page); @@ -238,41 +270,78 @@ static inline void free_pages_check(cons * And clear the zone's pages_scanned counter, to hold off the "all pages are * pinned" detection logic. */ -static int -free_pages_bulk(struct zone *zone, int count, - struct list_head *list, unsigned int order) +void free_pages_bulk(struct zone *zone, struct page *page, unsigned int order) { - unsigned long mask, flags; + unsigned long mask, flags, count; struct free_area *area; - struct page *base, *page = NULL; - int ret = 0; + struct page *base, *save; + LIST_HEAD(tmp); + count = page->private; mask = (~0UL) << order; base = zone->zone_mem_map; area = zone->free_area + order; spin_lock_irqsave(&zone->lock, flags); zone->all_unreclaimable = 0; zone->pages_scanned = 0; - while (!list_empty(list) && count--) { - page = list_entry(list->prev, struct page, list); - /* have to delete it as __free_pages_bulk list manipulates */ - list_del(&page->list); - __free_pages_bulk(page, base, zone, area, mask, order); - ret++; + + if (order || area->active - area->locally_free <= 2*count) { + list_splice(&page->list, &tmp); + list_add(&page->list, &tmp); + page->private = 0; + } + + if (order) { + list_for_each_entry_safe(page, save, &tmp, list) { + list_del(&page->list); + __free_pages_bulk(page, base, zone, area, mask, order); + } + } else if (area->active - area->locally_free <= 2*count) { + /* + * This is a somewhat ad hoc approach to dealing with + * the interaction of gang allocation and the deferred + * coalescing heuristics. 
+ */ + if (area->active - area->locally_free < count) { + int local = 0; + + while (local < count && area->locally_free) { + struct page *follow, *head = + list_entry(area->deferred_pages.next, struct page, lru); + list_del(&head->lru); + list_for_each_entry_safe(follow, save, &head->list, list) { + list_del(&follow->list); + buddy_free(follow, base, zone, area, mask, 0); + } + local += head->private; + area->locally_free -= head->private; + head->private = 0; + buddy_free(head, base, zone, area, mask, 0); + } + } + list_for_each_entry_safe(page, save, &tmp, list) { + list_del(&page->list); + buddy_free(page, base, zone, area, mask, order); + } + } else { + area->locally_free += count; + list_add(&page->lru, &area->deferred_pages); + } + if (!order) { + zone->free_pages += count; + area->active -= min(area->active, count); } spin_unlock_irqrestore(&zone->lock, flags); - return ret; } void __free_pages_ok(struct page *page, unsigned int order) { - LIST_HEAD(list); - mod_page_state(pgfree, 1 << order); free_pages_check(__FUNCTION__, page); - list_add(&page->list, &list); kernel_map_pages(page, 1<<order, 0); - free_pages_bulk(page_zone(page), 1, &list, order); + page->private = 1; + INIT_LIST_HEAD(&page->list); + free_pages_bulk(page_zone(page), page, order); } #define MARK_USED(index, order, area) \ @@ -285,10 +354,10 @@ expand(struct zone *zone, struct page *p unsigned long size = 1 << high; while (high > low) { - BUG_ON(bad_range(zone, page)); area--; high--; size >>= 1; + area->globally_free++; list_add(&page->list, &area->free_list); MARK_USED(index, high, area); index += size; @@ -318,7 +387,7 @@ static inline void set_page_refs(struct */ static void prep_new_page(struct page *page, int order) { - if (page->mapping || page_mapped(page) || + if (page->__mapping || page_mapped(page) || (page->flags & ( 1 << PG_private | 1 << PG_locked | @@ -326,6 +395,9 @@ static void prep_new_page(struct page *p 1 << PG_active | 1 << PG_dirty | 1 << PG_reclaim | + 1 << PG_rmaplock | + 1 << PG_anon | + 1 << PG_swapcache | 1 << PG_writeback ))) bad_page(__FUNCTION__, page); @@ -340,7 +412,7 @@ static void prep_new_page(struct page *p * Do the hard work of removing an element from the buddy allocator. * Call me with the zone->lock already held. */ -static struct page *__rmqueue(struct zone *zone, unsigned int order) +static struct page *buddy_alloc(struct zone *zone, unsigned int order) { struct free_area * area; unsigned int current_order; @@ -354,16 +426,144 @@ static struct page *__rmqueue(struct zon page = list_entry(area->free_list.next, struct page, list); list_del(&page->list); + area->globally_free--; index = page - zone->zone_mem_map; if (current_order != MAX_ORDER-1) MARK_USED(index, current_order, area); - zone->free_pages -= 1UL << order; return expand(zone, page, index, order, current_order, area); } return NULL; } +/* + * This is bad; some way to avoid putting singleton pages on the + * deferred lists should be worked out at some point. 
+ */ +static void split_pages(struct zone *zone, struct page *page, int page_order, int deferred_order) +{ + int split_order = deferred_order - 1; + unsigned long split_offset = 1UL << split_order; + struct page *split_page; + + while (split_order >= page_order) { + split_page = &page[split_offset]; + if (split_order) + list_add(&split_page->list, + &zone->free_area[split_order].deferred_pages); + else if (!zone->free_area[split_order].locally_free) { + INIT_LIST_HEAD(&split_page->list); + split_page->private = 1; + list_add(&split_page->lru, + &zone->free_area[split_order].deferred_pages); + } else { + struct page *head; + head = list_entry(zone->free_area[split_order].deferred_pages.next, struct page, lru); + head->private++; + list_add(&split_page->list, &head->list); + } + zone->free_area[split_order].locally_free++; + --split_order; + split_offset >>= 1; + } +} + +#define COALESCE_BATCH 256 +static struct page *steal_deferred_page(struct zone *zone, int order) +{ + struct page *page; + struct list_head *elem; + struct free_area *area = zone->free_area; + int found_order, k; + + if (zone->free_pages < (1 << order)) + return NULL; + + /* the range of found_order precludes order 0 */ + for (found_order = order + 1; found_order < MAX_ORDER; ++found_order) + if (!list_empty(&area[found_order].deferred_pages)) { + elem = area[found_order].deferred_pages.next; + page = list_entry(elem, struct page, list); + list_del(elem); + area[found_order].locally_free--; + split_pages(zone, page, order, found_order); + return page; + } + + for (found_order = order - 1; found_order >= 0; --found_order) { + for (k = 0; k < COALESCE_BATCH; ++k) { + unsigned long mask = (~0UL) << found_order; + if (list_empty(&area[found_order].deferred_pages)) + break; + elem = area[found_order].deferred_pages.next; + if (found_order) { + page = list_entry(elem, struct page, list); + list_del(elem); + area[found_order].locally_free--; + buddy_free(page, zone->zone_mem_map, zone, &area[found_order], mask, found_order); + } else { + LIST_HEAD(tmp); + struct page *save; + + page = list_entry(elem, struct page, lru); + list_del(elem); + area[found_order].locally_free -= page->private; + page->private = 0; + list_splice(&page->list, &tmp); + list_add(&page->list, &tmp); + list_for_each_entry_safe(page, save, &tmp, list) { + list_del(&page->list); + buddy_free(page, zone->zone_mem_map, zone, &area[found_order], mask, found_order); + } + } + } + page = buddy_alloc(zone, order); + if (page) + return page; + } + return buddy_alloc(zone, order); +} + +static inline int __rmqueue(struct zone *zone, unsigned int order, struct list_head *list) +{ + struct free_area *area = &zone->free_area[order]; + struct page *page; + int count; + + if (!list_empty(&area->deferred_pages)) { + if (order) { + page = list_entry(area->deferred_pages.next, struct page, list); + list_del(&page->list); + count = 1; + } else { + page = list_entry(area->deferred_pages.next, struct page, lru); + list_del(&page->lru); + count = page->private; + page->private = 0; + list_splice(&page->list, list); + } + + area->locally_free -= count; + area->active += count; + zone->free_pages -= count << order; + } else { + page = buddy_alloc(zone, order); + if (page) + count = 1; + else { + page = steal_deferred_page(zone, order); + if (page) + count = 1; + else + return 0; + } + area->active += count; + zone->free_pages -= count << order; + } + list_add(&page->list, list); + return count; +} + /* * Obtain a specified number of elements from the buddy allocator, all under * a 
single hold of the lock, for efficiency. Add them to the supplied list. @@ -373,17 +573,14 @@ static int rmqueue_bulk(struct zone *zon unsigned long count, struct list_head *list) { unsigned long flags; - int i; - int allocated = 0; - struct page *page; + int i, j, allocated = 0; spin_lock_irqsave(&zone->lock, flags); - for (i = 0; i < count; ++i) { - page = __rmqueue(zone, order); - if (page == NULL) + for (i = 0; i < count && allocated < count; ++i) { + j = __rmqueue(zone, order, list); + if (!j) break; - allocated++; - list_add_tail(&page->list, list); + allocated += j; } spin_unlock_irqrestore(&zone->lock, flags); return allocated; @@ -428,10 +625,14 @@ void drain_local_pages(void) pset = &zone->pageset[smp_processor_id()]; for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { struct per_cpu_pages *pcp; + struct page *page, *save; pcp = &pset->pcp[i]; - pcp->count -= free_pages_bulk(zone, pcp->count, - &pcp->list, 0); + list_for_each_entry_safe(page, save, &pcp->list, lru) { + list_del(&page->lru); + pcp->count -= page->private; + free_pages_bulk(zone, page, 0); + } } } local_irq_restore(flags); @@ -447,15 +648,28 @@ static void free_hot_cold_page(struct pa struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; + struct page *head; kernel_map_pages(page, 1, 0); inc_page_state(pgfree); free_pages_check(__FUNCTION__, page); pcp = &zone->pageset[get_cpu()].pcp[cold]; local_irq_save(flags); - if (pcp->count >= pcp->high) - pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); - list_add(&page->list, &pcp->list); + while (pcp->count >= pcp->high) { + struct page *free = list_entry(pcp->list.prev, struct page, lru); + list_del(&free->lru); + pcp->count -= free->private; + free_pages_bulk(zone, free, 0); + } + head = list_entry(pcp->list.next, struct page, lru); + if (!list_empty(&pcp->list) && head->private < pcp->batch) { + list_add(&page->list, &head->list); + head->private++; + } else { + INIT_LIST_HEAD(&page->list); + list_add(&page->lru, &pcp->list); + page->private = 1; + } pcp->count++; local_irq_restore(flags); put_cpu(); @@ -480,31 +694,75 @@ void free_cold_page(struct page *page) static struct page *buffered_rmqueue(struct zone *zone, int order, int cold) { unsigned long flags; - struct page *page = NULL; + struct page *head, *page = NULL; + struct per_cpu_pages *pcp = NULL; if (order == 0) { - struct per_cpu_pages *pcp; - pcp = &zone->pageset[get_cpu()].pcp[cold]; local_irq_save(flags); - if (pcp->count <= pcp->low) - pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list); + if (pcp->count <= pcp->low) { + LIST_HEAD(tmp); + int k; + + k = rmqueue_bulk(zone, 0, pcp->batch, &tmp); + if (k) { + pcp->count += k; + head = list_entry(tmp.next, struct page, list); + list_del_init(&head->list); + head->private = k; + list_splice(&tmp, &head->list); + list_add(&head->lru, &pcp->list); + } + } if (pcp->count) { - page = list_entry(pcp->list.next, struct page, list); - list_del(&page->list); + head = list_entry(pcp->list.next, struct page, lru); + if (head->private == 1) { + list_del(&head->lru); + page = head; + page->private = 0; + } else { + page = list_entry(head->list.next, struct page,list); + list_del(&page->list); + head->private--; + } pcp->count--; } local_irq_restore(flags); put_cpu(); } - if (page == NULL) { + if (unlikely(!page)) { + LIST_HEAD(tmp); + int count; + + if (!order) + pcp = &zone->pageset[get_cpu()].pcp[cold]; + spin_lock_irqsave(&zone->lock, flags); - page = __rmqueue(zone, order); - spin_unlock_irqrestore(&zone->lock, flags); 
+ count = __rmqueue(zone, order, &tmp); + spin_unlock(&zone->lock); + + if (!list_empty(&tmp)) + page = list_entry(tmp.next, struct page, list); + + if (!order && count > 1) { + struct page *head; + + list_del(&page->list); + pcp->count += count - 1; + head = list_entry(tmp.next, struct page, list); + list_del_init(&head->list); + head->private = count - 1; + list_splice(&tmp, &head->list); + list_add(&head->lru, &pcp->list); + } + + local_irq_restore(flags); + if (order && page) prep_compound_page(page, order); + else if (!order) + put_cpu(); } if (page != NULL) { @@ -845,6 +1103,17 @@ static void show_node(struct zone *zone) #define show_node(zone) do { } while (0) #endif +unsigned long nr_deferred_pages(void) +{ + struct zone *zone; + unsigned long order, pages = 0; + + for_each_zone(zone) + for (order = 0; order < MAX_ORDER; ++order) + pages += zone->free_area[order].locally_free << order; + return pages; +} + /* * Accumulate the page_state information across all CPUs. * The result is unavoidably approximate - it can change @@ -1018,8 +1287,7 @@ void show_free_areas(void) } for_each_zone(zone) { - struct list_head *elem; - unsigned long nr, flags, order, total = 0; + unsigned long order, total = 0; show_node(zone); printk("%s: ", zone->name); @@ -1028,16 +1296,20 @@ void show_free_areas(void) continue; } - spin_lock_irqsave(&zone->lock, flags); + printk("buddy: "); for (order = 0; order < MAX_ORDER; order++) { - nr = 0; - list_for_each(elem, &zone->free_area[order].free_list) - ++nr; - total += nr << order; - printk("%lu*%lukB ", nr, K(1UL) << order); + printk("%lu*%lukB ", zone->free_area[order].globally_free, K(1UL) << order); + total += zone->free_area[order].globally_free << order; } - spin_unlock_irqrestore(&zone->lock, flags); - printk("= %lukB\n", K(total)); + printk("\ndefer: "); + for (order = 0; order < MAX_ORDER; order++) { + printk("%lu*%lukB ", zone->free_area[order].locally_free, K(1UL) << order); + total += zone->free_area[order].locally_free << order; + } + printk("\nactive: "); + for (order = 0; order < MAX_ORDER; order++) + printk("%lu*%lukB ", zone->free_area[order].active, K(1UL) << order); + printk("\n= %lukB\n", K(total)); } show_swap_cache_info(); @@ -1145,9 +1417,13 @@ static inline unsigned long wait_table_s * on IO we've got bigger problems than wait queue collision. * Limit the size of the wait table to a reasonable size. */ - size = min(size, 4096UL); + size = min(size, 1UL << (16 + fls(NR_CPUS))); - return max(size, 4UL); + /* + * Internal fragmentation in the bootmem allocator makes anything + * smaller than this a waste anyway. + */ + return max(size, 1UL << fls(PAGE_SIZE/sizeof(wait_queue_head_t))); } /* @@ -1181,25 +1457,6 @@ static void __init calculate_zone_totalp } /* - * Get space for the valid bitmap. - */ -static void __init calculate_zone_bitmap(struct pglist_data *pgdat, - unsigned long *zones_size) -{ - unsigned long size = 0; - int i; - - for (i = 0; i < MAX_NR_ZONES; i++) - size += zones_size[i]; - size = LONG_ALIGN((size + 7) >> 3); - if (size) { - pgdat->valid_addr_bitmap = - (unsigned long *)alloc_bootmem_node(pgdat, size); - memset(pgdat->valid_addr_bitmap, 0, size); - } -} - -/* * Initially all pages are reserved - free ones are freed * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. 
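A note on the bookkeeping that runs through these page_alloc.c hunks: each free_area now carries two pools and three counters. Pages freed while an area is under allocation pressure are parked, uncoalesced, on deferred_pages (order-0 frees travel in bundles whose head page records the bundle size in ->private), and are merged back into the buddy free_list only once the area quiesces; globally_free, locally_free, and active are the counters printed by the show_free_areas() and frag_show() changes above. The shape of the structure and the deferral predicate, restated as a sketch (the real declaration lives in the mmzone.h portion of the patch, which this excerpt does not show):

	struct free_area_sketch {
		struct list_head free_list;		/* coalesced buddy blocks: globally_free */
		struct list_head deferred_pages;	/* parked, uncoalesced frees: locally_free */
		unsigned long globally_free;
		unsigned long locally_free;
		unsigned long active;			/* outstanding allocations at this order */
	};

	/*
	 * The heuristic from __free_pages_bulk() above: defer while
	 * allocations clearly outpace deferred frees, coalesce otherwise
	 * (and once the counters fully converge, drain one previously
	 * deferred page along with the one being freed).
	 */
	static inline int defer_free(const struct free_area_sketch *area)
	{
		return area->active - area->locally_free >= 2;
	}

zone->free_pages counts both pools, which is why the free and allocation paths adjust it whether pages move through deferred_pages or free_list.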
@@ -1274,7 +1531,7 @@ static void __init free_area_init_core(s
 		batch = zone->present_pages / 1024;
 		if (batch * PAGE_SIZE > 256 * 1024)
 			batch = (256 * 1024) / PAGE_SIZE;
-		batch /= 4;		/* We effectively *= 4 below */
+		batch *= 4;		/* We effectively *= 4 below */
 		if (batch < 1)
 			batch = 1;
@@ -1334,8 +1591,11 @@ static void __init free_area_init_core(s
 		for (i = 0; ; i++) {
 			unsigned long bitmap_size;
-
+			INIT_LIST_HEAD(&zone->free_area[i].deferred_pages);
 			INIT_LIST_HEAD(&zone->free_area[i].free_list);
+			zone->free_area[i].globally_free = 0;
+			zone->free_area[i].locally_free = 0;
+			zone->free_area[i].active = 0;
 			if (i == MAX_ORDER-1) {
 				zone->free_area[i].map = NULL;
 				break;
 			}
@@ -1389,8 +1649,6 @@ void __init free_area_init_node(int nid,
 
 	free_area_init_core(pgdat, zones_size, zholes_size);
 	memblk_set_online(node_to_memblk(nid));
-
-	calculate_zone_bitmap(pgdat, zones_size);
 }
 
 #ifndef CONFIG_DISCONTIGMEM
@@ -1443,24 +1701,22 @@ static int frag_show(struct seq_file *m,
 	pg_data_t *pgdat = (pg_data_t *)arg;
 	struct zone *zone;
 	struct zone *node_zones = pgdat->node_zones;
-	unsigned long flags;
 	int order;
 
 	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
 		if (!zone->present_pages)
 			continue;
 
-		spin_lock_irqsave(&zone->lock, flags);
-		seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
-		for (order = 0; order < MAX_ORDER; ++order) {
-			unsigned long nr_bufs = 0;
-			struct list_head *elem;
-
-			list_for_each(elem, &(zone->free_area[order].free_list))
-				++nr_bufs;
-			seq_printf(m, "%6lu ", nr_bufs);
-		}
-		spin_unlock_irqrestore(&zone->lock, flags);
+		seq_printf(m, "Node %d, zone %8s\n", pgdat->node_id, zone->name);
+		seq_puts(m, "buddy: ");
+		for (order = 0; order < MAX_ORDER; ++order)
+			seq_printf(m, "%6lu ", zone->free_area[order].globally_free);
+		seq_puts(m, "\ndefer: ");
+		for (order = 0; order < MAX_ORDER; ++order)
+			seq_printf(m, "%6lu ", zone->free_area[order].locally_free);
+		seq_puts(m, "\nactive: ");
+		for (order = 0; order < MAX_ORDER; ++order)
+			seq_printf(m, "%6lu ", zone->free_area[order].active);
 		seq_putc(m, '\n');
 	}
 	return 0;
@@ -1479,6 +1735,7 @@ static char *vmstat_text[] = {
 	"nr_unstable",
 	"nr_page_table_pages",
 	"nr_mapped",
+	"nr_swapcache",
 	"nr_slab",
 
 	"pgpgin",
@@ -1503,6 +1760,9 @@ static char *vmstat_text[] = {
 	"pageoutrun",
 	"allocstall",
 	"pgrotated",
+	"swapaddfail",
+	"unmapfail",
+	"unmapagain",
 };
 
 static void *vmstat_start(struct seq_file *m, loff_t *pos)
diff -prauN linux-2.6.0-test11/mm/page_io.c wli-2.6.0-test11-30/mm/page_io.c
--- linux-2.6.0-test11/mm/page_io.c	2003-11-26 12:43:05.000000000 -0800
+++ wli-2.6.0-test11-30/mm/page_io.c	2003-12-04 06:13:40.000000000 -0800
@@ -16,8 +16,6 @@
 #include
 #include
 #include
-#include	/* for block_sync_page() */
-#include
 #include
 #include
 
@@ -32,7 +30,7 @@ get_swap_bio(int gfp_flags, struct page
 		swp_entry_t entry;
 
 		BUG_ON(!PageSwapCache(page));
-		entry.val = page->index;
+		entry.val = page->private;
 		sis = get_swap_info_struct(swp_type(entry));
 
 		bio->bi_sector = map_swap_page(sis, swp_offset(entry)) *
@@ -130,13 +128,6 @@ out:
 	return ret;
 }
 
-struct address_space_operations swap_aops = {
-	.writepage	= swap_writepage,
-	.readpage	= swap_readpage,
-	.sync_page	= block_sync_page,
-	.set_page_dirty	= __set_page_dirty_nobuffers,
-};
-
 /*
  * A scruffy utility function to read or write an arbitrary swap page
  * and wait on the I/O.
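
Note the convention the mm/page_io.c hunks above establish: with the old swap_aops definition (block_sync_page, __set_page_dirty_nobuffers) dropped and swap pages no longer carrying a ->mapping, the swp_entry_t for a swap-cache page is kept in page->private instead of page->index. A sketch of the sector lookup get_swap_bio() performs, pulled into a hypothetical helper for illustration (swp_type(), swp_offset(), get_swap_info_struct() and map_swap_page() are the existing swap helpers):

	/* illustrative only, mirroring get_swap_bio() above */
	static unsigned long swap_page_sector(struct page *page)
	{
		swp_entry_t entry = { .val = page->private };	/* was page->index */
		struct swap_info_struct *sis;

		sis = get_swap_info_struct(swp_type(entry));
		return map_swap_page(sis, swp_offset(entry)) * (PAGE_SIZE >> 9);
	}

The rw_swap_page_sync() hunks just below follow the same convention, setting PageSwapCache and page->private by hand rather than pointing page->mapping at swapper_space.
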
@@ -150,9 +141,8 @@ int rw_swap_page_sync(int rw, swp_entry_
 
 	lock_page(page);
 
-	BUG_ON(page->mapping);
-	page->mapping = &swapper_space;
-	page->index = entry.val;
+	SetPageSwapCache(page);
+	page->private = entry.val;
 
 	if (rw == READ) {
 		ret = swap_readpage(NULL, page);
@@ -161,7 +151,7 @@ int rw_swap_page_sync(int rw, swp_entry_
 		ret = swap_writepage(page, &swap_wbc);
 		wait_on_page_writeback(page);
 	}
-	page->mapping = NULL;
+	ClearPageSwapCache(page);
 	if (ret == 0 && (!PageUptodate(page) || PageError(page)))
 		ret = -EIO;
 	return ret;
diff -prauN linux-2.6.0-test11/mm/readahead.c wli-2.6.0-test11-30/mm/readahead.c
--- linux-2.6.0-test11/mm/readahead.c	2003-11-26 12:42:56.000000000 -0800
+++ wli-2.6.0-test11-30/mm/readahead.c	2003-12-03 19:34:55.000000000 -0800
@@ -229,7 +229,7 @@ __do_page_cache_readahead(struct address
 	/*
 	 * Preallocate as many pages as we will need.
	 */
-	spin_lock(&mapping->page_lock);
+	mapping_rdlock(&mapping->page_lock);
 	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
 		unsigned long page_offset = offset + page_idx;
 
@@ -240,16 +240,16 @@ __do_page_cache_readahead(struct address
 		if (page)
 			continue;
 
-		spin_unlock(&mapping->page_lock);
+		mapping_rdunlock(&mapping->page_lock);
 		page = page_cache_alloc_cold(mapping);
-		spin_lock(&mapping->page_lock);
+		mapping_rdlock(&mapping->page_lock);
 		if (!page)
 			break;
 		page->index = page_offset;
 		list_add(&page->list, &page_pool);
 		ret++;
 	}
-	spin_unlock(&mapping->page_lock);
+	mapping_rdunlock(&mapping->page_lock);
 
 	/*
 	 * Now start the IO.  We ignore I/O errors - if the page is not
diff -prauN linux-2.6.0-test11/mm/rmap.c wli-2.6.0-test11-30/mm/rmap.c
--- linux-2.6.0-test11/mm/rmap.c	2003-11-26 12:45:36.000000000 -0800
+++ wli-2.6.0-test11-30/mm/rmap.c	2003-12-04 07:59:02.000000000 -0800
@@ -5,527 +5,635 @@
  * Released under the General Public License (GPL).
  *
  *
- * Simple, low overhead pte-based reverse mapping scheme.
- * This is kept modular because we may want to experiment
- * with object-based reverse mapping schemes. Please try
- * to keep this thing as modular as possible.
+ * Simple, low overhead reverse mapping scheme.
+ * Please try to keep this thing as modular as possible.
  */
 
 /*
  * Locking:
- * - the page->pte.chain is protected by the PG_chainlock bit,
+ * - the page->rmap field is protected by the PG_rmaplock bit,
  *   which nests within the the mm->page_table_lock,
  *   which nests within the page lock.
  * - because swapout locking is opposite to the locking order
  *   in the page fault path, the swapout path uses trylocks
  *   on the mm->page_table_lock
  */
+
 #include
 #include
 #include
 #include
 #include
 #include
-#include
+#include
 #include
 #include
-
-#include
-#include
-#include
+#include
+#include
 #include
 
 /* #define DEBUG_RMAP */
 
 /*
- * Shared pages have a chain of pte_chain structures, used to locate
- * all the mappings to this page. We only need a pointer to the pte
- * here, the page struct for the page table page contains the process
- * it belongs to and the offset within that process.
- *
- * We use an array of pte pointers in this structure to minimise cache misses
- * while traversing reverse maps.
+ * struct addresser: for next_rmap_address to dole out user addresses
+ * one by one to page_referenced() or try_to_unmap()
  */
-#define NRPTE ((L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(pte_addr_t))
+struct addresser {
+	unsigned long address, count;
+	struct rmap_chain *chain;
+	int index;
+};
 
-/*
- * next_and_idx encodes both the address of the next pte_chain and the
- * offset of the highest-index used pte in ptes[].
- */ -struct pte_chain { - unsigned long next_and_idx; - pte_addr_t ptes[NRPTE]; -} ____cacheline_aligned; +static kmem_cache_t *rmap_chain_cache; + +static DEFINE_PER_CPU(struct rmap_chain *, rmap_chain) = NULL; -kmem_cache_t *pte_chain_cache; +kmem_cache_t *anon_cache; -static inline struct pte_chain *pte_chain_next(struct pte_chain *pte_chain) +static void anon_ctor(void *arg, kmem_cache_t *cache, unsigned long unused) { - return (struct pte_chain *)(pte_chain->next_and_idx & ~NRPTE); + struct anon *anon = (struct anon *)arg; + atomic_set(&anon->count, 1); + anon->lock = SPIN_LOCK_UNLOCKED; + INIT_LIST_HEAD(&anon->list); + INIT_RCU_HEAD(&anon->rcu); } -static inline struct pte_chain *pte_chain_ptr(unsigned long pte_chain_addr) +static void rmap_chain_ctor(void *arg, kmem_cache_t *cache, unsigned long flags) { - return (struct pte_chain *)(pte_chain_addr & ~NRPTE); + int i; + struct rmap_chain *chain = (struct rmap_chain *)arg; + + for (i = 0; i < NRSLOT; ++i) + chain->slot[i] = NOADDR; + chain->next = NULL; } -static inline int pte_chain_idx(struct pte_chain *pte_chain) +static inline void rmap_chain_dtor(struct rmap_chain *chain) { - return pte_chain->next_and_idx & NRPTE; + int i; + for (i = 0; i < NRSLOT; ++i) + if (chain->slot[i] != NOADDR) + chain->slot[i] = NOADDR; + if (chain->next) + chain->next = NULL; } -static inline unsigned long -pte_chain_encode(struct pte_chain *pte_chain, int idx) +void __init init_rmap(void) { - return (unsigned long)pte_chain | idx; + anon_cache = kmem_cache_create("anon", sizeof(struct anon), 0, 0, anon_ctor, NULL); + if (!anon_cache) + panic("init_rmap: Cannot alloc anon slab cache\n"); + rmap_chain_cache = kmem_cache_create("rmap_chain", sizeof(struct rmap_chain), 0, 0, rmap_chain_ctor, NULL); } -/* - * pte_chain list management policy: - * - * - If a page has a pte_chain list then it is shared by at least two processes, - * because a single sharing uses PageDirect. (Well, this isn't true yet, - * coz this code doesn't collapse singletons back to PageDirect on the remove - * path). - * - A pte_chain list has free space only in the head member - all succeeding - * members are 100% full. - * - If the head element has free space, it occurs in its leading slots. - * - All free space in the pte_chain is at the start of the head member. - * - Insertion into the pte_chain puts a pte pointer in the last free slot of - * the head member. - * - Removal from a pte chain moves the head pte of the head member onto the - * victim pte and frees the head member if it became empty. - */ +int exec_rmap(struct mm_struct *mm) +{ + struct anon *anon = kmem_cache_alloc(anon_cache, GFP_KERNEL); + if (!anon) + return -ENOMEM; + mm->anon = anon; + /* unique reference; no locking required */ + list_add_rcu(&mm->anon_list, &anon->list); + return 0; +} -/** - ** VM stuff below this comment - **/ +void dup_rmap(struct mm_struct *new, struct mm_struct *old) +{ + struct anon *anon = old->anon; + atomic_inc(&anon->count); + new->anon = anon; + spin_lock(&anon->lock); + list_add_tail_rcu(&new->anon_list, &anon->list); + spin_unlock(&anon->lock); +} -/** - * page_referenced - test if the page was referenced - * @page: the page to test - * - * Quick test_and_clear_referenced for all mappings to a page, - * returns the number of processes which referenced the page. - * Caller needs to hold the pte_chain_lock. - * - * If the page has a single-entry pte_chain, collapse that back to a PageDirect - * representation. This way, it's only done under memory pressure. 
- */ -int page_referenced(struct page * page) +static void free_anon(void *__anon) { - struct pte_chain *pc; - int referenced = 0; + struct anon *anon = (struct anon *)__anon; + INIT_LIST_HEAD(&anon->list); + atomic_set(&anon->count, 1); + kmem_cache_free(anon_cache, anon); +} - if (TestClearPageReferenced(page)) - referenced++; +void exit_rmap(struct mm_struct *mm) +{ + struct anon *anon = mm->anon; - if (PageDirect(page)) { - pte_t *pte = rmap_ptep_map(page->pte.direct); - if (ptep_test_and_clear_young(pte)) - referenced++; - rmap_ptep_unmap(pte); - } else { - int nr_chains = 0; + mm->anon = NULL; + spin_lock(&anon->lock); + list_del_rcu(&mm->anon_list); + spin_unlock(&anon->lock); + + if (!atomic_dec_and_test(&anon->count)) + return; + + call_rcu(&anon->rcu, free_anon, anon); +} + +/** + ** Functions for manipulating struct rmap_chain. + **/ - /* Check all the page tables mapping this page. */ - for (pc = page->pte.chain; pc; pc = pte_chain_next(pc)) { - int i; - - for (i = NRPTE-1; i >= 0; i--) { - pte_addr_t pte_paddr = pc->ptes[i]; - pte_t *p; - - if (!pte_paddr) - break; - p = rmap_ptep_map(pte_paddr); - if (ptep_test_and_clear_young(p)) - referenced++; - rmap_ptep_unmap(p); - nr_chains++; +/* + * Boolean rmap_get_cpu() ensures the cpu has an rmap_chain cached + * in case it is needed later while lock is held. It is never needed + * when page_add_rmap() is adding a freshly allocated anon page. + * caller does put_cpu() once ->page_table_lock prevents preemption. + */ +int rmap_get_cpu(void) +{ + struct rmap_chain **cache, *chain; + might_sleep(); + cache = &per_cpu(rmap_chain, get_cpu()); + if (*cache) + return 1; + put_cpu(); + chain = kmem_cache_alloc(rmap_chain_cache, GFP_KERNEL); + cache = &per_cpu(rmap_chain, get_cpu()); + if (*cache) + kmem_cache_free(rmap_chain_cache, chain); + else if (chain) + *cache = chain; + else { + put_cpu(); + return 0; + } + return 1; +} + +static struct rmap_chain *get_rmap_chain(void) +{ + struct rmap_chain **cache, *chain; + int i; + + /* + * ->page_table_lock and rmap_lock are held, no need to get_cpu() + */ + cache = &per_cpu(rmap_chain, smp_processor_id()); + chain = *cache; + *cache = NULL; + for (i = 0; i < NRSLOT; ++i) + chain->slot[i] = NOADDR; + chain->next = NULL; + return chain; +} + +void add_rmap_address(struct page *page, unsigned long address) +{ + struct rmap_chain *chain = page->chain; + int i = 0; + + if (!chain) + chain = page->chain = get_rmap_chain(); + else { + /* + * Check lest duplicates arise, and find a free slot at the end + */ + for (chain = page->chain; ; chain = chain->next) { + for (i = 0; i < NRSLOT; ++i) { + if (chain->slot[i] == NOADDR) + goto set; + else if (chain->slot[i] == address) + return; } + if (!chain->next) + chain->next = get_rmap_chain(); } - if (nr_chains == 1) { - pc = page->pte.chain; - page->pte.direct = pc->ptes[NRPTE-1]; - SetPageDirect(page); - pc->ptes[NRPTE-1] = 0; - __pte_chain_free(pc); + } +set: + chain->slot[i] = address; +} + +static int +next_rmap_address(struct page *page, struct vm_area_struct *vma, + struct addresser *addresser) +{ + /* bootstrap it */ + if (addresser->address == NOADDR) { + /* set chain and index for next call */ + addresser->chain = page->chain; + addresser->index = 0; + if (vma) { + addresser->address = vma_address(page, vma); + if (addresser->address != NOADDR) + return 1; + } else { + addresser->address = page->index; + return 1; } } - return referenced; + while (addresser->chain) { + if (addresser->index >= NRSLOT) + addresser->index = 0; + addresser->address = + 
addresser->chain->slot[addresser->index]; + if (addresser->address == NOADDR) + break; + addresser->index++; + if (addresser->index >= NRSLOT) + addresser->chain = addresser->chain->next; + if (!vma || addresser->address != vma_address(page, vma)) + return 1; + } + return 0; } -/** - * page_add_rmap - add reverse mapping entry to a page - * @page: the page to add the mapping to - * @ptep: the page table entry mapping this page - * - * Add a new pte reverse mapping to a page. - * The caller needs to hold the mm->page_table_lock. - */ -struct pte_chain * -page_add_rmap(struct page *page, pte_t *ptep, struct pte_chain *pte_chain) +void clear_page_chained(struct page *page) { - pte_addr_t pte_paddr = ptep_to_paddr(ptep); - struct pte_chain *cur_pte_chain; + struct rmap_chain *chain = page->chain; + page->chain = NULL; - if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) - return pte_chain; + /* + * This is only called when mapcount goes to 0, which + * means it's possible for a page to accumulate a large + * chain of stale addresses. But normally try_to_unmap_one() + * will bring the count to 0 and free them all here. + */ + while (chain) { + struct rmap_chain *next = chain->next; + rmap_chain_dtor(chain); + kmem_cache_free(rmap_chain_cache, chain); + chain = next; + } +} - pte_chain_lock(page); +/** + ** Subfunctions of page_referenced(): page_referenced_one() called + ** repeatedly from page_referenced_obj(); + **/ - if (page->pte.direct == 0) { - page->pte.direct = pte_paddr; - SetPageDirect(page); - inc_page_state(nr_mapped); - goto out; - } +static inline int page_referenced_one(struct page *page, struct mm_struct *mm, + struct addresser *addresser) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + int referenced = 0; - if (PageDirect(page)) { - /* Convert a direct pointer into a pte_chain */ - ClearPageDirect(page); - pte_chain->ptes[NRPTE-1] = page->pte.direct; - pte_chain->ptes[NRPTE-2] = pte_paddr; - pte_chain->next_and_idx = pte_chain_encode(NULL, NRPTE-2); - page->pte.direct = 0; - page->pte.chain = pte_chain; - pte_chain = NULL; /* We consumed it */ + if (!spin_trylock(&mm->page_table_lock)) { + referenced = 1; goto out; } - cur_pte_chain = page->pte.chain; - if (cur_pte_chain->ptes[0]) { /* It's full */ - pte_chain->next_and_idx = pte_chain_encode(cur_pte_chain, - NRPTE - 1); - page->pte.chain = pte_chain; - pte_chain->ptes[NRPTE-1] = pte_paddr; - pte_chain = NULL; /* We consumed it */ - goto out; - } - cur_pte_chain->ptes[pte_chain_idx(cur_pte_chain) - 1] = pte_paddr; - cur_pte_chain->next_and_idx--; + pgd = pgd_offset(mm, addresser->address); + if (!pgd_present(*pgd)) + goto out_unlock; + + pmd = pmd_offset_map(pgd, addresser->address); + if (!pmd) + goto out_unlock; + + if (!pmd_present(*pmd)) + goto out_unmap_pmd; + + pte = pte_offset_map(pmd, addresser->address); + if (!pte_present(*pte)) + goto out_unmap_pte; + + if (page_to_pfn(page) != pte_pfn(*pte)) + goto out_unmap_pte; + + referenced = ptep_test_and_clear_young(pte); + addresser->count--; + +out_unmap_pte: + pte_unmap(pte); +out_unmap_pmd: + pmd_unmap(pmd); +out_unlock: + spin_unlock(&mm->page_table_lock); out: - pte_chain_unlock(page); - return pte_chain; + return referenced; } -/** - * page_remove_rmap - take down reverse mapping to a page - * @page: page to remove mapping from - * @ptep: page table entry to remove - * - * Removes the reverse mapping from the pte_chain of the page, - * after that the caller can clear the page table entry and free - * the page. - * Caller needs to hold the mm->page_table_lock. 
- */ -void page_remove_rmap(struct page *page, pte_t *ptep) +static inline int +page_referenced_anon(struct page *page, struct addresser *addresser) { - pte_addr_t pte_paddr = ptep_to_paddr(ptep); - struct pte_chain *pc; + struct mm_struct *mm; + struct anon *anon; + int referenced = 0; - if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) - return; + rcu_read_lock(); /* anon->lock */ - pte_chain_lock(page); + anon = page_anon(page); + if (!anon) + goto out; - if (!page_mapped(page)) - goto out_unlock; /* remap_page_range() from a driver? */ + list_for_each_entry_rcu(mm, &anon->list, anon_list) { + if (!mm->anon || !mm->rss) + continue; + addresser->address = NOADDR; + while (next_rmap_address(page, NULL, addresser)) { + referenced += page_referenced_one(page, mm, addresser); + if (!addresser->count) + goto out; + } + } +out: + rcu_read_unlock(); /* anon->lock */ + return referenced; +} + +static inline int page_referenced_obj(struct page *page, struct addresser *addresser) +{ + struct address_space *mapping = page_mapping(page); + struct vm_area_struct *vma; + int referenced = 0; - if (PageDirect(page)) { - if (page->pte.direct == pte_paddr) { - page->pte.direct = 0; - ClearPageDirect(page); - goto out; + /* bail if it's a Morton page */ + if (!mapping) + return 0; + + rcu_read_lock(); /* mapping->i_shared_lock */ + list_for_each_entry_rcu(vma, &mapping->i_mmap, shared) { + if (vma->vm_flags & VM_DEAD) + continue; + if (!vma->vm_mm->rss) + continue; + addresser->address = NOADDR; + while (next_rmap_address(page, vma, addresser)) { + referenced += page_referenced_one(page, vma->vm_mm, addresser); + if (!addresser->count) + goto out; } - } else { - struct pte_chain *start = page->pte.chain; - struct pte_chain *next; - int victim_i = -1; - - for (pc = start; pc; pc = next) { - int i; - - next = pte_chain_next(pc); - if (next) - prefetch(next); - for (i = pte_chain_idx(pc); i < NRPTE; i++) { - pte_addr_t pa = pc->ptes[i]; - - if (victim_i == -1) - victim_i = i; - if (pa != pte_paddr) - continue; - pc->ptes[i] = start->ptes[victim_i]; - start->ptes[victim_i] = 0; - if (victim_i == NRPTE-1) { - /* Emptied a pte_chain */ - page->pte.chain = pte_chain_next(start); - __pte_chain_free(start); - } else { - start->next_and_idx++; - } + } + + list_for_each_entry_rcu(vma, &mapping->i_mmap_shared, shared) { + if (vma->vm_flags & VM_DEAD) + continue; + if (!vma->vm_mm->rss) + continue; + addresser->address = NOADDR; + while (next_rmap_address(page, vma, addresser)) { + referenced += page_referenced_one(page, vma->vm_mm, addresser); + if (!addresser->count) goto out; - } } } out: - if (!page_mapped(page)) - dec_page_state(nr_mapped); -out_unlock: - pte_chain_unlock(page); - return; + rcu_read_unlock(); /* mapping->i_shared_lock */ + return referenced; } /** - * try_to_unmap_one - worker function for try_to_unmap - * @page: page to unmap - * @ptep: page table entry to unmap from page + * page_referenced - test if the page was referenced + * @page: the page to test * - * Internal helper function for try_to_unmap, called for each page - * table entry mapping a page. Because locking order here is opposite - * to the locking order used by the page fault path, we use trylocks. - * Locking: - * page lock shrink_list(), trylock - * pte_chain_lock shrink_list() - * mm->page_table_lock try_to_unmap_one(), trylock + * returns the number of ptes which referenced the page. + * Caller needs to hold the rmap_lock. 
*/ -static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t)); -static int try_to_unmap_one(struct page * page, pte_addr_t paddr) +int page_referenced(struct page * page) { - pte_t *ptep = rmap_ptep_map(paddr); - unsigned long address = ptep_to_address(ptep); - struct mm_struct * mm = ptep_to_mm(ptep); - struct vm_area_struct * vma; - pte_t pte; - int ret; + int referenced = !!TestClearPageReferenced(page); + struct addresser addresser; + + addresser.count = atomic_read(&page->mapcount); + if (!addresser.count || !page->__mapping) + return 0; + else if (PageAnon(page)) + referenced += page_referenced_anon(page, &addresser); + else + referenced += page_referenced_obj(page, &addresser); + return referenced; +} + +void page_turn_rmap(struct page *page, struct vm_area_struct *vma) +{ + struct anon *old, *new; + old = page_anon(page); + new = vma->vm_mm->anon; - if (!mm) - BUG(); + BUG_ON(!PageAnon(page)); + BUG_ON(atomic_read(&page->mapcount) != 1); + + if (old == new) + return; + + rmap_lock(page); + set_page_mapping(page, new); + rmap_unlock(page); +} + +void page_move_rmap(struct page *page, struct vm_area_struct *vma, + unsigned long old, unsigned long new) +{ + if (!page_mapped(page) || !page->__mapping) + return; + + rmap_lock(page); + + if (PageAnon(page)) { + /* + * Don't check atomic_read(&page->mapcount) == 1 here + * because the mapcount could be 1 but the page + * could still have a chain, and our new address + * in that chain. + */ + if (atomic_read(&page->mapcount) == 1) + page->index = new; + else if (new != page->index) + add_rmap_address(page, new); + } else { + /* + * Just in case things are nonlinear. + */ + if (old != vma_address(page, vma)) + add_rmap_address(page, new); + } + + rmap_unlock(page); +} + +static int try_to_unmap_one(struct page *page, struct mm_struct *mm, + struct addresser *addresser, struct vm_area_struct *vma) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + pte_t pteval; + unsigned long address = addresser->address; + int ret = SWAP_AGAIN; /* * We need the page_table_lock to protect us from page faults, * munmap, fork, etc... */ - if (!spin_trylock(&mm->page_table_lock)) { - rmap_ptep_unmap(ptep); - return SWAP_AGAIN; - } - + if (!spin_trylock(&mm->page_table_lock)) + goto out; - /* During mremap, it's possible pages are not in a VMA. */ - vma = find_vma(mm, address); - if (!vma) { + /* If the page is mlock()'d, we can't unmap it. */ + if (!vma) + vma = find_vma(mm, address); + if (!vma || (vma->vm_flags & VM_LOCKED)) { ret = SWAP_FAIL; goto out_unlock; } - /* The page is mlock()d, we cannot swap it out. */ - if (vma->vm_flags & VM_LOCKED) { - ret = SWAP_FAIL; + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) goto out_unlock; - } + pmd = pmd_offset_map(pgd, address); + if (!pmd_present(*pmd)) + goto out_unmap_pmd; + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) + goto out_unmap_pte; + + if (page_to_pfn(page) != pte_pfn(*pte)) + goto out_unmap_pte; + + addresser->count--; /* Nuke the page table entry. */ flush_cache_page(vma, address); - pte = ptep_get_and_clear(ptep); + pteval = vm_ptep_get_and_clear(vma, pte, address); flush_tlb_page(vma, address); - if (PageSwapCache(page)) { + if (PageAnon(page)) { /* * Store the swap location in the pte. * See handle_pte_fault() ... 
*/ - swp_entry_t entry = { .val = page->index }; + swp_entry_t entry = { .val = page->private }; + BUG_ON(!PageSwapCache(page)); swap_duplicate(entry); - set_pte(ptep, swp_entry_to_pte(entry)); - BUG_ON(pte_file(*ptep)); + vm_set_pte(vma, pte, swp_entry_to_pte(entry), address); + BUG_ON(pte_file(*pte)); } else { - unsigned long pgidx; /* - * If a nonlinear mapping then store the file page offset - * in the pte. + * If a nonlinear mapping from sys_remap_file_pages(), + * then store the file page offset in the pte. */ - pgidx = (address - vma->vm_start) >> PAGE_SHIFT; - pgidx += vma->vm_pgoff; - pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; - if (page->index != pgidx) { - set_pte(ptep, pgoff_to_pte(page->index)); - BUG_ON(!pte_file(*ptep)); + if (address != vma_address(page, vma)) { + vm_set_pte(vma, pte, pgoff_to_pte(page->index), address); + BUG_ON(!pte_file(*pte)); } } /* Move the dirty bit to the physical page now the pte is gone. */ - if (pte_dirty(pte)) + if (pte_dirty(pteval)) set_page_dirty(page); - mm->rss--; + BUG_ON(!atomic_read(&page->mapcount)); + if (atomic_dec_and_test(&page->mapcount)) + if (page->chain) + clear_page_chained(page); page_cache_release(page); - ret = SWAP_SUCCESS; + mm->rss--; +out_unmap_pte: + pte_unmap(pte); +out_unmap_pmd: + pmd_unmap(pmd); out_unlock: - rmap_ptep_unmap(ptep); spin_unlock(&mm->page_table_lock); +out: return ret; } -/** - * try_to_unmap - try to remove all page table mappings to a page - * @page: the page to get unmapped - * - * Tries to remove all the page table entries which are mapping this - * page, used in the pageout path. Caller must hold the page lock - * and its pte chain lock. Return values are: - * - * SWAP_SUCCESS - we succeeded in removing all mappings - * SWAP_AGAIN - we missed a trylock, try again later - * SWAP_FAIL - the page is unswappable - */ -int try_to_unmap(struct page * page) +static inline int try_to_unmap_anon(struct page *page, struct addresser *addresser) { - struct pte_chain *pc, *next_pc, *start; - int ret = SWAP_SUCCESS; - int victim_i = -1; - - /* This page should not be on the pageout lists. */ - if (PageReserved(page)) - BUG(); - if (!PageLocked(page)) - BUG(); - /* We need backing store to swap out a page. */ - if (!page->mapping) - BUG(); - - if (PageDirect(page)) { - ret = try_to_unmap_one(page, page->pte.direct); - if (ret == SWAP_SUCCESS) { - page->pte.direct = 0; - ClearPageDirect(page); - } + struct mm_struct *mm; + struct anon *anon; + int ret = SWAP_AGAIN; + + rcu_read_lock(); /* anon->lock */ + + anon = page_anon(page); + if (!anon) goto out; - } - start = page->pte.chain; - for (pc = start; pc; pc = next_pc) { - int i; - - next_pc = pte_chain_next(pc); - if (next_pc) - prefetch(next_pc); - for (i = pte_chain_idx(pc); i < NRPTE; i++) { - pte_addr_t pte_paddr = pc->ptes[i]; - - if (!pte_paddr) - continue; - if (victim_i == -1) - victim_i = i; - - switch (try_to_unmap_one(page, pte_paddr)) { - case SWAP_SUCCESS: - /* - * Release a slot. If we're releasing the - * first pte in the first pte_chain then - * pc->ptes[i] and start->ptes[victim_i] both - * refer to the same thing. It works out. - */ - pc->ptes[i] = start->ptes[victim_i]; - start->ptes[victim_i] = 0; - victim_i++; - if (victim_i == NRPTE) { - page->pte.chain = pte_chain_next(start); - __pte_chain_free(start); - start = page->pte.chain; - victim_i = 0; - } else { - start->next_and_idx++; - } - break; - case SWAP_AGAIN: - /* Skip this pte, remembering status. 
*/ - ret = SWAP_AGAIN; - continue; - case SWAP_FAIL: - ret = SWAP_FAIL; + list_for_each_entry_rcu(mm, &anon->list, anon_list) { + if (!mm->anon) + continue; + addresser->address = NOADDR; + while (next_rmap_address(page, NULL, addresser)) { + ret = try_to_unmap_one(page, mm, addresser, NULL); + if (ret == SWAP_FAIL || !addresser->count) goto out; - } } } out: - if (!page_mapped(page)) - dec_page_state(nr_mapped); + rcu_read_unlock(); /* anon->lock */ return ret; } -/** - ** No more VM stuff below this comment, only pte_chain helper - ** functions. - **/ - -static void pte_chain_ctor(void *p, kmem_cache_t *cachep, unsigned long flags) +static inline int try_to_unmap_obj(struct page *page, struct addresser *addresser) { - struct pte_chain *pc = p; + struct address_space *mapping; + struct vm_area_struct *vma; + int ret = SWAP_AGAIN; + + mapping = page_mapping(page); + + /* bail if it's a Morton page */ + if (!mapping) + return SWAP_FAIL; + + rcu_read_lock(); /* mapping->i_shared_lock */ + + list_for_each_entry_rcu(vma, &mapping->i_mmap, shared) { + if (vma->vm_flags & VM_DEAD) + continue; + if (!vma->vm_mm->rss) + continue; + addresser->address = NOADDR; + while (next_rmap_address(page, vma, addresser)) { + ret = try_to_unmap_one(page, vma->vm_mm, addresser, vma); + if (ret == SWAP_FAIL || !addresser->count) + goto out; + } + } - memset(pc, 0, sizeof(*pc)); + list_for_each_entry_rcu(vma, &mapping->i_mmap_shared, shared) { + if (vma->vm_flags & VM_DEAD) + continue; + if (!vma->vm_mm->rss) + continue; + addresser->address = NOADDR; + while (next_rmap_address(page, vma, addresser)) { + ret = try_to_unmap_one(page, vma->vm_mm, addresser, vma); + if (ret == SWAP_FAIL || !addresser->count) + goto out; + } + } +out: + rcu_read_unlock(); /* mapping->i_shared_lock */ + return ret; } -DEFINE_PER_CPU(struct pte_chain *, local_pte_chain) = 0; - /** - * __pte_chain_free - free pte_chain structure - * @pte_chain: pte_chain struct to free - */ -void __pte_chain_free(struct pte_chain *pte_chain) -{ - struct pte_chain **pte_chainp; - - pte_chainp = &get_cpu_var(local_pte_chain); - if (pte_chain->next_and_idx) - pte_chain->next_and_idx = 0; - if (*pte_chainp) - kmem_cache_free(pte_chain_cache, *pte_chainp); - *pte_chainp = pte_chain; - put_cpu_var(local_pte_chain); -} - -/* - * pte_chain_alloc(): allocate a pte_chain structure for use by page_add_rmap(). + * try_to_unmap - try to remove all page table mappings to a page + * @page: the page to get unmapped + * + * Tries to remove all the page table entries which are mapping this + * page, used in the pageout path. Caller must hold the page lock + * and its pte chain lock. Return values are: * - * The caller of page_add_rmap() must perform the allocation because - * page_add_rmap() is invariably called under spinlock. Often, page_add_rmap() - * will not actually use the pte_chain, because there is space available in one - * of the existing pte_chains which are attached to the page. So the case of - * allocating and then freeing a single pte_chain is specially optimised here, - * with a one-deep per-cpu cache. 
+ * SWAP_SUCCESS - we succeeded in removing all mappings + * SWAP_AGAIN - we missed a trylock, try again later + * SWAP_FAIL - the page is unswappable */ -struct pte_chain *pte_chain_alloc(int gfp_flags) +int try_to_unmap(struct page *page) { - struct pte_chain *ret; - struct pte_chain **pte_chainp; - - might_sleep_if(gfp_flags & __GFP_WAIT); + struct addresser addresser; + int ret; - pte_chainp = &get_cpu_var(local_pte_chain); - if (*pte_chainp) { - ret = *pte_chainp; - *pte_chainp = NULL; - put_cpu_var(local_pte_chain); - } else { - put_cpu_var(local_pte_chain); - ret = kmem_cache_alloc(pte_chain_cache, gfp_flags); + BUG_ON(PageReserved(page)); + BUG_ON(!PageLocked(page)); + BUG_ON(!page_mapped(page)); + + addresser.count = atomic_read(&page->mapcount); + if (PageAnon(page)) + ret = try_to_unmap_anon(page, &addresser); + else + ret = try_to_unmap_obj(page, &addresser); + if (!page_mapped(page)) { + dec_page_state(nr_mapped); + if (PageAnon(page)) + clear_page_anon(page); + ret = SWAP_SUCCESS; } return ret; } - -void __init pte_chain_init(void) -{ - pte_chain_cache = kmem_cache_create( "pte_chain", - sizeof(struct pte_chain), - 0, - SLAB_MUST_HWCACHE_ALIGN, - pte_chain_ctor, - NULL); - - if (!pte_chain_cache) - panic("failed to create pte_chain cache!\n"); -} diff -prauN linux-2.6.0-test11/mm/shmem.c wli-2.6.0-test11-30/mm/shmem.c --- linux-2.6.0-test11/mm/shmem.c 2003-11-26 12:43:41.000000000 -0800 +++ wli-2.6.0-test11-30/mm/shmem.c 2003-12-04 08:46:52.000000000 -0800 @@ -71,7 +71,7 @@ enum sgp_type { }; static int shmem_getpage(struct inode *inode, unsigned long idx, - struct page **pagep, enum sgp_type sgp); + struct page **pagep, enum sgp_type sgp, int *type); static inline struct page *shmem_dir_alloc(unsigned int gfp_mask) { @@ -540,7 +540,7 @@ static int shmem_notify_change(struct de if (attr->ia_size & (PAGE_CACHE_SIZE-1)) { (void) shmem_getpage(inode, attr->ia_size>>PAGE_CACHE_SHIFT, - &page, SGP_READ); + &page, SGP_READ, NULL); } /* * Reset SHMEM_PAGEIN flag so that shmem_truncate can @@ -721,7 +721,7 @@ static int shmem_writepage(struct page * BUG_ON(!PageLocked(page)); BUG_ON(page_mapped(page)); - mapping = page->mapping; + mapping = page_mapping(page); index = page->index; inode = mapping->host; info = SHMEM_I(inode); @@ -765,7 +765,7 @@ redirty: * vm. 
If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache */ -static int shmem_getpage(struct inode *inode, unsigned long idx, struct page **pagep, enum sgp_type sgp) +static int shmem_getpage(struct inode *inode, unsigned long idx, struct page **pagep, enum sgp_type sgp, int *type) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); @@ -774,7 +774,7 @@ static int shmem_getpage(struct inode *i struct page *swappage; swp_entry_t *entry; swp_entry_t swap; - int error; + int error, majmin = VM_FAULT_MINOR; if (idx >= SHMEM_MAX_INDEX) return -EFBIG; @@ -811,6 +811,10 @@ repeat: if (!swappage) { shmem_swp_unmap(entry); spin_unlock(&info->lock); + /* here we actually do the io */ + if (majmin == VM_FAULT_MINOR && type) + inc_page_state(pgmajfault); + majmin = VM_FAULT_MAJOR; swapin_readahead(swap); swappage = read_swap_cache_async(swap); if (!swappage) { @@ -959,6 +963,8 @@ done: } else *pagep = ZERO_PAGE(0); } + if (type) + *type = majmin; return 0; failed: @@ -969,7 +975,7 @@ failed: return error; } -struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int unused) +struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type) { struct inode *inode = vma->vm_file->f_dentry->d_inode; struct page *page = NULL; @@ -980,7 +986,7 @@ struct page *shmem_nopage(struct vm_area idx += vma->vm_pgoff; idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; - error = shmem_getpage(inode, idx, &page, SGP_CACHE); + error = shmem_getpage(inode, idx, &page, SGP_CACHE, type); if (error) return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS; @@ -1007,7 +1013,7 @@ static int shmem_populate(struct vm_area /* * Will need changing if PAGE_CACHE_SIZE != PAGE_SIZE */ - err = shmem_getpage(inode, pgoff, &page, sgp); + err = shmem_getpage(inode, pgoff, &page, sgp, NULL); if (err) return err; if (page) { @@ -1156,8 +1162,8 @@ static struct inode_operations shmem_sym static int shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) { - struct inode *inode = page->mapping->host; - return shmem_getpage(inode, page->index, &page, SGP_WRITE); + struct inode *inode = page_mapping(page)->host; + return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL); } static ssize_t @@ -1214,7 +1220,7 @@ shmem_file_write(struct file *file, cons * But it still may be a good idea to prefault below. 
*/ - err = shmem_getpage(inode, index, &page, SGP_WRITE); + err = shmem_getpage(inode, index, &page, SGP_WRITE, NULL); if (err) break; @@ -1296,7 +1302,7 @@ static void do_shmem_file_read(struct fi break; } - desc->error = shmem_getpage(inode, index, &page, SGP_READ); + desc->error = shmem_getpage(inode, index, &page, SGP_READ, NULL); if (desc->error) { if (desc->error == -EINVAL) desc->error = 0; @@ -1552,7 +1558,7 @@ static int shmem_symlink(struct inode *d iput(inode); return -ENOMEM; } - error = shmem_getpage(inode, 0, &page, SGP_WRITE); + error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); if (error) { vm_unacct_memory(VM_ACCT(1)); iput(inode); @@ -1590,7 +1596,7 @@ static int shmem_follow_link_inline(stru static int shmem_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct page *page = NULL; - int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ); + int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); if (res) return res; res = vfs_readlink(dentry, buffer, buflen, kmap(page)); @@ -1603,7 +1609,7 @@ static int shmem_readlink(struct dentry static int shmem_follow_link(struct dentry *dentry, struct nameidata *nd) { struct page *page = NULL; - int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ); + int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); if (res) return res; res = vfs_follow_link(nd, kmap(page)); @@ -1818,7 +1824,7 @@ static void destroy_inodecache(void) static struct address_space_operations shmem_aops = { .writepage = shmem_writepage, - .set_page_dirty = __set_page_dirty_nobuffers, + .set_page_dirty = set_page_dirty_nobuffers, #ifdef CONFIG_TMPFS .prepare_write = shmem_prepare_write, .commit_write = simple_commit_write, diff -prauN linux-2.6.0-test11/mm/slab.c wli-2.6.0-test11-30/mm/slab.c --- linux-2.6.0-test11/mm/slab.c 2003-11-26 12:45:08.000000000 -0800 +++ wli-2.6.0-test11-30/mm/slab.c 2003-12-03 18:20:41.000000000 -0800 @@ -2780,7 +2780,7 @@ void ptrinfo(unsigned long addr) printk("No pgd.\n"); break; } - pmd = pmd_offset(pgd, addr); + pmd = pmd_offset_kernel(pgd, addr); if (pmd_none(*pmd)) { printk("No pmd.\n"); break; diff -prauN linux-2.6.0-test11/mm/swap_state.c wli-2.6.0-test11-30/mm/swap_state.c --- linux-2.6.0-test11/mm/swap_state.c 2003-11-26 12:43:48.000000000 -0800 +++ wli-2.6.0-test11-30/mm/swap_state.c 2003-12-04 06:28:26.000000000 -0800 @@ -21,23 +21,16 @@ static struct backing_dev_info swap_back .memory_backed = 1, /* Does not contribute to dirty memory */ }; -extern struct address_space_operations swap_aops; +static struct address_space_operations swap_aops = { + .writepage = swap_writepage, + .readpage = swap_readpage, +}; struct address_space swapper_space = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC), - .page_lock = SPIN_LOCK_UNLOCKED, - .clean_pages = LIST_HEAD_INIT(swapper_space.clean_pages), - .dirty_pages = LIST_HEAD_INIT(swapper_space.dirty_pages), - .io_pages = LIST_HEAD_INIT(swapper_space.io_pages), - .locked_pages = LIST_HEAD_INIT(swapper_space.locked_pages), + .page_lock = MAPPING_RW_LOCK_UNLOCKED, .a_ops = &swap_aops, .backing_dev_info = &swap_backing_dev_info, - .i_mmap = LIST_HEAD_INIT(swapper_space.i_mmap), - .i_mmap_shared = LIST_HEAD_INIT(swapper_space.i_mmap_shared), - .i_shared_sem = __MUTEX_INITIALIZER(swapper_space.i_shared_sem), - .truncate_count = ATOMIC_INIT(0), - .private_lock = SPIN_LOCK_UNLOCKED, - .private_list = LIST_HEAD_INIT(swapper_space.private_list), }; #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) @@ -59,30 +52,50 @@ void 
show_swap_cache_info(void) swap_cache_info.noent_race, swap_cache_info.exist_race); } +static int __add_to_swap_cache(struct page *page, swp_entry_t entry) +{ + int error; + + BUG_ON(PageSwapCache(page)); + BUG_ON(PagePrivate(page)); + error = radix_tree_preload(GFP_ATOMIC); + if (error) + return error; + + page_cache_get(page); + mapping_wrlock(&swapper_space.page_lock); + error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); + if (error) + page_cache_release(page); + else { + SetPageLocked(page); + SetPageSwapCache(page); + page->private = entry.val; + inc_page_state(nr_swapcache); + } + mapping_wrunlock(&swapper_space.page_lock); + radix_tree_preload_end(); + return error; +} + static int add_to_swap_cache(struct page *page, swp_entry_t entry) { int error; - if (page->mapping) - BUG(); if (!swap_duplicate(entry)) { INC_CACHE_INFO(noent_race); return -ENOENT; } - error = add_to_page_cache(page, &swapper_space, entry.val, GFP_KERNEL); + error = __add_to_swap_cache(page, entry); /* * Anon pages are already on the LRU, we don't run lru_cache_add here. */ - if (error != 0) { + if (error) { swap_free(entry); if (error == -EEXIST) INC_CACHE_INFO(exist_race); return error; } - if (!PageLocked(page)) - BUG(); - if (!PageSwapCache(page)) - BUG(); INC_CACHE_INFO(add_total); return 0; } @@ -96,7 +109,9 @@ void __delete_from_swap_cache(struct pag BUG_ON(!PageLocked(page)); BUG_ON(!PageSwapCache(page)); BUG_ON(PageWriteback(page)); - __remove_from_page_cache(page); + radix_tree_delete(&swapper_space.page_tree, page->private); + ClearPageSwapCache(page); + dec_page_state(nr_swapcache); INC_CACHE_INFO(del_total); } @@ -140,8 +155,7 @@ int add_to_swap(struct page * page) /* * Add it to the swap cache and mark it dirty */ - err = add_to_page_cache(page, &swapper_space, - entry.val, GFP_ATOMIC); + err = __add_to_swap_cache(page, entry); if (pf_flags & PF_MEMALLOC) current->flags |= PF_MEMALLOC; @@ -149,8 +163,7 @@ int add_to_swap(struct page * page) switch (err) { case 0: /* Success */ SetPageUptodate(page); - ClearPageDirty(page); - set_page_dirty(page); + SetPageDirty(page); INC_CACHE_INFO(add_total); return 1; case -EEXIST: @@ -176,15 +189,16 @@ void delete_from_swap_cache(struct page { swp_entry_t entry; + BUG_ON(!PageSwapCache(page)); BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); BUG_ON(PagePrivate(page)); - entry.val = page->index; + entry.val = page->private; - spin_lock(&swapper_space.page_lock); + mapping_wrlock(&swapper_space.page_lock); __delete_from_swap_cache(page); - spin_unlock(&swapper_space.page_lock); + mapping_wrunlock(&swapper_space.page_lock); swap_free(entry); page_cache_release(page); @@ -192,27 +206,11 @@ void delete_from_swap_cache(struct page int move_to_swap_cache(struct page *page, swp_entry_t entry) { - struct address_space *mapping = page->mapping; - int err; - - spin_lock(&swapper_space.page_lock); - spin_lock(&mapping->page_lock); - - err = radix_tree_insert(&swapper_space.page_tree, entry.val, page); - if (!err) { - __remove_from_page_cache(page); - ___add_to_page_cache(page, &swapper_space, entry.val); - } - - spin_unlock(&mapping->page_lock); - spin_unlock(&swapper_space.page_lock); - + int err = __add_to_swap_cache(page, entry); if (!err) { - if (!swap_duplicate(entry)) - BUG(); - /* shift page from clean_pages to dirty_pages list */ - BUG_ON(PageDirty(page)); - set_page_dirty(page); + remove_from_page_cache(page); + BUG_ON(!swap_duplicate(entry)); + SetPageDirty(page); INC_CACHE_INFO(add_total); } else if (err == -EEXIST) 
INC_CACHE_INFO(exist_race); @@ -222,29 +220,13 @@ int move_to_swap_cache(struct page *page int move_from_swap_cache(struct page *page, unsigned long index, struct address_space *mapping) { - swp_entry_t entry; - int err; - - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); - BUG_ON(PagePrivate(page)); - - entry.val = page->index; - - spin_lock(&swapper_space.page_lock); - spin_lock(&mapping->page_lock); - - err = radix_tree_insert(&mapping->page_tree, index, page); - if (!err) { - __delete_from_swap_cache(page); - ___add_to_page_cache(page, mapping, index); + int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC); + if (err == -EEXIST) { + INC_CACHE_INFO(exist_race); + err = 0; } - - spin_unlock(&mapping->page_lock); - spin_unlock(&swapper_space.page_lock); - if (!err) { - swap_free(entry); + delete_from_swap_cache(page); /* shift page from clean_pages to dirty_pages list */ ClearPageDirty(page); set_page_dirty(page); @@ -308,11 +290,17 @@ void free_pages_and_swap_cache(struct pa * lock getting page table operations atomic even if we drop the page * lock before returning. */ -struct page * lookup_swap_cache(swp_entry_t entry) +struct page *lookup_swap_cache(swp_entry_t entry) { - struct page *found; + struct page *page; - found = find_get_page(&swapper_space, entry.val); + mapping_rdlock(&swapper_space.page_lock); + page = radix_tree_lookup(&swapper_space.page_tree, entry.val); + if (page) { + page_cache_get(page); + INC_CACHE_INFO(find_success); + } + mapping_rdunlock(&swapper_space.page_lock); /* * Unsafe to assert PageSwapCache and mapping on page found: * if SMP nothing prevents swapoff from deleting this page from @@ -320,9 +308,7 @@ struct page * lookup_swap_cache(swp_entr * that, but no need to change: we _have_ got the right page. */ INC_CACHE_INFO(find_total); - if (found) - INC_CACHE_INFO(find_success); - return found; + return page; } /* @@ -331,7 +317,7 @@ struct page * lookup_swap_cache(swp_entr * A failure return means that either the page allocation failed or that * the swap entry is no longer in use. */ -struct page * read_swap_cache_async(swp_entry_t entry) +struct page *read_swap_cache_async(swp_entry_t entry) { struct page *found_page, *new_page = NULL; int err; @@ -343,7 +329,11 @@ struct page * read_swap_cache_async(swp_ * that would confuse statistics: use find_get_page() * directly. */ - found_page = find_get_page(&swapper_space, entry.val); + mapping_rdlock(&swapper_space.page_lock); + found_page = radix_tree_lookup(&swapper_space.page_tree, entry.val); + if (found_page) + page_cache_get(found_page); + mapping_rdunlock(&swapper_space.page_lock); if (found_page) break; diff -prauN linux-2.6.0-test11/mm/swapfile.c wli-2.6.0-test11-30/mm/swapfile.c --- linux-2.6.0-test11/mm/swapfile.c 2003-11-26 12:43:24.000000000 -0800 +++ wli-2.6.0-test11-30/mm/swapfile.c 2003-12-04 06:59:27.000000000 -0800 @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include @@ -247,16 +247,16 @@ static int exclusive_swap_page(struct pa struct swap_info_struct * p; swp_entry_t entry; - entry.val = page->index; + entry.val = page->private; p = swap_info_get(entry); if (p) { /* Is the only swap cache user the cache itself? */ if (p->swap_map[swp_offset(entry)] == 1) { /* Recheck the page count with the pagecache lock held.. 
*/ - spin_lock(&swapper_space.page_lock); + mapping_rdlock(&swapper_space.page_lock); if (page_count(page) - !!PagePrivate(page) == 2) retval = 1; - spin_unlock(&swapper_space.page_lock); + mapping_rdunlock(&swapper_space.page_lock); } swap_info_put(p); } @@ -315,7 +315,7 @@ int remove_exclusive_swap_page(struct pa if (page_count(page) != 2) /* 2: us + cache */ return 0; - entry.val = page->index; + entry.val = page->private; p = swap_info_get(entry); if (!p) return 0; @@ -324,13 +324,13 @@ int remove_exclusive_swap_page(struct pa retval = 0; if (p->swap_map[swp_offset(entry)] == 1) { /* Recheck the page count with the pagecache lock held.. */ - spin_lock(&swapper_space.page_lock); + mapping_wrlock(&swapper_space.page_lock); if ((page_count(page) == 2) && !PageWriteback(page)) { __delete_from_swap_cache(page); SetPageDirty(page); retval = 1; } - spin_unlock(&swapper_space.page_lock); + mapping_wrunlock(&swapper_space.page_lock); } swap_info_put(p); @@ -353,8 +353,13 @@ void free_swap_and_cache(swp_entry_t ent p = swap_info_get(entry); if (p) { - if (swap_entry_free(p, swp_offset(entry)) == 1) - page = find_trylock_page(&swapper_space, entry.val); + if (swap_entry_free(p, swp_offset(entry)) == 1) { + mapping_rdlock(&swapper_space.page_lock); + page = radix_tree_lookup(&swapper_space.page_tree, entry.val); + if (page && TestSetPageLocked(page)) + page = NULL; + mapping_rdunlock(&swapper_space.page_lock); + } swap_info_put(p); } if (page) { @@ -383,21 +388,21 @@ void free_swap_and_cache(swp_entry_t ent * what to do if a write is requested later. */ /* vma->vm_mm->page_table_lock is held */ -static void +static inline void unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir, - swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) + swp_entry_t entry, struct page *page) { - vma->vm_mm->rss++; get_page(page); - set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); - *pte_chainp = page_add_rmap(page, dir, *pte_chainp); + vm_set_pte(vma, dir, pte_mkold(mk_pte(page, vma->vm_page_prot)), address); + vma->vm_mm->rss++; + page_add_rmap(page, vma, address, 1); swap_free(entry); } /* vma->vm_mm->page_table_lock is held */ static int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long size, unsigned long offset, - swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) + swp_entry_t entry, struct page *page) { pte_t * pte; unsigned long end; @@ -422,8 +427,7 @@ static int unuse_pmd(struct vm_area_stru * Test inline before going to call unuse_pte. 
*/ if (unlikely(pte_same(*pte, swp_pte))) { - unuse_pte(vma, offset + address, pte, - entry, page, pte_chainp); + unuse_pte(vma, offset + address, pte, entry, page); pte_unmap(pte); return 1; } @@ -437,7 +441,7 @@ static int unuse_pmd(struct vm_area_stru /* vma->vm_mm->page_table_lock is held */ static int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long size, - swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) + swp_entry_t entry, struct page *page) { pmd_t * pmd; unsigned long offset, end; @@ -449,7 +453,7 @@ static int unuse_pgd(struct vm_area_stru pgd_clear(dir); return 0; } - pmd = pmd_offset(dir, address); + pmd = pmd_offset_map(dir, address); offset = address & PGDIR_MASK; address &= ~PGDIR_MASK; end = address + size; @@ -458,26 +462,25 @@ static int unuse_pgd(struct vm_area_stru if (address >= end) BUG(); do { - if (unuse_pmd(vma, pmd, address, end - address, - offset, entry, page, pte_chainp)) + if (unuse_pmd(vma, pmd, address, end - address, offset, entry, page)) return 1; address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); + pmd_unmap(pmd - 1); return 0; } /* vma->vm_mm->page_table_lock is held */ static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, - swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) + swp_entry_t entry, struct page *page) { unsigned long start = vma->vm_start, end = vma->vm_end; if (start >= end) BUG(); do { - if (unuse_pgd(vma, pgdir, start, end - start, - entry, page, pte_chainp)) + if (unuse_pgd(vma, pgdir, start, end - start, entry, page)) return 1; start = (start + PGDIR_SIZE) & PGDIR_MASK; pgdir++; @@ -489,23 +492,20 @@ static int unuse_process(struct mm_struc swp_entry_t entry, struct page* page) { struct vm_area_struct* vma; - struct pte_chain *pte_chain; - - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - return -ENOMEM; /* * Go through process' page directory. 
*/ + if (!rmap_get_cpu()) + return -ENOMEM; spin_lock(&mm->page_table_lock); + put_cpu(); for (vma = mm->mmap; vma; vma = vma->vm_next) { pgd_t * pgd = pgd_offset(mm, vma->vm_start); - if (unuse_vma(vma, pgd, entry, page, &pte_chain)) + if (unuse_vma(vma, pgd, entry, page)) break; } spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); return 0; } @@ -653,8 +653,14 @@ static int try_to_unuse(unsigned int typ if (swcount > 1) { if (start_mm == &init_mm) shmem = shmem_unuse(entry, page); - else + else { retval = unuse_process(start_mm, entry, page); + if (retval) { + unlock_page(page); + page_cache_release(page); + break; + } + } } if (*swap_map > 1) { int set_start_mm = (*swap_map >= swcount); @@ -677,9 +683,7 @@ static int try_to_unuse(unsigned int typ cond_resched(); swcount = *swap_map; - if (swcount <= 1) - ; - else if (mm == &init_mm) { + if (mm == &init_mm) { set_start_mm = 1; shmem = shmem_unuse(entry, page); } else @@ -995,9 +999,10 @@ int page_queue_congested(struct page *pa BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ - bdi = page->mapping->backing_dev_info; - if (PageSwapCache(page)) { - swp_entry_t entry = { .val = page->index }; + if (!PageSwapCache(page)) + bdi = page_mapping(page)->backing_dev_info; + else { + swp_entry_t entry = { .val = page->private }; struct swap_info_struct *sis; sis = get_swap_info_struct(swp_type(entry)); diff -prauN linux-2.6.0-test11/mm/truncate.c wli-2.6.0-test11-30/mm/truncate.c --- linux-2.6.0-test11/mm/truncate.c 2003-11-26 12:45:39.000000000 -0800 +++ wli-2.6.0-test11-30/mm/truncate.c 2003-12-04 06:13:40.000000000 -0800 @@ -19,7 +19,7 @@ static int do_invalidatepage(struct page *page, unsigned long offset) { int (*invalidatepage)(struct page *, unsigned long); - invalidatepage = page->mapping->a_ops->invalidatepage; + invalidatepage = page_mapping(page)->a_ops->invalidatepage; if (invalidatepage == NULL) invalidatepage = block_invalidatepage; return (*invalidatepage)(page, offset); @@ -37,7 +37,7 @@ static inline void truncate_partial_page * becomes anonymous. It will be left on the LRU and may even be mapped into * user pagetables if we're racing with filemap_nopage(). * - * We need to bale out if page->mapping is no longer equal to the original + * We need to bale out if page_mapping(page) is no longer equal to the original * mapping. This happens a) when the VM reclaimed the page while we waited on * its lock, b) when a concurrent invalidate_inode_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. @@ -45,7 +45,7 @@ static inline void truncate_partial_page static void truncate_complete_page(struct address_space *mapping, struct page *page) { - if (page->mapping != mapping) + if (page_mapping(page) != mapping) return; if (PagePrivate(page)) @@ -55,32 +55,31 @@ truncate_complete_page(struct address_sp ClearPageUptodate(page); ClearPageMappedToDisk(page); remove_from_page_cache(page); - page_cache_release(page); /* pagecache ref */ } /* * This is for invalidate_inode_pages(). That function can be called at * any time, and is not supposed to throw away dirty pages. But pages can * be marked dirty at any time too. So we re-check the dirtiness inside - * ->page_lock. That provides exclusion against the __set_page_dirty + * ->page_lock. That provides exclusion against the set_page_dirty * functions. 
*/ static int invalidate_complete_page(struct address_space *mapping, struct page *page) { - if (page->mapping != mapping) + if (page_mapping(page) != mapping) return 0; if (PagePrivate(page) && !try_to_release_page(page, 0)) return 0; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); if (PageDirty(page)) { - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); return 0; } __remove_from_page_cache(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); ClearPageUptodate(page); page_cache_release(page); /* pagecache ref */ return 1; @@ -255,7 +254,7 @@ void invalidate_inode_pages2(struct addr struct page *page = pvec.pages[i]; lock_page(page); - if (page->mapping == mapping) { /* truncate race? */ + if (page_mapping(page) == mapping) { /* truncate race? */ wait_on_page_writeback(page); next = page->index + 1; if (page_mapped(page)) diff -prauN linux-2.6.0-test11/mm/vmalloc.c wli-2.6.0-test11-30/mm/vmalloc.c --- linux-2.6.0-test11/mm/vmalloc.c 2003-11-26 12:44:23.000000000 -0800 +++ wli-2.6.0-test11-30/mm/vmalloc.c 2003-12-03 18:20:41.000000000 -0800 @@ -71,7 +71,7 @@ static void unmap_area_pmd(pgd_t *dir, u return; } - pmd = pmd_offset(dir, address); + pmd = pmd_offset_kernel(dir, address); address &= ~PGDIR_MASK; end = address + size; if (end > PGDIR_SIZE) @@ -160,7 +160,7 @@ int map_vm_area(struct vm_struct *area, dir = pgd_offset_k(address); spin_lock(&init_mm.page_table_lock); do { - pmd_t *pmd = pmd_alloc(&init_mm, dir, address); + pmd_t *pmd = pmd_alloc_kernel(&init_mm, dir, address); if (!pmd) { err = -ENOMEM; break; diff -prauN linux-2.6.0-test11/mm/vmscan.c wli-2.6.0-test11-30/mm/vmscan.c --- linux-2.6.0-test11/mm/vmscan.c 2003-11-26 12:43:06.000000000 -0800 +++ wli-2.6.0-test11-30/mm/vmscan.c 2003-12-04 07:13:42.000000000 -0800 @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include @@ -177,23 +177,23 @@ static int shrink_slab(long scanned, uns return 0; } -/* Must be called with page's pte_chain_lock held. */ +/* Must be called with page's rmap_lock held. */ static inline int page_mapping_inuse(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping; /* Page is in somebody's page tables. */ if (page_mapped(page)) return 1; - /* XXX: does this happen ? */ - if (!mapping) - return 0; - /* Be more reluctant to reclaim swapcache than pagecache */ if (PageSwapCache(page)) return 1; + mapping = page_mapping(page); + if (!mapping) + return 0; + /* File is mmap'd by somebody. */ if (!list_empty(&mapping->i_mmap)) return 1; @@ -237,7 +237,7 @@ static void handle_write_error(struct ad struct page *page, int error) { lock_page(page); - if (page->mapping == mapping) { + if (page_mapping(page) == mapping) { if (error == -ENOSPC) set_bit(AS_ENOSPC, &mapping->flags); else @@ -284,15 +284,15 @@ shrink_list(struct list_head *page_list, if (PageWriteback(page)) goto keep_locked; - pte_chain_lock(page); + rmap_lock(page); referenced = page_referenced(page); if (referenced && page_mapping_inuse(page)) { /* In active use or really unfreeable. Activate it. */ - pte_chain_unlock(page); + rmap_unlock(page); goto activate_locked; } - mapping = page->mapping; + mapping = page_mapping(page); #ifdef CONFIG_SWAP /* @@ -301,12 +301,16 @@ shrink_list(struct list_head *page_list, * * XXX: implement swap clustering ? 
diff -prauN linux-2.6.0-test11/mm/vmscan.c wli-2.6.0-test11-30/mm/vmscan.c
--- linux-2.6.0-test11/mm/vmscan.c	2003-11-26 12:43:06.000000000 -0800
+++ wli-2.6.0-test11-30/mm/vmscan.c	2003-12-04 07:13:42.000000000 -0800
@@ -28,7 +28,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
@@ -177,23 +177,23 @@ static int shrink_slab(long scanned, uns
 	return 0;
 }
 
-/* Must be called with page's pte_chain_lock held. */
+/* Must be called with page's rmap_lock held. */
 static inline int page_mapping_inuse(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping;
 
 	/* Page is in somebody's page tables. */
 	if (page_mapped(page))
 		return 1;
 
-	/* XXX: does this happen ? */
-	if (!mapping)
-		return 0;
-
 	/* Be more reluctant to reclaim swapcache than pagecache */
 	if (PageSwapCache(page))
 		return 1;
 
+	mapping = page_mapping(page);
+	if (!mapping)
+		return 0;
+
 	/* File is mmap'd by somebody. */
 	if (!list_empty(&mapping->i_mmap))
 		return 1;
@@ -237,7 +237,7 @@ static void handle_write_error(struct ad
 		struct page *page, int error)
 {
 	lock_page(page);
-	if (page->mapping == mapping) {
+	if (page_mapping(page) == mapping) {
 		if (error == -ENOSPC)
 			set_bit(AS_ENOSPC, &mapping->flags);
 		else
@@ -284,15 +284,15 @@ shrink_list(struct list_head *page_list,
 		if (PageWriteback(page))
 			goto keep_locked;
 
-		pte_chain_lock(page);
+		rmap_lock(page);
 		referenced = page_referenced(page);
 		if (referenced && page_mapping_inuse(page)) {
 			/* In active use or really unfreeable.  Activate it. */
-			pte_chain_unlock(page);
+			rmap_unlock(page);
 			goto activate_locked;
 		}
 
-		mapping = page->mapping;
+		mapping = page_mapping(page);
 
 #ifdef CONFIG_SWAP
 		/*
@@ -301,12 +301,16 @@ shrink_list(struct list_head *page_list,
 		 *
 		 * XXX: implement swap clustering ?
 		 */
-		if (page_mapped(page) && !mapping && !PagePrivate(page)) {
-			pte_chain_unlock(page);
-			if (!add_to_swap(page))
+		if (PageSwapCache(page))
+			mapping = &swapper_space;
+		else if (PageAnon(page)) {
+			rmap_unlock(page);
+			if (!add_to_swap(page)) {
+				inc_page_state(swapaddfail);
 				goto activate_locked;
-			pte_chain_lock(page);
-			mapping = page->mapping;
+			}
+			rmap_lock(page);
+			mapping = &swapper_space;
 		}
 #endif /* CONFIG_SWAP */
@@ -317,16 +321,18 @@ shrink_list(struct list_head *page_list,
 		if (page_mapped(page) && mapping) {
 			switch (try_to_unmap(page)) {
 			case SWAP_FAIL:
-				pte_chain_unlock(page);
+				inc_page_state(unmapfail);
+				rmap_unlock(page);
 				goto activate_locked;
 			case SWAP_AGAIN:
-				pte_chain_unlock(page);
+				inc_page_state(unmapagain);
+				rmap_unlock(page);
 				goto keep_locked;
 			case SWAP_SUCCESS:
 				; /* try to free the page below */
 			}
 		}
-		pte_chain_unlock(page);
+		rmap_unlock(page);
 
 		/*
 		 * If the page is dirty, only perform writeback if that write
@@ -358,7 +364,7 @@ shrink_list(struct list_head *page_list,
 				goto keep_locked;
 			if (!may_write_to_queue(mapping->backing_dev_info))
 				goto keep_locked;
-			spin_lock(&mapping->page_lock);
+			mapping_wrlock(&mapping->page_lock);
 			if (test_clear_page_dirty(page)) {
 				int res;
 				struct writeback_control wbc = {
@@ -368,8 +374,9 @@ shrink_list(struct list_head *page_list,
 					.for_reclaim = 1,
 				};
 
-				list_move(&page->list, &mapping->locked_pages);
-				spin_unlock(&mapping->page_lock);
+				if (!PageSwapCache(page))
+					list_move(&page->list, &mapping->locked_pages);
+				mapping_wrunlock(&mapping->page_lock);
 
 				SetPageReclaim(page);
 				res = mapping->a_ops->writepage(page, &wbc);
@@ -385,7 +392,7 @@ shrink_list(struct list_head *page_list,
 				}
 				goto keep;
 			}
-			spin_unlock(&mapping->page_lock);
+			mapping_wrunlock(&mapping->page_lock);
 		}
 
 		/*
@@ -402,7 +409,7 @@ shrink_list(struct list_head *page_list,
 		 * try_to_release_page() will discover that cleanness and will
 		 * drop the buffers and mark the page clean - it can be freed.
 		 *
-		 * Rarely, pages can have buffers and no ->mapping.  These are
+		 * Rarely, pages can have buffers and no page_mapping().  These are
 		 * the pages which were not successfully invalidated in
 		 * truncate_complete_page().  We try to drop those buffers here
 		 * and if that worked, and the page is no longer mapped into
@@ -419,7 +426,7 @@ shrink_list(struct list_head *page_list,
 			if (!mapping)
 				goto keep_locked;	/* truncate got there first */
 
-			spin_lock(&mapping->page_lock);
+			mapping_wrlock(&mapping->page_lock);
 
 			/*
 			 * The non-racy check for busy page.  It is critical to check
@@ -427,15 +434,15 @@ shrink_list(struct list_head *page_list,
 			 * not in use by anybody. 	(pagecache + us == 2)
 			 */
 			if (page_count(page) != 2 || PageDirty(page)) {
-				spin_unlock(&mapping->page_lock);
+				mapping_wrunlock(&mapping->page_lock);
 				goto keep_locked;
 			}
 
#ifdef CONFIG_SWAP
 			if (PageSwapCache(page)) {
-				swp_entry_t swap = { .val = page->index };
+				swp_entry_t swap = { .val = page->private };
 				__delete_from_swap_cache(page);
-				spin_unlock(&mapping->page_lock);
+				mapping_wrunlock(&mapping->page_lock);
 				swap_free(swap);
 				__put_page(page);	/* The pagecache ref */
 				goto free_it;
@@ -443,7 +450,7 @@ shrink_list(struct list_head *page_list,
#endif /* CONFIG_SWAP */
 
 			__remove_from_page_cache(page);
-			spin_unlock(&mapping->page_lock);
+			mapping_wrunlock(&mapping->page_lock);
 			__put_page(page);
 
 free_it:
@@ -663,13 +670,13 @@ refill_inactive_zone(struct zone *zone,
 		page = list_entry(l_hold.prev, struct page, lru);
 		list_del(&page->lru);
 		if (page_mapped(page)) {
-			pte_chain_lock(page);
+			rmap_lock(page);
 			if (page_mapped(page) && page_referenced(page)) {
-				pte_chain_unlock(page);
+				rmap_unlock(page);
 				list_add(&page->lru, &l_active);
 				continue;
 			}
-			pte_chain_unlock(page);
+			rmap_unlock(page);
 			if (!reclaim_mapped) {
 				list_add(&page->lru, &l_active);
 				continue;
@@ -679,7 +686,7 @@ refill_inactive_zone(struct zone *zone,
 		 * FIXME: need to consider page_count(page) here if/when we
 		 * reap orphaned pages via the LRU (Daniel's locking stuff)
 		 */
-		if (total_swap_pages == 0 && !page->mapping &&
+		if (total_swap_pages == 0 && !page_mapping(page) &&
 		    !PagePrivate(page)) {
 			list_add(&page->lru, &l_active);
 			continue;
@@ -837,6 +844,10 @@ shrink_caches(struct zone *classzone, in
 	}
 	return ret;
 }
+
+#ifndef HAVE_ARCH_PAGETABLE_CACHE
+#define shrink_pagetable_cache(gfp_mask) do { } while (0)
+#endif
 
 /*
  * This is the main entry point to direct page reclaim.
@@ -890,6 +901,9 @@ int try_to_free_pages(struct zone *cz,
 	 */
 	wakeup_bdflush(total_scanned);
 
+	/* shoot down some pagetable caches before napping */
+	shrink_pagetable_cache(gfp_mask);
+
 	/* Take a nap, wait for some writeback to complete */
 	blk_congestion_wait(WRITE, HZ/10);
 	if (cz - cz->zone_pgdat->node_zones < ZONE_HIGHMEM) {
@@ -981,8 +995,10 @@ static int balance_pgdat(pg_data_t *pgda
 		}
 		if (all_zones_ok)
 			break;
-		if (to_free > 0)
+		if (to_free > 0) {
+			shrink_pagetable_cache(GFP_HIGHUSER);
 			blk_congestion_wait(WRITE, HZ/10);
+		}
 	}
 
 	for (i = 0; i < pgdat->nr_zones; i++) {
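The no-op fallback for shrink_pagetable_cache() added to mm/vmscan.c above lets both direct reclaim and kswapd shed architecture-private page table caches before sleeping. No arch-side definition appears in this section, so the following is purely a hypothetical sketch of what opting in might look like (HAVE_ARCH_PAGETABLE_CACHE and shrink_pagetable_cache() are from the patch; everything else here is assumed):

	/* In the architecture's pgalloc.h (hypothetical): */
	#define HAVE_ARCH_PAGETABLE_CACHE
	void shrink_pagetable_cache(int gfp_mask);

	/* In arch code (hypothetical): return cached pagetable pages to
	 * the buddy allocator so reclaim can make forward progress. */
	void shrink_pagetable_cache(int gfp_mask)
	{
		/* drain a per-CPU or per-node cache of pmd/pte pages here */
	}
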
diff -prauN linux-2.6.0-test11/sound/core/pcm_native.c wli-2.6.0-test11-30/sound/core/pcm_native.c
--- linux-2.6.0-test11/sound/core/pcm_native.c	2003-11-26 12:43:32.000000000 -0800
+++ wli-2.6.0-test11-30/sound/core/pcm_native.c	2003-12-04 08:48:31.000000000 -0800
@@ -2779,7 +2779,7 @@ unsigned int snd_pcm_capture_poll(struct
 	return mask;
 }
 
-static struct page * snd_pcm_mmap_status_nopage(struct vm_area_struct *area, unsigned long address, int no_share)
+static struct page * snd_pcm_mmap_status_nopage(struct vm_area_struct *area, unsigned long address, int *type)
 {
 	snd_pcm_substream_t *substream = (snd_pcm_substream_t *)area->vm_private_data;
 	snd_pcm_runtime_t *runtime;
@@ -2791,6 +2791,8 @@ static struct page * snd_pcm_mmap_status
 	page = virt_to_page(runtime->status);
 	if (!PageReserved(page))
 		get_page(page);
+	if (type)
+		*type = VM_FAULT_MINOR;
 	return page;
 }
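This and the remaining sound-driver hunks all chase the ->nopage prototype change recorded in the Documentation/filesystems/Locking hunk at the top of this patch: the unused write_access/no_share argument becomes an int *type out-parameter through which the handler reports its fault type. Each driver follows the same pattern, sketched below for a hypothetical driver (foo_nopage() and foo_find_page() are illustrative assumptions, not code from the patch):

	static struct page *foo_nopage(struct vm_area_struct *vma,
				       unsigned long address, int *type)
	{
		struct page *page = foo_find_page(vma, address);	/* hypothetical lookup */

		if (!page)
			return NOPAGE_SIGBUS;
		get_page(page);		/* hold a reference for the fault path */
		if (type)		/* some callers pass NULL */
			*type = VM_FAULT_MINOR;
		return page;
	}
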
@@ -2817,7 +2819,7 @@ int snd_pcm_mmap_status(snd_pcm_substrea
 	return 0;
 }
 
-static struct page * snd_pcm_mmap_control_nopage(struct vm_area_struct *area, unsigned long address, int no_share)
+static struct page * snd_pcm_mmap_control_nopage(struct vm_area_struct *area, unsigned long address, int *type)
 {
 	snd_pcm_substream_t *substream = (snd_pcm_substream_t *)area->vm_private_data;
 	snd_pcm_runtime_t *runtime;
@@ -2829,6 +2831,8 @@ static struct page * snd_pcm_mmap_contro
 	page = virt_to_page(runtime->control);
 	if (!PageReserved(page))
 		get_page(page);
+	if (type)
+		*type = VM_FAULT_MINOR;
 	return page;
 }
@@ -2867,7 +2871,7 @@ static void snd_pcm_mmap_data_close(stru
 	atomic_dec(&substream->runtime->mmap_count);
 }
 
-static struct page * snd_pcm_mmap_data_nopage(struct vm_area_struct *area, unsigned long address, int no_share)
+static struct page * snd_pcm_mmap_data_nopage(struct vm_area_struct *area, unsigned long address, int *type)
 {
 	snd_pcm_substream_t *substream = (snd_pcm_substream_t *)area->vm_private_data;
 	snd_pcm_runtime_t *runtime;
@@ -2895,6 +2899,8 @@ static struct page * snd_pcm_mmap_data_n
 	}
 	if (!PageReserved(page))
 		get_page(page);
+	if (type)
+		*type = VM_FAULT_MINOR;
 	return page;
 }
diff -prauN linux-2.6.0-test11/sound/oss/emu10k1/audio.c wli-2.6.0-test11-30/sound/oss/emu10k1/audio.c
--- linux-2.6.0-test11/sound/oss/emu10k1/audio.c	2003-11-26 12:42:57.000000000 -0800
+++ wli-2.6.0-test11-30/sound/oss/emu10k1/audio.c	2003-12-04 08:43:29.000000000 -0800
@@ -989,7 +989,7 @@ static int emu10k1_audio_ioctl(struct in
 	return 0;
 }
 
-static struct page *emu10k1_mm_nopage (struct vm_area_struct * vma, unsigned long address, int write_access)
+static struct page *emu10k1_mm_nopage (struct vm_area_struct * vma, unsigned long address, int *type)
 {
 	struct emu10k1_wavedevice *wave_dev = vma->vm_private_data;
 	struct woinst *woinst = wave_dev->woinst;
@@ -1032,6 +1032,8 @@ static struct page *emu10k1_mm_nopage (s
 	get_page (dmapage);
 
 	DPD(3, "page: %#lx\n", (unsigned long) dmapage);
+	if (type)
+		*type = VM_FAULT_MINOR;
 	return dmapage;
 }
diff -prauN linux-2.6.0-test11/sound/oss/via82cxxx_audio.c wli-2.6.0-test11-30/sound/oss/via82cxxx_audio.c
--- linux-2.6.0-test11/sound/oss/via82cxxx_audio.c	2003-11-26 12:43:29.000000000 -0800
+++ wli-2.6.0-test11-30/sound/oss/via82cxxx_audio.c	2003-12-04 08:43:29.000000000 -0800
@@ -2116,7 +2116,7 @@ static void via_dsp_cleanup (struct via_
 
 static struct page * via_mm_nopage (struct vm_area_struct * vma,
-				    unsigned long address, int write_access)
+				    unsigned long address, int *type)
 {
 	struct via_info *card = vma->vm_private_data;
 	struct via_channel *chan = &card->ch_out;
@@ -2124,12 +2124,11 @@ static struct page * via_mm_nopage (stru
 	unsigned long pgoff;
 	int rd, wr;
 
-	DPRINTK ("ENTER, start %lXh, ofs %lXh, pgoff %ld, addr %lXh, wr %d\n",
+	DPRINTK ("ENTER, start %lXh, ofs %lXh, pgoff %ld, addr %lXh\n",
 		 vma->vm_start,
 		 address - vma->vm_start,
 		 (address - vma->vm_start) >> PAGE_SHIFT,
-		 address,
-		 write_access);
+		 address);
 
 	if (address > vma->vm_end) {
 		DPRINTK ("EXIT, returning NOPAGE_SIGBUS\n");
@@ -2167,6 +2166,8 @@ static struct page * via_mm_nopage (stru
 	DPRINTK ("EXIT, returning page %p for cpuaddr %lXh\n",
 		 dmapage, (unsigned long) chan->pgtbl[pgoff].cpuaddr);
 	get_page (dmapage);
+	if (type)
+		*type = VM_FAULT_MINOR;
 	return dmapage;
 }
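One recurring conversion in the mm/ portion of this patch deserves a closing note: mm/truncate.c and mm/vmscan.c take mapping->page_lock through mapping_wrlock()/mapping_wrunlock() rather than spin_lock()/spin_unlock(), i.e. the pagecache lock becomes reader/writer style. Only the writer-side wrappers appear in this section; a minimal sketch of that pattern as the hunks above use it (the existence of matching read-side wrappers elsewhere in the tree is an assumption):

	/* Writer side: removing a page from the pagecache under the
	 * mapping's rw page_lock. */
	mapping_wrlock(&mapping->page_lock);
	__remove_from_page_cache(page);
	mapping_wrunlock(&mapping->page_lock);
	page_cache_release(page);	/* drop the pagecache reference */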