diff -prauN linux-2.5.73/Documentation/filesystems/Locking wli-2.5.73-29/Documentation/filesystems/Locking --- linux-2.5.73/Documentation/filesystems/Locking 2003-06-22 11:33:15.000000000 -0700 +++ wli-2.5.73-29/Documentation/filesystems/Locking 2003-06-23 10:46:31.000000000 -0700 @@ -186,7 +186,7 @@ currently-in-progress I/O. If the filesystem is not called for "sync" and it determines that it would need to block against in-progress I/O to be able to start new I/O against the page the filesystem shoud redirty the page (usually with -__set_page_dirty_nobuffers()), then unlock the page and return zero. +set_page_dirty_nobuffers()), then unlock the page and return zero. This may also be done to avoid internal deadlocks, but rarely. If the filesytem is called for sync then it must wait on any diff -prauN linux-2.5.73/Documentation/vm/locking wli-2.5.73-29/Documentation/vm/locking --- linux-2.5.73/Documentation/vm/locking 2003-06-22 11:32:39.000000000 -0700 +++ wli-2.5.73-29/Documentation/vm/locking 2003-06-23 10:44:16.000000000 -0700 @@ -66,7 +66,7 @@ in some cases it is not really needed. E expand_stack(), it is hard to come up with a destructive scenario without having the vmlist protection in this case. -The page_table_lock nests with the inode i_shared_sem and the kmem cache +The page_table_lock nests with the inode i_shared_lock and the kmem cache c_spinlock spinlocks. This is okay, since the kmem code asks for pages after dropping c_spinlock. The page_table_lock also nests with pagecache_lock and pagemap_lru_lock spinlocks, and no code asks for memory with these locks diff -prauN linux-2.5.73/Makefile wli-2.5.73-29/Makefile --- linux-2.5.73/Makefile 2003-06-22 11:32:58.000000000 -0700 +++ wli-2.5.73-29/Makefile 2003-06-23 10:53:23.000000000 -0700 @@ -214,7 +214,7 @@ NOSTDINC_FLAGS = -nostdinc -iwithprefix CPPFLAGS := -D__KERNEL__ -Iinclude CFLAGS := $(CPPFLAGS) -Wall -Wstrict-prototypes -Wno-trigraphs -O2 \ - -fno-strict-aliasing -fno-common + -fno-strict-aliasing -fno-common -g AFLAGS := -D__ASSEMBLY__ $(CPPFLAGS) export VERSION PATCHLEVEL SUBLEVEL EXTRAVERSION KERNELRELEASE ARCH \ diff -prauN linux-2.5.73/arch/arm/mm/fault-armv.c wli-2.5.73-29/arch/arm/mm/fault-armv.c --- linux-2.5.73/arch/arm/mm/fault-armv.c 2003-06-22 11:32:37.000000000 -0700 +++ wli-2.5.73-29/arch/arm/mm/fault-armv.c 2003-06-23 10:46:31.000000000 -0700 @@ -187,19 +187,22 @@ void __flush_dcache_page(struct page *pa __cpuc_flush_dcache_page(page_address(page)); - if (!page->mapping) + if (!page_mapping(page)) return; /* * With a VIVT cache, we need to also write back * and invalidate any user data. */ - list_for_each(l, &page->mapping->i_mmap_shared) { + list_for_each_rcu(l, &page_mapping(page)->i_mmap_shared) { struct vm_area_struct *mpnt; unsigned long off; mpnt = list_entry(l, struct vm_area_struct, shared); + if (mpnt->vm_flags & VM_DEAD) + continue; + /* * If this VMA is not in our MM, we can ignore it. */ @@ -230,12 +233,15 @@ make_coherent(struct vm_area_struct *vma * space, then we need to handle them specially to maintain * cache coherency. */ - list_for_each(l, &page->mapping->i_mmap_shared) { + list_for_each_rcu(l, &page_mapping(page)->i_mmap_shared) { struct vm_area_struct *mpnt; unsigned long off; mpnt = list_entry(l, struct vm_area_struct, shared); + if (mpnt->vm_flags & VM_DEAD) + continue; + /* * If this VMA is not in our MM, we can ignore it. 
* Note that we intentionally don't mask out the VMA @@ -288,7 +294,7 @@ void update_mmu_cache(struct vm_area_str if (!pfn_valid(pfn)) return; page = pfn_to_page(pfn); - if (page->mapping) { + if (page_mapping(page)) { int dirty = test_and_clear_bit(PG_dcache_dirty, &page->flags); if (dirty) diff -prauN linux-2.5.73/arch/i386/Kconfig wli-2.5.73-29/arch/i386/Kconfig --- linux-2.5.73/arch/i386/Kconfig 2003-06-22 11:32:34.000000000 -0700 +++ wli-2.5.73-29/arch/i386/Kconfig 2003-06-23 10:44:16.000000000 -0700 @@ -397,6 +397,11 @@ config X86_OOSTORE depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 default y +config X86_CMOV + bool + depends on M686 || MPENTIUMII || MPENTIUMIII || MPENTIUM4 || MK8 || MCRUSOE + default y + config HUGETLB_PAGE bool "Huge TLB Page Support" help @@ -723,6 +728,25 @@ config HIGHPTE low memory. Setting this option will put user-space page table entries in high memory. +config HIGHPMD + bool "Allocate 2nd-level pagetables from highmem" + depends on HIGHMEM64G + help + The VM uses one pmd entry for each pagetable page of physical + memory allocated. For systems with extreme amounts of highmem, + this cannot be tolerated. Setting this option will put + userspace 2nd-level pagetables in highmem. + +config 4K_STACK + bool "Use smaller 4k per-task stacks" + help + This option will shrink the kernel's per-task stack from 8k to + 4k. This will greatly increase your chance of overflowing it. + But, if you use the per-cpu interrupt stacks as well, your chances + go way down. Also try the CONFIG_X86_STACK_CHECK overflow + detection. It is much more reliable than the currently in-kernel + version. + config MATH_EMULATION bool "Math emulation" ---help--- @@ -1399,6 +1423,15 @@ config DEBUG_SPINLOCK best used in conjunction with the NMI watchdog so that spinlock deadlocks are also debuggable. +config SPINLINE + bool "Spinlock inlining" + depends on DEBUG_KERNEL + help + This will change spinlocks from out of line to inline, making them + account cost to the callers in readprofile, rather than the lock + itself (as ".text.lock.filename"). This can be helpful for finding + the callers of locks. + config DEBUG_HIGHMEM bool "Highmem debugging" depends on DEBUG_KERNEL && HIGHMEM @@ -1427,6 +1460,25 @@ config FRAME_POINTER If you don't debug the kernel, you can say N, but we may not be able to solve problems without frame pointers. +config X86_STACK_CHECK + bool "Detect stack overflows" + depends on FRAME_POINTER + help + Say Y here to have the kernel attempt to detect when the per-task + kernel stack overflows. This is much more robust checking than + the above overflow check, which will only occasionally detect + an overflow. The level of guarantee here is much greater. + + Some older versions of gcc don't handle the -p option correctly. + Kernprof is affected by the same problem, which is described here: + http://oss.sgi.com/projects/kernprof/faq.html#Q9 + + Basically, if you get oopses in __free_pages_ok during boot when + you have this turned on, you need to fix gcc. The Redhat 2.96 + version and gcc-3.x seem to work. 
+ + If not debugging a stack overflow problem, say N + config X86_EXTRA_IRQS bool depends on X86_LOCAL_APIC || X86_VOYAGER diff -prauN linux-2.5.73/arch/i386/Makefile wli-2.5.73-29/arch/i386/Makefile --- linux-2.5.73/arch/i386/Makefile 2003-06-22 11:32:41.000000000 -0700 +++ wli-2.5.73-29/arch/i386/Makefile 2003-06-23 10:42:51.000000000 -0700 @@ -85,6 +85,10 @@ mcore-$(CONFIG_X86_ES7000) := mach-es700 # default subarch .h files mflags-y += -Iinclude/asm-i386/mach-default +ifdef CONFIG_X86_STACK_CHECK +CFLAGS += -p +endif + head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o libs-y += arch/i386/lib/ diff -prauN linux-2.5.73/arch/i386/boot/compressed/misc.c wli-2.5.73-29/arch/i386/boot/compressed/misc.c --- linux-2.5.73/arch/i386/boot/compressed/misc.c 2003-06-22 11:32:56.000000000 -0700 +++ wli-2.5.73-29/arch/i386/boot/compressed/misc.c 2003-06-23 10:42:51.000000000 -0700 @@ -379,3 +379,7 @@ asmlinkage int decompress_kernel(struct if (high_loaded) close_output_buffer_if_we_run_high(mv); return high_loaded; } + +/* We don't actually check for stack overflows this early. */ +__asm__(".globl mcount ; mcount: ret\n"); + diff -prauN linux-2.5.73/arch/i386/kernel/apic.c wli-2.5.73-29/arch/i386/kernel/apic.c --- linux-2.5.73/arch/i386/kernel/apic.c 2003-06-22 11:33:35.000000000 -0700 +++ wli-2.5.73-29/arch/i386/kernel/apic.c 2003-06-23 10:42:31.000000000 -0700 @@ -1037,7 +1037,8 @@ inline void smp_local_timer_interrupt(st * interrupt as well. Thus we cannot inline the local irq ... ] */ -void smp_apic_timer_interrupt(struct pt_regs regs) +struct pt_regs * IRQHANDLER(smp_apic_timer_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_apic_timer_interrupt(struct pt_regs* regs) { int cpu = smp_processor_id(); @@ -1057,14 +1058,16 @@ void smp_apic_timer_interrupt(struct pt_ * interrupt lock, which is the WrongThing (tm) to do. 
*/ irq_enter(); - smp_local_timer_interrupt(®s); + smp_local_timer_interrupt(regs); irq_exit(); + return regs; } /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -asmlinkage void smp_spurious_interrupt(void) +struct pt_regs * IRQHANDLER(smp_spurious_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_spurious_interrupt(struct pt_regs* regs) { unsigned long v; @@ -1082,13 +1085,15 @@ asmlinkage void smp_spurious_interrupt(v printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never happen.\n", smp_processor_id()); irq_exit(); + return regs; } /* * This interrupt should never happen with our APIC/SMP architecture */ -asmlinkage void smp_error_interrupt(void) +struct pt_regs * IRQHANDLER(smp_error_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_error_interrupt(struct pt_regs* regs) { unsigned long v, v1; @@ -1113,6 +1118,7 @@ asmlinkage void smp_error_interrupt(void printk (KERN_INFO "APIC error on CPU%d: %02lx(%02lx)\n", smp_processor_id(), v , v1); irq_exit(); + return regs; } /* diff -prauN linux-2.5.73/arch/i386/kernel/cpu/mcheck/p4.c wli-2.5.73-29/arch/i386/kernel/cpu/mcheck/p4.c --- linux-2.5.73/arch/i386/kernel/cpu/mcheck/p4.c 2003-06-22 11:32:57.000000000 -0700 +++ wli-2.5.73-29/arch/i386/kernel/cpu/mcheck/p4.c 2003-06-23 10:42:31.000000000 -0700 @@ -61,11 +61,13 @@ static void intel_thermal_interrupt(stru /* Thermal interrupt handler for this CPU setup */ static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; -asmlinkage void smp_thermal_interrupt(struct pt_regs regs) +struct pt_regs * IRQHANDLER(smp_thermal_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_thermal_interrupt(struct pt_regs* regs) { irq_enter(); vendor_thermal_interrupt(®s); irq_exit(); + return regs; } /* P4/Xeon Thermal regulation detect and init */ diff -prauN linux-2.5.73/arch/i386/kernel/entry.S wli-2.5.73-29/arch/i386/kernel/entry.S --- linux-2.5.73/arch/i386/kernel/entry.S 2003-06-22 11:32:38.000000000 -0700 +++ wli-2.5.73-29/arch/i386/kernel/entry.S 2003-06-23 10:42:51.000000000 -0700 @@ -160,7 +160,7 @@ do_lcall: movl %eax,EFLAGS(%ebp) # movl %edx,EIP(%ebp) # Now we move them to their "normal" places movl %ecx,CS(%ebp) # - andl $-8192, %ebp # GET_THREAD_INFO + GET_THREAD_INFO_WITH_ESP(%ebp) # GET_THREAD_INFO movl TI_EXEC_DOMAIN(%ebp), %edx # Get the execution domain call *4(%edx) # Call the lcall7 handler for the domain addl $4, %esp @@ -394,17 +394,78 @@ ENTRY(irq_entries_start) vector=vector+1 .endr + +# lets play optimizing compiler... +#ifdef CONFIG_X86_CMOV +#define COND_MOVE cmovnz %esi,%esp; +#else +#define COND_MOVE \ + jz 1f; \ + mov %esi,%esp; \ +1: +#endif + +# These macros will switch you to, and from a per-cpu interrupt stack +# They take the pt_regs arg and move it from the normal place on the +# stack to %eax. Any handler function can retrieve it using regparm(1). +# The handlers are expected to return the stack to switch back to in +# the same register. +# +# This means that the irq handlers need to return their arg +# +# SWITCH_TO_IRQSTACK clobbers %ebx, %ecx, %edx, %esi +# old stack gets put in %eax + +.macro SWITCH_TO_IRQSTACK + GET_THREAD_INFO(%ebx); + movl TI_IRQ_STACK(%ebx),%ecx; + movl TI_TASK(%ebx),%edx; + movl %esp,%eax; + + # %ecx+THREAD_SIZE is next stack -4 keeps us in the right one + leal (THREAD_SIZE-4)(%ecx),%esi; + + # is there a valid irq_stack? 
+ testl %ecx,%ecx; + COND_MOVE; + + # update the task pointer in the irq stack + GET_THREAD_INFO(%esi); + movl %edx,TI_TASK(%esi); + + # update the preempt count in the irq stack + movl TI_PRE_COUNT(%ebx),%ecx; + movl %ecx,TI_PRE_COUNT(%esi); +.endm + +# copy flags from the irq stack back into the task's thread_info +# %esi is saved over the irq handler call and contains the irq stack's +# thread_info pointer +# %eax was returned from the handler, as described above +# %ebx contains the original thread_info pointer + +.macro RESTORE_FROM_IRQSTACK + movl %eax,%esp; + movl TI_FLAGS(%esi),%eax; + movl $0,TI_FLAGS(%esi); + LOCK orl %eax,TI_FLAGS(%ebx); +.endm + ALIGN common_interrupt: SAVE_ALL + SWITCH_TO_IRQSTACK call do_IRQ + RESTORE_FROM_IRQSTACK jmp ret_from_intr #define BUILD_INTERRUPT(name, nr) \ ENTRY(name) \ pushl $nr-256; \ SAVE_ALL \ - call smp_/**/name; \ + SWITCH_TO_IRQSTACK; \ + call smp_/**/name; \ + RESTORE_FROM_IRQSTACK; \ jmp ret_from_intr; /* The include is where all of the SMP etc. interrupts come from */ @@ -604,6 +665,61 @@ ENTRY(spurious_interrupt_bug) pushl $do_spurious_interrupt_bug jmp error_code + +#ifdef CONFIG_X86_STACK_CHECK +.data + .globl stack_overflowed +stack_overflowed: + .long 0 +.text + +ENTRY(mcount) + push %eax + movl $(THREAD_SIZE - 1),%eax + andl %esp,%eax + cmpl $STACK_WARN,%eax /* more than half the stack is used*/ + jle 1f +2: + popl %eax + ret +1: + lock; btsl $0,stack_overflowed + jc 2b + + # switch to overflow stack + movl %esp,%eax + movl $(stack_overflow_stack + THREAD_SIZE - 4),%esp + + pushf + cli + pushl %eax + + # push eip then esp of error for stack_overflow_panic + pushl 4(%eax) + pushl %eax + + # update the task pointer and cpu in the overflow stack's thread_info. + GET_THREAD_INFO_WITH_ESP(%eax) + movl TI_TASK(%eax),%ebx + movl %ebx,stack_overflow_stack+TI_TASK + movl TI_CPU(%eax),%ebx + movl %ebx,stack_overflow_stack+TI_CPU + + call stack_overflow + + # pop off call arguments + addl $8,%esp + + popl %eax + popf + movl %eax,%esp + popl %eax + movl $0,stack_overflowed + ret + +#warning stack check enabled +#endif + .data ENTRY(sys_call_table) .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ diff -prauN linux-2.5.73/arch/i386/kernel/head.S wli-2.5.73-29/arch/i386/kernel/head.S --- linux-2.5.73/arch/i386/kernel/head.S 2003-06-22 11:32:32.000000000 -0700 +++ wli-2.5.73-29/arch/i386/kernel/head.S 2003-06-23 10:42:09.000000000 -0700 @@ -16,6 +16,7 @@ #include #include #include +#include #define OLD_CL_MAGIC_ADDR 0x90020 #define OLD_CL_MAGIC 0xA33F @@ -325,7 +326,7 @@ rp_sidt: ret ENTRY(stack_start) - .long init_thread_union+8192 + .long init_thread_union+THREAD_SIZE .long __BOOT_DS /* This is the default interrupt "handler" :-) */ diff -prauN linux-2.5.73/arch/i386/kernel/i386_ksyms.c wli-2.5.73-29/arch/i386/kernel/i386_ksyms.c --- linux-2.5.73/arch/i386/kernel/i386_ksyms.c 2003-06-22 11:33:35.000000000 -0700 +++ wli-2.5.73-29/arch/i386/kernel/i386_ksyms.c 2003-06-23 10:55:31.000000000 -0700 @@ -187,10 +187,6 @@ extern void * memcpy(void *,const void * EXPORT_SYMBOL_NOVERS(memcpy); EXPORT_SYMBOL_NOVERS(memset); -#ifdef CONFIG_HAVE_DEC_LOCK -EXPORT_SYMBOL(atomic_dec_and_lock); -#endif - extern int is_sony_vaio_laptop; EXPORT_SYMBOL(is_sony_vaio_laptop); @@ -208,3 +204,8 @@ EXPORT_SYMBOL(kmap_atomic_to_page); EXPORT_SYMBOL(edd); EXPORT_SYMBOL(eddnr); #endif + +#ifdef CONFIG_X86_STACK_CHECK +extern void mcount(void); +EXPORT_SYMBOL(mcount); +#endif diff -prauN linux-2.5.73/arch/i386/kernel/init_task.c 
wli-2.5.73-29/arch/i386/kernel/init_task.c --- linux-2.5.73/arch/i386/kernel/init_task.c 2003-06-22 11:33:32.000000000 -0700 +++ wli-2.5.73-29/arch/i386/kernel/init_task.c 2003-06-23 10:42:51.000000000 -0700 @@ -14,6 +14,14 @@ static struct signal_struct init_signals static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +union thread_union init_irq_union + __attribute__((__section__(".data.init_task"))); + +#ifdef CONFIG_X86_STACK_CHECK +union thread_union stack_overflow_stack + __attribute__((__section__(".data.init_task"))); +#endif + /* * Initial thread structure. * diff -prauN linux-2.5.73/arch/i386/kernel/irq.c wli-2.5.73-29/arch/i386/kernel/irq.c --- linux-2.5.73/arch/i386/kernel/irq.c 2003-06-22 11:32:32.000000000 -0700 +++ wli-2.5.73-29/arch/i386/kernel/irq.c 2003-06-23 10:43:28.000000000 -0700 @@ -403,7 +403,8 @@ void enable_irq(unsigned int irq) * SMP cross-CPU interrupts have their own specific * handlers). */ -asmlinkage unsigned int do_IRQ(struct pt_regs regs) +struct pt_regs * IRQHANDLER(do_IRQ(struct pt_regs *regs)); +struct pt_regs * do_IRQ(struct pt_regs *regs) { /* * We ack quickly, we don't want the irq controller @@ -415,7 +416,7 @@ asmlinkage unsigned int do_IRQ(struct pt * 0 return value means that this irq is already being * handled by some other CPU. (or is disabled) */ - int irq = regs.orig_eax & 0xff; /* high bits used in ret_from_ code */ + int irq = regs->orig_eax & 0xff; /* high bits used in ret_from_ code */ int cpu = smp_processor_id(); irq_desc_t *desc = irq_desc + irq; struct irqaction * action; @@ -429,7 +430,7 @@ asmlinkage unsigned int do_IRQ(struct pt long esp; __asm__ __volatile__("andl %%esp,%0" : - "=r" (esp) : "0" (8191)); + "=r" (esp) : "0" (THREAD_SIZE - 1)); if (unlikely(esp < (sizeof(struct thread_info) + 1024))) { printk("do_IRQ: stack overflow: %ld\n", esp - sizeof(struct thread_info)); @@ -482,7 +483,7 @@ asmlinkage unsigned int do_IRQ(struct pt irqreturn_t action_ret; spin_unlock(&desc->lock); - action_ret = handle_IRQ_event(irq, ®s, action); + action_ret = handle_IRQ_event(irq, regs, action); spin_lock(&desc->lock); if (!noirqdebug) note_interrupt(irq, desc, action_ret); @@ -502,7 +503,7 @@ out: irq_exit(); - return 1; + return regs; } /** diff -prauN linux-2.5.73/arch/i386/kernel/process.c wli-2.5.73-29/arch/i386/kernel/process.c --- linux-2.5.73/arch/i386/kernel/process.c 2003-06-22 11:32:27.000000000 -0700 +++ wli-2.5.73-29/arch/i386/kernel/process.c 2003-06-23 11:01:06.000000000 -0700 @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -160,7 +161,25 @@ static int __init idle_setup (char *str) __setup("idle=", idle_setup); -void show_regs(struct pt_regs * regs) +void stack_overflow(unsigned long esp, unsigned long eip) +{ + int panicing = ((esp&(THREAD_SIZE-1)) <= STACK_PANIC); + + printk( "esp: 0x%lx masked: 0x%lx STACK_PANIC:0x%x %d %d\n", + esp, (esp&(THREAD_SIZE-1)), STACK_PANIC, (((esp&(THREAD_SIZE-1)) <= STACK_PANIC)), panicing ); + + if (panicing) + print_symbol("stack overflow from %s\n", eip); + else + print_symbol("excessive stack use from %s\n", eip); + printk("esp: %p\n", (void*)esp); + show_trace(NULL, (void*)esp); + + if (panicing) + panic("stack overflow\n"); +} + +asmlinkage void show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; @@ -449,6 +468,7 @@ struct task_struct * __switch_to(struct /* never put a printk in __switch_to... 
printk() calls wake_up*() indirectly */ + next_p->thread_info->irq_stack = prev_p->thread_info->irq_stack; unlazy_fpu(prev_p); /* diff -prauN linux-2.5.73/arch/i386/kernel/smp.c wli-2.5.73-29/arch/i386/kernel/smp.c --- linux-2.5.73/arch/i386/kernel/smp.c 2003-06-22 11:32:32.000000000 -0700 +++ wli-2.5.73-29/arch/i386/kernel/smp.c 2003-06-23 10:42:31.000000000 -0700 @@ -305,7 +305,8 @@ static inline void leave_mm (unsigned lo * 2) Leave the mm if we are in the lazy tlb mode. */ -asmlinkage void smp_invalidate_interrupt (void) +struct pt_regs * IRQHANDLER(smp_invalidate_interrupt(struct pt_regs *regs)); +struct pt_regs * smp_invalidate_interrupt(struct pt_regs *regs) { unsigned long cpu; @@ -336,6 +337,7 @@ asmlinkage void smp_invalidate_interrupt out: put_cpu_no_resched(); + return regs; } static void flush_tlb_others (unsigned long cpumask, struct mm_struct *mm, @@ -559,12 +561,15 @@ void smp_send_stop(void) * all the work is done automatically when * we return from the interrupt. */ -asmlinkage void smp_reschedule_interrupt(void) +struct pt_regs *IRQHANDLER(smp_reschedule_interrupt(struct pt_regs *)); +struct pt_regs *smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); + return regs; } -asmlinkage void smp_call_function_interrupt(void) +struct pt_regs *IRQHANDLER(smp_call_function_interrupt(struct pt_regs *)); +struct pt_regs *smp_call_function_interrupt(struct pt_regs *regs) { void (*func) (void *info) = call_data->func; void *info = call_data->info; @@ -588,5 +593,6 @@ asmlinkage void smp_call_function_interr mb(); atomic_inc(&call_data->finished); } + return regs; } diff -prauN linux-2.5.73/arch/i386/kernel/smpboot.c wli-2.5.73-29/arch/i386/kernel/smpboot.c --- linux-2.5.73/arch/i386/kernel/smpboot.c 2003-06-22 11:32:44.000000000 -0700 +++ wli-2.5.73-29/arch/i386/kernel/smpboot.c 2003-06-23 10:42:31.000000000 -0700 @@ -71,6 +71,11 @@ static unsigned long smp_commenced_mask; /* Per CPU bogomips and other parameters */ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; +/* Per CPU interrupt stacks */ +extern union thread_union init_irq_union; +union thread_union *irq_stacks[NR_CPUS] __cacheline_aligned = + { &init_irq_union, }; + /* Set when the idlers are all forked */ int smp_threads_ready; @@ -770,6 +775,28 @@ wakeup_secondary_cpu(int phys_apicid, un } #endif /* WAKE_SECONDARY_VIA_INIT */ +static void __init setup_irq_stack(struct task_struct *p, int cpu) +{ + unsigned long stk; + + stk = __get_free_pages(GFP_KERNEL, THREAD_ORDER); + if (!stk) + panic("I can't seem to allocate my irq stack. Oh well, giving up."); + + irq_stacks[cpu] = (void *)stk; + memset(irq_stacks[cpu], 0, THREAD_SIZE); + irq_stacks[cpu]->thread_info.cpu = cpu; + irq_stacks[cpu]->thread_info.preempt_count = 1; + /* interrupts are not preemptable */ + p->thread_info->irq_stack = &irq_stacks[cpu]->thread_info; + + /* If we want to make the irq stack more than one unit + * deep, we can chain then off of the irq_stack pointer + * here. 
+ */ +} + + extern unsigned long cpu_initialized; static int __init do_boot_cpu(int apicid) @@ -793,6 +820,7 @@ static int __init do_boot_cpu(int apicid idle = fork_by_hand(); if (IS_ERR(idle)) panic("failed fork for CPU %d", cpu); + setup_irq_stack(idle, cpu); wake_up_forked_process(idle); /* diff -prauN linux-2.5.73/arch/i386/kernel/vm86.c wli-2.5.73-29/arch/i386/kernel/vm86.c --- linux-2.5.73/arch/i386/kernel/vm86.c 2003-06-22 11:32:33.000000000 -0700 +++ wli-2.5.73-29/arch/i386/kernel/vm86.c 2003-06-23 10:37:43.000000000 -0700 @@ -127,16 +127,17 @@ struct pt_regs * save_v86_state(struct k return ret; } -static void mark_screen_rdonly(struct task_struct * tsk) +static void mark_screen_rdonly(task_t *task) { + struct mm_struct *mm = task->mm; pgd_t *pgd; pmd_t *pmd; pte_t *pte, *mapped; int i; preempt_disable(); - spin_lock(&tsk->mm->page_table_lock); - pgd = pgd_offset(tsk->mm, 0xA0000); + spin_lock(&mm->page_table_lock); + pgd = pgd_offset(mm, 0xA0000); if (pgd_none(*pgd)) goto out; if (pgd_bad(*pgd)) { @@ -144,23 +145,26 @@ static void mark_screen_rdonly(struct ta pgd_clear(pgd); goto out; } - pmd = pmd_offset(pgd, 0xA0000); - if (pmd_none(*pmd)) + pmd = pmd_offset_map(pgd, 0xA0000); + if (pmd_none(*pmd)) { + pmd_unmap(pmd); goto out; - if (pmd_bad(*pmd)) { + } else if (pmd_bad(*pmd)) { pmd_ERROR(*pmd); pmd_clear(pmd); + pmd_unmap(pmd); goto out; } pte = mapped = pte_offset_map(pmd, 0xA0000); for (i = 0; i < 32; i++) { if (pte_present(*pte)) - set_pte(pte, pte_wrprotect(*pte)); + vm_ptep_set_wrprotect(mm, pte); pte++; } pte_unmap(mapped); + pmd_unmap(pmd); out: - spin_unlock(&tsk->mm->page_table_lock); + spin_unlock(&mm->page_table_lock); preempt_enable(); flush_tlb(); } diff -prauN linux-2.5.73/arch/i386/lib/Makefile wli-2.5.73-29/arch/i386/lib/Makefile --- linux-2.5.73/arch/i386/lib/Makefile 2003-06-22 11:32:31.000000000 -0700 +++ wli-2.5.73-29/arch/i386/lib/Makefile 2003-06-23 10:55:31.000000000 -0700 @@ -8,5 +8,4 @@ lib-y = checksum.o delay.o \ memcpy.o strstr.o lib-$(CONFIG_X86_USE_3DNOW) += mmx.o -lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o lib-$(CONFIG_DEBUG_IOVIRT) += iodebug.o diff -prauN linux-2.5.73/arch/i386/lib/dec_and_lock.c wli-2.5.73-29/arch/i386/lib/dec_and_lock.c --- linux-2.5.73/arch/i386/lib/dec_and_lock.c 2003-06-22 11:32:37.000000000 -0700 +++ wli-2.5.73-29/arch/i386/lib/dec_and_lock.c 1969-12-31 16:00:00.000000000 -0800 @@ -1,40 +0,0 @@ -/* - * x86 version of "atomic_dec_and_lock()" using - * the atomic "cmpxchg" instruction. - * - * (For CPU's lacking cmpxchg, we use the slow - * generic version, and this one never even gets - * compiled). - */ - -#include -#include - -int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) -{ - int counter; - int newcount; - -repeat: - counter = atomic_read(atomic); - newcount = counter-1; - - if (!newcount) - goto slow_path; - - asm volatile("lock; cmpxchgl %1,%2" - :"=a" (newcount) - :"r" (newcount), "m" (atomic->counter), "0" (counter)); - - /* If the above failed, "eax" will have changed */ - if (newcount != counter) - goto repeat; - return 0; - -slow_path: - spin_lock(lock); - if (atomic_dec_and_test(atomic)) - return 1; - spin_unlock(lock); - return 0; -} diff -prauN linux-2.5.73/arch/i386/mm/fault.c wli-2.5.73-29/arch/i386/mm/fault.c --- linux-2.5.73/arch/i386/mm/fault.c 2003-06-22 11:32:28.000000000 -0700 +++ wli-2.5.73-29/arch/i386/mm/fault.c 2003-06-23 10:31:02.000000000 -0700 @@ -330,8 +330,8 @@ vmalloc_fault: * and redundant with the set_pmd() on non-PAE. 
*/ - pmd = pmd_offset(pgd, address); - pmd_k = pmd_offset(pgd_k, address); + pmd = pmd_offset_kernel(pgd, address); + pmd_k = pmd_offset_kernel(pgd_k, address); if (!pmd_present(*pmd_k)) goto no_context; set_pmd(pmd, *pmd_k); diff -prauN linux-2.5.73/arch/i386/mm/highmem.c wli-2.5.73-29/arch/i386/mm/highmem.c --- linux-2.5.73/arch/i386/mm/highmem.c 2003-06-22 11:32:55.000000000 -0700 +++ wli-2.5.73-29/arch/i386/mm/highmem.c 2003-06-23 10:38:47.000000000 -0700 @@ -1,22 +1,5 @@ #include -void *kmap(struct page *page) -{ - might_sleep(); - if (page < highmem_start_page) - return page_address(page); - return kmap_high(page); -} - -void kunmap(struct page *page) -{ - if (in_interrupt()) - BUG(); - if (page < highmem_start_page) - return; - kunmap_high(page); -} - /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -25,40 +8,38 @@ void kunmap(struct page *page) * However when holding an atomic kmap is is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page, enum km_type type) { enum fixed_addresses idx; unsigned long vaddr; - - inc_preempt_count(); - if (page < highmem_start_page) - return page_address(page); + pte_t old_pte, pte = mk_pte(page, kmap_prot); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + old_pte = *(kmap_pte - idx); + #ifdef CONFIG_DEBUG_HIGHMEM - if (!pte_none(*(kmap_pte-idx))) - BUG(); + BUG_ON(!pte_none(old_pte)); #endif - set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); - __flush_tlb_one(vaddr); - return (void*) vaddr; + if (!pte_same(old_pte, pte)) { + set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); + if (!pte_none(old_pte)) + __flush_tlb_one(vaddr); + } + return (void *)vaddr; } -void kunmap_atomic(void *kvaddr, enum km_type type) -{ #ifdef CONFIG_DEBUG_HIGHMEM +void __kunmap_atomic(void *kvaddr, enum km_type type) +{ unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); - if (vaddr < FIXADDR_START) { // FIXME - dec_preempt_count(); + if (vaddr < FIXADDR_START) // FIXME return; - } - if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)) - BUG(); + BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)); /* * force other mappings to Oops if they'll try to access @@ -66,21 +47,15 @@ void kunmap_atomic(void *kvaddr, enum km */ pte_clear(kmap_pte-idx); __flush_tlb_one(vaddr); -#endif - - dec_preempt_count(); } +#endif -struct page *kmap_atomic_to_page(void *ptr) +struct page *__kmap_atomic_to_page(void *ptr) { unsigned long idx, vaddr = (unsigned long)ptr; pte_t *pte; - if (vaddr < FIXADDR_START) - return virt_to_page(ptr); - idx = virt_to_fix(vaddr); pte = kmap_pte - (idx - FIX_KMAP_BEGIN); return pte_page(*pte); } - diff -prauN linux-2.5.73/arch/i386/mm/hugetlbpage.c wli-2.5.73-29/arch/i386/mm/hugetlbpage.c --- linux-2.5.73/arch/i386/mm/hugetlbpage.c 2003-06-22 11:33:17.000000000 -0700 +++ wli-2.5.73-29/arch/i386/mm/hugetlbpage.c 2003-06-23 10:46:31.000000000 -0700 @@ -87,8 +87,8 @@ static pte_t *huge_pte_alloc(struct mm_s pmd_t *pmd = NULL; pgd = pgd_offset(mm, addr); - pmd = pmd_alloc(mm, pgd, addr); - return (pte_t *) pmd; + pmd = pmd_alloc_map(mm, pgd, addr); + return (pte_t *)pmd; } static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) @@ -97,11 +97,13 @@ static pte_t *huge_pte_offset(struct mm_ pmd_t *pmd = NULL; 
pgd = pgd_offset(mm, addr); - pmd = pmd_offset(pgd, addr); - return (pte_t *) pmd; + pmd = pmd_offset_map(pgd, addr); + return (pte_t *)pmd; } -static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page, pte_t * page_table, int write_access) +static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, + struct page *page, pte_t * page_table, + unsigned long addr, int write_access) { pte_t entry; @@ -114,6 +116,7 @@ static void set_huge_pte(struct mm_struc entry = pte_mkyoung(entry); mk_pte_huge(entry); set_pte(page_table, entry); + vm_account_huge_inc(vma, *page_table, addr); } /* @@ -145,6 +148,8 @@ int copy_hugetlb_page_range(struct mm_st ptepage = pte_page(entry); get_page(ptepage); set_pte(dst_pte, entry); + pmd_unmap(dst_pte); + pmd_unmap_nested(src_pte); dst->rss += (HPAGE_SIZE / PAGE_SIZE); addr += HPAGE_SIZE; } @@ -182,6 +187,7 @@ follow_hugetlb_page(struct mm_struct *mm get_page(page); pages[i] = page; + pmd_unmap(pte); } if (vmas) @@ -271,6 +277,7 @@ follow_huge_pmd(struct mm_struct *mm, un page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); get_page(page); } + pmd_unmap(pmd); return page; } #endif @@ -278,7 +285,7 @@ follow_huge_pmd(struct mm_struct *mm, un void free_huge_page(struct page *page) { BUG_ON(page_count(page)); - BUG_ON(page->mapping); + BUG_ON(page_mapping(page)); INIT_LIST_HEAD(&page->list); @@ -314,6 +321,8 @@ void unmap_hugepage_range(struct vm_area page = pte_page(*pte); huge_page_release(page); pte_clear(pte); + vm_account_huge_dec(vma, *pte, address); + pmd_unmap(pte); } mm->rss -= (end - start) >> PAGE_SHIFT; flush_tlb_range(vma, start, end); @@ -358,16 +367,19 @@ int hugetlb_prefault(struct address_spac page = alloc_hugetlb_page(); if (!page) { ret = -ENOMEM; + pmd_unmap(pte); goto out; } ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); unlock_page(page); if (ret) { free_huge_page(page); + pmd_unmap(pte); goto out; } } - set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE); + set_huge_pte(mm, vma, page, pte, addr, vma->vm_flags & VM_WRITE); + pmd_unmap(pte); } out: spin_unlock(&mm->page_table_lock); diff -prauN linux-2.5.73/arch/i386/mm/init.c wli-2.5.73-29/arch/i386/mm/init.c --- linux-2.5.73/arch/i386/mm/init.c 2003-06-22 11:33:06.000000000 -0700 +++ wli-2.5.73-29/arch/i386/mm/init.c 2003-06-23 10:33:55.000000000 -0700 @@ -59,10 +59,10 @@ static pmd_t * __init one_md_table_init( #ifdef CONFIG_X86_PAE pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - if (pmd_table != pmd_offset(pgd, 0)) + if (pmd_table != pmd_offset_kernel(pgd, 0)) BUG(); #else - pmd_table = pmd_offset(pgd, 0); + pmd_table = pmd_offset_kernel(pgd, 0); #endif return pmd_table; @@ -113,7 +113,7 @@ static void __init page_table_range_init if (pgd_none(*pgd)) one_md_table_init(pgd); - pmd = pmd_offset(pgd, vaddr); + pmd = pmd_offset_kernel(pgd, vaddr); for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { if (pmd_none(*pmd)) one_page_table_init(pmd); @@ -194,7 +194,7 @@ pte_t *kmap_pte; pgprot_t kmap_prot; #define kmap_get_fixmap_pte(vaddr) \ - pte_offset_kernel(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)) + pte_offset_kernel(pmd_offset_kernel(pgd_offset_k(vaddr), (vaddr)), (vaddr)) void __init kmap_init(void) { @@ -218,7 +218,7 @@ void __init permanent_kmaps_init(pgd_t * page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); pgd = swapper_pg_dir + pgd_index(vaddr); - pmd = pmd_offset(pgd, vaddr); + pmd = pmd_offset_kernel(pgd, vaddr); 
pte = pte_offset_kernel(pmd, vaddr); pkmap_page_table = pte; } @@ -465,7 +465,7 @@ void __init mem_init(void) /* this will put all low memory onto the freelists */ totalram_pages += __free_all_bootmem(); - + tlb_init(); reservedpages = 0; for (tmp = 0; tmp < max_low_pfn; tmp++) /* @@ -512,20 +512,19 @@ void __init mem_init(void) #endif } -#ifdef CONFIG_X86_PAE -struct kmem_cache_s *pae_pgd_cachep; +kmem_cache_t *pgd_cache; void __init pgtable_cache_init(void) { - /* - * PAE pgds must be 16-byte aligned: - */ - pae_pgd_cachep = kmem_cache_create("pae_pgd", 32, 0, - SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, NULL, NULL); - if (!pae_pgd_cachep) - panic("init_pae(): Cannot alloc pae_pgd SLAB cache"); + pgd_cache = kmem_cache_create("pgd", + PTRS_PER_PGD*sizeof(pgd_t), + 0, + SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, + pgd_ctor, + PTRS_PER_PMD == 1 ? pgd_dtor : NULL); + if (!pgd_cache) + panic("pagetable_cache_init(): Cannot create pgd cache"); } -#endif /* * This function cannot be __init, since exceptions don't work in that diff -prauN linux-2.5.73/arch/i386/mm/ioremap.c wli-2.5.73-29/arch/i386/mm/ioremap.c --- linux-2.5.73/arch/i386/mm/ioremap.c 2003-06-22 11:32:38.000000000 -0700 +++ wli-2.5.73-29/arch/i386/mm/ioremap.c 2003-06-23 10:31:02.000000000 -0700 @@ -82,7 +82,7 @@ static int remap_area_pages(unsigned lon spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_kernel(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.5.73/arch/i386/mm/pageattr.c wli-2.5.73-29/arch/i386/mm/pageattr.c --- linux-2.5.73/arch/i386/mm/pageattr.c 2003-06-22 11:32:43.000000000 -0700 +++ wli-2.5.73-29/arch/i386/mm/pageattr.c 2003-06-23 10:38:47.000000000 -0700 @@ -19,7 +19,7 @@ static inline pte_t *lookup_address(unsi pmd_t *pmd; if (pgd_none(*pgd)) return NULL; - pmd = pmd_offset(pgd, address); + pmd = pmd_offset_kernel(pgd, address); if (pmd_none(*pmd)) return NULL; if (pmd_large(*pmd)) @@ -58,19 +58,22 @@ static void flush_kernel_map(void *dummy static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) { + struct page *page; + unsigned long flags; + set_pte_atomic(kpte, pte); /* change init_mm */ -#ifndef CONFIG_X86_PAE - { - struct list_head *l; - spin_lock(&mmlist_lock); - list_for_each(l, &init_mm.mmlist) { - struct mm_struct *mm = list_entry(l, struct mm_struct, mmlist); - pmd_t *pmd = pmd_offset(pgd_offset(mm, address), address); - set_pte_atomic((pte_t *)pmd, pte); - } - spin_unlock(&mmlist_lock); + if (PTRS_PER_PMD > 1) + return; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pmd_t *pmd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + pmd = pmd_offset_kernel(pgd, address); + set_pte_atomic((pte_t *)pmd, pte); } -#endif + spin_unlock_irqrestore(&pgd_lock, flags); } /* @@ -80,7 +83,7 @@ static void set_pmd_pte(pte_t *kpte, uns static inline void revert_page(struct page *kpte_page, unsigned long address) { pte_t *linear = (pte_t *) - pmd_offset(pgd_offset(&init_mm, address), address); + pmd_offset_kernel(pgd_offset_k(address), address); set_pmd_pte(linear, address, pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); diff -prauN linux-2.5.73/arch/i386/mm/pgtable.c wli-2.5.73-29/arch/i386/mm/pgtable.c --- linux-2.5.73/arch/i386/mm/pgtable.c 2003-06-22 11:33:36.000000000 -0700 +++ wli-2.5.73-29/arch/i386/mm/pgtable.c 2003-06-23 10:33:55.000000000 -0700 @@ -12,6 +12,7 @@ #include #include #include +#include 
#include #include @@ -69,7 +70,7 @@ static void set_pte_pfn(unsigned long va BUG(); return; } - pmd = pmd_offset(pgd, vaddr); + pmd = pmd_offset_kernel(pgd, vaddr); if (pmd_none(*pmd)) { BUG(); return; @@ -109,7 +110,7 @@ void set_pmd_pfn(unsigned long vaddr, un printk ("set_pmd_pfn: pgd_none\n"); return; /* BUG(); */ } - pmd = pmd_offset(pgd, vaddr); + pmd = pmd_offset_kernel(pgd, vaddr); set_pmd(pmd, pfn_pmd(pfn, flags)); /* * It's enough to flush this one mapping. @@ -137,75 +138,142 @@ pte_t *pte_alloc_one_kernel(struct mm_st return pte; } -struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) +void tlb_init(void) { - struct page *pte; + int cpu; + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + int zone; + struct mmu_gather *tlb = &per_cpu(mmu_gathers, cpu); + for (zone = 0; zone < MAX_ZONE_ID; ++zone) { + INIT_LIST_HEAD(&tlb->active_list[zone]); + INIT_LIST_HEAD(&tlb->ready_list[zone]); + } + } +} -#ifdef CONFIG_HIGHPTE - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT, 0); -#else - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0); -#endif - if (pte) - clear_highpage(pte); - return pte; +static inline struct page *pte_alloc_fresh(int gfp_mask) +{ + struct page *page = alloc_page(gfp_mask); + if (page) { + clear_highpage(page); + if (TestSetPagePTE(page)) + BUG(); + } + return page; } -#ifdef CONFIG_X86_PAE +static inline int zone_high(struct zone *zone) +{ + if (!zone) + return 1; + else + return zone - zone->zone_pgdat->node_zones >= ZONE_HIGHMEM; +} -pgd_t *pgd_alloc(struct mm_struct *mm) +static inline struct page *pte_alloc_ready(int gfp_flags) { - int i; - pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL); + struct mmu_gather *tlb = &per_cpu(mmu_gathers, get_cpu()); + struct page *page = NULL; - if (pgd) { - for (i = 0; i < USER_PTRS_PER_PGD; i++) { - unsigned long pmd = __get_free_page(GFP_KERNEL); - if (!pmd) - goto out_oom; - clear_page(pmd); - set_pgd(pgd + i, __pgd(1 + __pa(pmd))); + if (tlb->nr_pte_ready) { + int z; + for (z = MAX_ZONE_ID - 1; z >= 0; --z) { + struct zone *zone = zone_table[z]; + if (!(gfp_flags & __GFP_HIGHMEM) && zone_high(zone)) + continue; + if (!list_empty(&tlb->ready_list[z])) + break; } - memcpy(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + page = list_entry(tlb->ready_list[z].next, struct page, list); + if (TestSetPagePTE(page)) + BUG(); + list_del(&page->list); + tlb->ready_count[z]--; + tlb->nr_pte_ready--; } - return pgd; -out_oom: - for (i--; i >= 0; i--) - free_page((unsigned long)__va(pgd_val(pgd[i])-1)); - kmem_cache_free(pae_pgd_cachep, pgd); - return NULL; + put_cpu(); + return page; } -void pgd_free(pgd_t *pgd) +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { - int i; + struct page *page = pte_alloc_ready(GFP_PTE); + return page ? page : pte_alloc_fresh(GFP_PTE); +} - for (i = 0; i < USER_PTRS_PER_PGD; i++) - free_page((unsigned long)__va(pgd_val(pgd[i])-1)); - kmem_cache_free(pae_pgd_cachep, pgd); +static inline struct page *__pmd_alloc_one(void) +{ + struct page *page = pte_alloc_ready(GFP_PMD); + return page ? 
page : pte_alloc_fresh(GFP_PMD); } -#else +LIST_HEAD(pgd_list); +spinlock_t pgd_lock = SPIN_LOCK_UNLOCKED; -pgd_t *pgd_alloc(struct mm_struct *mm) +void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) { - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); + unsigned long flags; + + if (PTRS_PER_PMD == 1) + spin_lock_irqsave(&pgd_lock, flags); - if (pgd) { - memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); - memcpy(pgd + USER_PTRS_PER_PGD, + memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + (PTRS_PER_PGD - USER_PTRS_PER_PGD)*sizeof(pgd_t)); + + if (PTRS_PER_PMD > 1) + return; + + list_add(&virt_to_page(pgd)->lru, &pgd_list); + spin_unlock_irqrestore(&pgd_lock, flags); + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); +} + +void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) +{ + unsigned long flags; + + spin_lock_irqsave(&pgd_lock, flags); + list_del(&virt_to_page(pgd)->lru); + spin_unlock_irqrestore(&pgd_lock, flags); +} + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + int i; + pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); + + if (PTRS_PER_PMD == 1 || !pgd) + return pgd; + + for (i = 0; i < USER_PTRS_PER_PGD; i++) { + struct page *pmd = __pmd_alloc_one(); + if (!pmd) + goto out_oom; + set_pgd(&pgd[i], __pgd(1ULL | (u64)page_to_pfn(pmd) << PAGE_SHIFT)); } + return pgd; + + /* + * This looks unusual. pte_free() is actually a convenient wrapper + * for queueing up preconstructed pmd and/or pte pages. The cases + * fall through to just queueing them in the per-cpu lists. + */ +out_oom: + for (i--; i >= 0; i--) + pte_free(pgd_page(pgd[i])); + kmem_cache_free(pgd_cache, pgd); + return NULL; } + void pgd_free(pgd_t *pgd) { - free_page((unsigned long)pgd); + if (PTRS_PER_PMD > 1) { + int i; + for (i = 0; i < USER_PTRS_PER_PGD; i++) + pte_free(pgd_page(pgd[i])); + } + kmem_cache_free(pgd_cache, pgd); } - -#endif /* CONFIG_X86_PAE */ - diff -prauN linux-2.5.73/arch/i386/pci/numa.c wli-2.5.73-29/arch/i386/pci/numa.c --- linux-2.5.73/arch/i386/pci/numa.c 2003-06-22 11:32:36.000000000 -0700 +++ wli-2.5.73-29/arch/i386/pci/numa.c 2003-06-23 16:24:51.000000000 -0700 @@ -115,7 +115,7 @@ static int __init pci_numa_init(void) return 0; pci_root_bus = pcibios_scan_root(0); - if (numnodes > 1) { + if (0 && numnodes > 1) { for (quad = 1; quad < numnodes; ++quad) { printk("Scanning PCI bus %d for quad %d\n", QUADLOCAL2BUS(quad,0), quad); diff -prauN linux-2.5.73/arch/ia64/ia32/binfmt_elf32.c wli-2.5.73-29/arch/ia64/ia32/binfmt_elf32.c --- linux-2.5.73/arch/ia64/ia32/binfmt_elf32.c 2003-06-22 11:33:17.000000000 -0700 +++ wli-2.5.73-29/arch/ia64/ia32/binfmt_elf32.c 2003-06-23 10:37:43.000000000 -0700 @@ -201,7 +201,8 @@ ia32_setup_arg_pages (struct linux_binpr struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current, page, stack_base, PAGE_COPY); + put_dirty_page(current, mpnt, page, + stack_base, PAGE_COPY); } stack_base += PAGE_SIZE; } diff -prauN linux-2.5.73/arch/ia64/mm/hugetlbpage.c wli-2.5.73-29/arch/ia64/mm/hugetlbpage.c --- linux-2.5.73/arch/ia64/mm/hugetlbpage.c 2003-06-22 11:32:54.000000000 -0700 +++ wli-2.5.73-29/arch/ia64/mm/hugetlbpage.c 2003-06-23 10:46:31.000000000 -0700 @@ -223,7 +223,7 @@ follow_huge_pmd(struct mm_struct *mm, un void free_huge_page(struct page *page) { BUG_ON(page_count(page)); - BUG_ON(page->mapping); + BUG_ON(page_mapping(page)); INIT_LIST_HEAD(&page->list); diff -prauN linux-2.5.73/arch/parisc/kernel/cache.c 
wli-2.5.73-29/arch/parisc/kernel/cache.c --- linux-2.5.73/arch/parisc/kernel/cache.c 2003-06-22 11:32:43.000000000 -0700 +++ wli-2.5.73-29/arch/parisc/kernel/cache.c 2003-06-23 10:46:31.000000000 -0700 @@ -64,7 +64,7 @@ update_mmu_cache(struct vm_area_struct * { struct page *page = pte_page(pte); - if (VALID_PAGE(page) && page->mapping && + if (VALID_PAGE(page) && page_mapping(page) && test_bit(PG_dcache_dirty, &page->flags)) { flush_kernel_dcache_page(page_address(page)); @@ -230,14 +230,16 @@ void __flush_dcache_page(struct page *pa flush_kernel_dcache_page(page_address(page)); - if (!page->mapping) + if (!page_mapping(page)) return; - list_for_each(l, &page->mapping->i_mmap_shared) { + list_for_each_rcu(l, &page_mapping(page)->i_mmap_shared) { struct vm_area_struct *mpnt; unsigned long off; mpnt = list_entry(l, struct vm_area_struct, shared); + if (mpnt->vm_flags & VM_DEAD) + continue; /* * If this VMA is not in our MM, we can ignore it. diff -prauN linux-2.5.73/arch/ppc/mm/init.c wli-2.5.73-29/arch/ppc/mm/init.c --- linux-2.5.73/arch/ppc/mm/init.c 2003-06-22 11:32:37.000000000 -0700 +++ wli-2.5.73-29/arch/ppc/mm/init.c 2003-06-23 10:46:31.000000000 -0700 @@ -472,14 +472,14 @@ void __init mem_init(void) printk(KERN_INFO "AGP special page: 0x%08lx\n", agp_special_page); #endif - /* Make sure all our pagetable pages have page->mapping + /* Make sure all our pagetable pages have page_mapping(page) and page->index set correctly. */ for (addr = KERNELBASE; addr != 0; addr += PGDIR_SIZE) { struct page *pg; pmd_t *pmd = pmd_offset(pgd_offset_k(addr), addr); if (pmd_present(*pmd)) { pg = pmd_page(*pmd); - pg->mapping = (void *) &init_mm; + set_page_mapping(pg, &init_mm); pg->index = addr; } } diff -prauN linux-2.5.73/arch/s390/kernel/compat_exec.c wli-2.5.73-29/arch/s390/kernel/compat_exec.c --- linux-2.5.73/arch/s390/kernel/compat_exec.c 2003-06-22 11:33:15.000000000 -0700 +++ wli-2.5.73-29/arch/s390/kernel/compat_exec.c 2003-06-23 10:37:43.000000000 -0700 @@ -80,7 +80,8 @@ int setup_arg_pages32(struct linux_binpr struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current,page,stack_base,PAGE_COPY); + put_dirty_page(current, mpnt, page, + stack_base, PAGE_COPY); } stack_base += PAGE_SIZE; } diff -prauN linux-2.5.73/arch/sparc/mm/srmmu.c wli-2.5.73-29/arch/sparc/mm/srmmu.c --- linux-2.5.73/arch/sparc/mm/srmmu.c 2003-06-22 11:32:56.000000000 -0700 +++ wli-2.5.73-29/arch/sparc/mm/srmmu.c 2003-06-23 10:31:02.000000000 -0700 @@ -2180,7 +2180,7 @@ void __init ld_mmu_srmmu(void) BTFIXUPSET_CALL(pte_pfn, srmmu_pte_pfn, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(pmd_page, srmmu_pmd_page, BTFIXUPCALL_NORM); - BTFIXUPSET_CALL(pgd_page, srmmu_pgd_page, BTFIXUPCALL_NORM); + BTFIXUPSET_CALL(__pgd_page, srmmu_pgd_page, BTFIXUPCALL_NORM); BTFIXUPSET_SETHI(none_mask, 0xF0000000); diff -prauN linux-2.5.73/arch/sparc/mm/sun4c.c wli-2.5.73-29/arch/sparc/mm/sun4c.c --- linux-2.5.73/arch/sparc/mm/sun4c.c 2003-06-22 11:33:06.000000000 -0700 +++ wli-2.5.73-29/arch/sparc/mm/sun4c.c 2003-06-23 10:31:02.000000000 -0700 @@ -2252,5 +2252,5 @@ void __init ld_mmu_sun4c(void) /* These should _never_ get called with two level tables. 
*/ BTFIXUPSET_CALL(pgd_set, sun4c_pgd_set, BTFIXUPCALL_NOP); - BTFIXUPSET_CALL(pgd_page, sun4c_pgd_page, BTFIXUPCALL_RETO0); + BTFIXUPSET_CALL(__pgd_page, sun4c_pgd_page, BTFIXUPCALL_RETO0); } diff -prauN linux-2.5.73/arch/sparc64/kernel/smp.c wli-2.5.73-29/arch/sparc64/kernel/smp.c --- linux-2.5.73/arch/sparc64/kernel/smp.c 2003-06-22 11:32:27.000000000 -0700 +++ wli-2.5.73-29/arch/sparc64/kernel/smp.c 2003-06-23 10:46:31.000000000 -0700 @@ -675,9 +675,9 @@ static __inline__ void __local_flush_dca #if (L1DCACHE_SIZE > PAGE_SIZE) __flush_dcache_page(page->virtual, ((tlb_type == spitfire) && - page->mapping != NULL)); + page_mapping(page) != NULL)); #else - if (page->mapping != NULL && + if (page_mapping(page) != NULL && tlb_type == spitfire) __flush_icache_page(__pa(page->virtual)); #endif @@ -698,7 +698,7 @@ void smp_flush_dcache_page_impl(struct p if (tlb_type == spitfire) { data0 = ((u64)&xcall_flush_dcache_page_spitfire); - if (page->mapping != NULL) + if (page_mapping(page) != NULL) data0 |= ((u64)1 << 32); spitfire_xcall_deliver(data0, __pa(page->virtual), @@ -729,7 +729,7 @@ void flush_dcache_page_all(struct mm_str goto flush_self; if (tlb_type == spitfire) { data0 = ((u64)&xcall_flush_dcache_page_spitfire); - if (page->mapping != NULL) + if (page_mapping(page) != NULL) data0 |= ((u64)1 << 32); spitfire_xcall_deliver(data0, __pa(page->virtual), diff -prauN linux-2.5.73/arch/sparc64/mm/hugetlbpage.c wli-2.5.73-29/arch/sparc64/mm/hugetlbpage.c --- linux-2.5.73/arch/sparc64/mm/hugetlbpage.c 2003-06-22 11:32:58.000000000 -0700 +++ wli-2.5.73-29/arch/sparc64/mm/hugetlbpage.c 2003-06-23 10:46:31.000000000 -0700 @@ -74,8 +74,8 @@ static struct page *alloc_hugetlb_page(v static void free_hugetlb_page(struct page *page) { spin_lock(&htlbpage_lock); - if ((page->mapping != NULL) && (page_count(page) == 2)) { - struct inode *inode = page->mapping->host; + if ((page_mapping(page) != NULL) && (page_count(page) == 2)) { + struct inode *inode = page_mapping(page)->host; int i; ClearPageDirty(page); diff -prauN linux-2.5.73/arch/sparc64/mm/init.c wli-2.5.73-29/arch/sparc64/mm/init.c --- linux-2.5.73/arch/sparc64/mm/init.c 2003-06-22 11:32:41.000000000 -0700 +++ wli-2.5.73-29/arch/sparc64/mm/init.c 2003-06-23 10:46:31.000000000 -0700 @@ -129,9 +129,9 @@ __inline__ void flush_dcache_page_impl(s #if (L1DCACHE_SIZE > PAGE_SIZE) __flush_dcache_page(page->virtual, ((tlb_type == spitfire) && - page->mapping != NULL)); + page_mapping(page) != NULL)); #else - if (page->mapping != NULL && + if (page_mapping(page) != NULL && tlb_type == spitfire) __flush_icache_page(__pa(page->virtual)); #endif @@ -193,7 +193,7 @@ void update_mmu_cache(struct vm_area_str pfn = pte_pfn(pte); if (pfn_valid(pfn) && - (page = pfn_to_page(pfn), page->mapping) && + (page = pfn_to_page(pfn), page_mapping(page)) && ((pg_flags = page->flags) & (1UL << PG_dcache_dirty))) { int cpu = ((pg_flags >> 24) & (NR_CPUS - 1UL)); @@ -217,9 +217,9 @@ void flush_dcache_page(struct page *page int dirty = test_bit(PG_dcache_dirty, &page->flags); int dirty_cpu = dcache_dirty_cpu(page); - if (page->mapping && - list_empty(&page->mapping->i_mmap) && - list_empty(&page->mapping->i_mmap_shared)) { + if (page_mapping(page) && + list_empty(&page_mapping(page)->i_mmap) && + list_empty(&page_mapping(page)->i_mmap_shared)) { if (dirty) { if (dirty_cpu == smp_processor_id()) return; @@ -227,7 +227,7 @@ void flush_dcache_page(struct page *page } set_dcache_dirty(page); } else { - /* We could delay the flush for the !page->mapping + /* We could delay the flush 
for the !page_mapping(page) * case too. But that case is for exec env/arg * pages and those are %99 certainly going to get * faulted into the tlb (and thus flushed) anyways. @@ -269,7 +269,7 @@ static inline void flush_cache_pte_range if (!pfn_valid(pfn)) continue; page = pfn_to_page(pfn); - if (PageReserved(page) || !page->mapping) + if (PageReserved(page) || !page_mapping(page)) continue; pgaddr = (unsigned long) page_address(page); uaddr = address + offset; diff -prauN linux-2.5.73/arch/sparc64/mm/ultra.S wli-2.5.73-29/arch/sparc64/mm/ultra.S --- linux-2.5.73/arch/sparc64/mm/ultra.S 2003-06-22 11:33:01.000000000 -0700 +++ wli-2.5.73-29/arch/sparc64/mm/ultra.S 2003-06-23 10:46:31.000000000 -0700 @@ -615,7 +615,7 @@ xcall_flush_dcache_page_cheetah: /* %g1 .globl xcall_flush_dcache_page_spitfire xcall_flush_dcache_page_spitfire: /* %g1 == physical page address %g7 == kernel page virtual address - %g5 == (page->mapping != NULL) */ + %g5 == (page_mapping(page) != NULL) */ #if (L1DCACHE_SIZE > PAGE_SIZE) srlx %g1, (13 - 2), %g1 ! Form tag comparitor sethi %hi(L1DCACHE_SIZE), %g3 ! D$ size == 16K diff -prauN linux-2.5.73/arch/x86_64/ia32/ia32_binfmt.c wli-2.5.73-29/arch/x86_64/ia32/ia32_binfmt.c --- linux-2.5.73/arch/x86_64/ia32/ia32_binfmt.c 2003-06-22 11:32:31.000000000 -0700 +++ wli-2.5.73-29/arch/x86_64/ia32/ia32_binfmt.c 2003-06-23 10:37:43.000000000 -0700 @@ -363,7 +363,8 @@ int setup_arg_pages(struct linux_binprm struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current,page,stack_base,PAGE_COPY_EXEC); + put_dirty_page(current, mpnt, page, + stack_base, PAGE_COPY_EXEC); } stack_base += PAGE_SIZE; } diff -prauN linux-2.5.73/drivers/char/drm/drm_memory.h wli-2.5.73-29/drivers/char/drm/drm_memory.h --- linux-2.5.73/drivers/char/drm/drm_memory.h 2003-06-22 11:32:35.000000000 -0700 +++ wli-2.5.73-29/drivers/char/drm/drm_memory.h 2003-06-23 10:31:02.000000000 -0700 @@ -123,7 +123,7 @@ static inline unsigned long drm_follow_page (void *vaddr) { pgd_t *pgd = pgd_offset_k((unsigned long) vaddr); - pmd_t *pmd = pmd_offset(pgd, (unsigned long) vaddr); + pmd_t *pmd = pmd_offset_kernel(pgd, (unsigned long)vaddr); pte_t *ptep = pte_offset_kernel(pmd, (unsigned long) vaddr); return pte_pfn(*ptep) << PAGE_SHIFT; } diff -prauN linux-2.5.73/drivers/pci/hotplug.c wli-2.5.73-29/drivers/pci/hotplug.c --- linux-2.5.73/drivers/pci/hotplug.c 2003-06-22 11:33:17.000000000 -0700 +++ wli-2.5.73-29/drivers/pci/hotplug.c 2003-06-23 23:35:42.000000000 -0700 @@ -172,6 +172,7 @@ int pci_visit_dev (struct pci_visit *fn, return result; } EXPORT_SYMBOL(pci_visit_dev); +#endif /* CONFIG_HOTPLUG */ static void pci_destroy_dev(struct pci_dev *dev) { @@ -191,6 +192,7 @@ static void pci_destroy_dev(struct pci_d pci_dev_put(dev); } +#ifdef CONFIG_HOTPLUG /** * pci_remove_device_safe - remove an unused hotplug device * @dev: the device to remove diff -prauN linux-2.5.73/fs/adfs/inode.c wli-2.5.73-29/fs/adfs/inode.c --- linux-2.5.73/fs/adfs/inode.c 2003-06-22 11:32:39.000000000 -0700 +++ wli-2.5.73-29/fs/adfs/inode.c 2003-06-23 10:46:31.000000000 -0700 @@ -64,7 +64,7 @@ static int adfs_readpage(struct file *fi static int adfs_prepare_write(struct file *file, struct page *page, unsigned int from, unsigned int to) { return cont_prepare_write(page, from, to, adfs_get_block, - &ADFS_I(page->mapping->host)->mmu_private); + &ADFS_I(page_mapping(page)->host)->mmu_private); } static sector_t _adfs_bmap(struct address_space *mapping, sector_t block) diff -prauN linux-2.5.73/fs/affs/file.c 
wli-2.5.73-29/fs/affs/file.c --- linux-2.5.73/fs/affs/file.c 2003-06-22 11:32:44.000000000 -0700 +++ wli-2.5.73-29/fs/affs/file.c 2003-06-23 10:46:31.000000000 -0700 @@ -418,7 +418,7 @@ static int affs_readpage(struct file *fi static int affs_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { return cont_prepare_write(page, from, to, affs_get_block, - &AFFS_I(page->mapping->host)->mmu_private); + &AFFS_I(page_mapping(page)->host)->mmu_private); } static sector_t _affs_bmap(struct address_space *mapping, sector_t block) { @@ -507,7 +507,7 @@ affs_file_write(struct file *file, const static int affs_do_readpage_ofs(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct super_block *sb = inode->i_sb; struct buffer_head *bh; char *data; @@ -615,7 +615,7 @@ out: static int affs_readpage_ofs(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; u32 to; int err; @@ -635,7 +635,7 @@ affs_readpage_ofs(struct file *file, str static int affs_prepare_write_ofs(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; u32 size, offset; u32 tmp; int err = 0; @@ -676,7 +676,7 @@ static int affs_prepare_write_ofs(struct static int affs_commit_write_ofs(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct super_block *sb = inode->i_sb; struct buffer_head *bh, *prev_bh; char *data; diff -prauN linux-2.5.73/fs/affs/symlink.c wli-2.5.73-29/fs/affs/symlink.c --- linux-2.5.73/fs/affs/symlink.c 2003-06-22 11:32:35.000000000 -0700 +++ wli-2.5.73-29/fs/affs/symlink.c 2003-06-23 10:46:31.000000000 -0700 @@ -20,7 +20,7 @@ static int affs_symlink_readpage(struct file *file, struct page *page) { struct buffer_head *bh; - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; char *link = kmap(page); struct slink_front *lf; int err; diff -prauN linux-2.5.73/fs/afs/file.c wli-2.5.73-29/fs/afs/file.c --- linux-2.5.73/fs/afs/file.c 2003-06-22 11:33:03.000000000 -0700 +++ wli-2.5.73-29/fs/afs/file.c 2003-06-23 10:46:31.000000000 -0700 @@ -75,7 +75,7 @@ static int afs_file_readpage(struct file afs_vnode_t *vnode; int ret; - inode = page->mapping->host; + inode = page_mapping(page)->host; _enter("{%lu},{%lu}",inode->i_ino,page->index); diff -prauN linux-2.5.73/fs/buffer.c wli-2.5.73-29/fs/buffer.c --- linux-2.5.73/fs/buffer.c 2003-06-22 11:32:56.000000000 -0700 +++ wli-2.5.73-29/fs/buffer.c 2003-06-23 10:46:31.000000000 -0700 @@ -764,7 +764,7 @@ void write_boundary_block(struct block_d void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) { struct address_space *mapping = inode->i_mapping; - struct address_space *buffer_mapping = bh->b_page->mapping; + struct address_space *buffer_mapping = page_mapping(bh->b_page); mark_buffer_dirty(bh); if (!mapping->assoc_mapping) { @@ -809,19 +809,10 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * * FIXME: may need to call ->reservepage here as well. That's rather up to the * address_space though. - * - * For now, we treat swapper_space specially. It doesn't use the normal - * block a_ops. 
*/ -int __set_page_dirty_buffers(struct page *page) +int set_page_dirty_buffers(struct page *page) { - struct address_space * const mapping = page->mapping; - int ret = 0; - - if (mapping == NULL) { - SetPageDirty(page); - goto out; - } + struct address_space * const mapping = page_mapping(page); spin_lock(&mapping->private_lock); if (page_has_buffers(page)) { @@ -839,21 +830,19 @@ int __set_page_dirty_buffers(struct page spin_unlock(&mapping->private_lock); if (!TestSetPageDirty(page)) { - spin_lock(&mapping->page_lock); - if (page->mapping) { /* Race with truncate? */ + mapping_wrlock(&mapping->page_lock); + if (page_mapping(page)) { /* Race with truncate? */ if (!mapping->backing_dev_info->memory_backed) inc_page_state(nr_dirty); list_del(&page->list); list_add(&page->list, &mapping->dirty_pages); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } - -out: - return ret; + return 0; } -EXPORT_SYMBOL(__set_page_dirty_buffers); +EXPORT_SYMBOL(set_page_dirty_buffers); /* * Write out and wait upon a list of buffers. @@ -1225,7 +1214,7 @@ __getblk_slow(struct block_device *bdev, * address_space's dirty_pages list and then attach the address_space's * inode to its superblock's dirty inode list. * - * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, + * mark_buffer_dirty() is atomic. It takes page_mapping(bh->b_page)->private_lock, * mapping->page_lock and the global inode_lock. */ void mark_buffer_dirty(struct buffer_head *bh) @@ -1233,7 +1222,7 @@ void mark_buffer_dirty(struct buffer_hea if (!buffer_uptodate(bh)) buffer_error(); if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh)) - __set_page_dirty_nobuffers(bh->b_page); + set_page_dirty_nobuffers(bh->b_page); } /* @@ -1261,7 +1250,7 @@ void __bforget(struct buffer_head *bh) { clear_buffer_dirty(bh); if (!list_empty(&bh->b_assoc_buffers)) { - struct address_space *buffer_mapping = bh->b_page->mapping; + struct address_space *buffer_mapping = page_mapping(bh->b_page); spin_lock(&buffer_mapping->private_lock); list_del_init(&bh->b_assoc_buffers); @@ -1538,7 +1527,7 @@ static inline void discard_buffer(struct */ int try_to_release_page(struct page *page, int gfp_mask) { - struct address_space * const mapping = page->mapping; + struct address_space * const mapping = page_mapping(page); if (!PageLocked(page)) BUG(); @@ -1604,7 +1593,7 @@ EXPORT_SYMBOL(block_invalidatepage); /* * We attach and possibly dirty the buffers atomically wrt - * __set_page_dirty_buffers() via private_lock. try_to_free_buffers + * set_page_dirty_buffers() via private_lock. try_to_free_buffers * is already excluded via the page lock. */ void create_empty_buffers(struct page *page, @@ -1621,7 +1610,7 @@ void create_empty_buffers(struct page *p } while (bh); tail->b_this_page = head; - spin_lock(&page->mapping->private_lock); + spin_lock(&page_mapping(page)->private_lock); if (PageUptodate(page) || PageDirty(page)) { bh = head; do { @@ -1633,7 +1622,7 @@ void create_empty_buffers(struct page *p } while (bh != head); } __set_page_buffers(page, head); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page_mapping(page)->private_lock); } EXPORT_SYMBOL(create_empty_buffers); @@ -1717,12 +1706,12 @@ static int __block_write_full_page(struc } /* - * Be very careful. We have no exclusion from __set_page_dirty_buffers + * Be very careful. We have no exclusion from set_page_dirty_buffers * here, and the (potentially unmapped) buffers may become dirty at * any time. 
If a buffer becomes dirty here after we've inspected it * then we just miss that fact, and the page stays dirty. * - * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; + * Buffers outside i_size may be dirtied by set_page_dirty_buffers; * handle that here by just cleaning them. */ @@ -1773,7 +1762,7 @@ static int __block_write_full_page(struc lock_buffer(bh); } else { if (test_set_buffer_locked(bh)) { - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); continue; } } @@ -2026,7 +2015,7 @@ static int __block_commit_write(struct i */ int block_read_full_page(struct page *page, get_block_t *get_block) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; sector_t iblock, lblock; struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; unsigned int blocksize; @@ -2166,7 +2155,7 @@ out: int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, loff_t *bytes) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); struct inode *inode = mapping->host; struct page *new_page; unsigned long pgpos; @@ -2248,7 +2237,7 @@ out: int block_prepare_write(struct page *page, unsigned from, unsigned to, get_block_t *get_block) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int err = __block_prepare_write(inode, page, from, to, get_block); if (err) ClearPageUptodate(page); @@ -2257,7 +2246,7 @@ int block_prepare_write(struct page *pag int block_commit_write(struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; __block_commit_write(inode,page,from,to); return 0; } @@ -2265,7 +2254,7 @@ int block_commit_write(struct page *page int generic_commit_write(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; __block_commit_write(inode,page,from,to); if (pos > inode->i_size) { @@ -2282,7 +2271,7 @@ int generic_commit_write(struct file *fi int nobh_prepare_write(struct page *page, unsigned from, unsigned to, get_block_t *get_block) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; const unsigned blkbits = inode->i_blkbits; const unsigned blocksize = 1 << blkbits; struct buffer_head map_bh; @@ -2416,7 +2405,7 @@ EXPORT_SYMBOL(nobh_prepare_write); int nobh_commit_write(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; set_page_dirty(page); @@ -2550,7 +2539,7 @@ out: int block_write_full_page(struct page *page, get_block_t *get_block, struct writeback_control *wbc) { - struct inode * const inode = page->mapping->host; + struct inode * const inode = page_mapping(page)->host; const unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; unsigned offset; void *kaddr; @@ -2725,9 +2714,9 @@ void sync_dirty_buffer(struct buffer_hea static void check_ttfb_buffer(struct page *page, struct buffer_head *bh) { if (!buffer_uptodate(bh) && !buffer_req(bh)) { - if (PageUptodate(page) && page->mapping + if (PageUptodate(page) && page_mapping(page) && buffer_mapped(bh) /* discard_buffer */ - && S_ISBLK(page->mapping->host->i_mode)) + && 
S_ISBLK(page_mapping(page)->host->i_mode)) { buffer_error(); } @@ -2749,7 +2738,7 @@ static void check_ttfb_buffer(struct pag * * The same applies to regular filesystem pages: if all the buffers are * clean then we set the page clean and proceed. To do that, we require - * total exclusion from __set_page_dirty_buffers(). That is obtained with + * total exclusion from set_page_dirty_buffers(). That is obtained with * private_lock. * * try_to_free_buffers() is non-blocking. @@ -2796,7 +2785,7 @@ failed: int try_to_free_buffers(struct page *page) { - struct address_space * const mapping = page->mapping; + struct address_space * const mapping = page_mapping(page); struct buffer_head *buffers_to_free = NULL; int ret = 0; diff -prauN linux-2.5.73/fs/cifs/file.c wli-2.5.73-29/fs/cifs/file.c --- linux-2.5.73/fs/cifs/file.c 2003-06-22 11:32:28.000000000 -0700 +++ wli-2.5.73-29/fs/cifs/file.c 2003-06-23 10:46:31.000000000 -0700 @@ -409,14 +409,14 @@ cifs_write(struct file * file, const cha static int cifs_partialpagewrite(struct page *page,unsigned from, unsigned to) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT; char * write_data; int rc = -EFAULT; int bytes_written = 0; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct cifsInodeInfo *cifsInode; struct cifsFileInfo *open_file = NULL; struct list_head *tmp; @@ -528,7 +528,7 @@ cifs_commit_write(struct file *file, str { int xid; int rc = 0; - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; loff_t position = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; struct cifsFileInfo *open_file; struct cifs_sb_info *cifs_sb; @@ -582,7 +582,7 @@ cifs_sync_page(struct page *page) int rc = 0; cFYI(1,("sync page %p",page)); - mapping = page->mapping; + mapping = page_mapping(page); if (!mapping) return 0; inode = mapping->host; diff -prauN linux-2.5.73/fs/coda/symlink.c wli-2.5.73-29/fs/coda/symlink.c --- linux-2.5.73/fs/coda/symlink.c 2003-06-22 11:32:37.000000000 -0700 +++ wli-2.5.73-29/fs/coda/symlink.c 2003-06-23 10:46:31.000000000 -0700 @@ -24,7 +24,7 @@ static int coda_symlink_filler(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int error; struct coda_inode_info *cii; unsigned int len = PAGE_SIZE; diff -prauN linux-2.5.73/fs/cramfs/inode.c wli-2.5.73-29/fs/cramfs/inode.c --- linux-2.5.73/fs/cramfs/inode.c 2003-06-22 11:32:31.000000000 -0700 +++ wli-2.5.73-29/fs/cramfs/inode.c 2003-06-23 10:46:31.000000000 -0700 @@ -400,7 +400,7 @@ static struct dentry * cramfs_lookup(str static int cramfs_readpage(struct file *file, struct page * page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; u32 maxblock, bytes_filled; void *pgdata; diff -prauN linux-2.5.73/fs/efs/symlink.c wli-2.5.73-29/fs/efs/symlink.c --- linux-2.5.73/fs/efs/symlink.c 2003-06-22 11:32:56.000000000 -0700 +++ wli-2.5.73-29/fs/efs/symlink.c 2003-06-23 10:46:31.000000000 -0700 @@ -16,7 +16,7 @@ static int efs_symlink_readpage(struct f { char *link = kmap(page); struct buffer_head * bh; - struct inode * inode = page->mapping->host; + struct inode * inode = page_mapping(page)->host; efs_block_t size = inode->i_size; int err; diff -prauN linux-2.5.73/fs/exec.c wli-2.5.73-29/fs/exec.c --- 
linux-2.5.73/fs/exec.c 2003-06-22 11:32:41.000000000 -0700 +++ wli-2.5.73-29/fs/exec.c 2003-06-23 10:53:23.000000000 -0700 @@ -44,7 +44,7 @@ #include #include #include -#include +#include #include #include @@ -285,52 +285,49 @@ int copy_strings_kernel(int argc,char ** * This routine is used to map in a page into an address space: needed by * execve() for the initial stack and environment pages. * - * tsk->mmap_sem is held for writing. + * The caller should hold task->mm->mmap_sem for writing. */ -void put_dirty_page(struct task_struct *tsk, struct page *page, - unsigned long address, pgprot_t prot) +void put_dirty_page(task_t *task, struct vm_area_struct *vma, + struct page *page, unsigned long address, pgprot_t prot) { - pgd_t * pgd; - pmd_t * pmd; - pte_t * pte; - struct pte_chain *pte_chain; + struct mm_struct *mm = task->mm; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; if (page_count(page) != 1) printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", page, address); - pgd = pgd_offset(tsk->mm, address); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto out_sig; - spin_lock(&tsk->mm->page_table_lock); - pmd = pmd_alloc(tsk->mm, pgd, address); + pgd = pgd_offset(mm, address); + spin_lock(&mm->page_table_lock); + pmd = pmd_alloc_map(mm, pgd, address); if (!pmd) goto out; - pte = pte_alloc_map(tsk->mm, pmd, address); + pte = pte_alloc_map(mm, &pmd, address); if (!pte) goto out; if (!pte_none(*pte)) { pte_unmap(pte); goto out; } + mm->rss++; lru_cache_add_active(page); flush_dcache_page(page); - set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot)))); - pte_chain = page_add_rmap(page, pte, pte_chain); + vm_set_pte(vma, pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot))), address); + page_add_rmap(page, vma, address, 1); pte_unmap(pte); - tsk->mm->rss++; - spin_unlock(&tsk->mm->page_table_lock); + pmd_unmap(pmd); + spin_unlock(&mm->page_table_lock); /* no need for flush_tlb */ - pte_chain_free(pte_chain); return; out: - spin_unlock(&tsk->mm->page_table_lock); -out_sig: + if (pmd) + pmd_unmap(pmd); + spin_unlock(&mm->page_table_lock); __free_page(page); - force_sig(SIGKILL, tsk); - pte_chain_free(pte_chain); + force_sig(SIGKILL, task); return; } @@ -423,7 +420,7 @@ int setup_arg_pages(struct linux_binprm struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current, page, stack_base, + put_dirty_page(current, mpnt, page, stack_base, mpnt->vm_page_prot); } stack_base += PAGE_SIZE; diff -prauN linux-2.5.73/fs/ext2/dir.c wli-2.5.73-29/fs/ext2/dir.c --- linux-2.5.73/fs/ext2/dir.c 2003-06-22 11:32:55.000000000 -0700 +++ wli-2.5.73-29/fs/ext2/dir.c 2003-06-23 10:46:31.000000000 -0700 @@ -64,10 +64,10 @@ ext2_last_byte(struct inode *inode, unsi static int ext2_commit_chunk(struct page *page, unsigned from, unsigned to) { - struct inode *dir = page->mapping->host; + struct inode *dir = page_mapping(page)->host; int err = 0; dir->i_version++; - page->mapping->a_ops->commit_write(NULL, page, from, to); + page_mapping(page)->a_ops->commit_write(NULL, page, from, to); if (IS_DIRSYNC(dir)) err = write_one_page(page, 1); else @@ -77,7 +77,7 @@ static int ext2_commit_chunk(struct page static void ext2_check_page(struct page *page) { - struct inode *dir = page->mapping->host; + struct inode *dir = page_mapping(page)->host; struct super_block *sb = dir->i_sb; unsigned chunk_size = ext2_chunk_size(dir); char *kaddr = page_address(page); @@ -412,7 +412,7 @@ void ext2_set_link(struct inode *dir, st int err; lock_page(page); - err = 
page->mapping->a_ops->prepare_write(NULL, page, from, to); + err = page_mapping(page)->a_ops->prepare_write(NULL, page, from, to); if (err) BUG(); de->inode = cpu_to_le32(inode->i_ino); @@ -495,7 +495,7 @@ int ext2_add_link (struct dentry *dentry got_it: from = (char*)de - (char*)page_address(page); to = from + rec_len; - err = page->mapping->a_ops->prepare_write(NULL, page, from, to); + err = page_mapping(page)->a_ops->prepare_write(NULL, page, from, to); if (err) goto out_unlock; if (de->inode) { @@ -528,7 +528,7 @@ out_unlock: */ int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page ) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); struct inode *inode = mapping->host; char *kaddr = page_address(page); unsigned from = ((char*)dir - kaddr) & ~(ext2_chunk_size(inode)-1); diff -prauN linux-2.5.73/fs/ext3/inode.c wli-2.5.73-29/fs/ext3/inode.c --- linux-2.5.73/fs/ext3/inode.c 2003-06-22 11:32:58.000000000 -0700 +++ wli-2.5.73-29/fs/ext3/inode.c 2003-06-23 11:03:10.000000000 -0700 @@ -1083,7 +1083,7 @@ static int do_journal_get_write_access(h static int ext3_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int ret, needed_blocks = ext3_writepage_trans_blocks(inode); handle_t *handle; @@ -1138,7 +1138,7 @@ static int ext3_ordered_commit_write(str unsigned from, unsigned to) { handle_t *handle = ext3_journal_current_handle(); - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int ret = 0, ret2; ret = walk_page_buffers(handle, page_buffers(page), @@ -1167,7 +1167,7 @@ static int ext3_writeback_commit_write(s unsigned from, unsigned to) { handle_t *handle = ext3_journal_current_handle(); - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int ret = 0, ret2; loff_t new_i_size; @@ -1185,7 +1185,7 @@ static int ext3_journalled_commit_write( struct page *page, unsigned from, unsigned to) { handle_t *handle = ext3_journal_current_handle(); - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int ret = 0, ret2; int partial = 0; loff_t pos; @@ -1340,7 +1340,7 @@ static int journal_dirty_data_fn(handle_ static int ext3_ordered_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct buffer_head *page_bufs; handle_t *handle = NULL; int ret = 0; @@ -1400,7 +1400,7 @@ static int ext3_ordered_writepage(struct return ret; out_fail: - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); unlock_page(page); return ret; } @@ -1408,7 +1408,7 @@ out_fail: static int ext3_writeback_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; handle_t *handle = NULL; int ret = 0; int err; @@ -1429,7 +1429,7 @@ static int ext3_writeback_writepage(stru return ret; out_fail: - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); unlock_page(page); return ret; } @@ -1437,7 +1437,7 @@ out_fail: static int ext3_journalled_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; handle_t *handle = NULL; int ret = 0; int err; @@ -1485,7 +1485,7 @@ out: return ret; 
no_write: - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); out_unlock: unlock_page(page); goto out; @@ -1505,7 +1505,7 @@ ext3_readpages(struct file *file, struct static int ext3_invalidatepage(struct page *page, unsigned long offset) { - journal_t *journal = EXT3_JOURNAL(page->mapping->host); + journal_t *journal = EXT3_JOURNAL(page_mapping(page)->host); /* * If it's a full truncate we just forget about the pending dirtying @@ -1518,7 +1518,7 @@ static int ext3_invalidatepage(struct pa static int ext3_releasepage(struct page *page, int wait) { - journal_t *journal = EXT3_JOURNAL(page->mapping->host); + journal_t *journal = EXT3_JOURNAL(page_mapping(page)->host); WARN_ON(PageChecked(page)); return journal_try_to_free_buffers(journal, page, wait); @@ -1604,7 +1604,7 @@ out: static int ext3_journalled_set_page_dirty(struct page *page) { SetPageChecked(page); - return __set_page_dirty_nobuffers(page); + return set_page_dirty_nobuffers(page); } static struct address_space_operations ext3_ordered_aops = { diff -prauN linux-2.5.73/fs/fat/inode.c wli-2.5.73-29/fs/fat/inode.c --- linux-2.5.73/fs/fat/inode.c 2003-06-22 11:32:43.000000000 -0700 +++ wli-2.5.73-29/fs/fat/inode.c 2003-06-23 10:46:31.000000000 -0700 @@ -1070,7 +1070,7 @@ fat_prepare_write(struct file *file, str { kmap(page); return cont_prepare_write(page,from,to,fat_get_block, - &MSDOS_I(page->mapping->host)->mmu_private); + &MSDOS_I(page_mapping(page)->host)->mmu_private); } static int diff -prauN linux-2.5.73/fs/freevxfs/vxfs_immed.c wli-2.5.73-29/fs/freevxfs/vxfs_immed.c --- linux-2.5.73/fs/freevxfs/vxfs_immed.c 2003-06-22 11:32:58.000000000 -0700 +++ wli-2.5.73-29/fs/freevxfs/vxfs_immed.c 2003-06-23 10:46:31.000000000 -0700 @@ -122,7 +122,7 @@ vxfs_immed_follow_link(struct dentry *dp static int vxfs_immed_readpage(struct file *fp, struct page *pp) { - struct vxfs_inode_info *vip = VXFS_INO(pp->mapping->host); + struct vxfs_inode_info *vip = VXFS_INO(page_mapping(pp)->host); u_int64_t offset = pp->index << PAGE_CACHE_SHIFT; caddr_t kaddr; diff -prauN linux-2.5.73/fs/fs-writeback.c wli-2.5.73-29/fs/fs-writeback.c --- linux-2.5.73/fs/fs-writeback.c 2003-06-22 11:33:36.000000000 -0700 +++ wli-2.5.73-29/fs/fs-writeback.c 2003-06-23 10:38:47.000000000 -0700 @@ -150,10 +150,10 @@ __sync_single_inode(struct inode *inode, * read speculatively by this cpu before &= ~I_DIRTY -- mikulas */ - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); if (wait || !wbc->for_kupdate || list_empty(&mapping->io_pages)) list_splice_init(&mapping->dirty_pages, &mapping->io_pages); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); spin_unlock(&inode_lock); do_writepages(mapping, wbc); diff -prauN linux-2.5.73/fs/hfs/inode.c wli-2.5.73-29/fs/hfs/inode.c --- linux-2.5.73/fs/hfs/inode.c 2003-06-22 11:33:35.000000000 -0700 +++ wli-2.5.73-29/fs/hfs/inode.c 2003-06-23 10:46:31.000000000 -0700 @@ -240,7 +240,7 @@ static int hfs_readpage(struct file *fil static int hfs_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { return cont_prepare_write(page,from,to,hfs_get_block, - &HFS_I(page->mapping->host)->mmu_private); + &HFS_I(page_mapping(page)->host)->mmu_private); } static sector_t hfs_bmap(struct address_space *mapping, sector_t block) { diff -prauN linux-2.5.73/fs/hpfs/file.c wli-2.5.73-29/fs/hpfs/file.c --- linux-2.5.73/fs/hpfs/file.c 2003-06-22 11:33:33.000000000 -0700 +++ wli-2.5.73-29/fs/hpfs/file.c 2003-06-23 10:46:31.000000000 -0700 @@ -109,7 +109,7 @@ static 
int hpfs_readpage(struct file *fi static int hpfs_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { return cont_prepare_write(page,from,to,hpfs_get_block, - &hpfs_i(page->mapping->host)->mmu_private); + &hpfs_i(page_mapping(page)->host)->mmu_private); } static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block) { diff -prauN linux-2.5.73/fs/hpfs/namei.c wli-2.5.73-29/fs/hpfs/namei.c --- linux-2.5.73/fs/hpfs/namei.c 2003-06-22 11:32:42.000000000 -0700 +++ wli-2.5.73-29/fs/hpfs/namei.c 2003-06-23 10:46:31.000000000 -0700 @@ -446,7 +446,7 @@ int hpfs_rmdir(struct inode *dir, struct int hpfs_symlink_readpage(struct file *file, struct page *page) { char *link = kmap(page); - struct inode *i = page->mapping->host; + struct inode *i = page_mapping(page)->host; struct fnode *fnode; struct buffer_head *bh; int err; diff -prauN linux-2.5.73/fs/hugetlbfs/inode.c wli-2.5.73-29/fs/hugetlbfs/inode.c --- linux-2.5.73/fs/hugetlbfs/inode.c 2003-06-22 11:33:16.000000000 -0700 +++ wli-2.5.73-29/fs/hugetlbfs/inode.c 2003-06-23 10:44:16.000000000 -0700 @@ -296,12 +296,15 @@ hugetlb_vmtruncate_list(struct list_head { struct vm_area_struct *vma; - list_for_each_entry(vma, list, shared) { + list_for_each_entry_rcu(vma, list, shared) { unsigned long h_vm_pgoff; unsigned long v_length; unsigned long h_length; unsigned long v_offset; + if (vma->vm_flags & VM_DEAD) + continue; + h_vm_pgoff = vma->vm_pgoff << (HPAGE_SHIFT - PAGE_SHIFT); v_length = vma->vm_end - vma->vm_start; h_length = v_length >> HPAGE_SHIFT; @@ -346,12 +349,12 @@ static int hugetlb_vmtruncate(struct ino pgoff = offset >> HPAGE_SHIFT; inode->i_size = offset; - down(&mapping->i_shared_sem); + rcu_read_lock(); /* mapping->i_shared_lock */ if (!list_empty(&mapping->i_mmap)) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); if (!list_empty(&mapping->i_mmap_shared)) hugetlb_vmtruncate_list(&mapping->i_mmap_shared, pgoff); - up(&mapping->i_shared_sem); + rcu_read_unlock(); /* mapping->i_shared_lock */ truncate_hugepages(mapping, offset); return 0; } diff -prauN linux-2.5.73/fs/inode.c wli-2.5.73-29/fs/inode.c --- linux-2.5.73/fs/inode.c 2003-06-22 11:33:34.000000000 -0700 +++ wli-2.5.73-29/fs/inode.c 2003-06-23 10:44:16.000000000 -0700 @@ -182,8 +182,8 @@ void inode_init_once(struct inode *inode INIT_LIST_HEAD(&inode->i_devices); sema_init(&inode->i_sem, 1); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); - spin_lock_init(&inode->i_data.page_lock); - init_MUTEX(&inode->i_data.i_shared_sem); + mapping_rwlock_init(&inode->i_data.page_lock); + spin_lock_init(&inode->i_data.i_shared_lock); INIT_LIST_HEAD(&inode->i_data.private_list); spin_lock_init(&inode->i_data.private_lock); INIT_LIST_HEAD(&inode->i_data.i_mmap); diff -prauN linux-2.5.73/fs/isofs/rock.c wli-2.5.73-29/fs/isofs/rock.c --- linux-2.5.73/fs/isofs/rock.c 2003-06-22 11:32:55.000000000 -0700 +++ wli-2.5.73-29/fs/isofs/rock.c 2003-06-23 10:46:31.000000000 -0700 @@ -430,7 +430,7 @@ int parse_rock_ridge_inode(struct iso_di static int rock_ridge_symlink_readpage(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; char *link = kmap(page); unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); unsigned char bufbits = ISOFS_BUFFER_BITS(inode); diff -prauN linux-2.5.73/fs/jbd/commit.c wli-2.5.73-29/fs/jbd/commit.c --- linux-2.5.73/fs/jbd/commit.c 2003-06-22 11:33:15.000000000 -0700 +++ wli-2.5.73-29/fs/jbd/commit.c 2003-06-23 10:51:48.000000000 -0700 @@ -60,7 +60,7 @@ static 
void release_buffer_page(struct b page = bh->b_page; if (!page) goto nope; - if (page->mapping) + if (page_mapping(page)) goto nope; /* OK, it's a truncated page */ diff -prauN linux-2.5.73/fs/jbd/journal.c wli-2.5.73-29/fs/jbd/journal.c --- linux-2.5.73/fs/jbd/journal.c 2003-06-22 11:32:34.000000000 -0700 +++ wli-2.5.73-29/fs/jbd/journal.c 2003-06-23 10:48:24.000000000 -0700 @@ -1673,7 +1673,7 @@ repeat: } else { J_ASSERT_BH(bh, (atomic_read(&bh->b_count) > 0) || - (bh->b_page && bh->b_page->mapping)); + (bh->b_page && page_mapping(bh->b_page))); if (!new_jh) { jbd_unlock_bh_journal_head(bh); diff -prauN linux-2.5.73/fs/jffs/inode-v23.c wli-2.5.73-29/fs/jffs/inode-v23.c --- linux-2.5.73/fs/jffs/inode-v23.c 2003-06-22 11:32:56.000000000 -0700 +++ wli-2.5.73-29/fs/jffs/inode-v23.c 2003-06-23 10:46:31.000000000 -0700 @@ -744,7 +744,7 @@ jffs_do_readpage_nolock(struct file *fil void *buf; unsigned long read_len; int result; - struct inode *inode = (struct inode*)page->mapping->host; + struct inode *inode = (struct inode*)page_mapping(page)->host; struct jffs_file *f = (struct jffs_file *)inode->u.generic_ip; struct jffs_control *c = (struct jffs_control *)inode->i_sb->s_fs_info; int r; diff -prauN linux-2.5.73/fs/jffs2/file.c wli-2.5.73-29/fs/jffs2/file.c --- linux-2.5.73/fs/jffs2/file.c 2003-06-22 11:33:35.000000000 -0700 +++ wli-2.5.73-29/fs/jffs2/file.c 2003-06-23 10:46:31.000000000 -0700 @@ -266,18 +266,18 @@ int jffs2_do_readpage_unlock(struct inod int jffs2_readpage (struct file *filp, struct page *pg) { - struct jffs2_inode_info *f = JFFS2_INODE_INFO(pg->mapping->host); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(page_mapping(pg)->host); int ret; down(&f->sem); - ret = jffs2_do_readpage_unlock(pg->mapping->host, pg); + ret = jffs2_do_readpage_unlock(page_mapping(pg)->host, pg); up(&f->sem); return ret; } int jffs2_prepare_write (struct file *filp, struct page *pg, unsigned start, unsigned end) { - struct inode *inode = pg->mapping->host; + struct inode *inode = page_mapping(pg)->host; struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); uint32_t pageofs = pg->index << PAGE_CACHE_SHIFT; int ret = 0; @@ -362,7 +362,7 @@ int jffs2_commit_write (struct file *fil /* Actually commit the write from the page cache page we're looking at. * For now, we write the full page out each time. 
It sucks, but it's simple */ - struct inode *inode = pg->mapping->host; + struct inode *inode = page_mapping(pg)->host; struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); struct jffs2_raw_inode *ri; diff -prauN linux-2.5.73/fs/libfs.c wli-2.5.73-29/fs/libfs.c --- linux-2.5.73/fs/libfs.c 2003-06-22 11:32:29.000000000 -0700 +++ wli-2.5.73-29/fs/libfs.c 2003-06-23 10:46:31.000000000 -0700 @@ -325,7 +325,7 @@ int simple_prepare_write(struct file *fi int simple_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; if (pos > inode->i_size) diff -prauN linux-2.5.73/fs/minix/dir.c wli-2.5.73-29/fs/minix/dir.c --- linux-2.5.73/fs/minix/dir.c 2003-06-22 11:33:06.000000000 -0700 +++ wli-2.5.73-29/fs/minix/dir.c 2003-06-23 10:46:31.000000000 -0700 @@ -47,9 +47,9 @@ static inline unsigned long dir_pages(st static int dir_commit_chunk(struct page *page, unsigned from, unsigned to) { - struct inode *dir = (struct inode *)page->mapping->host; + struct inode *dir = (struct inode *)page_mapping(page)->host; int err = 0; - page->mapping->a_ops->commit_write(NULL, page, from, to); + page_mapping(page)->a_ops->commit_write(NULL, page, from, to); if (IS_DIRSYNC(dir)) err = write_one_page(page, 1); else @@ -240,7 +240,7 @@ int minix_add_link(struct dentry *dentry got_it: from = (char*)de - (char*)page_address(page); to = from + sbi->s_dirsize; - err = page->mapping->a_ops->prepare_write(NULL, page, from, to); + err = page_mapping(page)->a_ops->prepare_write(NULL, page, from, to); if (err) goto out_unlock; memcpy (de->name, name, namelen); @@ -260,7 +260,7 @@ out_unlock: int minix_delete_entry(struct minix_dir_entry *de, struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); struct inode *inode = (struct inode*)mapping->host; char *kaddr = page_address(page); unsigned from = (char*)de - kaddr; @@ -364,14 +364,14 @@ not_empty: void minix_set_link(struct minix_dir_entry *de, struct page *page, struct inode *inode) { - struct inode *dir = (struct inode*)page->mapping->host; + struct inode *dir = (struct inode*)page_mapping(page)->host; struct minix_sb_info *sbi = minix_sb(dir->i_sb); unsigned from = (char *)de-(char*)page_address(page); unsigned to = from + sbi->s_dirsize; int err; lock_page(page); - err = page->mapping->a_ops->prepare_write(NULL, page, from, to); + err = page_mapping(page)->a_ops->prepare_write(NULL, page, from, to); if (err == 0) { de->inode = inode->i_ino; err = dir_commit_chunk(page, from, to); diff -prauN linux-2.5.73/fs/mpage.c wli-2.5.73-29/fs/mpage.c --- linux-2.5.73/fs/mpage.c 2003-06-22 11:32:37.000000000 -0700 +++ wli-2.5.73-29/fs/mpage.c 2003-06-23 10:46:31.000000000 -0700 @@ -129,7 +129,7 @@ mpage_alloc(struct block_device *bdev, static void map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct buffer_head *page_bh, *head; int block = 0; @@ -209,7 +209,7 @@ static struct bio * do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, sector_t *last_block_in_bio, get_block_t get_block) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; const unsigned blkbits = inode->i_blkbits; const unsigned 
blocks_per_page = PAGE_CACHE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; @@ -388,7 +388,7 @@ static struct bio * mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block, sector_t *last_block_in_bio, int *ret, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; const unsigned blkbits = inode->i_blkbits; unsigned long end_index; const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; @@ -415,7 +415,7 @@ mpage_writepage(struct bio *bio, struct if (!buffer_mapped(bh)) { /* * unmapped dirty buffers are created by - * __set_page_dirty_buffers -> mmapped data + * set_page_dirty_buffers -> mmapped data */ if (buffer_dirty(bh)) goto confused; @@ -561,7 +561,7 @@ alloc_new: confused: if (bio) bio = mpage_bio_submit(WRITE, bio); - *ret = page->mapping->a_ops->writepage(page, wbc); + *ret = page_mapping(page)->a_ops->writepage(page, wbc); out: return bio; } @@ -627,7 +627,7 @@ mpage_writepages(struct address_space *m writepage = mapping->a_ops->writepage; pagevec_init(&pvec, 0); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); while (!list_empty(&mapping->io_pages) && !done) { struct page *page = list_entry(mapping->io_pages.prev, struct page, list); @@ -647,12 +647,12 @@ mpage_writepages(struct address_space *m list_add(&page->list, &mapping->locked_pages); page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); /* * At this point we hold neither mapping->page_lock nor * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even + * invalidated (changing page_mapping(page) to NULL), or even * swizzled back from swapper_space to tmpfs file mapping. 
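In the writeback paths here, and in the fs/buffer.c and fs/fs-writeback.c hunks above, plain spin_lock()/spin_unlock() on mapping->page_lock becomes mapping_wrlock()/mapping_wrunlock(), and fs/inode.c initializes the lock with mapping_rwlock_init(). The wrapper definitions live outside this section; a minimal sketch, assuming page_lock is simply converted from spinlock_t to rwlock_t so that lookup-side code can take it shared, is:

/*
 * Illustrative sketch only, not taken from the patch: wrappers of the
 * shape assumed by the mapping_wrlock()/mapping_wrunlock() callers in
 * these hunks, with mapping->page_lock assumed converted to rwlock_t.
 * Read-mostly paths would take the _rdlock variants.
 */
#include <linux/spinlock.h>

#define mapping_rwlock_init(lock)	rwlock_init(lock)
#define mapping_rdlock(lock)		read_lock(lock)
#define mapping_rdunlock(lock)		read_unlock(lock)
#define mapping_wrlock(lock)		write_lock(lock)
#define mapping_wrunlock(lock)		write_unlock(lock)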
*/ @@ -661,7 +661,7 @@ mpage_writepages(struct address_space *m if (wbc->sync_mode != WB_SYNC_NONE) wait_on_page_writeback(page); - if (page->mapping == mapping && !PageWriteback(page) && + if (page_mapping(page) == mapping && !PageWriteback(page) && test_clear_page_dirty(page)) { if (writepage) { ret = (*writepage)(page, wbc); @@ -679,12 +679,12 @@ mpage_writepages(struct address_space *m unlock_page(page); } page_cache_release(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); } /* * Leave any remaining dirty pages on ->io_pages */ - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); if (bio) mpage_bio_submit(WRITE, bio); return ret; diff -prauN linux-2.5.73/fs/namei.c wli-2.5.73-29/fs/namei.c --- linux-2.5.73/fs/namei.c 2003-06-22 11:32:41.000000000 -0700 +++ wli-2.5.73-29/fs/namei.c 2003-06-23 10:38:47.000000000 -0700 @@ -434,19 +434,17 @@ int follow_up(struct vfsmount **mnt, str return 1; } +/* no need for dcache_lock, as serialization is taken care in + * namespace.c + */ static int follow_mount(struct vfsmount **mnt, struct dentry **dentry) { int res = 0; while (d_mountpoint(*dentry)) { - struct vfsmount *mounted; - spin_lock(&dcache_lock); - mounted = lookup_mnt(*mnt, *dentry); - if (!mounted) { - spin_unlock(&dcache_lock); + struct vfsmount *mounted = lookup_mnt(*mnt, *dentry); + if (!mounted) break; - } - *mnt = mntget(mounted); - spin_unlock(&dcache_lock); + *mnt = mounted; dput(*dentry); mntput(mounted->mnt_parent); *dentry = dget(mounted->mnt_root); @@ -455,21 +453,21 @@ static int follow_mount(struct vfsmount return res; } +/* no need for dcache_lock, as serialization is taken care in + * namespace.c + */ static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry) { struct vfsmount *mounted; - - spin_lock(&dcache_lock); + mounted = lookup_mnt(*mnt, *dentry); if (mounted) { - *mnt = mntget(mounted); - spin_unlock(&dcache_lock); + *mnt = mounted; dput(*dentry); mntput(mounted->mnt_parent); *dentry = dget(mounted->mnt_root); return 1; } - spin_unlock(&dcache_lock); return 0; } diff -prauN linux-2.5.73/fs/namespace.c wli-2.5.73-29/fs/namespace.c --- linux-2.5.73/fs/namespace.c 2003-06-22 11:32:57.000000000 -0700 +++ wli-2.5.73-29/fs/namespace.c 2003-06-23 10:38:47.000000000 -0700 @@ -26,6 +26,8 @@ extern int __init init_rootfs(void); extern int __init sysfs_init(void); +/* spinlock for vfsmount related operation, inplace of dcache_lock */ +spinlock_t vfsmount_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; static struct list_head *mount_hashtable; static int hash_mask, hash_bits; static kmem_cache_t *mnt_cache; @@ -66,30 +68,43 @@ void free_vfsmnt(struct vfsmount *mnt) kmem_cache_free(mnt_cache, mnt); } +/* + * Now, lookup_mnt increments the ref count before returning + * the vfsmount struct. + * + * lookup_mnt can be done without taking any lock, as now we + * do synchronize_kernel() while removing vfsmount struct + * from mnt_hash list. rcu_read_(un)lock is required for + * pre-emptive kernels. 
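lookup_mnt() can run without dcache_lock because removal from the mount hash follows the usual RCU discipline, visible in attach_mnt()/detach_mnt() below. A minimal sketch of that update side (the example_* helpers are illustrative only, not from the patch):

/*
 * Illustrative sketch: publish and retract mount-hash entries with the
 * RCU list primitives under vfsmount_lock, and wait out pre-existing
 * lockless readers with synchronize_kernel() before an unhashed entry
 * may be freed or reused.
 */
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/mount.h>

extern spinlock_t vfsmount_lock;

static void example_hash_mnt(struct vfsmount *mnt, struct list_head *chain)
{
	spin_lock(&vfsmount_lock);
	list_add_rcu(&mnt->mnt_hash, chain);
	spin_unlock(&vfsmount_lock);
}

static void example_unhash_mnt(struct vfsmount *mnt)
{
	spin_lock(&vfsmount_lock);
	list_del_rcu(&mnt->mnt_hash);
	spin_unlock(&vfsmount_lock);
	synchronize_kernel();	/* wait for lookup_mnt() readers in flight */
}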
+ */ struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) { struct list_head * head = mount_hashtable + hash(mnt, dentry); struct list_head * tmp = head; - struct vfsmount *p; + struct vfsmount *p, *found = NULL; + rcu_read_lock(); for (;;) { tmp = tmp->next; p = NULL; if (tmp == head) break; p = list_entry(tmp, struct vfsmount, mnt_hash); - if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) + if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) { + found = mntget(p); break; + } } - return p; + rcu_read_unlock(); + return found; } static int check_mnt(struct vfsmount *mnt) { - spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); while (mnt->mnt_parent != mnt) mnt = mnt->mnt_parent; - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); return mnt == current->namespace->root; } @@ -97,10 +112,19 @@ static void detach_mnt(struct vfsmount * { old_nd->dentry = mnt->mnt_mountpoint; old_nd->mnt = mnt->mnt_parent; + + /* remove from the hash_list, before other things */ + list_del_rcu(&mnt->mnt_hash); + spin_unlock(&vfsmount_lock); + + /* There could be existing users doing lookup_mnt, let + * them finish their work. + */ + synchronize_kernel(); + spin_lock(&vfsmount_lock); mnt->mnt_parent = mnt; mnt->mnt_mountpoint = mnt->mnt_root; list_del_init(&mnt->mnt_child); - list_del_init(&mnt->mnt_hash); old_nd->dentry->d_mounted--; } @@ -108,7 +132,7 @@ static void attach_mnt(struct vfsmount * { mnt->mnt_parent = mntget(nd->mnt); mnt->mnt_mountpoint = dget(nd->dentry); - list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry)); + list_add_rcu(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry)); list_add_tail(&mnt->mnt_child, &nd->mnt->mnt_mounts); nd->dentry->d_mounted++; } @@ -263,15 +287,15 @@ void umount_tree(struct vfsmount *mnt) mnt = list_entry(kill.next, struct vfsmount, mnt_list); list_del_init(&mnt->mnt_list); if (mnt->mnt_parent == mnt) { - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); } else { struct nameidata old_nd; detach_mnt(mnt, &old_nd); - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); path_release(&old_nd); } mntput(mnt); - spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); } } @@ -324,17 +348,17 @@ static int do_umount(struct vfsmount *mn } down_write(¤t->namespace->sem); - spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); if (atomic_read(&sb->s_active) == 1) { /* last instance - try to be smart */ - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); lock_kernel(); DQUOT_OFF(sb); acct_auto_close(sb); unlock_kernel(); security_sb_umount_close(mnt); - spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); } retval = -EBUSY; if (atomic_read(&mnt->mnt_count) == 2 || flags & MNT_DETACH) { @@ -342,7 +366,7 @@ static int do_umount(struct vfsmount *mn umount_tree(mnt); retval = 0; } - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); if (retval) security_sb_umount_busy(mnt); up_write(¤t->namespace->sem); @@ -449,18 +473,18 @@ static struct vfsmount *copy_tree(struct q = clone_mnt(p, p->mnt_root); if (!q) goto Enomem; - spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); list_add_tail(&q->mnt_list, &res->mnt_list); attach_mnt(q, &nd); - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); } } return res; Enomem: if (res) { - spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); umount_tree(res); - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); } return NULL; } @@ -485,7 +509,7 @@ static int graft_tree(struct vfsmount *m goto out_unlock; err = -ENOENT; - spin_lock(&dcache_lock); + 
spin_lock(&vfsmount_lock); if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry)) { struct list_head head; @@ -495,7 +519,7 @@ static int graft_tree(struct vfsmount *m mntget(mnt); err = 0; } - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); out_unlock: up(&nd->dentry->d_inode->i_sem); if (!err) @@ -532,9 +556,9 @@ static int do_loopback(struct nameidata if (mnt) { err = graft_tree(mnt, nd); if (err) { - spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); umount_tree(mnt); - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); } else mntput(mnt); } @@ -599,7 +623,7 @@ static int do_move_mount(struct nameidat if (IS_DEADDIR(nd->dentry->d_inode)) goto out1; - spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); if (!IS_ROOT(nd->dentry) && d_unhashed(nd->dentry)) goto out2; @@ -623,7 +647,7 @@ static int do_move_mount(struct nameidat detach_mnt(old_nd.mnt, &parent_nd); attach_mnt(old_nd.mnt, nd); out2: - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); out1: up(&nd->dentry->d_inode->i_sem); out: @@ -804,9 +828,9 @@ int copy_namespace(int flags, struct tas down_write(&tsk->namespace->sem); /* First pass: copy the tree topology */ new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root); - spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); list_add_tail(&new_ns->list, &new_ns->root->mnt_list); - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); /* Second pass: switch the tsk->fs->* elements */ if (fs) { @@ -1027,7 +1051,7 @@ asmlinkage long sys_pivot_root(const cha if (new_nd.mnt->mnt_root != new_nd.dentry) goto out2; /* not a mountpoint */ tmp = old_nd.mnt; /* make sure we can reach put_old from new_root */ - spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); if (tmp != new_nd.mnt) { for (;;) { if (tmp->mnt_parent == tmp) @@ -1044,7 +1068,7 @@ asmlinkage long sys_pivot_root(const cha detach_mnt(user_nd.mnt, &root_parent); attach_mnt(user_nd.mnt, &old_nd); attach_mnt(new_nd.mnt, &root_parent); - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); chroot_fs_refs(&user_nd, &new_nd); security_sb_post_pivotroot(&user_nd, &new_nd); error = 0; @@ -1061,7 +1085,7 @@ out0: unlock_kernel(); return error; out3: - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); goto out2; } diff -prauN linux-2.5.73/fs/ncpfs/symlink.c wli-2.5.73-29/fs/ncpfs/symlink.c --- linux-2.5.73/fs/ncpfs/symlink.c 2003-06-22 11:33:32.000000000 -0700 +++ wli-2.5.73-29/fs/ncpfs/symlink.c 2003-06-23 10:46:31.000000000 -0700 @@ -43,7 +43,7 @@ static int ncp_symlink_readpage(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int error, length, len; char *link, *rawlink; char *buf = kmap(page); diff -prauN linux-2.5.73/fs/nfs/file.c wli-2.5.73-29/fs/nfs/file.c --- linux-2.5.73/fs/nfs/file.c 2003-06-22 11:32:58.000000000 -0700 +++ wli-2.5.73-29/fs/nfs/file.c 2003-06-23 10:46:31.000000000 -0700 @@ -213,7 +213,7 @@ static int nfs_commit_write(struct file struct address_space_operations nfs_file_aops = { .readpage = nfs_readpage, .readpages = nfs_readpages, - .set_page_dirty = __set_page_dirty_nobuffers, + .set_page_dirty = set_page_dirty_nobuffers, .writepage = nfs_writepage, .writepages = nfs_writepages, .prepare_write = nfs_prepare_write, diff -prauN linux-2.5.73/fs/nfs/read.c wli-2.5.73-29/fs/nfs/read.c --- linux-2.5.73/fs/nfs/read.c 2003-06-22 11:32:27.000000000 -0700 +++ wli-2.5.73-29/fs/nfs/read.c 2003-06-23 10:46:31.000000000 -0700 @@ -300,7 +300,7 @@ nfs_readpage_result(struct rpc_task 
*tas int nfs_readpage(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int error; dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", @@ -341,14 +341,14 @@ static int readpage_sync_filler(void *data, struct page *page) { struct nfs_readdesc *desc = (struct nfs_readdesc *)data; - return nfs_readpage_sync(desc->filp, page->mapping->host, page); + return nfs_readpage_sync(desc->filp, page_mapping(page)->host, page); } static int readpage_async_filler(void *data, struct page *page) { struct nfs_readdesc *desc = (struct nfs_readdesc *)data; - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct nfs_page *new; nfs_wb_page(inode, page); diff -prauN linux-2.5.73/fs/nfs/write.c wli-2.5.73-29/fs/nfs/write.c --- linux-2.5.73/fs/nfs/write.c 2003-06-22 11:33:03.000000000 -0700 +++ wli-2.5.73-29/fs/nfs/write.c 2003-06-23 10:46:31.000000000 -0700 @@ -224,7 +224,7 @@ nfs_writepage_async(struct file *file, s int nfs_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; unsigned long end_index; unsigned offset = PAGE_CACHE_SIZE; int err; @@ -628,7 +628,7 @@ nfs_strategy(struct inode *inode) int nfs_flush_incompatible(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct rpc_cred *cred = nfs_file_cred(file); struct nfs_page *req; int status = 0; @@ -659,7 +659,7 @@ int nfs_updatepage(struct file *file, struct page *page, unsigned int offset, unsigned int count) { struct dentry *dentry = file->f_dentry; - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct nfs_page *req; loff_t end; int status = 0; diff -prauN linux-2.5.73/fs/ntfs/aops.c wli-2.5.73-29/fs/ntfs/aops.c --- linux-2.5.73/fs/ntfs/aops.c 2003-06-22 11:33:07.000000000 -0700 +++ wli-2.5.73-29/fs/ntfs/aops.c 2003-06-23 10:46:31.000000000 -0700 @@ -55,7 +55,7 @@ static void ntfs_end_buffer_async_read(s int page_uptodate = 1; page = bh->b_page; - ni = NTFS_I(page->mapping->host); + ni = NTFS_I(page_mapping(page)->host); if (likely(uptodate)) { s64 file_ofs; @@ -176,7 +176,7 @@ static int ntfs_read_block(struct page * int i, nr; unsigned char blocksize_bits; - ni = NTFS_I(page->mapping->host); + ni = NTFS_I(page_mapping(page)->host); vol = ni->vol; blocksize_bits = VFS_I(ni)->i_blkbits; @@ -359,7 +359,7 @@ int ntfs_readpage(struct file *file, str return 0; } - ni = NTFS_I(page->mapping->host); + ni = NTFS_I(page_mapping(page)->host); if (NInoNonResident(ni)) { /* @@ -473,7 +473,7 @@ static int ntfs_write_block(struct page BOOL need_end_writeback; unsigned char blocksize_bits; - vi = page->mapping->host; + vi = page_mapping(page)->host; ni = NTFS_I(vi); vol = ni->vol; @@ -500,9 +500,9 @@ static int ntfs_write_block(struct page * buffer's dirty state as-is. */ // FIXME: Once Andrew's -EAGAIN patch goes in, remove the - // __set_page_dirty_nobuffers(page) and return -EAGAIN instead + // set_page_dirty_nobuffers(page) and return -EAGAIN instead // of zero. - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); unlock_page(page); return 0; } @@ -519,12 +519,12 @@ static int ntfs_write_block(struct page iblock = ni->initialized_size >> blocksize_bits; /* - * Be very careful. We have no exclusion from __set_page_dirty_buffers + * Be very careful. 
We have no exclusion from set_page_dirty_buffers * here, and the (potentially unmapped) buffers may become dirty at * any time. If a buffer becomes dirty here after we've inspected it * then we just miss that fact, and the page stays dirty. * - * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; + * Buffers outside i_size may be dirtied by set_page_dirty_buffers; * handle that here by just cleaning them. */ @@ -579,7 +579,7 @@ static int ntfs_write_block(struct page // Update initialized size in the attribute and // in the inode. // Again, for each page do: - // __set_page_dirty_buffers(); + // set_page_dirty_buffers(); // page_cache_release() // We don't need to wait on the writes. // Update iblock. @@ -734,9 +734,9 @@ lock_retry_remap: * leave its buffer's dirty state as-is. */ // FIXME: Once Andrew's -EAGAIN patch goes in, remove - // the __set_page_dirty_nobuffers(page) and set err to + // the set_page_dirty_nobuffers(page) and set err to // -EAGAIN instead of zero. - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); err = 0; } else SetPageError(page); @@ -805,7 +805,7 @@ static int ntfs_writepage(struct page *p BUG_ON(!PageLocked(page)); - vi = page->mapping->host; + vi = page_mapping(page)->host; /* Is the page fully outside i_size? (truncate in progress) */ if (unlikely(page->index >= (vi->i_size + PAGE_CACHE_SIZE - 1) >> @@ -987,9 +987,9 @@ err_out: * buffer's dirty state as-is. */ // FIXME: Once Andrew's -EAGAIN patch goes in, remove the - // __set_page_dirty_nobuffers(page) and set err to -EAGAIN + // set_page_dirty_nobuffers(page) and set err to -EAGAIN // instead of zero. - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); err = 0; } else { ntfs_error(vi->i_sb, "Resident attribute write failed with " @@ -1024,7 +1024,7 @@ static int ntfs_prepare_nonresident_writ BOOL is_retry; unsigned char blocksize_bits; - vi = page->mapping->host; + vi = page_mapping(page)->host; ni = NTFS_I(vi); vol = ni->vol; @@ -1125,7 +1125,7 @@ static int ntfs_prepare_nonresident_writ // Update initialized size in the attribute and // in the inode. // Again, for each page do: - // __set_page_dirty_buffers(); + // set_page_dirty_buffers(); // page_cache_release() // We don't need to wait on the writes. // Update iblock. @@ -1361,7 +1361,7 @@ err_out: * ntfs_prepare_write - prepare a page for receiving data * * This is called from generic_file_write() with i_sem held on the inode - * (@page->mapping->host). The @page is locked and kmap()ped so page_address() + * (@page_mapping(page)->host). The @page is locked and kmap()ped so page_address() * can simply be used. The source data has not yet been copied into the @page. 
* * Need to extend the attribute/fill in holes if necessary, create blocks and @@ -1382,7 +1382,7 @@ err_out: static int ntfs_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *vi = page->mapping->host; + struct inode *vi = page_mapping(page)->host; ntfs_inode *ni = NTFS_I(vi); ntfs_debug("Entering for inode %li, attribute type 0x%x, page index " @@ -1491,7 +1491,7 @@ static int ntfs_commit_nonresident_write unsigned int block_start, block_end, blocksize; BOOL partial; - vi = page->mapping->host; + vi = page_mapping(page)->host; ntfs_debug("Entering for inode %li, attribute type 0x%x, page index " "0x%lx, from = %u, to = %u.", vi->i_ino, @@ -1547,7 +1547,7 @@ static int ntfs_commit_nonresident_write * ntfs_commit_write - commit the received data * * This is called from generic_file_write() with i_sem held on the inode - * (@page->mapping->host). The @page is locked and kmap()ped so page_address() + * (@page_mapping(page)->host). The @page is locked and kmap()ped so page_address() * can simply be used. The source data has already been copied into the @page. * * Need to mark modified blocks dirty so they get written out later when @@ -1585,7 +1585,7 @@ static int ntfs_commit_write(struct file u32 attr_len, bytes; int err; - vi = page->mapping->host; + vi = page_mapping(page)->host; ni = NTFS_I(vi); ntfs_debug("Entering for inode %li, attribute type 0x%x, page index " @@ -1758,7 +1758,7 @@ err_out: * Put the page on mapping->dirty_pages, but leave its * buffer's dirty state as-is. */ - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); err = 0; } else ntfs_error(vi->i_sb, "Page is not uptodate. Written " diff -prauN linux-2.5.73/fs/ntfs/compress.c wli-2.5.73-29/fs/ntfs/compress.c --- linux-2.5.73/fs/ntfs/compress.c 2003-06-22 11:32:46.000000000 -0700 +++ wli-2.5.73-29/fs/ntfs/compress.c 2003-06-23 10:46:31.000000000 -0700 @@ -209,7 +209,7 @@ return_error: /* Second stage: finalize completed pages. 
*/ if (nr_completed_pages > 0) { struct page *page = dest_pages[completed_pages[0]]; - ntfs_inode *ni = NTFS_I(page->mapping->host); + ntfs_inode *ni = NTFS_I(page_mapping(page)->host); for (i = 0; i < nr_completed_pages; i++) { int di = completed_pages[i]; @@ -467,7 +467,7 @@ return_overflow: */ int ntfs_read_compressed_block(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); ntfs_inode *ni = NTFS_I(mapping->host); ntfs_volume *vol = ni->vol; struct super_block *sb = vol->sb; diff -prauN linux-2.5.73/fs/proc/array.c wli-2.5.73-29/fs/proc/array.c --- linux-2.5.73/fs/proc/array.c 2003-06-22 11:32:56.000000000 -0700 +++ wli-2.5.73-29/fs/proc/array.c 2003-06-23 10:38:47.000000000 -0700 @@ -283,7 +283,7 @@ int proc_pid_status(struct task_struct * return buffer - orig; } -extern unsigned long task_vsize(struct mm_struct *); +unsigned long task_vsize(struct mm_struct *); int proc_pid_stat(struct task_struct *task, char * buffer) { unsigned long vsize, eip, esp, wchan; @@ -307,11 +307,9 @@ int proc_pid_stat(struct task_struct *ta } task_unlock(task); if (mm) { - down_read(&mm->mmap_sem); vsize = task_vsize(mm); eip = KSTK_EIP(task); esp = KSTK_ESP(task); - up_read(&mm->mmap_sem); } wchan = get_wchan(task); @@ -388,20 +386,23 @@ int proc_pid_stat(struct task_struct *ta return res; } -extern int task_statm(struct mm_struct *, int *, int *, int *, int *); +int task_statm(struct mm_struct *, int *, int *, int *, int *, int *, int *); int proc_pid_statm(struct task_struct *task, char *buffer) { - int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0; + int size, resident, shared, text, lib, data, dirty; struct mm_struct *mm = get_task_mm(task); - if (mm) { + if (!mm) + size = resident = shared = text = lib = data = dirty = 0; + else { down_read(&mm->mmap_sem); - size = task_statm(mm, &shared, &text, &data, &resident); + size = task_statm(mm, &shared, &text, &lib, &data, + &resident, &dirty); up_read(&mm->mmap_sem); mmput(mm); } return sprintf(buffer,"%d %d %d %d %d %d %d\n", - size, resident, shared, text, lib, data, 0); + size, resident, shared, text, lib, data, dirty); } diff -prauN linux-2.5.73/fs/proc/base.c wli-2.5.73-29/fs/proc/base.c --- linux-2.5.73/fs/proc/base.c 2003-06-22 11:32:57.000000000 -0700 +++ wli-2.5.73-29/fs/proc/base.c 2003-06-23 10:44:16.000000000 -0700 @@ -298,7 +298,7 @@ static int proc_check_root(struct inode { struct dentry *de, *base, *root; struct vfsmount *our_vfsmnt, *vfsmnt, *mnt; - int res = 0; + int subdir, res = 0; if (proc_root_link(inode, &root, &vfsmnt)) /* Ewww... 
*/ return -ENOENT; @@ -307,20 +307,23 @@ static int proc_check_root(struct inode base = dget(current->fs->root); read_unlock(¤t->fs->lock); - spin_lock(&dcache_lock); + rcu_read_lock(); /* vfsmount_lock */ de = root; mnt = vfsmnt; while (vfsmnt != our_vfsmnt) { if (vfsmnt == vfsmnt->mnt_parent) - goto out; + goto out_unlock; de = vfsmnt->mnt_mountpoint; vfsmnt = vfsmnt->mnt_parent; } + /* rcu_read_unlock(); vfsmount_lock */ - if (!is_subdir(de, base)) + /* rcu_read_lock(); dcache_lock */ + subdir = is_subdir(de, base); + rcu_read_unlock(); /* dcache_lock */ + if (!subdir) goto out; - spin_unlock(&dcache_lock); exit: dput(base); @@ -328,8 +331,9 @@ exit: dput(root); mntput(mnt); return res; +out_unlock: + rcu_read_unlock(); /* vfsmount_lock */ out: - spin_unlock(&dcache_lock); res = -EACCES; goto exit; } @@ -637,8 +641,6 @@ static int proc_pid_readlink(struct dent struct dentry *de; struct vfsmount *mnt = NULL; - lock_kernel(); - if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE)) goto out; error = proc_check_root(inode); @@ -653,7 +655,6 @@ static int proc_pid_readlink(struct dent dput(de); mntput(mnt); out: - unlock_kernel(); return error; } @@ -1387,62 +1388,37 @@ out: } #define PROC_NUMBUF 10 -#define PROC_MAXPIDS 20 - -/* - * Get a few pid's to return for filldir - we need to hold the - * tasklist lock while doing this, and we must release it before - * we actually do the filldir itself, so we use a temp buffer.. - */ -static int get_pid_list(int index, unsigned int *pids) -{ - struct task_struct *p; - int nr_pids = 0; - - index--; - read_lock(&tasklist_lock); - for_each_process(p) { - int pid = p->pid; - if (!pid_alive(p)) - continue; - if (--index >= 0) - continue; - pids[nr_pids] = pid; - nr_pids++; - if (nr_pids >= PROC_MAXPIDS) - break; - } - read_unlock(&tasklist_lock); - return nr_pids; -} int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int pid_array[PROC_MAXPIDS]; char buf[PROC_NUMBUF]; unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; - unsigned int nr_pids, i; + int pid; if (!nr) { ino_t ino = fake_ino(0,PROC_PID_INO); if (filldir(dirent, "self", 4, filp->f_pos, ino, DT_LNK) < 0) return 0; filp->f_pos++; - nr++; + nr = 1; } + pid = nr - 1; + for (;;) { + unsigned long i, j; + ino_t ino; - nr_pids = get_pid_list(nr, pid_array); - - for (i = 0; i < nr_pids; i++) { - int pid = pid_array[i]; - ino_t ino = fake_ino(pid,PROC_PID_INO); - unsigned long j = PROC_NUMBUF; + pid = find_next_pid(pid); + if (pid < 0) + break; - do buf[--j] = '0' + (pid % 10); while (pid/=10); + i = pid; + j = PROC_NUMBUF; + do buf[--j] = '0' + (i % 10); while (i/=10); + ino = fake_ino(pid, PROC_PID_INO); if (filldir(dirent, buf+j, PROC_NUMBUF-j, filp->f_pos, ino, DT_DIR) < 0) break; - filp->f_pos++; + filp->f_pos = pid + 1 + FIRST_PROCESS_ENTRY; } return 0; } diff -prauN linux-2.5.73/fs/proc/proc_misc.c wli-2.5.73-29/fs/proc/proc_misc.c --- linux-2.5.73/fs/proc/proc_misc.c 2003-06-22 11:32:33.000000000 -0700 +++ wli-2.5.73-29/fs/proc/proc_misc.c 2003-06-23 10:46:31.000000000 -0700 @@ -200,6 +200,7 @@ static int meminfo_read_proc(char *page, "SwapFree: %8lu kB\n" "Dirty: %8lu kB\n" "Writeback: %8lu kB\n" + "Deferred: %8lu kB\n" "Mapped: %8lu kB\n" "Slab: %8lu kB\n" "Committed_AS: %8u kB\n" @@ -210,8 +211,8 @@ static int meminfo_read_proc(char *page, K(i.totalram), K(i.freeram), K(i.bufferram), - K(get_page_cache_size()-total_swapcache_pages-i.bufferram), - K(total_swapcache_pages), + K(get_page_cache_size() - i.bufferram - ps.nr_swapcache), + 
K(ps.nr_swapcache), K(active), K(inactive), K(i.totalhigh), @@ -222,6 +223,7 @@ static int meminfo_read_proc(char *page, K(i.freeswap), K(ps.nr_dirty), K(ps.nr_writeback), + K(nr_deferred_pages()), K(ps.nr_mapped), K(ps.nr_slab), K(committed), @@ -497,11 +499,10 @@ static int ds1286_read_proc(char *page, static int locks_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - int len; - lock_kernel(); - len = get_locks_status(page, start, off, count); - unlock_kernel(); - if (len < count) *eof = 1; + int len = get_locks_status(page, start, off, count); + + if (len < count) + *eof = 1; return len; } diff -prauN linux-2.5.73/fs/proc/root.c wli-2.5.73-29/fs/proc/root.c --- linux-2.5.73/fs/proc/root.c 2003-06-22 11:33:07.000000000 -0700 +++ wli-2.5.73-29/fs/proc/root.c 2003-06-23 10:32:25.000000000 -0700 @@ -81,11 +81,13 @@ void __init proc_root_init(void) static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry) { - if (dir->i_ino == PROC_ROOT_INO) { /* check for safety... */ - lock_kernel(); + /* + * nr_threads is actually protected by the tasklist_lock; + * however, it's conventional to do reads, especially for + * reporting, without any locking whatsoever. + */ + if (dir->i_ino == PROC_ROOT_INO) /* check for safety... */ dir->i_nlink = proc_root.nlink + nr_threads; - unlock_kernel(); - } if (!proc_lookup(dir, dentry)) { return NULL; diff -prauN linux-2.5.73/fs/proc/task_mmu.c wli-2.5.73-29/fs/proc/task_mmu.c --- linux-2.5.73/fs/proc/task_mmu.c 2003-06-22 11:32:33.000000000 -0700 +++ wli-2.5.73-29/fs/proc/task_mmu.c 2003-06-23 10:38:47.000000000 -0700 @@ -5,27 +5,6 @@ char *task_mem(struct mm_struct *mm, char *buffer) { - unsigned long data = 0, stack = 0, exec = 0, lib = 0; - struct vm_area_struct *vma; - - down_read(&mm->mmap_sem); - for (vma = mm->mmap; vma; vma = vma->vm_next) { - unsigned long len = (vma->vm_end - vma->vm_start) >> 10; - if (!vma->vm_file) { - data += len; - if (vma->vm_flags & VM_GROWSDOWN) - stack += len; - continue; - } - if (vma->vm_flags & VM_WRITE) - continue; - if (vma->vm_flags & VM_EXEC) { - exec += len; - if (vma->vm_flags & VM_EXECUTABLE) - continue; - lib += len; - } - } buffer += sprintf(buffer, "VmSize:\t%8lu kB\n" "VmLck:\t%8lu kB\n" @@ -37,9 +16,10 @@ char *task_mem(struct mm_struct *mm, cha mm->total_vm << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), mm->rss << (PAGE_SHIFT-10), - data - stack, stack, - exec - lib, lib); - up_read(&mm->mmap_sem); + (mm->data - mm->stack) << (PAGE_SHIFT-10), + mm->stack << (PAGE_SHIFT-10), + mm->text << (PAGE_SHIFT-10), + mm->lib << (PAGE_SHIFT-10)); return buffer; } @@ -49,30 +29,15 @@ unsigned long task_vsize(struct mm_struc } int task_statm(struct mm_struct *mm, int *shared, int *text, - int *data, int *resident) + int *lib, int *data, int *resident, int *dirty) { - struct vm_area_struct *vma; - int size = 0; - + *shared = mm->shared; + *text = mm->text; + *lib = mm->lib; + *data = mm->data; + *dirty = mm->dirty; *resident = mm->rss; - for (vma = mm->mmap; vma; vma = vma->vm_next) { - int pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; - - size += pages; - if (is_vm_hugetlb_page(vma)) { - if (!(vma->vm_flags & VM_DONTCOPY)) - *shared += pages; - continue; - } - if (vma->vm_flags & VM_SHARED || !list_empty(&vma->shared)) - *shared += pages; - if (vma->vm_flags & VM_EXECUTABLE) - *text += pages; - else - *data += pages; - } - - return size; + return mm->total_vm; } static int show_map(struct seq_file *m, void *v) diff -prauN 
linux-2.5.73/fs/proc/task_nommu.c wli-2.5.73-29/fs/proc/task_nommu.c --- linux-2.5.73/fs/proc/task_nommu.c 2003-06-22 11:32:56.000000000 -0700 +++ wli-2.5.73-29/fs/proc/task_nommu.c 2003-06-23 10:38:47.000000000 -0700 @@ -67,16 +67,17 @@ unsigned long task_vsize(struct mm_struc struct mm_tblock_struct *tbp; unsigned long vsize = 0; + down_read(&mm->mmap_sem); for (tbp = &mm->context.tblock; tbp; tbp = tbp->next) { if (tbp->rblock) vsize += kobjsize(tbp->rblock->kblock); } - + up_read(&mm->mmap_sem); return vsize; } int task_statm(struct mm_struct *mm, int *shared, int *text, - int *data, int *resident) + int *lib, int *data, int *resident, int *dirty) { struct mm_tblock_struct *tbp; int size = kobjsize(mm); @@ -92,7 +93,7 @@ int task_statm(struct mm_struct *mm, int size += (*text = mm->end_code - mm->start_code); size += (*data = mm->start_stack - mm->start_data); - + *shared = *lib = *dirty = 0; *resident = size; return size; } diff -prauN linux-2.5.73/fs/qnx4/inode.c wli-2.5.73-29/fs/qnx4/inode.c --- linux-2.5.73/fs/qnx4/inode.c 2003-06-22 11:33:15.000000000 -0700 +++ wli-2.5.73-29/fs/qnx4/inode.c 2003-06-23 10:46:31.000000000 -0700 @@ -434,7 +434,7 @@ static int qnx4_readpage(struct file *fi static int qnx4_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { - struct qnx4_inode_info *qnx4_inode = qnx4_i(page->mapping->host); + struct qnx4_inode_info *qnx4_inode = qnx4_i(page_mapping(page)->host); return cont_prepare_write(page, from, to, qnx4_get_block, &qnx4_inode->mmu_private); } diff -prauN linux-2.5.73/fs/reiserfs/inode.c wli-2.5.73-29/fs/reiserfs/inode.c --- linux-2.5.73/fs/reiserfs/inode.c 2003-06-22 11:32:39.000000000 -0700 +++ wli-2.5.73-29/fs/reiserfs/inode.c 2003-06-23 10:46:31.000000000 -0700 @@ -1998,7 +1998,7 @@ static void lock_buffer_for_writepage(st lock_buffer(bh); } else { if (test_set_buffer_locked(bh)) { - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); return; } } @@ -2017,7 +2017,7 @@ static void lock_buffer_for_writepage(st * code to handle reiserfs tails. 
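An aside, not part of the patch: task_statm() above gains *lib and *dirty out-parameters and now just reports per-mm counters instead of walking the VMA list, returning mm->total_vm. The matching update to the /proc consumer is not in these hunks; a hypothetical caller adapted to the new signature might look like this (names, locking, and field order are assumptions based on the traditional statm layout size/resident/shared/text/lib/data/dirty):

/* Hypothetical statm formatter, illustrative only. */
static int proc_pid_statm_sketch(struct mm_struct *mm, char *buffer)
{
	int size, resident = 0, shared = 0, text = 0, lib = 0, data = 0, dirty = 0;

	down_read(&mm->mmap_sem);	/* assumed: counters are stable under mmap_sem */
	size = task_statm(mm, &shared, &text, &lib, &data, &resident, &dirty);
	up_read(&mm->mmap_sem);

	return sprintf(buffer, "%d %d %d %d %d %d %d\n",
		       size, resident, shared, text, lib, data, dirty);
}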
*/ static int reiserfs_write_full_page(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host ; + struct inode *inode = page_mapping(page)->host ; unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT ; int error = 0; unsigned long block ; @@ -2170,7 +2170,7 @@ static int reiserfs_readpage (struct fil static int reiserfs_writepage (struct page * page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host ; + struct inode *inode = page_mapping(page)->host ; reiserfs_wait_on_write_block(inode->i_sb) ; return reiserfs_write_full_page(page, wbc) ; } @@ -2178,7 +2178,7 @@ static int reiserfs_writepage (struct pa int reiserfs_prepare_write(struct file *f, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host ; + struct inode *inode = page_mapping(page)->host ; reiserfs_wait_on_write_block(inode->i_sb) ; fix_tail_page_for_writing(page) ; return block_prepare_write(page, from, to, reiserfs_get_block) ; @@ -2191,7 +2191,7 @@ static sector_t reiserfs_aop_bmap(struct static int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host ; + struct inode *inode = page_mapping(page)->host ; loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; int ret ; @@ -2282,7 +2282,7 @@ void i_attrs_to_sd_attrs( struct inode * */ static int reiserfs_releasepage(struct page *page, int unused_gfp_flags) { - struct inode *inode = page->mapping->host ; + struct inode *inode = page_mapping(page)->host ; struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ; struct buffer_head *head ; struct buffer_head *bh ; diff -prauN linux-2.5.73/fs/romfs/inode.c wli-2.5.73-29/fs/romfs/inode.c --- linux-2.5.73/fs/romfs/inode.c 2003-06-22 11:32:58.000000000 -0700 +++ wli-2.5.73-29/fs/romfs/inode.c 2003-06-23 10:46:31.000000000 -0700 @@ -414,7 +414,7 @@ out: unlock_kernel(); static int romfs_readpage(struct file *file, struct page * page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; unsigned long offset, avail, readlen; void *buf; int result = -EIO; diff -prauN linux-2.5.73/fs/smbfs/file.c wli-2.5.73-29/fs/smbfs/file.c --- linux-2.5.73/fs/smbfs/file.c 2003-06-22 11:33:16.000000000 -0700 +++ wli-2.5.73-29/fs/smbfs/file.c 2003-06-23 10:46:31.000000000 -0700 @@ -172,7 +172,7 @@ smb_writepage_sync(struct inode *inode, static int smb_writepage(struct page *page, struct writeback_control *wbc) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); struct inode *inode; unsigned long end_index; unsigned offset = PAGE_CACHE_SIZE; diff -prauN linux-2.5.73/fs/sysv/dir.c wli-2.5.73-29/fs/sysv/dir.c --- linux-2.5.73/fs/sysv/dir.c 2003-06-22 11:32:38.000000000 -0700 +++ wli-2.5.73-29/fs/sysv/dir.c 2003-06-23 10:46:31.000000000 -0700 @@ -39,10 +39,10 @@ static inline unsigned long dir_pages(st static int dir_commit_chunk(struct page *page, unsigned from, unsigned to) { - struct inode *dir = (struct inode *)page->mapping->host; + struct inode *dir = (struct inode *)page_mapping(page)->host; int err = 0; - page->mapping->a_ops->commit_write(NULL, page, from, to); + page_mapping(page)->a_ops->commit_write(NULL, page, from, to); if (IS_DIRSYNC(dir)) err = write_one_page(page, 1); else @@ -225,7 +225,7 @@ got_it: from = (char*)de - (char*)page_address(page); to = from + SYSV_DIRSIZE; lock_page(page); - err = page->mapping->a_ops->prepare_write(NULL, page, from, to); + 
err = page_mapping(page)->a_ops->prepare_write(NULL, page, from, to); if (err) goto out_unlock; memcpy (de->name, name, namelen); @@ -245,7 +245,7 @@ out_unlock: int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); struct inode *inode = (struct inode*)mapping->host; char *kaddr = (char*)page_address(page); unsigned from = (char*)de - kaddr; @@ -347,13 +347,13 @@ not_empty: void sysv_set_link(struct sysv_dir_entry *de, struct page *page, struct inode *inode) { - struct inode *dir = (struct inode*)page->mapping->host; + struct inode *dir = (struct inode*)page_mapping(page)->host; unsigned from = (char *)de-(char*)page_address(page); unsigned to = from + SYSV_DIRSIZE; int err; lock_page(page); - err = page->mapping->a_ops->prepare_write(NULL, page, from, to); + err = page_mapping(page)->a_ops->prepare_write(NULL, page, from, to); if (err) BUG(); de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); diff -prauN linux-2.5.73/fs/udf/file.c wli-2.5.73-29/fs/udf/file.c --- linux-2.5.73/fs/udf/file.c 2003-06-22 11:32:56.000000000 -0700 +++ wli-2.5.73-29/fs/udf/file.c 2003-06-23 10:46:31.000000000 -0700 @@ -46,7 +46,7 @@ static int udf_adinicb_readpage(struct file *file, struct page * page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; char *kaddr; if (!PageLocked(page)) @@ -64,7 +64,7 @@ static int udf_adinicb_readpage(struct f static int udf_adinicb_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; char *kaddr; if (!PageLocked(page)) @@ -87,7 +87,7 @@ static int udf_adinicb_prepare_write(str static int udf_adinicb_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; char *kaddr = page_address(page); memcpy(UDF_I_DATA(inode) + UDF_I_LENEATTR(inode) + offset, diff -prauN linux-2.5.73/fs/udf/symlink.c wli-2.5.73-29/fs/udf/symlink.c --- linux-2.5.73/fs/udf/symlink.c 2003-06-22 11:32:37.000000000 -0700 +++ wli-2.5.73-29/fs/udf/symlink.c 2003-06-23 10:46:31.000000000 -0700 @@ -80,7 +80,7 @@ static void udf_pc_to_char(struct super_ static int udf_symlink_filler(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct buffer_head *bh = NULL; char *symlink; int err = -EIO; diff -prauN linux-2.5.73/fs/xfs/linux/xfs_aops.c wli-2.5.73-29/fs/xfs/linux/xfs_aops.c --- linux-2.5.73/fs/xfs/linux/xfs_aops.c 2003-06-22 11:33:16.000000000 -0700 +++ wli-2.5.73-29/fs/xfs/linux/xfs_aops.c 2003-06-23 10:46:31.000000000 -0700 @@ -210,7 +210,7 @@ probe_unwritten_page( if (PageWriteback(page)) goto out; - if (page->mapping && page_has_buffers(page)) { + if (page_mapping(page) && page_has_buffers(page)) { struct buffer_head *bh, *head; unsigned long p_offset = 0; @@ -257,7 +257,7 @@ probe_unmapped_page( if (PageWriteback(page)) goto out; - if (page->mapping && PageDirty(page)) { + if (page_mapping(page) && PageDirty(page)) { if (page_has_buffers(page)) { struct buffer_head *bh, *head; @@ -337,7 +337,7 @@ probe_delalloc_page( if (PageWriteback(page)) goto out; - if (page->mapping && page_has_buffers(page)) { + if (page_mapping(page) && page_has_buffers(page)) { struct buffer_head *bh, *head; int acceptable = 0; @@ -621,7 +621,7 @@ 
page_state_convert( int startio, int unmapped) /* also implies page uptodate */ { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct buffer_head *bh_arr[MAX_BUF_PER_PAGE], *bh, *head; page_buf_bmap_t *mp, map; unsigned long p_offset = 0, end_index; @@ -1032,7 +1032,7 @@ linvfs_writepage( int error; int need_trans; int delalloc, unmapped, unwritten; - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; /* * We need a transaction if: diff -prauN linux-2.5.73/include/asm-alpha/pgtable.h wli-2.5.73-29/include/asm-alpha/pgtable.h --- linux-2.5.73/include/asm-alpha/pgtable.h 2003-06-22 11:32:38.000000000 -0700 +++ wli-2.5.73-29/include/asm-alpha/pgtable.h 2003-06-23 10:31:02.000000000 -0700 @@ -229,9 +229,11 @@ pmd_page_kernel(pmd_t pmd) #define pmd_page(pmd) (mem_map + ((pmd_val(pmd) & _PFN_MASK) >> 32)) #endif -extern inline unsigned long pgd_page(pgd_t pgd) +extern inline unsigned long __pgd_page(pgd_t pgd) { return PAGE_OFFSET + ((pgd_val(pgd) & _PFN_MASK) >> (32-PAGE_SHIFT)); } +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) + extern inline int pte_none(pte_t pte) { return !pte_val(pte); } extern inline int pte_present(pte_t pte) { return pte_val(pte) & _PAGE_VALID; } extern inline void pte_clear(pte_t *ptep) { pte_val(*ptep) = 0; } @@ -280,7 +282,7 @@ extern inline pte_t pte_mkyoung(pte_t pt /* Find an entry in the second-level page table.. */ extern inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) { - return (pmd_t *) pgd_page(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PAGE - 1)); + return (pmd_t *)__pgd_page(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PAGE - 1)); } /* Find an entry in the third-level page table.. */ diff -prauN linux-2.5.73/include/asm-arm/pgtable.h wli-2.5.73-29/include/asm-arm/pgtable.h --- linux-2.5.73/include/asm-arm/pgtable.h 2003-06-22 11:32:38.000000000 -0700 +++ wli-2.5.73-29/include/asm-arm/pgtable.h 2003-06-23 10:31:02.000000000 -0700 @@ -125,6 +125,11 @@ extern struct page *empty_zero_page; /* Find an entry in the second-level page table.. */ #define pmd_offset(dir, addr) ((pmd_t *)(dir)) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) /* Find an entry in the third-level page table.. */ #define __pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) diff -prauN linux-2.5.73/include/asm-arm/proc-armv/cache.h wli-2.5.73-29/include/asm-arm/proc-armv/cache.h --- linux-2.5.73/include/asm-arm/proc-armv/cache.h 2003-06-22 11:32:43.000000000 -0700 +++ wli-2.5.73-29/include/asm-arm/proc-armv/cache.h 2003-06-23 10:46:31.000000000 -0700 @@ -246,8 +246,8 @@ flush_cache_page(struct vm_area_struct * * flush_dcache_page is used when the kernel has written to the page * cache page at virtual address page->virtual. * - * If this page isn't mapped (ie, page->mapping = NULL), or it has - * userspace mappings (page->mapping->i_mmap or page->mapping->i_mmap_shared) + * If this page isn't mapped (ie, page_mapping(page) = NULL), or it has + * userspace mappings (page_mapping(page)->i_mmap or page_mapping(page)->i_mmap_shared) * then we _must_ always clean + invalidate the dcache entries associated * with the kernel mapping.
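The asm-alpha and asm-arm hunks above start a pattern repeated for most architectures below: the old pgd_page(), which returned a kernel virtual address, is renamed __pgd_page() and kept for pmd_offset(), while pgd_page() is redefined to return the struct page of the pmd-table page, matching pmd_page() and pte_page(). In shorthand (this just restates the change, it is not an extra hunk):

/* Recurring conversion, restated once for clarity:
 *   __pgd_page(pgd) -- kernel virtual address of the pmd table (old semantics)
 *   pgd_page(pgd)   -- struct page of that pmd table
 */
#define pgd_page(pgd)	virt_to_page(__pgd_page(pgd))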
* @@ -262,7 +262,7 @@ extern void __flush_dcache_page(struct p static inline void flush_dcache_page(struct page *page) { - if (page->mapping && !mapping_mapped(page->mapping)) + if (page_mapping(page) && !mapping_mapped(page_mapping(page))) set_bit(PG_dcache_dirty, &page->flags); else __flush_dcache_page(page); diff -prauN linux-2.5.73/include/asm-arm26/pgtable.h wli-2.5.73-29/include/asm-arm26/pgtable.h --- linux-2.5.73/include/asm-arm26/pgtable.h 2003-06-22 11:32:32.000000000 -0700 +++ wli-2.5.73-29/include/asm-arm26/pgtable.h 2003-06-23 10:31:02.000000000 -0700 @@ -189,6 +189,12 @@ extern struct page *empty_zero_page; #define pte_unmap(pte) do { } while (0) #define pte_unmap_nested(pte) do { } while (0) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + #define _PAGE_PRESENT 0x01 #define _PAGE_READONLY 0x02 diff -prauN linux-2.5.73/include/asm-arm26/rmap.h wli-2.5.73-29/include/asm-arm26/rmap.h --- linux-2.5.73/include/asm-arm26/rmap.h 2003-06-22 11:32:42.000000000 -0700 +++ wli-2.5.73-29/include/asm-arm26/rmap.h 2003-06-23 10:46:31.000000000 -0700 @@ -14,14 +14,14 @@ static inline void pgtable_add_rmap(struct page *page, struct mm_struct * mm, unsigned long address) { - page->mapping = (void *)mm; + set_page_mapping(page, mm); page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); inc_page_state(nr_page_table_pages); } static inline void pgtable_remove_rmap(struct page *page) { - page->mapping = NULL; + set_page_mapping(page, NULL); page->index = 0; dec_page_state(nr_page_table_pages); } @@ -29,7 +29,7 @@ static inline void pgtable_remove_rmap(s static inline struct mm_struct * ptep_to_mm(pte_t * ptep) { struct page * page = virt_to_page(ptep); - return (struct mm_struct *)page->mapping; + return (struct mm_struct *)page_mapping(page); } /* The page table takes half of the page */ diff -prauN linux-2.5.73/include/asm-generic/rmap.h wli-2.5.73-29/include/asm-generic/rmap.h --- linux-2.5.73/include/asm-generic/rmap.h 2003-06-22 11:32:56.000000000 -0700 +++ wli-2.5.73-29/include/asm-generic/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,90 +0,0 @@ -#ifndef _GENERIC_RMAP_H -#define _GENERIC_RMAP_H -/* - * linux/include/asm-generic/rmap.h - * - * Architecture dependent parts of the reverse mapping code, - * this version should work for most architectures with a - * 'normal' page table layout. - * - * We use the struct page of the page table page to find out - * the process and full address of a page table entry: - * - page->mapping points to the process' mm_struct - * - page->index has the high bits of the address - * - the lower bits of the address are calculated from the - * offset of the page table entry within the page table page - * - * For CONFIG_HIGHPTE, we need to represent the address of a pte in a - * scalar pte_addr_t. The pfn of the pte's page is shifted left by PAGE_SIZE - * bits and is then ORed with the byte offset of the pte within its page. - * - * For CONFIG_HIGHMEM4G, the pte_addr_t is 32 bits. 20 for the pfn, 12 for - * the offset. - * - * For CONFIG_HIGHMEM64G, the pte_addr_t is 64 bits. 52 for the pfn, 12 for - * the offset.
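For reference, the pfn-plus-offset packing described by the comment removed above survives in the HIGHPTE pte_addr_t typedefs kept in asm-i386/pgtable.h later in this patch. The two helpers below simply restate the deleted ptep_to_paddr()/rmap_ptep_map() arithmetic; the names are made up and they are not part of the patch:

/* CONFIG_HIGHPTE packing: upper bits are the pfn of the pte's page,
 * the low PAGE_SHIFT bits are the byte offset within that page. */
static inline pte_addr_t pack_pte_addr(pte_t *ptep)
{
	return ((pte_addr_t)page_to_pfn(kmap_atomic_to_page(ptep)) << PAGE_SHIFT)
		| ((unsigned long)ptep & ~PAGE_MASK);
}

/* ...and mapping it back to a usable pointer (KM_PTE2, as the deleted
 * asm-i386/rmap.h did): */
static inline pte_t *unpack_pte_addr(pte_addr_t paddr)
{
	return (pte_t *)((char *)kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT), KM_PTE2)
			 + ((unsigned long)paddr & ~PAGE_MASK));
}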
- */ -#include - -static inline void pgtable_add_rmap(struct page * page, struct mm_struct * mm, unsigned long address) -{ -#ifdef BROKEN_PPC_PTE_ALLOC_ONE - /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... ;( */ - extern int mem_init_done; - - if (!mem_init_done) - return; -#endif - page->mapping = (void *)mm; - page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); - inc_page_state(nr_page_table_pages); -} - -static inline void pgtable_remove_rmap(struct page * page) -{ - page->mapping = NULL; - page->index = 0; - dec_page_state(nr_page_table_pages); -} - -static inline struct mm_struct * ptep_to_mm(pte_t * ptep) -{ - struct page * page = kmap_atomic_to_page(ptep); - return (struct mm_struct *) page->mapping; -} - -static inline unsigned long ptep_to_address(pte_t * ptep) -{ - struct page * page = kmap_atomic_to_page(ptep); - unsigned long low_bits; - low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE; - return page->index + low_bits; -} - -#ifdef CONFIG_HIGHPTE -static inline pte_addr_t ptep_to_paddr(pte_t *ptep) -{ - pte_addr_t paddr; - paddr = ((pte_addr_t)page_to_pfn(kmap_atomic_to_page(ptep))) << PAGE_SHIFT; - return paddr + (pte_addr_t)((unsigned long)ptep & ~PAGE_MASK); -} -#else -static inline pte_addr_t ptep_to_paddr(pte_t *ptep) -{ - return (pte_addr_t)ptep; -} -#endif - -#ifndef CONFIG_HIGHPTE -static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) -{ - return (pte_t *)pte_paddr; -} - -static inline void rmap_ptep_unmap(pte_t *pte) -{ - return; -} -#endif - -#endif /* _GENERIC_RMAP_H */ diff -prauN linux-2.5.73/include/asm-h8300/pgtable.h wli-2.5.73-29/include/asm-h8300/pgtable.h --- linux-2.5.73/include/asm-h8300/pgtable.h 2003-06-22 11:32:42.000000000 -0700 +++ wli-2.5.73-29/include/asm-h8300/pgtable.h 2003-06-23 10:31:02.000000000 -0700 @@ -15,6 +15,11 @@ typedef pte_t *pte_addr_t; #define pgd_clear(pgdp) #define kern_addr_valid(addr) (1) #define pmd_offset(a, b) ((void *)0) +#define pmd_offset_kernel(a,b) pmd_offset(a,b) +#define pmd_offset_map(a,b) pmd_offset(a,b) +#define pmd_offset_map_nested(a,b) pmd_offset(a,b) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) #define PAGE_NONE __pgprot(0) /* these mean nothing to NO_MM */ #define PAGE_SHARED __pgprot(0) /* these mean nothing to NO_MM */ diff -prauN linux-2.5.73/include/asm-i386/atomic.h wli-2.5.73-29/include/asm-i386/atomic.h --- linux-2.5.73/include/asm-i386/atomic.h 2003-06-22 11:32:38.000000000 -0700 +++ wli-2.5.73-29/include/asm-i386/atomic.h 2003-06-23 10:55:31.000000000 -0700 @@ -201,4 +201,14 @@ __asm__ __volatile__(LOCK "orl %0,%1" \ #define smp_mb__before_atomic_inc() barrier() #define smp_mb__after_atomic_inc() barrier() +#if !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) +#define atomic_dec_and_lock(ctr, lock) \ +({ \ + int __adal__ = atomic_dec_and_test(ctr); \ + if (__adal__) \ + preempt_disable(); \ + __adal__; \ +}) +#endif + #endif diff -prauN linux-2.5.73/include/asm-i386/highmem.h wli-2.5.73-29/include/asm-i386/highmem.h --- linux-2.5.73/include/asm-i386/highmem.h 2003-06-22 11:32:32.000000000 -0700 +++ wli-2.5.73-29/include/asm-i386/highmem.h 2003-06-23 10:38:47.000000000 -0700 @@ -52,11 +52,55 @@ extern void kmap_init(void); extern void * FASTCALL(kmap_high(struct page *page)); extern void FASTCALL(kunmap_high(struct page *page)); -void *kmap(struct page *page); -void kunmap(struct page *page); -void *kmap_atomic(struct page *page, enum km_type type); -void kunmap_atomic(void *kvaddr, enum km_type type); -struct page 
*kmap_atomic_to_page(void *ptr); +void *__kmap_atomic(struct page *page, enum km_type type); +struct page *__kmap_atomic_to_page(void *ptr); + +#ifdef CONFIG_DEBUG_HIGHMEM +void __kunmap_atomic(void *kvaddr, enum km_type type); +#else +static inline void __kunmap_atomic(void *kvaddr, enum km_type type) +{ +} +#endif + +static inline void *kmap(struct page *page) +{ + might_sleep(); + if (page < highmem_start_page) + return lowmem_page_address(page); + else + return kmap_high(page); +} + +static inline void kunmap(struct page *page) +{ + BUG_ON(in_interrupt()); + if (page >= highmem_start_page) + kunmap_high(page); +} + +static inline void *kmap_atomic(struct page *page, enum km_type type) +{ + inc_preempt_count(); + if (page < highmem_start_page) + return lowmem_page_address(page); + else + return __kmap_atomic(page, type); +} + +static inline void kunmap_atomic(void *vaddr, enum km_type type) +{ + __kunmap_atomic(vaddr, type); + dec_preempt_count(); +} + +static inline struct page *kmap_atomic_to_page(void *vaddr) +{ + if ((unsigned long)vaddr < FIXADDR_START) + return virt_to_page(vaddr); + else + return __kmap_atomic_to_page(vaddr); +} #endif /* __KERNEL__ */ diff -prauN linux-2.5.73/include/asm-i386/kmap_types.h wli-2.5.73-29/include/asm-i386/kmap_types.h --- linux-2.5.73/include/asm-i386/kmap_types.h 2003-06-22 11:33:01.000000000 -0700 +++ wli-2.5.73-29/include/asm-i386/kmap_types.h 2003-06-23 10:31:02.000000000 -0700 @@ -17,14 +17,16 @@ D(3) KM_USER0, D(4) KM_USER1, D(5) KM_BIO_SRC_IRQ, D(6) KM_BIO_DST_IRQ, -D(7) KM_PTE0, -D(8) KM_PTE1, -D(9) KM_PTE2, -D(10) KM_IRQ0, -D(11) KM_IRQ1, -D(12) KM_SOFTIRQ0, -D(13) KM_SOFTIRQ1, -D(14) KM_TYPE_NR +D(7) KM_PMD0, +D(8) KM_PMD1, +D(9) KM_PTE0, +D(10) KM_PTE1, +D(11) KM_PTE2, +D(12) KM_IRQ0, +D(13) KM_IRQ1, +D(14) KM_SOFTIRQ0, +D(15) KM_SOFTIRQ1, +D(16) KM_TYPE_NR }; #undef D diff -prauN linux-2.5.73/include/asm-i386/linkage.h wli-2.5.73-29/include/asm-i386/linkage.h --- linux-2.5.73/include/asm-i386/linkage.h 2003-06-22 11:33:36.000000000 -0700 +++ wli-2.5.73-29/include/asm-i386/linkage.h 2003-06-23 10:42:31.000000000 -0700 @@ -3,6 +3,7 @@ #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) #define FASTCALL(x) x __attribute__((regparm(3))) +#define IRQHANDLER(x) x __attribute__((regparm(1))) #ifdef CONFIG_X86_ALIGNMENT_16 #define __ALIGN .align 16,0x90 diff -prauN linux-2.5.73/include/asm-i386/mach-numaq/mach_apic.h wli-2.5.73-29/include/asm-i386/mach-numaq/mach_apic.h --- linux-2.5.73/include/asm-i386/mach-numaq/mach_apic.h 2003-06-22 11:32:42.000000000 -0700 +++ wli-2.5.73-29/include/asm-i386/mach-numaq/mach_apic.h 2003-06-23 16:24:39.000000000 -0700 @@ -31,6 +31,7 @@ static inline void init_apic_ldr(void) static inline void clustered_apic_check(void) { + nr_ioapics = min(2, nr_ioapics); printk("Enabling APIC mode: %s. 
Using %d I/O APICs\n", "NUMA-Q", nr_ioapics); } diff -prauN linux-2.5.73/include/asm-i386/page.h wli-2.5.73-29/include/asm-i386/page.h --- linux-2.5.73/include/asm-i386/page.h 2003-06-22 11:32:34.000000000 -0700 +++ wli-2.5.73-29/include/asm-i386/page.h 2003-06-23 10:42:09.000000000 -0700 @@ -3,7 +3,11 @@ /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 -#define PAGE_SIZE (1UL << PAGE_SHIFT) +#ifndef __ASSEMBLY__ +#define PAGE_SIZE (1UL << PAGE_SHIFT) +#else +#define PAGE_SIZE (1 << PAGE_SHIFT) +#endif #define PAGE_MASK (~(PAGE_SIZE-1)) #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) diff -prauN linux-2.5.73/include/asm-i386/pgalloc.h wli-2.5.73-29/include/asm-i386/pgalloc.h --- linux-2.5.73/include/asm-i386/pgalloc.h 2003-06-22 11:32:31.000000000 -0700 +++ wli-2.5.73-29/include/asm-i386/pgalloc.h 2003-06-23 10:33:02.000000000 -0700 @@ -31,14 +31,6 @@ static inline void pte_free_kernel(pte_t free_page((unsigned long)pte); } -static inline void pte_free(struct page *pte) -{ - __free_page(pte); -} - - -#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) - /* * allocating and freeing a pmd is trivial: the 1-entry pmd is * inside the pgd, so has no extra memory associated with it. @@ -46,10 +38,29 @@ static inline void pte_free(struct page */ #define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one_kernel(mm, addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(x) do { } while (0) #define __pmd_free_tlb(tlb,x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() #define check_pgt_cache() do { } while (0) +#include + +static inline void pte_free(struct page *page) +{ + struct mmu_gather *tlb = &per_cpu(mmu_gathers, get_cpu()); + tlb_remove_page(tlb, page); + put_cpu(); +} + +static inline void pte_free_tlb(struct mmu_gather *tlb, struct page *page) +{ + tlb_remove_page(tlb, page); +} + +static inline void pmd_free_tlb(struct mmu_gather *tlb, struct page *page) +{ +} + #endif /* _I386_PGALLOC_H */ diff -prauN linux-2.5.73/include/asm-i386/pgtable-2level.h wli-2.5.73-29/include/asm-i386/pgtable-2level.h --- linux-2.5.73/include/asm-i386/pgtable-2level.h 2003-06-22 11:32:55.000000000 -0700 +++ wli-2.5.73-29/include/asm-i386/pgtable-2level.h 2003-06-23 10:31:02.000000000 -0700 @@ -48,13 +48,15 @@ static inline int pgd_present(pgd_t pgd) #define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval) #define set_pgd(pgdptr, pgdval) (*(pgdptr) = pgdval) -#define pgd_page(pgd) \ -((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) + +#define pmd_offset_map(pgd, addr) ({ (pmd_t *)(pgd); }) +#define pmd_offset_map_nested(pgd, addr) pmd_offset_map(pgd, addr) +#define pmd_offset_kernel(pgd, addr) pmd_offset_map(pgd, addr) + +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) -static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) -{ - return (pmd_t *) dir; -} #define ptep_get_and_clear(xp) __pte(xchg(&(xp)->pte_low, 0)) #define pte_same(a, b) ((a).pte_low == (b).pte_low) #define pte_page(x) pfn_to_page(pte_pfn(x)) diff -prauN linux-2.5.73/include/asm-i386/pgtable-3level.h wli-2.5.73-29/include/asm-i386/pgtable-3level.h --- linux-2.5.73/include/asm-i386/pgtable-3level.h 2003-06-22 11:33:08.000000000 -0700 +++ wli-2.5.73-29/include/asm-i386/pgtable-3level.h 2003-06-23 10:33:55.000000000 -0700 @@ -64,12 +64,25 @@ static inline void set_pte(pte_t *ptep, */ static inline void pgd_clear (pgd_t * pgd) { } -#define pgd_page(pgd) \ -((unsigned long) __va(pgd_val(pgd) 
& PAGE_MASK)) +#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) + +static inline unsigned long pgd_pfn(pgd_t pgd) +{ + return pgd_val(pgd) >> PAGE_SHIFT; +} + +#define pmd_offset_kernel(pgd, addr) \ + ((pmd_t *)__va(pgd_val(*(pgd)) & PAGE_MASK) + pmd_index(addr)) /* Find an entry in the second-level page table.. */ -#define pmd_offset(dir, address) ((pmd_t *) pgd_page(*(dir)) + \ - pmd_index(address)) +#define __pmd_offset(pgd, addr, type) \ + ((pmd_t *)kmap_atomic(pgd_page(*(pgd)), type) + pmd_index(addr)) + +#define pmd_offset_map(pgd, addr) __pmd_offset(pgd, addr, KM_PMD0) +#define pmd_offset_map_nested(pgd, addr) __pmd_offset(pgd, addr, KM_PMD1) + +#define pmd_unmap(pmd) kunmap_atomic(pmd, KM_PMD0); +#define pmd_unmap_nested(pmd) kunmap_atomic(pmd, KM_PMD1); static inline pte_t ptep_get_and_clear(pte_t *ptep) { @@ -123,6 +136,4 @@ static inline pmd_t pfn_pmd(unsigned lon #define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) }) #define PTE_FILE_MAX_BITS 32 -extern struct kmem_cache_s *pae_pgd_cachep; - #endif /* _I386_PGTABLE_3LEVEL_H */ diff -prauN linux-2.5.73/include/asm-i386/pgtable.h wli-2.5.73-29/include/asm-i386/pgtable.h --- linux-2.5.73/include/asm-i386/pgtable.h 2003-06-22 11:33:04.000000000 -0700 +++ wli-2.5.73-29/include/asm-i386/pgtable.h 2003-06-23 10:33:55.000000000 -0700 @@ -16,6 +16,9 @@ #include #include #include +#include +#include +#include #ifndef _I386_BITOPS_H #include @@ -31,33 +34,26 @@ extern void paging_init(void); extern unsigned long empty_zero_page[1024]; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) -#endif /* !__ASSEMBLY__ */ +extern kmem_cache_t *pgd_cache; +extern struct list_head pgd_list; +extern spinlock_t pgd_lock; +void pgtable_cache_init(void); +void pgd_ctor(void *, kmem_cache_t *, unsigned long); +void pgd_dtor(void *, kmem_cache_t *, unsigned long); /* * The Linux x86 paging architecture is 'compile-time dual-mode', it * implements both the traditional 2-level x86 page tables and the * newer 3-level PAE-mode page tables. 
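A minimal sketch of how a pagetable walker pairs the pmd_offset_map()/pmd_unmap() interfaces introduced above with the existing pte_offset_map()/pte_unmap() ones; illustrative only (the helper name is invented and locking such as page_table_lock is omitted):

/* Illustrative walk; error handling and page_table_lock omitted. */
static pte_t peek_pte(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd = pgd_offset(mm, addr);
	pmd_t *pmd;
	pte_t *pte, entry = __pte(0);

	if (pgd_none(*pgd))
		return entry;
	pmd = pmd_offset_map(pgd, addr);	/* KM_PMD0 kmap_atomic slot under PAE+HIGHPMD */
	if (!pmd_none(*pmd)) {
		pte = pte_offset_map(pmd, addr);	/* KM_PTE0 */
		entry = *pte;
		pte_unmap(pte);
	}
	pmd_unmap(pmd);
	return entry;
}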
*/ -#ifndef __ASSEMBLY__ #ifdef CONFIG_X86_PAE # include - -/* - * Need to initialise the X86 PAE caches - */ -extern void pgtable_cache_init(void); - #else # include - -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - -#endif #endif +#endif /* !__ASSEMBLY__ */ + #define PMD_SIZE (1UL << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) #define PGDIR_SIZE (1UL << PGDIR_SHIFT) @@ -294,32 +290,25 @@ static inline pte_t pte_modify(pte_t pte #define pte_offset_kernel(dir, address) \ ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address)) -#if defined(CONFIG_HIGHPTE) -#define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) + pte_index(address)) -#define pte_offset_map_nested(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE1) + pte_index(address)) -#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) -#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) -#else -#define pte_offset_map(dir, address) \ - ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address)) -#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) -#define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) -#endif +#define __pte_offset(pmd, addr, type) \ + ((pte_t *)kmap_atomic(pmd_page(*pmd), type) + pte_index(addr)) -#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM4G) -typedef u32 pte_addr_t; -#endif +#define pte_offset_map(pmd, addr) __pte_offset(pmd, addr, KM_PTE0) +#define pte_offset_map_nested(pmd, addr) __pte_offset(pmd, addr, KM_PTE1) +#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) +#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) + +#ifdef CONFIG_HIGHPTE -#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM64G) +#ifdef CONFIG_HIGHMEM64G typedef u64 pte_addr_t; -#endif +#else /* CONFIG_HIGHMEM4G */ +typedef u32 pte_addr_t; +#endif /* CONFIG_HIGHMEM4G */ -#if !defined(CONFIG_HIGHPTE) +#else /* !CONFIG_HIGHPTE */ typedef pte_t *pte_addr_t; -#endif +#endif /* !CONFIG_HIGHPTE */ /* * The i386 doesn't have any external MMU info: the kernel page diff -prauN linux-2.5.73/include/asm-i386/rmap.h wli-2.5.73-29/include/asm-i386/rmap.h --- linux-2.5.73/include/asm-i386/rmap.h 2003-06-22 11:32:57.000000000 -0700 +++ wli-2.5.73-29/include/asm-i386/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,21 +0,0 @@ -#ifndef _I386_RMAP_H -#define _I386_RMAP_H - -/* nothing to see, move along */ -#include - -#ifdef CONFIG_HIGHPTE -static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) -{ - unsigned long pfn = (unsigned long)(pte_paddr >> PAGE_SHIFT); - unsigned long off = ((unsigned long)pte_paddr) & ~PAGE_MASK; - return (pte_t *)((char *)kmap_atomic(pfn_to_page(pfn), KM_PTE2) + off); -} - -static inline void rmap_ptep_unmap(pte_t *pte) -{ - kunmap_atomic(pte, KM_PTE2); -} -#endif - -#endif diff -prauN linux-2.5.73/include/asm-i386/rwlock.h wli-2.5.73-29/include/asm-i386/rwlock.h --- linux-2.5.73/include/asm-i386/rwlock.h 2003-06-22 11:33:32.000000000 -0700 +++ wli-2.5.73-29/include/asm-i386/rwlock.h 2003-06-23 10:44:16.000000000 -0700 @@ -20,28 +20,52 @@ #define RW_LOCK_BIAS 0x01000000 #define RW_LOCK_BIAS_STR "0x01000000" -#define __build_read_lock_ptr(rw, helper) \ - asm volatile(LOCK "subl $1,(%0)\n\t" \ - "js 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tcall " helper "\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - ::"a" (rw) : "memory") - -#define __build_read_lock_const(rw, helper) \ - asm volatile(LOCK "subl $1,%0\n\t" \ - "js 2f\n" \ - "1:\n" \ - 
LOCK_SECTION_START("") \ - "2:\tpushl %%eax\n\t" \ - "leal %0,%%eax\n\t" \ - "call " helper "\n\t" \ - "popl %%eax\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - :"=m" (*(volatile int *)rw) : : "memory") +#ifdef CONFIG_SPINLINE + + #define __build_read_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $1,(%0)\n\t" \ + "jns 1f\n\t" \ + "call " helper "\n\t" \ + "1:\t" \ + ::"a" (rw) : "memory") + + #define __build_read_lock_const(rw, helper) \ + asm volatile(LOCK "subl $1,%0\n\t" \ + "jns 1f\n\t" \ + "pushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "1:\t" \ + :"=m" (*(volatile int *)rw) : : "memory") + +#else /* !CONFIG_SPINLINE */ + + #define __build_read_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $1,(%0)\n\t" \ + "js 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tcall " helper "\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + ::"a" (rw) : "memory") + + #define __build_read_lock_const(rw, helper) \ + asm volatile(LOCK "subl $1,%0\n\t" \ + "js 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tpushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + :"=m" (*(volatile int *)rw) : : "memory") + +#endif /* CONFIG_SPINLINE */ + #define __build_read_lock(rw, helper) do { \ if (__builtin_constant_p(rw)) \ @@ -50,28 +74,51 @@ __build_read_lock_ptr(rw, helper); \ } while (0) -#define __build_write_lock_ptr(rw, helper) \ - asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ - "jnz 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tcall " helper "\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - ::"a" (rw) : "memory") - -#define __build_write_lock_const(rw, helper) \ - asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ - "jnz 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tpushl %%eax\n\t" \ - "leal %0,%%eax\n\t" \ - "call " helper "\n\t" \ - "popl %%eax\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - :"=m" (*(volatile int *)rw) : : "memory") +#ifdef CONFIG_SPINLINE + + #define __build_write_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ + "jz 1f\n\t" \ + "call " helper "\n\t" \ + "1:\n" \ + ::"a" (rw) : "memory") + + #define __build_write_lock_const(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ + "jz 1f\n\t" \ + "pushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "1:\n" \ + :"=m" (*(volatile int *)rw) : : "memory") + +#else /* !CONFIG_SPINLINE */ + + #define __build_write_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ + "jnz 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tcall " helper "\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + ::"a" (rw) : "memory") + + #define __build_write_lock_const(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ + "jnz 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tpushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + :"=m" (*(volatile int *)rw) : : "memory") + +#endif /* CONFIG_SPINLINE */ #define __build_write_lock(rw, helper) do { \ if (__builtin_constant_p(rw)) \ diff -prauN linux-2.5.73/include/asm-i386/spinlock.h wli-2.5.73-29/include/asm-i386/spinlock.h --- linux-2.5.73/include/asm-i386/spinlock.h 2003-06-22 11:32:37.000000000 -0700 +++ wli-2.5.73-29/include/asm-i386/spinlock.h 2003-06-23 23:48:32.000000000 -0700 @@ -6,6 +6,7 @@ #include #include #include +#include extern int printk(const 
char * fmt, ...) __attribute__ ((format (printf, 1, 2))); @@ -43,18 +44,35 @@ typedef struct { #define spin_is_locked(x) (*(volatile signed char *)(&(x)->lock) <= 0) #define spin_unlock_wait(x) do { barrier(); } while(spin_is_locked(x)) -#define spin_lock_string \ - "\n1:\t" \ - "lock ; decb %0\n\t" \ - "js 2f\n" \ - LOCK_SECTION_START("") \ - "2:\t" \ - "rep;nop\n\t" \ - "cmpb $0,%0\n\t" \ - "jle 2b\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END +#ifdef CONFIG_SPINLINE + #define spin_lock_string \ + "\n1:\t" \ + "lock ; decb %0\n\t" \ + "js 2f\n" \ + "jmp 3f\n" \ + "2:\t" \ + "rep;nop\n\t" \ + "cmpb $0,%0\n\t" \ + "jle 2b\n\t" \ + "jmp 1b\n" \ + "3:\t" + +#else /* !CONFIG_SPINLINE */ + + #define spin_lock_string \ + "\n1:\t" \ + "lock ; decb %0\n\t" \ + "js 2f\n" \ + LOCK_SECTION_START("") \ + "2:\t" \ + "rep;nop\n\t" \ + "cmpb $0,%0\n\t" \ + "jle 2b\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END + +#endif /* CONFIG_SPINLINE */ /* * This works. Despite all the confusion. * (except on PPro SMP or if we are using OOSTORE) @@ -196,4 +214,51 @@ static inline int _raw_write_trylock(rwl return 0; } +/* + * x86 version of "atomic_dec_and_lock()" using + * the atomic "cmpxchg" instruction. + * + * (For CPU's lacking cmpxchg, we use the slow + * generic version, and this one never even gets + * compiled). + */ +#include +static inline int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +{ + int counter; + int newcount; +#ifdef CONFIG_PREEMPT + struct thread_info *info = current_thread_info(); +#endif + +repeat: + counter = atomic_read(atomic); + newcount = counter-1; + + if (!newcount) + goto slow_path; + + asm volatile("lock; cmpxchgl %1,%2" + :"=a" (newcount) + :"r" (newcount), "m" (atomic->counter), "0" (counter)); + + /* If the above failed, "eax" will have changed */ + if (newcount != counter) + goto repeat; + return 0; + +slow_path: +#ifdef CONFIG_PREEMPT + info->preempt_count++; +#endif + _raw_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + _raw_spin_unlock(lock); +#ifdef CONFIG_PREEMPT + info->preempt_count--; +#endif + return 0; +} + #endif /* __ASM_SPINLOCK_H */ diff -prauN linux-2.5.73/include/asm-i386/thread_info.h wli-2.5.73-29/include/asm-i386/thread_info.h --- linux-2.5.73/include/asm-i386/thread_info.h 2003-06-22 11:32:33.000000000 -0700 +++ wli-2.5.73-29/include/asm-i386/thread_info.h 2003-06-23 10:43:28.000000000 -0700 @@ -9,6 +9,8 @@ #ifdef __KERNEL__ +#include +#include #ifndef __ASSEMBLY__ #include #endif @@ -30,9 +32,11 @@ struct thread_info { __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: + 0 for interrupts: illegal 0-0xBFFFFFFF for user-thead 0-0xFFFFFFFF for kernel-thread */ + struct thread_info *irq_stack; /* pointer to cpu irq stack */ struct restart_block restart_block; __u8 supervisor_stack[0]; @@ -48,7 +52,8 @@ struct thread_info { #define TI_CPU 0x00000010 #define TI_PRE_COUNT 0x00000014 #define TI_ADDR_LIMIT 0x00000018 -#define TI_RESTART_BLOCK 0x000001C +#define TI_IRQ_STACK 0x0000001C +#define TI_RESTART_BLOCK 0x0000026 #endif @@ -59,46 +64,64 @@ struct thread_info { * * preempt_count needs to be 1 initially, until the scheduler is functional. 
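The cmpxchg-based atomic_dec_and_lock() above keeps the usual contract: it returns nonzero with the lock held only when the count reaches zero. A minimal caller sketch with a made-up refcounted object, not taken from the patch:

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <asm/atomic.h>

struct foo {
	atomic_t refcount;
	struct list_head list;
};

static spinlock_t foo_list_lock = SPIN_LOCK_UNLOCKED;

static void foo_put(struct foo *f)
{
	/* Take the list lock only on the final reference drop;
	 * atomic_dec_and_lock() returns 1 with the lock held in that case. */
	if (atomic_dec_and_lock(&f->refcount, &foo_list_lock)) {
		list_del(&f->list);
		spin_unlock(&foo_list_lock);
		kfree(f);
	}
}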
*/ +#ifdef CONFIG_4K_STACK +#define THREAD_ORDER 0 +#define STACK_WARN 0x200 +#define STACK_PANIC 0x100 +#else +#define THREAD_ORDER 1 +#define STACK_WARN ((THREAD_SIZE)>>1) +#define STACK_PANIC 0x100 +#endif +#define INIT_THREAD_SIZE THREAD_SIZE + #ifndef __ASSEMBLY__ -#define INIT_THREAD_INFO(tsk) \ -{ \ - .task = &tsk, \ - .exec_domain = &default_exec_domain, \ - .flags = 0, \ - .cpu = 0, \ - .preempt_count = 1, \ - .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ +#define INIT_THREAD_INFO(tsk) \ +{ \ + .task = &tsk, \ + .exec_domain = &default_exec_domain, \ + .flags = 0, \ + .cpu = 0, \ + .preempt_count = 1, \ + .addr_limit = KERNEL_DS, \ + .irq_stack = &init_irq_union.thread_info, \ + .restart_block = { \ + .fn = do_no_restart_syscall, \ + } \ } #define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) +/* thread information allocation */ +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) +#define alloc_thread_info(tsk) ((struct thread_info *) __get_free_pages(GFP_KERNEL,THREAD_ORDER)) +#define free_thread_info(ti) free_pages((unsigned long) (ti), THREAD_ORDER) +#define get_thread_info(ti) get_task_struct((ti)->task) +#define put_thread_info(ti) put_task_struct((ti)->task) + /* how to get the thread information struct from C */ static inline struct thread_info *current_thread_info(void) { struct thread_info *ti; - __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~8191UL)); + __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~(THREAD_SIZE - 1))); return ti; } -/* thread information allocation */ -#define THREAD_SIZE (2*PAGE_SIZE) -#define alloc_thread_info(tsk) ((struct thread_info *) __get_free_pages(GFP_KERNEL,1)) -#define free_thread_info(ti) free_pages((unsigned long) (ti), 1) -#define get_thread_info(ti) get_task_struct((ti)->task) -#define put_thread_info(ti) put_task_struct((ti)->task) - #else /* !__ASSEMBLY__ */ +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) + /* how to get the thread information struct from ASM */ #define GET_THREAD_INFO(reg) \ - movl $-8192, reg; \ + movl $-THREAD_SIZE, reg; \ andl %esp, reg +/* use this one if reg already contains %esp */ +#define GET_THREAD_INFO_WITH_ESP(reg) \ +andl $-THREAD_SIZE, reg + #endif /* diff -prauN linux-2.5.73/include/asm-i386/tlb.h wli-2.5.73-29/include/asm-i386/tlb.h --- linux-2.5.73/include/asm-i386/tlb.h 2003-06-22 11:32:39.000000000 -0700 +++ wli-2.5.73-29/include/asm-i386/tlb.h 2003-06-23 10:33:02.000000000 -0700 @@ -1,10 +1,54 @@ #ifndef _I386_TLB_H #define _I386_TLB_H - /* - * x86 doesn't need any special per-pte or - * per-vma handling.. + * include/asm-i386/tlb.h + * (C) June 2003 William Irwin, IBM + * Routines for pagetable cacheing and release. 
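The rewritten asm-i386/tlb.h below batches page-table pages per zone rather than freeing them one at a time; generic code keeps driving it through the familiar gather API. A schematic of that calling convention, with the per-pte loop elided (illustrative, not a hunk from this patch):

/* Schematic unmap path: how generic mm code is expected to drive the
 * helpers defined below.  The per-pte loop body is elided. */
static void unmap_region_sketch(struct mm_struct *mm,
				unsigned long start, unsigned long end)
{
	struct mmu_gather *tlb;

	spin_lock(&mm->page_table_lock);
	tlb = tlb_gather_mmu(mm, 0);		/* 0: not a full-mm teardown */
	/*
	 * for each present pte in [start, end):
	 *	tlb_remove_tlb_entry(tlb, pte, addr);
	 *	tlb_remove_page(tlb, pfn_to_page(pte_pfn(*pte)));
	 * Page-table pages released via pte_free()/pte_free_tlb() take the
	 * PagePTE path and are batched per zone rather than freed at once.
	 */
	tlb_finish_mmu(tlb, start, end);	/* flushes the TLB, frees batches */
	spin_unlock(&mm->page_table_lock);
}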
*/ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_HIGHPTE +#define GFP_PTE (GFP_KERNEL|__GFP_REPEAT|__GFP_HIGHMEM) +#else +#define GFP_PTE (GFP_KERNEL|__GFP_REPEAT) +#endif + +#ifdef CONFIG_HIGHPMD +#define GFP_PMD (GFP_KERNEL|__GFP_REPEAT|__GFP_HIGHMEM) +#else +#define GFP_PMD (GFP_KERNEL|__GFP_REPEAT) +#endif + +#define PG_PTE PG_arch_1 +#define NR_PTE 128 +#define FREE_PTE_NR NR_PTE +#define NR_NONPTE 512 +#define MAX_ZONE_ID (MAX_NUMNODES * MAX_NR_ZONES) + +#define PagePTE(page) test_bit(PG_PTE, &(page)->flags) +#define SetPagePTE(page) set_bit(PG_PTE, &(page)->flags) +#define ClearPagePTE(page) clear_bit(PG_PTE, &(page)->flags) +#define TestSetPagePTE(page) test_and_set_bit(PG_PTE, &(page)->flags) +#define TestClearPagePTE(page) test_and_clear_bit(PG_PTE, &(page)->flags) +#define PageZoneID(page) ((page)->flags >> ZONE_SHIFT) + +struct mmu_gather { + struct mm_struct *mm; + int nr_pte_active, nr_pte_ready, nr_nonpte, need_flush, fullmm, freed; + struct list_head active_list[MAX_ZONE_ID], ready_list[MAX_ZONE_ID]; + int active_count[MAX_ZONE_ID], ready_count[MAX_ZONE_ID]; + struct page *nonpte[NR_NONPTE]; +}; + +DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); + #define tlb_start_vma(tlb, vma) do { } while (0) #define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) @@ -15,6 +59,109 @@ */ #define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) -#include +void tlb_init(void); -#endif +static inline +struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, unsigned int flush) +{ + struct mmu_gather *tlb = &per_cpu(mmu_gathers, get_cpu()); + tlb->mm = mm; + tlb->fullmm = flush; + tlb->freed = 0; + put_cpu(); + return tlb; +} + +static inline +void tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *pte, unsigned long addr) +{ + tlb->need_flush = 1; +} + +static inline +void tlb_flush_ready(struct mmu_gather *tlb) +{ + int zone = 0; + while (tlb->nr_pte_ready >= NR_PTE) { + if (!list_empty(&tlb->ready_list[zone])) { + struct page *head = list_entry(tlb->ready_list[zone].next, struct page, list); + list_del_init(&head->list); + list_splice_init(&tlb->ready_list[zone], &head->list); + head->private = tlb->ready_count[zone]; + tlb->nr_pte_ready -= tlb->ready_count[zone]; + tlb->ready_count[zone] = 0; + free_pages_bulk(zone_table[zone], head, 0); + } + ++zone; + } +} + +static inline +void tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) +{ + int zone; + + if (!tlb->need_flush && tlb->nr_nonpte < NR_NONPTE) + return; + + tlb->need_flush = 0; + tlb_flush(tlb); + if (tlb->nr_nonpte) { + free_pages_and_swap_cache(tlb->nonpte, tlb->nr_nonpte); + tlb->nr_nonpte = 0; + } + + for (zone = 0; zone < MAX_ZONE_ID; ++zone) { + if (!tlb->active_count[zone]) + continue; + + list_splice_init(&tlb->active_list[zone], &tlb->ready_list[zone]); + tlb->ready_count[zone] += tlb->active_count[zone]; + tlb->active_count[zone] = 0; + } + tlb->nr_pte_ready += tlb->nr_pte_active; + tlb->nr_pte_active = 0; + if (tlb->nr_pte_ready >= NR_PTE) + tlb_flush_ready(tlb); +} + +static inline +void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) +{ + if (tlb->mm->rss >= tlb->freed) + tlb->mm->rss -= tlb->freed; + else + tlb->mm->rss = 0; + tlb_flush_mmu(tlb, start, end); +} + +static inline +void tlb_remove_nonpte_page(struct mmu_gather *tlb, struct page *page) +{ + tlb->nonpte[tlb->nr_nonpte] = page; + tlb->nr_nonpte++; + if (tlb->nr_nonpte >= NR_NONPTE) + tlb_flush_mmu(tlb, 0, 0); 
+} + +static inline +void tlb_remove_pte_page(struct mmu_gather *tlb, struct page *page) +{ + int zone = PageZoneID(page); + ClearPagePTE(page); + tlb->nr_pte_active++; + tlb->active_count[zone]++; + list_add(&page->list, &tlb->active_list[zone]); +} + +static inline +void tlb_remove_page(struct mmu_gather *tlb, struct page *page) +{ + tlb->need_flush = 1; + if (PagePTE(page)) + tlb_remove_pte_page(tlb, page); + else + tlb_remove_nonpte_page(tlb, page); +} + +#endif /* _I386_TLB_H */ diff -prauN linux-2.5.73/include/asm-ia64/pgtable.h wli-2.5.73-29/include/asm-ia64/pgtable.h --- linux-2.5.73/include/asm-ia64/pgtable.h 2003-06-22 11:32:39.000000000 -0700 +++ wli-2.5.73-29/include/asm-ia64/pgtable.h 2003-06-23 10:31:02.000000000 -0700 @@ -257,7 +257,8 @@ ia64_phys_addr_valid (unsigned long addr #define pgd_bad(pgd) (!ia64_phys_addr_valid(pgd_val(pgd))) #define pgd_present(pgd) (pgd_val(pgd) != 0UL) #define pgd_clear(pgdp) (pgd_val(*(pgdp)) = 0UL) -#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & _PFN_MASK)) +#define __pgd_page(pgd) ((unsigned long)__va(pgd_val(pgd) & _PFN_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) /* * The following have defined behavior only work if pte_present() is true. @@ -326,7 +327,13 @@ pgd_offset (struct mm_struct *mm, unsign /* Find an entry in the second-level page table.. */ #define pmd_offset(dir,addr) \ - ((pmd_t *) pgd_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) + ((pmd_t *)__pgd_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) + +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) /* * Find an entry in the third-level page table. 
This looks more complicated than it diff -prauN linux-2.5.73/include/asm-m68k/motorola_pgtable.h wli-2.5.73-29/include/asm-m68k/motorola_pgtable.h --- linux-2.5.73/include/asm-m68k/motorola_pgtable.h 2003-06-22 11:32:57.000000000 -0700 +++ wli-2.5.73-29/include/asm-m68k/motorola_pgtable.h 2003-06-23 10:31:02.000000000 -0700 @@ -115,6 +115,7 @@ extern inline void pgd_set(pgd_t * pgdp, #define __pte_page(pte) ((unsigned long)__va(pte_val(pte) & PAGE_MASK)) #define __pmd_page(pmd) ((unsigned long)__va(pmd_val(pmd) & _TABLE_MASK)) #define __pgd_page(pgd) ((unsigned long)__va(pgd_val(pgd) & _TABLE_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) #define pte_none(pte) (!pte_val(pte)) diff -prauN linux-2.5.73/include/asm-m68knommu/pgtable.h wli-2.5.73-29/include/asm-m68knommu/pgtable.h --- linux-2.5.73/include/asm-m68knommu/pgtable.h 2003-06-22 11:32:56.000000000 -0700 +++ wli-2.5.73-29/include/asm-m68knommu/pgtable.h 2003-06-23 10:31:02.000000000 -0700 @@ -21,7 +21,12 @@ typedef pte_t *pte_addr_t; #define pgd_bad(pgd) (0) #define pgd_clear(pgdp) #define kern_addr_valid(addr) (1) -#define pmd_offset(a, b) ((void *)0) +#define pmd_offset(a, b) ((void *)0) +#define pmd_offset_kernel(a, b) pmd_offset(a, b) +#define pmd_offset_map(a, b) pmd_offset(a, b) +#define pmd_offset_map_nested(a, b) pmd_offset(a, b) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) #define PAGE_NONE __pgprot(0) #define PAGE_SHARED __pgprot(0) diff -prauN linux-2.5.73/include/asm-mips64/pgtable.h wli-2.5.73-29/include/asm-mips64/pgtable.h --- linux-2.5.73/include/asm-mips64/pgtable.h 2003-06-22 11:33:34.000000000 -0700 +++ wli-2.5.73-29/include/asm-mips64/pgtable.h 2003-06-23 10:31:02.000000000 -0700 @@ -274,11 +274,13 @@ extern inline unsigned long pmd_page(pmd return pmd_val(pmd); } -extern inline unsigned long pgd_page(pgd_t pgd) +extern inline unsigned long __pgd_page(pgd_t pgd) { return pgd_val(pgd); } +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) + extern inline void pmd_set(pmd_t * pmdp, pte_t * ptep) { pmd_val(*pmdp) = (((unsigned long) ptep) & PAGE_MASK); @@ -520,7 +522,7 @@ extern inline pgd_t *pgd_offset(struct m /* Find an entry in the second-level page table.. 
*/ extern inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) { - return (pmd_t *) pgd_page(*dir) + + return (pmd_t *)__pgd_page(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PMD - 1)); } diff -prauN linux-2.5.73/include/asm-parisc/cacheflush.h wli-2.5.73-29/include/asm-parisc/cacheflush.h --- linux-2.5.73/include/asm-parisc/cacheflush.h 2003-06-22 11:33:32.000000000 -0700 +++ wli-2.5.73-29/include/asm-parisc/cacheflush.h 2003-06-23 10:46:31.000000000 -0700 @@ -66,7 +66,7 @@ extern void __flush_dcache_page(struct p static inline void flush_dcache_page(struct page *page) { - if (page->mapping && list_empty(&page->mapping->i_mmap) && - list_empty(&page->mapping->i_mmap_shared)) { + if (page_mapping(page) && list_empty(&page_mapping(page)->i_mmap) && + list_empty(&page_mapping(page)->i_mmap_shared)) { set_bit(PG_dcache_dirty, &page->flags); } else { diff -prauN linux-2.5.73/include/asm-parisc/pgtable.h wli-2.5.73-29/include/asm-parisc/pgtable.h --- linux-2.5.73/include/asm-parisc/pgtable.h 2003-06-22 11:33:15.000000000 -0700 +++ wli-2.5.73-29/include/asm-parisc/pgtable.h 2003-06-23 10:31:02.000000000 -0700 @@ -242,7 +242,8 @@ extern unsigned long *empty_zero_page; #ifdef __LP64__ -#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define __pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) /* For 64 bit we have three level tables */ @@ -339,11 +340,17 @@ extern inline pte_t pte_modify(pte_t pte #ifdef __LP64__ #define pmd_offset(dir,address) \ -((pmd_t *) pgd_page(*(dir)) + (((address)>>PMD_SHIFT) & (PTRS_PER_PMD-1))) +((pmd_t *)__pgd_page(*(dir)) + (((address)>>PMD_SHIFT) & (PTRS_PER_PMD-1))) #else #define pmd_offset(dir,addr) ((pmd_t *) dir) #endif +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table.. */ #define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE-1)) #define pte_offset_kernel(pmd, address) \ diff -prauN linux-2.5.73/include/asm-ppc/pgtable.h wli-2.5.73-29/include/asm-ppc/pgtable.h --- linux-2.5.73/include/asm-ppc/pgtable.h 2003-06-22 11:32:37.000000000 -0700 +++ wli-2.5.73-29/include/asm-ppc/pgtable.h 2003-06-23 10:31:02.000000000 -0700 @@ -370,8 +370,9 @@ static inline int pgd_bad(pgd_t pgd) { static inline int pgd_present(pgd_t pgd) { return 1; } #define pgd_clear(xp) do { } while (0) -#define pgd_page(pgd) \ +#define __pgd_page(pgd) \ ((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) /* * The following only work if pte_present() is true. diff -prauN linux-2.5.73/include/asm-ppc64/pgtable.h wli-2.5.73-29/include/asm-ppc64/pgtable.h --- linux-2.5.73/include/asm-ppc64/pgtable.h 2003-06-22 11:33:18.000000000 -0700 +++ wli-2.5.73-29/include/asm-ppc64/pgtable.h 2003-06-23 10:31:02.000000000 -0700 @@ -190,7 +190,8 @@ extern unsigned long empty_zero_page[PAG #define pgd_bad(pgd) ((pgd_val(pgd)) == 0) #define pgd_present(pgd) (pgd_val(pgd) != 0UL) #define pgd_clear(pgdp) (pgd_val(*(pgdp)) = 0UL) -#define pgd_page(pgd) (__bpn_to_ba(pgd_val(pgd))) +#define __pgd_page(pgd) (__bpn_to_ba(pgd_val(pgd))) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) /* * Find an entry in a page-table-directory.
We combine the address region @@ -203,12 +204,18 @@ extern unsigned long empty_zero_page[PAG /* Find an entry in the second-level page table.. */ #define pmd_offset(dir,addr) \ - ((pmd_t *) pgd_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) + ((pmd_t *)__pgd_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) /* Find an entry in the third-level page table.. */ #define pte_offset_kernel(dir,addr) \ ((pte_t *) pmd_page_kernel(*(dir)) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) #define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) #define pte_unmap(pte) do { } while(0) diff -prauN linux-2.5.73/include/asm-s390/pgtable.h wli-2.5.73-29/include/asm-s390/pgtable.h --- linux-2.5.73/include/asm-s390/pgtable.h 2003-06-22 11:33:07.000000000 -0700 +++ wli-2.5.73-29/include/asm-s390/pgtable.h 2003-06-23 10:31:02.000000000 -0700 @@ -613,6 +613,7 @@ static inline pte_t mk_pte_phys(unsigned /* to find an entry in a page-table-directory */ #define pgd_index(address) ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) #define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address)) +#define pgd_page(pgd) virt_to_page(pgd_page_kernel(pgd)) /* to find an entry in a kernel page-table-directory */ #define pgd_offset_k(address) pgd_offset(&init_mm, address) @@ -634,6 +635,12 @@ extern inline pmd_t * pmd_offset(pgd_t * #endif /* __s390x__ */ +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table.. 
*/ #define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE-1)) #define pte_offset_kernel(pmd, address) \ diff -prauN linux-2.5.73/include/asm-sh/pgalloc.h wli-2.5.73-29/include/asm-sh/pgalloc.h --- linux-2.5.73/include/asm-sh/pgalloc.h 2003-06-22 11:32:45.000000000 -0700 +++ wli-2.5.73-29/include/asm-sh/pgalloc.h 2003-06-23 10:46:31.000000000 -0700 @@ -109,7 +109,7 @@ static inline pte_t ptep_get_and_clear(p unsigned long pfn = pte_pfn(pte); if (pfn_valid(pfn)) { page = pfn_to_page(page); - if (!page->mapping || !page->mapping->i_mmap_shared) + if (!page_mapping(page) || !page_mapping(page)->i_mmap_shared) __clear_bit(PG_mapped, &page->flags); } } diff -prauN linux-2.5.73/include/asm-sh/pgtable-2level.h wli-2.5.73-29/include/asm-sh/pgtable-2level.h --- linux-2.5.73/include/asm-sh/pgtable-2level.h 2003-06-22 11:33:32.000000000 -0700 +++ wli-2.5.73-29/include/asm-sh/pgtable-2level.h 2003-06-23 10:31:02.000000000 -0700 @@ -48,8 +48,9 @@ static inline void pgd_clear (pgd_t * pg #define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval) #define set_pgd(pgdptr, pgdval) (*(pgdptr) = pgdval) -#define pgd_page(pgd) \ +#define __pgd_page(pgd) \ ((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) { diff -prauN linux-2.5.73/include/asm-sparc/pgtable.h wli-2.5.73-29/include/asm-sparc/pgtable.h --- linux-2.5.73/include/asm-sparc/pgtable.h 2003-06-22 11:32:56.000000000 -0700 +++ wli-2.5.73-29/include/asm-sparc/pgtable.h 2003-06-23 10:31:02.000000000 -0700 @@ -202,10 +202,11 @@ extern unsigned long empty_zero_page; /* */ BTFIXUPDEF_CALL_CONST(struct page *, pmd_page, pmd_t) -BTFIXUPDEF_CALL_CONST(unsigned long, pgd_page, pgd_t) +BTFIXUPDEF_CALL_CONST(unsigned long, __pgd_page, pgd_t) #define pmd_page(pmd) BTFIXUP_CALL(pmd_page)(pmd) -#define pgd_page(pgd) BTFIXUP_CALL(pgd_page)(pgd) +#define __pgd_page(pgd) BTFIXUP_CALL(__pgd_page)(pgd) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) BTFIXUPDEF_SETHI(none_mask) BTFIXUPDEF_CALL_CONST(int, pte_present, pte_t) @@ -352,6 +353,11 @@ extern __inline__ pte_t pte_modify(pte_t /* Find an entry in the second-level page table.. */ BTFIXUPDEF_CALL(pmd_t *, pmd_offset, pgd_t *, unsigned long) #define pmd_offset(dir,addr) BTFIXUP_CALL(pmd_offset)(dir,addr) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) /* Find an entry in the third-level page table.. 
*/ BTFIXUPDEF_CALL(pte_t *, pte_offset_kernel, pmd_t *, unsigned long) diff -prauN linux-2.5.73/include/asm-sparc64/pgtable.h wli-2.5.73-29/include/asm-sparc64/pgtable.h --- linux-2.5.73/include/asm-sparc64/pgtable.h 2003-06-22 11:32:31.000000000 -0700 +++ wli-2.5.73-29/include/asm-sparc64/pgtable.h 2003-06-23 10:31:02.000000000 -0700 @@ -228,7 +228,8 @@ static inline pte_t pte_modify(pte_t ori (pgd_val(*(pgdp)) = (__pa((unsigned long) (pmdp)) >> 11UL)) #define __pmd_page(pmd) ((unsigned long) __va((pmd_val(pmd)<<11UL))) #define pmd_page(pmd) virt_to_page((void *)__pmd_page(pmd)) -#define pgd_page(pgd) ((unsigned long) __va((pgd_val(pgd)<<11UL))) +#define __pgd_page(pgd) ((unsigned long) __va((pgd_val(pgd)<<11UL))) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) #define pte_none(pte) (!pte_val(pte)) #define pte_present(pte) (pte_val(pte) & _PAGE_PRESENT) #define pte_clear(pte) (pte_val(*(pte)) = 0UL) @@ -270,8 +271,13 @@ static inline pte_t pte_modify(pte_t ori #define pgd_offset_k(address) pgd_offset(&init_mm, address) /* Find an entry in the second-level page table.. */ -#define pmd_offset(dir, address) ((pmd_t *) pgd_page(*(dir)) + \ +#define pmd_offset(dir, address) ((pmd_t *)__pgd_page(*(dir)) + \ ((address >> PMD_SHIFT) & (REAL_PTRS_PER_PMD-1))) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) /* Find an entry in the third-level page table.. */ #define pte_index(dir, address) ((pte_t *) __pmd_page(*(dir)) + \ diff -prauN linux-2.5.73/include/asm-v850/pgtable.h wli-2.5.73-29/include/asm-v850/pgtable.h --- linux-2.5.73/include/asm-v850/pgtable.h 2003-06-22 11:32:58.000000000 -0700 +++ wli-2.5.73-29/include/asm-v850/pgtable.h 2003-06-23 10:31:02.000000000 -0700 @@ -13,6 +13,11 @@ typedef pte_t *pte_addr_t; #define pgd_clear(pgdp) ((void)0) #define pmd_offset(a, b) ((void *)0) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) #define kern_addr_valid(addr) (1) diff -prauN linux-2.5.73/include/asm-x86_64/pgtable.h wli-2.5.73-29/include/asm-x86_64/pgtable.h --- linux-2.5.73/include/asm-x86_64/pgtable.h 2003-06-22 11:33:02.000000000 -0700 +++ wli-2.5.73-29/include/asm-x86_64/pgtable.h 2003-06-23 10:31:02.000000000 -0700 @@ -98,8 +98,9 @@ static inline void set_pml4(pml4_t *dst, pml4_val(*dst) = pml4_val(val); } -#define pgd_page(pgd) \ +#define __pgd_page(pgd) \ ((unsigned long) __va(pgd_val(pgd) & PHYSICAL_PAGE_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) #define ptep_get_and_clear(xp) __pte(xchg(&(xp)->pte, 0)) #define pte_same(a, b) ((a).pte == (b).pte) @@ -332,8 +333,13 @@ static inline pgd_t *current_pgd_offset_ #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) -#define pmd_offset(dir, address) ((pmd_t *) pgd_page(*(dir)) + \ +#define pmd_offset(dir, address) ((pmd_t *)__pgd_page(*(dir)) + \ pmd_index(address)) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define 
pmd_unmap_nested(pmd) do { } while (0) #define pmd_none(x) (!pmd_val(x)) #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) diff -prauN linux-2.5.73/include/linux/dcache.h wli-2.5.73-29/include/linux/dcache.h --- linux-2.5.73/include/linux/dcache.h 2003-06-22 11:33:35.000000000 -0700 +++ wli-2.5.73-29/include/linux/dcache.h 2003-06-23 10:38:47.000000000 -0700 @@ -154,6 +154,7 @@ d_iput: no no yes #define DCACHE_UNHASHED 0x0010 extern spinlock_t dcache_lock; +extern spinlock_t vfsmount_lock; /** * d_drop - drop a dentry diff -prauN linux-2.5.73/include/linux/fs.h wli-2.5.73-29/include/linux/fs.h --- linux-2.5.73/include/linux/fs.h 2003-06-22 11:32:38.000000000 -0700 +++ wli-2.5.73-29/include/linux/fs.h 2003-06-23 10:44:16.000000000 -0700 @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include struct iovec; @@ -309,11 +311,29 @@ struct address_space_operations { loff_t offset, unsigned long nr_segs); }; +#if NR_CPUS > 8 +typedef rwlock_t mapping_rwlock_t; +#define mapping_rdlock(lock) read_lock(lock) +#define mapping_rdunlock(lock) read_unlock(lock) +#define mapping_wrlock(lock) write_lock(lock) +#define mapping_wrunlock(lock) write_unlock(lock) +#define mapping_rwlock_init(lock) rwlock_init(lock) +#define MAPPING_RW_LOCK_UNLOCKED RW_LOCK_UNLOCKED +#else +typedef spinlock_t mapping_rwlock_t; +#define mapping_rdlock(lock) spin_lock(lock) +#define mapping_rdunlock(lock) spin_unlock(lock) +#define mapping_wrlock(lock) spin_lock(lock) +#define mapping_wrunlock(lock) spin_unlock(lock) +#define mapping_rwlock_init(lock) spin_lock_init(lock) +#define MAPPING_RW_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED +#endif + struct backing_dev_info; struct address_space { struct inode *host; /* owner: inode, block_device */ struct radix_tree_root page_tree; /* radix tree of all pages */ - spinlock_t page_lock; /* and spinlock protecting it */ + mapping_rwlock_t page_lock; /* and spinlock protecting it */ struct list_head clean_pages; /* list of clean pages */ struct list_head dirty_pages; /* list of dirty pages */ struct list_head locked_pages; /* list of locked pages */ @@ -322,7 +342,7 @@ struct address_space { struct address_space_operations *a_ops; /* methods */ struct list_head i_mmap; /* list of private mappings */ struct list_head i_mmap_shared; /* list of shared mappings */ - struct semaphore i_shared_sem; /* protect both above lists */ + spinlock_t i_shared_lock; /* protect both above lists */ unsigned long dirtied_when; /* jiffies of first page dirtying */ int gfp_mask; /* how to allocate the pages */ struct backing_dev_info *backing_dev_info; /* device readahead, etc */ diff -prauN linux-2.5.73/include/linux/gfp.h wli-2.5.73-29/include/linux/gfp.h --- linux-2.5.73/include/linux/gfp.h 2003-06-22 11:32:38.000000000 -0700 +++ wli-2.5.73-29/include/linux/gfp.h 2003-06-23 10:33:02.000000000 -0700 @@ -76,6 +76,7 @@ static inline struct page * alloc_pages_ extern unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order)); extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask)); +void free_pages_bulk(struct zone *zone, struct page *page, unsigned int order); #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask),0) diff -prauN linux-2.5.73/include/linux/hugetlb.h wli-2.5.73-29/include/linux/hugetlb.h --- linux-2.5.73/include/linux/hugetlb.h 2003-06-22 11:32:45.000000000 -0700 +++ wli-2.5.73-29/include/linux/hugetlb.h 2003-06-23 10:38:04.000000000 -0700 @@ -41,6 +41,11 @@ 
mark_mm_hugetlb(struct mm_struct *mm, st #define is_hugepage_only_range(addr, len) 0 #endif +#define vm_account_huge_inc(vma, pte, addr) \ + vm_account(vma, pte, addr, HPAGE_SIZE/PAGE_SIZE) +#define vm_account_huge_dec(vma, pte, addr) \ + vm_account(vma, pte, addr, -(HPAGE_SIZE/PAGE_SIZE)) + #else /* !CONFIG_HUGETLB_PAGE */ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) diff -prauN linux-2.5.73/include/linux/mm.h wli-2.5.73-29/include/linux/mm.h --- linux-2.5.73/include/linux/mm.h 2003-06-22 11:32:31.000000000 -0700 +++ wli-2.5.73-29/include/linux/mm.h 2003-06-23 10:53:46.000000000 -0700 @@ -12,6 +12,7 @@ #include #include #include +#include #ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; @@ -77,6 +78,7 @@ struct vm_area_struct { units, *not* PAGE_CACHE_SIZE */ struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ + struct rcu_head rcu; }; /* @@ -110,6 +112,7 @@ struct vm_area_struct { #define VM_RESERVED 0x00080000 /* Don't unmap it from swap_out */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ +#define VM_DEAD 0x00800000 /* vma is dead, don't touch */ #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS @@ -146,8 +149,6 @@ struct vm_operations_struct { int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock); }; -/* forward declaration; pte_chain is meant to be internal to rmap.c */ -struct pte_chain; struct mmu_gather; struct inode; @@ -171,15 +172,14 @@ struct page { updated asynchronously */ atomic_t count; /* Usage count, see below. */ struct list_head list; /* ->mapping has some page lists. */ - struct address_space *mapping; /* The inode (or ...) we belong to. */ + unsigned long __mapping; /* The inode (or ...) we belong to. */ unsigned long index; /* Our offset within mapping. */ struct list_head lru; /* Pageout list, eg. active_list; protected by zone->lru_lock !! */ union { - struct pte_chain *chain;/* Reverse pte mapping pointer. - * protected by PG_chainlock */ - pte_addr_t direct; - } pte; + unsigned long count; + struct rmap_chain *chain; + } rmap; unsigned long private; /* mapping-private opaque data */ /* @@ -339,9 +339,14 @@ static inline void set_page_zone(struct page->flags |= zone_num << ZONE_SHIFT; } -static inline void * lowmem_page_address(struct page *page) +#ifndef CONFIG_DISCONTIGMEM +/* The array of struct pages - for discontigmem use pgdat->lmem_map */ +extern struct page *mem_map; +#endif + +static inline void *lowmem_page_address(struct page *page) { - return __va( ( (page - page_zone(page)->zone_mem_map) + page_zone(page)->zone_start_pfn) << PAGE_SHIFT); + return __va(page_to_pfn(page) << PAGE_SHIFT); } #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) @@ -370,13 +375,40 @@ void page_address_init(void); #endif /* + * On an anonymous page mapped into a user virtual memory area, + * page->mapping points to its anonmm, not to a struct address_space. + * + * Please note that, confusingly, page_mapping() refers to the inode + * struct address_space which maps the page from disk, where page_mapped() + * refers to whether it's mapped into a user virtual address space.
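The comment above describes the change that ripples through the rest of the patch: page->mapping becomes an untyped word, page->__mapping, which holds either the inode's struct address_space or, for anonymous pages, the owning mm, discriminated by the new PG_anon flag. A standalone userspace model of that encoding (every identifier here is invented for the example; only the tagging idea mirrors the patch):

#include <assert.h>
#include <stdio.h>

/* Model of the page->__mapping encoding: one word holds either an
 * address_space pointer or an mm pointer, discriminated by a flag bit. */
struct fake_address_space { int dummy; };
struct fake_mm { int dummy; };

struct fake_page {
        unsigned long flags;            /* bit 0 plays the role of PG_anon */
        unsigned long __mapping;        /* address_space or mm, see flags */
};
#define FAKE_ANON 0x1UL

static struct fake_address_space *fake_page_mapping(struct fake_page *p)
{
        /* like page_mapping(): NULL for anonymous pages */
        return (p->flags & FAKE_ANON) ? NULL
                : (struct fake_address_space *)p->__mapping;
}

static struct fake_mm *fake_page_mm(struct fake_page *p)
{
        /* like page_mm(): only meaningful for anonymous pages */
        assert(p->flags & FAKE_ANON);
        return (struct fake_mm *)p->__mapping;
}

int main(void)
{
        struct fake_mm mm;
        struct fake_page page = { FAKE_ANON, (unsigned long)&mm };

        printf("mapping=%p mm=%p\n",
               (void *)fake_page_mapping(&page), (void *)fake_page_mm(&page));
        return 0;
}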
+ */ +static inline struct address_space *page_mapping(struct page *page) +{ + if (PageAnon(page)) + return NULL; + else + return (struct address_space *)page->__mapping; +} + +static inline struct mm_struct *page_mm(struct page *page) +{ + BUG_ON(!PageAnon(page)); + return (struct mm_struct *)page->__mapping; +} + +static inline void set_page_mapping(struct page *page, void *ptr) +{ + page->__mapping = (unsigned long)ptr; +} + +/* * Return true if this page is mapped into pagetables. Subtle: test pte.direct * rather than pte.chain. Because sometimes pte.direct is 64-bit, and .chain * is only 32-bit. */ static inline int page_mapped(struct page *page) { - return page->pte.direct != 0; + return page->rmap.count != 0; } /* @@ -395,11 +427,6 @@ static inline int page_mapped(struct pag #define VM_FAULT_MINOR 1 #define VM_FAULT_MAJOR 2 -#ifndef CONFIG_DISCONTIGMEM -/* The array of struct pages - for discontigmem use pgdat->lmem_map */ -extern struct page *mem_map; -#endif - extern void show_free_areas(void); struct page *shmem_nopage(struct vm_area_struct * vma, @@ -423,23 +450,27 @@ int zeromap_page_range(struct vm_area_st extern int vmtruncate(struct inode * inode, loff_t offset); extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); +pmd_t *FASTCALL(__pmd_alloc_kernel(struct mm_struct *mm, pgd_t *pmd, unsigned long address)); extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); -extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); +pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t **pmd, unsigned long address)); extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot); extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); extern long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long nonblock); -void put_dirty_page(struct task_struct *tsk, struct page *page, - unsigned long address, pgprot_t prot); +void put_dirty_page(task_t *task, struct vm_area_struct *vma, + struct page *page, unsigned long address, pgprot_t prot); int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); -int __set_page_dirty_buffers(struct page *page); -int __set_page_dirty_nobuffers(struct page *page); +int set_page_dirty(struct page *page); +int set_page_dirty_buffers(struct page *page); +int set_page_dirty_nobuffers(struct page *page); int set_page_dirty_lock(struct page *page); +void free_vma(struct vm_area_struct *); + /* * Prototype to add a shrinker callback for ageable caches. * @@ -464,33 +495,15 @@ extern struct shrinker *set_shrinker(int extern void remove_shrinker(struct shrinker *shrinker); /* - * If the mapping doesn't provide a set_page_dirty a_op, then - * just fall through and assume that it wants buffer_heads. - * FIXME: make the method unconditional. 
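Note the changed prototype above: pte_alloc_map() now takes pmd_t ** rather than pmd_t *. Allocating a pte page may drop mm->page_table_lock (and, with HIGHPMD, the kmap of the pmd page), so the function re-walks the upper levels and hands back a freshly mapped pmd; on failure it returns with the pmd already unmapped. A sketch of the caller-side contract, assuming the pmd_alloc_map() wrapper defined just below; handle_pte() is a made-up placeholder, and this is not code from the patch:

static int touch_address(struct mm_struct *mm, struct vm_area_struct *vma,
                         unsigned long addr)
{
        pgd_t *pgd = pgd_offset(mm, addr);
        pmd_t *pmd;
        pte_t *pte;

        spin_lock(&mm->page_table_lock);
        pmd = pmd_alloc_map(mm, pgd, addr);
        if (!pmd)
                goto oom;
        pte = pte_alloc_map(mm, &pmd, addr);    /* may drop and retake locks */
        if (!pte)
                goto oom;                       /* pmd is already unmapped */
        handle_pte(vma, pte, addr);             /* hypothetical work on the pte */
        pte_unmap(pte);
        pmd_unmap(pmd);
        spin_unlock(&mm->page_table_lock);
        return 0;
oom:
        spin_unlock(&mm->page_table_lock);
        return -ENOMEM;
}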
- */ -static inline int set_page_dirty(struct page *page) -{ - if (page->mapping) { - int (*spd)(struct page *); - - spd = page->mapping->a_ops->set_page_dirty; - if (spd) - return (*spd)(page); - } - return __set_page_dirty_buffers(page); -} - -/* * On a two-level page table, this ends up being trivial. Thus the * inlining and the symmetry break with pte_alloc_map() that does all * of this out-of-line. */ -static inline pmd_t *pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) -{ - if (pgd_none(*pgd)) - return __pmd_alloc(mm, pgd, address); - return pmd_offset(pgd, address); -} +#define pmd_alloc_map(mm, pgd, addr) \ + (pgd_none(*(pgd))? __pmd_alloc(mm,pgd,addr): pmd_offset_map(pgd,addr)) + +#define pmd_alloc_kernel(mm, pgd, addr) \ + (pgd_none(*(pgd))? __pmd_alloc_kernel(mm,pgd,addr): pmd_offset_kernel(pgd,addr)) extern void free_area_init(unsigned long * zones_size); extern void free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, @@ -609,5 +622,75 @@ extern struct page * follow_page(struct int write); extern int remap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long to, unsigned long size, pgprot_t prot); + +static inline void vm_account(struct vm_area_struct *vma, pte_t pte, + unsigned long addr, long adjustment) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long pfn; + struct page *page; + + if (!pte_present(pte)) + return; + + pfn = pte_pfn(pte); + if (!pfn_valid(pfn)) + goto out; + + page = pfn_to_page(pfn); + if (PageReserved(page)) + goto out; + + if (vma->vm_flags & VM_EXECUTABLE) + mm->text += adjustment; + else if (vma->vm_flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN))) { + mm->data += adjustment; + mm->stack += adjustment; + } else if (addr >= TASK_UNMAPPED_BASE) + mm->lib += adjustment; + else + mm->data += adjustment; + + if (page_mapping(page)) + mm->shared += adjustment; + +out: + if (pte_write(pte)) + mm->dirty += adjustment; +} + +#define vm_account_inc(vma, pte, addr) vm_account(vma, pte, addr, +1) +#define vm_account_dec(vma, pte, addr) vm_account(vma, pte, addr, -1) + +static inline void vm_ptep_set_wrprotect(struct mm_struct *mm, pte_t *pte) +{ + if (pte_write(*pte)) + mm->dirty--; + ptep_set_wrprotect(pte); +} + +static inline void vm_set_pte(struct vm_area_struct *vma, pte_t *dst, + pte_t val, unsigned long addr) +{ + vm_account_inc(vma, val, addr); + set_pte(dst, val); +} + +static inline pte_t vm_ptep_get_and_clear(struct vm_area_struct *vma, + pte_t *pte, unsigned long addr) +{ + pte_t val = ptep_get_and_clear(pte); + vm_account_dec(vma, val, addr); + return val; +} + +static inline void vm_pte_clear(struct vm_area_struct *vma, pte_t *pte, + unsigned long addr) +{ + pte_t val = *pte; + pte_clear(pte); + vm_account_dec(vma, val, addr); +} + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff -prauN linux-2.5.73/include/linux/mmzone.h wli-2.5.73-29/include/linux/mmzone.h --- linux-2.5.73/include/linux/mmzone.h 2003-06-22 11:32:55.000000000 -0700 +++ wli-2.5.73-29/include/linux/mmzone.h 2003-06-23 10:29:54.000000000 -0700 @@ -26,8 +26,8 @@ #endif struct free_area { - struct list_head free_list; - unsigned long *map; + struct list_head free_list, deferred_pages; + unsigned long *map, globally_free, active, locally_free; }; struct pglist_data; diff -prauN linux-2.5.73/include/linux/page-flags.h wli-2.5.73-29/include/linux/page-flags.h --- linux-2.5.73/include/linux/page-flags.h 2003-06-22 11:32:59.000000000 -0700 +++ wli-2.5.73-29/include/linux/page-flags.h 2003-06-23 10:53:46.000000000 -0700 @@ 
-69,12 +69,14 @@ #define PG_private 12 /* Has something at ->private */ #define PG_writeback 13 /* Page is under writeback */ #define PG_nosave 14 /* Used for system suspend/resume */ -#define PG_chainlock 15 /* lock bit for ->pte_chain */ +#define PG_rmaplock 15 /* lock bit for ->pte_chain */ -#define PG_direct 16 /* ->pte_chain points directly at pte */ -#define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ -#define PG_reclaim 18 /* To be reclaimed asap */ -#define PG_compound 19 /* Part of a compound page */ +#define PG_mappedtodisk 16 /* Has blocks allocated on-disk */ +#define PG_reclaim 17 /* To be reclaimed asap */ +#define PG_compound 18 /* Part of a compound page */ +#define PG_anon 19 /* Anonymous page */ +#define PG_swapcache 20 /* Swap page; swp_entry_t in ->private */ +#define PG_chained 21 /* @@ -87,6 +89,7 @@ struct page_state { unsigned long nr_unstable; /* NFS unstable pages */ unsigned long nr_page_table_pages;/* Pages used for pagetables */ unsigned long nr_mapped; /* mapped into pagetables */ + unsigned long nr_swapcache; /* in swapcache */ unsigned long nr_slab; /* In slab */ #define GET_PAGE_STATE_LAST nr_slab @@ -248,11 +251,9 @@ extern void get_full_page_state(struct p #define ClearPageNosave(page) clear_bit(PG_nosave, &(page)->flags) #define TestClearPageNosave(page) test_and_clear_bit(PG_nosave, &(page)->flags) -#define PageDirect(page) test_bit(PG_direct, &(page)->flags) -#define SetPageDirect(page) set_bit(PG_direct, &(page)->flags) -#define TestSetPageDirect(page) test_and_set_bit(PG_direct, &(page)->flags) -#define ClearPageDirect(page) clear_bit(PG_direct, &(page)->flags) -#define TestClearPageDirect(page) test_and_clear_bit(PG_direct, &(page)->flags) +#define PageChained(page) test_bit(PG_chained, &(page)->flags) +#define SetPageChained(page) set_bit(PG_chained, &(page)->flags) +#define ClearPageChained(page) clear_bit(PG_chained, &(page)->flags) #define PageMappedToDisk(page) test_bit(PG_mappedtodisk, &(page)->flags) #define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags) @@ -267,15 +268,16 @@ extern void get_full_page_state(struct p #define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) #define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) -/* - * The PageSwapCache predicate doesn't use a PG_flag at this time, - * but it may again do so one day. 
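The vm_account() helper and the vm_set_pte()/vm_ptep_get_and_clear()/vm_pte_clear()/vm_ptep_set_wrprotect() wrappers added to include/linux/mm.h above exist so that every pte installation and teardown adjusts the new per-mm counters (text, lib, data, stack, shared, dirty) by matching amounts. A schematic round trip, illustrative only:

/* Illustrative only: a mapping installed through vm_set_pte() and later
 * removed through vm_ptep_get_and_clear() leaves every counter where it
 * started -- one of mm->{text,lib,data,stack} (plus ->shared for pages
 * with a file mapping, ->dirty for writable ptes) goes up and back down. */
static void account_roundtrip(struct vm_area_struct *vma, pte_t *ptep,
                              struct page *page, unsigned long addr)
{
        pte_t pte = mk_pte(page, vma->vm_page_prot);

        vm_set_pte(vma, ptep, pte, addr);       /* counters incremented */
        /* ... mapping in use ... */
        vm_ptep_get_and_clear(vma, ptep, addr); /* same counters decremented */
}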
- */ +#define PageAnon(page) test_bit(PG_anon, &(page)->flags) +#define SetPageAnon(page) set_bit(PG_anon, &(page)->flags) +#define ClearPageAnon(page) clear_bit(PG_anon, &(page)->flags) + #ifdef CONFIG_SWAP -extern struct address_space swapper_space; -#define PageSwapCache(page) ((page)->mapping == &swapper_space) +#define PageSwapCache(page) test_bit(PG_swapcache, &(page)->flags) +#define SetPageSwapCache(page) set_bit(PG_swapcache, &(page)->flags) +#define ClearPageSwapCache(page) clear_bit(PG_swapcache, &(page)->flags) #else -#define PageSwapCache(page) 0 +#define PageSwapCache(page) 0 #endif struct page; /* forward declaration */ diff -prauN linux-2.5.73/include/linux/pagemap.h wli-2.5.73-29/include/linux/pagemap.h --- linux-2.5.73/include/linux/pagemap.h 2003-06-22 11:32:30.000000000 -0700 +++ wli-2.5.73-29/include/linux/pagemap.h 2003-06-23 10:46:31.000000000 -0700 @@ -116,17 +116,6 @@ static inline unsigned long get_page_cac return atomic_read(&nr_pagecache); } -static inline void ___add_to_page_cache(struct page *page, - struct address_space *mapping, unsigned long index) -{ - list_add(&page->list, &mapping->clean_pages); - page->mapping = mapping; - page->index = index; - - mapping->nrpages++; - pagecache_acct(1); -} - extern void FASTCALL(__lock_page(struct page *page)); extern void FASTCALL(unlock_page(struct page *page)); diff -prauN linux-2.5.73/include/linux/pid.h wli-2.5.73-29/include/linux/pid.h --- linux-2.5.73/include/linux/pid.h 2003-06-22 11:32:37.000000000 -0700 +++ wli-2.5.73-29/include/linux/pid.h 2003-06-23 10:36:32.000000000 -0700 @@ -47,6 +47,7 @@ extern void FASTCALL(detach_pid(struct t * held. */ extern struct pid *FASTCALL(find_pid(enum pid_type, int)); +int find_next_pid(int); extern int alloc_pidmap(void); extern void FASTCALL(free_pidmap(int)); diff -prauN linux-2.5.73/include/linux/rmap-locking.h wli-2.5.73-29/include/linux/rmap-locking.h --- linux-2.5.73/include/linux/rmap-locking.h 2003-06-22 11:32:33.000000000 -0700 +++ wli-2.5.73-29/include/linux/rmap-locking.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,23 +0,0 @@ -/* - * include/linux/rmap-locking.h - * - * Locking primitives for exclusive access to a page's reverse-mapping - * pte chain. - */ - -#include - -struct pte_chain; -extern kmem_cache_t *pte_chain_cache; - -#define pte_chain_lock(page) bit_spin_lock(PG_chainlock, &page->flags) -#define pte_chain_unlock(page) bit_spin_unlock(PG_chainlock, &page->flags) - -struct pte_chain *pte_chain_alloc(int gfp_flags); -void __pte_chain_free(struct pte_chain *pte_chain); - -static inline void pte_chain_free(struct pte_chain *pte_chain) -{ - if (pte_chain) - __pte_chain_free(pte_chain); -} diff -prauN linux-2.5.73/include/linux/rmap.h wli-2.5.73-29/include/linux/rmap.h --- linux-2.5.73/include/linux/rmap.h 1969-12-31 16:00:00.000000000 -0800 +++ wli-2.5.73-29/include/linux/rmap.h 2003-06-23 10:58:03.000000000 -0700 @@ -0,0 +1,162 @@ +/* + * include/linux/rmap.h + * + * Locking primitives for exclusive access to a page's reverse-mapping + * pte chain. 
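include/linux/rmap-locking.h and the pte_chain machinery are removed above; the new include/linux/rmap.h beginning here replaces them. For orientation, the shift in caller-visible API looks roughly like this (old shapes taken from the removed declarations elsewhere in this patch, new shapes from the hunks that follow; schematic, not code from the patch):

/*
 * Old pte-chain API (removed):
 *
 *      pte_chain = pte_chain_alloc(GFP_KERNEL);
 *      ...
 *      set_pte(ptep, pte);
 *      pte_chain = page_add_rmap(page, ptep, pte_chain);
 *      ...
 *      page_remove_rmap(page, ptep);
 *      pte_chain_free(pte_chain);
 *
 * New API (this header): no chain object is threaded through; the reverse
 * map is keyed by (vma, user virtual address), and rmap_get_cpu() reserves
 * per-cpu resources before mm->page_table_lock is taken:
 *
 *      if (!rmap_get_cpu())
 *              return VM_FAULT_OOM;
 *      spin_lock(&mm->page_table_lock);
 *      put_cpu();
 *      vm_set_pte(vma, ptep, pte, addr);
 *      page_add_rmap(page, vma, addr, anon);
 *      ...
 *      page_remove_rmap(page);
 */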
+ */ + +#include +#include +#include +#include +#include +#include +#include + +struct anon { + atomic_t count; + spinlock_t lock; + struct list_head list; +}; + +#ifdef CONFIG_MMU + +int FASTCALL(rmap_get_cpu(void)); +void FASTCALL(page_turn_rmap(struct page *, struct vm_area_struct *)); +void FASTCALL(page_move_rmap(struct page *page, struct vm_area_struct *, unsigned long, unsigned long)); +void FASTCALL(add_rmap_address(struct page *, unsigned long)); +void FASTCALL(clear_page_chained(struct page *page)); + +/* + * Called from mm/vmscan.c to handle pageout + */ +int FASTCALL(page_referenced(struct page *)); +int FASTCALL(try_to_unmap(struct page *)); + +void init_rmap(void); +int exec_rmap(struct mm_struct *); +void dup_rmap(struct mm_struct *, struct mm_struct *); +void exit_rmap(struct mm_struct *); + +/* + * Return values of try_to_unmap(): + */ +#define SWAP_SUCCESS 0 +#define SWAP_AGAIN 1 +#define SWAP_FAIL 2 + +#else /* !CONFIG_MMU */ +#define page_referenced(page) TestClearPageReferenced(page) +#define init_rmap() do { } while (0) +#define exec_rmap(mm) ({ 0; }) +#define dup_rmap(new, old) ({ 0; }) +#define exit_rmap(mm) do { } while (0) +#define try_to_unmap(page) ({ SWAP_FAIL; }) +#endif /* CONFIG_MMU */ + +#define NOADDR (~0UL) + +static inline void rmap_lock(struct page *page) +{ + bit_spin_lock(PG_rmaplock, &page->flags); +} + +static inline void rmap_unlock(struct page *page) +{ + bit_spin_unlock(PG_rmaplock, &page->flags); +} + +#define page_mapcount(page) \ + (PageChained(page) ? (page)->rmap.chain->slot[0] : (page)->rmap.count) + +#define NRSLOT ((L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(unsigned long)) + +struct rmap_chain { + unsigned long slot[NRSLOT]; /* first contains count, then */ + struct rmap_chain *next; /* user virtual addresses */ +}; + +static inline void page_dup_rmap(struct page *page) +{ + rmap_lock(page); + page_mapcount(page)++; + rmap_unlock(page); +} + +static inline void clear_page_anon(struct page *page) +{ + set_page_mapping(page, NULL); + ClearPageAnon(page); +} + +/** + * page_remove_rmap - take down reverse mapping to a page + * @page: page to remove mapping from + * + * For general use: Remove the reverse mapping from the page. + * after that the caller can clear the page table entry and free + * the page. Caller needs to hold the mm->page_table_lock. + */ +static inline void page_remove_rmap(struct page *page) +{ + rmap_lock(page); + + if (!PageChained(page)) + page->rmap.count--; + else { + page->rmap.chain->slot[0]--; + if (!page->rmap.chain->slot[0]) + clear_page_chained(page); + } + if (!page_mapped(page)) { + dec_page_state(nr_mapped); + if (PageAnon(page)) + clear_page_anon(page); + } + rmap_unlock(page); +} + +static inline unsigned long vma_address(struct page *page, struct vm_area_struct *vma) +{ + unsigned long pgoff, address; + pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + if (address < vma->vm_start || address >= vma->vm_end) + return NOADDR; + else + return address; +} + +/** + * page_add_rmap - add reverse mapping entry to a page + * @page: the page to add the mapping to + * @vma: the vma into which this page is being mapped + * @address: the virtual address at which the page is being mapped + * @anon: is this an anonymous (not file-backed) page? + * + * Add a new pte reverse mapping to a page. + * The caller needs to hold the mm->page_table_lock. 
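A quick sizing note on struct rmap_chain above, as a worked example (assuming 4-byte longs and pointers and an L1_CACHE_BYTES of 32, a typical i386 configuration; other configurations scale the same way):

/*
 *      NRSLOT = (L1_CACHE_BYTES - sizeof(unsigned long)) / sizeof(unsigned long)
 *             = (32 - 4) / 4 = 7
 *      sizeof(struct rmap_chain) = 7 * 4 (slots) + 4 (->next) = 32 bytes
 *
 * so each chain block fills exactly one cacheline; slot[0] of the first
 * block holds the map count (see page_mapcount() above), leaving six
 * slots for user virtual addresses before another block is linked in
 * through ->next.
 */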
+ */ +static inline void page_add_rmap(struct page *page, struct vm_area_struct *vma, + unsigned long address, int anon) +{ + address &= PAGE_MASK; + + rmap_lock(page); + + if (!page_mapped(page)) + inc_page_state(nr_mapped); + + page_mapcount(page)++; + + if (page->__mapping) { + if ((anon && address != page->index) || + address != vma_address(page, vma)) + add_rmap_address(page, address); + } else if (anon) { + SetPageAnon(page); + set_page_mapping(page, vma->vm_mm); + page->index = address; + } + rmap_unlock(page); +} diff -prauN linux-2.5.73/include/linux/sched.h wli-2.5.73-29/include/linux/sched.h --- linux-2.5.73/include/linux/sched.h 2003-06-22 11:32:32.000000000 -0700 +++ wli-2.5.73-29/include/linux/sched.h 2003-06-23 10:54:18.000000000 -0700 @@ -28,6 +28,7 @@ #include #include #include +#include struct exec_domain; @@ -196,11 +197,14 @@ struct mm_struct { * together off init_mm.mmlist, and are protected * by mmlist_lock */ + struct anon *anon; /* set of forks between execs */ + struct list_head anon_list; /* chain of mm's against anon */ unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; unsigned long rss, total_vm, locked_vm; + unsigned long shared, text, lib, data, dirty, stack; unsigned long def_flags; unsigned long cpu_vm_mask; unsigned long swap_address; @@ -221,6 +225,7 @@ struct mm_struct { struct kioctx *ioctx_list; struct kioctx default_kioctx; + struct rcu_head rcu; }; extern int mmlist_nr; diff -prauN linux-2.5.73/include/linux/spinlock.h wli-2.5.73-29/include/linux/spinlock.h --- linux-2.5.73/include/linux/spinlock.h 2003-06-22 11:32:45.000000000 -0700 +++ wli-2.5.73-29/include/linux/spinlock.h 2003-06-23 10:55:31.000000000 -0700 @@ -387,7 +387,6 @@ do { \ /* "lock on reference count zero" */ #ifndef ATOMIC_DEC_AND_LOCK #include -extern int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock); #endif /* diff -prauN linux-2.5.73/include/linux/swap.h wli-2.5.73-29/include/linux/swap.h --- linux-2.5.73/include/linux/swap.h 2003-06-22 11:32:31.000000000 -0700 +++ wli-2.5.73-29/include/linux/swap.h 2003-06-23 10:46:31.000000000 -0700 @@ -77,7 +77,6 @@ struct reclaim_state { #ifdef __KERNEL__ struct address_space; -struct pte_chain; struct sysinfo; struct writeback_control; struct zone; @@ -163,6 +162,7 @@ extern unsigned int nr_free_pages(void); extern unsigned int nr_free_pages_pgdat(pg_data_t *pgdat); extern unsigned int nr_free_buffer_pages(void); extern unsigned int nr_free_pagecache_pages(void); +unsigned long nr_deferred_pages(void); /* linux/mm/swap.c */ extern void FASTCALL(lru_cache_add(struct page *)); @@ -178,25 +178,8 @@ extern int try_to_free_pages(struct zone extern int shrink_all_memory(int); extern int vm_swappiness; -/* linux/mm/rmap.c */ -#ifdef CONFIG_MMU -int FASTCALL(page_referenced(struct page *)); -struct pte_chain *FASTCALL(page_add_rmap(struct page *, pte_t *, - struct pte_chain *)); -void FASTCALL(page_remove_rmap(struct page *, pte_t *)); -int FASTCALL(try_to_unmap(struct page *)); - /* linux/mm/shmem.c */ -extern int shmem_unuse(swp_entry_t entry, struct page *page); -#else -#define page_referenced(page) TestClearPageReferenced(page) -#define try_to_unmap(page) SWAP_FAIL -#endif /* CONFIG_MMU */ - -/* return values of try_to_unmap */ -#define SWAP_SUCCESS 0 -#define SWAP_AGAIN 1 -#define SWAP_FAIL 2 +int shmem_unuse(swp_entry_t entry, struct page *page); #ifdef CONFIG_SWAP /* linux/mm/page_io.c */ @@ -206,7 +189,6 @@ extern int 
rw_swap_page_sync(int, swp_en /* linux/mm/swap_state.c */ extern struct address_space swapper_space; -#define total_swapcache_pages swapper_space.nrpages extern void show_swap_cache_info(void); extern int add_to_swap(struct page *); extern void __delete_from_swap_cache(struct page *); @@ -245,7 +227,6 @@ extern spinlock_t swaplock; #else /* CONFIG_SWAP */ #define total_swap_pages 0 -#define total_swapcache_pages 0UL #define si_swapinfo(val) \ do { (val)->freeswap = (val)->totalswap = 0; } while (0) diff -prauN linux-2.5.73/init/main.c wli-2.5.73-29/init/main.c --- linux-2.5.73/init/main.c 2003-06-22 11:32:35.000000000 -0700 +++ wli-2.5.73-29/init/main.c 2003-06-23 10:52:52.000000000 -0700 @@ -80,7 +80,6 @@ extern void signals_init(void); extern void buffer_init(void); extern void pidhash_init(void); extern void pidmap_init(void); -extern void pte_chain_init(void); extern void radix_tree_init(void); extern void free_initmem(void); extern void populate_rootfs(void); @@ -436,7 +435,6 @@ asmlinkage void __init start_kernel(void kmem_cache_init(); pidmap_init(); pgtable_cache_init(); - pte_chain_init(); fork_init(num_physpages); proc_caches_init(); buffer_init(); diff -prauN linux-2.5.73/ipc/shm.c wli-2.5.73-29/ipc/shm.c --- linux-2.5.73/ipc/shm.c 2003-06-22 11:32:45.000000000 -0700 +++ wli-2.5.73-29/ipc/shm.c 2003-06-23 10:38:47.000000000 -0700 @@ -380,9 +380,9 @@ static void shm_get_stat(unsigned long * if (is_file_hugepages(shp->shm_file)) { struct address_space *mapping = inode->i_mapping; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); *rss += (HPAGE_SIZE/PAGE_SIZE)*mapping->nrpages; - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); } else { struct shmem_inode_info *info = SHMEM_I(inode); spin_lock(&info->lock); diff -prauN linux-2.5.73/kernel/fork.c wli-2.5.73-29/kernel/fork.c --- linux-2.5.73/kernel/fork.c 2003-06-22 11:32:32.000000000 -0700 +++ wli-2.5.73-29/kernel/fork.c 2003-06-23 15:25:10.000000000 -0700 @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -306,9 +307,9 @@ static inline int dup_mmap(struct mm_str atomic_dec(&inode->i_writecount); /* insert tmp into the share list, just after mpnt */ - down(&inode->i_mapping->i_shared_sem); - list_add_tail(&tmp->shared, &mpnt->shared); - up(&inode->i_mapping->i_shared_sem); + spin_lock(&inode->i_mapping->i_shared_lock); + list_add_tail_rcu(&tmp->shared, &mpnt->shared); + spin_unlock(&inode->i_mapping->i_shared_lock); } /* @@ -362,8 +363,21 @@ static inline void mm_free_pgd(struct mm spinlock_t mmlist_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; int mmlist_nr; +/* SLAB cache for mm_struct structures (tsk->mm) */ +static kmem_cache_t *mm_cachep; + #define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) -#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) + +static void __free_mm(void *mm) +{ + kmem_cache_free(mm_cachep, mm); +} + +void free_mm(struct mm_struct *mm) +{ + INIT_RCU_HEAD(&mm->rcu); + call_rcu(&mm->rcu, __free_mm, mm); +} #include @@ -377,6 +391,7 @@ static struct mm_struct * mm_init(struct mm->ioctx_list_lock = RW_LOCK_UNLOCKED; mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); mm->free_area_cache = TASK_UNMAPPED_BASE; + mm->shared = mm->text = mm->lib = mm->data = mm->dirty = mm->stack = 0; if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -394,11 +409,15 @@ struct mm_struct * mm_alloc(void) struct mm_struct * mm; mm = allocate_mm(); - if (mm) { - memset(mm, 0, sizeof(*mm)); - return mm_init(mm); + 
if (!mm) + return NULL; + memset(mm, 0, sizeof(*mm)); + if (exec_rmap(mm)) { + mm_free_pgd(mm); + free_mm(mm); + return NULL; } - return NULL; + return mm_init(mm); } /* @@ -425,6 +444,7 @@ void mmput(struct mm_struct *mm) spin_unlock(&mmlist_lock); exit_aio(mm); exit_mmap(mm); + exit_rmap(mm); mmdrop(mm); } } @@ -511,6 +531,8 @@ static int copy_mm(unsigned long clone_f if (!mm_init(mm)) goto fail_nomem; + dup_rmap(mm, oldmm); + if (init_new_context(tsk,mm)) goto free_pt; @@ -1149,8 +1171,7 @@ kmem_cache_t *fs_cachep; /* SLAB cache for vm_area_struct structures */ kmem_cache_t *vm_area_cachep; -/* SLAB cache for mm_struct structures (tsk->mm) */ -kmem_cache_t *mm_cachep; +void init_rmap(void); void __init proc_caches_init(void) { @@ -1189,4 +1210,6 @@ void __init proc_caches_init(void) SLAB_HWCACHE_ALIGN, NULL, NULL); if(!mm_cachep) panic("vma_init: Cannot alloc mm_struct SLAB cache"); + + init_rmap(); } diff -prauN linux-2.5.73/kernel/pid.c wli-2.5.73-29/kernel/pid.c --- linux-2.5.73/kernel/pid.c 2003-06-22 11:32:56.000000000 -0700 +++ wli-2.5.73-29/kernel/pid.c 2003-06-23 10:36:32.000000000 -0700 @@ -172,13 +172,22 @@ int attach_pid(task_t *task, enum pid_ty if (pid) atomic_inc(&pid->count); else { + struct list_head *elem, *bucket; + pid = &task->pids[type].pid; pid->nr = nr; atomic_set(&pid->count, 1); INIT_LIST_HEAD(&pid->task_list); pid->task = task; get_task_struct(task); - list_add(&pid->hash_chain, &pid_hash[type][pid_hashfn(nr)]); + bucket = &pid_hash[type][pid_hashfn(nr)]; + __list_for_each(elem, bucket) { + struct pid *walk; + walk = list_entry(elem, struct pid, hash_chain); + if (walk->nr > nr) + break; + } + list_add_tail(&pid->hash_chain, elem); } list_add_tail(&task->pids[type].pid_chain, &pid->task_list); task->pids[type].pidptr = pid; @@ -221,6 +230,42 @@ void detach_pid(task_t *task, enum pid_t free_pidmap(nr); } +/** + * find_next_pid - Returns the pid of next task. + * @pid: Starting point for the search. + * + * Returns the pid number of the task that follows behind + * "pid". The function works even if the input pid value + * is not valid anymore. 
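The kernel/fork.c changes above hook the new reverse-map bookkeeping into the mm_struct lifecycle and defer the actual kmem_cache_free() of an mm through RCU, presumably so that lockless readers following __mapping to an mm never see a dangling pointer. In outline (schematic summary, not code from the patch):

/*
 *      mm_alloc()           -> exec_rmap(mm)         fresh anon for an exec
 *      copy_mm() on fork    -> dup_rmap(mm, oldmm)   share the parent's anon
 *      mmput() final drop   -> exit_rmap(mm)         leave the anon set
 *      free_mm()            -> call_rcu(&mm->rcu, __free_mm, mm)
 */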
+ */ + int find_next_pid(int pid) +{ + struct list_head *elem, *bucket; + + if(!pid) { + bucket = &pid_hash[PIDTYPE_PID][0]; + } else { + bucket = &pid_hash[PIDTYPE_PID][pid_hashfn(pid)]; + } + read_lock(&tasklist_lock); +next_chain: + __list_for_each(elem, bucket) { + struct pid *walk; + walk = list_entry(elem, struct pid, hash_chain); + if (walk->nr > pid) { + pid = walk->nr; + read_unlock(&tasklist_lock); + return pid; + } + } + pid = 0; + bucket++; + if (bucket < &pid_hash[PIDTYPE_PID][1<i_shared_sem (vmtruncate) - * ->private_lock (__free_pte->__set_page_dirty_buffers) + * ->i_shared_lock (vmtruncate) + * ->private_lock (__free_pte->set_page_dirty_buffers) * ->swap_list_lock * ->swap_device_lock (exclusive_swap_page, others) * ->mapping->page_lock * ->mmap_sem - * ->i_shared_sem (various places) + * ->i_shared_lock (various places) + * + * ->lock_page + * ->i_shared_lock (page_convert_anon) * * ->inode_lock * ->sb_lock (fs/fs-writeback.c) @@ -79,11 +82,11 @@ */ void __remove_from_page_cache(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); radix_tree_delete(&mapping->page_tree, page->index); list_del(&page->list); - page->mapping = NULL; + set_page_mapping(page, NULL); mapping->nrpages--; pagecache_acct(-1); @@ -91,22 +94,24 @@ void __remove_from_page_cache(struct pag void remove_from_page_cache(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); - if (unlikely(!PageLocked(page))) - PAGE_BUG(page); + BUG_ON(!PageLocked(page)); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); __remove_from_page_cache(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); + page_cache_release(page); } static inline int sync_page(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); if (mapping && mapping->a_ops && mapping->a_ops->sync_page) return mapping->a_ops->sync_page(page); + if (PageSwapCache(page)) + blk_run_queues(); return 0; } @@ -130,9 +135,9 @@ static int __filemap_fdatawrite(struct a if (mapping->backing_dev_info->memory_backed) return 0; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); list_splice_init(&mapping->dirty_pages, &mapping->io_pages); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); ret = do_writepages(mapping, &wbc); return ret; } @@ -163,7 +168,7 @@ int filemap_fdatawait(struct address_spa restart: progress = 0; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); while (!list_empty(&mapping->locked_pages)) { struct page *page; @@ -177,7 +182,7 @@ restart: if (!PageWriteback(page)) { if (++progress > 32) { if (need_resched()) { - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); __cond_resched(); goto restart; } @@ -187,16 +192,16 @@ restart: progress = 0; page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); wait_on_page_writeback(page); if (PageError(page)) ret = -EIO; page_cache_release(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); return ret; } @@ -204,16 +209,9 @@ restart: * This adds a page to the page cache, starting out as locked, unreferenced, * not uptodate and with no errors. 
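The find_next_pid() helper added to kernel/pid.c above walks the PIDTYPE_PID hash bucket by bucket; since attach_pid() now keeps each bucket sorted by pid, the returned value can simply be fed back in to resume the walk, even after the task in question has exited, and 0 signals the end. A minimal usage sketch, illustrative only:

/* Enumerate every pid currently in the hash; only the numeric pid is
 * carried from one call to the next, so nothing is pinned in between. */
int pid = 0;

while ((pid = find_next_pid(pid)) != 0) {
        /* ... look up or report this pid ... */
}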
* - * This function is used for two things: adding newly allocated pagecache - * pages and for moving existing anon pages into swapcache. - * - * In the case of pagecache pages, the page is new, so we can just run - * SetPageLocked() against it. The other page state flags were set by - * rmqueue() - * - * In the case of swapcache, try_to_swap_out() has already locked the page, so - * SetPageLocked() is ugly-but-OK there too. The required page state has been - * set up by swap_out_add_to_swap_cache(). + * This function is used to add newly allocated pagecache pages; + * the page is new, so we can just run SetPageLocked() against it. + * The other page state flags were set by rmqueue(). * * This function does not add the page to the LRU. The caller must do that. */ @@ -224,15 +222,19 @@ int add_to_page_cache(struct page *page, if (error == 0) { page_cache_get(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); if (!error) { SetPageLocked(page); - ___add_to_page_cache(page, mapping, offset); + list_add(&page->list, &mapping->clean_pages); + set_page_mapping(page, mapping); + page->index = offset; + mapping->nrpages++; + pagecache_acct(+1); } else { page_cache_release(page); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); radix_tree_preload_end(); } return error; @@ -361,11 +363,11 @@ struct page * find_get_page(struct addre * We scan the hash list read-only. Addition to and removal from * the hash-list needs a held write-lock. */ - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); page = radix_tree_lookup(&mapping->page_tree, offset); if (page) page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return page; } @@ -376,11 +378,11 @@ struct page *find_trylock_page(struct ad { struct page *page; - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); page = radix_tree_lookup(&mapping->page_tree, offset); if (page && TestSetPageLocked(page)) page = NULL; - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return page; } @@ -400,25 +402,25 @@ struct page *find_lock_page(struct addre { struct page *page; - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); repeat: page = radix_tree_lookup(&mapping->page_tree, offset); if (page) { page_cache_get(page); if (TestSetPageLocked(page)) { - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); lock_page(page); - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); /* Has the page been truncated while we slept? */ - if (page->mapping != mapping || page->index != offset) { + if (page_mapping(page) != mapping || page->index != offset) { unlock_page(page); page_cache_release(page); goto repeat; } } } - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return page; } @@ -488,12 +490,12 @@ unsigned int find_get_pages(struct addre unsigned int i; unsigned int ret; - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); ret = radix_tree_gang_lookup(&mapping->page_tree, (void **)pages, start, nr_pages); for (i = 0; i < ret; i++) page_cache_get(pages[i]); - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return ret; } @@ -620,8 +622,8 @@ page_not_up_to_date: /* Get exclusive access to the page ... */ lock_page(page); - /* Did it get unhashed before we got the lock? 
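add_to_page_cache() above inserts under mapping_wrlock() while the find_*_page() lookups take mapping_rdlock(); both wrappers come from the include/linux/fs.h hunk earlier, which picks the lock flavour at compile time so that radix-tree readers can run concurrently on larger SMP systems. A standalone model of that selection (plain userspace C with pthreads; all names invented):

#include <pthread.h>

/* Model of mapping_rwlock_t: with many CPUs (NR_CPUS > 8 in the patch)
 * an rwlock lets lookups proceed in parallel; with few CPUs a plain
 * exclusive lock is cheaper.  Everything here is invented for the example. */
#define MODEL_BIG_SMP 1

#if MODEL_BIG_SMP
typedef pthread_rwlock_t model_lock_t;
#define model_rdlock(l)         pthread_rwlock_rdlock(l)
#define model_rdunlock(l)       pthread_rwlock_unlock(l)
#define model_wrlock(l)         pthread_rwlock_wrlock(l)
#define model_wrunlock(l)       pthread_rwlock_unlock(l)
#else
typedef pthread_mutex_t model_lock_t;
#define model_rdlock(l)         pthread_mutex_lock(l)
#define model_rdunlock(l)       pthread_mutex_unlock(l)
#define model_wrlock(l)         pthread_mutex_lock(l)
#define model_wrunlock(l)       pthread_mutex_unlock(l)
#endif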
*/ - if (!page->mapping) { + /* Did it get removed from the radix tree before we got the lock? */ + if (!page_mapping(page)) { unlock_page(page); page_cache_release(page); continue; @@ -1038,8 +1040,8 @@ page_not_uptodate: inc_page_state(pgmajfault); lock_page(page); - /* Did it get unhashed while we waited for it? */ - if (!page->mapping) { + /* Did it get removed from the radix tree while we waited for it? */ + if (!page_mapping(page)) { unlock_page(page); page_cache_release(page); goto retry_all; @@ -1066,7 +1068,7 @@ page_not_uptodate: lock_page(page); /* Somebody truncated the page on us? */ - if (!page->mapping) { + if (!page_mapping(page)) { unlock_page(page); page_cache_release(page); goto retry_all; @@ -1145,8 +1147,8 @@ no_cached_page: page_not_uptodate: lock_page(page); - /* Did it get unhashed while we waited for it? */ - if (!page->mapping) { + /* Did it get removed from the radix tree while we waited for it? */ + if (!page_mapping(page)) { unlock_page(page); goto err; } @@ -1172,7 +1174,7 @@ page_not_uptodate: lock_page(page); /* Somebody truncated the page on us? */ - if (!page->mapping) { + if (!page_mapping(page)) { unlock_page(page); goto err; } @@ -1340,7 +1342,7 @@ retry: goto out; lock_page(page); - if (!page->mapping) { + if (!page_mapping(page)) { unlock_page(page); page_cache_release(page); goto retry; diff -prauN linux-2.5.73/mm/fremap.c wli-2.5.73-29/mm/fremap.c --- linux-2.5.73/mm/fremap.c 2003-06-22 11:32:31.000000000 -0700 +++ wli-2.5.73-29/mm/fremap.c 2003-06-23 14:55:26.000000000 -0700 @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include @@ -28,13 +28,13 @@ static inline int zap_pte(struct mm_stru unsigned long pfn = pte_pfn(pte); flush_cache_page(vma, addr); - pte = ptep_get_and_clear(ptep); + pte = vm_ptep_get_and_clear(vma, ptep, addr); if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); if (!PageReserved(page)) { if (pte_dirty(pte)) set_page_dirty(page); - page_remove_rmap(page, ptep); + page_remove_rmap(page); page_cache_release(page); mm->rss--; } @@ -43,7 +43,7 @@ static inline int zap_pte(struct mm_stru } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); - pte_clear(ptep); + vm_pte_clear(vma, ptep, addr); return 0; } } @@ -59,19 +59,18 @@ int install_page(struct mm_struct *mm, s pte_t *pte; pgd_t *pgd; pmd_t *pmd; - struct pte_chain *pte_chain; - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto err; pgd = pgd_offset(mm, addr); + if (!rmap_get_cpu()) + goto err; spin_lock(&mm->page_table_lock); + put_cpu(); - pmd = pmd_alloc(mm, pgd, addr); + pmd = pmd_alloc_map(mm, pgd, addr); if (!pmd) goto err_unlock; - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map(mm, &pmd, addr); if (!pte) goto err_unlock; @@ -79,19 +78,18 @@ int install_page(struct mm_struct *mm, s mm->rss++; flush_icache_page(vma, page); - set_pte(pte, mk_pte(page, prot)); - pte_chain = page_add_rmap(page, pte, pte_chain); + vm_set_pte(vma, pte, mk_pte(page, prot), addr); + if (!PageReserved(page)) + page_add_rmap(page, vma, addr, 0); pte_unmap(pte); + pmd_unmap(pmd); if (flush) flush_tlb_page(vma, addr); update_mmu_cache(vma, addr, *pte); - spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); - return 0; + err = 0; err_unlock: spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); err: return err; } diff -prauN linux-2.5.73/mm/memory.c wli-2.5.73-29/mm/memory.c --- linux-2.5.73/mm/memory.c 2003-06-22 11:32:43.000000000 -0700 +++ wli-2.5.73-29/mm/memory.c 2003-06-23 
14:58:02.000000000 -0700 @@ -44,10 +44,9 @@ #include #include #include -#include +#include #include -#include #include #include #include @@ -96,14 +95,15 @@ static inline void free_one_pmd(struct m } page = pmd_page(*dir); pmd_clear(dir); - pgtable_remove_rmap(page); + dec_page_state(nr_page_table_pages); pte_free_tlb(tlb, page); } static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir) { int j; - pmd_t * pmd; + pmd_t *pmd; + struct page *page; if (pgd_none(*dir)) return; @@ -112,11 +112,13 @@ static inline void free_one_pgd(struct m pgd_clear(dir); return; } - pmd = pmd_offset(dir, 0); + page = pgd_page(*dir); + pmd = pmd_offset_map(dir, 0); pgd_clear(dir); for (j = 0; j < PTRS_PER_PMD ; j++) free_one_pmd(tlb, pmd+j); - pmd_free_tlb(tlb, pmd); + pmd_unmap(pmd); + pmd_free_tlb(tlb, page); } /* @@ -136,30 +138,40 @@ void clear_page_tables(struct mmu_gather } while (--nr); } -pte_t * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +/* + * error return happens with pmd unmapped + */ +pte_t *pte_alloc_map(struct mm_struct *mm, pmd_t **pmd, unsigned long address) { - if (!pmd_present(*pmd)) { + if (!pmd_present(**pmd)) { + pgd_t *pgd; struct page *new; + pmd_unmap(*pmd); spin_unlock(&mm->page_table_lock); new = pte_alloc_one(mm, address); spin_lock(&mm->page_table_lock); - if (!new) + if (!new) { + *pmd = NULL; return NULL; + } + + pgd = pgd_offset(mm, address); + *pmd = pmd_offset_map(pgd, address); /* * Because we dropped the lock, we should re-check the * entry, as somebody else could have populated it.. */ - if (pmd_present(*pmd)) { + if (pmd_present(**pmd)) { pte_free(new); goto out; } - pgtable_add_rmap(new, mm, address); - pmd_populate(mm, pmd, new); + inc_page_state(nr_page_table_pages); + pmd_populate(mm, *pmd, new); } out: - return pte_offset_map(pmd, address); + return pte_offset_map(*pmd, address); } pte_t * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address) @@ -181,7 +193,7 @@ pte_t * pte_alloc_kernel(struct mm_struc pte_free_kernel(new); goto out; } - pgtable_add_rmap(virt_to_page(new), mm, address); + inc_page_state(nr_page_table_pages); pmd_populate_kernel(mm, pmd, new); } out: @@ -207,21 +219,11 @@ int copy_page_range(struct mm_struct *ds pgd_t * src_pgd, * dst_pgd; unsigned long address = vma->vm_start; unsigned long end = vma->vm_end; - unsigned long cow; - struct pte_chain *pte_chain = NULL; + unsigned long cow, rss = 0; if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst, src, vma); - pte_chain = pte_chain_alloc(GFP_ATOMIC); - if (!pte_chain) { - spin_unlock(&dst->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - spin_lock(&dst->page_table_lock); - if (!pte_chain) - goto nomem; - } - cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; src_pgd = pgd_offset(src, address)-1; dst_pgd = pgd_offset(dst, address)-1; @@ -244,10 +246,10 @@ skip_copy_pmd_range: address = (address continue; } - src_pmd = pmd_offset(src_pgd, address); - dst_pmd = pmd_alloc(dst, dst_pgd, address); + dst_pmd = pmd_alloc_map(dst, dst_pgd, address); if (!dst_pmd) goto nomem; + src_pmd = pmd_offset_map_nested(src_pgd, address); do { pte_t * src_pte, * dst_pte; @@ -261,15 +263,20 @@ skip_copy_pmd_range: address = (address pmd_clear(src_pmd); skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; - if (address >= end) + if (address >= end) { + pmd_unmap(dst_pmd); + pmd_unmap_nested(src_pmd); goto out; + } goto cont_copy_pmd_range; } - dst_pte = pte_alloc_map(dst, dst_pmd, address); + 
pmd_unmap_nested(src_pmd); + dst_pte = pte_alloc_map(dst, &dst_pmd, address); if (!dst_pte) goto nomem; spin_lock(&src->page_table_lock); + src_pmd = pmd_offset_map_nested(src_pgd, address); src_pte = pte_offset_map_nested(src_pmd, address); do { pte_t pte = *src_pte; @@ -284,8 +291,7 @@ skip_copy_pte_range: if (!pte_present(pte)) { if (!pte_file(pte)) swap_duplicate(pte_to_swp_entry(pte)); - set_pte(dst_pte, pte); - goto cont_copy_pte_range_noset; + goto cont_copy_pte_range; } pfn = pte_pfn(pte); /* the pte points outside of valid memory, the @@ -293,13 +299,13 @@ skip_copy_pte_range: * and not mapped via rmap - duplicate the * mapping as is. */ - page = NULL; - if (pfn_valid(pfn)) - page = pfn_to_page(pfn); - - if (!page || PageReserved(page)) { - set_pte(dst_pte, pte); - goto cont_copy_pte_range_noset; + if (!pfn_valid(pfn)) { + page = NULL; + goto cont_copy_pte_range; + } else { + page = pfn_to_page(pfn); + if (PageReserved(page)) + goto cont_copy_pte_range; } /* @@ -307,7 +313,7 @@ skip_copy_pte_range: * in the parent and the child */ if (cow) { - ptep_set_wrprotect(src_pte); + vm_ptep_set_wrprotect(src, src_pte); pte = *src_pte; } @@ -319,36 +325,15 @@ skip_copy_pte_range: pte = pte_mkclean(pte); pte = pte_mkold(pte); get_page(page); - dst->rss++; - - set_pte(dst_pte, pte); - pte_chain = page_add_rmap(page, dst_pte, - pte_chain); - if (pte_chain) - goto cont_copy_pte_range_noset; - pte_chain = pte_chain_alloc(GFP_ATOMIC); - if (pte_chain) - goto cont_copy_pte_range_noset; - - /* - * pte_chain allocation failed, and we need to - * run page reclaim. - */ - pte_unmap_nested(src_pte); - pte_unmap(dst_pte); - spin_unlock(&src->page_table_lock); - spin_unlock(&dst->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - spin_lock(&dst->page_table_lock); - if (!pte_chain) - goto nomem; - spin_lock(&src->page_table_lock); - dst_pte = pte_offset_map(dst_pmd, address); - src_pte = pte_offset_map_nested(src_pmd, - address); + rss++; + page_dup_rmap(page); +cont_copy_pte_range: + vm_set_pte(vma, dst_pte, pte, address); cont_copy_pte_range_noset: address += PAGE_SIZE; if (address >= end) { + pmd_unmap(dst_pmd); + pmd_unmap_nested(src_pmd); pte_unmap_nested(src_pte); pte_unmap(dst_pte); goto out_unlock; @@ -364,19 +349,21 @@ cont_copy_pmd_range: src_pmd++; dst_pmd++; } while ((unsigned long)src_pmd & PMD_TABLE_MASK); + pmd_unmap_nested(src_pmd-1); + pmd_unmap(dst_pmd-1); } out_unlock: spin_unlock(&src->page_table_lock); out: - pte_chain_free(pte_chain); + dst->rss += rss; return 0; nomem: - pte_chain_free(pte_chain); + dst->rss += rss; return -ENOMEM; } static void -zap_pte_range(struct mmu_gather *tlb, pmd_t * pmd, +zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, unsigned long size) { unsigned long offset; @@ -401,32 +388,32 @@ zap_pte_range(struct mmu_gather *tlb, pm if (pte_present(pte)) { unsigned long pfn = pte_pfn(pte); - pte = ptep_get_and_clear(ptep); + pte = vm_ptep_get_and_clear(vma, ptep, address + offset); tlb_remove_tlb_entry(tlb, ptep, address+offset); if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); if (!PageReserved(page)) { if (pte_dirty(pte)) set_page_dirty(page); - if (page->mapping && pte_young(pte) && + if (page_mapping(page) && pte_young(pte) && !PageSwapCache(page)) mark_page_accessed(page); tlb->freed++; - page_remove_rmap(page, ptep); + page_remove_rmap(page); tlb_remove_page(tlb, page); } } } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); - pte_clear(ptep); + vm_pte_clear(vma, 
ptep, address); } } pte_unmap(ptep-1); } static void -zap_pmd_range(struct mmu_gather *tlb, pgd_t * dir, +zap_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pgd_t *dir, unsigned long address, unsigned long size) { pmd_t * pmd; @@ -439,15 +426,16 @@ zap_pmd_range(struct mmu_gather *tlb, pg pgd_clear(dir); return; } - pmd = pmd_offset(dir, address); + pmd = pmd_offset_map(dir, address); end = address + size; if (end > ((address + PGDIR_SIZE) & PGDIR_MASK)) end = ((address + PGDIR_SIZE) & PGDIR_MASK); do { - zap_pte_range(tlb, pmd, address, end - address); + zap_pte_range(tlb, vma, pmd, address, end - address); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address < end); + pmd_unmap(pmd - 1); } void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, @@ -465,7 +453,7 @@ void unmap_page_range(struct mmu_gather dir = pgd_offset(vma->vm_mm, address); tlb_start_vma(tlb, vma); do { - zap_pmd_range(tlb, dir, address, end - address); + zap_pmd_range(tlb, vma, dir, address, end - address); address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); @@ -629,20 +617,24 @@ follow_page(struct mm_struct *mm, unsign if (pgd_none(*pgd) || pgd_bad(*pgd)) goto out; - pmd = pmd_offset(pgd, address); + pmd = pmd_offset_map(pgd, address); if (pmd_none(*pmd)) - goto out; - if (pmd_huge(*pmd)) - return follow_huge_pmd(mm, address, pmd, write); + goto out_unmap; + if (pmd_huge(*pmd)) { + struct page *page = follow_huge_pmd(mm, address, pmd, write); + pmd_unmap(pmd); + return page; + } if (pmd_bad(*pmd)) - goto out; + goto out_unmap; ptep = pte_offset_map(pmd, address); if (!ptep) - goto out; + goto out_unmap; pte = *ptep; pte_unmap(ptep); + pmd_unmap(pmd); if (pte_present(pte)) { if (!write || (pte_write(pte) && pte_dirty(pte))) { pfn = pte_pfn(pte); @@ -653,6 +645,9 @@ follow_page(struct mm_struct *mm, unsign out: return NULL; +out_unmap: + pmd_unmap(pmd); + goto out; } /* @@ -711,7 +706,7 @@ int get_user_pages(struct task_struct *t pgd = pgd_offset_k(pg); if (!pgd) return i ? : -EFAULT; - pmd = pmd_offset(pgd, pg); + pmd = pmd_offset_kernel(pgd, pg); if (!pmd) return i ? 
: -EFAULT; pte = pte_offset_kernel(pmd, pg); @@ -785,8 +780,8 @@ out: return i; } -static void zeromap_pte_range(pte_t * pte, unsigned long address, - unsigned long size, pgprot_t prot) +static void zeromap_pte_range(struct vm_area_struct *vma, pte_t *pte, + unsigned long address, unsigned long size, pgprot_t prot) { unsigned long end; @@ -797,14 +792,14 @@ static void zeromap_pte_range(pte_t * pt do { pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot)); BUG_ON(!pte_none(*pte)); - set_pte(pte, zero_pte); + vm_set_pte(vma, pte, zero_pte, address); address += PAGE_SIZE; pte++; } while (address && (address < end)); } -static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, - unsigned long size, pgprot_t prot) +static inline int zeromap_pmd_range(struct vm_area_struct *vma, pmd_t **pmd, + unsigned long address, unsigned long size, pgprot_t prot) { unsigned long end; @@ -813,13 +808,13 @@ static inline int zeromap_pmd_range(stru if (end > PGDIR_SIZE) end = PGDIR_SIZE; do { - pte_t * pte = pte_alloc_map(mm, pmd, address); + pte_t *pte = pte_alloc_map(vma->vm_mm, pmd, address); if (!pte) return -ENOMEM; - zeromap_pte_range(pte, address, end - address, prot); + zeromap_pte_range(vma, pte, address, end - address, prot); pte_unmap(pte); address = (address + PMD_SIZE) & PMD_MASK; - pmd++; + (*pmd)++; } while (address && (address < end)); return 0; } @@ -839,13 +834,14 @@ int zeromap_page_range(struct vm_area_st spin_lock(&mm->page_table_lock); do { - pmd_t *pmd = pmd_alloc(mm, dir, address); + pmd_t *pmd = pmd_alloc_map(mm, dir, address); error = -ENOMEM; if (!pmd) break; - error = zeromap_pmd_range(mm, pmd, address, end - address, prot); + error = zeromap_pmd_range(vma, &pmd, address, end - address, prot); if (error) break; + pmd_unmap(pmd - 1); address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); @@ -859,8 +855,9 @@ int zeromap_page_range(struct vm_area_st * mappings are removed. 
any references to nonexistent pages results * in null mappings (currently treated as "copy-on-access") */ -static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size, - unsigned long phys_addr, pgprot_t prot) +static inline void remap_pte_range(struct vm_area_struct *vma, pte_t *pte, + unsigned long address, unsigned long size, + unsigned long phys_addr, pgprot_t prot) { unsigned long end; unsigned long pfn; @@ -873,15 +870,16 @@ static inline void remap_pte_range(pte_t do { BUG_ON(!pte_none(*pte)); if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) - set_pte(pte, pfn_pte(pfn, prot)); + vm_set_pte(vma, pte, pfn_pte(pfn, prot), address); address += PAGE_SIZE; pfn++; pte++; } while (address && (address < end)); } -static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size, - unsigned long phys_addr, pgprot_t prot) +static inline int remap_pmd_range(struct vm_area_struct *vma, pmd_t **pmd, + unsigned long address, unsigned long size, + unsigned long phys_addr, pgprot_t prot) { unsigned long base, end; @@ -892,13 +890,13 @@ static inline int remap_pmd_range(struct end = PGDIR_SIZE; phys_addr -= address; do { - pte_t * pte = pte_alloc_map(mm, pmd, base + address); + pte_t *pte = pte_alloc_map(vma->vm_mm, pmd, base + address); if (!pte) return -ENOMEM; - remap_pte_range(pte, base + address, end - address, address + phys_addr, prot); + remap_pte_range(vma, pte, base + address, end - address, address + phys_addr, prot); pte_unmap(pte); address = (address + PMD_SIZE) & PMD_MASK; - pmd++; + (*pmd)++; } while (address && (address < end)); return 0; } @@ -920,13 +918,14 @@ int remap_page_range(struct vm_area_stru spin_lock(&mm->page_table_lock); do { - pmd_t *pmd = pmd_alloc(mm, dir, from); + pmd_t *pmd = pmd_alloc_map(mm, dir, from); error = -ENOMEM; if (!pmd) break; - error = remap_pmd_range(mm, pmd, from, end - from, phys_addr + from, prot); + error = remap_pmd_range(vma, &pmd, from, end - from, phys_addr + from, prot); if (error) break; + pmd_unmap(pmd - 1); from = (from + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (from && (from < end)); @@ -943,9 +942,10 @@ int remap_page_range(struct vm_area_stru * * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock */ -static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry) +static inline void establish_pte(struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pte_t entry) { - set_pte(page_table, entry); + vm_set_pte(vma, page_table, entry, address); flush_tlb_page(vma, address); update_mmu_cache(vma, address, entry); } @@ -953,8 +953,9 @@ static inline void establish_pte(struct /* * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock */ -static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, - pte_t *page_table) +static inline void break_cow(struct vm_area_struct *vma, + struct page *new_page, unsigned long address, + pte_t *page_table) { invalidate_vcache(address, vma->vm_mm, new_page); flush_cache_page(vma, address); @@ -986,7 +987,6 @@ static int do_wp_page(struct mm_struct * { struct page *old_page, *new_page; unsigned long pfn = pte_pfn(pte); - struct pte_chain *pte_chain = NULL; int ret; if (unlikely(!pfn_valid(pfn))) { @@ -996,6 +996,7 @@ static int do_wp_page(struct mm_struct * * data, but for the moment just pretend this is OOM. 
*/ pte_unmap(page_table); + pmd_unmap(pmd); printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", address); goto oom; @@ -1009,12 +1010,16 @@ static int do_wp_page(struct mm_struct * flush_cache_page(vma, address); establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); + if (!PageReserved(old_page)) + page_turn_rmap(old_page, vma); pte_unmap(page_table); + pmd_unmap(pmd); ret = VM_FAULT_MINOR; goto out; } } pte_unmap(page_table); + pmd_unmap(pmd); /* * Ok, we need to copy. Oh, well.. @@ -1022,9 +1027,6 @@ static int do_wp_page(struct mm_struct * page_cache_get(old_page); spin_unlock(&mm->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto no_mem; new_page = alloc_page(GFP_HIGHUSER); if (!new_page) goto no_mem; @@ -1034,32 +1036,34 @@ static int do_wp_page(struct mm_struct * * Re-check the pte - we dropped the lock */ spin_lock(&mm->page_table_lock); + pmd = pmd_offset_map(pgd_offset(mm, address), address); page_table = pte_offset_map(pmd, address); if (pte_same(*page_table, pte)) { if (PageReserved(old_page)) ++mm->rss; - page_remove_rmap(old_page, page_table); + else + page_remove_rmap(old_page); break_cow(vma, new_page, address, page_table); - pte_chain = page_add_rmap(new_page, page_table, pte_chain); + page_add_rmap(new_page, vma, address, 1); lru_cache_add_active(new_page); /* Free the old page.. */ new_page = old_page; } pte_unmap(page_table); + pmd_unmap(pmd); page_cache_release(new_page); page_cache_release(old_page); ret = VM_FAULT_MINOR; - goto out; +out: + spin_unlock(&mm->page_table_lock); + return ret; no_mem: page_cache_release(old_page); oom: ret = VM_FAULT_OOM; -out: - spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); - return ret; + goto out; } static void vmtruncate_list(struct list_head *head, unsigned long pgoff) @@ -1068,8 +1072,12 @@ static void vmtruncate_list(struct list_ struct vm_area_struct *vma; struct list_head *curr; - list_for_each(curr, head) { + list_for_each_rcu(curr, head) { vma = list_entry(curr, struct vm_area_struct, shared); + + if (vma->vm_flags & VM_DEAD) + continue; + start = vma->vm_start; end = vma->vm_end; len = end - start; @@ -1111,12 +1119,12 @@ int vmtruncate(struct inode * inode, lof goto do_expand; inode->i_size = offset; pgoff = (offset + PAGE_SIZE - 1) >> PAGE_SHIFT; - down(&mapping->i_shared_sem); + rcu_read_lock(); /* mapping->i_shared_lock */ if (unlikely(!list_empty(&mapping->i_mmap))) vmtruncate_list(&mapping->i_mmap, pgoff); if (unlikely(!list_empty(&mapping->i_mmap_shared))) vmtruncate_list(&mapping->i_mmap_shared, pgoff); - up(&mapping->i_shared_sem); + rcu_read_unlock(); /* mapping->i_shared_lock */ truncate_inode_pages(mapping, offset); goto out_truncate; @@ -1177,9 +1185,9 @@ static int do_swap_page(struct mm_struct swp_entry_t entry = pte_to_swp_entry(orig_pte); pte_t pte; int ret = VM_FAULT_MINOR; - struct pte_chain *pte_chain = NULL; pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); page = lookup_swap_cache(entry); if (!page) { @@ -1191,12 +1199,14 @@ static int do_swap_page(struct mm_struct * we released the page table lock. 
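
Because the pmd kmapping is torn down whenever do_wp_page() drops mm->page_table_lock to allocate, the fault path above re-derives the pmd and pte with pgd_offset()/pmd_offset_map()/pte_offset_map() after relocking, and only proceeds if pte_same() says the entry is unchanged. A toy, single-threaded model of that revalidate-or-back-out step follows (illustrative names only, not the kernel API):

/* Observe a pte value under the lock, "drop the lock" to allocate,
 * then re-look-up the pte and back out if it changed in the meantime,
 * which is the shape of do_wp_page()/do_swap_page() after this patch.
 */
#include <stdio.h>

typedef unsigned long pte_val;

static pte_val page_table[1];           /* stand-in for one pte slot */

static int pte_same_stub(pte_val a, pte_val b) { return a == b; }

static void racing_fault(void)          /* runs "while the lock is dropped" */
{
        page_table[0] = 0x2000 | 1;     /* someone else installed a page */
}

int main(void)
{
        pte_val orig = page_table[0];   /* observed under the lock */

        /* lock dropped here: allocate the new page, possibly sleep ... */
        racing_fault();
        /* lock retaken, pmd/pte re-mapped via pgd_offset()/pmd_offset_map() */

        if (!pte_same_stub(page_table[0], orig)) {
                printf("pte changed while unlocked: backing out\n");
                return 0;
        }
        printf("pte unchanged: install the new page\n");
        return 0;
}
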
*/ spin_lock(&mm->page_table_lock); + pmd = pmd_offset_map(pgd_offset(mm, address), address); page_table = pte_offset_map(pmd, address); if (pte_same(*page_table, orig_pte)) ret = VM_FAULT_OOM; else ret = VM_FAULT_MINOR; pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); goto out; } @@ -1207,26 +1217,27 @@ static int do_swap_page(struct mm_struct } mark_page_accessed(page); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) { - ret = -ENOMEM; - goto out; - } lock_page(page); + if (!rmap_get_cpu()) { + ret = VM_FAULT_OOM; + goto outrel; + } + spin_lock(&mm->page_table_lock); + put_cpu(); + pmd = pmd_offset_map(pgd_offset(mm, address), address); + page_table = pte_offset_map(pmd, address); + /* * Back out if somebody else faulted in this pte while we * released the page table lock. */ - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, address); if (!pte_same(*page_table, orig_pte)) { pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); - unlock_page(page); - page_cache_release(page); ret = VM_FAULT_MINOR; - goto out; + goto outrel; } /* The page isn't present yet, go ahead with the fault. */ @@ -1242,16 +1253,20 @@ static int do_swap_page(struct mm_struct unlock_page(page); flush_icache_page(vma, page); - set_pte(page_table, pte); - pte_chain = page_add_rmap(page, page_table, pte_chain); + vm_set_pte(vma, page_table, pte, address); + page_add_rmap(page, vma, address, 1); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); + pmd_unmap(pmd); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); out: - pte_chain_free(pte_chain); return ret; +outrel: + unlock_page(page); + page_cache_release(page); + goto out; } /* @@ -1266,20 +1281,8 @@ do_anonymous_page(struct mm_struct *mm, { pte_t entry; struct page * page = ZERO_PAGE(addr); - struct pte_chain *pte_chain; int ret; - pte_chain = pte_chain_alloc(GFP_ATOMIC); - if (!pte_chain) { - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto no_mem; - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, addr); - } - /* Read-only mapping of ZERO_PAGE. */ entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); @@ -1287,6 +1290,7 @@ do_anonymous_page(struct mm_struct *mm, if (write_access) { /* Allocate our own private page. 
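
do_swap_page() above reserves a per-CPU rmap_chain with rmap_get_cpu() while it may still sleep, and only calls put_cpu() once mm->page_table_lock pins it to that CPU, so the later page_add_rmap() never needs to allocate under the spinlock. A single-"CPU" user-space model of that reserve-then-consume pattern (the names are illustrative, not the patch's API):

/* Reserve an object with a sleeping allocation, park it in a per-CPU
 * slot, and consume it later from atomic context if it turns out to be
 * needed: the rmap_get_cpu()/get_rmap_chain() scheme in miniature.
 */
#include <stdio.h>
#include <stdlib.h>

struct chain { struct chain *next; };

static struct chain *percpu_reserve;   /* ~ per_cpu(rmap_chain, cpu) */

static int reserve_chain(void)         /* ~ rmap_get_cpu(), may sleep */
{
        if (!percpu_reserve)
                percpu_reserve = malloc(sizeof(*percpu_reserve));
        return percpu_reserve != NULL;
}

static struct chain *take_chain(void)  /* ~ get_rmap_chain(), atomic */
{
        struct chain *c = percpu_reserve;
        percpu_reserve = NULL;
        return c;
}

int main(void)
{
        if (!reserve_chain())
                return 1;               /* VM_FAULT_OOM path */
        /* spin_lock(&mm->page_table_lock); put_cpu(); ... */
        struct chain *c = take_chain(); /* never NULL, never sleeps */
        printf("chain %p consumed under the lock\n", (void *)c);
        free(c);
        return 0;
}
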
*/ pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); page = alloc_page(GFP_HIGHUSER); @@ -1295,9 +1299,11 @@ do_anonymous_page(struct mm_struct *mm, clear_user_highpage(page, addr); spin_lock(&mm->page_table_lock); + pmd = pmd_offset_map(pgd_offset(mm, addr), addr); page_table = pte_offset_map(pmd, addr); if (!pte_none(*page_table)) { + pmd_unmap(pmd); pte_unmap(page_table); page_cache_release(page); spin_unlock(&mm->page_table_lock); @@ -1310,22 +1316,21 @@ do_anonymous_page(struct mm_struct *mm, mark_page_accessed(page); } - set_pte(page_table, entry); - /* ignores ZERO_PAGE */ - pte_chain = page_add_rmap(page, page_table, pte_chain); + vm_set_pte(vma, page_table, entry, addr); + if (write_access) + page_add_rmap(page, vma, addr, 1); + pmd_unmap(pmd); pte_unmap(page_table); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, entry); spin_unlock(&mm->page_table_lock); ret = VM_FAULT_MINOR; - goto out; - -no_mem: - ret = VM_FAULT_OOM; out: - pte_chain_free(pte_chain); return ret; +no_mem: + ret = VM_FAULT_OOM; + goto out; } /* @@ -1346,13 +1351,13 @@ do_no_page(struct mm_struct *mm, struct { struct page * new_page; pte_t entry; - struct pte_chain *pte_chain; - int ret; + int ret, anon = 0; if (!vma->vm_ops || !vma->vm_ops->nopage) return do_anonymous_page(mm, vma, page_table, pmd, write_access, address); pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0); @@ -1363,26 +1368,25 @@ do_no_page(struct mm_struct *mm, struct if (new_page == NOPAGE_OOM) return VM_FAULT_OOM; - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto oom; - /* * Should we do an early C-O-W break? */ if (write_access && !(vma->vm_flags & VM_SHARED)) { struct page * page = alloc_page(GFP_HIGHUSER); - if (!page) { - page_cache_release(new_page); + if (!page) goto oom; - } copy_user_highpage(page, new_page, address); page_cache_release(new_page); lru_cache_add_active(page); + anon = 1; new_page = page; } + if (!rmap_get_cpu()) + goto oom; spin_lock(&mm->page_table_lock); + put_cpu(); + pmd = pmd_offset_map(pgd_offset(mm, address), address); page_table = pte_offset_map(pmd, address); /* @@ -1402,12 +1406,15 @@ do_no_page(struct mm_struct *mm, struct entry = mk_pte(new_page, vma->vm_page_prot); if (write_access) entry = pte_mkwrite(pte_mkdirty(entry)); - set_pte(page_table, entry); - pte_chain = page_add_rmap(new_page, page_table, pte_chain); + vm_set_pte(vma, page_table, entry, address); + if (!PageReserved(new_page)) + page_add_rmap(new_page, vma, address, anon); pte_unmap(page_table); + pmd_unmap(pmd); } else { /* One of our sibling threads was faster, back out. 
*/ pte_unmap(page_table); + pmd_unmap(pmd); page_cache_release(new_page); spin_unlock(&mm->page_table_lock); ret = VM_FAULT_MINOR; @@ -1418,12 +1425,12 @@ do_no_page(struct mm_struct *mm, struct update_mmu_cache(vma, address, entry); spin_unlock(&mm->page_table_lock); ret = VM_FAULT_MAJOR; - goto out; -oom: - ret = VM_FAULT_OOM; out: - pte_chain_free(pte_chain); return ret; +oom: + page_cache_release(new_page); + ret = VM_FAULT_OOM; + goto out; } /* @@ -1444,13 +1451,14 @@ static int do_file_page(struct mm_struct */ if (!vma->vm_ops || !vma->vm_ops->populate || (write_access && !(vma->vm_flags & VM_SHARED))) { - pte_clear(pte); + vm_pte_clear(vma, pte, address); return do_no_page(mm, vma, address, write_access, pte, pmd); } pgoff = pte_to_pgoff(*pte); pte_unmap(pte); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0); @@ -1537,10 +1545,10 @@ int handle_mm_fault(struct mm_struct *mm * and the SMP-safe atomic PTE updates. */ spin_lock(&mm->page_table_lock); - pmd = pmd_alloc(mm, pgd, address); + pmd = pmd_alloc_map(mm, pgd, address); if (pmd) { - pte_t * pte = pte_alloc_map(mm, pmd, address); + pte_t *pte = pte_alloc_map(mm, &pmd, address); if (pte) return handle_pte_fault(mm, vma, address, write_access, pte, pmd); } @@ -1577,7 +1585,30 @@ pmd_t *__pmd_alloc(struct mm_struct *mm, } pgd_populate(mm, pgd, new); out: - return pmd_offset(pgd, address); + return pmd_offset_map(pgd, address); +} + +pmd_t *__pmd_alloc_kernel(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + pmd_t *new; + + spin_unlock(&mm->page_table_lock); + new = pmd_alloc_one_kernel(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (pgd_present(*pgd)) { + pmd_free(new); + goto out; + } + pgd_populate(mm, pgd, new); +out: + return pmd_offset_kernel(pgd, address); } int make_pages_present(unsigned long addr, unsigned long end) @@ -1600,7 +1631,7 @@ int make_pages_present(unsigned long add /* * Map a vmalloc()-space virtual address to the physical page. 
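
__pmd_alloc_kernel() above, like __pmd_alloc(), drops mm->page_table_lock around the allocation, then re-tests pgd_present() after relocking and frees the fresh pmd page if another thread populated the entry in the meantime. A compact model of that populate-with-recheck idiom, with ordinary heap allocation standing in for pmd_alloc_one_kernel() (illustrative names):

/* Allocate outside the lock, recheck the slot after relocking, and
 * discard the new object on losing the race: the __pmd_alloc() and
 * __pmd_alloc_kernel() pattern in user space.
 */
#include <stdio.h>
#include <stdlib.h>

static void *pgd_entry;                        /* the slot being populated */

static void *populate(void)
{
        /* lock dropped: allocation may sleep */
        void *new = malloc(64);
        if (!new)
                return NULL;                   /* caller sees OOM */
        /* lock retaken */
        if (pgd_entry)
                free(new);                     /* pgd_present(): lost the race */
        else
                pgd_entry = new;               /* pgd_populate() */
        return pgd_entry;                      /* caller offsets into it */
}

int main(void)
{
        void *a = populate();
        void *b = populate();                  /* second call reuses the first */
        printf("same pmd page reused: %s\n", (a && a == b) ? "yes" : "no");
        free(pgd_entry);
        return 0;
}
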
*/ -struct page * vmalloc_to_page(void * vmalloc_addr) +struct page *vmalloc_to_page(void *vmalloc_addr) { unsigned long addr = (unsigned long) vmalloc_addr; struct page *page = NULL; @@ -1609,7 +1640,7 @@ struct page * vmalloc_to_page(void * vma pte_t *ptep, pte; if (!pgd_none(*pgd)) { - pmd = pmd_offset(pgd, addr); + pmd = pmd_offset_map(pgd, addr); if (!pmd_none(*pmd)) { preempt_disable(); ptep = pte_offset_map(pmd, addr); @@ -1619,6 +1650,7 @@ struct page * vmalloc_to_page(void * vma pte_unmap(ptep); preempt_enable(); } + pmd_unmap(pmd); } return page; } diff -prauN linux-2.5.73/mm/mmap.c wli-2.5.73-29/mm/mmap.c --- linux-2.5.73/mm/mmap.c 2003-06-22 11:32:57.000000000 -0700 +++ wli-2.5.73-29/mm/mmap.c 2003-06-23 13:21:52.000000000 -0700 @@ -113,8 +113,19 @@ int vm_enough_memory(long pages) return 0; } +static void __free_vma(void *vma) +{ + kmem_cache_free(vm_area_cachep, vma); +} + +void free_vma(struct vm_area_struct *vma) +{ + INIT_LIST_HEAD(&vma->rcu.list); + call_rcu(&vma->rcu, __free_vma, vma); +} + /* - * Requires inode->i_mapping->i_shared_sem + * Requires inode->i_mapping->i_shared_lock */ static inline void __remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode) @@ -122,7 +133,9 @@ __remove_shared_vm_struct(struct vm_area if (inode) { if (vma->vm_flags & VM_DENYWRITE) atomic_inc(&inode->i_writecount); - list_del_init(&vma->shared); + vma->vm_flags |= VM_DEAD; + wmb(); + list_del_rcu(&vma->shared); } } @@ -136,9 +149,9 @@ static void remove_shared_vm_struct(stru if (file) { struct inode *inode = file->f_dentry->d_inode; - down(&inode->i_mapping->i_shared_sem); + spin_lock(&inode->i_mapping->i_shared_lock); __remove_shared_vm_struct(vma, inode); - up(&inode->i_mapping->i_shared_sem); + spin_unlock(&inode->i_mapping->i_shared_lock); } } @@ -319,9 +332,9 @@ static inline void __vma_link_file(struc atomic_dec(&inode->i_writecount); if (vma->vm_flags & VM_SHARED) - list_add_tail(&vma->shared, &mapping->i_mmap_shared); + list_add_tail_rcu(&vma->shared, &mapping->i_mmap_shared); else - list_add_tail(&vma->shared, &mapping->i_mmap); + list_add_tail_rcu(&vma->shared, &mapping->i_mmap); } } @@ -345,12 +358,12 @@ static void vma_link(struct mm_struct *m mapping = vma->vm_file->f_dentry->d_inode->i_mapping; if (mapping) - down(&mapping->i_shared_sem); + spin_lock(&mapping->i_shared_lock); spin_lock(&mm->page_table_lock); __vma_link(mm, vma, prev, rb_link, rb_parent); spin_unlock(&mm->page_table_lock); if (mapping) - up(&mapping->i_shared_sem); + spin_unlock(&mapping->i_shared_lock); mark_mm_hugetlb(mm, vma); mm->map_count++; @@ -377,6 +390,28 @@ static inline int is_mergeable_vma(struc return 1; } +static void move_vma_start(struct vm_area_struct *vma, unsigned long addr) +{ + spinlock_t *lock = &vma->vm_mm->page_table_lock; + struct inode *inode = NULL; + + if (vma->vm_file) { + inode = vma->vm_file->f_dentry->d_inode; + spin_lock(&inode->i_mapping->i_shared_lock); + } + spin_lock(lock); + if (inode) + __remove_shared_vm_struct(vma, inode); + /* If no vm_file, perhaps we should always keep vm_pgoff at 0?? */ + vma->vm_pgoff += (long)(addr - vma->vm_start) >> PAGE_SHIFT; + vma->vm_start = addr; + if (inode) { + __vma_link_file(vma); + spin_unlock(&inode->i_mapping->i_shared_lock); + } + spin_unlock(lock); +} + /* * Return true if we can merge this (vm_flags,file,vm_pgoff,size) * in front of (at a lower virtual address and file offset than) the vma. 
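
The mmap.c changes above replace i_shared_sem with the i_shared_lock spinlock and let lockless readers walk i_mmap/i_mmap_shared under rcu_read_lock(): the unlink side sets VM_DEAD, issues wmb(), removes the vma with list_del_rcu(), and defers the actual free through free_vma()/call_rcu(). The sketch below models that mark-dead, skip-in-readers, free-later sequence in user space, with an explicit drain standing in for the RCU grace period (illustrative structures, not the kernel API):

/* Readers may still hold a pointer to an unlinked element, so it is
 * marked dead and its free is deferred until they are done.
 */
#include <stdio.h>
#include <stdlib.h>

struct vma { int dead; struct vma *next; struct vma *defer_next; };

static struct vma *i_mmap;             /* the shared list */
static struct vma *defer_list;         /* frees awaiting "quiescence" */

static void free_vma_deferred(struct vma *v)   /* ~ free_vma()/call_rcu() */
{
        v->defer_next = defer_list;
        defer_list = v;
}

static void remove_vma(struct vma *v)  /* ~ __remove_shared_vm_struct() */
{
        v->dead = 1;                   /* VM_DEAD; the patch adds wmb() here */
        i_mmap = v->next;              /* simplified: v is the list head */
        free_vma_deferred(v);
}

static void reader(void)               /* ~ vmtruncate_list() under RCU */
{
        for (struct vma *v = i_mmap; v; v = v->next) {
                if (v->dead)
                        continue;      /* skip, exactly as the patch does */
                /* ... operate on the still-valid vma ... */
        }
}

int main(void)
{
        struct vma *v = calloc(1, sizeof(*v));
        i_mmap = v;
        remove_vma(v);
        reader();                      /* safe: v is not freed yet */
        while (defer_list) {           /* "grace period" elapsed: drain */
                struct vma *d = defer_list;
                defer_list = d->defer_next;
                free(d);
        }
        printf("deferred frees drained after readers finished\n");
        return 0;
}
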
@@ -429,8 +464,6 @@ static int vma_merge(struct mm_struct *m unsigned long end, unsigned long vm_flags, struct file *file, unsigned long pgoff) { - spinlock_t * lock = &mm->page_table_lock; - /* * We later require that vma->vm_flags == vm_flags, so this tests * vma->vm_flags & VM_SPECIAL, too. @@ -450,12 +483,13 @@ static int vma_merge(struct mm_struct *m is_mergeable_vma(prev, file, vm_flags) && can_vma_merge_after(prev, vm_flags, file, pgoff)) { struct vm_area_struct *next; + spinlock_t *lock = &mm->page_table_lock; struct inode *inode = file ? file->f_dentry->d_inode : NULL; int need_up = 0; if (unlikely(file && prev->vm_next && prev->vm_next->vm_file == file)) { - down(&inode->i_mapping->i_shared_sem); + spin_lock(&inode->i_mapping->i_shared_lock); need_up = 1; } spin_lock(lock); @@ -473,17 +507,17 @@ static int vma_merge(struct mm_struct *m __remove_shared_vm_struct(next, inode); spin_unlock(lock); if (need_up) - up(&inode->i_mapping->i_shared_sem); + spin_unlock(&inode->i_mapping->i_shared_lock); if (file) fput(file); mm->map_count--; - kmem_cache_free(vm_area_cachep, next); + free_vma(next); return 1; } spin_unlock(lock); if (need_up) - up(&inode->i_mapping->i_shared_sem); + spin_unlock(&inode->i_mapping->i_shared_lock); return 1; } @@ -497,10 +531,7 @@ static int vma_merge(struct mm_struct *m pgoff, (end - addr) >> PAGE_SHIFT)) return 0; if (end == prev->vm_start) { - spin_lock(lock); - prev->vm_start = addr; - prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT; - spin_unlock(lock); + move_vma_start(prev, addr); return 1; } } @@ -727,7 +758,7 @@ munmap_back: atomic_inc(&inode->i_writecount); fput(file); } - kmem_cache_free(vm_area_cachep, vma); + free_vma(vma); } out: mm->total_vm += len >> PAGE_SHIFT; @@ -752,7 +783,7 @@ unmap_and_free_vma: /* Undo any partial mapping done by a device driver. */ zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start); free_vma: - kmem_cache_free(vm_area_cachep, vma); + free_vma(vma); unacct_error: if (charged) vm_unacct_memory(charged); @@ -1131,7 +1162,7 @@ static void unmap_vma(struct mm_struct * area->vm_ops->close(area); if (area->vm_file) fput(area->vm_file); - kmem_cache_free(vm_area_cachep, area); + free_vma(area); } /* @@ -1221,8 +1252,7 @@ int split_vma(struct mm_struct * mm, str if (new_below) { new->vm_end = addr; - vma->vm_start = addr; - vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT); + move_vma_start(vma, addr); } else { vma->vm_end = addr; new->vm_start = addr; @@ -1470,7 +1500,7 @@ void exit_mmap(struct mm_struct *mm) } if (vma->vm_file) fput(vma->vm_file); - kmem_cache_free(vm_area_cachep, vma); + free_vma(vma); vma = next; } } diff -prauN linux-2.5.73/mm/mprotect.c wli-2.5.73-29/mm/mprotect.c --- linux-2.5.73/mm/mprotect.c 2003-06-22 11:32:42.000000000 -0700 +++ wli-2.5.73-29/mm/mprotect.c 2003-06-23 10:44:16.000000000 -0700 @@ -24,11 +24,11 @@ #include static inline void -change_pte_range(pmd_t *pmd, unsigned long address, - unsigned long size, pgprot_t newprot) +change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long address, unsigned long size, pgprot_t newprot) { pte_t * pte; - unsigned long end; + unsigned long start, end; if (pmd_none(*pmd)) return; @@ -38,6 +38,7 @@ change_pte_range(pmd_t *pmd, unsigned lo return; } pte = pte_offset_map(pmd, address); + start = address & PMD_MASK; address &= ~PMD_MASK; end = address + size; if (end > PMD_SIZE) @@ -50,8 +51,8 @@ change_pte_range(pmd_t *pmd, unsigned lo * bits by wiping the pte and then setting the new pte * into place. 
*/ - entry = ptep_get_and_clear(pte); - set_pte(pte, pte_modify(entry, newprot)); + entry = vm_ptep_get_and_clear(vma, pte, address + start); + vm_set_pte(vma, pte, pte_modify(entry, newprot), start + address); } address += PAGE_SIZE; pte++; @@ -60,11 +61,11 @@ change_pte_range(pmd_t *pmd, unsigned lo } static inline void -change_pmd_range(pgd_t *pgd, unsigned long address, - unsigned long size, pgprot_t newprot) +change_pmd_range(struct vm_area_struct *vma, pgd_t *pgd, + unsigned long address, unsigned long size, pgprot_t newprot) { pmd_t * pmd; - unsigned long end; + unsigned long start, end; if (pgd_none(*pgd)) return; @@ -73,16 +74,18 @@ change_pmd_range(pgd_t *pgd, unsigned lo pgd_clear(pgd); return; } - pmd = pmd_offset(pgd, address); + pmd = pmd_offset_map(pgd, address); + start = address & PGDIR_MASK; address &= ~PGDIR_MASK; end = address + size; if (end > PGDIR_SIZE) end = PGDIR_SIZE; do { - change_pte_range(pmd, address, end - address, newprot); + change_pte_range(vma, pmd, start + address, end - address, newprot); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); + pmd_unmap(pmd - 1); } static void @@ -98,7 +101,7 @@ change_protection(struct vm_area_struct BUG(); spin_lock(¤t->mm->page_table_lock); do { - change_pmd_range(dir, start, end - start, newprot); + change_pmd_range(vma, dir, start, end - start, newprot); start = (start + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (start && (start < end)); @@ -135,7 +138,7 @@ mprotect_attempt_merge(struct vm_area_st __vma_unlink(mm, vma, prev); spin_unlock(&mm->page_table_lock); - kmem_cache_free(vm_area_cachep, vma); + free_vma(vma); mm->map_count--; return 1; } @@ -297,7 +300,7 @@ sys_mprotect(unsigned long start, size_t __vma_unlink(prev->vm_mm, next, prev); spin_unlock(&prev->vm_mm->page_table_lock); - kmem_cache_free(vm_area_cachep, next); + free_vma(next); prev->vm_mm->map_count--; } out: diff -prauN linux-2.5.73/mm/mremap.c wli-2.5.73-29/mm/mremap.c --- linux-2.5.73/mm/mremap.c 2003-06-22 11:32:55.000000000 -0700 +++ wli-2.5.73-29/mm/mremap.c 2003-06-23 15:14:41.000000000 -0700 @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include @@ -37,7 +37,7 @@ static pte_t *get_one_pte_map_nested(str goto end; } - pmd = pmd_offset(pgd, addr); + pmd = pmd_offset_map_nested(pgd, addr); if (pmd_none(*pmd)) goto end; if (pmd_bad(*pmd)) { @@ -52,6 +52,7 @@ static pte_t *get_one_pte_map_nested(str pte = NULL; } end: + pmd_unmap_nested(pmd); return pte; } @@ -60,12 +61,15 @@ static inline int page_table_present(str { pgd_t *pgd; pmd_t *pmd; + int ret; pgd = pgd_offset(mm, addr); if (pgd_none(*pgd)) return 0; - pmd = pmd_offset(pgd, addr); - return pmd_present(*pmd); + pmd = pmd_offset_map(pgd, addr); + ret = pmd_present(*pmd); + pmd_unmap(pmd); + return ret != 0; } #else #define page_table_present(mm, addr) (1) @@ -76,37 +80,33 @@ static inline pte_t *alloc_one_pte_map(s pmd_t *pmd; pte_t *pte = NULL; - pmd = pmd_alloc(mm, pgd_offset(mm, addr), addr); + pmd = pmd_alloc_map(mm, pgd_offset(mm, addr), addr); if (pmd) - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map(mm, &pmd, addr); + pmd_unmap(pmd); return pte; } static int -copy_one_pte(struct mm_struct *mm, pte_t *src, pte_t *dst, - struct pte_chain **pte_chainp) +copy_one_pte(struct vm_area_struct *vma, pte_t *src, pte_t *dst, + unsigned long old_addr, unsigned long new_addr) { - int error = 0; - pte_t pte; - struct page *page = NULL; - - if (pte_present(*src)) - page = pte_page(*src); - if (!pte_none(*src)) { - if (page) - 
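
change_pte_range() and change_pmd_range() above save the PMD_MASK/PGDIR_MASK-aligned base in `start` before truncating `address` to an intra-table offset, so that vm_ptep_get_and_clear() and vm_set_pte() receive the full user virtual address the address-based rmap needs. A small self-contained demonstration of that arithmetic (the page and pmd sizes below are illustrative, not tied to any particular architecture):

/* Show that the aligned base saved before "address &= ~PMD_MASK"
 * reconstructs the original user virtual address when re-added.
 */
#include <assert.h>
#include <stdio.h>

#define PMD_SIZE   0x200000UL          /* 2 MB pmd coverage, for example */
#define PMD_MASK   (~(PMD_SIZE - 1))

int main(void)
{
        unsigned long address = 0x40345000UL;   /* some user address */
        unsigned long start, reconstructed;

        start = address & PMD_MASK;             /* saved before truncation */
        address &= ~PMD_MASK;                   /* offset within the pmd */

        reconstructed = start + address;        /* what vm_set_pte() gets */
        assert(reconstructed == 0x40345000UL);
        printf("pmd base %#lx + offset %#lx = %#lx\n",
               start, address, reconstructed);
        return 0;
}
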
page_remove_rmap(page, src); - pte = ptep_get_and_clear(src); - if (!dst) { - /* No dest? We must put it back. */ - dst = src; - error++; + pte_t pte; + if (!dst) + return -1; + pte = vm_ptep_get_and_clear(vma, src, old_addr); + vm_set_pte(vma, dst, pte, new_addr); + if (pte_present(pte)) { + unsigned long pfn = pte_pfn(pte); + if (pfn_valid(pfn)) { + struct page *page = pfn_to_page(pfn); + if (!PageReserved(page)) + page_move_rmap(page, vma, old_addr, new_addr); + } } - set_pte(dst, pte); - if (page) - *pte_chainp = page_add_rmap(page, dst, *pte_chainp); } - return error; + return 0; } static int @@ -114,16 +114,16 @@ move_one_page(struct vm_area_struct *vma unsigned long new_addr) { struct mm_struct *mm = vma->vm_mm; - int error = 0; pte_t *src, *dst; - struct pte_chain *pte_chain; + int error = 0; - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) { + if (!rmap_get_cpu()) { error = -ENOMEM; goto out; } + spin_lock(&mm->page_table_lock); + put_cpu(); src = get_one_pte_map_nested(mm, old_addr); if (src) { /* @@ -138,13 +138,12 @@ move_one_page(struct vm_area_struct *vma dst = alloc_one_pte_map(mm, new_addr); if (src == NULL) src = get_one_pte_map_nested(mm, old_addr); - error = copy_one_pte(mm, src, dst, &pte_chain); + error = copy_one_pte(vma, src, dst, old_addr, new_addr); pte_unmap_nested(src); pte_unmap(dst); } flush_tlb_page(vma, old_addr); spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); out: return error; } @@ -213,7 +212,7 @@ static unsigned long move_vma(struct vm_ if (vma == next) vma = prev; mm->map_count--; - kmem_cache_free(vm_area_cachep, next); + free_vma(next); } } else if (next->vm_start == new_addr + new_len && can_vma_merge(next, vma->vm_flags) && @@ -295,7 +294,7 @@ static unsigned long move_vma(struct vm_ return new_addr; } if (allocated_vma) - kmem_cache_free(vm_area_cachep, new_vma); + free_vma(new_vma); out: return -ENOMEM; } diff -prauN linux-2.5.73/mm/msync.c wli-2.5.73-29/mm/msync.c --- linux-2.5.73/mm/msync.c 2003-06-22 11:32:42.000000000 -0700 +++ wli-2.5.73-29/mm/msync.c 2003-06-23 10:31:02.000000000 -0700 @@ -82,7 +82,7 @@ static inline int filemap_sync_pmd_range pgd_clear(pgd); return 0; } - pmd = pmd_offset(pgd, address); + pmd = pmd_offset_map(pgd, address); if ((address & PGDIR_MASK) != (end & PGDIR_MASK)) end = (address & PGDIR_MASK) + PGDIR_SIZE; error = 0; @@ -91,6 +91,7 @@ static inline int filemap_sync_pmd_range address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); + pmd_unmap(pmd - 1); return error; } diff -prauN linux-2.5.73/mm/nommu.c wli-2.5.73-29/mm/nommu.c --- linux-2.5.73/mm/nommu.c 2003-06-22 11:33:15.000000000 -0700 +++ wli-2.5.73-29/mm/nommu.c 2003-06-23 10:52:52.000000000 -0700 @@ -572,7 +572,3 @@ unsigned long get_unmapped_area(struct f { return -ENOMEM; } - -void pte_chain_init(void) -{ -} diff -prauN linux-2.5.73/mm/page-writeback.c wli-2.5.73-29/mm/page-writeback.c --- linux-2.5.73/mm/page-writeback.c 2003-06-22 11:32:58.000000000 -0700 +++ wli-2.5.73-29/mm/page-writeback.c 2003-06-23 10:46:31.000000000 -0700 @@ -453,7 +453,7 @@ int do_writepages(struct address_space * */ int write_one_page(struct page *page, int wait) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); int ret = 0; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, @@ -465,12 +465,12 @@ int write_one_page(struct page *page, in if (wait) wait_on_page_writeback(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); 
list_del(&page->list); if (test_clear_page_dirty(page)) { list_add(&page->list, &mapping->locked_pages); page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); ret = mapping->a_ops->writepage(page, &wbc); if (ret == 0 && wait) { wait_on_page_writeback(page); @@ -480,7 +480,7 @@ int write_one_page(struct page *page, in page_cache_release(page); } else { list_add(&page->list, &mapping->clean_pages); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); unlock_page(page); } return ret; @@ -492,31 +492,31 @@ EXPORT_SYMBOL(write_one_page); * and move it to the dirty_pages list. Also perform space reservation if * required. * - * __set_page_dirty_nobuffers() may return -ENOSPC. But if it does, the page + * set_page_dirty_nobuffers() may return -ENOSPC. But if it does, the page * is still safe, as long as it actually manages to find some blocks at * writeback time. * * This is also used when a single buffer is being dirtied: we want to set the * page dirty in that case, but not all the buffers. This is a "bottom-up" - * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. + * dirtying, whereas set_page_dirty_buffers() is a "top-down" dirtying. */ -int __set_page_dirty_nobuffers(struct page *page) +int set_page_dirty_nobuffers(struct page *page) { int ret = 0; if (!TestSetPageDirty(page)) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); if (mapping) { - spin_lock(&mapping->page_lock); - if (page->mapping) { /* Race with truncate? */ - BUG_ON(page->mapping != mapping); + mapping_wrlock(&mapping->page_lock); + if (page_mapping(page)) { /* Race with truncate? */ + BUG_ON(page_mapping(page) != mapping); if (!mapping->backing_dev_info->memory_backed) inc_page_state(nr_dirty); list_del(&page->list); list_add(&page->list, &mapping->dirty_pages); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); if (!PageSwapCache(page)) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -524,7 +524,28 @@ int __set_page_dirty_nobuffers(struct pa } return ret; } -EXPORT_SYMBOL(__set_page_dirty_nobuffers); +EXPORT_SYMBOL(set_page_dirty_nobuffers); + +/* + * If the mapping doesn't provide a set_page_dirty() a_op, then + * just fall through and assume that it wants bh's. 
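
write_one_page() and set_page_dirty_nobuffers() above now take mapping->page_lock through mapping_wrlock()/mapping_wrunlock(), while lookup-only paths such as readahead (later in this patch) use mapping_rdlock()/mapping_rdunlock(), so the pagecache lock becomes a reader/writer lock rather than a plain spinlock. A plain pthreads illustration of that reader/writer split (this is not the kernel locking API):

/* Readers that only inspect the mapping's lists can share the lock;
 * writers that move pages between clean/dirty/locked lists cannot.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t page_lock = PTHREAD_RWLOCK_INITIALIZER;
static int dirty_pages;                 /* stands in for the dirty list */

static void *reader(void *arg)          /* ~ __do_page_cache_readahead() */
{
        (void)arg;
        pthread_rwlock_rdlock(&page_lock);    /* mapping_rdlock() */
        printf("reader sees %d dirty pages\n", dirty_pages);
        pthread_rwlock_unlock(&page_lock);    /* mapping_rdunlock() */
        return NULL;
}

int main(void)
{
        pthread_t t;
        pthread_create(&t, NULL, reader, NULL);

        pthread_rwlock_wrlock(&page_lock);    /* mapping_wrlock() */
        dirty_pages++;                        /* move page to dirty_pages */
        pthread_rwlock_unlock(&page_lock);    /* mapping_wrunlock() */

        pthread_join(t, NULL);
        return 0;
}
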
+ */ +int set_page_dirty(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + int (*spd)(struct page *); + + if (!mapping) { + SetPageDirty(page); + return 0; + } + spd = mapping->a_ops->set_page_dirty; + if (spd) + return (*spd)(page); + else + return set_page_dirty_buffers(page); +} +EXPORT_SYMBOL(set_page_dirty); /* * set_page_dirty() is racy if the caller has no reference against @@ -553,7 +574,7 @@ int set_page_dirty_lock(struct page *pag int test_clear_page_dirty(struct page *page) { if (TestClearPageDirty(page)) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); if (mapping && !mapping->backing_dev_info->memory_backed) dec_page_state(nr_dirty); diff -prauN linux-2.5.73/mm/page_alloc.c wli-2.5.73-29/mm/page_alloc.c --- linux-2.5.73/mm/page_alloc.c 2003-06-22 11:32:32.000000000 -0700 +++ wli-2.5.73-29/mm/page_alloc.c 2003-06-23 10:53:46.000000000 -0700 @@ -69,7 +69,7 @@ static void bad_page(const char *functio { printk("Bad page state at %s\n", function); printk("flags:0x%08lx mapping:%p mapped:%d count:%d\n", - page->flags, page->mapping, + page->flags, (void *)page->__mapping, page_mapped(page), page_count(page)); printk("Backtrace:\n"); dump_stack(); @@ -79,9 +79,13 @@ static void bad_page(const char *functio 1 << PG_lru | 1 << PG_active | 1 << PG_dirty | + 1 << PG_rmaplock | + 1 << PG_chained | + 1 << PG_anon | + 1 << PG_swapcache | 1 << PG_writeback); set_page_count(page, 0); - page->mapping = NULL; + set_page_mapping(page, NULL); } #ifndef CONFIG_HUGETLB_PAGE @@ -163,7 +167,7 @@ static void destroy_compound_page(struct * -- wli */ -static inline void __free_pages_bulk (struct page *page, struct page *base, +static inline void buddy_free(struct page *page, struct page *base, struct zone *zone, struct free_area *area, unsigned long mask, unsigned int order) { @@ -176,7 +180,6 @@ static inline void __free_pages_bulk (st BUG(); index = page_idx >> (1 + order); - zone->free_pages -= mask; while (mask + (1 << (MAX_ORDER-1))) { struct page *buddy1, *buddy2; @@ -197,17 +200,45 @@ static inline void __free_pages_bulk (st BUG_ON(bad_range(zone, buddy2)); list_del(&buddy1->list); mask <<= 1; + area->globally_free--; area++; index >>= 1; page_idx &= mask; } list_add(&(base + page_idx)->list, &area->free_list); + area->globally_free++; +} + +static inline void __free_pages_bulk(struct page *page, struct page *base, + struct zone *zone, struct free_area *area, unsigned long mask, + unsigned int order) +{ + switch (area->active - area->locally_free) { + case 0: + if (!list_empty(&area->deferred_pages)) { + struct page *defer = list_entry(area->deferred_pages.next, struct page, list); + list_del(&defer->list); + area->locally_free--; + buddy_free(defer, base, zone, area, mask, order); + } + /* fall through */ + case 1: + buddy_free(page, base, zone, area, mask, order); + break; + default: + list_add(&page->list, &area->deferred_pages); + area->locally_free++; + break; + } + if (area->active) + area->active--; + zone->free_pages += 1 << order; } static inline void free_pages_check(const char *function, struct page *page) { if ( page_mapped(page) || - page->mapping != NULL || + page->__mapping != 0 || page_count(page) != 0 || (page->flags & ( 1 << PG_lru | @@ -215,6 +246,10 @@ static inline void free_pages_check(cons 1 << PG_locked | 1 << PG_active | 1 << PG_reclaim | + 1 << PG_rmaplock | + 1 << PG_chained | + 1 << PG_anon | + 1 << PG_swapcache | 1 << PG_writeback ))) bad_page(function, page); if (PageDirty(page)) @@ 
-232,40 +267,78 @@ static inline void free_pages_check(cons * And clear the zone's pages_scanned counter, to hold off the "all pages are * pinned" detection logic. */ -static int -free_pages_bulk(struct zone *zone, int count, - struct list_head *list, unsigned int order) +void free_pages_bulk(struct zone *zone, struct page *page, unsigned int order) { - unsigned long mask, flags; + unsigned long mask, flags, count; struct free_area *area; - struct page *base, *page = NULL; - int ret = 0; + struct page *base, *save; + LIST_HEAD(tmp); + + count = page->private; mask = (~0UL) << order; base = zone->zone_mem_map; area = zone->free_area + order; spin_lock_irqsave(&zone->lock, flags); zone->all_unreclaimable = 0; zone->pages_scanned = 0; - while (!list_empty(list) && count--) { - page = list_entry(list->prev, struct page, list); - /* have to delete it as __free_pages_bulk list manipulates */ - list_del(&page->list); - __free_pages_bulk(page, base, zone, area, mask, order); - ret++; + + if (order || area->active - area->locally_free <= 2*count) { + list_splice(&page->list, &tmp); + list_add(&page->list, &tmp); + page->private = 0; + } + + if (order) { + list_for_each_entry_safe(page, save, &tmp, list) { + list_del(&page->list); + __free_pages_bulk(page, base, zone, area, mask, order); + } + } else if (area->active - area->locally_free <= 2*count) { + /* + * This is a somewhat ad hoc approach to dealing with + * the interaction of gang allocation and the deferred + * coalescing heuristics. + */ + if (area->active - area->locally_free < count) { + int local = 0; + + while (local < count && area->locally_free) { + struct page *follow, *head = + list_entry(area->deferred_pages.next, struct page, lru); + list_del(&head->lru); + list_for_each_entry_safe(follow, save, &head->list, list) { + list_del(&follow->list); + buddy_free(follow, base, zone, area, mask, 0); + } + local += head->private; + area->locally_free -= head->private; + head->private = 0; + buddy_free(head, base, zone, area, mask, 0); + } + } + list_for_each_entry_safe(page, save, &tmp, list) { + list_del(&page->list); + buddy_free(page, base, zone, area, mask, order); + } + } else { + area->locally_free += count; + list_add(&page->lru, &area->deferred_pages); + } + if (!order) { + zone->free_pages += count; + area->active -= min(area->active, count); } spin_unlock_irqrestore(&zone->lock, flags); - return ret; } void __free_pages_ok(struct page *page, unsigned int order) { - LIST_HEAD(list); - mod_page_state(pgfree, 1 << order); free_pages_check(__FUNCTION__, page); - list_add(&page->list, &list); - free_pages_bulk(page_zone(page), 1, &list, order); + page->private = 1; + INIT_LIST_HEAD(&page->list); + free_pages_bulk(page_zone(page), page, order); } #define MARK_USED(index, order, area) \ @@ -278,10 +351,10 @@ expand(struct zone *zone, struct page *p unsigned long size = 1 << high; while (high > low) { - BUG_ON(bad_range(zone, page)); area--; high--; size >>= 1; + area->globally_free++; list_add(&page->list, &area->free_list); MARK_USED(index, high, area); index += size; @@ -311,7 +384,7 @@ static inline void set_page_refs(struct */ static void prep_new_page(struct page *page, int order) { - if (page->mapping || page_mapped(page) || + if (page->__mapping || page_mapped(page) || (page->flags & ( 1 << PG_private | 1 << PG_locked | @@ -319,6 +392,10 @@ static void prep_new_page(struct page *p 1 << PG_active | 1 << PG_dirty | 1 << PG_reclaim | + 1 << PG_rmaplock | + 1 << PG_chained | + 1 << PG_anon | + 1 << PG_swapcache | 1 << PG_writeback ))) 
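
__free_pages_bulk() and free_pages_bulk() above defer coalescing while an order is still hot: as long as area->active outruns area->locally_free, freed pages are parked on area->deferred_pages instead of being merged by buddy_free(), and they drain back through the merge path once activity subsides. The user-space model below reproduces only the case 0/1/default decision, purely to show the heuristic:

/* Hot allocator: frees go to a cheap deferred list.  Cooling allocator:
 * frees (and one previously deferred page) take the expensive merge path.
 */
#include <stdio.h>

static long active, locally_free, globally_free;

static void buddy_free_model(void)  { globally_free++; }  /* expensive merge */
static void defer_free_model(void)  { locally_free++; }   /* cheap list push */

static void free_one_page(void)
{
        switch (active - locally_free) {
        case 0:
                if (locally_free) {        /* drain one deferred page first */
                        locally_free--;
                        buddy_free_model();
                }
                /* fall through */
        case 1:
                buddy_free_model();        /* coalesce this page now */
                break;
        default:
                defer_free_model();        /* defer: allocator is still hot */
                break;
        }
        if (active)
                active--;
}

int main(void)
{
        active = 5;                        /* five outstanding allocations */
        for (int i = 0; i < 5; i++)
                free_one_page();
        printf("deferred %ld, coalesced %ld\n", locally_free, globally_free);
        return 0;
}
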
bad_page(__FUNCTION__, page); @@ -332,7 +409,7 @@ static void prep_new_page(struct page *p * Do the hard work of removing an element from the buddy allocator. * Call me with the zone->lock already held. */ -static struct page *__rmqueue(struct zone *zone, unsigned int order) +static struct page *buddy_alloc(struct zone *zone, unsigned int order) { struct free_area * area; unsigned int current_order; @@ -346,16 +423,144 @@ static struct page *__rmqueue(struct zon page = list_entry(area->free_list.next, struct page, list); list_del(&page->list); + area->globally_free--; index = page - zone->zone_mem_map; if (current_order != MAX_ORDER-1) MARK_USED(index, current_order, area); - zone->free_pages -= 1UL << order; return expand(zone, page, index, order, current_order, area); } return NULL; } +/* + * This is bad; some way to avoid putting singleton pages on the + * deferred lists should be worked out at some point. + */ +static void split_pages(struct zone *zone, struct page *page, int page_order, int deferred_order) +{ + int split_order = deferred_order - 1; + unsigned long split_offset = 1UL << split_order; + struct page *split_page; + + while (split_order >= page_order) { + split_page = &page[split_offset]; + if (split_order) + list_add(&split_page->list, + &zone->free_area[split_order].deferred_pages); + else if (!zone->free_area[split_order].locally_free) { + INIT_LIST_HEAD(&split_page->list); + split_page->private = 1; + list_add(&split_page->lru, + &zone->free_area[split_order].deferred_pages); + } else { + struct page *head; + head = list_entry(zone->free_area[split_order].deferred_pages.next, struct page, lru); + head->private++; + list_add(&split_page->list, &head->list); + } + zone->free_area[split_order].locally_free++; + --split_order; + split_offset >>= 1; + } +} + +#define COALESCE_BATCH 256 +static inline struct page *steal_deferred_page(struct zone *zone, int order) +{ + struct page *page; + struct list_head *elem; + struct free_area *area = zone->free_area; + int found_order, k; + + if (zone->free_pages < (1 << order)) + return NULL; + + /* the range of found_order precludes order 0 */ + for (found_order = order + 1; found_order < MAX_ORDER; ++found_order) + if (!list_empty(&area[found_order].deferred_pages)) { + elem = area[found_order].deferred_pages.next; + page = list_entry(elem, struct page, list); + list_del(elem); + area[found_order].locally_free--; + split_pages(zone, page, order, found_order); + return page; + } + + for (found_order = order - 1; found_order >= 0; --found_order) { + for (k = 0; k < COALESCE_BATCH; ++k) { + unsigned long mask = (~0UL) << found_order; + if (list_empty(&area[found_order].deferred_pages)) + break; + elem = area[found_order].deferred_pages.next; + if (found_order) { + page = list_entry(elem, struct page, list); + list_del(elem); + area[found_order].locally_free--; + buddy_free(page, zone->zone_mem_map, zone, &area[found_order], mask, found_order); + } else { + LIST_HEAD(tmp); + struct page *save; + + page = list_entry(elem, struct page, lru); + list_del(elem); + area[found_order].locally_free -= page->private; + page->private = 0; + list_splice(&page->list, &tmp); + list_add(&page->list, &tmp); + list_for_each_entry_safe(page, save, &tmp, list) { + list_del(&page->list); + buddy_free(page, zone->zone_mem_map, zone, &area[found_order], mask, found_order); + } + } + } + page = buddy_alloc(zone, order); + if (page) + return page; + } + return buddy_alloc(zone, order); +} + +static inline int __rmqueue(struct zone *zone, unsigned int order, 
struct list_head *list) +{ + struct free_area *area = &zone->free_area[order]; + struct page *page; + int count; + + if (!list_empty(&area->deferred_pages)) { + if (order) { + page = list_entry(area->deferred_pages.next, struct page, list); + list_del(&page->list); + count = 1; + } else { + page = list_entry(area->deferred_pages.next, struct page, lru); + list_del(&page->lru); + count = page->private; + page->private = 0; + list_splice(&page->list, list); + } + + area->locally_free -= count; + area->active += count; + zone->free_pages -= count << order; + } else { + page = buddy_alloc(zone, order); + if (page) + count = 1; + else { + page = steal_deferred_page(zone, order); + if (page) + count = 1; + else + return 0; + } + area->active += count; + zone->free_pages -= count << order; + } + list_add(&page->list, list); + return count; +} + /* * Obtain a specified number of elements from the buddy allocator, all under * a single hold of the lock, for efficiency. Add them to the supplied list. @@ -365,17 +570,14 @@ static int rmqueue_bulk(struct zone *zon unsigned long count, struct list_head *list) { unsigned long flags; - int i; - int allocated = 0; - struct page *page; + int i, j, allocated = 0; spin_lock_irqsave(&zone->lock, flags); - for (i = 0; i < count; ++i) { - page = __rmqueue(zone, order); - if (page == NULL) + for (i = 0; i < count && allocated < count; ++i) { + j = __rmqueue(zone, order, list); + if (!j) break; - allocated++; - list_add_tail(&page->list, list); + allocated += j; } spin_unlock_irqrestore(&zone->lock, flags); return allocated; @@ -420,10 +622,14 @@ void drain_local_pages(void) pset = &zone->pageset[smp_processor_id()]; for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { struct per_cpu_pages *pcp; + struct page *page, *save; pcp = &pset->pcp[i]; - pcp->count -= free_pages_bulk(zone, pcp->count, - &pcp->list, 0); + list_for_each_entry_safe(page, save, &pcp->list, lru) { + list_del(&page->lru); + pcp->count -= page->private; + free_pages_bulk(zone, page, 0); + } } } local_irq_restore(flags); @@ -439,14 +645,27 @@ static void free_hot_cold_page(struct pa struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; + struct page *head; inc_page_state(pgfree); free_pages_check(__FUNCTION__, page); pcp = &zone->pageset[get_cpu()].pcp[cold]; local_irq_save(flags); - if (pcp->count >= pcp->high) - pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); - list_add(&page->list, &pcp->list); + while (pcp->count >= pcp->high) { + struct page *free = list_entry(pcp->list.prev, struct page, lru); + list_del(&free->lru); + pcp->count -= free->private; + free_pages_bulk(zone, free, 0); + } + head = list_entry(pcp->list.next, struct page, lru); + if (!list_empty(&pcp->list) && head->private < pcp->batch) { + list_add(&page->list, &head->list); + head->private++; + } else { + INIT_LIST_HEAD(&page->list); + list_add(&page->lru, &pcp->list); + page->private = 1; + } pcp->count++; local_irq_restore(flags); put_cpu(); @@ -471,31 +690,76 @@ void free_cold_page(struct page *page) static struct page *buffered_rmqueue(struct zone *zone, int order, int cold) { unsigned long flags; - struct page *page = NULL; + struct page *head, *page = NULL; + struct per_cpu_pages *pcp = NULL; if (order == 0) { - struct per_cpu_pages *pcp; - pcp = &zone->pageset[get_cpu()].pcp[cold]; local_irq_save(flags); - if (pcp->count <= pcp->low) - pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list); + if (pcp->count <= pcp->low) { + LIST_HEAD(tmp); + int k; + + k = rmqueue_bulk(zone, 
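
free_hot_cold_page() and __rmqueue() above restructure the per-CPU pcp lists into gangs: a gang head page is linked into pcp->list through page->lru, its page->private counts the members chained off page->list, and frees join the newest gang until it reaches pcp->batch. A small user-space model of that gang packing and draining (illustrative structures, not the kernel's):

/* Pack frees into gangs of at most `batch` pages, then drain them.
 */
#include <stdio.h>
#include <stdlib.h>

struct gang { int count; struct gang *next_gang; };  /* head page stand-in */

static struct gang *pcp_list;           /* newest gang first */
static const int batch = 4;             /* ~ pcp->batch */

static void free_hot_page_model(void)
{
        if (pcp_list && pcp_list->count < batch) {
                pcp_list->count++;      /* join the existing gang */
        } else {
                struct gang *g = calloc(1, sizeof(*g));
                g->count = 1;           /* new singleton gang at the front */
                g->next_gang = pcp_list;
                pcp_list = g;
        }
}

static int alloc_hot_page_model(void)
{
        if (!pcp_list)
                return 0;               /* would fall back to the buddy lists */
        if (--pcp_list->count == 0) {   /* gang emptied: unlink its head */
                struct gang *g = pcp_list;
                pcp_list = g->next_gang;
                free(g);
        }
        return 1;
}

int main(void)
{
        for (int i = 0; i < 10; i++)
                free_hot_page_model();
        int gangs = 0;
        for (struct gang *g = pcp_list; g; g = g->next_gang)
                gangs++;
        printf("10 frees packed into %d gangs\n", gangs);  /* 3 with batch=4 */
        while (alloc_hot_page_model())
                ;
        printf("drained\n");
        return 0;
}
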
0, pcp->batch, &tmp); + if (k) { + pcp->count += k; + head = list_entry(tmp.next, struct page, list); + list_del_init(&head->list); + head->private = k; + list_splice(&tmp, &head->list); + list_add(&head->lru, &pcp->list); + } + } if (pcp->count) { - page = list_entry(pcp->list.next, struct page, list); - list_del(&page->list); + head = list_entry(pcp->list.next, struct page, lru); + WARN_ON(!head->private); + if (head->private == 1) { + list_del(&head->lru); + page = head; + page->private = 0; + } else { + page = list_entry(head->list.next, struct page,list); + list_del(&page->list); + head->private--; + } pcp->count--; } local_irq_restore(flags); put_cpu(); } - if (page == NULL) { + if (!page) { + LIST_HEAD(tmp); + int count; + + if (!order) + pcp = &zone->pageset[get_cpu()].pcp[cold]; + spin_lock_irqsave(&zone->lock, flags); - page = __rmqueue(zone, order); - spin_unlock_irqrestore(&zone->lock, flags); + count = __rmqueue(zone, order, &tmp); + spin_unlock(&zone->lock); + + if (!list_empty(&tmp)) + page = list_entry(tmp.next, struct page, list); + + if (!order && count > 1) { + struct page *head; + + list_del(&page->list); + pcp->count += count - 1; + head = list_entry(tmp.next, struct page, list); + list_del_init(&head->list); + head->private = count - 1; + list_splice(&tmp, &head->list); + list_add(&head->lru, &pcp->list); + } + + local_irq_restore(flags); + if (order && page) prep_compound_page(page, order); + else if (!order) + put_cpu(); } if (page != NULL) { @@ -809,6 +1073,17 @@ static void show_node(struct zone *zone) #define show_node(zone) do { } while (0) #endif +unsigned long nr_deferred_pages(void) +{ + struct zone *zone; + unsigned long order, pages = 0; + + for_each_zone(zone) + for (order = 0; order < MAX_ORDER; ++order) + pages += zone->free_area[order].locally_free << order; + return pages; +} + /* * Accumulate the page_state information across all CPUs. 
* The result is unavoidably approximate - it can change @@ -979,8 +1254,7 @@ void show_free_areas(void) } for_each_zone(zone) { - struct list_head *elem; - unsigned long nr, flags, order, total = 0; + unsigned long order, total = 0; show_node(zone); printk("%s: ", zone->name); @@ -989,16 +1263,20 @@ void show_free_areas(void) continue; } - spin_lock_irqsave(&zone->lock, flags); + printk("buddy: "); + for (order = 0; order < MAX_ORDER; order++) { + printk("%lu*%lukB ", zone->free_area[order].globally_free, K(1UL) << order); + total += zone->free_area[order].globally_free << order; + } + printk("\ndefer: "); for (order = 0; order < MAX_ORDER; order++) { - nr = 0; - list_for_each(elem, &zone->free_area[order].free_list) - ++nr; - total += nr << order; - printk("%lu*%lukB ", nr, K(1UL) << order); + printk("%lu*%lukB ", zone->free_area[order].locally_free, K(1UL) << order); + total += zone->free_area[order].locally_free << order; } - spin_unlock_irqrestore(&zone->lock, flags); - printk("= %lukB\n", K(total)); + printk("\nactive: "); + for (order = 0; order < MAX_ORDER; order++) + printk("%lu*%lukB ", zone->free_area[order].active, K(1UL) << order); + printk("\n= %lukB\n", K(total)); } show_swap_cache_info(); @@ -1234,10 +1512,11 @@ static void __init free_area_init_core(s batch = zone->present_pages / 1024; if (batch * PAGE_SIZE > 256 * 1024) batch = (256 * 1024) / PAGE_SIZE; - batch /= 4; /* We effectively *= 4 below */ if (batch < 1) batch = 1; + batch *= 4; + for (cpu = 0; cpu < NR_CPUS; cpu++) { struct per_cpu_pages *pcp; @@ -1294,8 +1573,11 @@ static void __init free_area_init_core(s for (i = 0; ; i++) { unsigned long bitmap_size; - + INIT_LIST_HEAD(&zone->free_area[i].deferred_pages); INIT_LIST_HEAD(&zone->free_area[i].free_list); + zone->free_area[i].globally_free = 0; + zone->free_area[i].locally_free = 0; + zone->free_area[i].active = 0; if (i == MAX_ORDER-1) { zone->free_area[i].map = NULL; break; @@ -1401,24 +1683,22 @@ static int frag_show(struct seq_file *m, pg_data_t *pgdat = (pg_data_t *)arg; struct zone *zone; struct zone *node_zones = pgdat->node_zones; - unsigned long flags; int order; for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { if (!zone->present_pages) continue; - spin_lock_irqsave(&zone->lock, flags); - seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order < MAX_ORDER; ++order) { - unsigned long nr_bufs = 0; - struct list_head *elem; - - list_for_each(elem, &(zone->free_area[order].free_list)) - ++nr_bufs; - seq_printf(m, "%6lu ", nr_bufs); - } - spin_unlock_irqrestore(&zone->lock, flags); + seq_printf(m, "Node %d, zone %8s\n", pgdat->node_id, zone->name); + seq_puts(m, "buddy: "); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6lu ", zone->free_area[order].globally_free); + seq_puts(m, "\ndefer: "); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6lu ", zone->free_area[order].locally_free); + seq_puts(m, "\nactive: "); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6lu ", zone->free_area[order].active); seq_putc(m, '\n'); } return 0; @@ -1437,6 +1717,7 @@ static char *vmstat_text[] = { "nr_unstable", "nr_page_table_pages", "nr_mapped", + "nr_swapcache", "nr_slab", "pgpgin", diff -prauN linux-2.5.73/mm/page_io.c wli-2.5.73-29/mm/page_io.c --- linux-2.5.73/mm/page_io.c 2003-06-22 11:32:33.000000000 -0700 +++ wli-2.5.73-29/mm/page_io.c 2003-06-23 10:46:31.000000000 -0700 @@ -16,8 +16,6 @@ #include #include #include -#include /* for block_sync_page() */ -#include 
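
In page_io.c above, a swap-cache page is no longer identified by page->mapping == &swapper_space with the swp_entry in page->index; instead the PG_swapcache flag marks it and page->private carries the entry value. How page_mapping() resolves such a page is defined elsewhere in the patch; the sketch below models only the flag-plus-private convention (illustrative structures):

/* Minimal model: a flag bit identifies swap-cache pages and the swap
 * entry value is kept in the page's private field.
 */
#include <assert.h>
#include <stdio.h>

#define PG_swapcache (1u << 0)

struct page_model {
        unsigned flags;
        unsigned long private;     /* swp_entry_t.val when PG_swapcache set */
};

static void add_to_swap_model(struct page_model *page, unsigned long entry)
{
        page->flags |= PG_swapcache;    /* SetPageSwapCache() */
        page->private = entry;          /* was: page->index = entry.val */
}

int main(void)
{
        struct page_model page = { 0, 0 };
        add_to_swap_model(&page, 0x2a5UL);
        assert(page.flags & PG_swapcache);
        printf("swap entry for this page: %#lx\n", page.private);
        return 0;
}
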
#include #include @@ -32,7 +30,7 @@ get_swap_bio(int gfp_flags, struct page swp_entry_t entry; BUG_ON(!PageSwapCache(page)); - entry.val = page->index; + entry.val = page->private; sis = get_swap_info_struct(swp_type(entry)); bio->bi_sector = map_swap_page(sis, swp_offset(entry)) * @@ -130,13 +128,6 @@ out: return ret; } -struct address_space_operations swap_aops = { - .writepage = swap_writepage, - .readpage = swap_readpage, - .sync_page = block_sync_page, - .set_page_dirty = __set_page_dirty_nobuffers, -}; - /* * A scruffy utility function to read or write an arbitrary swap page * and wait on the I/O. @@ -150,9 +141,8 @@ int rw_swap_page_sync(int rw, swp_entry_ lock_page(page); - BUG_ON(page->mapping); - page->mapping = &swapper_space; - page->index = entry.val; + SetPageSwapCache(page); + page->private = entry.val; if (rw == READ) { ret = swap_readpage(NULL, page); @@ -161,7 +151,7 @@ int rw_swap_page_sync(int rw, swp_entry_ ret = swap_writepage(page, &swap_wbc); wait_on_page_writeback(page); } - page->mapping = NULL; + ClearPageSwapCache(page); if (ret == 0 && (!PageUptodate(page) || PageError(page))) ret = -EIO; return ret; diff -prauN linux-2.5.73/mm/readahead.c wli-2.5.73-29/mm/readahead.c --- linux-2.5.73/mm/readahead.c 2003-06-22 11:32:32.000000000 -0700 +++ wli-2.5.73-29/mm/readahead.c 2003-06-23 10:38:47.000000000 -0700 @@ -217,7 +217,7 @@ __do_page_cache_readahead(struct address /* * Preallocate as many pages as we will need. */ - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); for (page_idx = 0; page_idx < nr_to_read; page_idx++) { unsigned long page_offset = offset + page_idx; @@ -228,16 +228,16 @@ __do_page_cache_readahead(struct address if (page) continue; - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); page = page_cache_alloc_cold(mapping); - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); if (!page) break; page->index = page_offset; list_add(&page->list, &page_pool); ret++; } - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); /* * Now start the IO. We ignore I/O errors - if the page is not diff -prauN linux-2.5.73/mm/rmap.c wli-2.5.73-29/mm/rmap.c --- linux-2.5.73/mm/rmap.c 2003-06-22 11:33:17.000000000 -0700 +++ wli-2.5.73-29/mm/rmap.c 2003-06-23 23:32:52.000000000 -0700 @@ -5,531 +5,633 @@ * Released under the General Public License (GPL). * * - * Simple, low overhead pte-based reverse mapping scheme. - * This is kept modular because we may want to experiment - * with object-based reverse mapping schemes. Please try - * to keep this thing as modular as possible. + * Simple, low overhead reverse mapping scheme. + * Please try to keep this thing as modular as possible. */ /* * Locking: - * - the page->pte.chain is protected by the PG_chainlock bit, + * - the page->rmap field is protected by the PG_rmaplock bit, * which nests within the the mm->page_table_lock, * which nests within the page lock. * - because swapout locking is opposite to the locking order * in the page fault path, the swapout path uses trylocks * on the mm->page_table_lock */ + #include #include #include #include #include #include -#include +#include #include #include - -#include -#include -#include +#include +#include #include /* #define DEBUG_RMAP */ /* - * Shared pages have a chain of pte_chain structures, used to locate - * all the mappings to this page. 
We only need a pointer to the pte - * here, the page struct for the page table page contains the process - * it belongs to and the offset within that process. - * - * We use an array of pte pointers in this structure to minimise cache misses - * while traversing reverse maps. + * struct addresser: for next_rmap_address to dole out user addresses + * one by one to page_referenced() or try_to_unmap() */ -#define NRPTE ((L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(pte_addr_t)) +struct addresser { + unsigned long address, count; + struct rmap_chain *chain; + int index; +}; -/* - * next_and_idx encodes both the address of the next pte_chain and the - * offset of the highest-index used pte in ptes[]. - */ -struct pte_chain { - unsigned long next_and_idx; - pte_addr_t ptes[NRPTE]; -} ____cacheline_aligned; +static kmem_cache_t *rmap_chain_cache; + +static DEFINE_PER_CPU(struct rmap_chain *, rmap_chain) = NULL; -kmem_cache_t *pte_chain_cache; +kmem_cache_t *anon_cache; -static inline struct pte_chain *pte_chain_next(struct pte_chain *pte_chain) +static void anon_ctor(void *arg, kmem_cache_t *cache, unsigned long unused) { - return (struct pte_chain *)(pte_chain->next_and_idx & ~NRPTE); + struct anon *anon = (struct anon *)arg; + atomic_set(&anon->count, 2); + anon->lock = SPIN_LOCK_UNLOCKED; + INIT_LIST_HEAD(&anon->list); } -static inline struct pte_chain *pte_chain_ptr(unsigned long pte_chain_addr) +static void rmap_chain_ctor(void *arg, kmem_cache_t *cache, unsigned long flags) { - return (struct pte_chain *)(pte_chain_addr & ~NRPTE); + int i; + struct rmap_chain *chain = (struct rmap_chain *)arg; + + for (i = 0; i < NRSLOT; ++i) + chain->slot[i] = NOADDR; + chain->next = NULL; } -static inline int pte_chain_idx(struct pte_chain *pte_chain) +static inline void rmap_chain_dtor(struct rmap_chain *chain) { - return pte_chain->next_and_idx & NRPTE; + int i; + for (i = 0; i < NRSLOT; ++i) + if (chain->slot[i] != NOADDR) + chain->slot[i] = NOADDR; + if (chain->next) + chain->next = NULL; } -static inline unsigned long -pte_chain_encode(struct pte_chain *pte_chain, int idx) +void __init init_rmap(void) { - return (unsigned long)pte_chain | idx; + anon_cache = kmem_cache_create("anon", sizeof(struct anon), 0, 0, anon_ctor, NULL); + if (!anon_cache) + panic("init_rmap: Cannot alloc anon slab cache\n"); + rmap_chain_cache = kmem_cache_create("rmap_chain", sizeof(struct rmap_chain), 0, 0, rmap_chain_ctor, NULL); } -/* - * pte_chain list management policy: - * - * - If a page has a pte_chain list then it is shared by at least two processes, - * because a single sharing uses PageDirect. (Well, this isn't true yet, - * coz this code doesn't collapse singletons back to PageDirect on the remove - * path). - * - A pte_chain list has free space only in the head member - all succeeding - * members are 100% full. - * - If the head element has free space, it occurs in its leading slots. - * - All free space in the pte_chain is at the start of the head member. - * - Insertion into the pte_chain puts a pte pointer in the last free slot of - * the head member. - * - Removal from a pte chain moves the head pte of the head member onto the - * victim pte and frees the head member if it became empty. 
- */ +int exec_rmap(struct mm_struct *mm) +{ + struct anon *anon = kmem_cache_alloc(anon_cache, GFP_KERNEL); + if (!anon) + return -ENOMEM; + + spin_lock(&anon->lock); + mm->anon = anon; + list_add_rcu(&mm->anon_list, &anon->list); + spin_unlock(&anon->lock); + return 0; +} + +void dup_rmap(struct mm_struct *new, struct mm_struct *old) +{ + struct anon *anon = old->anon; + atomic_inc(&anon->count); + new->anon = anon; + spin_lock(&anon->lock); + list_add_tail_rcu(&new->anon_list, &anon->list); + spin_unlock(&anon->lock); +} + +void exit_rmap(struct mm_struct *mm) +{ + struct anon *anon = mm->anon; + + spin_lock(&anon->lock); + mm->anon = NULL; + wmb(); + list_del_rcu(&mm->anon_list); + spin_unlock(&anon->lock); + + if (!atomic_dec_and_test(&anon->count)) + return; + + /* RCU may not have quiesced things just yet */ + INIT_LIST_HEAD(&anon->list); + atomic_set(&anon->count, 2); + kmem_cache_free(anon_cache, anon); +} /** - ** VM stuff below this comment + ** Functions for manipulating struct rmap_chain. **/ -/** - * page_referenced - test if the page was referenced - * @page: the page to test - * - * Quick test_and_clear_referenced for all mappings to a page, - * returns the number of processes which referenced the page. - * Caller needs to hold the pte_chain_lock. - * - * If the page has a single-entry pte_chain, collapse that back to a PageDirect - * representation. This way, it's only done under memory pressure. - */ -int page_referenced(struct page * page) +/* + * Boolean rmap_get_cpu() ensures the cpu has an rmap_chain cached + * in case it is needed later while lock is held. It is never needed + * when page_add_rmap() is adding a freshly allocated anon page. + * caller does put_cpu() once ->page_table_lock prevents preemption. + */ +int rmap_get_cpu(void) +{ + struct rmap_chain **cache, *chain; + might_sleep(); + cache = &per_cpu(rmap_chain, get_cpu()); + if (*cache) + return 1; + put_cpu(); + chain = kmem_cache_alloc(rmap_chain_cache, GFP_KERNEL); + cache = &per_cpu(rmap_chain, get_cpu()); + if (*cache) + kmem_cache_free(rmap_chain_cache, chain); + else if (chain) + *cache = chain; + else { + put_cpu(); + return 0; + } + return 1; +} + +static struct rmap_chain *get_rmap_chain(void) { - struct pte_chain *pc; - int referenced = 0; + struct rmap_chain **cache, *chain; + int i; - if (TestClearPageReferenced(page)) - referenced++; + /* + * ->page_table_lock and rmap_lock are held, no need to get_cpu() + */ + cache = &per_cpu(rmap_chain, smp_processor_id()); + chain = *cache; + *cache = NULL; + for (i = 0; i < NRSLOT; ++i) + chain->slot[i] = NOADDR; + chain->next = NULL; + return chain; +} - if (PageDirect(page)) { - pte_t *pte = rmap_ptep_map(page->pte.direct); - if (ptep_test_and_clear_young(pte)) - referenced++; - rmap_ptep_unmap(pte); - } else { - int nr_chains = 0; +void add_rmap_address(struct page *page, unsigned long address) +{ + struct rmap_chain *chain; + int i = 1; - /* Check all the page tables mapping this page. 
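
exec_rmap(), dup_rmap() and exit_rmap() above give every exec'd mm a struct anon that forked children share: each sharer sits on the anon's RCU-protected list, and the object is torn down when the last sharer exits. The model below simplifies the reference counting (the patch's ctor-based count handling is more subtle) and just shows the share/exit lifecycle, with illustrative names:

/* One shared anon object per exec'd address space, joined by forks and
 * released by exits.
 */
#include <stdio.h>
#include <stdlib.h>

struct anon_model { int count; int nr_mms; };

static struct anon_model *exec_rmap_model(void)
{
        struct anon_model *a = calloc(1, sizeof(*a));
        a->count = 1;
        a->nr_mms = 1;                  /* list_add_rcu(&mm->anon_list, ...) */
        return a;
}

static void dup_rmap_model(struct anon_model *a)
{
        a->count++;                     /* atomic_inc(&anon->count) */
        a->nr_mms++;                    /* list_add_tail_rcu(...) */
}

static void exit_rmap_model(struct anon_model *a)
{
        a->nr_mms--;                    /* list_del_rcu(&mm->anon_list) */
        if (--a->count == 0)
                free(a);                /* all sharers gone */
}

int main(void)
{
        struct anon_model *a = exec_rmap_model();
        dup_rmap_model(a);              /* fork() */
        exit_rmap_model(a);             /* child exits */
        exit_rmap_model(a);             /* parent exits: anon freed */
        printf("anon lifetime followed the mm sharers\n");
        return 0;
}
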
*/ - for (pc = page->pte.chain; pc; pc = pte_chain_next(pc)) { - int i; - - for (i = NRPTE-1; i >= 0; i--) { - pte_addr_t pte_paddr = pc->ptes[i]; - pte_t *p; - - if (!pte_paddr) - break; - p = rmap_ptep_map(pte_paddr); - if (ptep_test_and_clear_young(p)) - referenced++; - rmap_ptep_unmap(p); - nr_chains++; + if (PageChained(page)) { + /* + * Check lest duplicates arise, and find a free slot at the end + */ + for (chain = page->rmap.chain; ; chain = chain->next, i = 0) { + for (; i < NRSLOT; ++i) { + if (chain->slot[i] == NOADDR) + goto set; + else if (chain->slot[i] == address) + return; } + if (!chain->next) + chain->next = get_rmap_chain(); } - if (nr_chains == 1) { - pc = page->pte.chain; - page->pte.direct = pc->ptes[NRPTE-1]; - SetPageDirect(page); - pc->ptes[NRPTE-1] = 0; - __pte_chain_free(pc); + } else { + SetPageChained(page); + chain = get_rmap_chain(); + chain->slot[0] = page->rmap.count; + page->rmap.chain = chain; + } +set: + chain->slot[i] = address; +} + +static int +next_rmap_address(struct page *page, struct vm_area_struct *vma, + struct addresser *addresser) +{ + if (addresser->index == 0) { + /* set chain and index for next call */ + addresser->chain = PageChained(page) ? page->rmap.chain : NULL; + addresser->index = 1; + if (vma) { + addresser->address = vma_address(page, vma); + if (addresser->address != NOADDR) + return 1; + } else { + addresser->address = page->index; + return 1; } } - return referenced; + while (addresser->chain) { + if (addresser->index >= NRSLOT) + addresser->index = 0; + addresser->address = + addresser->chain->slot[addresser->index]; + if (addresser->address == NOADDR) + break; + addresser->index++; + if (addresser->index >= NRSLOT) + addresser->chain = addresser->chain->next; + if (!vma || addresser->address != vma_address(page, vma)) + return 1; + } + return 0; } -/** - * page_add_rmap - add reverse mapping entry to a page - * @page: the page to add the mapping to - * @ptep: the page table entry mapping this page - * - * Add a new pte reverse mapping to a page. - * The caller needs to hold the mm->page_table_lock. - */ -struct pte_chain * -page_add_rmap(struct page *page, pte_t *ptep, struct pte_chain *pte_chain) +void clear_page_chained(struct page *page) { - pte_addr_t pte_paddr = ptep_to_paddr(ptep); - struct pte_chain *cur_pte_chain; + struct rmap_chain *chain = page->rmap.chain; - if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) - return pte_chain; + /* + * This is only called when mapcount goes to 0, which + * means it's possible for a page to accumulate a large + * chain of stale addresses. But normally try_to_unmap_one() + * will bring the count to 0 and free them all here. 
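
add_rmap_address(), next_rmap_address() and clear_page_chained() above implement the overflow scheme for pages mapped at more than one user address: slot[0] of the first rmap_chain block preserves the plain mapcount, the remaining slots hold distinct addresses (NOADDR marking free slots), and the chain grows a block at a time. A self-contained model of that slot layout and duplicate filtering (NRSLOT and NOADDR mirror the patch's names; everything else is illustrative):

/* Store distinct addresses in fixed-size blocks, reserving slot 0 of
 * the first block for the saved mapcount, growing by whole blocks.
 */
#include <stdio.h>
#include <stdlib.h>

#define NRSLOT 4
#define NOADDR (~0UL)

struct chain { unsigned long slot[NRSLOT]; struct chain *next; };

static struct chain *new_chain(void)
{
        struct chain *c = malloc(sizeof(*c));
        for (int i = 0; i < NRSLOT; i++)
                c->slot[i] = NOADDR;
        c->next = NULL;
        return c;
}

static void add_address(struct chain *head, unsigned long addr)
{
        int i = 1;                              /* slot 0 of head is reserved */
        for (struct chain *c = head; ; c = c->next, i = 0) {
                for (; i < NRSLOT; i++) {
                        if (c->slot[i] == addr)
                                return;         /* already recorded */
                        if (c->slot[i] == NOADDR) {
                                c->slot[i] = addr;   /* first free slot */
                                return;
                        }
                }
                if (!c->next)
                        c->next = new_chain();  /* grow by one block */
        }
}

int main(void)
{
        struct chain *head = new_chain();
        head->slot[0] = 3;                      /* preserved mapcount */
        unsigned long addrs[] = { 0x1000, 0x2000, 0x2000, 0x3000, 0x4000 };
        for (int i = 0; i < 5; i++)
                add_address(head, addrs[i]);
        int blocks = 0;
        for (struct chain *c = head; c; c = c->next)
                blocks++;
        printf("4 distinct addresses stored in %d blocks\n", blocks);
        return 0;
}
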
+ */ + page->rmap.count = chain->slot[0]; + ClearPageChained(page); + do { + struct rmap_chain *next = chain->next; + rmap_chain_dtor(chain); + kmem_cache_free(rmap_chain_cache, chain); + chain = next; + } while (chain); +} - pte_chain_lock(page); +/** + ** Subfunctions of page_referenced(): page_referenced_one() called + ** repeatedly from page_referenced_obj(); + **/ - if (page->pte.direct == 0) { - page->pte.direct = pte_paddr; - SetPageDirect(page); - inc_page_state(nr_mapped); - goto out; - } +static inline int page_referenced_one(struct page *page, struct mm_struct *mm, + struct addresser *addresser) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + int referenced = 0; - if (PageDirect(page)) { - /* Convert a direct pointer into a pte_chain */ - ClearPageDirect(page); - pte_chain->ptes[NRPTE-1] = page->pte.direct; - pte_chain->ptes[NRPTE-2] = pte_paddr; - pte_chain->next_and_idx = pte_chain_encode(NULL, NRPTE-2); - page->pte.direct = 0; - page->pte.chain = pte_chain; - pte_chain = NULL; /* We consumed it */ + if (!spin_trylock(&mm->page_table_lock)) { + referenced = 1; goto out; } - cur_pte_chain = page->pte.chain; - if (cur_pte_chain->ptes[0]) { /* It's full */ - pte_chain->next_and_idx = pte_chain_encode(cur_pte_chain, - NRPTE - 1); - page->pte.chain = pte_chain; - pte_chain->ptes[NRPTE-1] = pte_paddr; - pte_chain = NULL; /* We consumed it */ + pgd = pgd_offset(mm, addresser->address); + if (!pgd_present(*pgd)) + goto out_unlock; + + pmd = pmd_offset_map(pgd, addresser->address); + if (!pmd) goto out; - } - cur_pte_chain->ptes[pte_chain_idx(cur_pte_chain) - 1] = pte_paddr; - cur_pte_chain->next_and_idx--; + + if (!pmd_present(*pmd)) + goto out_unmap_pmd; + + pte = pte_offset_map(pmd, addresser->address); + if (!pte_present(*pte)) + goto out_unmap_pte; + + if (page_to_pfn(page) != pte_pfn(*pte)) + goto out_unmap_pte; + + referenced = ptep_test_and_clear_young(pte); + addresser->count--; + +out_unmap_pmd: + pmd_unmap(pmd); +out_unmap_pte: + pte_unmap(pte); +out_unlock: + spin_unlock(&mm->page_table_lock); out: - pte_chain_unlock(page); - return pte_chain; + return referenced; } -/** - * page_remove_rmap - take down reverse mapping to a page - * @page: page to remove mapping from - * @ptep: page table entry to remove - * - * Removes the reverse mapping from the pte_chain of the page, - * after that the caller can clear the page table entry and free - * the page. - * Caller needs to hold the mm->page_table_lock. - */ -void page_remove_rmap(struct page *page, pte_t *ptep) +static inline int +page_referenced_anon(struct page *page, struct addresser *addresser) { - pte_addr_t pte_paddr = ptep_to_paddr(ptep); - struct pte_chain *pc; + struct mm_struct *mm; + struct anon *anon; + int referenced = 0; - if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) - return; + rcu_read_lock(); /* anon->lock */ + + mm = page_mm(page); + anon = mm->anon; + if (!anon) + goto out; - pte_chain_lock(page); + list_for_each_entry_rcu(mm, &anon->list, anon_list) { + if (!mm->anon || !mm->rss) + continue; + addresser->index = 0; + while (next_rmap_address(page, NULL, addresser)) { + referenced += page_referenced_one(page, mm, addresser); + if (!addresser->count) + goto out; + } + } +out: + rcu_read_unlock(); /* anon->lock */ + return referenced; +} - if (!page_mapped(page)) - goto out_unlock; /* remap_page_range() from a driver? 
*/ +static inline int page_referenced_obj(struct page *page, struct addresser *addresser) +{ + struct address_space *mapping = page_mapping(page); + struct vm_area_struct *vma; + int referenced = 0; - if (PageDirect(page)) { - if (page->pte.direct == pte_paddr) { - page->pte.direct = 0; - ClearPageDirect(page); - goto out; + rcu_read_lock(); /* mapping->i_shared_lock */ + list_for_each_entry_rcu(vma, &mapping->i_mmap, shared) { + if (vma->vm_flags & VM_DEAD) + continue; + if (!vma->vm_mm->rss) + continue; + addresser->index = 0; + while (next_rmap_address(page, vma, addresser)) { + referenced += page_referenced_one(page, vma->vm_mm, addresser); + if (!addresser->count) + goto out; } - } else { - struct pte_chain *start = page->pte.chain; - struct pte_chain *next; - int victim_i = -1; - - for (pc = start; pc; pc = next) { - int i; - - next = pte_chain_next(pc); - if (next) - prefetch(next); - for (i = pte_chain_idx(pc); i < NRPTE; i++) { - pte_addr_t pa = pc->ptes[i]; - - if (victim_i == -1) - victim_i = i; - if (pa != pte_paddr) - continue; - pc->ptes[i] = start->ptes[victim_i]; - start->ptes[victim_i] = 0; - if (victim_i == NRPTE-1) { - /* Emptied a pte_chain */ - page->pte.chain = pte_chain_next(start); - __pte_chain_free(start); - } else { - start->next_and_idx++; - } + } + + list_for_each_entry_rcu(vma, &mapping->i_mmap_shared, shared) { + if (vma->vm_flags & VM_DEAD) + continue; + if (!vma->vm_mm->rss) + continue; + addresser->index = 0; + while (next_rmap_address(page, vma, addresser)) { + referenced += page_referenced_one(page, vma->vm_mm, addresser); + if (!addresser->count) goto out; - } } } out: - if (!page_mapped(page)) - dec_page_state(nr_mapped); -out_unlock: - pte_chain_unlock(page); - return; + rcu_read_unlock(); /* mapping->i_shared_lock */ + return referenced; } /** - * try_to_unmap_one - worker function for try_to_unmap - * @page: page to unmap - * @ptep: page table entry to unmap from page + * page_referenced - test if the page was referenced + * @page: the page to test * - * Internal helper function for try_to_unmap, called for each page - * table entry mapping a page. Because locking order here is opposite - * to the locking order used by the page fault path, we use trylocks. - * Locking: - * page lock shrink_list(), trylock - * pte_chain_lock shrink_list() - * mm->page_table_lock try_to_unmap_one(), trylock + * returns the number of ptes which referenced the page. + * Caller needs to hold the rmap_lock. 
*/ -static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t)); -static int try_to_unmap_one(struct page * page, pte_addr_t paddr) +int page_referenced(struct page * page) { - pte_t *ptep = rmap_ptep_map(paddr); - unsigned long address = ptep_to_address(ptep); - struct mm_struct * mm = ptep_to_mm(ptep); - struct vm_area_struct * vma; - pte_t pte; - int ret; + int referenced = !!TestClearPageReferenced(page); + struct addresser addresser; + + addresser.count = page_mapcount(page); + if (!addresser.count || !page->__mapping) + return 0; + else if (PageAnon(page)) + referenced += page_referenced_anon(page, &addresser); + else + referenced += page_referenced_obj(page, &addresser); + return referenced; +} + +void page_turn_rmap(struct page *page, struct vm_area_struct *vma) +{ + struct mm_struct *old, *new; + old = page_mm(page); + new = vma->vm_mm; + + BUG_ON(!PageAnon(page)); + BUG_ON(page_mapcount(page) != 1); + + if (old == new) + return; + + rmap_lock(page); + set_page_mapping(page, new); + rmap_unlock(page); +} - if (!mm) - BUG(); +void page_move_rmap(struct page *page, struct vm_area_struct *vma, + unsigned long old, unsigned long new) +{ + if (!page_mapped(page) || !page->__mapping) + return; + + rmap_lock(page); + + if (PageAnon(page)) { + /* + * Don't check page_mapcount(page) == 1 here + * because the mapcount could be 1 but the page + * could still have a chain, and our new address + * in that chain. + */ + if (page->rmap.count == 1) + page->index = new; + else if (new != page->index) + add_rmap_address(page, new); + } else { + /* + * Just in case things are nonlinear. + */ + if (old != vma_address(page, vma)) + add_rmap_address(page, new); + } + + rmap_unlock(page); +} + +static int try_to_unmap_one(struct page *page, struct mm_struct *mm, + struct addresser *addresser, struct vm_area_struct *vma) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + pte_t pteval; + unsigned long address = addresser->address; + int ret = SWAP_AGAIN; /* * We need the page_table_lock to protect us from page faults, * munmap, fork, etc... */ - if (!spin_trylock(&mm->page_table_lock)) { - rmap_ptep_unmap(ptep); - return SWAP_AGAIN; - } - + if (!spin_trylock(&mm->page_table_lock)) + goto out; - /* During mremap, it's possible pages are not in a VMA. */ - vma = find_vma(mm, address); - if (!vma) { + /* If the page is mlock()'d, we can't unmap it. */ + if (!vma) + vma = find_vma(mm, address); + if (!vma || (vma->vm_flags & VM_LOCKED)) { ret = SWAP_FAIL; goto out_unlock; } - /* The page is mlock()d, we cannot swap it out. */ - if (vma->vm_flags & VM_LOCKED) { - ret = SWAP_FAIL; + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) goto out_unlock; - } + pmd = pmd_offset_map(pgd, address); + if (!pmd_present(*pmd)) + goto out_unmap_pmd; + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) + goto out_unmap_pte; + + if (page_to_pfn(page) != pte_pfn(*pte)) + goto out_unmap_pte; + + addresser->count--; /* Nuke the page table entry. */ flush_cache_page(vma, address); - pte = ptep_get_and_clear(ptep); + pteval = vm_ptep_get_and_clear(vma, pte, address); flush_tlb_page(vma, address); - if (PageSwapCache(page)) { + if (PageAnon(page)) { /* * Store the swap location in the pte. * See handle_pte_fault() ... 
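/*
 * Illustrative sketch (not part of the patch): page_referenced()
 * above seeds an addresser with the page's mapcount, and the
 * per-mapping walkers stop as soon as that many ptes have been
 * visited.  The vma list and the young-bit test are stubbed out
 * here; only the early-exit bookkeeping is modelled.
 */
#include <stdio.h>

struct addresser { int count; };

/* Stand-in for page_referenced_one(): account one mapping, report young. */
static int check_one_pte(struct addresser *a, int young)
{
        a->count--;
        return young;
}

static int referenced_scan(struct addresser *a, const int *young, int nvmas)
{
        int referenced = 0;
        int i;

        for (i = 0; i < nvmas; i++) {
                referenced += check_one_pte(a, young[i]);
                if (!a->count)
                        break;          /* every known mapping has been seen */
        }
        return referenced;
}

int main(void)
{
        int young[] = { 1, 0, 1, 1 };
        struct addresser a = { 2 };     /* page mapped twice */

        /* only the first two entries are ever examined */
        printf("referenced=%d\n", referenced_scan(&a, young, 4));
        return 0;
}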
*/ - swp_entry_t entry = { .val = page->index }; + swp_entry_t entry = { .val = page->private }; + BUG_ON(!PageSwapCache(page)); swap_duplicate(entry); - set_pte(ptep, swp_entry_to_pte(entry)); - BUG_ON(pte_file(*ptep)); + vm_set_pte(vma, pte, swp_entry_to_pte(entry), address); + BUG_ON(pte_file(*pte)); } else { - unsigned long pgidx; /* - * If a nonlinear mapping then store the file page offset - * in the pte. + * If a nonlinear mapping from sys_remap_file_pages(), + * then store the file page offset in the pte. */ - pgidx = (address - vma->vm_start) >> PAGE_SHIFT; - pgidx += vma->vm_pgoff; - pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; - if (page->index != pgidx) { - set_pte(ptep, pgoff_to_pte(page->index)); - BUG_ON(!pte_file(*ptep)); + if (address != vma_address(page, vma)) { + vm_set_pte(vma, pte, pgoff_to_pte(page->index), address); + BUG_ON(!pte_file(*pte)); } } /* Move the dirty bit to the physical page now the pte is gone. */ - if (pte_dirty(pte)) + if (pte_dirty(pteval)) set_page_dirty(page); - mm->rss--; + BUG_ON(!page_mapcount(page)); + if (!PageChained(page)) + page->rmap.count--; + else { + page->rmap.chain->slot[0]--; + if (!page->rmap.chain->slot[0]) + clear_page_chained(page); + } page_cache_release(page); - ret = SWAP_SUCCESS; + mm->rss--; +out_unmap_pmd: + pmd_unmap(pmd); +out_unmap_pte: + pte_unmap(pte); out_unlock: - rmap_ptep_unmap(ptep); spin_unlock(&mm->page_table_lock); +out: return ret; } -/** - * try_to_unmap - try to remove all page table mappings to a page - * @page: the page to get unmapped - * - * Tries to remove all the page table entries which are mapping this - * page, used in the pageout path. Caller must hold the page lock - * and its pte chain lock. Return values are: - * - * SWAP_SUCCESS - we succeeded in removing all mappings - * SWAP_AGAIN - we missed a trylock, try again later - * SWAP_FAIL - the page is unswappable - */ -int try_to_unmap(struct page * page) +static inline int try_to_unmap_anon(struct page *page, struct addresser *addresser) { - struct pte_chain *pc, *next_pc, *start; - int ret = SWAP_SUCCESS; - int victim_i = -1; - - /* This page should not be on the pageout lists. */ - if (PageReserved(page)) - BUG(); - if (!PageLocked(page)) - BUG(); - /* We need backing store to swap out a page. */ - if (!page->mapping) - BUG(); - - if (PageDirect(page)) { - ret = try_to_unmap_one(page, page->pte.direct); - if (ret == SWAP_SUCCESS) { - page->pte.direct = 0; - ClearPageDirect(page); - } + struct mm_struct *mm; + struct anon *anon; + int ret = SWAP_AGAIN; + + rcu_read_lock(); /* anon->lock */ + + mm = page_mm(page); + anon = mm->anon; + if (!anon) goto out; - } - start = page->pte.chain; - for (pc = start; pc; pc = next_pc) { - int i; - - next_pc = pte_chain_next(pc); - if (next_pc) - prefetch(next_pc); - for (i = pte_chain_idx(pc); i < NRPTE; i++) { - pte_addr_t pte_paddr = pc->ptes[i]; - - if (!pte_paddr) - continue; - if (victim_i == -1) - victim_i = i; - - switch (try_to_unmap_one(page, pte_paddr)) { - case SWAP_SUCCESS: - /* - * Release a slot. If we're releasing the - * first pte in the first pte_chain then - * pc->ptes[i] and start->ptes[victim_i] both - * refer to the same thing. It works out. - */ - pc->ptes[i] = start->ptes[victim_i]; - start->ptes[victim_i] = 0; - victim_i++; - if (victim_i == NRPTE) { - page->pte.chain = pte_chain_next(start); - __pte_chain_free(start); - start = page->pte.chain; - victim_i = 0; - } else { - start->next_and_idx++; - } - break; - case SWAP_AGAIN: - /* Skip this pte, remembering status. 
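/*
 * Illustrative sketch (not part of the patch): once try_to_unmap_one()
 * above clears a pte, it leaves either a swap entry (anon pages) or a
 * file offset (nonlinear file mappings) behind so the fault path can
 * restore the page later.  The kernel encodings are arch-specific
 * (swp_entry_to_pte(), pgoff_to_pte()); the bit layout below is
 * invented purely for the model.
 */
#include <stdio.h>

#define PTE_FILE_BIT    0x1UL
#define PTE_SWAP_BIT    0x2UL

static unsigned long swp_entry_to_pte_model(unsigned long swp_val)
{
        return (swp_val << 2) | PTE_SWAP_BIT;
}

static unsigned long pgoff_to_pte_model(unsigned long pgoff)
{
        return (pgoff << 2) | PTE_FILE_BIT;
}

static void decode(unsigned long pte)
{
        if (pte & PTE_SWAP_BIT)
                printf("swap entry %lu\n", pte >> 2);
        else if (pte & PTE_FILE_BIT)
                printf("file offset %lu\n", pte >> 2);
        else
                printf("present or empty pte\n");
}

int main(void)
{
        decode(swp_entry_to_pte_model(7));      /* anon page, now in swap */
        decode(pgoff_to_pte_model(123));        /* sys_remap_file_pages() case */
        return 0;
}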
*/ - ret = SWAP_AGAIN; - continue; - case SWAP_FAIL: - ret = SWAP_FAIL; + list_for_each_entry_rcu(mm, &anon->list, anon_list) { + if (!mm->anon) + continue; + addresser->index = 0; + while (next_rmap_address(page, NULL, addresser)) { + ret = try_to_unmap_one(page, mm, addresser, NULL); + if (ret == SWAP_FAIL || !addresser->count) goto out; - } } } out: - if (!page_mapped(page)) - dec_page_state(nr_mapped); + rcu_read_unlock(); /* anon->lock */ return ret; } -/** - ** No more VM stuff below this comment, only pte_chain helper - ** functions. - **/ - -static void pte_chain_ctor(void *p, kmem_cache_t *cachep, unsigned long flags) +static inline int try_to_unmap_obj(struct page *page, struct addresser *addresser) { - struct pte_chain *pc = p; + struct address_space *mapping; + struct vm_area_struct *vma; + int ret = SWAP_AGAIN; + + mapping = page_mapping(page); + + rcu_read_lock(); /* mapping->i_shared_lock */ + + list_for_each_entry_rcu(vma, &mapping->i_mmap, shared) { + if (vma->vm_flags & VM_DEAD) + continue; + if (!vma->vm_mm->rss) + continue; + addresser->index = 0; + while (next_rmap_address(page, vma, addresser)) { + ret = try_to_unmap_one(page, vma->vm_mm, addresser, vma); + if (ret == SWAP_FAIL || !addresser->count) + goto out; + } + } - memset(pc, 0, sizeof(*pc)); + list_for_each_entry_rcu(vma, &mapping->i_mmap_shared, shared) { + if (vma->vm_flags & VM_DEAD) + continue; + if (!vma->vm_mm->rss) + continue; + addresser->index = 0; + while (next_rmap_address(page, vma, addresser)) { + ret = try_to_unmap_one(page, vma->vm_mm, addresser, vma); + if (ret == SWAP_FAIL || !addresser->count) + goto out; + } + } +out: + rcu_read_unlock(); /* mapping->i_shared_lock */ + return ret; } -DEFINE_PER_CPU(struct pte_chain *, local_pte_chain) = 0; - /** - * __pte_chain_free - free pte_chain structure - * @pte_chain: pte_chain struct to free - */ -void __pte_chain_free(struct pte_chain *pte_chain) -{ - int cpu = get_cpu(); - struct pte_chain **pte_chainp; - - if (pte_chain->next_and_idx) - pte_chain->next_and_idx = 0; - pte_chainp = &per_cpu(local_pte_chain, cpu); - if (*pte_chainp) - kmem_cache_free(pte_chain_cache, *pte_chainp); - *pte_chainp = pte_chain; - put_cpu(); -} - -/* - * pte_chain_alloc(): allocate a pte_chain structure for use by page_add_rmap(). + * try_to_unmap - try to remove all page table mappings to a page + * @page: the page to get unmapped * - * The caller of page_add_rmap() must perform the allocation because - * page_add_rmap() is invariably called under spinlock. Often, page_add_rmap() - * will not actually use the pte_chain, because there is space available in one - * of the existing pte_chains which are attached to the page. So the case of - * allocating and then freeing a single pte_chain is specially optimised here, - * with a one-deep per-cpu cache. + * Tries to remove all the page table entries which are mapping this + * page, used in the pageout path. Caller must hold the page lock + * and its pte chain lock. 
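/*
 * Illustrative sketch (not part of the patch): the removed
 * pte_chain_alloc()/__pte_chain_free() pair above kept a one-deep
 * per-cpu cache, and rmap_get_cpu() earlier in this file keeps the
 * same idea for rmap_chains: allocate while sleeping is still legal,
 * park the object, and consume it later under a spinlock.  This is a
 * single-threaded stand-in with one global slot instead of per-cpu
 * data.
 */
#include <stdlib.h>

struct chain { struct chain *next; };

static struct chain *parked;            /* one slot, "per cpu" in the kernel */

/* May sleep: make sure an object is parked for later use. */
static int reserve_chain(void)
{
        if (parked)
                return 1;
        parked = calloc(1, sizeof(*parked));
        return parked != NULL;
}

/* Must not sleep: take the parked object, if any. */
static struct chain *take_chain(void)
{
        struct chain *c = parked;

        parked = NULL;
        return c;
}

int main(void)
{
        if (!reserve_chain())
                return 1;
        /* ...later, with the lock held in the kernel case... */
        free(take_chain());
        return 0;
}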
Return values are: + * + * SWAP_SUCCESS - we succeeded in removing all mappings + * SWAP_AGAIN - we missed a trylock, try again later + * SWAP_FAIL - the page is unswappable */ -struct pte_chain *pte_chain_alloc(int gfp_flags) +int try_to_unmap(struct page *page) { - int cpu; - struct pte_chain *ret; - struct pte_chain **pte_chainp; - - if (gfp_flags & __GFP_WAIT) - might_sleep(); - - cpu = get_cpu(); - pte_chainp = &per_cpu(local_pte_chain, cpu); - if (*pte_chainp) { - ret = *pte_chainp; - *pte_chainp = NULL; - put_cpu(); - } else { - put_cpu(); - ret = kmem_cache_alloc(pte_chain_cache, gfp_flags); + struct addresser addresser; + int ret; + + BUG_ON(PageReserved(page)); + BUG_ON(!PageLocked(page)); + BUG_ON(!page_mapped(page)); + + addresser.count = page_mapcount(page); + if (PageAnon(page)) + ret = try_to_unmap_anon(page, &addresser); + else + ret = try_to_unmap_obj(page, &addresser); + if (!page_mapped(page)) { + dec_page_state(nr_mapped); + if (PageAnon(page)) + clear_page_anon(page); + ret = SWAP_SUCCESS; } return ret; } - -void __init pte_chain_init(void) -{ - pte_chain_cache = kmem_cache_create( "pte_chain", - sizeof(struct pte_chain), - 0, - SLAB_MUST_HWCACHE_ALIGN, - pte_chain_ctor, - NULL); - - if (!pte_chain_cache) - panic("failed to create pte_chain cache!\n"); -} diff -prauN linux-2.5.73/mm/shmem.c wli-2.5.73-29/mm/shmem.c --- linux-2.5.73/mm/shmem.c 2003-06-22 11:32:43.000000000 -0700 +++ wli-2.5.73-29/mm/shmem.c 2003-06-23 10:46:31.000000000 -0700 @@ -693,7 +693,7 @@ static int shmem_writepage(struct page * BUG_ON(!PageLocked(page)); BUG_ON(page_mapped(page)); - mapping = page->mapping; + mapping = page_mapping(page); index = page->index; inode = mapping->host; info = SHMEM_I(inode); @@ -1108,7 +1108,7 @@ static struct inode_operations shmem_sym static int shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; return shmem_getpage(inode, page->index, &page, SGP_WRITE); } @@ -1761,7 +1761,7 @@ static void destroy_inodecache(void) static struct address_space_operations shmem_aops = { .writepage = shmem_writepage, - .set_page_dirty = __set_page_dirty_nobuffers, + .set_page_dirty = set_page_dirty_nobuffers, #ifdef CONFIG_TMPFS .prepare_write = shmem_prepare_write, .commit_write = simple_commit_write, diff -prauN linux-2.5.73/mm/swap_state.c wli-2.5.73-29/mm/swap_state.c --- linux-2.5.73/mm/swap_state.c 2003-06-22 11:32:43.000000000 -0700 +++ wli-2.5.73-29/mm/swap_state.c 2003-06-24 02:46:46.000000000 -0700 @@ -21,22 +21,16 @@ static struct backing_dev_info swap_back .memory_backed = 1, /* Does not contribute to dirty memory */ }; -extern struct address_space_operations swap_aops; +static struct address_space_operations swap_aops = { + .writepage = swap_writepage, + .readpage = swap_readpage, +}; struct address_space swapper_space = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC), - .page_lock = SPIN_LOCK_UNLOCKED, - .clean_pages = LIST_HEAD_INIT(swapper_space.clean_pages), - .dirty_pages = LIST_HEAD_INIT(swapper_space.dirty_pages), - .io_pages = LIST_HEAD_INIT(swapper_space.io_pages), - .locked_pages = LIST_HEAD_INIT(swapper_space.locked_pages), + .page_lock = MAPPING_RW_LOCK_UNLOCKED, .a_ops = &swap_aops, .backing_dev_info = &swap_backing_dev_info, - .i_mmap = LIST_HEAD_INIT(swapper_space.i_mmap), - .i_mmap_shared = LIST_HEAD_INIT(swapper_space.i_mmap_shared), - .i_shared_sem = __MUTEX_INITIALIZER(swapper_space.i_shared_sem), - .private_lock = 
SPIN_LOCK_UNLOCKED, - .private_list = LIST_HEAD_INIT(swapper_space.private_list), }; #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) @@ -58,30 +52,50 @@ void show_swap_cache_info(void) swap_cache_info.noent_race, swap_cache_info.exist_race); } +static int __add_to_swap_cache(struct page *page, swp_entry_t entry) +{ + int error; + + BUG_ON(PageSwapCache(page)); + BUG_ON(PagePrivate(page)); + error = radix_tree_preload(GFP_ATOMIC); + if (error) + return error; + + page_cache_get(page); + mapping_wrlock(&swapper_space.page_lock); + error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); + if (error) + page_cache_release(page); + else { + SetPageLocked(page); + SetPageSwapCache(page); + page->private = entry.val; + inc_page_state(nr_swapcache); + } + mapping_wrunlock(&swapper_space.page_lock); + radix_tree_preload_end(); + return error; +} + static int add_to_swap_cache(struct page *page, swp_entry_t entry) { int error; - if (page->mapping) - BUG(); if (!swap_duplicate(entry)) { INC_CACHE_INFO(noent_race); return -ENOENT; } - error = add_to_page_cache(page, &swapper_space, entry.val, GFP_KERNEL); + error = __add_to_swap_cache(page, entry); /* * Anon pages are already on the LRU, we don't run lru_cache_add here. */ - if (error != 0) { + if (error) { swap_free(entry); if (error == -EEXIST) INC_CACHE_INFO(exist_race); return error; } - if (!PageLocked(page)) - BUG(); - if (!PageSwapCache(page)) - BUG(); INC_CACHE_INFO(add_total); return 0; } @@ -95,7 +109,9 @@ void __delete_from_swap_cache(struct pag BUG_ON(!PageLocked(page)); BUG_ON(!PageSwapCache(page)); BUG_ON(PageWriteback(page)); - __remove_from_page_cache(page); + radix_tree_delete(&swapper_space.page_tree, page->private); + ClearPageSwapCache(page); + dec_page_state(nr_swapcache); INC_CACHE_INFO(del_total); } @@ -139,8 +155,7 @@ int add_to_swap(struct page * page) /* * Add it to the swap cache and mark it dirty */ - err = add_to_page_cache(page, &swapper_space, - entry.val, GFP_ATOMIC); + err = __add_to_swap_cache(page, entry); if (pf_flags & PF_MEMALLOC) current->flags |= PF_MEMALLOC; @@ -148,8 +163,7 @@ int add_to_swap(struct page * page) switch (err) { case 0: /* Success */ SetPageUptodate(page); - ClearPageDirty(page); - set_page_dirty(page); + SetPageDirty(page); INC_CACHE_INFO(add_total); return 1; case -EEXIST: @@ -175,15 +189,16 @@ void delete_from_swap_cache(struct page { swp_entry_t entry; + BUG_ON(!PageSwapCache(page)); BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); BUG_ON(PagePrivate(page)); - entry.val = page->index; + entry.val = page->private; - spin_lock(&swapper_space.page_lock); + mapping_wrlock(&swapper_space.page_lock); __delete_from_swap_cache(page); - spin_unlock(&swapper_space.page_lock); + mapping_wrunlock(&swapper_space.page_lock); swap_free(entry); page_cache_release(page); @@ -191,27 +206,11 @@ void delete_from_swap_cache(struct page int move_to_swap_cache(struct page *page, swp_entry_t entry) { - struct address_space *mapping = page->mapping; - int err; - - spin_lock(&swapper_space.page_lock); - spin_lock(&mapping->page_lock); - - err = radix_tree_insert(&swapper_space.page_tree, entry.val, page); - if (!err) { - __remove_from_page_cache(page); - ___add_to_page_cache(page, &swapper_space, entry.val); - } - - spin_unlock(&mapping->page_lock); - spin_unlock(&swapper_space.page_lock); + int err = __add_to_swap_cache(page, entry); if (!err) { - if (!swap_duplicate(entry)) - BUG(); - /* shift page from clean_pages to dirty_pages list */ - BUG_ON(PageDirty(page)); - 
set_page_dirty(page); + BUG_ON(!swap_duplicate(entry)); + SetPageDirty(page); INC_CACHE_INFO(add_total); } else if (err == -EEXIST) INC_CACHE_INFO(exist_race); @@ -221,29 +220,13 @@ int move_to_swap_cache(struct page *page int move_from_swap_cache(struct page *page, unsigned long index, struct address_space *mapping) { - swp_entry_t entry; - int err; - - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); - BUG_ON(PagePrivate(page)); - - entry.val = page->index; - - spin_lock(&swapper_space.page_lock); - spin_lock(&mapping->page_lock); - - err = radix_tree_insert(&mapping->page_tree, index, page); - if (!err) { - __delete_from_swap_cache(page); - ___add_to_page_cache(page, mapping, index); + int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC); + if (err == -EEXIST) { + INC_CACHE_INFO(exist_race); + err = 0; } - - spin_unlock(&mapping->page_lock); - spin_unlock(&swapper_space.page_lock); - if (!err) { - swap_free(entry); + delete_from_swap_cache(page); /* shift page from clean_pages to dirty_pages list */ ClearPageDirty(page); set_page_dirty(page); @@ -307,11 +290,17 @@ void free_pages_and_swap_cache(struct pa * lock getting page table operations atomic even if we drop the page * lock before returning. */ -struct page * lookup_swap_cache(swp_entry_t entry) +struct page *lookup_swap_cache(swp_entry_t entry) { - struct page *found; + struct page *page; - found = find_get_page(&swapper_space, entry.val); + mapping_rdlock(&swapper_space.page_lock); + page = radix_tree_lookup(&swapper_space.page_tree, entry.val); + if (page) { + page_cache_get(page); + INC_CACHE_INFO(find_success); + } + mapping_rdunlock(&swapper_space.page_lock); /* * Unsafe to assert PageSwapCache and mapping on page found: * if SMP nothing prevents swapoff from deleting this page from @@ -319,9 +308,7 @@ struct page * lookup_swap_cache(swp_entr * that, but no need to change: we _have_ got the right page. */ INC_CACHE_INFO(find_total); - if (found) - INC_CACHE_INFO(find_success); - return found; + return page; } /* @@ -330,7 +317,7 @@ struct page * lookup_swap_cache(swp_entr * A failure return means that either the page allocation failed or that * the swap entry is no longer in use. */ -struct page * read_swap_cache_async(swp_entry_t entry) +struct page *read_swap_cache_async(swp_entry_t entry) { struct page *found_page, *new_page = NULL; int err; @@ -342,7 +329,11 @@ struct page * read_swap_cache_async(swp_ * that would confuse statistics: use find_get_page() * directly. */ - found_page = find_get_page(&swapper_space, entry.val); + mapping_rdlock(&swapper_space.page_lock); + found_page = radix_tree_lookup(&swapper_space.page_tree, entry.val); + if (found_page) + page_cache_get(found_page); + mapping_rdunlock(&swapper_space.page_lock); if (found_page) break; diff -prauN linux-2.5.73/mm/swapfile.c wli-2.5.73-29/mm/swapfile.c --- linux-2.5.73/mm/swapfile.c 2003-06-22 11:32:36.000000000 -0700 +++ wli-2.5.73-29/mm/swapfile.c 2003-06-24 02:45:12.000000000 -0700 @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include @@ -242,16 +242,16 @@ static int exclusive_swap_page(struct pa struct swap_info_struct * p; swp_entry_t entry; - entry.val = page->index; + entry.val = page->private; p = swap_info_get(entry); if (p) { /* Is the only swap cache user the cache itself? */ if (p->swap_map[swp_offset(entry)] == 1) { /* Recheck the page count with the pagecache lock held.. 
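/*
 * Illustrative sketch (not part of the patch): the reworked swap
 * cache above inserts pages into swapper_space's radix tree keyed by
 * the swap entry value and remembers that value in page->private
 * rather than page->index.  A small array stands in for the radix
 * tree and its rwlock; collisions and refcounting are ignored in
 * this model.
 */
#include <stdio.h>
#include <string.h>

#define NSLOTS 64

struct page {
        unsigned long private;          /* swp_entry_t.val while in swap cache */
        int swapcache;
};

static struct page *swap_tree[NSLOTS];

static int add_to_swap_cache_model(struct page *page, unsigned long entry)
{
        if (swap_tree[entry % NSLOTS])
                return -1;              /* -EEXIST in the kernel */
        swap_tree[entry % NSLOTS] = page;
        page->private = entry;
        page->swapcache = 1;
        return 0;
}

static struct page *lookup_swap_cache_model(unsigned long entry)
{
        return swap_tree[entry % NSLOTS];
}

static void delete_from_swap_cache_model(struct page *page)
{
        swap_tree[page->private % NSLOTS] = NULL;
        page->swapcache = 0;
}

int main(void)
{
        struct page p;

        memset(&p, 0, sizeof(p));
        add_to_swap_cache_model(&p, 42);
        printf("found=%d\n", lookup_swap_cache_model(42) == &p);
        delete_from_swap_cache_model(&p);
        return 0;
}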
*/ - spin_lock(&swapper_space.page_lock); + mapping_rdlock(&swapper_space.page_lock); if (page_count(page) - !!PagePrivate(page) == 2) retval = 1; - spin_unlock(&swapper_space.page_lock); + mapping_rdunlock(&swapper_space.page_lock); } swap_info_put(p); } @@ -310,7 +310,7 @@ int remove_exclusive_swap_page(struct pa if (page_count(page) != 2) /* 2: us + cache */ return 0; - entry.val = page->index; + entry.val = page->private; p = swap_info_get(entry); if (!p) return 0; @@ -319,13 +319,13 @@ int remove_exclusive_swap_page(struct pa retval = 0; if (p->swap_map[swp_offset(entry)] == 1) { /* Recheck the page count with the pagecache lock held.. */ - spin_lock(&swapper_space.page_lock); + mapping_wrlock(&swapper_space.page_lock); if ((page_count(page) == 2) && !PageWriteback(page)) { __delete_from_swap_cache(page); SetPageDirty(page); retval = 1; } - spin_unlock(&swapper_space.page_lock); + mapping_wrunlock(&swapper_space.page_lock); } swap_info_put(p); @@ -348,8 +348,13 @@ void free_swap_and_cache(swp_entry_t ent p = swap_info_get(entry); if (p) { - if (swap_entry_free(p, swp_offset(entry)) == 1) - page = find_trylock_page(&swapper_space, entry.val); + if (swap_entry_free(p, swp_offset(entry)) == 1) { + mapping_rdlock(&swapper_space.page_lock); + page = radix_tree_lookup(&swapper_space.page_tree, entry.val); + if (page && TestSetPageLocked(page)) + page = NULL; + mapping_rdunlock(&swapper_space.page_lock); + } swap_info_put(p); } if (page) { @@ -378,21 +383,21 @@ void free_swap_and_cache(swp_entry_t ent * what to do if a write is requested later. */ /* vma->vm_mm->page_table_lock is held */ -static void +static inline void unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir, - swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) + swp_entry_t entry, struct page *page) { - vma->vm_mm->rss++; get_page(page); - set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); - *pte_chainp = page_add_rmap(page, dir, *pte_chainp); + vm_set_pte(vma, dir, pte_mkold(mk_pte(page, vma->vm_page_prot)), address); + vma->vm_mm->rss++; + page_add_rmap(page, vma, address, 1); swap_free(entry); } /* vma->vm_mm->page_table_lock is held */ static int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long size, unsigned long offset, - swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) + swp_entry_t entry, struct page *page) { pte_t * pte; unsigned long end; @@ -417,8 +422,7 @@ static int unuse_pmd(struct vm_area_stru * Test inline before going to call unuse_pte. 
*/ if (unlikely(pte_same(*pte, swp_pte))) { - unuse_pte(vma, offset + address, pte, - entry, page, pte_chainp); + unuse_pte(vma, offset + address, pte, entry, page); pte_unmap(pte); return 1; } @@ -432,7 +436,7 @@ static int unuse_pmd(struct vm_area_stru /* vma->vm_mm->page_table_lock is held */ static int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long size, - swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) + swp_entry_t entry, struct page *page) { pmd_t * pmd; unsigned long offset, end; @@ -444,7 +448,7 @@ static int unuse_pgd(struct vm_area_stru pgd_clear(dir); return 0; } - pmd = pmd_offset(dir, address); + pmd = pmd_offset_map(dir, address); offset = address & PGDIR_MASK; address &= ~PGDIR_MASK; end = address + size; @@ -453,26 +457,25 @@ static int unuse_pgd(struct vm_area_stru if (address >= end) BUG(); do { - if (unuse_pmd(vma, pmd, address, end - address, - offset, entry, page, pte_chainp)) + if (unuse_pmd(vma, pmd, address, end - address, offset, entry, page)) return 1; address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); + pmd_unmap(pmd - 1); return 0; } /* vma->vm_mm->page_table_lock is held */ static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, - swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) + swp_entry_t entry, struct page *page) { unsigned long start = vma->vm_start, end = vma->vm_end; if (start >= end) BUG(); do { - if (unuse_pgd(vma, pgdir, start, end - start, - entry, page, pte_chainp)) + if (unuse_pgd(vma, pgdir, start, end - start, entry, page)) return 1; start = (start + PGDIR_SIZE) & PGDIR_MASK; pgdir++; @@ -484,23 +487,20 @@ static int unuse_process(struct mm_struc swp_entry_t entry, struct page* page) { struct vm_area_struct* vma; - struct pte_chain *pte_chain; - - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - return -ENOMEM; /* * Go through process' page directory. 
*/ + if (!rmap_get_cpu()) + return -ENOMEM; spin_lock(&mm->page_table_lock); + put_cpu(); for (vma = mm->mmap; vma; vma = vma->vm_next) { pgd_t * pgd = pgd_offset(mm, vma->vm_start); - if (unuse_vma(vma, pgd, entry, page, &pte_chain)) + if (unuse_vma(vma, pgd, entry, page)) break; } spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); return 0; } @@ -648,8 +648,14 @@ static int try_to_unuse(unsigned int typ if (swcount > 1) { if (start_mm == &init_mm) shmem = shmem_unuse(entry, page); - else + else { retval = unuse_process(start_mm, entry, page); + if (retval) { + unlock_page(page); + page_cache_release(page); + break; + } + } } if (*swap_map > 1) { int set_start_mm = (*swap_map >= swcount); @@ -672,9 +678,7 @@ static int try_to_unuse(unsigned int typ cond_resched(); swcount = *swap_map; - if (swcount <= 1) - ; - else if (mm == &init_mm) { + if (mm == &init_mm) { set_start_mm = 1; shmem = shmem_unuse(entry, page); } else @@ -990,9 +994,10 @@ int page_queue_congested(struct page *pa BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ - bdi = page->mapping->backing_dev_info; - if (PageSwapCache(page)) { - swp_entry_t entry = { .val = page->index }; + if (!PageSwapCache(page)) + bdi = page_mapping(page)->backing_dev_info; + else { + swp_entry_t entry = { .val = page->private }; struct swap_info_struct *sis; sis = get_swap_info_struct(swp_type(entry)); diff -prauN linux-2.5.73/mm/truncate.c wli-2.5.73-29/mm/truncate.c --- linux-2.5.73/mm/truncate.c 2003-06-22 11:33:18.000000000 -0700 +++ wli-2.5.73-29/mm/truncate.c 2003-06-23 10:46:31.000000000 -0700 @@ -18,7 +18,7 @@ static int do_invalidatepage(struct page *page, unsigned long offset) { int (*invalidatepage)(struct page *, unsigned long); - invalidatepage = page->mapping->a_ops->invalidatepage; + invalidatepage = page_mapping(page)->a_ops->invalidatepage; if (invalidatepage == NULL) invalidatepage = block_invalidatepage; return (*invalidatepage)(page, offset); @@ -36,7 +36,7 @@ static inline void truncate_partial_page * becomes anonymous. It will be left on the LRU and may even be mapped into * user pagetables if we're racing with filemap_nopage(). * - * We need to bale out if page->mapping is no longer equal to the original + * We need to bale out if page_mapping(page) is no longer equal to the original * mapping. This happens a) when the VM reclaimed the page while we waited on * its lock, b) when a concurrent invalidate_inode_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. @@ -44,7 +44,7 @@ static inline void truncate_partial_page static void truncate_complete_page(struct address_space *mapping, struct page *page) { - if (page->mapping != mapping) + if (page_mapping(page) != mapping) return; if (PagePrivate(page)) @@ -54,32 +54,31 @@ truncate_complete_page(struct address_sp ClearPageUptodate(page); ClearPageMappedToDisk(page); remove_from_page_cache(page); - page_cache_release(page); /* pagecache ref */ } /* * This is for invalidate_inode_pages(). That function can be called at * any time, and is not supposed to throw away dirty pages. But pages can * be marked dirty at any time too. So we re-check the dirtiness inside - * ->page_lock. That provides exclusion against the __set_page_dirty + * ->page_lock. That provides exclusion against the set_page_dirty * functions. 
*/ static int invalidate_complete_page(struct address_space *mapping, struct page *page) { - if (page->mapping != mapping) + if (page_mapping(page) != mapping) return 0; if (PagePrivate(page) && !try_to_release_page(page, 0)) return 0; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); if (PageDirty(page)) { - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); return 0; } __remove_from_page_cache(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); ClearPageUptodate(page); page_cache_release(page); /* pagecache ref */ return 1; @@ -250,7 +249,7 @@ void invalidate_inode_pages2(struct addr struct page *page = pvec.pages[i]; lock_page(page); - if (page->mapping == mapping) { /* truncate race? */ + if (page_mapping(page) == mapping) { /* truncate race? */ wait_on_page_writeback(page); next = page->index + 1; if (page_mapped(page)) diff -prauN linux-2.5.73/mm/vmalloc.c wli-2.5.73-29/mm/vmalloc.c --- linux-2.5.73/mm/vmalloc.c 2003-06-22 11:32:56.000000000 -0700 +++ wli-2.5.73-29/mm/vmalloc.c 2003-06-23 10:31:02.000000000 -0700 @@ -70,7 +70,7 @@ static void unmap_area_pmd(pgd_t *dir, u return; } - pmd = pmd_offset(dir, address); + pmd = pmd_offset_kernel(dir, address); address &= ~PGDIR_MASK; end = address + size; if (end > PGDIR_SIZE) @@ -159,7 +159,7 @@ int map_vm_area(struct vm_struct *area, dir = pgd_offset_k(address); spin_lock(&init_mm.page_table_lock); do { - pmd_t *pmd = pmd_alloc(&init_mm, dir, address); + pmd_t *pmd = pmd_alloc_kernel(&init_mm, dir, address); if (!pmd) { err = -ENOMEM; break; diff -prauN linux-2.5.73/mm/vmscan.c wli-2.5.73-29/mm/vmscan.c --- linux-2.5.73/mm/vmscan.c 2003-06-22 11:32:33.000000000 -0700 +++ wli-2.5.73-29/mm/vmscan.c 2003-06-23 10:52:52.000000000 -0700 @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include @@ -172,23 +172,23 @@ static int shrink_slab(long scanned, uns return 0; } -/* Must be called with page's pte_chain_lock held. */ +/* Must be called with page's rmap_lock held. */ static inline int page_mapping_inuse(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping; /* Page is in somebody's page tables. */ if (page_mapped(page)) return 1; - /* XXX: does this happen ? */ - if (!mapping) - return 0; - /* Be more reluctant to reclaim swapcache than pagecache */ if (PageSwapCache(page)) return 1; + mapping = page_mapping(page); + if (!mapping) + return 0; + /* File is mmap'd by somebody. */ if (!list_empty(&mapping->i_mmap)) return 1; @@ -253,14 +253,14 @@ shrink_list(struct list_head *page_list, if (PageWriteback(page)) goto keep_locked; - pte_chain_lock(page); + rmap_lock(page); if (page_referenced(page) && page_mapping_inuse(page)) { /* In active use or really unfreeable. Activate it. */ - pte_chain_unlock(page); + rmap_unlock(page); goto activate_locked; } - mapping = page->mapping; + mapping = page_mapping(page); #ifdef CONFIG_SWAP /* @@ -269,12 +269,14 @@ shrink_list(struct list_head *page_list, * * XXX: implement swap clustering ? 
*/ - if (page_mapped(page) && !mapping && !PagePrivate(page)) { - pte_chain_unlock(page); + if (PageSwapCache(page)) + mapping = &swapper_space; + else if (PageAnon(page)) { + rmap_unlock(page); if (!add_to_swap(page)) goto activate_locked; - pte_chain_lock(page); - mapping = page->mapping; + rmap_lock(page); + mapping = &swapper_space; } #endif /* CONFIG_SWAP */ @@ -285,16 +287,16 @@ shrink_list(struct list_head *page_list, if (page_mapped(page) && mapping) { switch (try_to_unmap(page)) { case SWAP_FAIL: - pte_chain_unlock(page); + rmap_unlock(page); goto activate_locked; case SWAP_AGAIN: - pte_chain_unlock(page); + rmap_unlock(page); goto keep_locked; case SWAP_SUCCESS: ; /* try to free the page below */ } } - pte_chain_unlock(page); + rmap_unlock(page); /* * If the page is dirty, only perform writeback if that write @@ -324,7 +326,7 @@ shrink_list(struct list_head *page_list, goto keep_locked; if (!may_write_to_queue(mapping->backing_dev_info)) goto keep_locked; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); if (test_clear_page_dirty(page)) { int res; struct writeback_control wbc = { @@ -334,8 +336,9 @@ shrink_list(struct list_head *page_list, .for_reclaim = 1, }; - list_move(&page->list, &mapping->locked_pages); - spin_unlock(&mapping->page_lock); + if (!PageSwapCache(page)) + list_move(&page->list, &mapping->locked_pages); + mapping_wrunlock(&mapping->page_lock); SetPageReclaim(page); res = mapping->a_ops->writepage(page, &wbc); @@ -350,7 +353,7 @@ shrink_list(struct list_head *page_list, } goto keep; } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); } /* @@ -367,7 +370,7 @@ shrink_list(struct list_head *page_list, * try_to_release_page() will discover that cleanness and will * drop the buffers and mark the page clean - it can be freed. * - * Rarely, pages can have buffers and no ->mapping. These are + * Rarely, pages can have buffers and no page_mapping(). These are * the pages which were not successfully invalidated in * truncate_complete_page(). We try to drop those buffers here * and if that worked, and the page is no longer mapped into @@ -384,7 +387,7 @@ shrink_list(struct list_head *page_list, if (!mapping) goto keep_locked; /* truncate got there first */ - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); /* * The non-racy check for busy page. It is critical to check @@ -392,15 +395,15 @@ shrink_list(struct list_head *page_list, * not in use by anybody. 
(pagecache + us == 2) */ if (page_count(page) != 2 || PageDirty(page)) { - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); goto keep_locked; } #ifdef CONFIG_SWAP if (PageSwapCache(page)) { - swp_entry_t swap = { .val = page->index }; + swp_entry_t swap = { .val = page->private }; __delete_from_swap_cache(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); swap_free(swap); __put_page(page); /* The pagecache ref */ goto free_it; @@ -408,7 +411,7 @@ shrink_list(struct list_head *page_list, #endif /* CONFIG_SWAP */ __remove_from_page_cache(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); __put_page(page); free_it: @@ -628,13 +631,13 @@ refill_inactive_zone(struct zone *zone, page = list_entry(l_hold.prev, struct page, lru); list_del(&page->lru); if (page_mapped(page)) { - pte_chain_lock(page); + rmap_lock(page); if (page_mapped(page) && page_referenced(page)) { - pte_chain_unlock(page); + rmap_unlock(page); list_add(&page->lru, &l_active); continue; } - pte_chain_unlock(page); + rmap_unlock(page); if (!reclaim_mapped) { list_add(&page->lru, &l_active); continue; @@ -644,7 +647,7 @@ refill_inactive_zone(struct zone *zone, * FIXME: need to consider page_count(page) here if/when we * reap orphaned pages via the LRU (Daniel's locking stuff) */ - if (total_swap_pages == 0 && !page->mapping && + if (total_swap_pages == 0 && !page_mapping(page) && !PagePrivate(page)) { list_add(&page->lru, &l_active); continue;
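/*
 * Illustrative sketch (not part of the patch): the page_lock
 * conversions above (mapping_rdlock()/mapping_wrlock() in
 * swap_state.c, swapfile.c, truncate.c and vmscan.c) follow the usual
 * reader/writer rule: lookups take the lock shared, insertion and
 * removal take it exclusive.  A pthread_rwlock_t models the pattern
 * in userspace; compile with -pthread.
 */
#include <pthread.h>
#include <stddef.h>

static pthread_rwlock_t page_lock = PTHREAD_RWLOCK_INITIALIZER;
static void *tree_slot;                 /* stand-in for the radix tree */

static void *cache_lookup(void)
{
        void *page;

        pthread_rwlock_rdlock(&page_lock);      /* many readers may search */
        page = tree_slot;
        pthread_rwlock_unlock(&page_lock);
        return page;
}

static void cache_remove(void)
{
        pthread_rwlock_wrlock(&page_lock);      /* exclusive for updates */
        tree_slot = NULL;
        pthread_rwlock_unlock(&page_lock);
}

int main(void)
{
        cache_remove();
        return cache_lookup() == NULL ? 0 : 1;
}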