diff -prauN linux-2.5.72/arch/i386/Kconfig wli-2.5.72-numaq-15/arch/i386/Kconfig
--- linux-2.5.72/arch/i386/Kconfig	2003-06-16 21:19:41.000000000 -0700
+++ wli-2.5.72-numaq-15/arch/i386/Kconfig	2003-06-20 03:59:13.000000000 -0700
@@ -397,6 +397,11 @@ config X86_OOSTORE
 	depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6
 	default y
 
+config X86_CMOV
+	bool
+	depends on M686 || MPENTIUMII || MPENTIUMIII || MPENTIUM4 || MK8 || MCRUSOE
+	default y
+
 config HUGETLB_PAGE
 	bool "Huge TLB Page Support"
 	help
@@ -723,6 +728,25 @@ config HIGHPTE
 	  low memory.  Setting this option will put user-space page table
 	  entries in high memory.
 
+config HIGHPMD
+	bool "Allocate 2nd-level pagetables from highmem"
+	depends on HIGHMEM64G
+	help
+	  The VM uses one pmd entry for each pagetable page of physical
+	  memory allocated.  On systems with extreme amounts of highmem,
+	  the low memory these pagetable pages occupy cannot be tolerated.
+	  Setting this option will put userspace 2nd-level pagetables in highmem.
+
+config 4K_STACK
+	bool "Use smaller 4k per-task stacks"
+	help
+	  This option shrinks the kernel's per-task stack from 8k to 4k,
+	  which greatly increases the chance of overflowing it.  Using
+	  the per-cpu interrupt stacks as well brings that chance back
+	  down.  Also consider CONFIG_X86_STACK_CHECK, whose overflow
+	  detection is much more reliable than the existing in-kernel
+	  check.
+
 config MATH_EMULATION
 	bool "Math emulation"
 	---help---
@@ -1499,6 +1523,25 @@ config FRAME_POINTER
 	  If you don't debug the kernel, you can say N, but we may not be able
 	  to solve problems without frame pointers.
 
+config X86_STACK_CHECK
+	bool "Detect stack overflows"
+	depends on FRAME_POINTER
+	help
+	  Say Y here to have the kernel attempt to detect when the per-task
+	  kernel stack overflows.  This check is much more robust than the
+	  overflow check above, which only occasionally detects an overflow;
+	  the guarantee it provides is correspondingly stronger.
+
+	  Some older versions of gcc don't handle the -p option correctly.
+	  Kernprof is affected by the same problem, which is described here:
+	  http://oss.sgi.com/projects/kernprof/faq.html#Q9
+
+	  Basically, if you get oopses in __free_pages_ok during boot when
+	  you have this turned on, you need to fix gcc.  The Red Hat 2.96
+	  version and gcc-3.x seem to work.
+
+	  If not debugging a stack overflow problem, say N.
+
 config X86_EXTRA_IRQS
 	bool
 	depends on X86_LOCAL_APIC || X86_VOYAGER
diff -prauN linux-2.5.72/arch/i386/Makefile wli-2.5.72-numaq-15/arch/i386/Makefile
--- linux-2.5.72/arch/i386/Makefile	2003-06-16 21:19:58.000000000 -0700
+++ wli-2.5.72-numaq-15/arch/i386/Makefile	2003-06-20 03:46:46.000000000 -0700
@@ -85,6 +85,10 @@ mcore-$(CONFIG_X86_ES7000)	:= mach-es700
 # default subarch .h files
 mflags-y += -Iinclude/asm-i386/mach-default
 
+ifdef CONFIG_X86_STACK_CHECK
+CFLAGS += -p
+endif
+
 head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o
 
 libs-y 					+= arch/i386/lib/
diff -prauN linux-2.5.72/arch/i386/boot/compressed/misc.c wli-2.5.72-numaq-15/arch/i386/boot/compressed/misc.c
--- linux-2.5.72/arch/i386/boot/compressed/misc.c	2003-06-16 21:20:05.000000000 -0700
+++ wli-2.5.72-numaq-15/arch/i386/boot/compressed/misc.c	2003-06-20 03:46:46.000000000 -0700
@@ -379,3 +379,7 @@ asmlinkage int decompress_kernel(struct
 	if (high_loaded) close_output_buffer_if_we_run_high(mv);
 	return high_loaded;
 }
+
+/* We don't actually check for stack overflows this early. */
+__asm__(".globl mcount ; mcount: ret\n");
+
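(annotation, not part of the patch: the handlers below change from taking
struct pt_regs by value to taking and returning struct pt_regs *.  This is
what lets entry.S switch to a per-cpu interrupt stack around the call:
SWITCH_TO_IRQSTACK leaves the old stack's pt_regs pointer in %eax, and
RESTORE_FROM_IRQSTACK switches back to whatever stack pointer the handler
returns.  IRQHANDLER is defined elsewhere in the series, not in this diff;
given the %eax protocol it presumably forces regparm(1), roughly:

	/* assumed shape, for illustration only */
	#define IRQHANDLER(f)	f __attribute__((regparm(1)))

so the pt_regs pointer travels in %eax in both directions.)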
diff -prauN linux-2.5.72/arch/i386/kernel/apic.c wli-2.5.72-numaq-15/arch/i386/kernel/apic.c
--- linux-2.5.72/arch/i386/kernel/apic.c	2003-06-16 21:20:27.000000000 -0700
+++ wli-2.5.72-numaq-15/arch/i386/kernel/apic.c	2003-06-20 03:33:01.000000000 -0700
@@ -1037,7 +1037,8 @@ inline void smp_local_timer_interrupt(st
  * interrupt as well. Thus we cannot inline the local irq ... ]
  */
 
-void smp_apic_timer_interrupt(struct pt_regs regs)
+struct pt_regs * IRQHANDLER(smp_apic_timer_interrupt(struct pt_regs* regs));
+struct pt_regs * smp_apic_timer_interrupt(struct pt_regs* regs)
 {
 	int cpu = smp_processor_id();
 
@@ -1057,14 +1058,16 @@ void smp_apic_timer_interrupt(struct pt_
 	 * interrupt lock, which is the WrongThing (tm) to do.
 	 */
 	irq_enter();
-	smp_local_timer_interrupt(&regs);
+	smp_local_timer_interrupt(regs);
 	irq_exit();
+	return regs;
 }
 
 /*
  * This interrupt should _never_ happen with our APIC/SMP architecture
  */
-asmlinkage void smp_spurious_interrupt(void)
+struct pt_regs * IRQHANDLER(smp_spurious_interrupt(struct pt_regs* regs));
+struct pt_regs * smp_spurious_interrupt(struct pt_regs* regs)
 {
 	unsigned long v;
 
@@ -1082,13 +1085,15 @@ asmlinkage void smp_spurious_interrupt(v
 	printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never happen.\n",
 			smp_processor_id());
 	irq_exit();
+	return regs;
 }
 
 /*
  * This interrupt should never happen with our APIC/SMP architecture
  */
-asmlinkage void smp_error_interrupt(void)
+struct pt_regs * IRQHANDLER(smp_error_interrupt(struct pt_regs* regs));
+struct pt_regs * smp_error_interrupt(struct pt_regs* regs)
 {
 	unsigned long v, v1;
 
@@ -1113,6 +1118,7 @@ asmlinkage void smp_error_interrupt(void
 	printk (KERN_INFO "APIC error on CPU%d: %02lx(%02lx)\n",
 	        smp_processor_id(), v , v1);
 	irq_exit();
+	return regs;
 }
 
 /*
diff -prauN linux-2.5.72/arch/i386/kernel/cpu/mcheck/p4.c wli-2.5.72-numaq-15/arch/i386/kernel/cpu/mcheck/p4.c
--- linux-2.5.72/arch/i386/kernel/cpu/mcheck/p4.c	2003-06-16 21:20:06.000000000 -0700
+++ wli-2.5.72-numaq-15/arch/i386/kernel/cpu/mcheck/p4.c	2003-06-20 03:33:01.000000000 -0700
@@ -61,11 +61,13 @@ static void intel_thermal_interrupt(stru
 /* Thermal interrupt handler for this CPU setup */
 static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt;
 
-asmlinkage void smp_thermal_interrupt(struct pt_regs regs)
+struct pt_regs * IRQHANDLER(smp_thermal_interrupt(struct pt_regs* regs));
+struct pt_regs * smp_thermal_interrupt(struct pt_regs* regs)
 {
 	irq_enter();
-	vendor_thermal_interrupt(&regs);
+	vendor_thermal_interrupt(regs);
 	irq_exit();
+	return regs;
 }
 
 /* P4/Xeon Thermal regulation detect and init */
diff -prauN linux-2.5.72/arch/i386/kernel/entry.S wli-2.5.72-numaq-15/arch/i386/kernel/entry.S
--- linux-2.5.72/arch/i386/kernel/entry.S	2003-06-16 21:19:46.000000000 -0700
+++ wli-2.5.72-numaq-15/arch/i386/kernel/entry.S	2003-06-20 03:46:46.000000000 -0700
@@ -160,7 +160,7 @@ do_lcall:
 	movl %eax,EFLAGS(%ebp)	#
 	movl %edx,EIP(%ebp)	# Now we move them to their "normal" places
 	movl %ecx,CS(%ebp)	#
-	andl $-8192, %ebp	# GET_THREAD_INFO
+	GET_THREAD_INFO_WITH_ESP(%ebp)	# GET_THREAD_INFO
 	movl TI_EXEC_DOMAIN(%ebp), %edx	# Get the execution domain
 	call *4(%edx)		# Call the lcall7 handler for the domain
 	addl $4, %esp
@@ -394,17 +394,78 @@ ENTRY(irq_entries_start)
 	vector=vector+1
 .endr
 
+
+# let's play optimizing compiler...
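# (annotation, not part of the patch: CONFIG_X86_CMOV selects the
#  branchless form below -- cmovnz writes %esp only when the zero flag
#  is clear, i.e. only when a per-cpu interrupt stack actually exists.
#  In C terms: esp = irq_stack ? irq_stack_top : esp.  CPUs older than
#  the 686 lack cmov, hence the branch-based fallback.)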
+#ifdef CONFIG_X86_CMOV +#define COND_MOVE cmovnz %esi,%esp; +#else +#define COND_MOVE \ + jz 1f; \ + mov %esi,%esp; \ +1: +#endif + +# These macros will switch you to, and from a per-cpu interrupt stack +# They take the pt_regs arg and move it from the normal place on the +# stack to %eax. Any handler function can retrieve it using regparm(1). +# The handlers are expected to return the stack to switch back to in +# the same register. +# +# This means that the irq handlers need to return their arg +# +# SWITCH_TO_IRQSTACK clobbers %ebx, %ecx, %edx, %esi +# old stack gets put in %eax + +.macro SWITCH_TO_IRQSTACK + GET_THREAD_INFO(%ebx); + movl TI_IRQ_STACK(%ebx),%ecx; + movl TI_TASK(%ebx),%edx; + movl %esp,%eax; + + # %ecx+THREAD_SIZE is next stack -4 keeps us in the right one + leal (THREAD_SIZE-4)(%ecx),%esi; + + # is there a valid irq_stack? + testl %ecx,%ecx; + COND_MOVE; + + # update the task pointer in the irq stack + GET_THREAD_INFO(%esi); + movl %edx,TI_TASK(%esi); + + # update the preempt count in the irq stack + movl TI_PRE_COUNT(%ebx),%ecx; + movl %ecx,TI_PRE_COUNT(%esi); +.endm + +# copy flags from the irq stack back into the task's thread_info +# %esi is saved over the irq handler call and contains the irq stack's +# thread_info pointer +# %eax was returned from the handler, as described above +# %ebx contains the original thread_info pointer + +.macro RESTORE_FROM_IRQSTACK + movl %eax,%esp; + movl TI_FLAGS(%esi),%eax; + movl $0,TI_FLAGS(%esi); + LOCK orl %eax,TI_FLAGS(%ebx); +.endm + ALIGN common_interrupt: SAVE_ALL + SWITCH_TO_IRQSTACK call do_IRQ + RESTORE_FROM_IRQSTACK jmp ret_from_intr #define BUILD_INTERRUPT(name, nr) \ ENTRY(name) \ pushl $nr-256; \ SAVE_ALL \ - call smp_/**/name; \ + SWITCH_TO_IRQSTACK; \ + call smp_/**/name; \ + RESTORE_FROM_IRQSTACK; \ jmp ret_from_intr; /* The include is where all of the SMP etc. interrupts come from */ @@ -604,6 +665,61 @@ ENTRY(spurious_interrupt_bug) pushl $do_spurious_interrupt_bug jmp error_code + +#ifdef CONFIG_X86_STACK_CHECK +.data + .globl stack_overflowed +stack_overflowed: + .long 0 +.text + +ENTRY(mcount) + push %eax + movl $(THREAD_SIZE - 1),%eax + andl %esp,%eax + cmpl $STACK_WARN,%eax /* more than half the stack is used*/ + jle 1f +2: + popl %eax + ret +1: + lock; btsl $0,stack_overflowed + jc 2b + + # switch to overflow stack + movl %esp,%eax + movl $(stack_overflow_stack + THREAD_SIZE - 4),%esp + + pushf + cli + pushl %eax + + # push eip then esp of error for stack_overflow_panic + pushl 4(%eax) + pushl %eax + + # update the task pointer and cpu in the overflow stack's thread_info. 
+ GET_THREAD_INFO_WITH_ESP(%eax) + movl TI_TASK(%eax),%ebx + movl %ebx,stack_overflow_stack+TI_TASK + movl TI_CPU(%eax),%ebx + movl %ebx,stack_overflow_stack+TI_CPU + + call stack_overflow + + # pop off call arguments + addl $8,%esp + + popl %eax + popf + movl %eax,%esp + popl %eax + movl $0,stack_overflowed + ret + +#warning stack check enabled +#endif + .data ENTRY(sys_call_table) .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ diff -prauN linux-2.5.72/arch/i386/kernel/head.S wli-2.5.72-numaq-15/arch/i386/kernel/head.S --- linux-2.5.72/arch/i386/kernel/head.S 2003-06-16 21:19:40.000000000 -0700 +++ wli-2.5.72-numaq-15/arch/i386/kernel/head.S 2003-06-20 03:19:38.000000000 -0700 @@ -16,6 +16,7 @@ #include #include #include +#include #define OLD_CL_MAGIC_ADDR 0x90020 #define OLD_CL_MAGIC 0xA33F @@ -325,7 +326,7 @@ rp_sidt: ret ENTRY(stack_start) - .long init_thread_union+8192 + .long init_thread_union+THREAD_SIZE .long __BOOT_DS /* This is the default interrupt "handler" :-) */ diff -prauN linux-2.5.72/arch/i386/kernel/i386_ksyms.c wli-2.5.72-numaq-15/arch/i386/kernel/i386_ksyms.c --- linux-2.5.72/arch/i386/kernel/i386_ksyms.c 2003-06-16 21:20:27.000000000 -0700 +++ wli-2.5.72-numaq-15/arch/i386/kernel/i386_ksyms.c 2003-06-20 03:46:46.000000000 -0700 @@ -208,3 +208,8 @@ EXPORT_SYMBOL(kmap_atomic_to_page); EXPORT_SYMBOL(edd); EXPORT_SYMBOL(eddnr); #endif + +#ifdef CONFIG_X86_STACK_CHECK +extern void mcount(void); +EXPORT_SYMBOL(mcount); +#endif diff -prauN linux-2.5.72/arch/i386/kernel/init_task.c wli-2.5.72-numaq-15/arch/i386/kernel/init_task.c --- linux-2.5.72/arch/i386/kernel/init_task.c 2003-06-16 21:20:26.000000000 -0700 +++ wli-2.5.72-numaq-15/arch/i386/kernel/init_task.c 2003-06-20 03:46:46.000000000 -0700 @@ -14,6 +14,14 @@ static struct signal_struct init_signals static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +union thread_union init_irq_union + __attribute__((__section__(".data.init_task"))); + +#ifdef CONFIG_X86_STACK_CHECK +union thread_union stack_overflow_stack + __attribute__((__section__(".data.init_task"))); +#endif + /* * Initial thread structure. * diff -prauN linux-2.5.72/arch/i386/kernel/irq.c wli-2.5.72-numaq-15/arch/i386/kernel/irq.c --- linux-2.5.72/arch/i386/kernel/irq.c 2003-06-16 21:19:40.000000000 -0700 +++ wli-2.5.72-numaq-15/arch/i386/kernel/irq.c 2003-06-20 04:43:19.000000000 -0700 @@ -403,7 +403,8 @@ void enable_irq(unsigned int irq) * SMP cross-CPU interrupts have their own specific * handlers). */ -asmlinkage unsigned int do_IRQ(struct pt_regs regs) +struct pt_regs * IRQHANDLER(do_IRQ(struct pt_regs *regs)); +struct pt_regs * do_IRQ(struct pt_regs *regs) { /* * We ack quickly, we don't want the irq controller @@ -415,7 +416,7 @@ asmlinkage unsigned int do_IRQ(struct pt * 0 return value means that this irq is already being * handled by some other CPU. 
	 * (or is disabled)
	 */
-	int irq = regs.orig_eax & 0xff;	/* high bits used in ret_from_ code */
+	int irq = regs->orig_eax & 0xff;	/* high bits used in ret_from_ code */
 	int cpu = smp_processor_id();
 	irq_desc_t *desc = irq_desc + irq;
 	struct irqaction * action;
@@ -429,7 +430,7 @@ asmlinkage unsigned int do_IRQ(struct pt
 		long esp;
 
 		__asm__ __volatile__("andl %%esp,%0" :
-					"=r" (esp) : "0" (8191));
+					"=r" (esp) : "0" (THREAD_SIZE - 1));
 		if (unlikely(esp < (sizeof(struct thread_info) + 1024))) {
 			printk("do_IRQ: stack overflow: %ld\n",
 				esp - sizeof(struct thread_info));
@@ -482,7 +483,7 @@ asmlinkage unsigned int do_IRQ(struct pt
 			irqreturn_t action_ret;
 
 			spin_unlock(&desc->lock);
-			action_ret = handle_IRQ_event(irq, &regs, action);
+			action_ret = handle_IRQ_event(irq, regs, action);
 			spin_lock(&desc->lock);
 			if (!noirqdebug)
 				note_interrupt(irq, desc, action_ret);
@@ -502,7 +503,7 @@ out:
 
 	irq_exit();
 
-	return 1;
+	return regs;
 }
 
 /**
diff -prauN linux-2.5.72/arch/i386/kernel/process.c wli-2.5.72-numaq-15/arch/i386/kernel/process.c
--- linux-2.5.72/arch/i386/kernel/process.c	2003-06-16 21:19:37.000000000 -0700
+++ wli-2.5.72-numaq-15/arch/i386/kernel/process.c	2003-06-20 03:46:46.000000000 -0700
@@ -160,7 +160,23 @@ static int __init idle_setup (char *str)
 
 __setup("idle=", idle_setup);
 
-void show_regs(struct pt_regs * regs)
+void stack_overflow(unsigned long esp, unsigned long eip)
+{
+	int panicking = ((esp & (THREAD_SIZE-1)) <= STACK_PANIC);
+
+	if (panicking)
+		print_symbol("stack overflow from %s\n", eip);
+	else
+		print_symbol("excessive stack use from %s\n", eip);
+	printk("esp: %p (masked: 0x%lx, STACK_PANIC: 0x%x)\n",
+		(void *)esp, esp & (THREAD_SIZE-1), STACK_PANIC);
+	show_trace((void *)esp);
+
+	if (panicking)
+		panic("stack overflow\n");
+}
+
+asmlinkage void show_regs(struct pt_regs * regs)
 {
 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
 
@@ -449,6 +465,7 @@ struct task_struct * __switch_to(struct
 
 	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
 
+	next_p->thread_info->irq_stack = prev_p->thread_info->irq_stack;
 	unlazy_fpu(prev_p);
 
 	/*
diff -prauN linux-2.5.72/arch/i386/kernel/smp.c wli-2.5.72-numaq-15/arch/i386/kernel/smp.c
--- linux-2.5.72/arch/i386/kernel/smp.c	2003-06-16 21:19:40.000000000 -0700
+++ wli-2.5.72-numaq-15/arch/i386/kernel/smp.c	2003-06-20 03:34:53.000000000 -0700
@@ -305,7 +305,8 @@ static inline void leave_mm (unsigned lo
  * 2) Leave the mm if we are in the lazy tlb mode.
  */
 
-asmlinkage void smp_invalidate_interrupt (void)
+struct pt_regs * IRQHANDLER(smp_invalidate_interrupt(struct pt_regs *regs));
+struct pt_regs * smp_invalidate_interrupt(struct pt_regs *regs)
 {
 	unsigned long cpu;
 
@@ -336,6 +337,7 @@ asmlinkage void smp_invalidate_interrupt
 
 out:
 	put_cpu_no_resched();
+	return regs;
 }
 
 static void flush_tlb_others (unsigned long cpumask, struct mm_struct *mm,
@@ -559,12 +561,15 @@ void smp_send_stop(void)
  * all the work is done automatically when
  * we return from the interrupt.
  */
-asmlinkage void smp_reschedule_interrupt(void)
+struct pt_regs *IRQHANDLER(smp_reschedule_interrupt(struct pt_regs *));
+struct pt_regs *smp_reschedule_interrupt(struct pt_regs *regs)
 {
 	ack_APIC_irq();
+	return regs;
 }
 
-asmlinkage void smp_call_function_interrupt(void)
+struct pt_regs *IRQHANDLER(smp_call_function_interrupt(struct pt_regs *));
+struct pt_regs *smp_call_function_interrupt(struct pt_regs *regs)
 {
 	void (*func) (void *info) = call_data->func;
 	void *info = call_data->info;
@@ -588,5 +593,6 @@ asmlinkage void smp_call_function_interr
 		mb();
 		atomic_inc(&call_data->finished);
 	}
+	return regs;
 }
 
diff -prauN linux-2.5.72/arch/i386/kernel/smpboot.c wli-2.5.72-numaq-15/arch/i386/kernel/smpboot.c
--- linux-2.5.72/arch/i386/kernel/smpboot.c	2003-06-16 21:20:01.000000000 -0700
+++ wli-2.5.72-numaq-15/arch/i386/kernel/smpboot.c	2003-06-20 03:33:01.000000000 -0700
@@ -71,6 +71,11 @@ static unsigned long smp_commenced_mask;
 /* Per CPU bogomips and other parameters */
 struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
 
+/* Per CPU interrupt stacks */
+extern union thread_union init_irq_union;
+union thread_union *irq_stacks[NR_CPUS] __cacheline_aligned =
+	{ &init_irq_union, };
+
 /* Set when the idlers are all forked */
 int smp_threads_ready;
 
@@ -770,6 +775,28 @@ wakeup_secondary_cpu(int phys_apicid, un
 }
 #endif	/* WAKE_SECONDARY_VIA_INIT */
 
+static void __init setup_irq_stack(struct task_struct *p, int cpu)
+{
+	unsigned long stk;
+
+	stk = __get_free_pages(GFP_KERNEL, THREAD_ORDER);
+	if (!stk)
+		panic("I can't seem to allocate my irq stack.  Oh well, giving up.");
+
+	irq_stacks[cpu] = (void *)stk;
+	memset(irq_stacks[cpu], 0, THREAD_SIZE);
+	irq_stacks[cpu]->thread_info.cpu = cpu;
+	irq_stacks[cpu]->thread_info.preempt_count = 1;
+					/* interrupts are not preemptible */
+	p->thread_info->irq_stack = &irq_stacks[cpu]->thread_info;
+
+	/* If we want to make the irq stack more than one unit
+	 * deep, we can chain them off of the irq_stack pointer
+	 * here.
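	 *
	 * (annotation, not part of the patch: each irq stack's
	 *  thread_info already carries an irq_stack field, so chaining
	 *  would amount to something like
	 *	irq_stacks[cpu]->thread_info.irq_stack = next_stack;
	 *  with the entry code following one link per nesting level.
	 *  Hypothetical -- the patch implements a single level.)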
+ */ +} + + extern unsigned long cpu_initialized; static int __init do_boot_cpu(int apicid) @@ -793,6 +820,7 @@ static int __init do_boot_cpu(int apicid idle = fork_by_hand(); if (IS_ERR(idle)) panic("failed fork for CPU %d", cpu); + setup_irq_stack(idle, cpu); wake_up_forked_process(idle); /* diff -prauN linux-2.5.72/arch/i386/kernel/vm86.c wli-2.5.72-numaq-15/arch/i386/kernel/vm86.c --- linux-2.5.72/arch/i386/kernel/vm86.c 2003-06-16 21:19:40.000000000 -0700 +++ wli-2.5.72-numaq-15/arch/i386/kernel/vm86.c 2003-06-18 19:17:06.000000000 -0700 @@ -127,16 +127,17 @@ struct pt_regs * save_v86_state(struct k return ret; } -static void mark_screen_rdonly(struct task_struct * tsk) +static void mark_screen_rdonly(task_t *task) { + struct mm_struct *mm = task->mm; pgd_t *pgd; pmd_t *pmd; pte_t *pte, *mapped; int i; preempt_disable(); - spin_lock(&tsk->mm->page_table_lock); - pgd = pgd_offset(tsk->mm, 0xA0000); + spin_lock(&mm->page_table_lock); + pgd = pgd_offset(mm, 0xA0000); if (pgd_none(*pgd)) goto out; if (pgd_bad(*pgd)) { @@ -144,23 +145,26 @@ static void mark_screen_rdonly(struct ta pgd_clear(pgd); goto out; } - pmd = pmd_offset(pgd, 0xA0000); - if (pmd_none(*pmd)) + pmd = pmd_offset_map(pgd, 0xA0000); + if (pmd_none(*pmd)) { + pmd_unmap(pmd); goto out; - if (pmd_bad(*pmd)) { + } else if (pmd_bad(*pmd)) { pmd_ERROR(*pmd); pmd_clear(pmd); + pmd_unmap(pmd); goto out; } pte = mapped = pte_offset_map(pmd, 0xA0000); for (i = 0; i < 32; i++) { if (pte_present(*pte)) - set_pte(pte, pte_wrprotect(*pte)); + vm_ptep_set_wrprotect(mm, pte); pte++; } pte_unmap(mapped); + pmd_unmap(pmd); out: - spin_unlock(&tsk->mm->page_table_lock); + spin_unlock(&mm->page_table_lock); preempt_enable(); flush_tlb(); } diff -prauN linux-2.5.72/arch/i386/mm/fault.c wli-2.5.72-numaq-15/arch/i386/mm/fault.c --- linux-2.5.72/arch/i386/mm/fault.c 2003-06-16 21:19:37.000000000 -0700 +++ wli-2.5.72-numaq-15/arch/i386/mm/fault.c 2003-06-18 19:11:28.000000000 -0700 @@ -330,8 +330,8 @@ vmalloc_fault: * and redundant with the set_pmd() on non-PAE. */ - pmd = pmd_offset(pgd, address); - pmd_k = pmd_offset(pgd_k, address); + pmd = pmd_offset_kernel(pgd, address); + pmd_k = pmd_offset_kernel(pgd_k, address); if (!pmd_present(*pmd_k)) goto no_context; set_pmd(pmd, *pmd_k); diff -prauN linux-2.5.72/arch/i386/mm/highmem.c wli-2.5.72-numaq-15/arch/i386/mm/highmem.c --- linux-2.5.72/arch/i386/mm/highmem.c 2003-06-16 21:20:02.000000000 -0700 +++ wli-2.5.72-numaq-15/arch/i386/mm/highmem.c 2003-06-19 10:47:20.000000000 -0700 @@ -1,22 +1,5 @@ #include -void *kmap(struct page *page) -{ - might_sleep(); - if (page < highmem_start_page) - return page_address(page); - return kmap_high(page); -} - -void kunmap(struct page *page) -{ - if (in_interrupt()) - BUG(); - if (page < highmem_start_page) - return; - kunmap_high(page); -} - /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -25,40 +8,38 @@ void kunmap(struct page *page) * However when holding an atomic kmap is is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. 
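 *
 * (annotation, not part of the patch: the rewrite below also makes
 *  re-mapping cheaper -- __kmap_atomic() now compares the slot's old
 *  pte against the new one and skips set_pte()/__flush_tlb_one() when
 *  the same page is being mapped again, so a loop such as
 *
 *	for (i = 0; i < n; i++) {
 *		p = kmap_atomic(page, KM_USER0);
 *		...
 *		kunmap_atomic(p, KM_USER0);
 *	}
 *
 *  over a single page no longer pays for a TLB flush per iteration,
 *  except under CONFIG_DEBUG_HIGHMEM, where kunmap_atomic() really
 *  clears the pte.)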
*/ -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page, enum km_type type) { enum fixed_addresses idx; unsigned long vaddr; - - inc_preempt_count(); - if (page < highmem_start_page) - return page_address(page); + pte_t old_pte, pte = mk_pte(page, kmap_prot); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + old_pte = *(kmap_pte - idx); + #ifdef CONFIG_DEBUG_HIGHMEM - if (!pte_none(*(kmap_pte-idx))) - BUG(); + BUG_ON(!pte_none(old_pte)); #endif - set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); - __flush_tlb_one(vaddr); - return (void*) vaddr; + if (!pte_same(old_pte, pte)) { + set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); + if (!pte_none(old_pte)) + __flush_tlb_one(vaddr); + } + return (void *)vaddr; } -void kunmap_atomic(void *kvaddr, enum km_type type) -{ #ifdef CONFIG_DEBUG_HIGHMEM +void __kunmap_atomic(void *kvaddr, enum km_type type) +{ unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); - if (vaddr < FIXADDR_START) { // FIXME - dec_preempt_count(); + if (vaddr < FIXADDR_START) // FIXME return; - } - if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)) - BUG(); + BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)); /* * force other mappings to Oops if they'll try to access @@ -66,21 +47,15 @@ void kunmap_atomic(void *kvaddr, enum km */ pte_clear(kmap_pte-idx); __flush_tlb_one(vaddr); -#endif - - dec_preempt_count(); } +#endif -struct page *kmap_atomic_to_page(void *ptr) +struct page *__kmap_atomic_to_page(void *ptr) { unsigned long idx, vaddr = (unsigned long)ptr; pte_t *pte; - if (vaddr < FIXADDR_START) - return virt_to_page(ptr); - idx = virt_to_fix(vaddr); pte = kmap_pte - (idx - FIX_KMAP_BEGIN); return pte_page(*pte); } - diff -prauN linux-2.5.72/arch/i386/mm/hugetlbpage.c wli-2.5.72-numaq-15/arch/i386/mm/hugetlbpage.c --- linux-2.5.72/arch/i386/mm/hugetlbpage.c 2003-06-16 21:20:22.000000000 -0700 +++ wli-2.5.72-numaq-15/arch/i386/mm/hugetlbpage.c 2003-06-18 19:57:58.000000000 -0700 @@ -57,8 +57,8 @@ static pte_t *huge_pte_alloc(struct mm_s pmd_t *pmd = NULL; pgd = pgd_offset(mm, addr); - pmd = pmd_alloc(mm, pgd, addr); - return (pte_t *) pmd; + pmd = pmd_alloc_map(mm, pgd, addr); + return (pte_t *)pmd; } static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) @@ -67,11 +67,13 @@ static pte_t *huge_pte_offset(struct mm_ pmd_t *pmd = NULL; pgd = pgd_offset(mm, addr); - pmd = pmd_offset(pgd, addr); - return (pte_t *) pmd; + pmd = pmd_offset_map(pgd, addr); + return (pte_t *)pmd; } -static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page, pte_t * page_table, int write_access) +static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, + struct page *page, pte_t * page_table, + unsigned long addr, int write_access) { pte_t entry; @@ -84,6 +86,7 @@ static void set_huge_pte(struct mm_struc entry = pte_mkyoung(entry); mk_pte_huge(entry); set_pte(page_table, entry); + vm_account_huge_inc(vma, *page_table, addr); } /* @@ -115,6 +118,8 @@ int copy_hugetlb_page_range(struct mm_st ptepage = pte_page(entry); get_page(ptepage); set_pte(dst_pte, entry); + pmd_unmap(dst_pte); + pmd_unmap_nested(src_pte); dst->rss += (HPAGE_SIZE / PAGE_SIZE); addr += HPAGE_SIZE; } @@ -152,6 +157,7 @@ follow_hugetlb_page(struct mm_struct *mm get_page(page); pages[i] = page; + pmd_unmap(pte); } if (vmas) @@ -241,6 +247,7 @@ follow_huge_pmd(struct mm_struct *mm, un page += ((address & ~HPAGE_MASK) >> 
PAGE_SHIFT); get_page(page); } + pmd_unmap(pmd); return page; } #endif @@ -284,6 +291,8 @@ void unmap_hugepage_range(struct vm_area page = pte_page(*pte); huge_page_release(page); pte_clear(pte); + vm_account_huge_dec(vma, *pte, address); + pmd_unmap(pte); } mm->rss -= (end - start) >> PAGE_SHIFT; flush_tlb_range(vma, start, end); @@ -328,16 +337,19 @@ int hugetlb_prefault(struct address_spac page = alloc_hugetlb_page(); if (!page) { ret = -ENOMEM; + pmd_unmap(pte); goto out; } ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); unlock_page(page); if (ret) { free_huge_page(page); + pmd_unmap(pte); goto out; } } - set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE); + set_huge_pte(mm, vma, page, pte, addr, vma->vm_flags & VM_WRITE); + pmd_unmap(pte); } out: spin_unlock(&mm->page_table_lock); diff -prauN linux-2.5.72/arch/i386/mm/init.c wli-2.5.72-numaq-15/arch/i386/mm/init.c --- linux-2.5.72/arch/i386/mm/init.c 2003-06-16 21:20:19.000000000 -0700 +++ wli-2.5.72-numaq-15/arch/i386/mm/init.c 2003-06-18 19:15:51.000000000 -0700 @@ -58,10 +58,10 @@ static pmd_t * __init one_md_table_init( #ifdef CONFIG_X86_PAE pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - if (pmd_table != pmd_offset(pgd, 0)) + if (pmd_table != pmd_offset_kernel(pgd, 0)) BUG(); #else - pmd_table = pmd_offset(pgd, 0); + pmd_table = pmd_offset_kernel(pgd, 0); #endif return pmd_table; @@ -112,7 +112,7 @@ static void __init page_table_range_init if (pgd_none(*pgd)) one_md_table_init(pgd); - pmd = pmd_offset(pgd, vaddr); + pmd = pmd_offset_kernel(pgd, vaddr); for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { if (pmd_none(*pmd)) one_page_table_init(pmd); @@ -193,7 +193,7 @@ pte_t *kmap_pte; pgprot_t kmap_prot; #define kmap_get_fixmap_pte(vaddr) \ - pte_offset_kernel(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)) + pte_offset_kernel(pmd_offset_kernel(pgd_offset_k(vaddr), (vaddr)), (vaddr)) void __init kmap_init(void) { @@ -217,7 +217,7 @@ void __init permanent_kmaps_init(pgd_t * page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); pgd = swapper_pg_dir + pgd_index(vaddr); - pmd = pmd_offset(pgd, vaddr); + pmd = pmd_offset_kernel(pgd, vaddr); pte = pte_offset_kernel(pmd, vaddr); pkmap_page_table = pte; } @@ -462,7 +462,7 @@ void __init mem_init(void) /* this will put all low memory onto the freelists */ totalram_pages += __free_all_bootmem(); - + tlb_init(); reservedpages = 0; for (tmp = 0; tmp < max_low_pfn; tmp++) /* @@ -505,20 +505,19 @@ void __init mem_init(void) #endif } -#ifdef CONFIG_X86_PAE -struct kmem_cache_s *pae_pgd_cachep; +kmem_cache_t *pgd_cache; void __init pgtable_cache_init(void) { - /* - * PAE pgds must be 16-byte aligned: - */ - pae_pgd_cachep = kmem_cache_create("pae_pgd", 32, 0, - SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, NULL, NULL); - if (!pae_pgd_cachep) - panic("init_pae(): Cannot alloc pae_pgd SLAB cache"); + pgd_cache = kmem_cache_create("pgd", + PTRS_PER_PGD*sizeof(pgd_t), + 0, + SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, + pgd_ctor, + PTRS_PER_PMD == 1 ? 
pgd_dtor : NULL); + if (!pgd_cache) + panic("pagetable_cache_init(): Cannot create pgd cache"); } -#endif /* * This function cannot be __init, since exceptions don't work in that diff -prauN linux-2.5.72/arch/i386/mm/ioremap.c wli-2.5.72-numaq-15/arch/i386/mm/ioremap.c --- linux-2.5.72/arch/i386/mm/ioremap.c 2003-06-16 21:19:46.000000000 -0700 +++ wli-2.5.72-numaq-15/arch/i386/mm/ioremap.c 2003-06-18 19:11:28.000000000 -0700 @@ -82,7 +82,7 @@ static int remap_area_pages(unsigned lon spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_kernel(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.5.72/arch/i386/mm/pageattr.c wli-2.5.72-numaq-15/arch/i386/mm/pageattr.c --- linux-2.5.72/arch/i386/mm/pageattr.c 2003-06-16 21:20:00.000000000 -0700 +++ wli-2.5.72-numaq-15/arch/i386/mm/pageattr.c 2003-06-19 19:30:29.000000000 -0700 @@ -19,7 +19,7 @@ static inline pte_t *lookup_address(unsi pmd_t *pmd; if (pgd_none(*pgd)) return NULL; - pmd = pmd_offset(pgd, address); + pmd = pmd_offset_kernel(pgd, address); if (pmd_none(*pmd)) return NULL; if (pmd_large(*pmd)) @@ -58,19 +58,22 @@ static void flush_kernel_map(void *dummy static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) { + struct page *page; + unsigned long flags; + set_pte_atomic(kpte, pte); /* change init_mm */ -#ifndef CONFIG_X86_PAE - { - struct list_head *l; - spin_lock(&mmlist_lock); - list_for_each(l, &init_mm.mmlist) { - struct mm_struct *mm = list_entry(l, struct mm_struct, mmlist); - pmd_t *pmd = pmd_offset(pgd_offset(mm, address), address); - set_pte_atomic((pte_t *)pmd, pte); - } - spin_unlock(&mmlist_lock); + if (PTRS_PER_PMD > 1) + return; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pmd_t *pmd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + pmd = pmd_offset_kernel(pgd, address); + set_pte_atomic((pte_t *)pmd, pte); } -#endif + spin_unlock_irqrestore(&pgd_lock, flags); } /* @@ -80,7 +83,7 @@ static void set_pmd_pte(pte_t *kpte, uns static inline void revert_page(struct page *kpte_page, unsigned long address) { pte_t *linear = (pte_t *) - pmd_offset(pgd_offset(&init_mm, address), address); + pmd_offset_kernel(pgd_offset_k(address), address); set_pmd_pte(linear, address, pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); diff -prauN linux-2.5.72/arch/i386/mm/pgtable.c wli-2.5.72-numaq-15/arch/i386/mm/pgtable.c --- linux-2.5.72/arch/i386/mm/pgtable.c 2003-06-16 21:20:29.000000000 -0700 +++ wli-2.5.72-numaq-15/arch/i386/mm/pgtable.c 2003-06-18 19:15:51.000000000 -0700 @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -69,7 +70,7 @@ static void set_pte_pfn(unsigned long va BUG(); return; } - pmd = pmd_offset(pgd, vaddr); + pmd = pmd_offset_kernel(pgd, vaddr); if (pmd_none(*pmd)) { BUG(); return; @@ -109,7 +110,7 @@ void set_pmd_pfn(unsigned long vaddr, un printk ("set_pmd_pfn: pgd_none\n"); return; /* BUG(); */ } - pmd = pmd_offset(pgd, vaddr); + pmd = pmd_offset_kernel(pgd, vaddr); set_pmd(pmd, pfn_pmd(pfn, flags)); /* * It's enough to flush this one mapping. 
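(annotation, not part of the patch: the hunk below replaces the plain
alloc_pages() pagetable allocation with per-cpu caches of preconstructed
pagetable pages, kept on the mmu_gather lists initialized in tlb_init()
and refilled when pagetables are freed -- see the pte_free() comment
further down.  The core idea, reduced to a sketch with illustrative
names:

	/* per-cpu cache of already-zeroed pagetable pages */
	static struct page *pt_cache_get(struct list_head *ready, int *count)
	{
		struct page *page = NULL;

		if (*count) {
			page = list_entry(ready->next, struct page, list);
			list_del(&page->list);
			(*count)--;
		}
		return page;	/* caller falls back to a fresh allocation */
	}

pte_alloc_ready() below additionally walks the zone lists from high to
low, so GFP_PTE/GFP_PMD allocations prefer highmem pages when the gfp
flags allow them.)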
@@ -137,75 +138,142 @@ pte_t *pte_alloc_one_kernel(struct mm_st return pte; } -struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) +void tlb_init(void) { - struct page *pte; + int cpu; + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + int zone; + struct mmu_gather *tlb = &per_cpu(mmu_gathers, cpu); + for (zone = 0; zone < MAX_ZONE_ID; ++zone) { + INIT_LIST_HEAD(&tlb->active_list[zone]); + INIT_LIST_HEAD(&tlb->ready_list[zone]); + } + } +} -#ifdef CONFIG_HIGHPTE - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT, 0); -#else - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0); -#endif - if (pte) - clear_highpage(pte); - return pte; +static inline struct page *pte_alloc_fresh(int gfp_mask) +{ + struct page *page = alloc_page(gfp_mask); + if (page) { + clear_highpage(page); + if (TestSetPagePTE(page)) + BUG(); + } + return page; } -#ifdef CONFIG_X86_PAE +static inline int zone_high(struct zone *zone) +{ + if (!zone) + return 1; + else + return zone - zone->zone_pgdat->node_zones >= ZONE_HIGHMEM; +} -pgd_t *pgd_alloc(struct mm_struct *mm) +static inline struct page *pte_alloc_ready(int gfp_flags) { - int i; - pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL); + struct mmu_gather *tlb = &per_cpu(mmu_gathers, get_cpu()); + struct page *page = NULL; - if (pgd) { - for (i = 0; i < USER_PTRS_PER_PGD; i++) { - unsigned long pmd = __get_free_page(GFP_KERNEL); - if (!pmd) - goto out_oom; - clear_page(pmd); - set_pgd(pgd + i, __pgd(1 + __pa(pmd))); + if (tlb->nr_pte_ready) { + int z; + for (z = MAX_ZONE_ID - 1; z >= 0; --z) { + struct zone *zone = zone_table[z]; + if (!(gfp_flags & __GFP_HIGHMEM) && zone_high(zone)) + continue; + if (!list_empty(&tlb->ready_list[z])) + break; } - memcpy(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + page = list_entry(tlb->ready_list[z].next, struct page, list); + if (TestSetPagePTE(page)) + BUG(); + list_del(&page->list); + tlb->ready_count[z]--; + tlb->nr_pte_ready--; } - return pgd; -out_oom: - for (i--; i >= 0; i--) - free_page((unsigned long)__va(pgd_val(pgd[i])-1)); - kmem_cache_free(pae_pgd_cachep, pgd); - return NULL; + put_cpu(); + return page; } -void pgd_free(pgd_t *pgd) +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { - int i; + struct page *page = pte_alloc_ready(GFP_PTE); + return page ? page : pte_alloc_fresh(GFP_PTE); +} - for (i = 0; i < USER_PTRS_PER_PGD; i++) - free_page((unsigned long)__va(pgd_val(pgd[i])-1)); - kmem_cache_free(pae_pgd_cachep, pgd); +static inline struct page *__pmd_alloc_one(void) +{ + struct page *page = pte_alloc_ready(GFP_PMD); + return page ? 
page : pte_alloc_fresh(GFP_PMD); } -#else +LIST_HEAD(pgd_list); +spinlock_t pgd_lock = SPIN_LOCK_UNLOCKED; -pgd_t *pgd_alloc(struct mm_struct *mm) +void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) { - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); + unsigned long flags; + + if (PTRS_PER_PMD == 1) + spin_lock_irqsave(&pgd_lock, flags); - if (pgd) { - memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); - memcpy(pgd + USER_PTRS_PER_PGD, + memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + (PTRS_PER_PGD - USER_PTRS_PER_PGD)*sizeof(pgd_t)); + + if (PTRS_PER_PMD > 1) + return; + + list_add(&virt_to_page(pgd)->lru, &pgd_list); + spin_unlock_irqrestore(&pgd_lock, flags); + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); +} + +void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) +{ + unsigned long flags; + + spin_lock_irqsave(&pgd_lock, flags); + list_del(&virt_to_page(pgd)->lru); + spin_unlock_irqrestore(&pgd_lock, flags); +} + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + int i; + pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); + + if (PTRS_PER_PMD == 1 || !pgd) + return pgd; + + for (i = 0; i < USER_PTRS_PER_PGD; i++) { + struct page *pmd = __pmd_alloc_one(); + if (!pmd) + goto out_oom; + set_pgd(&pgd[i], __pgd(1ULL | (u64)page_to_pfn(pmd) << PAGE_SHIFT)); } + return pgd; + + /* + * This looks unusual. pte_free() is actually a convenient wrapper + * for queueing up preconstructed pmd and/or pte pages. The cases + * fall through to just queueing them in the per-cpu lists. + */ +out_oom: + for (i--; i >= 0; i--) + pte_free(pgd_page(pgd[i])); + kmem_cache_free(pgd_cache, pgd); + return NULL; } + void pgd_free(pgd_t *pgd) { - free_page((unsigned long)pgd); + if (PTRS_PER_PMD > 1) { + int i; + for (i = 0; i < USER_PTRS_PER_PGD; i++) + pte_free(pgd_page(pgd[i])); + } + kmem_cache_free(pgd_cache, pgd); } - -#endif /* CONFIG_X86_PAE */ - diff -prauN linux-2.5.72/arch/ia64/ia32/binfmt_elf32.c wli-2.5.72-numaq-15/arch/ia64/ia32/binfmt_elf32.c --- linux-2.5.72/arch/ia64/ia32/binfmt_elf32.c 2003-06-16 21:20:23.000000000 -0700 +++ wli-2.5.72-numaq-15/arch/ia64/ia32/binfmt_elf32.c 2003-06-18 19:17:06.000000000 -0700 @@ -200,7 +200,8 @@ ia32_setup_arg_pages (struct linux_binpr struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current, page, stack_base, PAGE_COPY); + put_dirty_page(current, mpnt, page, + stack_base, PAGE_COPY); } stack_base += PAGE_SIZE; } diff -prauN linux-2.5.72/arch/s390/kernel/compat_exec.c wli-2.5.72-numaq-15/arch/s390/kernel/compat_exec.c --- linux-2.5.72/arch/s390/kernel/compat_exec.c 2003-06-16 21:20:22.000000000 -0700 +++ wli-2.5.72-numaq-15/arch/s390/kernel/compat_exec.c 2003-06-18 19:17:06.000000000 -0700 @@ -80,7 +80,8 @@ int setup_arg_pages32(struct linux_binpr struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current,page,stack_base,PAGE_COPY); + put_dirty_page(current, mpnt, page, + stack_base, PAGE_COPY); } stack_base += PAGE_SIZE; } diff -prauN linux-2.5.72/arch/sparc/mm/srmmu.c wli-2.5.72-numaq-15/arch/sparc/mm/srmmu.c --- linux-2.5.72/arch/sparc/mm/srmmu.c 2003-06-16 21:20:05.000000000 -0700 +++ wli-2.5.72-numaq-15/arch/sparc/mm/srmmu.c 2003-06-18 19:11:28.000000000 -0700 @@ -2180,7 +2180,7 @@ void __init ld_mmu_srmmu(void) BTFIXUPSET_CALL(pte_pfn, srmmu_pte_pfn, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(pmd_page, srmmu_pmd_page, BTFIXUPCALL_NORM); - BTFIXUPSET_CALL(pgd_page, 
srmmu_pgd_page, BTFIXUPCALL_NORM); + BTFIXUPSET_CALL(__pgd_page, srmmu_pgd_page, BTFIXUPCALL_NORM); BTFIXUPSET_SETHI(none_mask, 0xF0000000); diff -prauN linux-2.5.72/arch/sparc/mm/sun4c.c wli-2.5.72-numaq-15/arch/sparc/mm/sun4c.c --- linux-2.5.72/arch/sparc/mm/sun4c.c 2003-06-16 21:20:19.000000000 -0700 +++ wli-2.5.72-numaq-15/arch/sparc/mm/sun4c.c 2003-06-18 19:11:28.000000000 -0700 @@ -2252,5 +2252,5 @@ void __init ld_mmu_sun4c(void) /* These should _never_ get called with two level tables. */ BTFIXUPSET_CALL(pgd_set, sun4c_pgd_set, BTFIXUPCALL_NOP); - BTFIXUPSET_CALL(pgd_page, sun4c_pgd_page, BTFIXUPCALL_RETO0); + BTFIXUPSET_CALL(__pgd_page, sun4c_pgd_page, BTFIXUPCALL_RETO0); } diff -prauN linux-2.5.72/arch/x86_64/ia32/ia32_binfmt.c wli-2.5.72-numaq-15/arch/x86_64/ia32/ia32_binfmt.c --- linux-2.5.72/arch/x86_64/ia32/ia32_binfmt.c 2003-06-16 21:19:39.000000000 -0700 +++ wli-2.5.72-numaq-15/arch/x86_64/ia32/ia32_binfmt.c 2003-06-18 19:17:06.000000000 -0700 @@ -363,7 +363,8 @@ int setup_arg_pages(struct linux_binprm struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current,page,stack_base,PAGE_COPY_EXEC); + put_dirty_page(current, mpnt, page, + stack_base, PAGE_COPY_EXEC); } stack_base += PAGE_SIZE; } diff -prauN linux-2.5.72/drivers/char/drm/drm_memory.h wli-2.5.72-numaq-15/drivers/char/drm/drm_memory.h --- linux-2.5.72/drivers/char/drm/drm_memory.h 2003-06-16 21:19:42.000000000 -0700 +++ wli-2.5.72-numaq-15/drivers/char/drm/drm_memory.h 2003-06-18 19:11:28.000000000 -0700 @@ -123,7 +123,7 @@ static inline unsigned long drm_follow_page (void *vaddr) { pgd_t *pgd = pgd_offset_k((unsigned long) vaddr); - pmd_t *pmd = pmd_offset(pgd, (unsigned long) vaddr); + pmd_t *pmd = pmd_offset_kernel(pgd, (unsigned long)vaddr); pte_t *ptep = pte_offset_kernel(pmd, (unsigned long) vaddr); return pte_pfn(*ptep) << PAGE_SHIFT; } diff -prauN linux-2.5.72/fs/buffer.c wli-2.5.72-numaq-15/fs/buffer.c --- linux-2.5.72/fs/buffer.c 2003-06-16 21:20:05.000000000 -0700 +++ wli-2.5.72-numaq-15/fs/buffer.c 2003-06-18 21:44:46.000000000 -0700 @@ -839,14 +839,14 @@ int __set_page_dirty_buffers(struct page spin_unlock(&mapping->private_lock); if (!TestSetPageDirty(page)) { - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); if (page->mapping) { /* Race with truncate? */ if (!mapping->backing_dev_info->memory_backed) inc_page_state(nr_dirty); list_del(&page->list); list_add(&page->list, &mapping->dirty_pages); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } diff -prauN linux-2.5.72/fs/exec.c wli-2.5.72-numaq-15/fs/exec.c --- linux-2.5.72/fs/exec.c 2003-06-16 21:19:59.000000000 -0700 +++ wli-2.5.72-numaq-15/fs/exec.c 2003-06-20 05:04:06.000000000 -0700 @@ -285,29 +285,30 @@ int copy_strings_kernel(int argc,char ** * This routine is used to map in a page into an address space: needed by * execve() for the initial stack and environment pages. * - * tsk->mmap_sem is held for writing. + * The caller should hold task->mm->mmap_sem for writing. 
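 *
 * (annotation, not part of the patch: note the pagetable-walk
 *  convention used below.  pmd_alloc_map() returns a mapped pmd that
 *  must be released with pmd_unmap(), and pte_alloc_map() now takes
 *  &pmd, presumably so it can unmap and re-map the pmd around a
 *  blocking pte-page allocation:
 *
 *	pmd = pmd_alloc_map(mm, pgd, address);
 *	pte = pte_alloc_map(mm, &pmd, address);
 *	...
 *	pte_unmap(pte);
 *	pmd_unmap(pmd);
 *
 *  This pairing is what CONFIG_HIGHPMD relies on.)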
*/ -void put_dirty_page(struct task_struct *tsk, struct page *page, - unsigned long address, pgprot_t prot) +void put_dirty_page(task_t *task, struct vm_area_struct *vma, + struct page *page, unsigned long address, pgprot_t prot) { - pgd_t * pgd; - pmd_t * pmd; - pte_t * pte; + struct mm_struct *mm = task->mm; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; struct pte_chain *pte_chain; if (page_count(page) != 1) printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", page, address); - pgd = pgd_offset(tsk->mm, address); + pgd = pgd_offset(mm, address); pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) goto out_sig; - spin_lock(&tsk->mm->page_table_lock); - pmd = pmd_alloc(tsk->mm, pgd, address); + spin_lock(&mm->page_table_lock); + pmd = pmd_alloc_map(mm, pgd, address); if (!pmd) goto out; - pte = pte_alloc_map(tsk->mm, pmd, address); + pte = pte_alloc_map(mm, &pmd, address); if (!pte) goto out; if (!pte_none(*pte)) { @@ -316,20 +317,24 @@ void put_dirty_page(struct task_struct * } lru_cache_add_active(page); flush_dcache_page(page); - set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot)))); + SetPageAnon(page); + vm_set_pte(vma, pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot))), address); pte_chain = page_add_rmap(page, pte, pte_chain); pte_unmap(pte); - tsk->mm->rss++; - spin_unlock(&tsk->mm->page_table_lock); + pmd_unmap(pmd); + mm->rss++; + spin_unlock(&mm->page_table_lock); /* no need for flush_tlb */ pte_chain_free(pte_chain); return; out: - spin_unlock(&tsk->mm->page_table_lock); + if (pmd) + pmd_unmap(pmd); + spin_unlock(&mm->page_table_lock); out_sig: __free_page(page); - force_sig(SIGKILL, tsk); + force_sig(SIGKILL, task); pte_chain_free(pte_chain); return; } @@ -423,7 +428,7 @@ int setup_arg_pages(struct linux_binprm struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current, page, stack_base, + put_dirty_page(current, mpnt, page, stack_base, mpnt->vm_page_prot); } stack_base += PAGE_SIZE; diff -prauN linux-2.5.72/fs/fs-writeback.c wli-2.5.72-numaq-15/fs/fs-writeback.c --- linux-2.5.72/fs/fs-writeback.c 2003-06-16 21:20:28.000000000 -0700 +++ wli-2.5.72-numaq-15/fs/fs-writeback.c 2003-06-18 21:44:37.000000000 -0700 @@ -150,10 +150,10 @@ __sync_single_inode(struct inode *inode, * read speculatively by this cpu before &= ~I_DIRTY -- mikulas */ - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); if (wait || !wbc->for_kupdate || list_empty(&mapping->io_pages)) list_splice_init(&mapping->dirty_pages, &mapping->io_pages); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); spin_unlock(&inode_lock); do_writepages(mapping, wbc); diff -prauN linux-2.5.72/fs/inode.c wli-2.5.72-numaq-15/fs/inode.c --- linux-2.5.72/fs/inode.c 2003-06-16 21:20:27.000000000 -0700 +++ wli-2.5.72-numaq-15/fs/inode.c 2003-06-18 21:26:53.000000000 -0700 @@ -182,7 +182,7 @@ void inode_init_once(struct inode *inode INIT_LIST_HEAD(&inode->i_devices); sema_init(&inode->i_sem, 1); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); - spin_lock_init(&inode->i_data.page_lock); + mapping_rwlock_init(&inode->i_data.page_lock); init_MUTEX(&inode->i_data.i_shared_sem); INIT_LIST_HEAD(&inode->i_data.private_list); spin_lock_init(&inode->i_data.private_lock); diff -prauN linux-2.5.72/fs/mpage.c wli-2.5.72-numaq-15/fs/mpage.c --- linux-2.5.72/fs/mpage.c 2003-06-16 21:19:44.000000000 -0700 +++ wli-2.5.72-numaq-15/fs/mpage.c 2003-06-18 21:43:58.000000000 -0700 @@ -627,7 +627,7 @@ mpage_writepages(struct address_space *m writepage = 
mapping->a_ops->writepage;
 
 	pagevec_init(&pvec, 0);
-	spin_lock(&mapping->page_lock);
+	mapping_wrlock(&mapping->page_lock);
 	while (!list_empty(&mapping->io_pages) && !done) {
 		struct page *page = list_entry(mapping->io_pages.prev,
 					struct page, list);
@@ -647,7 +647,7 @@ mpage_writepages(struct address_space *m
 		list_add(&page->list, &mapping->locked_pages);
 
 		page_cache_get(page);
-		spin_unlock(&mapping->page_lock);
+		mapping_wrunlock(&mapping->page_lock);
 
 		/*
 		 * At this point we hold neither mapping->page_lock nor
@@ -679,12 +679,12 @@ mpage_writepages(struct address_space *m
 			unlock_page(page);
 		}
 		page_cache_release(page);
-		spin_lock(&mapping->page_lock);
+		mapping_wrlock(&mapping->page_lock);
 	}
 	/*
 	 * Leave any remaining dirty pages on ->io_pages
 	 */
-	spin_unlock(&mapping->page_lock);
+	mapping_wrunlock(&mapping->page_lock);
 	if (bio)
 		mpage_bio_submit(WRITE, bio);
 	return ret;
diff -prauN linux-2.5.72/fs/namei.c wli-2.5.72-numaq-15/fs/namei.c
--- linux-2.5.72/fs/namei.c	2003-06-16 21:19:57.000000000 -0700
+++ wli-2.5.72-numaq-15/fs/namei.c	2003-06-20 01:23:55.000000000 -0700
@@ -434,19 +434,17 @@ int follow_up(struct vfsmount **mnt, str
 	return 1;
 }
 
+/* no need for dcache_lock, as serialization is taken care of in
+ * namespace.c
+ */
 static int follow_mount(struct vfsmount **mnt, struct dentry **dentry)
 {
 	int res = 0;
 	while (d_mountpoint(*dentry)) {
-		struct vfsmount *mounted;
-		spin_lock(&dcache_lock);
-		mounted = lookup_mnt(*mnt, *dentry);
-		if (!mounted) {
-			spin_unlock(&dcache_lock);
+		struct vfsmount *mounted = lookup_mnt(*mnt, *dentry);
+		if (!mounted)
 			break;
-		}
-		*mnt = mntget(mounted);
-		spin_unlock(&dcache_lock);
+		*mnt = mounted;
 		dput(*dentry);
 		mntput(mounted->mnt_parent);
 		*dentry = dget(mounted->mnt_root);
@@ -455,21 +453,21 @@ static int follow_mount(struct vfsmount
 	return res;
 }
 
+/* no need for dcache_lock, as serialization is taken care of in
+ * namespace.c
+ */
 static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry)
 {
 	struct vfsmount *mounted;
-
-	spin_lock(&dcache_lock);
+
 	mounted = lookup_mnt(*mnt, *dentry);
 	if (mounted) {
-		*mnt = mntget(mounted);
-		spin_unlock(&dcache_lock);
+		*mnt = mounted;
 		dput(*dentry);
 		mntput(mounted->mnt_parent);
 		*dentry = dget(mounted->mnt_root);
 		return 1;
 	}
-	spin_unlock(&dcache_lock);
 	return 0;
 }
 
diff -prauN linux-2.5.72/fs/namespace.c wli-2.5.72-numaq-15/fs/namespace.c
--- linux-2.5.72/fs/namespace.c	2003-06-16 21:20:06.000000000 -0700
+++ wli-2.5.72-numaq-15/fs/namespace.c	2003-06-20 02:40:38.000000000 -0700
@@ -26,6 +26,8 @@ extern int __init init_rootfs(void);
 
 extern int __init sysfs_init(void);
 
+/* spinlock for vfsmount related operations, in place of dcache_lock */
+spinlock_t vfsmount_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
 static struct list_head *mount_hashtable;
 static int hash_mask, hash_bits;
 static kmem_cache_t *mnt_cache;
@@ -66,30 +68,43 @@ void free_vfsmnt(struct vfsmount *mnt)
 	kmem_cache_free(mnt_cache, mnt);
 }
 
+/*
+ * lookup_mnt now increments the ref count before returning
+ * the vfsmount struct.
+ *
+ * lookup_mnt can be done without taking any lock, as we now
+ * do synchronize_kernel() while removing the vfsmount struct
+ * from the mnt_hash list.  rcu_read_(un)lock is required for
+ * preemptible kernels.
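+ *
+ * (annotation, not part of the patch: this is the classic RCU
+ *  lookup pattern -- the reference is taken while still inside the
+ *  read-side critical section:
+ *
+ *	rcu_read_lock();
+ *	list_for_each_entry(p, head, mnt_hash)
+ *		if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
+ *			found = mntget(p);
+ *			break;
+ *		}
+ *	rcu_read_unlock();
+ *
+ *  The update side pairs list_del_rcu() with synchronize_kernel()
+ *  in detach_mnt() before the entry can be reused.  The code below
+ *  open-codes the list walk; list_for_each_entry is just shorthand
+ *  here.)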
+ */ struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) { struct list_head * head = mount_hashtable + hash(mnt, dentry); struct list_head * tmp = head; - struct vfsmount *p; + struct vfsmount *p, *found = NULL; + rcu_read_lock(); for (;;) { tmp = tmp->next; p = NULL; if (tmp == head) break; p = list_entry(tmp, struct vfsmount, mnt_hash); - if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) + if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) { + found = mntget(p); break; + } } - return p; + rcu_read_unlock(); + return found; } static int check_mnt(struct vfsmount *mnt) { - spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); while (mnt->mnt_parent != mnt) mnt = mnt->mnt_parent; - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); return mnt == current->namespace->root; } @@ -97,10 +112,19 @@ static void detach_mnt(struct vfsmount * { old_nd->dentry = mnt->mnt_mountpoint; old_nd->mnt = mnt->mnt_parent; + + /* remove from the hash_list, before other things */ + list_del_rcu(&mnt->mnt_hash); + spin_unlock(&vfsmount_lock); + + /* There could be existing users doing lookup_mnt, let + * them finish their work. + */ + synchronize_kernel(); + spin_lock(&vfsmount_lock); mnt->mnt_parent = mnt; mnt->mnt_mountpoint = mnt->mnt_root; list_del_init(&mnt->mnt_child); - list_del_init(&mnt->mnt_hash); old_nd->dentry->d_mounted--; } @@ -108,7 +132,7 @@ static void attach_mnt(struct vfsmount * { mnt->mnt_parent = mntget(nd->mnt); mnt->mnt_mountpoint = dget(nd->dentry); - list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry)); + list_add_rcu(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry)); list_add_tail(&mnt->mnt_child, &nd->mnt->mnt_mounts); nd->dentry->d_mounted++; } @@ -263,15 +287,15 @@ void umount_tree(struct vfsmount *mnt) mnt = list_entry(kill.next, struct vfsmount, mnt_list); list_del_init(&mnt->mnt_list); if (mnt->mnt_parent == mnt) { - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); } else { struct nameidata old_nd; detach_mnt(mnt, &old_nd); - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); path_release(&old_nd); } mntput(mnt); - spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); } } @@ -324,17 +348,17 @@ static int do_umount(struct vfsmount *mn } down_write(¤t->namespace->sem); - spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); if (atomic_read(&sb->s_active) == 1) { /* last instance - try to be smart */ - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); lock_kernel(); DQUOT_OFF(sb); acct_auto_close(sb); unlock_kernel(); security_sb_umount_close(mnt); - spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); } retval = -EBUSY; if (atomic_read(&mnt->mnt_count) == 2 || flags & MNT_DETACH) { @@ -342,7 +366,7 @@ static int do_umount(struct vfsmount *mn umount_tree(mnt); retval = 0; } - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); if (retval) security_sb_umount_busy(mnt); up_write(¤t->namespace->sem); @@ -449,18 +473,18 @@ static struct vfsmount *copy_tree(struct q = clone_mnt(p, p->mnt_root); if (!q) goto Enomem; - spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); list_add_tail(&q->mnt_list, &res->mnt_list); attach_mnt(q, &nd); - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); } } return res; Enomem: if (res) { - spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); umount_tree(res); - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); } return NULL; } @@ -485,7 +509,7 @@ static int graft_tree(struct vfsmount *m goto out_unlock; err = -ENOENT; - spin_lock(&dcache_lock); + 
spin_lock(&vfsmount_lock); if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry)) { struct list_head head; @@ -495,7 +519,7 @@ static int graft_tree(struct vfsmount *m mntget(mnt); err = 0; } - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); out_unlock: up(&nd->dentry->d_inode->i_sem); if (!err) @@ -532,9 +556,9 @@ static int do_loopback(struct nameidata if (mnt) { err = graft_tree(mnt, nd); if (err) { - spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); umount_tree(mnt); - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); } else mntput(mnt); } @@ -599,7 +623,7 @@ static int do_move_mount(struct nameidat if (IS_DEADDIR(nd->dentry->d_inode)) goto out1; - spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); if (!IS_ROOT(nd->dentry) && d_unhashed(nd->dentry)) goto out2; @@ -623,7 +647,7 @@ static int do_move_mount(struct nameidat detach_mnt(old_nd.mnt, &parent_nd); attach_mnt(old_nd.mnt, nd); out2: - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); out1: up(&nd->dentry->d_inode->i_sem); out: @@ -804,9 +828,9 @@ int copy_namespace(int flags, struct tas down_write(&tsk->namespace->sem); /* First pass: copy the tree topology */ new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root); - spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); list_add_tail(&new_ns->list, &new_ns->root->mnt_list); - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); /* Second pass: switch the tsk->fs->* elements */ if (fs) { @@ -1027,7 +1051,7 @@ asmlinkage long sys_pivot_root(const cha if (new_nd.mnt->mnt_root != new_nd.dentry) goto out2; /* not a mountpoint */ tmp = old_nd.mnt; /* make sure we can reach put_old from new_root */ - spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); if (tmp != new_nd.mnt) { for (;;) { if (tmp->mnt_parent == tmp) @@ -1044,7 +1068,7 @@ asmlinkage long sys_pivot_root(const cha detach_mnt(user_nd.mnt, &root_parent); attach_mnt(user_nd.mnt, &old_nd); attach_mnt(new_nd.mnt, &root_parent); - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); chroot_fs_refs(&user_nd, &new_nd); security_sb_post_pivotroot(&user_nd, &new_nd); error = 0; @@ -1061,7 +1085,7 @@ out0: unlock_kernel(); return error; out3: - spin_unlock(&dcache_lock); + spin_unlock(&vfsmount_lock); goto out2; } diff -prauN linux-2.5.72/fs/proc/array.c wli-2.5.72-numaq-15/fs/proc/array.c --- linux-2.5.72/fs/proc/array.c 2003-06-16 21:20:05.000000000 -0700 +++ wli-2.5.72-numaq-15/fs/proc/array.c 2003-06-19 21:32:10.000000000 -0700 @@ -283,7 +283,7 @@ int proc_pid_status(struct task_struct * return buffer - orig; } -extern unsigned long task_vsize(struct mm_struct *); +unsigned long task_vsize(struct mm_struct *); int proc_pid_stat(struct task_struct *task, char * buffer) { unsigned long vsize, eip, esp, wchan; @@ -307,11 +307,9 @@ int proc_pid_stat(struct task_struct *ta } task_unlock(task); if (mm) { - down_read(&mm->mmap_sem); vsize = task_vsize(mm); eip = KSTK_EIP(task); esp = KSTK_ESP(task); - up_read(&mm->mmap_sem); } wchan = get_wchan(task); @@ -388,20 +386,23 @@ int proc_pid_stat(struct task_struct *ta return res; } -extern int task_statm(struct mm_struct *, int *, int *, int *, int *); +int task_statm(struct mm_struct *, int *, int *, int *, int *, int *, int *); int proc_pid_statm(struct task_struct *task, char *buffer) { - int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0; + int size, resident, shared, text, lib, data, dirty; struct mm_struct *mm = get_task_mm(task); - if (mm) { + if (!mm) + size = resident = shared = text = lib = data = dirty = 0; + 
else { down_read(&mm->mmap_sem); - size = task_statm(mm, &shared, &text, &data, &resident); + size = task_statm(mm, &shared, &text, &lib, &data, + &resident, &dirty); up_read(&mm->mmap_sem); mmput(mm); } return sprintf(buffer,"%d %d %d %d %d %d %d\n", - size, resident, shared, text, lib, data, 0); + size, resident, shared, text, lib, data, dirty); } diff -prauN linux-2.5.72/fs/proc/base.c wli-2.5.72-numaq-15/fs/proc/base.c --- linux-2.5.72/fs/proc/base.c 2003-06-16 21:20:06.000000000 -0700 +++ wli-2.5.72-numaq-15/fs/proc/base.c 2003-06-20 02:34:56.000000000 -0700 @@ -298,7 +298,7 @@ static int proc_check_root(struct inode { struct dentry *de, *base, *root; struct vfsmount *our_vfsmnt, *vfsmnt, *mnt; - int res = 0; + int subdir, res = 0; if (proc_root_link(inode, &root, &vfsmnt)) /* Ewww... */ return -ENOENT; @@ -307,20 +307,23 @@ static int proc_check_root(struct inode base = dget(current->fs->root); read_unlock(¤t->fs->lock); - spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); de = root; mnt = vfsmnt; while (vfsmnt != our_vfsmnt) { if (vfsmnt == vfsmnt->mnt_parent) - goto out; + goto out_unlock; de = vfsmnt->mnt_mountpoint; vfsmnt = vfsmnt->mnt_parent; } + spin_unlock(&vfsmount_lock); - if (!is_subdir(de, base)) + rcu_read_lock(); /* dcache_lock */ + subdir = is_subdir(de, base); + rcu_read_unlock(); /* dcache_lock */ + if (!subdir) goto out; - spin_unlock(&dcache_lock); exit: dput(base); @@ -328,8 +331,9 @@ exit: dput(root); mntput(mnt); return res; +out_unlock: + spin_unlock(&vfsmount_lock); out: - spin_unlock(&dcache_lock); res = -EACCES; goto exit; } @@ -1385,62 +1389,37 @@ out: } #define PROC_NUMBUF 10 -#define PROC_MAXPIDS 20 - -/* - * Get a few pid's to return for filldir - we need to hold the - * tasklist lock while doing this, and we must release it before - * we actually do the filldir itself, so we use a temp buffer.. 
- */ -static int get_pid_list(int index, unsigned int *pids) -{ - struct task_struct *p; - int nr_pids = 0; - - index--; - read_lock(&tasklist_lock); - for_each_process(p) { - int pid = p->pid; - if (!pid_alive(p)) - continue; - if (--index >= 0) - continue; - pids[nr_pids] = pid; - nr_pids++; - if (nr_pids >= PROC_MAXPIDS) - break; - } - read_unlock(&tasklist_lock); - return nr_pids; -} int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int pid_array[PROC_MAXPIDS]; char buf[PROC_NUMBUF]; unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; - unsigned int nr_pids, i; + int pid; if (!nr) { ino_t ino = fake_ino(0,PROC_PID_INO); if (filldir(dirent, "self", 4, filp->f_pos, ino, DT_LNK) < 0) return 0; filp->f_pos++; - nr++; + nr = 1; } + pid = nr - 1; + for (;;) { + unsigned long i, j; + ino_t ino; - nr_pids = get_pid_list(nr, pid_array); - - for (i = 0; i < nr_pids; i++) { - int pid = pid_array[i]; - ino_t ino = fake_ino(pid,PROC_PID_INO); - unsigned long j = PROC_NUMBUF; + pid = find_next_pid(pid); + if (pid < 0) + break; - do buf[--j] = '0' + (pid % 10); while (pid/=10); + i = pid; + j = PROC_NUMBUF; + do buf[--j] = '0' + (i % 10); while (i/=10); + ino = fake_ino(pid, PROC_PID_INO); if (filldir(dirent, buf+j, PROC_NUMBUF-j, filp->f_pos, ino, DT_DIR) < 0) break; - filp->f_pos++; + filp->f_pos = pid + 1 + FIRST_PROCESS_ENTRY; } return 0; } diff -prauN linux-2.5.72/fs/proc/proc_misc.c wli-2.5.72-numaq-15/fs/proc/proc_misc.c --- linux-2.5.72/fs/proc/proc_misc.c 2003-06-16 21:19:41.000000000 -0700 +++ wli-2.5.72-numaq-15/fs/proc/proc_misc.c 2003-06-18 19:14:41.000000000 -0700 @@ -200,6 +200,7 @@ static int meminfo_read_proc(char *page, "SwapFree: %8lu kB\n" "Dirty: %8lu kB\n" "Writeback: %8lu kB\n" + "Deferred: %8lu kB\n" "Mapped: %8lu kB\n" "Slab: %8lu kB\n" "Committed_AS: %8u kB\n" @@ -222,6 +223,7 @@ static int meminfo_read_proc(char *page, K(i.freeswap), K(ps.nr_dirty), K(ps.nr_writeback), + K(nr_deferred_pages()), K(ps.nr_mapped), K(ps.nr_slab), K(committed), @@ -497,11 +499,10 @@ static int ds1286_read_proc(char *page, static int locks_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - int len; - lock_kernel(); - len = get_locks_status(page, start, off, count); - unlock_kernel(); - if (len < count) *eof = 1; + int len = get_locks_status(page, start, off, count); + + if (len < count) + *eof = 1; return len; } diff -prauN linux-2.5.72/fs/proc/root.c wli-2.5.72-numaq-15/fs/proc/root.c --- linux-2.5.72/fs/proc/root.c 2003-06-16 21:20:20.000000000 -0700 +++ wli-2.5.72-numaq-15/fs/proc/root.c 2003-06-18 19:14:41.000000000 -0700 @@ -81,11 +81,13 @@ void __init proc_root_init(void) static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry) { - if (dir->i_ino == PROC_ROOT_INO) { /* check for safety... */ - lock_kernel(); + /* + * nr_threads is actually protected by the tasklist_lock; + * however, it's conventional to do reads, especially for + * reporting, without any locking whatsoever. + */ + if (dir->i_ino == PROC_ROOT_INO) /* check for safety... 
*/ dir->i_nlink = proc_root.nlink + nr_threads; - unlock_kernel(); - } if (!proc_lookup(dir, dentry)) { return NULL; diff -prauN linux-2.5.72/fs/proc/task_mmu.c wli-2.5.72-numaq-15/fs/proc/task_mmu.c --- linux-2.5.72/fs/proc/task_mmu.c 2003-06-16 21:19:41.000000000 -0700 +++ wli-2.5.72-numaq-15/fs/proc/task_mmu.c 2003-06-19 23:41:48.000000000 -0700 @@ -5,27 +5,6 @@ char *task_mem(struct mm_struct *mm, char *buffer) { - unsigned long data = 0, stack = 0, exec = 0, lib = 0; - struct vm_area_struct *vma; - - down_read(&mm->mmap_sem); - for (vma = mm->mmap; vma; vma = vma->vm_next) { - unsigned long len = (vma->vm_end - vma->vm_start) >> 10; - if (!vma->vm_file) { - data += len; - if (vma->vm_flags & VM_GROWSDOWN) - stack += len; - continue; - } - if (vma->vm_flags & VM_WRITE) - continue; - if (vma->vm_flags & VM_EXEC) { - exec += len; - if (vma->vm_flags & VM_EXECUTABLE) - continue; - lib += len; - } - } buffer += sprintf(buffer, "VmSize:\t%8lu kB\n" "VmLck:\t%8lu kB\n" @@ -37,9 +16,10 @@ char *task_mem(struct mm_struct *mm, cha mm->total_vm << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), mm->rss << (PAGE_SHIFT-10), - data - stack, stack, - exec - lib, lib); - up_read(&mm->mmap_sem); + (mm->data - mm->stack) << (PAGE_SHIFT-10), + mm->stack << (PAGE_SHIFT-10), + mm->text << (PAGE_SHIFT-10), + mm->lib << (PAGE_SHIFT-10)); return buffer; } @@ -49,30 +29,15 @@ unsigned long task_vsize(struct mm_struc } int task_statm(struct mm_struct *mm, int *shared, int *text, - int *data, int *resident) + int *lib, int *data, int *resident, int *dirty) { - struct vm_area_struct *vma; - int size = 0; - + *shared = mm->shared; + *text = mm->text; + *lib = mm->lib; + *data = mm->data; + *dirty = mm->dirty; *resident = mm->rss; - for (vma = mm->mmap; vma; vma = vma->vm_next) { - int pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; - - size += pages; - if (is_vm_hugetlb_page(vma)) { - if (!(vma->vm_flags & VM_DONTCOPY)) - *shared += pages; - continue; - } - if (vma->vm_flags & VM_SHARED || !list_empty(&vma->shared)) - *shared += pages; - if (vma->vm_flags & VM_EXECUTABLE) - *text += pages; - else - *data += pages; - } - - return size; + return mm->total_vm; } static int show_map(struct seq_file *m, void *v) diff -prauN linux-2.5.72/fs/proc/task_nommu.c wli-2.5.72-numaq-15/fs/proc/task_nommu.c --- linux-2.5.72/fs/proc/task_nommu.c 2003-06-16 21:20:05.000000000 -0700 +++ wli-2.5.72-numaq-15/fs/proc/task_nommu.c 2003-06-19 21:31:07.000000000 -0700 @@ -67,16 +67,17 @@ unsigned long task_vsize(struct mm_struc struct mm_tblock_struct *tbp; unsigned long vsize = 0; + down_read(&mm->mmap_sem); for (tbp = &mm->context.tblock; tbp; tbp = tbp->next) { if (tbp->rblock) vsize += kobjsize(tbp->rblock->kblock); } - + up_read(&mm->mmap_sem); return vsize; } int task_statm(struct mm_struct *mm, int *shared, int *text, - int *data, int *resident) + int *lib, int *data, int *resident, int *dirty) { struct mm_tblock_struct *tbp; int size = kobjsize(mm); @@ -92,7 +93,7 @@ int task_statm(struct mm_struct *mm, int size += (*text = mm->end_code - mm->start_code); size += (*data = mm->start_stack - mm->start_data); - + *shared = *lib = *dirty = 0; *resident = size; return size; } diff -prauN linux-2.5.72/include/asm-alpha/pgtable.h wli-2.5.72-numaq-15/include/asm-alpha/pgtable.h --- linux-2.5.72/include/asm-alpha/pgtable.h 2003-06-16 21:19:47.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-alpha/pgtable.h 2003-06-18 19:11:28.000000000 -0700 @@ -229,9 +229,11 @@ pmd_page_kernel(pmd_t pmd) #define pmd_page(pmd) (mem_map + 
((pmd_val(pmd) & _PFN_MASK) >> 32)) #endif -extern inline unsigned long pgd_page(pgd_t pgd) +extern inline unsigned long __pgd_page(pgd_t pgd) { return PAGE_OFFSET + ((pgd_val(pgd) & _PFN_MASK) >> (32-PAGE_SHIFT)); } +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) + extern inline int pte_none(pte_t pte) { return !pte_val(pte); } extern inline int pte_present(pte_t pte) { return pte_val(pte) & _PAGE_VALID; } extern inline void pte_clear(pte_t *ptep) { pte_val(*ptep) = 0; } @@ -280,7 +282,7 @@ extern inline pte_t pte_mkyoung(pte_t pt /* Find an entry in the second-level page table.. */ extern inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) { - return (pmd_t *) pgd_page(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PAGE - 1)); + return (pmd_t *)__pgd_page(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PAGE - 1)); } /* Find an entry in the third-level page table.. */ diff -prauN linux-2.5.72/include/asm-arm/pgtable.h wli-2.5.72-numaq-15/include/asm-arm/pgtable.h --- linux-2.5.72/include/asm-arm/pgtable.h 2003-06-16 21:19:46.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-arm/pgtable.h 2003-06-18 19:11:28.000000000 -0700 @@ -125,6 +125,11 @@ extern struct page *empty_zero_page; /* Find an entry in the second-level page table.. */ #define pmd_offset(dir, addr) ((pmd_t *)(dir)) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) /* Find an entry in the third-level page table.. */ #define __pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) diff -prauN linux-2.5.72/include/asm-arm26/pgtable.h wli-2.5.72-numaq-15/include/asm-arm26/pgtable.h --- linux-2.5.72/include/asm-arm26/pgtable.h 2003-06-16 21:19:40.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-arm26/pgtable.h 2003-06-18 19:11:28.000000000 -0700 @@ -189,6 +189,12 @@ extern struct page *empty_zero_page; #define pte_unmap(pte) do { } while (0) #define pte_unmap_nested(pte) do { } while (0) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + #define _PAGE_PRESENT 0x01 #define _PAGE_READONLY 0x02 diff -prauN linux-2.5.72/include/asm-h8300/pgtable.h wli-2.5.72-numaq-15/include/asm-h8300/pgtable.h --- linux-2.5.72/include/asm-h8300/pgtable.h 2003-06-16 21:20:00.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-h8300/pgtable.h 2003-06-18 19:11:28.000000000 -0700 @@ -15,6 +15,11 @@ typedef pte_t *pte_addr_t; #define pgd_clear(pgdp) #define kern_addr_valid(addr) (1) #define pmd_offset(a, b) ((void *)0) +#define pmd_offset_kernel(a,b) pmd_offset(a,b) +#define pmd_offset_map(a,b) pmd_offset(a,b) +#define pmd_offset_map_nested(a,b) pmd_offset(a,b) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) #define PAGE_NONE __pgprot(0) /* these mean nothing to NO_MM */ #define PAGE_SHARED __pgprot(0) /* these mean nothing to NO_MM */ diff -prauN linux-2.5.72/include/asm-i386/highmem.h wli-2.5.72-numaq-15/include/asm-i386/highmem.h --- linux-2.5.72/include/asm-i386/highmem.h 2003-06-16 21:19:40.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-i386/highmem.h 2003-06-19 10:40:30.000000000 -0700 @@ -52,11 +52,55 @@ extern void kmap_init(void); 
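[Aside, not part of the patch: the architecture hunks above stub pmd_offset_map()/pmd_unmap() out as no-ops where second-level pagetables are permanently mapped; the i386 pgtable-3level hunks further down supply the real kmap_atomic-based versions used under CONFIG_HIGHPMD. A minimal sketch of the walking discipline the new API imposes on callers, with mm and addr assumed in scope and the KM_PMD0/KM_PTE0 slots taken from the patch:

	pgd_t *pgd = pgd_offset(mm, addr);
	pmd_t *pmd = pmd_offset_map(pgd, addr);	/* kmap_atomic(..., KM_PMD0) on PAE */

	if (pmd_present(*pmd)) {
		pte_t *pte = pte_offset_map(pmd, addr);	/* kmap_atomic(..., KM_PTE0) */
		/* examine or update *pte here; no sleeping while mapped */
		pte_unmap(pte);
	}
	pmd_unmap(pmd);	/* compiles away where pmds live in lowmem */

The map/unmap pairs nest like the existing pte_offset_map()/pte_unmap(), so every early exit from a walk must unmap in reverse order.]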
extern void * FASTCALL(kmap_high(struct page *page)); extern void FASTCALL(kunmap_high(struct page *page)); -void *kmap(struct page *page); -void kunmap(struct page *page); -void *kmap_atomic(struct page *page, enum km_type type); -void kunmap_atomic(void *kvaddr, enum km_type type); -struct page *kmap_atomic_to_page(void *ptr); +void *__kmap_atomic(struct page *page, enum km_type type); +struct page *__kmap_atomic_to_page(void *ptr); + +#ifdef CONFIG_DEBUG_HIGHMEM +void __kunmap_atomic(void *kvaddr, enum km_type type); +#else +static inline void __kunmap_atomic(void *kvaddr, enum km_type type) +{ +} +#endif + +static inline void *kmap(struct page *page) +{ + might_sleep(); + if (page < highmem_start_page) + return lowmem_page_address(page); + else + return kmap_high(page); +} + +static inline void kunmap(struct page *page) +{ + BUG_ON(in_interrupt()); + if (page >= highmem_start_page) + kunmap_high(page); +} + +static inline void *kmap_atomic(struct page *page, enum km_type type) +{ + inc_preempt_count(); + if (page < highmem_start_page) + return lowmem_page_address(page); + else + return __kmap_atomic(page, type); +} + +static inline void kunmap_atomic(void *vaddr, enum km_type type) +{ + __kunmap_atomic(vaddr, type); + dec_preempt_count(); +} + +static inline struct page *kmap_atomic_to_page(void *vaddr) +{ + if ((unsigned long)vaddr < FIXADDR_START) + return virt_to_page(vaddr); + else + return __kmap_atomic_to_page(vaddr); +} #endif /* __KERNEL__ */ diff -prauN linux-2.5.72/include/asm-i386/kmap_types.h wli-2.5.72-numaq-15/include/asm-i386/kmap_types.h --- linux-2.5.72/include/asm-i386/kmap_types.h 2003-06-16 21:20:19.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-i386/kmap_types.h 2003-06-18 19:11:28.000000000 -0700 @@ -17,14 +17,16 @@ D(3) KM_USER0, D(4) KM_USER1, D(5) KM_BIO_SRC_IRQ, D(6) KM_BIO_DST_IRQ, -D(7) KM_PTE0, -D(8) KM_PTE1, -D(9) KM_PTE2, -D(10) KM_IRQ0, -D(11) KM_IRQ1, -D(12) KM_SOFTIRQ0, -D(13) KM_SOFTIRQ1, -D(14) KM_TYPE_NR +D(7) KM_PMD0, +D(8) KM_PMD1, +D(9) KM_PTE0, +D(10) KM_PTE1, +D(11) KM_PTE2, +D(12) KM_IRQ0, +D(13) KM_IRQ1, +D(14) KM_SOFTIRQ0, +D(15) KM_SOFTIRQ1, +D(16) KM_TYPE_NR }; #undef D diff -prauN linux-2.5.72/include/asm-i386/linkage.h wli-2.5.72-numaq-15/include/asm-i386/linkage.h --- linux-2.5.72/include/asm-i386/linkage.h 2003-06-16 21:20:28.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-i386/linkage.h 2003-06-20 03:33:01.000000000 -0700 @@ -3,6 +3,7 @@ #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) #define FASTCALL(x) x __attribute__((regparm(3))) +#define IRQHANDLER(x) x __attribute__((regparm(1))) #ifdef CONFIG_X86_ALIGNMENT_16 #define __ALIGN .align 16,0x90 diff -prauN linux-2.5.72/include/asm-i386/page.h wli-2.5.72-numaq-15/include/asm-i386/page.h --- linux-2.5.72/include/asm-i386/page.h 2003-06-16 21:19:41.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-i386/page.h 2003-06-20 03:19:38.000000000 -0700 @@ -3,7 +3,11 @@ /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 -#define PAGE_SIZE (1UL << PAGE_SHIFT) +#ifndef __ASSEMBLY__ +#define PAGE_SIZE (1UL << PAGE_SHIFT) +#else +#define PAGE_SIZE (1 << PAGE_SHIFT) +#endif #define PAGE_MASK (~(PAGE_SIZE-1)) #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) diff -prauN linux-2.5.72/include/asm-i386/pgalloc.h wli-2.5.72-numaq-15/include/asm-i386/pgalloc.h --- linux-2.5.72/include/asm-i386/pgalloc.h 2003-06-16 21:19:40.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-i386/pgalloc.h 2003-06-18 19:15:13.000000000 -0700 @@ -31,14 +31,6 @@ static inline void 
pte_free_kernel(pte_t free_page((unsigned long)pte); } -static inline void pte_free(struct page *pte) -{ - __free_page(pte); -} - - -#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) - /* * allocating and freeing a pmd is trivial: the 1-entry pmd is * inside the pgd, so has no extra memory associated with it. @@ -46,10 +38,29 @@ static inline void pte_free(struct page */ #define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one_kernel(mm, addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(x) do { } while (0) #define __pmd_free_tlb(tlb,x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() #define check_pgt_cache() do { } while (0) +#include + +static inline void pte_free(struct page *page) +{ + struct mmu_gather *tlb = &per_cpu(mmu_gathers, get_cpu()); + tlb_remove_page(tlb, page); + put_cpu(); +} + +static inline void pte_free_tlb(struct mmu_gather *tlb, struct page *page) +{ + tlb_remove_page(tlb, page); +} + +static inline void pmd_free_tlb(struct mmu_gather *tlb, struct page *page) +{ +} + #endif /* _I386_PGALLOC_H */ diff -prauN linux-2.5.72/include/asm-i386/pgtable-2level.h wli-2.5.72-numaq-15/include/asm-i386/pgtable-2level.h --- linux-2.5.72/include/asm-i386/pgtable-2level.h 2003-06-16 21:20:02.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-i386/pgtable-2level.h 2003-06-18 19:11:28.000000000 -0700 @@ -48,13 +48,15 @@ static inline int pgd_present(pgd_t pgd) #define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval) #define set_pgd(pgdptr, pgdval) (*(pgdptr) = pgdval) -#define pgd_page(pgd) \ -((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) + +#define pmd_offset_map(pgd, addr) ({ (pmd_t *)(pgd); }) +#define pmd_offset_map_nested(pgd, addr) pmd_offset_map(pgd, addr) +#define pmd_offset_kernel(pgd, addr) pmd_offset_map(pgd, addr) + +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) -static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) -{ - return (pmd_t *) dir; -} #define ptep_get_and_clear(xp) __pte(xchg(&(xp)->pte_low, 0)) #define pte_same(a, b) ((a).pte_low == (b).pte_low) #define pte_page(x) pfn_to_page(pte_pfn(x)) diff -prauN linux-2.5.72/include/asm-i386/pgtable-3level.h wli-2.5.72-numaq-15/include/asm-i386/pgtable-3level.h --- linux-2.5.72/include/asm-i386/pgtable-3level.h 2003-06-16 21:20:20.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-i386/pgtable-3level.h 2003-06-18 19:15:51.000000000 -0700 @@ -64,12 +64,25 @@ static inline void set_pte(pte_t *ptep, */ static inline void pgd_clear (pgd_t * pgd) { } -#define pgd_page(pgd) \ -((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) + +static inline unsigned long pgd_pfn(pgd_t pgd) +{ + return pgd_val(pgd) >> PAGE_SHIFT; +} + +#define pmd_offset_kernel(pgd, addr) \ + ((pmd_t *)__va(pgd_val(*(pgd)) & PAGE_MASK) + pmd_index(addr)) /* Find an entry in the second-level page table.. 
*/ -#define pmd_offset(dir, address) ((pmd_t *) pgd_page(*(dir)) + \ - pmd_index(address)) +#define __pmd_offset(pgd, addr, type) \ + ((pmd_t *)kmap_atomic(pgd_page(*(pgd)), type) + pmd_index(addr)) + +#define pmd_offset_map(pgd, addr) __pmd_offset(pgd, addr, KM_PMD0) +#define pmd_offset_map_nested(pgd, addr) __pmd_offset(pgd, addr, KM_PMD1) + +#define pmd_unmap(pmd) kunmap_atomic(pmd, KM_PMD0) +#define pmd_unmap_nested(pmd) kunmap_atomic(pmd, KM_PMD1) static inline pte_t ptep_get_and_clear(pte_t *ptep) { @@ -123,6 +136,4 @@ static inline pmd_t pfn_pmd(unsigned lon #define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) }) #define PTE_FILE_MAX_BITS 32 -extern struct kmem_cache_s *pae_pgd_cachep; - #endif /* _I386_PGTABLE_3LEVEL_H */ diff -prauN linux-2.5.72/include/asm-i386/pgtable.h wli-2.5.72-numaq-15/include/asm-i386/pgtable.h --- linux-2.5.72/include/asm-i386/pgtable.h 2003-06-16 21:20:19.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-i386/pgtable.h 2003-06-18 19:15:51.000000000 -0700 @@ -16,6 +16,9 @@ #include #include #include +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/spinlock.h> #ifndef _I386_BITOPS_H #include @@ -31,33 +34,26 @@ extern void paging_init(void); extern unsigned long empty_zero_page[1024]; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) -#endif /* !__ASSEMBLY__ */ +extern kmem_cache_t *pgd_cache; +extern struct list_head pgd_list; +extern spinlock_t pgd_lock; +void pgtable_cache_init(void); +void pgd_ctor(void *, kmem_cache_t *, unsigned long); +void pgd_dtor(void *, kmem_cache_t *, unsigned long); /* * The Linux x86 paging architecture is 'compile-time dual-mode', it * implements both the traditional 2-level x86 page tables and the * newer 3-level PAE-mode page tables. */ -#ifndef __ASSEMBLY__ #ifdef CONFIG_X86_PAE # include - -/* - * Need to initialise the X86 PAE caches - */ -extern void pgtable_cache_init(void); - #else # include - -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - -#endif #endif +#endif /* !__ASSEMBLY__ */ + #define PMD_SIZE (1UL << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) #define PGDIR_SIZE (1UL << PGDIR_SHIFT) @@ -294,32 +290,25 @@ static inline pte_t pte_modify(pte_t pte #define pte_offset_kernel(dir, address) \ ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address)) -#if defined(CONFIG_HIGHPTE) -#define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) + pte_index(address)) -#define pte_offset_map_nested(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE1) + pte_index(address)) -#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) -#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) -#else -#define pte_offset_map(dir, address) \ - ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address)) -#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) -#define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) -#endif +#define __pte_offset(pmd, addr, type) \ + ((pte_t *)kmap_atomic(pmd_page(*pmd), type) + pte_index(addr)) -#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM4G) -typedef u32 pte_addr_t; -#endif +#define pte_offset_map(pmd, addr) __pte_offset(pmd, addr, KM_PTE0) +#define pte_offset_map_nested(pmd, addr) __pte_offset(pmd, addr, KM_PTE1) +#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) +#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) + +#ifdef CONFIG_HIGHPTE -#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM64G) +#ifdef CONFIG_HIGHMEM64G typedef u64 pte_addr_t; -#endif +#else /* 
CONFIG_HIGHMEM4G */ +typedef u32 pte_addr_t; +#endif /* CONFIG_HIGHMEM4G */ -#if !defined(CONFIG_HIGHPTE) +#else /* !CONFIG_HIGHPTE */ typedef pte_t *pte_addr_t; -#endif +#endif /* !CONFIG_HIGHPTE */ /* * The i386 doesn't have any external MMU info: the kernel page diff -prauN linux-2.5.72/include/asm-i386/thread_info.h wli-2.5.72-numaq-15/include/asm-i386/thread_info.h --- linux-2.5.72/include/asm-i386/thread_info.h 2003-06-16 21:19:41.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-i386/thread_info.h 2003-06-20 04:49:06.000000000 -0700 @@ -9,6 +9,8 @@ #ifdef __KERNEL__ +#include +#include #ifndef __ASSEMBLY__ #include #endif @@ -30,9 +32,11 @@ struct thread_info { __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: + 0 for interrupts: illegal 0-0xBFFFFFFF for user-thread 0-0xFFFFFFFF for kernel-thread */ + struct thread_info *irq_stack; /* pointer to cpu irq stack */ struct restart_block restart_block; __u8 supervisor_stack[0]; @@ -48,7 +52,8 @@ struct thread_info { #define TI_CPU 0x00000010 #define TI_PRE_COUNT 0x00000014 #define TI_ADDR_LIMIT 0x00000018 -#define TI_RESTART_BLOCK 0x000001C +#define TI_IRQ_STACK 0x0000001C +#define TI_RESTART_BLOCK 0x00000020 #endif @@ -59,46 +64,64 @@ struct thread_info { * * preempt_count needs to be 1 initially, until the scheduler is functional. */ +#ifdef CONFIG_4K_STACK +#define THREAD_ORDER 0 +#define STACK_WARN 0x200 +#define STACK_PANIC 0x100 +#else +#define THREAD_ORDER 1 +#define STACK_WARN ((THREAD_SIZE)>>1) +#define STACK_PANIC 0x100 +#endif +#define INIT_THREAD_SIZE THREAD_SIZE + #ifndef __ASSEMBLY__ -#define INIT_THREAD_INFO(tsk) \ -{ \ - .task = &tsk, \ - .exec_domain = &default_exec_domain, \ - .flags = 0, \ - .cpu = 0, \ - .preempt_count = 1, \ - .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ +#define INIT_THREAD_INFO(tsk) \ +{ \ + .task = &tsk, \ + .exec_domain = &default_exec_domain, \ + .flags = 0, \ + .cpu = 0, \ + .preempt_count = 1, \ + .addr_limit = KERNEL_DS, \ + .irq_stack = &init_irq_union.thread_info, \ + .restart_block = { \ + .fn = do_no_restart_syscall, \ + } \ } #define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) +/* thread information allocation */ +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) +#define alloc_thread_info(tsk) ((struct thread_info *) __get_free_pages(GFP_KERNEL,THREAD_ORDER)) +#define free_thread_info(ti) free_pages((unsigned long) (ti), THREAD_ORDER) +#define get_thread_info(ti) get_task_struct((ti)->task) +#define put_thread_info(ti) put_task_struct((ti)->task) + /* how to get the thread information struct from C */ static inline struct thread_info *current_thread_info(void) { struct thread_info *ti; - __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~8191UL)); + __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~(THREAD_SIZE - 1))); return ti; } -/* thread information allocation */ -#define THREAD_SIZE (2*PAGE_SIZE) -#define alloc_thread_info(tsk) ((struct thread_info *) __get_free_pages(GFP_KERNEL,1)) -#define free_thread_info(ti) free_pages((unsigned long) (ti), 1) -#define get_thread_info(ti) get_task_struct((ti)->task) -#define put_thread_info(ti) put_task_struct((ti)->task) - #else /* !__ASSEMBLY__ */ +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) + /* how to get the thread information struct from ASM */ #define GET_THREAD_INFO(reg) \ - movl $-8192, reg; \ + movl $-THREAD_SIZE, reg; \ andl %esp, reg +/* use this one if reg already contains %esp */ +#define 
GET_THREAD_INFO_WITH_ESP(reg) \ +andl $-THREAD_SIZE, reg + #endif /* diff -prauN linux-2.5.72/include/asm-i386/tlb.h wli-2.5.72-numaq-15/include/asm-i386/tlb.h --- linux-2.5.72/include/asm-i386/tlb.h 2003-06-16 21:19:47.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-i386/tlb.h 2003-06-18 19:15:13.000000000 -0700 @@ -1,10 +1,54 @@ #ifndef _I386_TLB_H #define _I386_TLB_H - /* - * x86 doesn't need any special per-pte or - * per-vma handling.. + * include/asm-i386/tlb.h + * (C) June 2003 William Irwin, IBM + * Routines for pagetable cacheing and release. */ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_HIGHPTE +#define GFP_PTE (GFP_KERNEL|__GFP_REPEAT|__GFP_HIGHMEM) +#else +#define GFP_PTE (GFP_KERNEL|__GFP_REPEAT) +#endif + +#ifdef CONFIG_HIGHPMD +#define GFP_PMD (GFP_KERNEL|__GFP_REPEAT|__GFP_HIGHMEM) +#else +#define GFP_PMD (GFP_KERNEL|__GFP_REPEAT) +#endif + +#define PG_PTE PG_arch_1 +#define NR_PTE 128 +#define FREE_PTE_NR NR_PTE +#define NR_NONPTE 512 +#define MAX_ZONE_ID (MAX_NUMNODES * MAX_NR_ZONES) + +#define PagePTE(page) test_bit(PG_PTE, &(page)->flags) +#define SetPagePTE(page) set_bit(PG_PTE, &(page)->flags) +#define ClearPagePTE(page) clear_bit(PG_PTE, &(page)->flags) +#define TestSetPagePTE(page) test_and_set_bit(PG_PTE, &(page)->flags) +#define TestClearPagePTE(page) test_and_clear_bit(PG_PTE, &(page)->flags) +#define PageZoneID(page) ((page)->flags >> ZONE_SHIFT) + +struct mmu_gather { + struct mm_struct *mm; + int nr_pte_active, nr_pte_ready, nr_nonpte, need_flush, fullmm, freed; + struct list_head active_list[MAX_ZONE_ID], ready_list[MAX_ZONE_ID]; + int active_count[MAX_ZONE_ID], ready_count[MAX_ZONE_ID]; + struct page *nonpte[NR_NONPTE]; +}; + +DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); + #define tlb_start_vma(tlb, vma) do { } while (0) #define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) @@ -15,6 +59,109 @@ */ #define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) -#include +void tlb_init(void); -#endif +static inline +struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, unsigned int flush) +{ + struct mmu_gather *tlb = &per_cpu(mmu_gathers, get_cpu()); + tlb->mm = mm; + tlb->fullmm = flush; + tlb->freed = 0; + put_cpu(); + return tlb; +} + +static inline +void tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *pte, unsigned long addr) +{ + tlb->need_flush = 1; +} + +static inline +void tlb_flush_ready(struct mmu_gather *tlb) +{ + int zone = 0; + while (tlb->nr_pte_ready >= NR_PTE) { + if (!list_empty(&tlb->ready_list[zone])) { + struct page *head = list_entry(tlb->ready_list[zone].next, struct page, list); + list_del_init(&head->list); + list_splice_init(&tlb->ready_list[zone], &head->list); + head->private = tlb->ready_count[zone]; + tlb->nr_pte_ready -= tlb->ready_count[zone]; + tlb->ready_count[zone] = 0; + free_pages_bulk(zone_table[zone], head, 0); + } + ++zone; + } +} + +static inline +void tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) +{ + int zone; + + if (!tlb->need_flush && tlb->nr_nonpte < NR_NONPTE) + return; + + tlb->need_flush = 0; + tlb_flush(tlb); + if (tlb->nr_nonpte) { + free_pages_and_swap_cache(tlb->nonpte, tlb->nr_nonpte); + tlb->nr_nonpte = 0; + } + + for (zone = 0; zone < MAX_ZONE_ID; ++zone) { + if (!tlb->active_count[zone]) + continue; + + list_splice_init(&tlb->active_list[zone], &tlb->ready_list[zone]); + tlb->ready_count[zone] += tlb->active_count[zone]; + tlb->active_count[zone] = 0; + } + 
tlb->nr_pte_ready += tlb->nr_pte_active; + tlb->nr_pte_active = 0; + if (tlb->nr_pte_ready >= NR_PTE) + tlb_flush_ready(tlb); +} + +static inline +void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) +{ + if (tlb->mm->rss >= tlb->freed) + tlb->mm->rss -= tlb->freed; + else + tlb->mm->rss = 0; + tlb_flush_mmu(tlb, start, end); +} + +static inline +void tlb_remove_nonpte_page(struct mmu_gather *tlb, struct page *page) +{ + tlb->nonpte[tlb->nr_nonpte] = page; + tlb->nr_nonpte++; + if (tlb->nr_nonpte >= NR_NONPTE) + tlb_flush_mmu(tlb, 0, 0); +} + +static inline +void tlb_remove_pte_page(struct mmu_gather *tlb, struct page *page) +{ + int zone = PageZoneID(page); + ClearPagePTE(page); + tlb->nr_pte_active++; + tlb->active_count[zone]++; + list_add(&page->list, &tlb->active_list[zone]); +} + +static inline +void tlb_remove_page(struct mmu_gather *tlb, struct page *page) +{ + tlb->need_flush = 1; + if (PagePTE(page)) + tlb_remove_pte_page(tlb, page); + else + tlb_remove_nonpte_page(tlb, page); +} + +#endif /* _I386_TLB_H */ diff -prauN linux-2.5.72/include/asm-ia64/pgtable.h wli-2.5.72-numaq-15/include/asm-ia64/pgtable.h --- linux-2.5.72/include/asm-ia64/pgtable.h 2003-06-16 21:19:47.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-ia64/pgtable.h 2003-06-18 19:11:28.000000000 -0700 @@ -250,7 +250,8 @@ ia64_phys_addr_valid (unsigned long addr #define pgd_bad(pgd) (!ia64_phys_addr_valid(pgd_val(pgd))) #define pgd_present(pgd) (pgd_val(pgd) != 0UL) #define pgd_clear(pgdp) (pgd_val(*(pgdp)) = 0UL) -#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & _PFN_MASK)) +#define __pgd_page(pgd) ((unsigned long)__va(pgd_val(pgd) & _PFN_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) /* * The following have defined behavior only work if pte_present() is true. @@ -319,7 +320,13 @@ pgd_offset (struct mm_struct *mm, unsign /* Find an entry in the second-level page table.. */ #define pmd_offset(dir,addr) \ - ((pmd_t *) pgd_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) + ((pmd_t *)__pgd_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) + +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) /* * Find an entry in the third-level page table. 
This looks more complicated than it diff -prauN linux-2.5.72/include/asm-m68k/motorola_pgtable.h wli-2.5.72-numaq-15/include/asm-m68k/motorola_pgtable.h --- linux-2.5.72/include/asm-m68k/motorola_pgtable.h 2003-06-16 21:20:06.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-m68k/motorola_pgtable.h 2003-06-18 19:11:28.000000000 -0700 @@ -115,6 +115,7 @@ extern inline void pgd_set(pgd_t * pgdp, #define __pte_page(pte) ((unsigned long)__va(pte_val(pte) & PAGE_MASK)) #define __pmd_page(pmd) ((unsigned long)__va(pmd_val(pmd) & _TABLE_MASK)) #define __pgd_page(pgd) ((unsigned long)__va(pgd_val(pgd) & _TABLE_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) #define pte_none(pte) (!pte_val(pte)) diff -prauN linux-2.5.72/include/asm-m68knommu/pgtable.h wli-2.5.72-numaq-15/include/asm-m68knommu/pgtable.h --- linux-2.5.72/include/asm-m68knommu/pgtable.h 2003-06-16 21:20:05.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-m68knommu/pgtable.h 2003-06-18 19:11:28.000000000 -0700 @@ -21,7 +21,12 @@ typedef pte_t *pte_addr_t; #define pgd_bad(pgd) (0) #define pgd_clear(pgdp) #define kern_addr_valid(addr) (1) -#define pmd_offset(a, b) ((void *)0) +#define pmd_offset(a, b) ((void *)0) +#define pmd_offset_kernel(a, b) pmd_offset(a, b) +#define pmd_offset_map(a, b) pmd_offset(a, b) +#define pmd_offset_map_nested(a, b) pmd_offset(a, b) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) #define PAGE_NONE __pgprot(0) #define PAGE_SHARED __pgprot(0) diff -prauN linux-2.5.72/include/asm-mips64/pgtable.h wli-2.5.72-numaq-15/include/asm-mips64/pgtable.h --- linux-2.5.72/include/asm-mips64/pgtable.h 2003-06-16 21:20:27.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-mips64/pgtable.h 2003-06-18 19:11:28.000000000 -0700 @@ -274,11 +274,13 @@ extern inline unsigned long pmd_page(pmd return pmd_val(pmd); } -extern inline unsigned long pgd_page(pgd_t pgd) +extern inline unsigned long __pgd_page(pgd_t pgd) { return pgd_val(pgd); } +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) + extern inline void pmd_set(pmd_t * pmdp, pte_t * ptep) { pmd_val(*pmdp) = (((unsigned long) ptep) & PAGE_MASK); @@ -520,7 +522,7 @@ extern inline pgd_t *pgd_offset(struct m /* Find an entry in the second-level page table.. 
*/ extern inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) { - return (pmd_t *) pgd_page(*dir) + + return (pmd_t *)__pgd_page(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PMD - 1)); } diff -prauN linux-2.5.72/include/asm-parisc/pgtable.h wli-2.5.72-numaq-15/include/asm-parisc/pgtable.h --- linux-2.5.72/include/asm-parisc/pgtable.h 2003-06-16 21:20:22.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-parisc/pgtable.h 2003-06-18 19:11:28.000000000 -0700 @@ -242,7 +242,8 @@ extern unsigned long *empty_zero_page; #ifdef __LP64__ -#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define __pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) /* For 64 bit we have three level tables */ @@ -339,11 +340,17 @@ extern inline pte_t pte_modify(pte_t pte #ifdef __LP64__ #define pmd_offset(dir,address) \ -((pmd_t *) pgd_page(*(dir)) + (((address)>>PMD_SHIFT) & (PTRS_PER_PMD-1))) +((pmd_t *)__pgd_page(*(dir)) + (((address)>>PMD_SHIFT) & (PTRS_PER_PMD-1))) #else #define pmd_offset(dir,addr) ((pmd_t *) dir) #endif +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table.. */ #define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE-1)) #define pte_offset_kernel(pmd, address) \ diff -prauN linux-2.5.72/include/asm-ppc/pgtable.h wli-2.5.72-numaq-15/include/asm-ppc/pgtable.h --- linux-2.5.72/include/asm-ppc/pgtable.h 2003-06-16 21:19:42.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-ppc/pgtable.h 2003-06-18 19:11:28.000000000 -0700 @@ -370,8 +370,9 @@ static inline int pgd_bad(pgd_t pgd) { static inline int pgd_present(pgd_t pgd) { return 1; } #define pgd_clear(xp) do { } while (0) -#define pgd_page(pgd) \ +#define __pgd_page(pgd) \ ((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) /* * The following only work if pte_present() is true. diff -prauN linux-2.5.72/include/asm-ppc64/pgtable.h wli-2.5.72-numaq-15/include/asm-ppc64/pgtable.h --- linux-2.5.72/include/asm-ppc64/pgtable.h 2003-06-16 21:20:24.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-ppc64/pgtable.h 2003-06-18 19:11:28.000000000 -0700 @@ -190,7 +190,8 @@ extern unsigned long empty_zero_page[PAG #define pgd_bad(pgd) ((pgd_val(pgd)) == 0) #define pgd_present(pgd) (pgd_val(pgd) != 0UL) #define pgd_clear(pgdp) (pgd_val(*(pgdp)) = 0UL) -#define pgd_page(pgd) (__bpn_to_ba(pgd_val(pgd))) +#define __pgd_page(pgd) (__bpn_to_ba(pgd_val(pgd))) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) /* * Find an entry in a page-table-directory. We combine the address region @@ -203,12 +204,18 @@ extern unsigned long empty_zero_page[PAG /* Find an entry in the second-level page table.. */ #define pmd_offset(dir,addr) \ - ((pmd_t *) pgd_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) + ((pmd_t *)__pgd_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) /* Find an entry in the third-level page table.. 
*/ #define pte_offset_kernel(dir,addr) \ ((pte_t *) pmd_page_kernel(*(dir)) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) #define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) #define pte_unmap(pte) do { } while(0) diff -prauN linux-2.5.72/include/asm-s390/pgtable.h wli-2.5.72-numaq-15/include/asm-s390/pgtable.h --- linux-2.5.72/include/asm-s390/pgtable.h 2003-06-16 21:20:20.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-s390/pgtable.h 2003-06-18 19:11:28.000000000 -0700 @@ -613,6 +613,7 @@ static inline pte_t mk_pte_phys(unsigned /* to find an entry in a page-table-directory */ #define pgd_index(address) ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) #define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address)) +#define pgd_page(pgd) virt_to_page(pgd_page_kernel(pgd)) /* to find an entry in a kernel page-table-directory */ #define pgd_offset_k(address) pgd_offset(&init_mm, address) @@ -634,6 +635,12 @@ extern inline pmd_t * pmd_offset(pgd_t * #endif /* __s390x__ */ +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table.. */ #define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE-1)) #define pte_offset_kernel(pmd, address) \ diff -prauN linux-2.5.72/include/asm-sh/pgtable-2level.h wli-2.5.72-numaq-15/include/asm-sh/pgtable-2level.h --- linux-2.5.72/include/asm-sh/pgtable-2level.h 2003-06-16 21:20:26.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-sh/pgtable-2level.h 2003-06-18 19:11:28.000000000 -0700 @@ -48,8 +48,9 @@ static inline void pgd_clear (pgd_t * pg #define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval) #define set_pgd(pgdptr, pgdval) (*(pgdptr) = pgdval) -#define pgd_page(pgd) \ +#define __pgd_page(pgd) \ ((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) { diff -prauN linux-2.5.72/include/asm-sparc/pgtable.h wli-2.5.72-numaq-15/include/asm-sparc/pgtable.h --- linux-2.5.72/include/asm-sparc/pgtable.h 2003-06-16 21:20:03.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-sparc/pgtable.h 2003-06-18 19:11:28.000000000 -0700 @@ -202,10 +202,11 @@ extern unsigned long empty_zero_page; /* */ BTFIXUPDEF_CALL_CONST(struct page *, pmd_page, pmd_t) -BTFIXUPDEF_CALL_CONST(unsigned long, pgd_page, pgd_t) +BTFIXUPDEF_CALL_CONST(unsigned long, __pgd_page, pgd_t) #define pmd_page(pmd) BTFIXUP_CALL(pmd_page)(pmd) -#define pgd_page(pgd) BTFIXUP_CALL(pgd_page)(pgd) +#define __pgd_page(pgd) BTFIXUP_CALL(__pgd_page)(pgd) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) BTFIXUPDEF_SETHI(none_mask) BTFIXUPDEF_CALL_CONST(int, pte_present, pte_t) @@ -352,6 +353,11 @@ extern __inline__ pte_t pte_modify(pte_t /* Find an entry in the second-level page table.. 
*/ BTFIXUPDEF_CALL(pmd_t *, pmd_offset, pgd_t *, unsigned long) #define pmd_offset(dir,addr) BTFIXUP_CALL(pmd_offset)(dir,addr) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) /* Find an entry in the third-level page table.. */ BTFIXUPDEF_CALL(pte_t *, pte_offset_kernel, pmd_t *, unsigned long) diff -prauN linux-2.5.72/include/asm-sparc64/pgtable.h wli-2.5.72-numaq-15/include/asm-sparc64/pgtable.h --- linux-2.5.72/include/asm-sparc64/pgtable.h 2003-06-16 21:19:39.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-sparc64/pgtable.h 2003-06-18 19:11:28.000000000 -0700 @@ -228,7 +228,8 @@ static inline pte_t pte_modify(pte_t ori (pgd_val(*(pgdp)) = (__pa((unsigned long) (pmdp)) >> 11UL)) #define __pmd_page(pmd) ((unsigned long) __va((pmd_val(pmd)<<11UL))) #define pmd_page(pmd) virt_to_page((void *)__pmd_page(pmd)) -#define pgd_page(pgd) ((unsigned long) __va((pgd_val(pgd)<<11UL))) +#define __pgd_page(pgd) ((unsigned long) __va((pgd_val(pgd)<<11UL))) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) #define pte_none(pte) (!pte_val(pte)) #define pte_present(pte) (pte_val(pte) & _PAGE_PRESENT) #define pte_clear(pte) (pte_val(*(pte)) = 0UL) @@ -270,8 +271,13 @@ static inline pte_t pte_modify(pte_t ori #define pgd_offset_k(address) pgd_offset(&init_mm, address) /* Find an entry in the second-level page table.. */ -#define pmd_offset(dir, address) ((pmd_t *) pgd_page(*(dir)) + \ +#define pmd_offset(dir, address) ((pmd_t *)__pgd_page(*(dir)) + \ ((address >> PMD_SHIFT) & (REAL_PTRS_PER_PMD-1))) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) /* Find an entry in the third-level page table.. 
*/ #define pte_index(dir, address) ((pte_t *) __pmd_page(*(dir)) + \ diff -prauN linux-2.5.72/include/asm-v850/pgtable.h wli-2.5.72-numaq-15/include/asm-v850/pgtable.h --- linux-2.5.72/include/asm-v850/pgtable.h 2003-06-16 21:20:07.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-v850/pgtable.h 2003-06-18 19:11:28.000000000 -0700 @@ -13,6 +13,11 @@ typedef pte_t *pte_addr_t; #define pgd_clear(pgdp) ((void)0) #define pmd_offset(a, b) ((void *)0) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) #define kern_addr_valid(addr) (1) diff -prauN linux-2.5.72/include/asm-x86_64/pgtable.h wli-2.5.72-numaq-15/include/asm-x86_64/pgtable.h --- linux-2.5.72/include/asm-x86_64/pgtable.h 2003-06-16 21:20:19.000000000 -0700 +++ wli-2.5.72-numaq-15/include/asm-x86_64/pgtable.h 2003-06-18 19:11:28.000000000 -0700 @@ -98,8 +98,9 @@ static inline void set_pml4(pml4_t *dst, pml4_val(*dst) = pml4_val(val); } -#define pgd_page(pgd) \ +#define __pgd_page(pgd) \ ((unsigned long) __va(pgd_val(pgd) & PHYSICAL_PAGE_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) #define ptep_get_and_clear(xp) __pte(xchg(&(xp)->pte, 0)) #define pte_same(a, b) ((a).pte == (b).pte) @@ -332,8 +333,13 @@ static inline pgd_t *current_pgd_offset_ #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) -#define pmd_offset(dir, address) ((pmd_t *) pgd_page(*(dir)) + \ +#define pmd_offset(dir, address) ((pmd_t *)__pgd_page(*(dir)) + \ pmd_index(address)) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) #define pmd_none(x) (!pmd_val(x)) #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) diff -prauN linux-2.5.72/include/linux/dcache.h wli-2.5.72-numaq-15/include/linux/dcache.h --- linux-2.5.72/include/linux/dcache.h 2003-06-16 21:20:27.000000000 -0700 +++ wli-2.5.72-numaq-15/include/linux/dcache.h 2003-06-20 02:40:54.000000000 -0700 @@ -154,6 +154,7 @@ d_iput: no no yes #define DCACHE_UNHASHED 0x0010 extern spinlock_t dcache_lock; +extern spinlock_t vfsmount_lock; /** * d_drop - drop a dentry diff -prauN linux-2.5.72/include/linux/fs.h wli-2.5.72-numaq-15/include/linux/fs.h --- linux-2.5.72/include/linux/fs.h 2003-06-16 21:19:46.000000000 -0700 +++ wli-2.5.72-numaq-15/include/linux/fs.h 2003-06-18 21:43:23.000000000 -0700 @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include struct iovec; @@ -309,11 +311,29 @@ struct address_space_operations { loff_t offset, unsigned long nr_segs); }; +#if NR_CPUS > 8 +typedef rwlock_t mapping_rwlock_t; +#define mapping_rdlock(lock) read_lock(lock) +#define mapping_rdunlock(lock) read_unlock(lock) +#define mapping_wrlock(lock) write_lock(lock) +#define mapping_wrunlock(lock) write_unlock(lock) +#define mapping_rwlock_init(lock) rwlock_init(lock) +#define MAPPING_RW_LOCK_UNLOCKED RW_LOCK_UNLOCKED +#else +typedef spinlock_t mapping_rwlock_t; +#define mapping_rdlock(lock) spin_lock(lock) +#define mapping_rdunlock(lock) spin_unlock(lock) +#define mapping_wrlock(lock) spin_lock(lock) +#define 
mapping_wrunlock(lock) spin_unlock(lock) +#define mapping_rwlock_init(lock) spin_lock_init(lock) +#define MAPPING_RW_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED +#endif + struct backing_dev_info; struct address_space { struct inode *host; /* owner: inode, block_device */ struct radix_tree_root page_tree; /* radix tree of all pages */ - spinlock_t page_lock; /* and spinlock protecting it */ + mapping_rwlock_t page_lock; /* and spinlock protecting it */ struct list_head clean_pages; /* list of clean pages */ struct list_head dirty_pages; /* list of dirty pages */ struct list_head locked_pages; /* list of locked pages */ diff -prauN linux-2.5.72/include/linux/gfp.h wli-2.5.72-numaq-15/include/linux/gfp.h --- linux-2.5.72/include/linux/gfp.h 2003-06-16 21:19:46.000000000 -0700 +++ wli-2.5.72-numaq-15/include/linux/gfp.h 2003-06-18 19:15:13.000000000 -0700 @@ -76,6 +76,7 @@ static inline struct page * alloc_pages_ extern unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order)); extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask)); +void free_pages_bulk(struct zone *zone, struct page *page, unsigned int order); #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask),0) diff -prauN linux-2.5.72/include/linux/hugetlb.h wli-2.5.72-numaq-15/include/linux/hugetlb.h --- linux-2.5.72/include/linux/hugetlb.h 2003-06-16 21:20:01.000000000 -0700 +++ wli-2.5.72-numaq-15/include/linux/hugetlb.h 2003-06-18 19:55:50.000000000 -0700 @@ -41,6 +41,11 @@ mark_mm_hugetlb(struct mm_struct *mm, st #define is_hugepage_only_range(addr, len) 0 #endif +#define vm_account_huge_inc(vma, pte, addr) \ + vm_account(vma, pte, addr, HPAGE_SIZE/PAGE_SIZE) +#define vm_account_huge_dec(vma, pte, addr) \ + vm_account(vma, pte, addr, -(HPAGE_SIZE/PAGE_SIZE)) + #else /* !CONFIG_HUGETLB_PAGE */ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) diff -prauN linux-2.5.72/include/linux/mm.h wli-2.5.72-numaq-15/include/linux/mm.h --- linux-2.5.72/include/linux/mm.h 2003-06-16 21:19:39.000000000 -0700 +++ wli-2.5.72-numaq-15/include/linux/mm.h 2003-06-20 05:04:06.000000000 -0700 @@ -179,6 +179,7 @@ struct page { struct pte_chain *chain;/* Reverse pte mapping pointer. 
* protected by PG_chainlock */ pte_addr_t direct; + int mapcount; } pte; unsigned long private; /* mapping-private opaque data */ @@ -339,9 +340,14 @@ static inline void set_page_zone(struct page->flags |= zone_num << ZONE_SHIFT; } -static inline void * lowmem_page_address(struct page *page) +#ifndef CONFIG_DISCONTIGMEM +/* The array of struct pages - for discontigmem use pgdat->lmem_map */ +extern struct page *mem_map; +#endif + +static inline void *lowmem_page_address(struct page *page) { - return __va( ( (page - page_zone(page)->zone_mem_map) + page_zone(page)->zone_start_pfn) << PAGE_SHIFT); + return __va(page_to_pfn(page) << PAGE_SHIFT); } #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) @@ -395,11 +401,6 @@ static inline int page_mapped(struct pag #define VM_FAULT_MINOR 1 #define VM_FAULT_MAJOR 2 -#ifndef CONFIG_DISCONTIGMEM -/* The array of struct pages - for discontigmem use pgdat->lmem_map */ -extern struct page *mem_map; -#endif - extern void show_free_areas(void); struct page *shmem_nopage(struct vm_area_struct * vma, @@ -423,15 +424,16 @@ int zeromap_page_range(struct vm_area_st extern int vmtruncate(struct inode * inode, loff_t offset); extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); +pmd_t *FASTCALL(__pmd_alloc_kernel(struct mm_struct *mm, pgd_t *pmd, unsigned long address)); extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); -extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); +pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t **pmd, unsigned long address)); extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot); extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); extern long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long nonblock); -void put_dirty_page(struct task_struct *tsk, struct page *page, - unsigned long address, pgprot_t prot); +void put_dirty_page(task_t *task, struct vm_area_struct *vma, + struct page *page, unsigned long address, pgprot_t prot); int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); @@ -485,12 +487,11 @@ static inline int set_page_dirty(struct * inlining and the symmetry break with pte_alloc_map() that does all * of this out-of-line. */ -static inline pmd_t *pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) -{ - if (pgd_none(*pgd)) - return __pmd_alloc(mm, pgd, address); - return pmd_offset(pgd, address); -} +#define pmd_alloc_map(mm, pgd, addr) \ + (pgd_none(*(pgd))? __pmd_alloc(mm,pgd,addr): pmd_offset_map(pgd,addr)) + +#define pmd_alloc_kernel(mm, pgd, addr) \ + (pgd_none(*(pgd))? 
__pmd_alloc_kernel(mm,pgd,addr): pmd_offset_kernel(pgd,addr)) extern void free_area_init(unsigned long * zones_size); extern void free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, @@ -609,5 +610,75 @@ extern struct page * follow_page(struct int write); extern int remap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long to, unsigned long size, pgprot_t prot); + +static inline void vm_account(struct vm_area_struct *vma, pte_t pte, + unsigned long addr, long adjustment) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long pfn; + struct page *page; + + if (!pte_present(pte)) + return; + + pfn = pte_pfn(pte); + if (!pfn_valid(pfn)) + goto out; + + page = pfn_to_page(pfn); + if (PageReserved(page)) + goto out; + + if (vma->vm_flags & VM_EXECUTABLE) + mm->text += adjustment; + else if (vma->vm_flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN))) { + mm->data += adjustment; + mm->stack += adjustment; + } else if (addr >= TASK_UNMAPPED_BASE) + mm->lib += adjustment; + else + mm->data += adjustment; + + if (page->mapping) + mm->shared += adjustment; + +out: + if (pte_write(pte)) + mm->dirty += adjustment; +} + +#define vm_account_inc(vma, pte, addr) vm_account(vma, pte, addr, +1) +#define vm_account_dec(vma, pte, addr) vm_account(vma, pte, addr, -1) + +static inline void vm_ptep_set_wrprotect(struct mm_struct *mm, pte_t *pte) +{ + if (pte_write(*pte)) + mm->dirty--; + ptep_set_wrprotect(pte); +} + +static inline void vm_set_pte(struct vm_area_struct *vma, pte_t *dst, + pte_t val, unsigned long addr) +{ + vm_account_inc(vma, val, addr); + set_pte(dst, val); +} + +static inline pte_t vm_ptep_get_and_clear(struct vm_area_struct *vma, + pte_t *pte, unsigned long addr) +{ + pte_t val = ptep_get_and_clear(pte); + vm_account_dec(vma, val, addr); + return val; +} + +static inline void vm_pte_clear(struct vm_area_struct *vma, pte_t *pte, + unsigned long addr) +{ + pte_t val = *pte; + pte_clear(pte); + vm_account_dec(vma, val, addr); +} + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff -prauN linux-2.5.72/include/linux/mmzone.h wli-2.5.72-numaq-15/include/linux/mmzone.h --- linux-2.5.72/include/linux/mmzone.h 2003-06-16 21:20:02.000000000 -0700 +++ wli-2.5.72-numaq-15/include/linux/mmzone.h 2003-06-18 19:10:12.000000000 -0700 @@ -26,8 +26,8 @@ #endif struct free_area { - struct list_head free_list; - unsigned long *map; + struct list_head free_list, deferred_pages; + unsigned long *map, globally_free, active, locally_free; }; struct pglist_data; diff -prauN linux-2.5.72/include/linux/page-flags.h wli-2.5.72-numaq-15/include/linux/page-flags.h --- linux-2.5.72/include/linux/page-flags.h 2003-06-16 21:20:07.000000000 -0700 +++ wli-2.5.72-numaq-15/include/linux/page-flags.h 2003-06-20 05:04:06.000000000 -0700 @@ -75,6 +75,7 @@ #define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ #define PG_reclaim 18 /* To be reclaimed asap */ #define PG_compound 19 /* Part of a compound page */ +#define PG_anon 20 /* Anonymous page */ /* @@ -266,6 +267,10 @@ extern void get_full_page_state(struct p #define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) #define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) +#define PageAnon(page) test_bit(PG_anon, &(page)->flags) +#define SetPageAnon(page) set_bit(PG_anon, &(page)->flags) +#define ClearPageAnon(page) clear_bit(PG_anon, &(page)->flags) + /* * The PageSwapCache predicate doesn't use a PG_flag at this time, * but it may again do so one day. 
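[Aside, illustrative only: the linux/pid.h hunk just below and the kernel/pid.c hunks further down provide the snapshot-free traversal that the new proc_pid_readdir() shown earlier relies on. The contract, as that caller uses it, is that repeated calls enumerate every live pid exactly once in a stable order (buckets in index order, ascending pids within each bucket, which is why attach_pid() now inserts in sorted position) and yield a negative value once every bucket is exhausted. A sketch of the consuming loop, assuming only what the patch itself shows:

	int pid = 0;

	for (;;) {
		pid = find_next_pid(pid);	/* next pid in hash-bucket order */
		if (pid < 0)
			break;			/* no more tasks */
		/* emit one /proc entry for pid; the readdir stores
		 * pid + 1 + FIRST_PROCESS_ENTRY in f_pos so a later
		 * call can resume exactly here */
	}

Because a restart rehashes the last pid returned, resumption lands in the correct bucket even if that task has since exited.]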
diff -prauN linux-2.5.72/include/linux/pid.h wli-2.5.72-numaq-15/include/linux/pid.h --- linux-2.5.72/include/linux/pid.h 2003-06-16 21:19:45.000000000 -0700 +++ wli-2.5.72-numaq-15/include/linux/pid.h 2003-06-18 19:16:42.000000000 -0700 @@ -47,6 +47,7 @@ extern void FASTCALL(detach_pid(struct t * held. */ extern struct pid *FASTCALL(find_pid(enum pid_type, int)); +int find_next_pid(int); extern int alloc_pidmap(void); extern void FASTCALL(free_pidmap(int)); diff -prauN linux-2.5.72/include/linux/rmap-locking.h wli-2.5.72-numaq-15/include/linux/rmap-locking.h --- linux-2.5.72/include/linux/rmap-locking.h 2003-06-16 21:19:40.000000000 -0700 +++ wli-2.5.72-numaq-15/include/linux/rmap-locking.h 2003-06-19 11:57:21.000000000 -0700 @@ -6,10 +6,14 @@ */ #include +#include +#include struct pte_chain; extern kmem_cache_t *pte_chain_cache; +DECLARE_PER_CPU(struct pte_chain *, local_pte_chain); + static inline void pte_chain_lock(struct page *page) { /* @@ -37,7 +41,6 @@ static inline void pte_chain_unlock(stru preempt_enable(); } -struct pte_chain *pte_chain_alloc(int gfp_flags); void __pte_chain_free(struct pte_chain *pte_chain); static inline void pte_chain_free(struct pte_chain *pte_chain) @@ -45,3 +48,34 @@ static inline void pte_chain_free(struct if (pte_chain) __pte_chain_free(pte_chain); } + +/* + * pte_chain_alloc(): allocate a pte_chain structure for use by page_add_rmap(). + * + * The caller of page_add_rmap() must perform the allocation because + * page_add_rmap() is invariably called under spinlock. Often, page_add_rmap() + * will not actually use the pte_chain, because there is space available in one + * of the existing pte_chains which are attached to the page. So the case of + * allocating and then freeing a single pte_chain is specially optimised here, + * with a one-deep per-cpu cache. 
+ */ +static inline struct pte_chain *pte_chain_alloc(int gfp_flags) +{ + struct pte_chain *ret, **pte_chainp; + int cpu; + + if (gfp_flags & __GFP_WAIT) + might_sleep(); + + cpu = get_cpu(); + pte_chainp = &per_cpu(local_pte_chain, cpu); + if (*pte_chainp) { + ret = *pte_chainp; + *pte_chainp = NULL; + put_cpu(); + } else { + put_cpu(); + ret = kmem_cache_alloc(pte_chain_cache, gfp_flags); + } + return ret; +} diff -prauN linux-2.5.72/include/linux/sched.h wli-2.5.72-numaq-15/include/linux/sched.h --- linux-2.5.72/include/linux/sched.h 2003-06-16 21:19:40.000000000 -0700 +++ wli-2.5.72-numaq-15/include/linux/sched.h 2003-06-19 20:31:44.000000000 -0700 @@ -196,6 +196,7 @@ struct mm_struct { unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; unsigned long rss, total_vm, locked_vm; + unsigned long shared, text, lib, data, dirty, stack; unsigned long def_flags; unsigned long cpu_vm_mask; unsigned long swap_address; diff -prauN linux-2.5.72/include/linux/swap.h wli-2.5.72-numaq-15/include/linux/swap.h --- linux-2.5.72/include/linux/swap.h 2003-06-16 21:19:39.000000000 -0700 +++ wli-2.5.72-numaq-15/include/linux/swap.h 2003-06-20 05:04:06.000000000 -0700 @@ -163,6 +163,7 @@ extern unsigned int nr_free_pages(void); extern unsigned int nr_free_pages_pgdat(pg_data_t *pgdat); extern unsigned int nr_free_buffer_pages(void); extern unsigned int nr_free_pagecache_pages(void); +unsigned long nr_deferred_pages(void); /* linux/mm/swap.c */ extern void FASTCALL(lru_cache_add(struct page *)); @@ -186,6 +187,8 @@ struct pte_chain *FASTCALL(page_add_rmap void FASTCALL(page_remove_rmap(struct page *, pte_t *)); int FASTCALL(try_to_unmap(struct page *)); +int page_convert_anon(struct page *); + /* linux/mm/shmem.c */ extern int shmem_unuse(swp_entry_t entry, struct page *page); #else diff -prauN linux-2.5.72/ipc/shm.c wli-2.5.72-numaq-15/ipc/shm.c --- linux-2.5.72/ipc/shm.c 2003-06-16 21:20:01.000000000 -0700 +++ wli-2.5.72-numaq-15/ipc/shm.c 2003-06-18 21:42:09.000000000 -0700 @@ -380,9 +380,9 @@ static void shm_get_stat(unsigned long * if (is_file_hugepages(shp->shm_file)) { struct address_space *mapping = inode->i_mapping; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); *rss += (HPAGE_SIZE/PAGE_SIZE)*mapping->nrpages; - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); } else { struct shmem_inode_info *info = SHMEM_I(inode); spin_lock(&info->lock); diff -prauN linux-2.5.72/kernel/fork.c wli-2.5.72-numaq-15/kernel/fork.c --- linux-2.5.72/kernel/fork.c 2003-06-16 21:19:40.000000000 -0700 +++ wli-2.5.72-numaq-15/kernel/fork.c 2003-06-19 20:39:16.000000000 -0700 @@ -377,6 +377,7 @@ static struct mm_struct * mm_init(struct mm->ioctx_list_lock = RW_LOCK_UNLOCKED; mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); mm->free_area_cache = TASK_UNMAPPED_BASE; + mm->shared = mm->text = mm->lib = mm->data = mm->dirty = mm->stack = 0; if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; diff -prauN linux-2.5.72/kernel/pid.c wli-2.5.72-numaq-15/kernel/pid.c --- linux-2.5.72/kernel/pid.c 2003-06-16 21:20:05.000000000 -0700 +++ wli-2.5.72-numaq-15/kernel/pid.c 2003-06-18 19:16:42.000000000 -0700 @@ -172,13 +172,22 @@ int attach_pid(task_t *task, enum pid_ty if (pid) atomic_inc(&pid->count); else { + struct list_head *elem, *bucket; + pid = &task->pids[type].pid; pid->nr = nr; atomic_set(&pid->count, 1); INIT_LIST_HEAD(&pid->task_list); pid->task = task; get_task_struct(task); - list_add(&pid->hash_chain, 
&pid_hash[type][pid_hashfn(nr)]); + bucket = &pid_hash[type][pid_hashfn(nr)]; + __list_for_each(elem, bucket) { + struct pid *walk; + walk = list_entry(elem, struct pid, hash_chain); + if (walk->nr > nr) + break; + } + list_add_tail(&pid->hash_chain, elem); } list_add_tail(&task->pids[type].pid_chain, &pid->task_list); task->pids[type].pidptr = pid; @@ -221,6 +230,42 @@ void detach_pid(task_t *task, enum pid_t free_pidmap(nr); } +/** + * find_next_pid - return the next allocated pid. + * @pid: starting point for the search. + * + * Returns the pid that follows @pid in hash-table order: buckets are + * scanned in index order, and the pids within each bucket are kept + * sorted in ascending order by attach_pid(). Returns a negative value + * once the whole table has been scanned. The search works even if + * @pid itself is no longer valid. + */ +int find_next_pid(int pid) +{ + struct list_head *elem, *bucket; + + if (!pid) { + bucket = &pid_hash[PIDTYPE_PID][0]; + } else { + bucket = &pid_hash[PIDTYPE_PID][pid_hashfn(pid)]; + } + read_lock(&tasklist_lock); +next_chain: + __list_for_each(elem, bucket) { + struct pid *walk; + walk = list_entry(elem, struct pid, hash_chain); + if (walk->nr > pid) { + pid = walk->nr; + read_unlock(&tasklist_lock); + return pid; + } + } + pid = 0; + bucket++; + if (bucket < &pid_hash[PIDTYPE_PID][1<mmap_sem * ->i_shared_sem (various places) * + * ->lock_page + * ->i_shared_sem (page_convert_anon) + * * ->inode_lock * ->sb_lock (fs/fs-writeback.c) * ->mapping->page_lock (__sync_single_inode) @@ -96,9 +99,9 @@ void remove_from_page_cache(struct page if (unlikely(!PageLocked(page))) PAGE_BUG(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); __remove_from_page_cache(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); } static inline int sync_page(struct page *page) @@ -130,9 +133,9 @@ static int __filemap_fdatawrite(struct a if (mapping->backing_dev_info->memory_backed) return 0; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); list_splice_init(&mapping->dirty_pages, &mapping->io_pages); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); ret = do_writepages(mapping, &wbc); return ret; } @@ -163,7 +166,7 @@ int filemap_fdatawait(struct address_spa restart: progress = 0; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); while (!list_empty(&mapping->locked_pages)) { struct page *page; @@ -177,7 +180,7 @@ restart: if (!PageWriteback(page)) { if (++progress > 32) { if (need_resched()) { - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); __cond_resched(); goto restart; } @@ -187,16 +190,16 @@ restart: progress = 0; page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); wait_on_page_writeback(page); if (PageError(page)) ret = -EIO; page_cache_release(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); return ret; } @@ -224,7 +227,7 @@ int add_to_page_cache(struct page *page, if (error == 0) { page_cache_get(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); if (!error) { SetPageLocked(page); @@ -232,7 +235,7 @@ int add_to_page_cache(struct page *page, } else { page_cache_release(page); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); radix_tree_preload_end(); } return error; @@ -361,11 +364,11 @@ struct page * find_get_page(struct addre * We scan the hash list read-only. 
Addition to and removal from * the hash-list needs a held write-lock. */ - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); page = radix_tree_lookup(&mapping->page_tree, offset); if (page) page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return page; } @@ -376,11 +379,11 @@ struct page *find_trylock_page(struct ad { struct page *page; - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); page = radix_tree_lookup(&mapping->page_tree, offset); if (page && TestSetPageLocked(page)) page = NULL; - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return page; } @@ -400,15 +403,15 @@ struct page *find_lock_page(struct addre { struct page *page; - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); repeat: page = radix_tree_lookup(&mapping->page_tree, offset); if (page) { page_cache_get(page); if (TestSetPageLocked(page)) { - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); lock_page(page); - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); /* Has the page been truncated while we slept? */ if (page->mapping != mapping || page->index != offset) { @@ -418,7 +421,7 @@ repeat: } } } - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return page; } @@ -488,12 +491,12 @@ unsigned int find_get_pages(struct addre unsigned int i; unsigned int ret; - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); ret = radix_tree_gang_lookup(&mapping->page_tree, (void **)pages, start, nr_pages); for (i = 0; i < ret; i++) page_cache_get(pages[i]); - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return ret; } diff -prauN linux-2.5.72/mm/fremap.c wli-2.5.72-numaq-15/mm/fremap.c --- linux-2.5.72/mm/fremap.c 2003-06-16 21:19:39.000000000 -0700 +++ wli-2.5.72-numaq-15/mm/fremap.c 2003-06-20 05:04:06.000000000 -0700 @@ -28,7 +28,7 @@ static inline int zap_pte(struct mm_stru unsigned long pfn = pte_pfn(pte); flush_cache_page(vma, addr); - pte = ptep_get_and_clear(ptep); + pte = vm_ptep_get_and_clear(vma, ptep, addr); if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); if (!PageReserved(page)) { @@ -43,7 +43,7 @@ static inline int zap_pte(struct mm_stru } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); - pte_clear(ptep); + vm_pte_clear(vma, ptep, addr); return 0; } } @@ -60,18 +60,34 @@ int install_page(struct mm_struct *mm, s pgd_t *pgd; pmd_t *pmd; struct pte_chain *pte_chain; + unsigned long pgidx; pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) goto err; + + /* + * Convert this page to anon for objrmap if it's nonlinear + */ + pgidx = (addr - vma->vm_start) >> PAGE_SHIFT; + pgidx += vma->vm_pgoff; + pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; + if (!PageAnon(page) && (page->index != pgidx)) { + lock_page(page); + err = page_convert_anon(page); + unlock_page(page); + if (err < 0) + goto err_free; + } + pgd = pgd_offset(mm, addr); spin_lock(&mm->page_table_lock); - pmd = pmd_alloc(mm, pgd, addr); + pmd = pmd_alloc_map(mm, pgd, addr); if (!pmd) goto err_unlock; - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map(mm, &pmd, addr); if (!pte) goto err_unlock; @@ -79,18 +95,18 @@ int install_page(struct mm_struct *mm, s mm->rss++; flush_icache_page(vma, page); - set_pte(pte, mk_pte(page, prot)); + vm_set_pte(vma, pte, mk_pte(page, prot), addr); pte_chain = page_add_rmap(page, pte, pte_chain); pte_unmap(pte); + pmd_unmap(pmd); if 
(flush) flush_tlb_page(vma, addr); update_mmu_cache(vma, addr, *pte); - spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); - return 0; + err = 0; err_unlock: spin_unlock(&mm->page_table_lock); +err_free: pte_chain_free(pte_chain); err: return err; diff -prauN linux-2.5.72/mm/memory.c wli-2.5.72-numaq-15/mm/memory.c --- linux-2.5.72/mm/memory.c 2003-06-16 21:20:00.000000000 -0700 +++ wli-2.5.72-numaq-15/mm/memory.c 2003-06-20 05:04:06.000000000 -0700 @@ -103,7 +103,8 @@ static inline void free_one_pmd(struct m static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir) { int j; - pmd_t * pmd; + pmd_t *pmd; + struct page *page; if (pgd_none(*dir)) return; @@ -112,11 +113,13 @@ static inline void free_one_pgd(struct m pgd_clear(dir); return; } - pmd = pmd_offset(dir, 0); + page = pgd_page(*dir); + pmd = pmd_offset_map(dir, 0); pgd_clear(dir); for (j = 0; j < PTRS_PER_PMD ; j++) free_one_pmd(tlb, pmd+j); - pmd_free_tlb(tlb, pmd); + pmd_unmap(pmd); + pmd_free_tlb(tlb, page); } /* @@ -136,30 +139,40 @@ void clear_page_tables(struct mmu_gather } while (--nr); } -pte_t * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +/* + * error return happens with pmd unmapped + */ +pte_t *pte_alloc_map(struct mm_struct *mm, pmd_t **pmd, unsigned long address) { - if (!pmd_present(*pmd)) { + if (!pmd_present(**pmd)) { + pgd_t *pgd; struct page *new; + pmd_unmap(*pmd); spin_unlock(&mm->page_table_lock); new = pte_alloc_one(mm, address); spin_lock(&mm->page_table_lock); - if (!new) + if (!new) { + *pmd = NULL; return NULL; + } + + pgd = pgd_offset(mm, address); + *pmd = pmd_offset_map(pgd, address); /* * Because we dropped the lock, we should re-check the * entry, as somebody else could have populated it.. */ - if (pmd_present(*pmd)) { + if (pmd_present(**pmd)) { pte_free(new); goto out; } pgtable_add_rmap(new, mm, address); - pmd_populate(mm, pmd, new); + pmd_populate(mm, *pmd, new); } out: - return pte_offset_map(pmd, address); + return pte_offset_map(*pmd, address); } pte_t * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address) @@ -244,10 +257,10 @@ skip_copy_pmd_range: address = (address continue; } - src_pmd = pmd_offset(src_pgd, address); - dst_pmd = pmd_alloc(dst, dst_pgd, address); + dst_pmd = pmd_alloc_map(dst, dst_pgd, address); if (!dst_pmd) goto nomem; + src_pmd = pmd_offset_map_nested(src_pgd, address); do { pte_t * src_pte, * dst_pte; @@ -261,15 +274,20 @@ skip_copy_pmd_range: address = (address pmd_clear(src_pmd); skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; - if (address >= end) + if (address >= end) { + pmd_unmap(dst_pmd); + pmd_unmap_nested(src_pmd); goto out; + } goto cont_copy_pmd_range; } - dst_pte = pte_alloc_map(dst, dst_pmd, address); + pmd_unmap_nested(src_pmd); + dst_pte = pte_alloc_map(dst, &dst_pmd, address); if (!dst_pte) goto nomem; spin_lock(&src->page_table_lock); + src_pmd = pmd_offset_map_nested(src_pgd, address); src_pte = pte_offset_map_nested(src_pmd, address); do { pte_t pte = *src_pte; @@ -284,7 +302,7 @@ skip_copy_pte_range: if (!pte_present(pte)) { if (!pte_file(pte)) swap_duplicate(pte_to_swp_entry(pte)); - set_pte(dst_pte, pte); + vm_set_pte(vma, dst_pte, pte, address); goto cont_copy_pte_range_noset; } pfn = pte_pfn(pte); @@ -298,7 +316,7 @@ skip_copy_pte_range: page = pfn_to_page(pfn); if (!page || PageReserved(page)) { - set_pte(dst_pte, pte); + vm_set_pte(vma, dst_pte, pte, address); goto cont_copy_pte_range_noset; } @@ -307,7 +325,7 @@ skip_copy_pte_range: * in the parent and the 
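A note on the calling convention introduced by the reworked pte_alloc_map() above: it may drop mm->page_table_lock to allocate, and with HIGHPMD the kmapped pmd can move when the lock is retaken, so callers hand in a pmd_t ** and must reload through it. A sketch under those assumptions (error paths simplified, not a verbatim call site):

	spin_lock(&mm->page_table_lock);
	pmd = pmd_alloc_map(mm, pgd, address);		/* returns a kmapped pmd */
	if (pmd) {
		pte = pte_alloc_map(mm, &pmd, address);	/* may unmap/remap *pmd */
		if (!pte)	/* failure: pmd comes back already unmapped */
			goto backout;
		/* ... operate on the pte ... */
		pte_unmap(pte);
		pmd_unmap(pmd);	/* unmap whatever pmd points at now */
	}
	spin_unlock(&mm->page_table_lock);
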
child */ if (cow) { - ptep_set_wrprotect(src_pte); + vm_ptep_set_wrprotect(src, src_pte); pte = *src_pte; } @@ -321,7 +339,7 @@ skip_copy_pte_range: get_page(page); dst->rss++; - set_pte(dst_pte, pte); + vm_set_pte(vma, dst_pte, pte, address); pte_chain = page_add_rmap(page, dst_pte, pte_chain); if (pte_chain) @@ -336,6 +354,8 @@ skip_copy_pte_range: */ pte_unmap_nested(src_pte); pte_unmap(dst_pte); + pmd_unmap_nested(src_pmd); + pmd_unmap(dst_pmd); spin_unlock(&src->page_table_lock); spin_unlock(&dst->page_table_lock); pte_chain = pte_chain_alloc(GFP_KERNEL); @@ -343,12 +363,16 @@ skip_copy_pte_range: if (!pte_chain) goto nomem; spin_lock(&src->page_table_lock); + dst_pmd = pmd_offset_map(dst_pgd, address); + src_pmd = pmd_offset_map_nested(src_pgd, address); dst_pte = pte_offset_map(dst_pmd, address); src_pte = pte_offset_map_nested(src_pmd, address); cont_copy_pte_range_noset: address += PAGE_SIZE; if (address >= end) { + pmd_unmap(dst_pmd); + pmd_unmap_nested(src_pmd); pte_unmap_nested(src_pte); pte_unmap(dst_pte); goto out_unlock; @@ -364,6 +388,8 @@ cont_copy_pmd_range: src_pmd++; dst_pmd++; } while ((unsigned long)src_pmd & PMD_TABLE_MASK); + pmd_unmap_nested(src_pmd-1); + pmd_unmap(dst_pmd-1); } out_unlock: spin_unlock(&src->page_table_lock); @@ -376,7 +402,7 @@ nomem: } static void -zap_pte_range(struct mmu_gather *tlb, pmd_t * pmd, +zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, unsigned long size) { unsigned long offset; @@ -401,7 +427,7 @@ zap_pte_range(struct mmu_gather *tlb, pm if (pte_present(pte)) { unsigned long pfn = pte_pfn(pte); - pte = ptep_get_and_clear(ptep); + pte = vm_ptep_get_and_clear(vma, ptep, address + offset); tlb_remove_tlb_entry(tlb, ptep, address+offset); if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); @@ -419,14 +445,14 @@ zap_pte_range(struct mmu_gather *tlb, pm } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); - pte_clear(ptep); + vm_pte_clear(vma, ptep, address); } } pte_unmap(ptep-1); } static void -zap_pmd_range(struct mmu_gather *tlb, pgd_t * dir, +zap_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pgd_t *dir, unsigned long address, unsigned long size) { pmd_t * pmd; @@ -439,15 +465,16 @@ zap_pmd_range(struct mmu_gather *tlb, pg pgd_clear(dir); return; } - pmd = pmd_offset(dir, address); + pmd = pmd_offset_map(dir, address); end = address + size; if (end > ((address + PGDIR_SIZE) & PGDIR_MASK)) end = ((address + PGDIR_SIZE) & PGDIR_MASK); do { - zap_pte_range(tlb, pmd, address, end - address); + zap_pte_range(tlb, vma, pmd, address, end - address); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address < end); + pmd_unmap(pmd - 1); } void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, @@ -465,7 +492,7 @@ void unmap_page_range(struct mmu_gather dir = pgd_offset(vma->vm_mm, address); tlb_start_vma(tlb, vma); do { - zap_pmd_range(tlb, dir, address, end - address); + zap_pmd_range(tlb, vma, dir, address, end - address); address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); @@ -629,20 +656,24 @@ follow_page(struct mm_struct *mm, unsign if (pgd_none(*pgd) || pgd_bad(*pgd)) goto out; - pmd = pmd_offset(pgd, address); + pmd = pmd_offset_map(pgd, address); if (pmd_none(*pmd)) - goto out; - if (pmd_huge(*pmd)) - return follow_huge_pmd(mm, address, pmd, write); + goto out_unmap; + if (pmd_huge(*pmd)) { + struct page *page = follow_huge_pmd(mm, address, pmd, write); + pmd_unmap(pmd); + 
return page; + } if (pmd_bad(*pmd)) - goto out; + goto out_unmap; ptep = pte_offset_map(pmd, address); if (!ptep) - goto out; + goto out_unmap; pte = *ptep; pte_unmap(ptep); + pmd_unmap(pmd); if (pte_present(pte)) { if (!write || (pte_write(pte) && pte_dirty(pte))) { pfn = pte_pfn(pte); @@ -653,6 +684,9 @@ follow_page(struct mm_struct *mm, unsign out: return NULL; +out_unmap: + pmd_unmap(pmd); + goto out; } /* @@ -711,7 +745,7 @@ int get_user_pages(struct task_struct *t pgd = pgd_offset_k(pg); if (!pgd) return i ? : -EFAULT; - pmd = pmd_offset(pgd, pg); + pmd = pmd_offset_kernel(pgd, pg); if (!pmd) return i ? : -EFAULT; pte = pte_offset_kernel(pmd, pg); @@ -785,8 +819,8 @@ out: return i; } -static void zeromap_pte_range(pte_t * pte, unsigned long address, - unsigned long size, pgprot_t prot) +static void zeromap_pte_range(struct vm_area_struct *vma, pte_t *pte, + unsigned long address, unsigned long size, pgprot_t prot) { unsigned long end; @@ -797,14 +831,14 @@ static void zeromap_pte_range(pte_t * pt do { pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot)); BUG_ON(!pte_none(*pte)); - set_pte(pte, zero_pte); + vm_set_pte(vma, pte, zero_pte, address); address += PAGE_SIZE; pte++; } while (address && (address < end)); } -static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, - unsigned long size, pgprot_t prot) +static inline int zeromap_pmd_range(struct vm_area_struct *vma, pmd_t **pmd, + unsigned long address, unsigned long size, pgprot_t prot) { unsigned long end; @@ -813,13 +847,13 @@ static inline int zeromap_pmd_range(stru if (end > PGDIR_SIZE) end = PGDIR_SIZE; do { - pte_t * pte = pte_alloc_map(mm, pmd, address); + pte_t *pte = pte_alloc_map(vma->vm_mm, pmd, address); if (!pte) return -ENOMEM; - zeromap_pte_range(pte, address, end - address, prot); + zeromap_pte_range(vma, pte, address, end - address, prot); pte_unmap(pte); address = (address + PMD_SIZE) & PMD_MASK; - pmd++; + (*pmd)++; } while (address && (address < end)); return 0; } @@ -839,13 +873,14 @@ int zeromap_page_range(struct vm_area_st spin_lock(&mm->page_table_lock); do { - pmd_t *pmd = pmd_alloc(mm, dir, address); + pmd_t *pmd = pmd_alloc_map(mm, dir, address); error = -ENOMEM; if (!pmd) break; - error = zeromap_pmd_range(mm, pmd, address, end - address, prot); + error = zeromap_pmd_range(vma, &pmd, address, end - address, prot); if (error) break; + pmd_unmap(pmd - 1); address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); @@ -859,8 +894,9 @@ int zeromap_page_range(struct vm_area_st * mappings are removed. 
any references to nonexistent pages results * in null mappings (currently treated as "copy-on-access") */ -static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size, - unsigned long phys_addr, pgprot_t prot) +static inline void remap_pte_range(struct vm_area_struct *vma, pte_t *pte, + unsigned long address, unsigned long size, + unsigned long phys_addr, pgprot_t prot) { unsigned long end; unsigned long pfn; @@ -873,15 +909,16 @@ static inline void remap_pte_range(pte_t do { BUG_ON(!pte_none(*pte)); if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) - set_pte(pte, pfn_pte(pfn, prot)); + vm_set_pte(vma, pte, pfn_pte(pfn, prot), address); address += PAGE_SIZE; pfn++; pte++; } while (address && (address < end)); } -static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size, - unsigned long phys_addr, pgprot_t prot) +static inline int remap_pmd_range(struct vm_area_struct *vma, pmd_t **pmd, + unsigned long address, unsigned long size, + unsigned long phys_addr, pgprot_t prot) { unsigned long base, end; @@ -892,13 +929,13 @@ static inline int remap_pmd_range(struct end = PGDIR_SIZE; phys_addr -= address; do { - pte_t * pte = pte_alloc_map(mm, pmd, base + address); + pte_t *pte = pte_alloc_map(vma->vm_mm, pmd, base + address); if (!pte) return -ENOMEM; - remap_pte_range(pte, base + address, end - address, address + phys_addr, prot); + remap_pte_range(vma, pte, base + address, end - address, address + phys_addr, prot); pte_unmap(pte); address = (address + PMD_SIZE) & PMD_MASK; - pmd++; + (*pmd)++; } while (address && (address < end)); return 0; } @@ -920,13 +957,14 @@ int remap_page_range(struct vm_area_stru spin_lock(&mm->page_table_lock); do { - pmd_t *pmd = pmd_alloc(mm, dir, from); + pmd_t *pmd = pmd_alloc_map(mm, dir, from); error = -ENOMEM; if (!pmd) break; - error = remap_pmd_range(mm, pmd, from, end - from, phys_addr + from, prot); + error = remap_pmd_range(vma, &pmd, from, end - from, phys_addr + from, prot); if (error) break; + pmd_unmap(pmd - 1); from = (from + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (from && (from < end)); @@ -943,9 +981,10 @@ int remap_page_range(struct vm_area_stru * * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock */ -static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry) +static inline void establish_pte(struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pte_t entry) { - set_pte(page_table, entry); + vm_set_pte(vma, page_table, entry, address); flush_tlb_page(vma, address); update_mmu_cache(vma, address, entry); } @@ -953,8 +992,9 @@ static inline void establish_pte(struct /* * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock */ -static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, - pte_t *page_table) +static inline void break_cow(struct vm_area_struct *vma, + struct page *new_page, unsigned long address, + pte_t *page_table) { invalidate_vcache(address, vma->vm_mm, new_page); flush_cache_page(vma, address); @@ -996,6 +1036,7 @@ static int do_wp_page(struct mm_struct * * data, but for the moment just pretend this is OOM. 
*/ pte_unmap(page_table); + pmd_unmap(pmd); printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", address); goto oom; @@ -1010,11 +1051,13 @@ static int do_wp_page(struct mm_struct * establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); pte_unmap(page_table); + pmd_unmap(pmd); ret = VM_FAULT_MINOR; goto out; } } pte_unmap(page_table); + pmd_unmap(pmd); /* * Ok, we need to copy. Oh, well.. @@ -1034,12 +1077,14 @@ static int do_wp_page(struct mm_struct * * Re-check the pte - we dropped the lock */ spin_lock(&mm->page_table_lock); + pmd = pmd_offset_map(pgd_offset(mm, address), address); page_table = pte_offset_map(pmd, address); if (pte_same(*page_table, pte)) { if (PageReserved(old_page)) ++mm->rss; page_remove_rmap(old_page, page_table); break_cow(vma, new_page, address, page_table); + SetPageAnon(new_page); pte_chain = page_add_rmap(new_page, page_table, pte_chain); lru_cache_add_active(new_page); @@ -1047,6 +1092,7 @@ static int do_wp_page(struct mm_struct * new_page = old_page; } pte_unmap(page_table); + pmd_unmap(pmd); page_cache_release(new_page); page_cache_release(old_page); ret = VM_FAULT_MINOR; @@ -1180,6 +1226,7 @@ static int do_swap_page(struct mm_struct struct pte_chain *pte_chain = NULL; pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); page = lookup_swap_cache(entry); if (!page) { @@ -1191,12 +1238,14 @@ static int do_swap_page(struct mm_struct * we released the page table lock. */ spin_lock(&mm->page_table_lock); + pmd = pmd_offset_map(pgd_offset(mm, address), address); page_table = pte_offset_map(pmd, address); if (pte_same(*page_table, orig_pte)) ret = VM_FAULT_OOM; else ret = VM_FAULT_MINOR; pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); goto out; } @@ -1219,9 +1268,11 @@ static int do_swap_page(struct mm_struct * released the page table lock. */ spin_lock(&mm->page_table_lock); + pmd = pmd_offset_map(pgd_offset(mm, address), address); page_table = pte_offset_map(pmd, address); if (!pte_same(*page_table, orig_pte)) { pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); unlock_page(page); page_cache_release(page); @@ -1242,11 +1293,13 @@ static int do_swap_page(struct mm_struct unlock_page(page); flush_icache_page(vma, page); - set_pte(page_table, pte); + SetPageAnon(page); + vm_set_pte(vma, page_table, pte, address); pte_chain = page_add_rmap(page, page_table, pte_chain); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); + pmd_unmap(pmd); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); out: @@ -1272,11 +1325,13 @@ do_anonymous_page(struct mm_struct *mm, pte_chain = pte_chain_alloc(GFP_ATOMIC); if (!pte_chain) { pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) goto no_mem; spin_lock(&mm->page_table_lock); + pmd = pmd_offset_map(pgd_offset(mm, addr), addr); page_table = pte_offset_map(pmd, addr); } @@ -1287,6 +1342,7 @@ do_anonymous_page(struct mm_struct *mm, if (write_access) { /* Allocate our own private page. 
*/ pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); page = alloc_page(GFP_HIGHUSER); @@ -1295,9 +1351,11 @@ do_anonymous_page(struct mm_struct *mm, clear_user_highpage(page, addr); spin_lock(&mm->page_table_lock); + pmd = pmd_offset_map(pgd_offset(mm, addr), addr); page_table = pte_offset_map(pmd, addr); if (!pte_none(*page_table)) { + pmd_unmap(pmd); pte_unmap(page_table); page_cache_release(page); spin_unlock(&mm->page_table_lock); @@ -1308,11 +1366,13 @@ do_anonymous_page(struct mm_struct *mm, entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); lru_cache_add_active(page); mark_page_accessed(page); + SetPageAnon(page); } - set_pte(page_table, entry); + vm_set_pte(vma, page_table, entry, addr); /* ignores ZERO_PAGE */ pte_chain = page_add_rmap(page, page_table, pte_chain); + pmd_unmap(pmd); pte_unmap(page_table); /* No need to invalidate - it was non-present before */ @@ -1353,6 +1413,7 @@ do_no_page(struct mm_struct *mm, struct return do_anonymous_page(mm, vma, page_table, pmd, write_access, address); pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0); @@ -1367,6 +1428,10 @@ do_no_page(struct mm_struct *mm, struct if (!pte_chain) goto oom; + /* See if nopage returned an anon page */ + if (!new_page->mapping || PageSwapCache(new_page)) + SetPageAnon(new_page); + /* * Should we do an early C-O-W break? */ @@ -1379,10 +1444,12 @@ do_no_page(struct mm_struct *mm, struct copy_user_highpage(page, new_page, address); page_cache_release(new_page); lru_cache_add_active(page); + SetPageAnon(page); new_page = page; } spin_lock(&mm->page_table_lock); + pmd = pmd_offset_map(pgd_offset(mm, address), address); page_table = pte_offset_map(pmd, address); /* @@ -1402,12 +1469,14 @@ do_no_page(struct mm_struct *mm, struct entry = mk_pte(new_page, vma->vm_page_prot); if (write_access) entry = pte_mkwrite(pte_mkdirty(entry)); - set_pte(page_table, entry); + vm_set_pte(vma, page_table, entry, address); pte_chain = page_add_rmap(new_page, page_table, pte_chain); pte_unmap(page_table); + pmd_unmap(pmd); } else { /* One of our sibling threads was faster, back out. */ pte_unmap(page_table); + pmd_unmap(pmd); page_cache_release(new_page); spin_unlock(&mm->page_table_lock); ret = VM_FAULT_MINOR; @@ -1444,13 +1513,14 @@ static int do_file_page(struct mm_struct */ if (!vma->vm_ops || !vma->vm_ops->populate || (write_access && !(vma->vm_flags & VM_SHARED))) { - pte_clear(pte); + vm_pte_clear(vma, pte, address); return do_no_page(mm, vma, address, write_access, pte, pmd); } pgoff = pte_to_pgoff(*pte); pte_unmap(pte); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0); @@ -1537,10 +1607,10 @@ int handle_mm_fault(struct mm_struct *mm * and the SMP-safe atomic PTE updates. 
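Each fault handler that sleeps now follows the same revalidation pattern: re-derive the pmd kmap from the pgd after retaking the lock, then pte_same() before committing. Schematically (mirroring do_swap_page() above; orig_pte is the value sampled before the lock was dropped):

	spin_lock(&mm->page_table_lock);
	pmd = pmd_offset_map(pgd_offset(mm, address), address);
	page_table = pte_offset_map(pmd, address);
	if (!pte_same(*page_table, orig_pte)) {
		/* raced: someone else serviced the fault */
		pte_unmap(page_table);
		pmd_unmap(pmd);
		spin_unlock(&mm->page_table_lock);
		return VM_FAULT_MINOR;
	}
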
*/ spin_lock(&mm->page_table_lock); - pmd = pmd_alloc(mm, pgd, address); + pmd = pmd_alloc_map(mm, pgd, address); if (pmd) { - pte_t * pte = pte_alloc_map(mm, pmd, address); + pte_t *pte = pte_alloc_map(mm, &pmd, address); if (pte) return handle_pte_fault(mm, vma, address, write_access, pte, pmd); } @@ -1577,7 +1647,30 @@ pmd_t *__pmd_alloc(struct mm_struct *mm, } pgd_populate(mm, pgd, new); out: - return pmd_offset(pgd, address); + return pmd_offset_map(pgd, address); +} + +pmd_t *__pmd_alloc_kernel(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + pmd_t *new; + + spin_unlock(&mm->page_table_lock); + new = pmd_alloc_one_kernel(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (pgd_present(*pgd)) { + pmd_free(new); + goto out; + } + pgd_populate(mm, pgd, new); +out: + return pmd_offset_kernel(pgd, address); } int make_pages_present(unsigned long addr, unsigned long end) @@ -1600,7 +1693,7 @@ int make_pages_present(unsigned long add /* * Map a vmalloc()-space virtual address to the physical page. */ -struct page * vmalloc_to_page(void * vmalloc_addr) +struct page *vmalloc_to_page(void *vmalloc_addr) { unsigned long addr = (unsigned long) vmalloc_addr; struct page *page = NULL; @@ -1609,7 +1702,7 @@ struct page * vmalloc_to_page(void * vma pte_t *ptep, pte; if (!pgd_none(*pgd)) { - pmd = pmd_offset(pgd, addr); + pmd = pmd_offset_map(pgd, addr); if (!pmd_none(*pmd)) { preempt_disable(); ptep = pte_offset_map(pmd, addr); @@ -1619,6 +1712,7 @@ struct page * vmalloc_to_page(void * vma pte_unmap(ptep); preempt_enable(); } + pmd_unmap(pmd); } return page; } diff -prauN linux-2.5.72/mm/mmap.c wli-2.5.72-numaq-15/mm/mmap.c --- linux-2.5.72/mm/mmap.c 2003-06-16 21:20:06.000000000 -0700 +++ wli-2.5.72-numaq-15/mm/mmap.c 2003-06-20 05:04:06.000000000 -0700 @@ -377,6 +377,28 @@ static inline int is_mergeable_vma(struc return 1; } +static void move_vma_start(struct vm_area_struct *vma, unsigned long addr) +{ + spinlock_t *lock = &vma->vm_mm->page_table_lock; + struct inode *inode = NULL; + + if (vma->vm_file) { + inode = vma->vm_file->f_dentry->d_inode; + down(&inode->i_mapping->i_shared_sem); + } + spin_lock(lock); + if (inode) + __remove_shared_vm_struct(vma, inode); + /* If no vm_file, perhaps we should always keep vm_pgoff at 0?? */ + vma->vm_pgoff += (long)(addr - vma->vm_start) >> PAGE_SHIFT; + vma->vm_start = addr; + if (inode) { + __vma_link_file(vma); + up(&inode->i_mapping->i_shared_sem); + } + spin_unlock(lock); +} + /* * Return true if we can merge this (vm_flags,file,vm_pgoff,size) * in front of (at a lower virtual address and file offset than) the vma. @@ -429,8 +451,6 @@ static int vma_merge(struct mm_struct *m unsigned long end, unsigned long vm_flags, struct file *file, unsigned long pgoff) { - spinlock_t * lock = &mm->page_table_lock; - /* * We later require that vma->vm_flags == vm_flags, so this tests * vma->vm_flags & VM_SPECIAL, too. @@ -450,6 +470,7 @@ static int vma_merge(struct mm_struct *m is_mergeable_vma(prev, file, vm_flags) && can_vma_merge_after(prev, vm_flags, file, pgoff)) { struct vm_area_struct *next; + spinlock_t *lock = &mm->page_table_lock; struct inode *inode = file ? 
file->f_dentry->d_inode : NULL; int need_up = 0; @@ -497,10 +518,7 @@ static int vma_merge(struct mm_struct *m pgoff, (end - addr) >> PAGE_SHIFT)) return 0; if (end == prev->vm_start) { - spin_lock(lock); - prev->vm_start = addr; - prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT; - spin_unlock(lock); + move_vma_start(prev, addr); return 1; } } @@ -1221,8 +1239,7 @@ int split_vma(struct mm_struct * mm, str if (new_below) { new->vm_end = addr; - vma->vm_start = addr; - vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT); + move_vma_start(vma, addr); } else { vma->vm_end = addr; new->vm_start = addr; diff -prauN linux-2.5.72/mm/mprotect.c wli-2.5.72-numaq-15/mm/mprotect.c --- linux-2.5.72/mm/mprotect.c 2003-06-16 21:19:59.000000000 -0700 +++ wli-2.5.72-numaq-15/mm/mprotect.c 2003-06-18 19:17:06.000000000 -0700 @@ -24,11 +24,11 @@ #include <asm/tlbflush.h> static inline void -change_pte_range(pmd_t *pmd, unsigned long address, - unsigned long size, pgprot_t newprot) +change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long address, unsigned long size, pgprot_t newprot) { pte_t * pte; - unsigned long end; + unsigned long start, end; if (pmd_none(*pmd)) return; @@ -38,6 +38,7 @@ change_pte_range(pmd_t *pmd, unsigned lo return; } pte = pte_offset_map(pmd, address); + start = address & PMD_MASK; address &= ~PMD_MASK; end = address + size; if (end > PMD_SIZE) @@ -50,8 +51,8 @@ change_pte_range(pmd_t *pmd, unsigned lo * bits by wiping the pte and then setting the new pte * into place. */ - entry = ptep_get_and_clear(pte); - set_pte(pte, pte_modify(entry, newprot)); + entry = vm_ptep_get_and_clear(vma, pte, address + start); + vm_set_pte(vma, pte, pte_modify(entry, newprot), start + address); } address += PAGE_SIZE; pte++; @@ -60,11 +61,11 @@ change_pte_range(pmd_t *pmd, unsigned lo } static inline void -change_pmd_range(pgd_t *pgd, unsigned long address, - unsigned long size, pgprot_t newprot) +change_pmd_range(struct vm_area_struct *vma, pgd_t *pgd, + unsigned long address, unsigned long size, pgprot_t newprot) { pmd_t * pmd; - unsigned long end; + unsigned long start, end; if (pgd_none(*pgd)) return; @@ -73,16 +74,18 @@ change_pmd_range(pgd_t *pgd, unsigned lo pgd_clear(pgd); return; } - pmd = pmd_offset(pgd, address); + pmd = pmd_offset_map(pgd, address); + start = address & PGDIR_MASK; address &= ~PGDIR_MASK; end = address + size; if (end > PGDIR_SIZE) end = PGDIR_SIZE; do { - change_pte_range(pmd, address, end - address, newprot); + change_pte_range(vma, pmd, start + address, end - address, newprot); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); + pmd_unmap(pmd - 1); } static void @@ -98,7 +101,7 @@ change_protection(struct vm_area_struct BUG(); spin_lock(&current->mm->page_table_lock); do { - change_pmd_range(dir, start, end - start, newprot); + change_pmd_range(vma, dir, start, end - start, newprot); start = (start + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (start && (start < end)); diff -prauN linux-2.5.72/mm/mremap.c wli-2.5.72-numaq-15/mm/mremap.c --- linux-2.5.72/mm/mremap.c 2003-06-16 21:20:02.000000000 -0700 +++ wli-2.5.72-numaq-15/mm/mremap.c 2003-06-18 19:18:22.000000000 -0700 @@ -37,7 +37,7 @@ static pte_t *get_one_pte_map_nested(str goto end; } - pmd = pmd_offset(pgd, addr); + pmd = pmd_offset_map_nested(pgd, addr); if (pmd_none(*pmd)) goto end; if (pmd_bad(*pmd)) { @@ -52,6 +52,7 @@ static pte_t *get_one_pte_map_nested(str pte = NULL; } end: + pmd_unmap_nested(pmd); return pte; } @@ -60,12 +61,15 @@ static inline int page_table_present(str {
pgd_t *pgd; pmd_t *pmd; + int ret; pgd = pgd_offset(mm, addr); if (pgd_none(*pgd)) return 0; - pmd = pmd_offset(pgd, addr); - return pmd_present(*pmd); + pmd = pmd_offset_map(pgd, addr); + ret = pmd_present(*pmd); + pmd_unmap(pmd); + return ret != 0; } #else #define page_table_present(mm, addr) (1) @@ -76,14 +80,16 @@ static inline pte_t *alloc_one_pte_map(s pmd_t *pmd; pte_t *pte = NULL; - pmd = pmd_alloc(mm, pgd_offset(mm, addr), addr); + pmd = pmd_alloc_map(mm, pgd_offset(mm, addr), addr); if (pmd) - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map(mm, &pmd, addr); + pmd_unmap(pmd); return pte; } static int -copy_one_pte(struct mm_struct *mm, pte_t *src, pte_t *dst, +copy_one_pte(struct vm_area_struct *vma, pte_t *src, pte_t *dst, + unsigned long old_addr, unsigned long new_addr, struct pte_chain **pte_chainp) { int error = 0; @@ -96,13 +102,13 @@ copy_one_pte(struct mm_struct *mm, pte_t if (!pte_none(*src)) { if (page) page_remove_rmap(page, src); - pte = ptep_get_and_clear(src); + pte = vm_ptep_get_and_clear(vma, src, old_addr); if (!dst) { /* No dest? We must put it back. */ dst = src; error++; } - set_pte(dst, pte); + vm_set_pte(vma, dst, pte, new_addr); if (page) *pte_chainp = page_add_rmap(page, dst, *pte_chainp); } @@ -138,7 +144,7 @@ move_one_page(struct vm_area_struct *vma dst = alloc_one_pte_map(mm, new_addr); if (src == NULL) src = get_one_pte_map_nested(mm, old_addr); - error = copy_one_pte(mm, src, dst, &pte_chain); + error = copy_one_pte(vma, src, dst, old_addr, new_addr, &pte_chain); pte_unmap_nested(src); pte_unmap(dst); } diff -prauN linux-2.5.72/mm/msync.c wli-2.5.72-numaq-15/mm/msync.c --- linux-2.5.72/mm/msync.c 2003-06-16 21:19:59.000000000 -0700 +++ wli-2.5.72-numaq-15/mm/msync.c 2003-06-18 19:11:28.000000000 -0700 @@ -82,7 +82,7 @@ static inline int filemap_sync_pmd_range pgd_clear(pgd); return 0; } - pmd = pmd_offset(pgd, address); + pmd = pmd_offset_map(pgd, address); if ((address & PGDIR_MASK) != (end & PGDIR_MASK)) end = (address & PGDIR_MASK) + PGDIR_SIZE; error = 0; @@ -91,6 +91,7 @@ static inline int filemap_sync_pmd_range address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); + pmd_unmap(pmd - 1); return error; } diff -prauN linux-2.5.72/mm/page-writeback.c wli-2.5.72-numaq-15/mm/page-writeback.c --- linux-2.5.72/mm/page-writeback.c 2003-06-16 21:20:07.000000000 -0700 +++ wli-2.5.72-numaq-15/mm/page-writeback.c 2003-06-18 21:27:42.000000000 -0700 @@ -464,12 +464,12 @@ int write_one_page(struct page *page, in if (wait) wait_on_page_writeback(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); list_del(&page->list); if (test_clear_page_dirty(page)) { list_add(&page->list, &mapping->locked_pages); page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); ret = mapping->a_ops->writepage(page, &wbc); if (ret == 0 && wait) { wait_on_page_writeback(page); @@ -479,7 +479,7 @@ int write_one_page(struct page *page, in page_cache_release(page); } else { list_add(&page->list, &mapping->clean_pages); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); unlock_page(page); } return ret; @@ -507,7 +507,7 @@ int __set_page_dirty_nobuffers(struct pa struct address_space *mapping = page->mapping; if (mapping) { - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); if (page->mapping) { /* Race with truncate? 
*/ BUG_ON(page->mapping != mapping); if (!mapping->backing_dev_info->memory_backed) @@ -515,7 +515,7 @@ int __set_page_dirty_nobuffers(struct pa list_del(&page->list); list_add(&page->list, &mapping->dirty_pages); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } } diff -prauN linux-2.5.72/mm/page_alloc.c wli-2.5.72-numaq-15/mm/page_alloc.c --- linux-2.5.72/mm/page_alloc.c 2003-06-16 21:19:40.000000000 -0700 +++ wli-2.5.72-numaq-15/mm/page_alloc.c 2003-06-20 05:04:06.000000000 -0700 @@ -163,7 +163,7 @@ static void destroy_compound_page(struct * -- wli */ -static inline void __free_pages_bulk (struct page *page, struct page *base, +static inline void buddy_free(struct page *page, struct page *base, struct zone *zone, struct free_area *area, unsigned long mask, unsigned int order) { @@ -176,7 +176,6 @@ static inline void __free_pages_bulk (st BUG(); index = page_idx >> (1 + order); - zone->free_pages -= mask; while (mask + (1 << (MAX_ORDER-1))) { struct page *buddy1, *buddy2; @@ -197,11 +196,39 @@ static inline void __free_pages_bulk (st BUG_ON(bad_range(zone, buddy2)); list_del(&buddy1->list); mask <<= 1; + area->globally_free--; area++; index >>= 1; page_idx &= mask; } list_add(&(base + page_idx)->list, &area->free_list); + area->globally_free++; +} + +static inline void __free_pages_bulk(struct page *page, struct page *base, + struct zone *zone, struct free_area *area, unsigned long mask, + unsigned int order) +{ + switch (area->active - area->locally_free) { + case 0: + if (!list_empty(&area->deferred_pages)) { + struct page *defer = list_entry(area->deferred_pages.next, struct page, list); + list_del(&defer->list); + area->locally_free--; + buddy_free(defer, base, zone, area, mask, order); + } + /* fall through */ + case 1: + buddy_free(page, base, zone, area, mask, order); + break; + default: + list_add(&page->list, &area->deferred_pages); + area->locally_free++; + break; + } + if (area->active) + area->active--; + zone->free_pages += 1 << order; } static inline void free_pages_check(const char *function, struct page *page) @@ -219,6 +246,8 @@ static inline void free_pages_check(cons bad_page(function, page); if (PageDirty(page)) ClearPageDirty(page); + if (PageAnon(page)) + ClearPageAnon(page); } /* @@ -232,40 +261,78 @@ static inline void free_pages_check(cons * And clear the zone's pages_scanned counter, to hold off the "all pages are * pinned" detection logic. 
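The deferral decision in __free_pages_bulk() above keys off area->active - area->locally_free, i.e. outstanding allocations versus pages parked on the deferred list. An illustrative trace (values assumed, not measured):

	/*
	 * diff = active - locally_free
	 * diff >= 2: park the page on deferred_pages (cheap, no coalescing)
	 * diff == 1: buddy_free() this page directly
	 * diff == 0: drain one deferred page, then buddy_free() this one,
	 *            so the deferred backlog shrinks as allocation slows
	 */
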
*/ -static int -free_pages_bulk(struct zone *zone, int count, - struct list_head *list, unsigned int order) +void free_pages_bulk(struct zone *zone, struct page *page, unsigned int order) { - unsigned long mask, flags; + unsigned long mask, flags, count; struct free_area *area; - struct page *base, *page = NULL; - int ret = 0; + struct page *base, *save; + LIST_HEAD(tmp); + + count = page->private; mask = (~0UL) << order; base = zone->zone_mem_map; area = zone->free_area + order; spin_lock_irqsave(&zone->lock, flags); zone->all_unreclaimable = 0; zone->pages_scanned = 0; - while (!list_empty(list) && count--) { - page = list_entry(list->prev, struct page, list); - /* have to delete it as __free_pages_bulk list manipulates */ - list_del(&page->list); - __free_pages_bulk(page, base, zone, area, mask, order); - ret++; + + if (order || area->active - area->locally_free <= 2*count) { + list_splice(&page->list, &tmp); + list_add(&page->list, &tmp); + page->private = 0; + } + + if (order) { + list_for_each_entry_safe(page, save, &tmp, list) { + list_del(&page->list); + __free_pages_bulk(page, base, zone, area, mask, order); + } + } else if (area->active - area->locally_free <= 2*count) { + /* + * This is a somewhat ad hoc approach to dealing with + * the interaction of gang allocation and the deferred + * coalescing heuristics. + */ + if (area->active - area->locally_free < count) { + int local = 0; + + while (local < count && area->locally_free) { + struct page *follow, *head = + list_entry(area->deferred_pages.next, struct page, lru); + list_del(&head->lru); + list_for_each_entry_safe(follow, save, &head->list, list) { + list_del(&follow->list); + buddy_free(follow, base, zone, area, mask, 0); + } + local += head->private; + area->locally_free -= head->private; + head->private = 0; + buddy_free(head, base, zone, area, mask, 0); + } + } + list_for_each_entry_safe(page, save, &tmp, list) { + list_del(&page->list); + buddy_free(page, base, zone, area, mask, order); + } + } else { + area->locally_free += count; + list_add(&page->lru, &area->deferred_pages); + } + if (!order) { + zone->free_pages += count; + area->active -= min(area->active, count); } spin_unlock_irqrestore(&zone->lock, flags); - return ret; } void __free_pages_ok(struct page *page, unsigned int order) { - LIST_HEAD(list); - mod_page_state(pgfree, 1 << order); free_pages_check(__FUNCTION__, page); - list_add(&page->list, &list); - free_pages_bulk(page_zone(page), 1, &list, order); + page->private = 1; + INIT_LIST_HEAD(&page->list); + free_pages_bulk(page_zone(page), page, order); } #define MARK_USED(index, order, area) \ @@ -278,10 +345,10 @@ expand(struct zone *zone, struct page *p unsigned long size = 1 << high; while (high > low) { - BUG_ON(bad_range(zone, page)); area--; high--; size >>= 1; + area->globally_free++; list_add(&page->list, &area->free_list); MARK_USED(index, high, area); index += size; @@ -332,7 +399,7 @@ static void prep_new_page(struct page *p * Do the hard work of removing an element from the buddy allocator. * Call me with the zone->lock already held. 
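Order-0 frees and the per-cpu pool now traffic in batches: a batch head's ->private holds the batch size, the member pages hang off head->list, and heads queue via head->lru. A sketch of consuming one whole batch, assuming a pcp being drained as drain_local_pages() does later in this patch:

	struct page *head;

	head = list_entry(pcp->list.next, struct page, lru);
	list_del(&head->lru);			/* unhook the whole batch */
	pcp->count -= head->private;		/* batch size lives in head->private */
	free_pages_bulk(zone, head, 0);		/* frees head plus head->list */
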
*/ -static struct page *__rmqueue(struct zone *zone, unsigned int order) +static struct page *buddy_alloc(struct zone *zone, unsigned int order) { struct free_area * area; unsigned int current_order; @@ -346,16 +413,144 @@ static struct page *__rmqueue(struct zon page = list_entry(area->free_list.next, struct page, list); list_del(&page->list); + area->globally_free--; index = page - zone->zone_mem_map; if (current_order != MAX_ORDER-1) MARK_USED(index, current_order, area); - zone->free_pages -= 1UL << order; return expand(zone, page, index, order, current_order, area); } return NULL; } +/* + * This is bad; some way to avoid putting singleton pages on the + * deferred lists should be worked out at some point. + */ +static void split_pages(struct zone *zone, struct page *page, int page_order, int deferred_order) +{ + int split_order = deferred_order - 1; + unsigned long split_offset = 1UL << split_order; + struct page *split_page; + + while (split_order >= page_order) { + split_page = &page[split_offset]; + if (split_order) + list_add(&split_page->list, + &zone->free_area[split_order].deferred_pages); + else if (!zone->free_area[split_order].locally_free) { + INIT_LIST_HEAD(&split_page->list); + split_page->private = 1; + list_add(&split_page->lru, + &zone->free_area[split_order].deferred_pages); + } else { + struct page *head; + head = list_entry(zone->free_area[split_order].deferred_pages.next, struct page, lru); + head->private++; + list_add(&split_page->list, &head->list); + } + zone->free_area[split_order].locally_free++; + --split_order; + split_offset >>= 1; + } +} + +#define COALESCE_BATCH 256 +static inline struct page *steal_deferred_page(struct zone *zone, int order) +{ + struct page *page; + struct list_head *elem; + struct free_area *area = zone->free_area; + int found_order, k; + + if (zone->free_pages < (1 << order)) + return NULL; + + /* the range of found_order precludes order 0 */ + for (found_order = order + 1; found_order < MAX_ORDER; ++found_order) + if (!list_empty(&area[found_order].deferred_pages)) { + elem = area[found_order].deferred_pages.next; + page = list_entry(elem, struct page, list); + list_del(elem); + area[found_order].locally_free--; + split_pages(zone, page, order, found_order); + return page; + } + + for (found_order = order - 1; found_order >= 0; --found_order) { + for (k = 0; k < COALESCE_BATCH; ++k) { + unsigned long mask = (~0UL) << found_order; + if (list_empty(&area[found_order].deferred_pages)) + break; + elem = area[found_order].deferred_pages.next; + if (found_order) { + page = list_entry(elem, struct page, list); + list_del(elem); + area[found_order].locally_free--; + buddy_free(page, zone->zone_mem_map, zone, &area[found_order], mask, found_order); + } else { + LIST_HEAD(tmp); + struct page *save; + + page = list_entry(elem, struct page, lru); + list_del(elem); + area[found_order].locally_free -= page->private; + page->private = 0; + list_splice(&page->list, &tmp); + list_add(&page->list, &tmp); + list_for_each_entry_safe(page, save, &tmp, list) { + list_del(&page->list); + buddy_free(page, zone->zone_mem_map, zone, &area[found_order], mask, found_order); + } + } + } + page = buddy_alloc(zone, order); + if (page) + return page; + } + return buddy_alloc(zone, order); +} + +static inline int __rmqueue(struct zone *zone, unsigned int order, struct list_head *list) +{ + struct free_area *area = &zone->free_area[order]; + struct page *page; + int count; + + if (!list_empty(&area->deferred_pages)) { + if (order) { + page = 
list_entry(area->deferred_pages.next, struct page, list); + list_del(&page->list); + count = 1; + } else { + page = list_entry(area->deferred_pages.next, struct page, lru); + list_del(&page->lru); + count = page->private; + page->private = 0; + list_splice(&page->list, list); + } + + area->locally_free -= count; + area->active += count; + zone->free_pages -= count << order; + } else { + page = buddy_alloc(zone, order); + if (page) + count = 1; + else { + page = steal_deferred_page(zone, order); + if (page) + count = 1; + else + return 0; + } + area->active += count; + zone->free_pages -= count << order; + } + list_add(&page->list, list); + return count; +} + /* * Obtain a specified number of elements from the buddy allocator, all under * a single hold of the lock, for efficiency. Add them to the supplied list. @@ -365,17 +560,14 @@ static int rmqueue_bulk(struct zone *zon unsigned long count, struct list_head *list) { unsigned long flags; - int i; - int allocated = 0; - struct page *page; + int i, j, allocated = 0; spin_lock_irqsave(&zone->lock, flags); - for (i = 0; i < count; ++i) { - page = __rmqueue(zone, order); - if (page == NULL) + for (i = 0; i < count && allocated < count; ++i) { + j = __rmqueue(zone, order, list); + if (!j) break; - allocated++; - list_add_tail(&page->list, list); + allocated += j; } spin_unlock_irqrestore(&zone->lock, flags); return allocated; @@ -420,10 +612,14 @@ void drain_local_pages(void) pset = &zone->pageset[smp_processor_id()]; for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { struct per_cpu_pages *pcp; + struct page *page, *save; pcp = &pset->pcp[i]; - pcp->count -= free_pages_bulk(zone, pcp->count, - &pcp->list, 0); + list_for_each_entry_safe(page, save, &pcp->list, lru) { + list_del(&page->lru); + pcp->count -= page->private; + free_pages_bulk(zone, page, 0); + } } } local_irq_restore(flags); @@ -439,14 +635,27 @@ static void free_hot_cold_page(struct pa struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; + struct page *head; inc_page_state(pgfree); free_pages_check(__FUNCTION__, page); pcp = &zone->pageset[get_cpu()].pcp[cold]; local_irq_save(flags); - if (pcp->count >= pcp->high) - pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); - list_add(&page->list, &pcp->list); + while (pcp->count >= pcp->high) { + struct page *free = list_entry(pcp->list.prev, struct page, lru); + list_del(&free->lru); + pcp->count -= free->private; + free_pages_bulk(zone, free, 0); + } + head = list_entry(pcp->list.next, struct page, lru); + if (!list_empty(&pcp->list) && head->private < pcp->batch) { + list_add(&page->list, &head->list); + head->private++; + } else { + INIT_LIST_HEAD(&page->list); + list_add(&page->lru, &pcp->list); + page->private = 1; + } pcp->count++; local_irq_restore(flags); put_cpu(); @@ -471,31 +680,76 @@ void free_cold_page(struct page *page) static struct page *buffered_rmqueue(struct zone *zone, int order, int cold) { unsigned long flags; - struct page *page = NULL; + struct page *head, *page = NULL; + struct per_cpu_pages *pcp = NULL; if (order == 0) { - struct per_cpu_pages *pcp; - pcp = &zone->pageset[get_cpu()].pcp[cold]; local_irq_save(flags); - if (pcp->count <= pcp->low) - pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list); + if (pcp->count <= pcp->low) { + LIST_HEAD(tmp); + int k; + + k = rmqueue_bulk(zone, 0, pcp->batch, &tmp); + if (k) { + pcp->count += k; + head = list_entry(tmp.next, struct page, list); + list_del_init(&head->list); + head->private = k; + list_splice(&tmp, 
&head->list); + list_add(&head->lru, &pcp->list); + } + } if (pcp->count) { - page = list_entry(pcp->list.next, struct page, list); - list_del(&page->list); + head = list_entry(pcp->list.next, struct page, lru); + WARN_ON(!head->private); + if (head->private == 1) { + list_del(&head->lru); + page = head; + page->private = 0; + } else { + page = list_entry(head->list.next, struct page,list); + list_del(&page->list); + head->private--; + } pcp->count--; } local_irq_restore(flags); put_cpu(); } - if (page == NULL) { + if (!page) { + LIST_HEAD(tmp); + int count; + + if (!order) + pcp = &zone->pageset[get_cpu()].pcp[cold]; + spin_lock_irqsave(&zone->lock, flags); - page = __rmqueue(zone, order); - spin_unlock_irqrestore(&zone->lock, flags); + count = __rmqueue(zone, order, &tmp); + spin_unlock(&zone->lock); + + if (!list_empty(&tmp)) + page = list_entry(tmp.next, struct page, list); + + if (!order && count > 1) { + struct page *head; + + list_del(&page->list); + pcp->count += count - 1; + head = list_entry(tmp.next, struct page, list); + list_del_init(&head->list); + head->private = count - 1; + list_splice(&tmp, &head->list); + list_add(&head->lru, &pcp->list); + } + + local_irq_restore(flags); + if (order && page) prep_compound_page(page, order); + else if (!order) + put_cpu(); } if (page != NULL) { @@ -809,6 +1063,17 @@ static void show_node(struct zone *zone) #define show_node(zone) do { } while (0) #endif +unsigned long nr_deferred_pages(void) +{ + struct zone *zone; + unsigned long order, pages = 0; + + for_each_zone(zone) + for (order = 0; order < MAX_ORDER; ++order) + pages += zone->free_area[order].locally_free << order; + return pages; +} + /* * Accumulate the page_state information across all CPUs. * The result is unavoidably approximate - it can change @@ -979,8 +1244,7 @@ void show_free_areas(void) } for_each_zone(zone) { - struct list_head *elem; - unsigned long nr, flags, order, total = 0; + unsigned long order, total = 0; show_node(zone); printk("%s: ", zone->name); @@ -989,16 +1253,20 @@ void show_free_areas(void) continue; } - spin_lock_irqsave(&zone->lock, flags); + printk("buddy: "); + for (order = 0; order < MAX_ORDER; order++) { + printk("%lu*%lukB ", zone->free_area[order].globally_free, K(1UL) << order); + total += zone->free_area[order].globally_free << order; + } + printk("\ndefer: "); for (order = 0; order < MAX_ORDER; order++) { - nr = 0; - list_for_each(elem, &zone->free_area[order].free_list) - ++nr; - total += nr << order; - printk("%lu*%lukB ", nr, K(1UL) << order); + printk("%lu*%lukB ", zone->free_area[order].locally_free, K(1UL) << order); + total += zone->free_area[order].locally_free << order; } - spin_unlock_irqrestore(&zone->lock, flags); - printk("= %lukB\n", K(total)); + printk("\nactive: "); + for (order = 0; order < MAX_ORDER; order++) + printk("%lu*%lukB ", zone->free_area[order].active, K(1UL) << order); + printk("\n= %lukB\n", K(total)); } show_swap_cache_info(); @@ -1234,10 +1502,11 @@ static void __init free_area_init_core(s batch = zone->present_pages / 1024; if (batch * PAGE_SIZE > 256 * 1024) batch = (256 * 1024) / PAGE_SIZE; - batch /= 4; /* We effectively *= 4 below */ if (batch < 1) batch = 1; + batch *= 4; + for (cpu = 0; cpu < NR_CPUS; cpu++) { struct per_cpu_pages *pcp; @@ -1294,8 +1563,11 @@ static void __init free_area_init_core(s for (i = 0; ; i++) { unsigned long bitmap_size; - + INIT_LIST_HEAD(&zone->free_area[i].deferred_pages); INIT_LIST_HEAD(&zone->free_area[i].free_list); + zone->free_area[i].globally_free = 0; + 
zone->free_area[i].locally_free = 0; + zone->free_area[i].active = 0; if (i == MAX_ORDER-1) { zone->free_area[i].map = NULL; break; } @@ -1401,24 +1673,22 @@ static int frag_show(struct seq_file *m, pg_data_t *pgdat = (pg_data_t *)arg; struct zone *zone; struct zone *node_zones = pgdat->node_zones; - unsigned long flags; int order; for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { if (!zone->present_pages) continue; - spin_lock_irqsave(&zone->lock, flags); - seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order < MAX_ORDER; ++order) { - unsigned long nr_bufs = 0; - struct list_head *elem; - - list_for_each(elem, &(zone->free_area[order].free_list)) - ++nr_bufs; - seq_printf(m, "%6lu ", nr_bufs); - } - spin_unlock_irqrestore(&zone->lock, flags); + seq_printf(m, "Node %d, zone %8s\n", pgdat->node_id, zone->name); + seq_puts(m, "buddy: "); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6lu ", zone->free_area[order].globally_free); + seq_puts(m, "\ndefer: "); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6lu ", zone->free_area[order].locally_free); + seq_puts(m, "\nactive: "); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6lu ", zone->free_area[order].active); seq_putc(m, '\n'); } return 0; diff -prauN linux-2.5.72/mm/readahead.c wli-2.5.72-numaq-15/mm/readahead.c --- linux-2.5.72/mm/readahead.c 2003-06-16 21:19:40.000000000 -0700 +++ wli-2.5.72-numaq-15/mm/readahead.c 2003-06-18 21:26:53.000000000 -0700 @@ -217,7 +217,7 @@ __do_page_cache_readahead(struct address /* * Preallocate as many pages as we will need. */ - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); for (page_idx = 0; page_idx < nr_to_read; page_idx++) { unsigned long page_offset = offset + page_idx; @@ -228,16 +228,16 @@ __do_page_cache_readahead(struct address if (page) continue; - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); page = page_cache_alloc_cold(mapping); - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); if (!page) break; page->index = page_offset; list_add(&page->list, &page_pool); ret++; } - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); /* * Now start the IO. We ignore I/O errors - if the page is not diff -prauN linux-2.5.72/mm/rmap.c wli-2.5.72-numaq-15/mm/rmap.c --- linux-2.5.72/mm/rmap.c 2003-06-16 21:20:23.000000000 -0700 +++ wli-2.5.72-numaq-15/mm/rmap.c 2003-06-20 05:04:06.000000000 -0700 @@ -102,6 +102,143 @@ pte_chain_encode(struct pte_chain *pte_c **/ /** + * find_pte - Find a pte pointer given a vma and a struct page. + * @vma: the vma to search + * @page: the page to find + * + * Determine if this page is mapped in this vma. If it is, map and return + * the pte pointer associated with it. Return null if the page is not + * mapped in this vma for any reason. + * + * This is strictly an internal helper function for the object-based rmap + * functions. + * + * It is the caller's responsibility to unmap the pte if it is returned.
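find_pte() below inverts the linear-mapping relation to recover the virtual address from page->index. A worked example with assumed values, taking PAGE_CACHE_SHIFT == PAGE_SHIFT == 12, vm_start == 0x40000000, vm_pgoff == 0x10, page->index == 0x18:

	loffset = 0x18 << 0;				/* 0x18 */
	address = 0x40000000 + ((0x18 - 0x10) << 12);	/* 0x40008000 */

An address outside [vm_start, vm_end) means this vma does not map that file offset, hence the immediate bounds check.
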
+ */ +static inline pte_t * +find_pte(struct vm_area_struct *vma, struct page *page, unsigned long *addr) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + unsigned long loffset; + unsigned long address; + + loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)); + address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT); + if (address < vma->vm_start || address >= vma->vm_end) + goto out; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pmd = pmd_offset_map(pgd, address); + if (!pmd) + goto out; + + if (!pmd_present(*pmd)) { + pmd_unmap(pmd); + goto out; + } + + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) + goto out_unmap; + + if (page_to_pfn(page) != pte_pfn(*pte)) + goto out_unmap; + + if (addr) + *addr = address; + + pmd_unmap(pmd); + return pte; + +out_unmap: + pmd_unmap(pmd); + pte_unmap(pte); +out: + return NULL; +} + +/** + * page_referenced_obj_one - referenced check for object-based rmap + * @vma: the vma to look in. + * @page: the page we're working on. + * + * Find a pte entry for a page/vma pair, then check and clear the referenced + * bit. + * + * This is strictly a helper function for page_referenced_obj. + */ +static int +page_referenced_obj_one(struct vm_area_struct *vma, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + pte_t *pte; + int referenced = 0; + + if (!spin_trylock(&mm->page_table_lock)) + return 1; + + pte = find_pte(vma, page, NULL); + if (pte) { + if (ptep_test_and_clear_young(pte)) + referenced++; + pte_unmap(pte); + } + + spin_unlock(&mm->page_table_lock); + return referenced; +} + +/** + * page_referenced_obj - referenced check for object-based rmap + * @page: the page we're checking references on. + * + * For an object-based mapped page, find all the places it is mapped and + * check/clear the referenced flag. This is done by following the page->mapping + * pointer, then walking the chain of vmas it holds. It returns the number + * of references it found. + * + * This function is only called from page_referenced for object-based pages. + * + * The semaphore address_space->i_shared_sem is tried. If it can't be gotten, + * assume a reference count of 1. + */ +static int +page_referenced_obj(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct vm_area_struct *vma; + int referenced = 0; + + if (!page->pte.mapcount) + return 0; + + if (!mapping) + BUG(); + + if (PageSwapCache(page)) + BUG(); + + if (down_trylock(&mapping->i_shared_sem)) + return 1; + + list_for_each_entry(vma, &mapping->i_mmap, shared) + referenced += page_referenced_obj_one(vma, page); + + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) + referenced += page_referenced_obj_one(vma, page); + + up(&mapping->i_shared_sem); + + return referenced; +} + +/** * page_referenced - test if the page was referenced * @page: the page to test * @@ -120,6 +257,10 @@ int page_referenced(struct page * page) if (TestClearPageReferenced(page)) referenced++; + if (!PageAnon(page)) { + referenced += page_referenced_obj(page); + goto out; + } if (PageDirect(page)) { pte_t *pte = rmap_ptep_map(page->pte.direct); if (ptep_test_and_clear_young(pte)) @@ -153,6 +294,7 @@ int page_referenced(struct page * page) __pte_chain_free(pc); } } +out: return referenced; } @@ -175,6 +317,21 @@ page_add_rmap(struct page *page, pte_t * pte_chain_lock(page); + /* + * If this is an object-based page, just count it.
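For !PageAnon pages the page->pte union is reinterpreted as a plain mapping count rather than a chain of pte back-pointers. The union below is a sketch of the layout this code assumes; the actual field declarations live in the include/ part of the patch, outside this excerpt:

	union {
		pte_addr_t direct;		/* anon, singly mapped (PageDirect) */
		struct pte_chain *chain;	/* anon, multiply mapped */
		unsigned long mapcount;		/* !PageAnon: object-based count */
	} pte;

page_add_rmap()/page_remove_rmap() below then degenerate to counter updates for file-backed pages, touching nr_mapped only on the 0 <-> 1 transitions.
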
We can + * find the mappings by walking the object vma chain for that object. + */ + if (!PageAnon(page)) { + if (!page->mapping) + BUG(); + if (PageSwapCache(page)) + BUG(); + if (!page->pte.mapcount) + inc_page_state(nr_mapped); + page->pte.mapcount++; + goto out; + } + if (page->pte.direct == 0) { page->pte.direct = pte_paddr; SetPageDirect(page); @@ -231,8 +388,25 @@ void page_remove_rmap(struct page *page, pte_chain_lock(page); if (!page_mapped(page)) - goto out_unlock; /* remap_page_range() from a driver? */ + goto out_unlock; + /* + * If this is an object-based page, just uncount it. We can + * find the mappings by walking the object vma chain for that object. + */ + if (!PageAnon(page)) { + if (!page->mapping) + BUG(); + if (PageSwapCache(page)) + BUG(); + if (!page->pte.mapcount) + BUG(); + page->pte.mapcount--; + if (!page->pte.mapcount) + dec_page_state(nr_mapped); + goto out_unlock; + } + if (PageDirect(page)) { if (page->pte.direct == pte_paddr) { page->pte.direct = 0; @@ -279,6 +453,102 @@ out_unlock: } /** + * try_to_unmap_obj_one - unmap a page in one vma using the object-based rmap method + * @vma: the vma the page may be mapped in + * @page: the page to unmap + * + * Determine whether a page is mapped in a given vma and unmap it if it's found. + * + * This function is strictly a helper function for try_to_unmap_obj. + */ +static inline int +try_to_unmap_obj_one(struct vm_area_struct *vma, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pte_t *pte; + pte_t pteval; + int ret = SWAP_AGAIN; + + if (!spin_trylock(&mm->page_table_lock)) + return ret; + + pte = find_pte(vma, page, &address); + if (!pte) + goto out; + + if (vma->vm_flags & VM_LOCKED) { + ret = SWAP_FAIL; + goto out_unmap; + } + + flush_cache_page(vma, address); + pteval = ptep_get_and_clear(pte); + flush_tlb_page(vma, address); + + if (pte_dirty(pteval)) + set_page_dirty(page); + + if (!page->pte.mapcount) + BUG(); + + mm->rss--; + page->pte.mapcount--; + page_cache_release(page); + +out_unmap: + pte_unmap(pte); + +out: + spin_unlock(&mm->page_table_lock); + return ret; +} + +/** + * try_to_unmap_obj - unmap a page using the object-based rmap method + * @page: the page to unmap + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the address_space struct it points to. + * + * This function is only called from try_to_unmap for object-based pages. + * + * The semaphore address_space->i_shared_sem is tried. If it can't be gotten, + * return a temporary error. + */ +static int +try_to_unmap_obj(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct vm_area_struct *vma; + int ret = SWAP_AGAIN; + + if (!mapping) + BUG(); + + if (PageSwapCache(page)) + BUG(); + + if (down_trylock(&mapping->i_shared_sem)) + return ret; + + list_for_each_entry(vma, &mapping->i_mmap, shared) { + ret = try_to_unmap_obj_one(vma, page); + if (ret == SWAP_FAIL || !page->pte.mapcount) + goto out; + } + + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + ret = try_to_unmap_obj_one(vma, page); + if (ret == SWAP_FAIL || !page->pte.mapcount) + goto out; + } + +out: + up(&mapping->i_shared_sem); + return ret; +} + +/** * try_to_unmap_one - worker function for try_to_unmap * @page: page to unmap * @ptep: page table entry to unmap from page @@ -329,7 +599,7 @@ static int try_to_unmap_one(struct page /* Nuke the page table entry.
@@ -329,7 +599,7 @@ static int try_to_unmap_one(struct page
 	/* Nuke the page table entry. */
 	flush_cache_page(vma, address);
-	pte = ptep_get_and_clear(ptep);
+	pte = vm_ptep_get_and_clear(vma, ptep, address);
 	flush_tlb_page(vma, address);
 
 	if (PageSwapCache(page)) {
@@ -339,7 +609,7 @@ static int try_to_unmap_one(struct page
 		 */
 		swp_entry_t entry = { .val = page->index };
 		swap_duplicate(entry);
-		set_pte(ptep, swp_entry_to_pte(entry));
+		vm_set_pte(vma, ptep, swp_entry_to_pte(entry), address);
 		BUG_ON(pte_file(*ptep));
 	} else {
 		unsigned long pgidx;
@@ -351,7 +621,7 @@ static int try_to_unmap_one(struct page
 		pgidx += vma->vm_pgoff;
 		pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
 		if (page->index != pgidx) {
-			set_pte(ptep, pgoff_to_pte(page->index));
+			vm_set_pte(vma, ptep, pgoff_to_pte(page->index), address);
 			BUG_ON(!pte_file(*ptep));
 		}
 	}
@@ -397,6 +667,15 @@ int try_to_unmap(struct page * page)
 	if (!page->mapping)
 		BUG();
 
+	/*
+	 * If it's an object-based page, use the object vma chain to find all
+	 * the mappings.
+	 */
+	if (!PageAnon(page)) {
+		ret = try_to_unmap_obj(page);
+		goto out;
+	}
+
 	if (PageDirect(page)) {
 		ret = try_to_unmap_one(page, page->pte.direct);
 		if (ret == SWAP_SUCCESS) {
@@ -452,12 +731,115 @@ int try_to_unmap(struct page * page)
 		}
 	}
 out:
-	if (!page_mapped(page))
+	if (!page_mapped(page)) {
 		dec_page_state(nr_mapped);
+		ret = SWAP_SUCCESS;
+	}
 	return ret;
 }
 
 /**
+ * page_convert_anon - convert an object-based mapped page to pte_chain-based
+ * @page: the page to convert
+ *
+ * Find all the mappings for an object-based page and convert them
+ * to 'anonymous', i.e. create a pte_chain and store all the pte pointers
+ * there.
+ *
+ * This function takes the address_space->i_shared_sem, sets the PageAnon
+ * flag, then takes the mm->page_table_lock for each vma and calls
+ * page_add_rmap.  This means there is a period when PageAnon is set, but
+ * the page still has some mappings with no pte_chain entry.  This is in
+ * fact safe, since page_remove_rmap will simply not find it.  try_to_unmap
+ * might erroneously return success, but it will never be called because
+ * the page_convert_anon() caller has locked the page.
+ *
+ * page_referenced() may fail to scan all the appropriate ptes and may
+ * return an inaccurate result.  This is so rare that it does not matter.
+ */
+int page_convert_anon(struct page *page)
+{
+	struct address_space *mapping;
+	struct vm_area_struct *vma;
+	struct pte_chain *pte_chain = NULL;
+	pte_t *pte;
+	int err = 0;
+
+	mapping = page->mapping;
+	if (mapping == NULL)
+		goto out;		/* truncate won the lock_page() race */
+
+	down(&mapping->i_shared_sem);
+	pte_chain_lock(page);
+
+	/*
+	 * Has someone else done it for us before we got the lock?
+	 * If so, pte.direct or pte.chain has replaced pte.mapcount.
+	 */
+	if (PageAnon(page)) {
+		pte_chain_unlock(page);
+		goto out_unlock;
+	}
+
+	SetPageAnon(page);
+	if (page->pte.mapcount == 0) {
+		pte_chain_unlock(page);
+		goto out_unlock;
+	}
+	/* This will be incremented again by page_add_rmap below. */
+	dec_page_state(nr_mapped);
+	page->pte.mapcount = 0;
+
+	/*
+	 * Now that the page is marked as anon, unlock it.  page_add_rmap
+	 * will lock it as necessary.
+	 */
+	pte_chain_unlock(page);
+
+	list_for_each_entry(vma, &mapping->i_mmap, shared) {
+		if (!pte_chain) {
+			pte_chain = pte_chain_alloc(GFP_KERNEL);
+			if (!pte_chain) {
+				err = -ENOMEM;
+				goto out_unlock;
+			}
+		}
+		spin_lock(&vma->vm_mm->page_table_lock);
+		pte = find_pte(vma, page, NULL);
+		if (pte) {
+			/* Make sure this isn't a duplicate */
+			page_remove_rmap(page, pte);
+			pte_chain = page_add_rmap(page, pte, pte_chain);
+			pte_unmap(pte);
+		}
+		spin_unlock(&vma->vm_mm->page_table_lock);
+	}
+	list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
+		if (!pte_chain) {
+			pte_chain = pte_chain_alloc(GFP_KERNEL);
+			if (!pte_chain) {
+				err = -ENOMEM;
+				goto out_unlock;
+			}
+		}
+		spin_lock(&vma->vm_mm->page_table_lock);
+		pte = find_pte(vma, page, NULL);
+		if (pte) {
+			/* Make sure this isn't a duplicate */
+			page_remove_rmap(page, pte);
+			pte_chain = page_add_rmap(page, pte, pte_chain);
+			pte_unmap(pte);
+		}
+		spin_unlock(&vma->vm_mm->page_table_lock);
+	}
+
+out_unlock:
+	pte_chain_free(pte_chain);
+	up(&mapping->i_shared_sem);
+out:
+	return err;
+}
+
+/**
 ** No more VM stuff below this comment, only pte_chain helper
 ** functions.
 **/
@@ -489,38 +871,6 @@ void __pte_chain_free(struct pte_chain *
 	put_cpu();
 }
 
-/*
- * pte_chain_alloc(): allocate a pte_chain structure for use by page_add_rmap().
- *
- * The caller of page_add_rmap() must perform the allocation because
- * page_add_rmap() is invariably called under spinlock.  Often, page_add_rmap()
- * will not actually use the pte_chain, because there is space available in one
- * of the existing pte_chains which are attached to the page.  So the case of
- * allocating and then freeing a single pte_chain is specially optimised here,
- * with a one-deep per-cpu cache.
- */
-struct pte_chain *pte_chain_alloc(int gfp_flags)
-{
-	int cpu;
-	struct pte_chain *ret;
-	struct pte_chain **pte_chainp;
-
-	if (gfp_flags & __GFP_WAIT)
-		might_sleep();
-
-	cpu = get_cpu();
-	pte_chainp = &per_cpu(local_pte_chain, cpu);
-	if (*pte_chainp) {
-		ret = *pte_chainp;
-		*pte_chainp = NULL;
-		put_cpu();
-	} else {
-		put_cpu();
-		ret = kmem_cache_alloc(pte_chain_cache, gfp_flags);
-	}
-	return ret;
-}
-
 void __init pte_chain_init(void)
 {
 	pte_chain_cache = kmem_cache_create(	"pte_chain",
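
The mm/swap_state.c conversion below (and mm/swapfile.c after it) relies on the mapping_rwlock primitives introduced elsewhere in this patch: page_lock becomes a reader/writer lock, so lookups can proceed concurrently while insertions and removals still serialize. Judging from their use, they are presumably thin wrappers over rwlock_t, along these lines (a guess at the intent, not the patch's actual definitions):

/* Presumed shape of the mapping rwlock API; illustrative only. */
#include <linux/spinlock.h>

typedef rwlock_t mapping_rwlock_t;
#define MAPPING_RW_LOCK_UNLOCKED	RW_LOCK_UNLOCKED

#define mapping_rdlock(lock)		read_lock(lock)
#define mapping_rdunlock(lock)		read_unlock(lock)
#define mapping_wrlock(lock)		write_lock(lock)
#define mapping_wrunlock(lock)		write_unlock(lock)

Under that reading, the read side shows up below only in exclusive_swap_page(), which merely rechecks a page count; every path that modifies the radix tree takes the write side.
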
diff -prauN linux-2.5.72/mm/swap_state.c wli-2.5.72-numaq-15/mm/swap_state.c
--- linux-2.5.72/mm/swap_state.c	2003-06-16 21:20:00.000000000 -0700
+++ wli-2.5.72-numaq-15/mm/swap_state.c	2003-06-18 21:26:53.000000000 -0700
@@ -33,7 +33,7 @@ extern struct address_space_operations s
 struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
-	.page_lock	= SPIN_LOCK_UNLOCKED,
+	.page_lock	= MAPPING_RW_LOCK_UNLOCKED,
 	.clean_pages	= LIST_HEAD_INIT(swapper_space.clean_pages),
 	.dirty_pages	= LIST_HEAD_INIT(swapper_space.dirty_pages),
 	.io_pages	= LIST_HEAD_INIT(swapper_space.io_pages),
@@ -190,9 +190,9 @@ void delete_from_swap_cache(struct page
 
 	entry.val = page->index;
 
-	spin_lock(&swapper_space.page_lock);
+	mapping_wrlock(&swapper_space.page_lock);
 	__delete_from_swap_cache(page);
-	spin_unlock(&swapper_space.page_lock);
+	mapping_wrunlock(&swapper_space.page_lock);
 
 	swap_free(entry);
 	page_cache_release(page);
@@ -203,8 +203,8 @@ int move_to_swap_cache(struct page *page
 	struct address_space *mapping = page->mapping;
 	int err;
 
-	spin_lock(&swapper_space.page_lock);
-	spin_lock(&mapping->page_lock);
+	mapping_wrlock(&swapper_space.page_lock);
+	mapping_wrlock(&mapping->page_lock);
 
 	err = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
 	if (!err) {
@@ -212,8 +212,8 @@ int move_to_swap_cache(struct page *page
 		___add_to_page_cache(page, &swapper_space, entry.val);
 	}
 
-	spin_unlock(&mapping->page_lock);
-	spin_unlock(&swapper_space.page_lock);
+	mapping_wrunlock(&mapping->page_lock);
+	mapping_wrunlock(&swapper_space.page_lock);
 
 	if (!err) {
 		if (!swap_duplicate(entry))
@@ -239,8 +239,8 @@ int move_from_swap_cache(struct page *pa
 
 	entry.val = page->index;
 
-	spin_lock(&swapper_space.page_lock);
-	spin_lock(&mapping->page_lock);
+	mapping_wrlock(&swapper_space.page_lock);
+	mapping_wrlock(&mapping->page_lock);
 
 	err = radix_tree_insert(&mapping->page_tree, index, page);
 	if (!err) {
@@ -248,8 +248,8 @@ int move_from_swap_cache(struct page *pa
 		___add_to_page_cache(page, mapping, index);
 	}
 
-	spin_unlock(&mapping->page_lock);
-	spin_unlock(&swapper_space.page_lock);
+	mapping_wrunlock(&mapping->page_lock);
+	mapping_wrunlock(&swapper_space.page_lock);
 
 	if (!err) {
 		swap_free(entry);
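
In the mm/swapfile.c hunks below, unuse_pte() switches to vm_set_pte()/vm_ptep_get_and_clear(), the same wrappers try_to_unmap_one() picked up earlier. Their definitions live elsewhere in the patch; the extra vma and address arguments suggest they exist so pte updates can hook per-vma TLB and cache state. Where no such hook is needed, they presumably reduce to the plain operations, roughly (an assumption for the reader's benefit, not the patch's code):

/* Presumed fallback expansion of the vm_*pte wrappers; illustrative. */
#define vm_set_pte(vma, ptep, pte, addr)	set_pte(ptep, pte)
#define vm_ptep_get_and_clear(vma, ptep, addr)	ptep_get_and_clear(ptep)
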
diff -prauN linux-2.5.72/mm/swapfile.c wli-2.5.72-numaq-15/mm/swapfile.c
--- linux-2.5.72/mm/swapfile.c	2003-06-16 21:19:42.000000000 -0700
+++ wli-2.5.72-numaq-15/mm/swapfile.c	2003-06-20 05:04:06.000000000 -0700
@@ -248,10 +248,10 @@ static int exclusive_swap_page(struct pa
 	/* Is the only swap cache user the cache itself? */
 	if (p->swap_map[swp_offset(entry)] == 1) {
 		/* Recheck the page count with the pagecache lock held.. */
-		spin_lock(&swapper_space.page_lock);
+		mapping_rdlock(&swapper_space.page_lock);
 		if (page_count(page) - !!PagePrivate(page) == 2)
 			retval = 1;
-		spin_unlock(&swapper_space.page_lock);
+		mapping_rdunlock(&swapper_space.page_lock);
 	}
 	swap_info_put(p);
 }
@@ -319,13 +319,13 @@ int remove_exclusive_swap_page(struct pa
 	retval = 0;
 	if (p->swap_map[swp_offset(entry)] == 1) {
 		/* Recheck the page count with the pagecache lock held.. */
-		spin_lock(&swapper_space.page_lock);
+		mapping_wrlock(&swapper_space.page_lock);
 		if ((page_count(page) == 2) && !PageWriteback(page)) {
 			__delete_from_swap_cache(page);
 			SetPageDirty(page);
 			retval = 1;
 		}
-		spin_unlock(&swapper_space.page_lock);
+		mapping_wrunlock(&swapper_space.page_lock);
 	}
 	swap_info_put(p);
 
@@ -384,7 +384,8 @@ unuse_pte(struct vm_area_struct *vma, un
 {
 	vma->vm_mm->rss++;
 	get_page(page);
-	set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
+	SetPageAnon(page);
+	vm_set_pte(vma, dir, pte_mkold(mk_pte(page, vma->vm_page_prot)), address);
 	*pte_chainp = page_add_rmap(page, dir, *pte_chainp);
 	swap_free(entry);
 }
@@ -444,7 +445,7 @@ static int unuse_pgd(struct vm_area_stru
 		pgd_clear(dir);
 		return 0;
 	}
-	pmd = pmd_offset(dir, address);
+	pmd = pmd_offset_map(dir, address);
 	offset = address & PGDIR_MASK;
 	address &= ~PGDIR_MASK;
 	end = address + size;
@@ -459,6 +460,7 @@ static int unuse_pgd(struct vm_area_stru
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address && (address < end));
+	pmd_unmap(pmd - 1);
 	return 0;
 }
 
diff -prauN linux-2.5.72/mm/truncate.c wli-2.5.72-numaq-15/mm/truncate.c
--- linux-2.5.72/mm/truncate.c	2003-06-16 21:20:24.000000000 -0700
+++ wli-2.5.72-numaq-15/mm/truncate.c	2003-06-18 21:26:53.000000000 -0700
@@ -73,13 +73,13 @@ invalidate_complete_page(struct address_
 	if (PagePrivate(page) && !try_to_release_page(page, 0))
 		return 0;
 
-	spin_lock(&mapping->page_lock);
+	mapping_wrlock(&mapping->page_lock);
 	if (PageDirty(page)) {
-		spin_unlock(&mapping->page_lock);
+		mapping_wrunlock(&mapping->page_lock);
 		return 0;
 	}
 	__remove_from_page_cache(page);
-	spin_unlock(&mapping->page_lock);
+	mapping_wrunlock(&mapping->page_lock);
 	ClearPageUptodate(page);
 	page_cache_release(page);	/* pagecache ref */
 	return 1;
diff -prauN linux-2.5.72/mm/vmalloc.c wli-2.5.72-numaq-15/mm/vmalloc.c
--- linux-2.5.72/mm/vmalloc.c	2003-06-16 21:20:05.000000000 -0700
+++ wli-2.5.72-numaq-15/mm/vmalloc.c	2003-06-18 19:11:28.000000000 -0700
@@ -70,7 +70,7 @@ static void unmap_area_pmd(pgd_t *dir, u
 		return;
 	}
 
-	pmd = pmd_offset(dir, address);
+	pmd = pmd_offset_kernel(dir, address);
 	address &= ~PGDIR_MASK;
 	end = address + size;
 	if (end > PGDIR_SIZE)
@@ -159,7 +159,7 @@ int map_vm_area(struct vm_struct *area, 
 	dir = pgd_offset_k(address);
 	spin_lock(&init_mm.page_table_lock);
 	do {
-		pmd_t *pmd = pmd_alloc(&init_mm, dir, address);
+		pmd_t *pmd = pmd_alloc_kernel(&init_mm, dir, address);
 		if (!pmd) {
 			err = -ENOMEM;
 			break;
 		}
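
The mm/vmscan.c changes below are more of the page_lock conversion; the mm/vmalloc.c hunks above, by contrast, belong to CONFIG_HIGHPMD: once 2nd-level pagetables may live in highmem, user pmds have to be mapped before use, while kernel pagetables (init_mm) never are. A sketch of that split follows; the real definitions live in the include/asm-i386 portion of this patch, and KM_PMD, pgd_page() and __pmd_offset() as used here are assumptions:

/* Illustrative HIGHPMD split; not the patch's actual definitions. */
#ifdef CONFIG_HIGHPMD
static inline pmd_t *pmd_offset_map(pgd_t *pgd, unsigned long addr)
{
	/* The pmd page may be in highmem: kmap it before dereferencing. */
	pmd_t *pmd = kmap_atomic(pgd_page(*pgd), KM_PMD);
	return pmd + __pmd_offset(addr);
}
static inline void pmd_unmap(pmd_t *pmd)
{
	kunmap_atomic((void *)((unsigned long)pmd & PAGE_MASK), KM_PMD);
}
#else
#define pmd_offset_map(pgd, addr)	pmd_offset(pgd, addr)
#define pmd_unmap(pmd)			do { } while (0)
#endif

/* Kernel pagetables are allocated from lowmem; no mapping step needed. */
#define pmd_offset_kernel(pgd, addr)	pmd_offset(pgd, addr)

This is why vmalloc.c switches to pmd_offset_kernel()/pmd_alloc_kernel() while the user-mm walkers (find_pte, unuse_pgd) pair pmd_offset_map() with pmd_unmap().
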
diff -prauN linux-2.5.72/mm/vmscan.c wli-2.5.72-numaq-15/mm/vmscan.c
--- linux-2.5.72/mm/vmscan.c	2003-06-16 21:19:41.000000000 -0700
+++ wli-2.5.72-numaq-15/mm/vmscan.c	2003-06-18 21:26:53.000000000 -0700
@@ -324,7 +324,7 @@ shrink_list(struct list_head *page_list,
 			goto keep_locked;
 		if (!may_write_to_queue(mapping->backing_dev_info))
 			goto keep_locked;
-		spin_lock(&mapping->page_lock);
+		mapping_wrlock(&mapping->page_lock);
 		if (test_clear_page_dirty(page)) {
 			int res;
 			struct writeback_control wbc = {
@@ -335,7 +335,7 @@ shrink_list(struct list_head *page_list,
 			};
 
 			list_move(&page->list, &mapping->locked_pages);
-			spin_unlock(&mapping->page_lock);
+			mapping_wrunlock(&mapping->page_lock);
 
 			SetPageReclaim(page);
 			res = mapping->a_ops->writepage(page, &wbc);
@@ -350,7 +350,7 @@ shrink_list(struct list_head *page_list,
 			}
 			goto keep;
 		}
-		spin_unlock(&mapping->page_lock);
+		mapping_wrunlock(&mapping->page_lock);
 	}
 
 	/*
@@ -384,7 +384,7 @@ shrink_list(struct list_head *page_list,
 		if (!mapping)
 			goto keep_locked;	/* truncate got there first */
 
-		spin_lock(&mapping->page_lock);
+		mapping_wrlock(&mapping->page_lock);
 
 		/*
 		 * The non-racy check for busy page.  It is critical to check
@@ -392,7 +392,7 @@ shrink_list(struct list_head *page_list,
 		 * not in use by anybody. 	(pagecache + us == 2)
 		 */
 		if (page_count(page) != 2 || PageDirty(page)) {
-			spin_unlock(&mapping->page_lock);
+			mapping_wrunlock(&mapping->page_lock);
 			goto keep_locked;
 		}
 
@@ -400,7 +400,7 @@ shrink_list(struct list_head *page_list,
 		if (PageSwapCache(page)) {
 			swp_entry_t swap = { .val = page->index };
 			__delete_from_swap_cache(page);
-			spin_unlock(&mapping->page_lock);
+			mapping_wrunlock(&mapping->page_lock);
 			swap_free(swap);
 			__put_page(page);	/* The pagecache ref */
 			goto free_it;
@@ -408,7 +408,7 @@ shrink_list(struct list_head *page_list,
 #endif /* CONFIG_SWAP */
 
 		__remove_from_page_cache(page);
-		spin_unlock(&mapping->page_lock);
+		mapping_wrunlock(&mapping->page_lock);
 		__put_page(page);
 
 free_it: