From pgcl-2.5.70-bk9-2:
Do some intelligent pagetable preconstruction, in combination with partially
restoring struct mmu_gather's opacity to the core VM.

From pgcl-2.5.70-bk9-3:
Also inline various things, both to cope with identified regressions and for
utterly trivial functions that can now be inlined since the structure
declaration is no longer private. This hopefully addresses the performance
degradation in pte_alloc_one() identified by Randy Hron. Further tuning may
be required to keep space consumption more tightly bounded.

Unfortunately, no progress has been made on either the sysenter bug or the
bug encountered during the AIM7 run.

-- wli

 arch/i386/Kconfig | 12
 arch/i386/boot/setup.S | 5
 arch/i386/kernel/apic.c | 4
 arch/i386/kernel/cpu/amd.c | 2
 arch/i386/kernel/cpu/mtrr/amd.c | 10
 arch/i386/kernel/cpu/mtrr/centaur.c | 12
 arch/i386/kernel/cpu/mtrr/cyrix.c | 4
 arch/i386/kernel/cpu/mtrr/generic.c | 16 -
 arch/i386/kernel/cpu/mtrr/if.c | 28 -
 arch/i386/kernel/cpu/mtrr/main.c | 10
 arch/i386/kernel/entry.S | 2
 arch/i386/kernel/head.S | 3
 arch/i386/kernel/microcode.c | 2
 arch/i386/kernel/mpparse.c | 6
 arch/i386/kernel/numaq.c | 5
 arch/i386/kernel/setup.c | 18 -
 arch/i386/kernel/smpboot.c | 2
 arch/i386/kernel/srat.c | 106 ++----
 arch/i386/kernel/sys_i386.c | 4
 arch/i386/kernel/traps.c | 3
 arch/i386/lib/getuser.S | 2
 arch/i386/mm/Makefile | 2
 arch/i386/mm/discontig.c | 62 ++--
 arch/i386/mm/fault.c | 169 +++++++++--
 arch/i386/mm/highmem.c | 67 +++-
 arch/i386/mm/init.c | 206 ++++++++----
 arch/i386/mm/ioremap.c | 24 -
 arch/i386/mm/pageattr.c | 41 +-
 arch/i386/mm/pgtable.c | 145 ++++---
 arch/i386/mm/tlb.c | 133 ++++++++
 arch/i386/pci/i386.c | 2
 drivers/block/ll_rw_blk.c | 2
 drivers/char/agp/backend.c | 8
 drivers/char/agp/generic.c | 12
 drivers/char/mem.c | 42 +-
 drivers/oprofile/buffer_sync.c | 2
 drivers/scsi/qlogicisp.c | 2
 drivers/scsi/sym53c8xx.c | 6
 drivers/scsi/sym53c8xx_2/sym_glue.c | 4
 drivers/scsi/sym53c8xx_comm.h | 4
 fs/aio.c | 37 +-
 fs/binfmt_elf.c | 22 -
 fs/bio.c | 18 -
 fs/direct-io.c | 82 ++++-
 fs/exec.c | 108 ++++---
 fs/ext2/dir.c | 8
 fs/file_table.c | 2
 fs/inode.c | 4
 fs/proc/base.c | 46 ++-
 fs/proc/proc_misc.c | 2
 fs/proc/task_mmu.c | 2
 include/asm-alpha/page.h | 2
 include/asm-arm/page.h | 2
 include/asm-cris/page.h | 2
 include/asm-generic/page.h | 11
 include/asm-generic/rmap.h | 54 ++-
 include/asm-generic/tlb.h | 10
 include/asm-i386/dma-mapping.h | 2
 include/asm-i386/fixmap.h | 70 +++-
 include/asm-i386/highmem.h | 19 -
 include/asm-i386/io.h | 30 +
 include/asm-i386/io_apic.h | 2
 include/asm-i386/mmzone.h | 40 +-
 include/asm-i386/numaq.h | 4
 include/asm-i386/page.h | 50 ++-
 include/asm-i386/pci.h | 4
 include/asm-i386/pgalloc.h | 97 ++++--
 include/asm-i386/pgtable-2level.h | 11
 include/asm-i386/pgtable-3level.h | 15
 include/asm-i386/pgtable.h | 75 ++--
 include/asm-i386/rmap.h | 11
 include/asm-i386/setup.h | 8
 include/asm-i386/shmparam.h | 2
 include/asm-i386/thread_info.h | 10
 include/asm-i386/tlb.h | 201 ++++++++++++-
 include/asm-i386/tlbflush.h | 19 -
 include/asm-ia64/page.h | 2
 include/asm-m68k/page.h | 2
 include/asm-m68knommu/page.h | 2
 include/asm-mips/page.h | 2
 include/asm-mips64/page.h | 2
 include/asm-parisc/page.h | 2
 include/asm-ppc/page.h | 2
 include/asm-ppc64/page.h | 2
 include/asm-s390/page.h | 2
 include/asm-sh/page.h | 2
 include/asm-sparc/page.h | 2
 include/asm-sparc64/page.h | 2
 include/asm-v850/page.h | 1
 include/asm-x86_64/page.h | 2
 include/linux/aio.h | 4
 include/linux/binfmts.h | 10
 include/linux/bio.h | 14
 include/linux/gfp.h | 5
 include/linux/highmem.h | 11
include/linux/ide.h | 2 include/linux/mm.h | 48 ++- include/linux/mmzone.h | 2 include/linux/pagemap.h | 34 -- include/linux/sched.h | 9 include/linux/shm.h | 2 include/linux/sunrpc/svc.h | 3 include/linux/swap.h | 6 init/main.c | 5 ipc/shm.c | 10 kernel/fork.c | 9 kernel/futex.c | 27 + kernel/ksyms.c | 1 kernel/ptrace.c | 25 + mm/bootmem.c | 118 +++---- mm/filemap.c | 38 +- mm/fremap.c | 11 mm/highmem.c | 51 ++- mm/madvise.c | 10 mm/memory.c | 506 +++++++++++++++++++++++++-------- mm/mincore.c | 32 +- mm/mlock.c | 18 - mm/mmap.c | 88 +++-- mm/mprotect.c | 12 mm/mremap.c | 30 - mm/msync.c | 6 mm/page-writeback.c | 4 mm/page_alloc.c | 24 + mm/page_io.c | 8 mm/rmap.c | 9 mm/shmem.c | 58 +-- mm/slab.c | 4 mm/swap.c | 2 mm/swap_state.c | 16 - mm/swapfile.c | 136 +++++++- mm/vcache.c | 2 mm/vmalloc.c | 148 +++------ mm/vmscan.c | 2 net/ipv4/netfilter/ip_conntrack_core.c | 4 net/ipv4/tcp.c | 4 135 files changed, 2595 insertions(+), 1271 deletions(-) diff -prauN linux-2.5.70-bk10/arch/i386/Kconfig pgcl-2.5.70-bk10-1/arch/i386/Kconfig --- linux-2.5.70-bk10/arch/i386/Kconfig 2003-06-05 05:43:43.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/Kconfig 2003-06-05 09:44:34.000000000 -0700 @@ -676,6 +676,18 @@ config X86_PAE depends on HIGHMEM64G default y +config PAGE_CLUSTER + int "Page clustering factor" + default 3 if HIGHMEM64G + default 2 if HIGHMEM4G + default 1 + help + Select page clustering factor as a power of 2. + Defaults and examples: + 3 => 32KB PAGE_SIZE + 2 => 16KB PAGE_SIZE + 1 => 8KB PAGE_SIZE + # Common NUMA Features config NUMA bool "Numa Memory Allocation Support" diff -prauN linux-2.5.70-bk10/arch/i386/boot/setup.S pgcl-2.5.70-bk10-1/arch/i386/boot/setup.S --- linux-2.5.70-bk10/arch/i386/boot/setup.S 2003-05-26 18:00:41.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/boot/setup.S 2003-06-05 09:44:34.000000000 -0700 @@ -58,6 +58,9 @@ #include #include #include + +#define VMALLOC_START (-0xC0000000 - 128*1024*1024) +#include #include /* Signature words to ensure LILO loaded us right */ @@ -162,7 +165,7 @@ cmd_line_ptr: .long 0 # (Header versio # can be located anywhere in # low memory 0x10000 or higher. -ramdisk_max: .long MAXMEM-1 # (Header version 0x0203 or later) +ramdisk_max: .long __MAXMEM-1 # (Header version 0x0203 or later) # The highest safe address for # the contents of an initrd diff -prauN linux-2.5.70-bk10/arch/i386/kernel/apic.c pgcl-2.5.70-bk10-1/arch/i386/kernel/apic.c --- linux-2.5.70-bk10/arch/i386/kernel/apic.c 2003-05-26 18:01:02.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/kernel/apic.c 2003-06-05 09:44:34.000000000 -0700 @@ -679,7 +679,7 @@ void __init init_apic_mappings(void) * one for the IO-APIC. 
*/ if (!smp_found_config && detect_init_APIC()) { - apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); + apic_phys = (unsigned long) alloc_bootmem_pages(MMUPAGE_SIZE); apic_phys = __pa(apic_phys); } else apic_phys = mp_lapic_addr; @@ -711,7 +711,7 @@ void __init init_apic_mappings(void) } } else { fake_ioapic_page: - ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = (unsigned long) alloc_bootmem_pages(MMUPAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } set_fixmap_nocache(idx, ioapic_phys); diff -prauN linux-2.5.70-bk10/arch/i386/kernel/cpu/amd.c pgcl-2.5.70-bk10-1/arch/i386/kernel/cpu/amd.c --- linux-2.5.70-bk10/arch/i386/kernel/cpu/amd.c 2003-05-26 18:00:59.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/kernel/cpu/amd.c 2003-06-05 09:44:34.000000000 -0700 @@ -25,7 +25,7 @@ __asm__(".align 4\nvide: ret"); static void __init init_amd(struct cpuinfo_x86 *c) { u32 l, h; - int mbytes = num_physpages >> (20-PAGE_SHIFT); + int mbytes = num_physpages >> (20-MMUPAGE_SHIFT); int r; /* diff -prauN linux-2.5.70-bk10/arch/i386/kernel/cpu/mtrr/amd.c pgcl-2.5.70-bk10-1/arch/i386/kernel/cpu/mtrr/amd.c --- linux-2.5.70-bk10/arch/i386/kernel/cpu/mtrr/amd.c 2003-05-26 18:00:40.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/kernel/cpu/mtrr/amd.c 2003-06-05 09:44:34.000000000 -0700 @@ -16,7 +16,7 @@ amd_get_mtrr(unsigned int reg, unsigned if (reg == 1) low = high; /* The base masks off on the right alignment */ - *base = (low & 0xFFFE0000) >> PAGE_SHIFT; + *base = (low & 0xFFFE0000) >> MMUPAGE_SHIFT; *type = 0; if (low & 1) *type = MTRR_TYPE_UNCACHABLE; @@ -42,7 +42,7 @@ amd_get_mtrr(unsigned int reg, unsigned * *128K ... */ low = (~low) & 0x1FFFC; - *size = (low + 4) << (15 - PAGE_SHIFT); + *size = (low + 4) << (15 - MMUPAGE_SHIFT); return; } @@ -77,8 +77,8 @@ static void amd_set_mtrr(unsigned int re desired 111 1111 1111 1100 mask But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! */ - regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC) - | (base << PAGE_SHIFT) | (type + 1); + regs[reg] = (-size >> (15 - MMUPAGE_SHIFT) & 0x0001FFFC) + | (base << MMUPAGE_SHIFT) | (type + 1); /* * The writeback rule is quite specific. See the manual. 
Its @@ -97,7 +97,7 @@ static int amd_validate_add_page(unsigne o Power of 2 block o base suitably aligned to the power */ - if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT)) + if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - MMUPAGE_SHIFT)) || (size & ~(size - 1)) - size || (base & (size - 1))) return -EINVAL; return 0; diff -prauN linux-2.5.70-bk10/arch/i386/kernel/cpu/mtrr/centaur.c pgcl-2.5.70-bk10-1/arch/i386/kernel/cpu/mtrr/centaur.c --- linux-2.5.70-bk10/arch/i386/kernel/cpu/mtrr/centaur.c 2003-05-26 18:00:27.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/kernel/cpu/mtrr/centaur.c 2003-06-05 09:44:34.000000000 -0700 @@ -51,8 +51,8 @@ static void centaur_get_mcr(unsigned int reg, unsigned long *base, unsigned int *size, mtrr_type * type) { - *base = centaur_mcr[reg].high >> PAGE_SHIFT; - *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; + *base = centaur_mcr[reg].high >> MMUPAGE_SHIFT; + *size = -(centaur_mcr[reg].low & 0xfffff000) >> MMUPAGE_SHIFT; *type = MTRR_TYPE_WRCOMB; /* If it is there, it is write-combining */ if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2)) *type = MTRR_TYPE_UNCACHABLE; @@ -72,14 +72,14 @@ static void centaur_set_mcr(unsigned int /* Disable */ high = low = 0; } else { - high = base << PAGE_SHIFT; + high = base << MMUPAGE_SHIFT; if (centaur_mcr_type == 0) - low = -size << PAGE_SHIFT | 0x1f; /* only support write-combining... */ + low = -size << MMUPAGE_SHIFT | 0x1f; /* only support write-combining... */ else { if (type == MTRR_TYPE_UNCACHABLE) - low = -size << PAGE_SHIFT | 0x02; /* NC */ + low = -size << MMUPAGE_SHIFT | 0x02; /* NC */ else - low = -size << PAGE_SHIFT | 0x09; /* WWO,WC */ + low = -size << MMUPAGE_SHIFT | 0x09; /* WWO,WC */ } } centaur_mcr[reg].high = high; diff -prauN linux-2.5.70-bk10/arch/i386/kernel/cpu/mtrr/cyrix.c pgcl-2.5.70-bk10-1/arch/i386/kernel/cpu/mtrr/cyrix.c --- linux-2.5.70-bk10/arch/i386/kernel/cpu/mtrr/cyrix.c 2003-05-26 18:00:40.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/kernel/cpu/mtrr/cyrix.c 2003-06-05 09:44:34.000000000 -0700 @@ -30,7 +30,7 @@ cyrix_get_arr(unsigned int reg, unsigned /* Enable interrupts if it was enabled previously */ local_irq_restore(flags); shift = ((unsigned char *) base)[1] & 0x0f; - *base >>= PAGE_SHIFT; + *base >>= MMUPAGE_SHIFT; /* Power of two, at least 4K on ARR0-ARR6, 256K on ARR7 * Note: shift==0xf means 4G, this is unsupported. @@ -203,7 +203,7 @@ static void cyrix_set_arr(unsigned int r prepare_set(); - base <<= PAGE_SHIFT; + base <<= MMUPAGE_SHIFT; setCx86(arr, ((unsigned char *) &base)[3]); setCx86(arr + 1, ((unsigned char *) &base)[2]); setCx86(arr + 2, (((unsigned char *) &base)[1]) | arr_size); diff -prauN linux-2.5.70-bk10/arch/i386/kernel/cpu/mtrr/generic.c pgcl-2.5.70-bk10-1/arch/i386/kernel/cpu/mtrr/generic.c --- linux-2.5.70-bk10/arch/i386/kernel/cpu/mtrr/generic.c 2003-05-26 18:00:41.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/kernel/cpu/mtrr/generic.c 2003-06-05 09:44:34.000000000 -0700 @@ -131,13 +131,13 @@ void generic_get_mtrr(unsigned int reg, rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); /* Work out the shifted address mask. */ - mask_lo = size_or_mask | mask_hi << (32 - PAGE_SHIFT) - | mask_lo >> PAGE_SHIFT; + mask_lo = size_or_mask | mask_hi << (32 - MMUPAGE_SHIFT) + | mask_lo >> MMUPAGE_SHIFT; /* This works correctly if size is a power of two, i.e. a contiguous range. 
*/ *size = -mask_lo; - *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; + *base = base_hi << (32 - MMUPAGE_SHIFT) | base_lo >> MMUPAGE_SHIFT; *type = base_lo & 0xff; } @@ -317,10 +317,10 @@ static void generic_set_mtrr(unsigned in relevant mask register to disable a range. */ wrmsr(MTRRphysMask_MSR(reg), 0, 0); } else { - wrmsr(MTRRphysBase_MSR(reg), base << PAGE_SHIFT | type, - (base & size_and_mask) >> (32 - PAGE_SHIFT)); - wrmsr(MTRRphysMask_MSR(reg), -size << PAGE_SHIFT | 0x800, - (-size & size_and_mask) >> (32 - PAGE_SHIFT)); + wrmsr(MTRRphysBase_MSR(reg), base << MMUPAGE_SHIFT | type, + (base & size_and_mask) >> (32 - MMUPAGE_SHIFT)); + wrmsr(MTRRphysMask_MSR(reg), -size << MMUPAGE_SHIFT | 0x800, + (-size & size_and_mask) >> (32 - MMUPAGE_SHIFT)); } post_set(); @@ -335,7 +335,7 @@ int generic_validate_add_page(unsigned l if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 1 && boot_cpu_data.x86_mask <= 7) { - if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { + if (base & ((1 << (22 - MMUPAGE_SHIFT)) - 1)) { printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); return -EINVAL; } diff -prauN linux-2.5.70-bk10/arch/i386/kernel/cpu/mtrr/if.c pgcl-2.5.70-bk10-1/arch/i386/kernel/cpu/mtrr/if.c --- linux-2.5.70-bk10/arch/i386/kernel/cpu/mtrr/if.c 2003-05-26 18:00:26.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/kernel/cpu/mtrr/if.c 2003-06-05 09:44:34.000000000 -0700 @@ -33,10 +33,10 @@ mtrr_file_add(unsigned long base, unsign FILE_FCOUNT(file) = fcount; } if (!page) { - if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) + if ((base & (MMUPAGE_SIZE - 1)) || (size & (MMUPAGE_SIZE - 1))) return -EINVAL; - base >>= PAGE_SHIFT; - size >>= PAGE_SHIFT; + base >>= MMUPAGE_SHIFT; + size >>= MMUPAGE_SHIFT; } reg = mtrr_add_page(base, size, type, 1); if (reg >= 0) @@ -52,10 +52,10 @@ mtrr_file_del(unsigned long base, unsign unsigned int *fcount = FILE_FCOUNT(file); if (!page) { - if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) + if ((base & (MMUPAGE_SIZE - 1)) || (size & (MMUPAGE_SIZE - 1))) return -EINVAL; - base >>= PAGE_SHIFT; - size >>= PAGE_SHIFT; + base >>= MMUPAGE_SHIFT; + size >>= MMUPAGE_SHIFT; } reg = mtrr_del_page(-1, base, size); if (reg < 0) @@ -119,8 +119,8 @@ mtrr_write(struct file *file, const char for (i = 0; i < MTRR_NUM_TYPES; ++i) { if (strcmp(ptr, mtrr_strings[i])) continue; - base >>= PAGE_SHIFT; - size >>= PAGE_SHIFT; + base >>= MMUPAGE_SHIFT; + size >>= MMUPAGE_SHIFT; err = mtrr_add_page((unsigned long) base, (unsigned long) size, i, 1); @@ -194,8 +194,8 @@ mtrr_ioctl(struct inode *inode, struct f || gentry.size == 0x100000) gentry.base = gentry.size = gentry.type = 0; else { - gentry.base <<= PAGE_SHIFT; - gentry.size <<= PAGE_SHIFT; + gentry.base <<= MMUPAGE_SHIFT; + gentry.size <<= MMUPAGE_SHIFT; gentry.type = type; } @@ -320,18 +320,18 @@ static int mtrr_seq_show(struct seq_file if (size == 0) usage_table[i] = 0; else { - if (size < (0x100000 >> PAGE_SHIFT)) { + if (size < (0x100000 >> MMUPAGE_SHIFT)) { /* less than 1MB */ factor = 'K'; - size <<= PAGE_SHIFT - 10; + size <<= MMUPAGE_SHIFT - 10; } else { factor = 'M'; - size >>= 20 - PAGE_SHIFT; + size >>= 20 - MMUPAGE_SHIFT; } /* RED-PEN: base can be > 32bit */ len += seq_printf(seq, "reg%02i: base=0x%05lx000 (%4liMB), size=%4i%cB: %s, count=%d\n", - i, base, base >> (20 - PAGE_SHIFT), size, factor, + i, base, base >> (20 - MMUPAGE_SHIFT), size, factor, attrib_to_str(type), usage_table[i]); } } diff -prauN 
linux-2.5.70-bk10/arch/i386/kernel/cpu/mtrr/main.c pgcl-2.5.70-bk10-1/arch/i386/kernel/cpu/mtrr/main.c --- linux-2.5.70-bk10/arch/i386/kernel/cpu/mtrr/main.c 2003-05-26 18:00:21.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/kernel/cpu/mtrr/main.c 2003-06-05 09:44:34.000000000 -0700 @@ -410,12 +410,12 @@ int mtrr_add(unsigned long base, unsigned long size, unsigned int type, char increment) { - if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + if ((base & (MMUPAGE_SIZE - 1)) || (size & (MMUPAGE_SIZE - 1))) { printk("mtrr: size and base must be multiples of 4 kiB\n"); printk("mtrr: size: 0x%lx base: 0x%lx\n", size, base); return -EINVAL; } - return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, + return mtrr_add_page(base >> MMUPAGE_SHIFT, size >> MMUPAGE_SHIFT, type, increment); } @@ -506,12 +506,12 @@ int mtrr_del_page(int reg, unsigned long int mtrr_del(int reg, unsigned long base, unsigned long size) { - if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + if ((base & (MMUPAGE_SIZE - 1)) || (size & (MMUPAGE_SIZE - 1))) { printk("mtrr: size and base must be multiples of 4 kiB\n"); printk("mtrr: size: 0x%lx base: 0x%lx\n", size, base); return -EINVAL; } - return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); + return mtrr_del_page(reg, base >> MMUPAGE_SHIFT, size >> MMUPAGE_SHIFT); } EXPORT_SYMBOL(mtrr_add); @@ -579,7 +579,7 @@ static int __init mtrr_init(void) u32 phys_addr; phys_addr = cpuid_eax(0x80000008) & 0xff; size_or_mask = - ~((1 << (phys_addr - PAGE_SHIFT)) - 1); + ~((1 << (phys_addr - MMUPAGE_SHIFT)) - 1); size_and_mask = ~size_or_mask & 0xfff00000; } /* Athlon MTRRs use an Intel-compatible interface for diff -prauN linux-2.5.70-bk10/arch/i386/kernel/entry.S pgcl-2.5.70-bk10-1/arch/i386/kernel/entry.S --- linux-2.5.70-bk10/arch/i386/kernel/entry.S 2003-06-05 05:43:43.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/kernel/entry.S 2003-06-05 09:44:34.000000000 -0700 @@ -160,7 +160,7 @@ do_lcall: movl %eax,EFLAGS(%ebp) # movl %edx,EIP(%ebp) # Now we move them to their "normal" places movl %ecx,CS(%ebp) # - andl $-8192, %ebp # GET_THREAD_INFO + andl $~(THREAD_SIZE-1), %ebp # GET_THREAD_INFO movl TI_EXEC_DOMAIN(%ebp), %edx # Get the execution domain call *4(%edx) # Call the lcall7 handler for the domain addl $4, %esp diff -prauN linux-2.5.70-bk10/arch/i386/kernel/head.S pgcl-2.5.70-bk10-1/arch/i386/kernel/head.S --- linux-2.5.70-bk10/arch/i386/kernel/head.S 2003-05-26 18:00:23.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/kernel/head.S 2003-06-05 09:44:34.000000000 -0700 @@ -16,6 +16,7 @@ #include #include #include +#include #define OLD_CL_MAGIC_ADDR 0x90020 #define OLD_CL_MAGIC 0xA33F @@ -325,7 +326,7 @@ rp_sidt: ret ENTRY(stack_start) - .long init_thread_union+8192 + .long init_thread_union+THREAD_SIZE .long __BOOT_DS /* This is the default interrupt "handler" :-) */ diff -prauN linux-2.5.70-bk10/arch/i386/kernel/microcode.c pgcl-2.5.70-bk10-1/arch/i386/kernel/microcode.c --- linux-2.5.70-bk10/arch/i386/kernel/microcode.c 2003-05-26 18:00:26.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/kernel/microcode.c 2003-06-05 09:44:34.000000000 -0700 @@ -319,7 +319,7 @@ static ssize_t microcode_write(struct fi sizeof(struct microcode)); return -EINVAL; } - if ((len >> PAGE_SHIFT) > num_physpages) { + if ((len >> MMUPAGE_SHIFT) > num_physpages) { printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages); return -EINVAL; } diff -prauN linux-2.5.70-bk10/arch/i386/kernel/mpparse.c 
pgcl-2.5.70-bk10-1/arch/i386/kernel/mpparse.c --- linux-2.5.70-bk10/arch/i386/kernel/mpparse.c 2003-05-26 18:00:27.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/kernel/mpparse.c 2003-06-05 09:44:34.000000000 -0700 @@ -710,7 +710,7 @@ static int __init smp_scan_config (unsig smp_found_config = 1; printk(KERN_INFO "found SMP MP-table at %08lx\n", virt_to_phys(mpf)); - reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE); + reserve_bootmem(virt_to_phys(mpf), MMUPAGE_SIZE); if (mpf->mpf_physptr) { /* * We cannot access to MPC table to compute @@ -721,8 +721,8 @@ static int __init smp_scan_config (unsig * PAGE_SIZE from mpg->mpf_physptr yields BUG() * in reserve_bootmem. */ - unsigned long size = PAGE_SIZE; - unsigned long end = max_low_pfn * PAGE_SIZE; + unsigned long size = MMUPAGE_SIZE; + unsigned long end = max_low_pfn * MMUPAGE_SIZE; if (mpf->mpf_physptr + size > end) size = end - mpf->mpf_physptr; reserve_bootmem(mpf->mpf_physptr, size); diff -prauN linux-2.5.70-bk10/arch/i386/kernel/numaq.c pgcl-2.5.70-bk10-1/arch/i386/kernel/numaq.c --- linux-2.5.70-bk10/arch/i386/kernel/numaq.c 2003-05-26 18:00:45.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/kernel/numaq.c 2003-06-05 09:44:34.000000000 -0700 @@ -30,10 +30,7 @@ #include #include -/* These are needed before the pgdat's are created */ -extern long node_start_pfn[], node_end_pfn[]; - -#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) +#define MB_TO_PAGES(addr) ((addr) << (20 - MMUPAGE_SHIFT)) /* * Function: smp_dump_qct() diff -prauN linux-2.5.70-bk10/arch/i386/kernel/setup.c pgcl-2.5.70-bk10-1/arch/i386/kernel/setup.c --- linux-2.5.70-bk10/arch/i386/kernel/setup.c 2003-06-05 05:43:43.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/kernel/setup.c 2003-06-05 09:44:34.000000000 -0700 @@ -557,6 +557,8 @@ void __init find_max_pfn(void) continue; if (end > max_pfn) max_pfn = end; + + max_pfn &= ~(PAGE_MMUCOUNT - 1); } } @@ -567,6 +569,8 @@ unsigned long __init find_max_low_pfn(vo { unsigned long max_low_pfn; + printk("MAXMEM = %p\n", (void *)MAXMEM); + max_low_pfn = max_pfn; if (max_low_pfn > MAXMEM_PFN) { if (highmem_pages == -1) @@ -680,10 +684,10 @@ static unsigned long __init setup_memory highstart_pfn = max_low_pfn; } printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", - pages_to_mb(highend_pfn - highstart_pfn)); + (highend_pfn - highstart_pfn) >> (20 - MMUPAGE_SHIFT)); #endif printk(KERN_NOTICE "%ldMB LOWMEM available.\n", - pages_to_mb(max_low_pfn)); + max_low_pfn >> (20 - MMUPAGE_SHIFT)); /* * Initialize the boot-time allocator (with low memory only): */ @@ -704,7 +708,7 @@ static unsigned long __init setup_memory * reserve physical page 0 - it's a special BIOS page on many boxes, * enabling clean reboots, SMP operation, laptop functions. */ - reserve_bootmem(0, PAGE_SIZE); + reserve_bootmem(0, MMUPAGE_SIZE); #ifdef CONFIG_SMP /* @@ -712,7 +716,7 @@ static unsigned long __init setup_memory * FIXME: Don't need the extra page at 4K, but need to fix * trampoline before removing it. (see the GDT stuff) */ - reserve_bootmem(PAGE_SIZE, PAGE_SIZE); + reserve_bootmem(MMUPAGE_SIZE, MMUPAGE_SIZE); #endif #ifdef CONFIG_ACPI_SLEEP /* @@ -729,7 +733,7 @@ static unsigned long __init setup_memory #ifdef CONFIG_BLK_DEV_INITRD if (LOADER_TYPE && INITRD_START) { - if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) { + if (INITRD_START + INITRD_SIZE <= PFN_PHYS(max_low_pfn)) { reserve_bootmem(INITRD_START, INITRD_SIZE); initrd_start = INITRD_START ? 
INITRD_START + PAGE_OFFSET : 0; @@ -739,7 +743,7 @@ static unsigned long __init setup_memory printk(KERN_ERR "initrd extends beyond end of memory " "(0x%08lx > 0x%08lx)\ndisabling initrd\n", INITRD_START + INITRD_SIZE, - max_low_pfn << PAGE_SHIFT); + PFN_PHYS(max_low_pfn)); initrd_start = 0; } } @@ -793,7 +797,7 @@ static void __init register_memory(unsig request_resource(&ioport_resource, standard_io_resources+i); /* Tell the PCI layer not to allocate too close to the RAM area.. */ - low_mem_size = ((max_low_pfn << PAGE_SHIFT) + 0xfffff) & ~0xfffff; + low_mem_size = ((max_low_pfn << MMUPAGE_SHIFT) + 0xfffff) & ~0xfffff; if (low_mem_size > pci_mem_start) pci_mem_start = low_mem_size; } diff -prauN linux-2.5.70-bk10/arch/i386/kernel/smpboot.c pgcl-2.5.70-bk10-1/arch/i386/kernel/smpboot.c --- linux-2.5.70-bk10/arch/i386/kernel/smpboot.c 2003-05-26 18:00:39.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/kernel/smpboot.c 2003-06-05 09:44:34.000000000 -0700 @@ -100,7 +100,7 @@ static unsigned long __init setup_trampo */ void __init smp_alloc_memory(void) { - trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE); + trampoline_base = (void *) alloc_bootmem_low_pages(MMUPAGE_SIZE); /* * Has to be in very low memory so we can execute * real-mode AP code. diff -prauN linux-2.5.70-bk10/arch/i386/kernel/srat.c pgcl-2.5.70-bk10-1/arch/i386/kernel/srat.c --- linux-2.5.70-bk10/arch/i386/kernel/srat.c 2003-05-26 18:00:25.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/kernel/srat.c 2003-06-05 09:44:34.000000000 -0700 @@ -24,23 +24,20 @@ * Send feedback to Pat Gaughen */ #include +#include #include #include #include #include #include +#include /* * proximity macros and definitions */ -#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */ -#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */ -#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit)) -#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit))) -#define MAX_PXM_DOMAINS 256 /* 1 byte and no promises about values */ -/* bitmap length; _PXM is at most 255 */ -#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) -static u8 pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */ +#define MAX_PXM_DOMAINS 256 /* 1 byte and no promises about values */ + +static DECLARE_BITMAP(pxm_bitmap, MAX_PXM_DOMAINS); #define MAX_CHUNKS_PER_NODE 4 #define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES) @@ -57,10 +54,6 @@ static int num_memory_chunks; /* total static int zholes_size_init; static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES]; -extern unsigned long node_start_pfn[], node_end_pfn[]; - -extern void * boot_ioremap(unsigned long, unsigned long); - /* Identify CPU proximity domains */ static void __init parse_cpu_affinity_structure(char *p) { @@ -71,7 +64,7 @@ static void __init parse_cpu_affinity_st return; /* empty entry */ /* mark this node as "seen" in node bitmap */ - BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain); + set_bit(cpu_affinity->proximity_domain, pxm_bitmap); printk("CPU 0x%02X in proximity domain 0x%02X\n", cpu_affinity->apic_id, cpu_affinity->proximity_domain); @@ -94,7 +87,7 @@ static void __init parse_memory_affinity return; /* empty entry */ /* mark this node as "seen" in node bitmap */ - BMAP_SET(pxm_bitmap, memory_affinity->proximity_domain); + set_bit(memory_affinity->proximity_domain, pxm_bitmap); /* calculate info for memory chunk structure */ paddr = memory_affinity->base_addr_hi; @@ -102,8 +95,8 @@ static void __init 
parse_memory_affinity size = memory_affinity->length_hi; size = (size << 32) | memory_affinity->length_lo; - start_pfn = paddr >> PAGE_SHIFT; - end_pfn = (paddr + size) >> PAGE_SHIFT; + start_pfn = paddr >> MMUPAGE_SHIFT; + end_pfn = (paddr + size) >> MMUPAGE_SHIFT; pxm = memory_affinity->proximity_domain; @@ -140,25 +133,20 @@ static void __init parse_memory_affinity #if MAX_NR_ZONES != 3 #error "MAX_NR_ZONES != 3, chunk_to_zone requires review" #endif -/* Take a chunk of pages from page frame cstart to cend and count the number - * of pages in each zone, returned via zones[]. +/* + * Take a chunk of pages from page frame cstart to cend and count the number + * of pages in each zone, returned via zones[]. This has a hardcoded bias + * to round up; for uses other than holes, introduce a bias argument to + * round differently in each case. */ static __init void chunk_to_zones(unsigned long cstart, unsigned long cend, unsigned long *zones) { - unsigned long max_dma; - extern unsigned long max_low_pfn; - + unsigned long rend, max_dma = __pa(MAX_DMA_ADDRESS)/MMUPAGE_SIZE; int z; - unsigned long rend; - - /* FIXME: MAX_DMA_ADDRESS and max_low_pfn are trying to provide - * similarly scoped information and should be handled in a consistant - * manner. - */ - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - /* Split the hole into the zones in which it falls. Repeatedly + /* + * Split the hole into the zones in which it falls. Repeatedly * take the segment in which the remaining hole starts, round it * to the end of that zone. */ @@ -176,7 +164,7 @@ static __init void chunk_to_zones(unsign z = ZONE_HIGHMEM; rend = cend; } - zones[z] += rend - cstart; + zones[z] += (rend - cstart + PAGE_MMUCOUNT - 1)/PAGE_MMUCOUNT; cstart = rend; } } @@ -192,9 +180,7 @@ static void __init initialize_physnode_m for (i = num_memory_chunks; --i >= 0; nmcp++) { for (pfn = nmcp->start_pfn; pfn <= nmcp->end_pfn; pfn += PAGES_PER_ELEMENT) - { - physnode_map[pfn / PAGES_PER_ELEMENT] = (int)nmcp->nid; - } + physnode_map[pfn/PAGES_PER_ELEMENT] = (int)nmcp->nid; } } @@ -245,7 +231,7 @@ static int __init acpi20_parse_srat(stru */ numnodes = 0; /* init total nodes in system */ for (i = 0; i < MAX_PXM_DOMAINS; i++) { - if (BMAP_TEST(pxm_bitmap, i)) { + if (test_bit(i, pxm_bitmap)) { pxm_to_nid_map[i] = numnodes; nid_to_pxm_map[numnodes] = i; node_set_online(numnodes); @@ -253,8 +239,7 @@ static int __init acpi20_parse_srat(stru } } - if (numnodes == 0) - BUG(); + BUG_ON(!numnodes); /* set cnode id in memory chunk structure */ for (i = 0; i < num_memory_chunks; i++) @@ -263,25 +248,24 @@ static int __init acpi20_parse_srat(stru initialize_physnode_map(); printk("pxm bitmap: "); - for (i = 0; i < sizeof(pxm_bitmap); i++) { - printk("%02X ", pxm_bitmap[i]); - } + for (i = 0; i < sizeof(pxm_bitmap); i++) + printk("%08X ", pxm_bitmap[i]); + printk("\n"); printk("Number of logical nodes in system = %d\n", numnodes); printk("Number of memory chunks in system = %d\n", num_memory_chunks); - for (j = 0; j < num_memory_chunks; j++){ + for (j = 0; j < num_memory_chunks; j++) printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", j, node_memory_chunk[j].nid, node_memory_chunk[j].start_pfn, node_memory_chunk[j].end_pfn); - } /*calculate node_start_pfn/node_end_pfn arrays*/ for (nid = 0; nid < numnodes; nid++) { int been_here_before = 0; - for (j = 0; j < num_memory_chunks; j++){ + for (j = 0; j < num_memory_chunks; j++) { if (node_memory_chunk[j].nid == nid) { if (been_here_before == 0) { node_start_pfn[nid] = 
node_memory_chunk[j].start_pfn; @@ -397,28 +381,28 @@ printk("Begin table scan....\n"); */ static void __init get_zholes_init(void) { - int nid; - int c; - int first; + int nid, c, first; unsigned long end = 0; for (nid = 0; nid < numnodes; nid++) { first = 1; - for (c = 0; c < num_memory_chunks; c++){ - if (node_memory_chunk[c].nid == nid) { - if (first) { - end = node_memory_chunk[c].end_pfn; - first = 0; - - } else { - /* Record any gap between this chunk - * and the previous chunk on this node - * against the zones it spans. - */ - chunk_to_zones(end, - node_memory_chunk[c].start_pfn, - &zholes_size[nid * MAX_NR_ZONES]); - } + for (c = 0; c < num_memory_chunks; c++) { + if (node_memory_chunk[c].nid != nid) + continue; + + /* + * Record any gap between this chunk and the + * previous chunk on this node against the zones + * it spans. Also, round up the sizes of holes + * subtracted out to PAGE_SIZE multiples. + */ + if (!first) + chunk_to_zones(end, + node_memory_chunk[c].start_pfn, + &zholes_size[MAX_NR_ZONES*nid]); + else { + end = node_memory_chunk[c].end_pfn; + first = 0; } } } @@ -430,7 +414,7 @@ unsigned long * __init get_zholes_size(i zholes_size_init++; get_zholes_init(); } - if((nid >= numnodes) | (nid >= MAX_NUMNODES)) + if (nid < 0 || nid >= numnodes || nid >= MAX_NUMNODES) printk("%s: nid = %d is invalid. numnodes = %d", __FUNCTION__, nid, numnodes); return &zholes_size[nid * MAX_NR_ZONES]; diff -prauN linux-2.5.70-bk10/arch/i386/kernel/sys_i386.c pgcl-2.5.70-bk10-1/arch/i386/kernel/sys_i386.c --- linux-2.5.70-bk10/arch/i386/kernel/sys_i386.c 2003-05-26 18:00:59.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/kernel/sys_i386.c 2003-06-05 09:44:34.000000000 -0700 @@ -97,10 +97,10 @@ asmlinkage int old_mmap(struct mmap_arg_ goto out; err = -EINVAL; - if (a.offset & ~PAGE_MASK) + if (a.offset & ~MMUPAGE_MASK) goto out; - err = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + err = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> MMUPAGE_SHIFT); out: return err; } diff -prauN linux-2.5.70-bk10/arch/i386/kernel/traps.c pgcl-2.5.70-bk10-1/arch/i386/kernel/traps.c --- linux-2.5.70-bk10/arch/i386/kernel/traps.c 2003-05-26 18:00:25.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/kernel/traps.c 2003-06-05 09:44:34.000000000 -0700 @@ -120,7 +120,7 @@ void show_trace_task(struct task_struct unsigned long esp = tsk->thread.esp; /* User space on another CPU? */ - if ((esp ^ (unsigned long)tsk->thread_info) & (PAGE_MASK<<1)) + if ((esp ^ (unsigned long)tsk->thread_info) & ~(THREAD_SIZE-1)) return; show_trace((unsigned long *)esp); } @@ -431,6 +431,7 @@ static void unknown_nmi_error(unsigned c reason, smp_processor_id()); printk("Dazed and confused, but trying to continue\n"); printk("Do you have a strange power saving mode enabled?\n"); + dump_stack(); } static void default_do_nmi(struct pt_regs * regs) diff -prauN linux-2.5.70-bk10/arch/i386/lib/getuser.S pgcl-2.5.70-bk10-1/arch/i386/lib/getuser.S --- linux-2.5.70-bk10/arch/i386/lib/getuser.S 2003-05-26 18:00:20.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/lib/getuser.S 2003-06-05 09:44:34.000000000 -0700 @@ -8,9 +8,9 @@ * return an error value in addition to the "real" * return value. 
*/ +#include #include - /* * __get_user_X * diff -prauN linux-2.5.70-bk10/arch/i386/mm/Makefile pgcl-2.5.70-bk10-1/arch/i386/mm/Makefile --- linux-2.5.70-bk10/arch/i386/mm/Makefile 2003-05-26 18:00:26.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/mm/Makefile 2003-06-05 09:48:26.000000000 -0700 @@ -2,7 +2,7 @@ # Makefile for the linux i386-specific parts of the memory manager. # -obj-y := init.o pgtable.o fault.o ioremap.o extable.o pageattr.o +obj-y := init.o pgtable.o fault.o ioremap.o extable.o pageattr.o tlb.o obj-$(CONFIG_DISCONTIGMEM) += discontig.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o diff -prauN linux-2.5.70-bk10/arch/i386/mm/discontig.c pgcl-2.5.70-bk10-1/arch/i386/mm/discontig.c --- linux-2.5.70-bk10/arch/i386/mm/discontig.c 2003-05-26 18:00:40.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/mm/discontig.c 2003-06-05 09:44:34.000000000 -0700 @@ -71,8 +71,6 @@ extern unsigned long max_low_pfn; extern unsigned long totalram_pages; extern unsigned long totalhigh_pages; -#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) - unsigned long node_remap_start_pfn[MAX_NUMNODES]; unsigned long node_remap_size[MAX_NUMNODES]; unsigned long node_remap_offset[MAX_NUMNODES]; @@ -129,7 +127,7 @@ static void __init allocate_pgdat(int ni if (nid) NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; else { - NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT)); + NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn*MMUPAGE_SIZE)); min_low_pfn += PFN_UP(sizeof(pg_data_t)); memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); } @@ -182,8 +180,8 @@ void __init remap_numa_kva(void) int node; for (node = 1; node < numnodes; ++node) { - for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { - vaddr = node_remap_start_vaddr[node]+(pfn< system_max_low_pfn) highstart_pfn = system_max_low_pfn; printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", - pages_to_mb(highend_pfn - highstart_pfn)); + (highend_pfn - highstart_pfn) >> (20 - MMUPAGE_SHIFT)); #endif system_max_low_pfn = max_low_pfn = max_low_pfn - reserve_pages; printk(KERN_NOTICE "%ldMB LOWMEM available.\n", - pages_to_mb(system_max_low_pfn)); + system_max_low_pfn >> (20 - MMUPAGE_SHIFT)); printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", min_low_pfn, max_low_pfn, highstart_pfn); @@ -261,6 +260,11 @@ unsigned long __init setup_memory(void) (ulong) pfn_to_kaddr(highstart_pfn)); for (nid = 0; nid < numnodes; nid++) find_max_pfn_node(nid); + printk("vmallocspace = [0x%lx, 0x%lx)\n", + VMALLOC_START, VMALLOC_END); + printk("fixmapspace = [0x%lx, 0x%lx)\n", + FIXADDR_START, FIXADDR_TOP); + printk("MAXMEM = 0x%lx\n", MAXMEM); NODE_DATA(0)->bdata = &node0_bdata; @@ -277,21 +281,21 @@ unsigned long __init setup_memory(void) * the (very unlikely) case of us accidentally initializing the * bootmem allocator with an invalid RAM area. */ - reserve_bootmem_node(NODE_DATA(0), HIGH_MEMORY, (PFN_PHYS(min_low_pfn) + - bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY)); + reserve_bootmem_node(NODE_DATA(0), HIGH_MEMORY, PFN_PHYS(min_low_pfn) + + bootmap_size - HIGH_MEMORY); /* * reserve physical page 0 - it's a special BIOS page on many boxes, * enabling clean reboots, SMP operation, laptop functions. */ - reserve_bootmem_node(NODE_DATA(0), 0, PAGE_SIZE); + reserve_bootmem_node(NODE_DATA(0), 0, MMUPAGE_SIZE); /* * But first pinch a few for the stack/trampoline stuff * FIXME: Don't need the extra page at 4K, but need to fix * trampoline before removing it. 
(see the GDT stuff) */ - reserve_bootmem_node(NODE_DATA(0), PAGE_SIZE, PAGE_SIZE); + reserve_bootmem_node(NODE_DATA(0), MMUPAGE_SIZE, MMUPAGE_SIZE); #ifdef CONFIG_ACPI_SLEEP /* @@ -307,7 +311,7 @@ unsigned long __init setup_memory(void) #ifdef CONFIG_BLK_DEV_INITRD if (LOADER_TYPE && INITRD_START) { - if (INITRD_START + INITRD_SIZE <= (system_max_low_pfn << PAGE_SHIFT)) { + if (INITRD_START + INITRD_SIZE <= (system_max_low_pfn << MMUPAGE_SHIFT)) { reserve_bootmem_node(NODE_DATA(0), INITRD_START, INITRD_SIZE); initrd_start = INITRD_START ? INITRD_START + PAGE_OFFSET : 0; @@ -317,7 +321,7 @@ unsigned long __init setup_memory(void) printk(KERN_ERR "initrd extends beyond end of memory " "(0x%08lx > 0x%08lx)\ndisabling initrd\n", INITRD_START + INITRD_SIZE, - system_max_low_pfn << PAGE_SHIFT); + system_max_low_pfn << MMUPAGE_SHIFT); initrd_start = 0; } } @@ -350,20 +354,20 @@ void __init zone_sizes_init(void) unsigned long start = node_start_pfn[nid]; unsigned long high = node_end_pfn[nid]; - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; + max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> MMUPAGE_SHIFT; if (start > low) { #ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = high - start; + zones_size[ZONE_HIGHMEM] = (high - start) >> PAGE_MMUSHIFT; #endif } else { if (low < max_dma) - zones_size[ZONE_DMA] = low; + zones_size[ZONE_DMA] = low >> PAGE_MMUSHIFT; else { - zones_size[ZONE_DMA] = max_dma; - zones_size[ZONE_NORMAL] = low - max_dma; + zones_size[ZONE_DMA] = max_dma >> PAGE_MMUSHIFT; + zones_size[ZONE_NORMAL] = (low - max_dma) >> PAGE_MMUSHIFT; #ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = high - low; + zones_size[ZONE_HIGHMEM] = (high - low) >> PAGE_MMUSHIFT; #endif } } @@ -403,10 +407,14 @@ void __init set_highmem_pages_init(int b zone_start_pfn = NODE_DATA(nid)->node_zones[ZONE_HIGHMEM].zone_start_pfn; printk("Initializing highpages for node %d\n", nid); - for (node_pfn = 0; node_pfn < node_high_size; node_pfn++) { - one_highpage_init((struct page *)(zone_mem_map + node_pfn), - zone_start_pfn + node_pfn, bad_ppro); - } + + /* + * Note: zone->spanned_pages is in PAGE_SIZE units. + */ + for (node_pfn = 0; node_pfn < node_high_size; node_pfn++) + one_highpage_init(&zone_mem_map[node_pfn], + zone_start_pfn + node_pfn*PAGE_MMUCOUNT, + bad_ppro); } totalram_pages += totalhigh_pages; #endif diff -prauN linux-2.5.70-bk10/arch/i386/mm/fault.c pgcl-2.5.70-bk10-1/arch/i386/mm/fault.c --- linux-2.5.70-bk10/arch/i386/mm/fault.c 2003-05-26 18:00:20.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/mm/fault.c 2003-06-05 09:44:34.000000000 -0700 @@ -20,6 +20,8 @@ #include #include /* For unblank_screen() */ #include +#include /* for max_low_pfn */ +#include #include #include @@ -73,19 +75,22 @@ asmlinkage void do_page_fault(struct pt_ struct mm_struct *mm; struct vm_area_struct * vma; unsigned long address; - unsigned long page; int write; siginfo_t info; /* get the address */ __asm__("movl %%cr2,%0":"=r" (address)); + pr_debug("faulted on %lx,", address); + /* It's safe to allow irq's after cr2 has been saved */ if (regs->eflags & X86_EFLAGS_IF) local_irq_enable(); tsk = current; + pr_debug(" pid = %d\n", current->pid); + /* * We fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. 
@@ -104,7 +109,20 @@ asmlinkage void do_page_fault(struct pt_ mm = tsk->mm; info.si_code = SEGV_MAPERR; - + if (1) { + pgd_t *pgd = pgd_offset(mm, address); + pmd_t *pmd = pmd_offset(pgd, address); + pr_debug("fault handled by PGD at vaddr %p, %Lx\n", + pgd, (u64)pgd_val(*pgd)); + pr_debug("fault handled by PMD at vaddr %p, %Lx\n", + pmd, (u64)pmd_val(*pmd)); + if (pmd_present(*pmd)) { + pr_debug("fault will be handled by PTE at paddr %Lx\n", + (u64)(pmd_val(*pmd) & MMUPAGE_MASK) + + pte_index(address)*sizeof(pte_t)); + } else + pr_debug("pmd not present\n"); + } /* * If we're in an interrupt, have no user context or are running in an * atomic region then we must not take the fault.. @@ -115,12 +133,16 @@ asmlinkage void do_page_fault(struct pt_ down_read(&mm->mmap_sem); vma = find_vma(mm, address); - if (!vma) + if (!vma) { + pr_debug("no vma, goto bad_area\n"); goto bad_area; + } if (vma->vm_start <= address) goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) + if (!(vma->vm_flags & VM_GROWSDOWN)) { + pr_debug("VM_GROWSDOWN not in vma->vm_flags, goto bad_area\n"); goto bad_area; + } if (error_code & 4) { /* * accessing the stack below %esp is always a bug. @@ -128,11 +150,15 @@ asmlinkage void do_page_fault(struct pt_ * pusha) doing post-decrement on the stack and that * doesn't show up until later.. */ - if (address + 32 < regs->esp) + if (address + 32 < regs->esp) { + pr_debug("postdecrement on stack, goto bad_area\n"); goto bad_area; } - if (expand_stack(vma, address)) + } + if (expand_stack(vma, address)) { + pr_debug("expand_stack() failed, goto bad_area\n"); goto bad_area; + } /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. @@ -144,20 +170,25 @@ good_area: default: /* 3: write, present */ #ifdef TEST_VERIFY_AREA if (regs->cs == KERNEL_CS) - printk("WP fault at %08lx\n", regs->eip); + pr_debug("WP fault at %08lx\n", regs->eip); #endif /* fall through */ case 2: /* write, not present */ - if (!(vma->vm_flags & VM_WRITE)) + if (!(vma->vm_flags & VM_WRITE)) { + pr_debug("vma not writable, goto bad_area\n"); goto bad_area; + } write++; break; case 1: /* read, present */ + pr_debug("NFI what happened, goto bad_area\n"); goto bad_area; case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + if (!(vma->vm_flags & (VM_READ | VM_EXEC))) { + pr_debug("vma not read/exec, goto bad_area\n"); goto bad_area; } + } survive: /* @@ -184,7 +215,7 @@ good_area: * Did it hit the DOS screen memory VA from vm86 mode? 
*/ if (regs->eflags & VM_MASK) { - unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; + unsigned long bit = (address - 0xA0000) >> MMUPAGE_SHIFT; if (bit < 32) tsk->thread.screen_bitmap |= 1 << bit; } @@ -200,6 +231,45 @@ bad_area: /* User mode accesses just cause a SIGSEGV */ if (error_code & 4) { + printk("user mode SIGSEGV, pid = %d, comm = %16s, EIP = %p, ESP = %p, CR2 = %p\n", + current->pid, current->comm, (void *)regs->eip, (void *)regs->esp, (void *)address); + spin_lock(&mm->page_table_lock); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + unsigned long addr; + + printk("vma = [%lx,%lx) prot=%lx flags=%lx\n", + vma->vm_start, vma->vm_end, + vma->vm_page_prot.pgprot, vma->vm_flags); + + for (addr = vma->vm_start; addr < vma->vm_end; addr += MMUPAGE_SIZE) { + pgd_t *pgd = pgd_offset(mm, addr); + pmd_t *pmd; + pte_t *pte; + struct page *page; + void *mem; + + if (pgd_none(*pgd) || pgd_bad(*pgd)) + continue; + + pmd = pmd_offset(pgd, addr); + if (pmd_none(*pmd) || pmd_bad(*pmd)) + continue; + + pte = pte_offset_map(pmd, addr); + if (pte_none(*pte) || !pte_present(*pte) || + !pfn_valid(pte_pfn(*pte))) { + pte_unmap(pte); + continue; + } + page = pte_page(*pte); + mem = kmap_atomic(page, KM_USER0); + if (!memcmp(mem, page_address(ZERO_PAGE(0)), PAGE_SIZE)) + printk("page at 0x%lx zero!\n", addr); + kunmap_atomic(mem, KM_USER0); + pte_unmap(pte); + } + } + spin_unlock(&mm->page_table_lock); tsk->thread.cr2 = address; tsk->thread.error_code = error_code; tsk->thread.trap_no = 14; @@ -207,6 +277,13 @@ bad_area: info.si_errno = 0; /* info.si_code has been set above */ info.si_addr = (void *)address; +#if 0 + if (current->pid >= 1024) { + while (1) { + schedule_timeout(HZ); + } + } +#endif force_sig_info(SIGSEGV, &info, tsk); return; } @@ -239,30 +316,53 @@ no_context: bust_spinlocks(1); - if (address < PAGE_SIZE) + if (address < MMUPAGE_SIZE) printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); else printk(KERN_ALERT "Unable to handle kernel paging request"); printk(" at virtual address %08lx\n",address); printk(" printing eip:\n"); printk("%08lx\n", regs->eip); - asm("movl %%cr3,%0":"=r" (page)); - page = ((unsigned long *) __va(page))[address >> 22]; - printk(KERN_ALERT "*pde = %08lx\n", page); - /* - * We must not directly access the pte in the highpte - * case, the page table might be allocated in highmem. - * And lets rather not kmap-atomic the pte, just in case - * it's allocated already. - */ -#ifndef CONFIG_HIGHPTE - if (page & 1) { - page &= PAGE_MASK; - address &= 0x003ff000; - page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; - printk(KERN_ALERT "*pte = %08lx\n", page); + { + unsigned long cr3; + pgd_t *pgd; + pmd_t *pmd; + char *fmt; + + asm("movl %%cr3,%0":"=r" (cr3)); + cr3 &= ~0x1f; /* lower 5 bits of %cr3 are flags */ + /* pgd's in lowmem, but only need to be < 4G (32-bit %cr3) */ + pgd = (pgd_t *)__va(cr3); + fmt = PTRS_PER_PMD > 1 ? 
KERN_ALERT "*pdpte = %Lx\n" : NULL; + if (PTRS_PER_PMD > 1) + printk(fmt, pgd_val(*pgd)); + + /* pmd's in lowmem, but can be anywhere (64-bit PDPTE) */ + pmd = pmd_offset(pgd, address); + if (PTRS_PER_PMD > 1) + fmt = KERN_ALERT "*pde = %Lx\n"; + else + fmt = KERN_ALERT "*pde = %08lx\n"; + printk(fmt, pmd_val(*pmd)); + + /* + * this is getting at what are potentially user + * PTE's with pte_offset_kernel(); it's mostly + * unsafe to try editing kernel PTE's at this + * point for kmap_atomic() so just drop out of it + * if pmd_val(*pmd)/MMUPAGE_SIZE > max_low_pfn + */ + + if (pmd_present(*pmd) && !pmd_large(*pmd) + && pmd_val(*pmd)/MMUPAGE_SIZE <= max_low_pfn) { + pte_t *pte = pte_offset_kernel(pmd, address); + if (PTRS_PER_PMD > 1) + fmt = KERN_ALERT "*pte = %Lx\n"; + else + fmt = KERN_ALERT "*pte = %08lx\n"; + printk(fmt, pte_val(*pte)); + } } -#endif die("Oops", regs, error_code); bust_spinlocks(0); do_exit(SIGKILL); @@ -290,6 +390,7 @@ do_sigbus: * Send a sigbus, regardless of whether we were in kernel * or user mode. */ + pr_debug("sending SIGBUS\n"); tsk->thread.cr2 = address; tsk->thread.error_code = error_code; tsk->thread.trap_no = 14; @@ -318,12 +419,16 @@ vmalloc_fault: pmd_t *pmd, *pmd_k; pte_t *pte_k; + pr_debug("took vmalloc fault on address %lx\n", address); + asm("movl %%cr3,%0":"=r" (pgd)); pgd = index + (pgd_t *)__va(pgd); pgd_k = init_mm.pgd + index; - if (!pgd_present(*pgd_k)) + if (!pgd_present(*pgd_k)) { + printk("missing pgd in vmalloc fault!\n"); goto no_context; + } /* * set_pgd(pgd, *pgd_k); here would be useless on PAE @@ -332,13 +437,17 @@ vmalloc_fault: pmd = pmd_offset(pgd, address); pmd_k = pmd_offset(pgd_k, address); - if (!pmd_present(*pmd_k)) + if (!pmd_present(*pmd_k)) { + printk("missing pmd in vmalloc fault!\n"); goto no_context; + } set_pmd(pmd, *pmd_k); pte_k = pte_offset_kernel(pmd_k, address); - if (!pte_present(*pte_k)) + if (!pte_present(*pte_k)) { + printk("missing pte in vmalloc fault!\n"); goto no_context; + } return; } } diff -prauN linux-2.5.70-bk10/arch/i386/mm/highmem.c pgcl-2.5.70-bk10-1/arch/i386/mm/highmem.c --- linux-2.5.70-bk10/arch/i386/mm/highmem.c 2003-05-26 18:00:40.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/mm/highmem.c 2003-06-05 09:44:34.000000000 -0700 @@ -1,5 +1,9 @@ #include +/* + * XXX: resurrect kmap_pte + */ + void *kmap(struct page *page) { might_sleep(); @@ -28,44 +32,67 @@ void kunmap(struct page *page) void *kmap_atomic(struct page *page, enum km_type type) { enum fixed_addresses idx; - unsigned long vaddr; + unsigned long addr, vaddr, pfn; + int k; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; inc_preempt_count(); if (page < highmem_start_page) return page_address(page); idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + vaddr = __fix_to_virt(FIX_KMAP_END) + PAGE_SIZE*idx; + WARN_ON(vaddr > __fix_to_virt(FIX_KMAP_BEGIN)); + WARN_ON(vaddr < __fix_to_virt(FIX_KMAP_END)); + pfn = page_to_pfn(page); + addr = vaddr; + pgd = pgd_offset_k(addr); + pmd = pmd_offset(pgd, addr); + pte = pte_offset_kernel(pmd, addr); + for (k = 0; k < PAGE_MMUCOUNT; ++k, addr += MMUPAGE_SIZE) { #ifdef CONFIG_DEBUG_HIGHMEM - if (!pte_none(*(kmap_pte-idx))) - BUG(); + BUG_ON(!pte_none(pte[k])); #endif - set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); - __flush_tlb_one(vaddr); - - return (void*) vaddr; + set_pte(&pte[k], pfn_pte(pfn + k, kmap_prot)); + __flush_tlb_one(addr); + } + return (void *)vaddr; } void kunmap_atomic(void *kvaddr, enum km_type type) { #ifdef CONFIG_DEBUG_HIGHMEM - unsigned long vaddr 
= (unsigned long) kvaddr & PAGE_MASK; + unsigned long vaddr = (unsigned long)kvaddr & PAGE_MASK; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); + unsigned long lo, hi; + int k; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; if (vaddr < FIXADDR_START) { // FIXME dec_preempt_count(); return; } - if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)) - BUG(); + lo = __fix_to_virt(FIX_KMAP_END) + PAGE_SIZE*idx; + hi = lo + PAGE_SIZE; + + BUG_ON(vaddr < lo || vaddr > hi); /* * force other mappings to Oops if they'll try to access * this pte without first remap it */ - pte_clear(kmap_pte-idx); - __flush_tlb_one(vaddr); + pgd = pgd_offset_k(vaddr); + pmd = pmd_offset(pgd, vaddr); + pte = pte_offset_kernel(pmd, vaddr); + for (k = 0; k < PAGE_MMUCOUNT; ++k, vaddr += MMUPAGE_SIZE) { + pte_clear(&pte[k]); + __flush_tlb_one(vaddr); + } #endif dec_preempt_count(); @@ -73,14 +100,22 @@ void kunmap_atomic(void *kvaddr, enum km struct page *kmap_atomic_to_page(void *ptr) { - unsigned long idx, vaddr = (unsigned long)ptr; + unsigned long vaddr = (unsigned long)ptr; + pgd_t *pgd; + pmd_t *pmd; pte_t *pte; if (vaddr < FIXADDR_START) return virt_to_page(ptr); - idx = virt_to_fix(vaddr); - pte = kmap_pte - (idx - FIX_KMAP_BEGIN); + pgd = pgd_offset_k(vaddr); + pmd = pmd_offset(pgd, vaddr); + pte = pte_offset_kernel(pmd, vaddr); + + /* + * unsigned long idx = virt_to_fix(vaddr); + * pte = &kmap_pte[idx*PAGE_MMUCOUNT]; + */ return pte_page(*pte); } diff -prauN linux-2.5.70-bk10/arch/i386/mm/init.c pgcl-2.5.70-bk10-1/arch/i386/mm/init.c --- linux-2.5.70-bk10/arch/i386/mm/init.c 2003-05-26 18:00:45.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/mm/init.c 2003-06-05 09:48:26.000000000 -0700 @@ -41,8 +41,8 @@ #include #include -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; +struct page *zero_page; static int do_test_wp_bit(void); @@ -56,7 +56,7 @@ static pmd_t * __init one_md_table_init( pmd_t *pmd_table; #ifdef CONFIG_X86_PAE - pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); + pmd_table = (pmd_t *) alloc_bootmem_low_pages(MMUPAGE_SIZE); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); if (pmd_table != pmd_offset(pgd, 0)) BUG(); @@ -74,7 +74,7 @@ static pmd_t * __init one_md_table_init( static pte_t * __init one_page_table_init(pmd_t *pmd) { if (pmd_none(*pmd)) { - pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(MMUPAGE_SIZE); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); if (page_table != pte_offset_kernel(pmd, 0)) BUG(); @@ -95,6 +95,12 @@ static pte_t * __init one_page_table_ini * NOTE: The pagetables are allocated contiguous on the physical space * so we can cache the place of the first one and move around without * checking the pgd every time. + * + * Something happened here and I'm not sure what. This might back the + * thing out (I think). I think it was just a rename so I won't care + * unless it burns me. 
+ * + * -- wli */ static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) { @@ -111,7 +117,14 @@ static void __init page_table_range_init for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { if (pgd_none(*pgd)) one_md_table_init(pgd); + } + vaddr = start; + pgd_idx = pgd_index(vaddr); + pmd_idx = pmd_index(vaddr); + pgd = pgd_base + pgd_idx; + + for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { pmd = pmd_offset(pgd, vaddr); for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { if (pmd_none(*pmd)) @@ -180,8 +193,8 @@ static inline int page_is_ram(unsigned l * are not. Notably the 640->1Mb area. We need a sanity * check here. */ - addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; - end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT; + addr = (e820.map[i].addr+MMUPAGE_SIZE-1) >> MMUPAGE_SHIFT; + end = (e820.map[i].addr+e820.map[i].size) >> MMUPAGE_SHIFT; if ((pagenr >= addr) && (pagenr < end)) return 1; } @@ -189,37 +202,13 @@ static inline int page_is_ram(unsigned l } #ifdef CONFIG_HIGHMEM -pte_t *kmap_pte; pgprot_t kmap_prot; -#define kmap_get_fixmap_pte(vaddr) \ - pte_offset_kernel(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)) - -void __init kmap_init(void) -{ - unsigned long kmap_vstart; - - /* cache the first kmap pte */ - kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); - kmap_pte = kmap_get_fixmap_pte(kmap_vstart); - - kmap_prot = PAGE_KERNEL; -} +#define kmap_init() do { kmap_prot = PAGE_KERNEL; } while (0) void __init permanent_kmaps_init(pgd_t *pgd_base) { - pgd_t *pgd; - pmd_t *pmd; - pte_t *pte; - unsigned long vaddr; - - vaddr = PKMAP_BASE; - page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); - - pgd = swapper_pg_dir + pgd_index(vaddr); - pmd = pmd_offset(pgd, vaddr); - pte = pte_offset_kernel(pmd, vaddr); - pkmap_page_table = pte; + page_table_range_init(PKMAP_BASE, PKMAP_BASE + PAGE_SIZE*LAST_PKMAP, pgd_base); } void __init one_highpage_init(struct page *page, int pfn, int bad_ppro) @@ -238,7 +227,7 @@ void __init one_highpage_init(struct pag void __init set_highmem_pages_init(int bad_ppro) { int pfn; - for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) + for (pfn = highstart_pfn; pfn < highend_pfn; pfn += PAGE_MMUCOUNT) one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); totalram_pages += totalhigh_pages; } @@ -305,6 +294,34 @@ static void __init pagetable_init (void) */ pgd_base[0] = pgd_base[USER_PTRS_PER_PGD]; #endif + { + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + unsigned long addr = VMALLOC_START; + + do { + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd) || pgd_bad(*pgd)) { + addr += MMUPAGE_SIZE; + continue; + } + do { + pmd = pmd_offset(pgd, addr); + if (pmd_none(*pmd) || pmd_bad(*pmd)) { + addr += MMUPAGE_SIZE; + continue; + } + do { + pte = pte_offset_kernel(pmd, addr); + if (!pte_none(*pte) || pte_present(*pte)) { + printk("bad vmallocspace PTE at vaddr 0x%lx\n", addr); + } + addr += MMUPAGE_SIZE; + } while (addr < VMALLOC_END); + } while (addr < VMALLOC_END); + } while (addr < VMALLOC_END); + } } void zap_low_mappings (void) @@ -331,17 +348,17 @@ void __init zone_sizes_init(void) unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; unsigned int max_dma, high, low; - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; + max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> MMUPAGE_SHIFT; low = max_low_pfn; high = highend_pfn; if (low < max_dma) - zones_size[ZONE_DMA] = low; + zones_size[ZONE_DMA] = low >> PAGE_MMUSHIFT; else { - 
zones_size[ZONE_DMA] = max_dma; - zones_size[ZONE_NORMAL] = low - max_dma; + zones_size[ZONE_DMA] = max_dma >> PAGE_MMUSHIFT; + zones_size[ZONE_NORMAL] = (low - max_dma) >> PAGE_MMUSHIFT; #ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = high - low; + zones_size[ZONE_HIGHMEM] = (high - low) >> PAGE_MMUSHIFT; #endif } free_area_init(zones_size); @@ -372,7 +389,6 @@ void __init paging_init(void) set_in_cr4(X86_CR4_PAE); #endif __flush_tlb_all(); - kmap_init(); zone_sizes_init(); } @@ -418,6 +434,7 @@ static void __init set_max_mapnr_init(vo #else max_mapnr = num_physpages = max_low_pfn; #endif + max_mapnr /= PAGE_MMUCOUNT; } #define __free_all_bootmem() free_all_bootmem() #else @@ -425,11 +442,14 @@ static void __init set_max_mapnr_init(vo extern void set_max_mapnr_init(void); #endif /* !CONFIG_DISCONTIGMEM */ +/* + * Most of the reporting here needs doublechecking. + */ void __init mem_init(void) { extern int ppro_with_ram_bug(void); int codesize, reservedpages, datasize, initsize; - int tmp; + int pfn; int bad_ppro; #ifndef CONFIG_DISCONTIGMEM @@ -439,36 +459,32 @@ void __init mem_init(void) bad_ppro = ppro_with_ram_bug(); -#ifdef CONFIG_HIGHMEM - /* check that fixmap and pkmap do not overlap */ - if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { - printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n"); - printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n", - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START); - BUG(); - } -#endif - set_max_mapnr_init(); #ifdef CONFIG_HIGHMEM - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE); + high_memory = (void *) __va(highstart_pfn * MMUPAGE_SIZE); #else - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); + high_memory = (void *) __va(max_low_pfn * MMUPAGE_SIZE); #endif /* clear the zero-page */ - memset(empty_zero_page, 0, PAGE_SIZE); + memset(empty_zero_page, 0, MMUPAGE_SIZE); /* this will put all low memory onto the freelists */ totalram_pages += __free_all_bootmem(); + zero_page = alloc_page(GFP_ATOMIC|GFP_DMA); + clear_page(page_address(zero_page)); + SetPageReserved(zero_page); + tlb_init(); + totalram_pages--; + reservedpages = 0; - for (tmp = 0; tmp < max_low_pfn; tmp++) + for (pfn = 0; pfn < max_low_pfn; pfn += PAGE_MMUCOUNT) /* * Only count reserved RAM pages */ - if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) + if (page_is_ram(pfn) && PageReserved(pfn_to_page(pfn))) reservedpages++; set_highmem_pages_init(bad_ppro); @@ -479,13 +495,36 @@ void __init mem_init(void) printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), - num_physpages << (PAGE_SHIFT-10), + num_physpages << (MMUPAGE_SHIFT-10), codesize >> 10, reservedpages << (PAGE_SHIFT-10), datasize >> 10, initsize >> 10, (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) ); + printk("MAXMEM=0x%lx\n", MAXMEM); + printk("vmalloc: start = 0x%lx, end = 0x%lx\n", + VMALLOC_START, VMALLOC_END); + printk("fixaddr: start = 0x%lx, end = 0x%lx\n", + FIXADDR_START, FIXADDR_TOP); + +#ifdef CONFIG_HIGHMEM + printk("FIX_KMAP_END == %lx\n", __fix_to_virt(FIX_KMAP_END)); + if (__fix_to_virt(FIX_KMAP_END) % PAGE_SIZE) + printk(KERN_CRIT "kmap_atomic() area misaligned!\n"); + + printk("FIX_KMAP_BEGIN == %lx\n", __fix_to_virt(FIX_KMAP_BEGIN)); + if ((__fix_to_virt(FIX_KMAP_BEGIN) + MMUPAGE_SIZE) % PAGE_SIZE) + printk(KERN_CRIT "kmap_atomic() area misaligned!\n"); + + printk("FIX_PKMAP_END == %lx\n", __fix_to_virt(FIX_PKMAP_END)); + if 
(__fix_to_virt(FIX_PKMAP_END) % PAGE_SIZE) + printk(KERN_CRIT "kmap() area misaligned!\n"); + + printk("FIX_PKMAP_BEGIN == %lx\n", __fix_to_virt(FIX_PKMAP_BEGIN)); + if ((__fix_to_virt(FIX_PKMAP_BEGIN) + MMUPAGE_SIZE) % PAGE_SIZE) + printk(KERN_CRIT "kmap() area misaligned!\n"); +#endif #ifdef CONFIG_X86_PAE if (!cpu_has_pae) @@ -505,20 +544,30 @@ void __init mem_init(void) #endif } -#ifdef CONFIG_X86_PAE -struct kmem_cache_s *pae_pgd_cachep; +kmem_cache_t *pgd_cache; +kmem_cache_t *pmd_cache; void __init pgtable_cache_init(void) { - /* - * PAE pgds must be 16-byte aligned: - */ - pae_pgd_cachep = kmem_cache_create("pae_pgd", 32, 0, - SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, NULL, NULL); - if (!pae_pgd_cachep) - panic("init_pae(): Cannot alloc pae_pgd SLAB cache"); + if (PTRS_PER_PMD > 1) { + pmd_cache = kmem_cache_create("pmd", + PTRS_PER_PMD*sizeof(pmd_t), + 0, + SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, + pmd_ctor, + NULL); + if (!pmd_cache) + panic("pgtable_cache_init(): cannot create pmd cache"); + } + pgd_cache = kmem_cache_create("pgd", + PTRS_PER_PGD*sizeof(pgd_t), + 0, + SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, + pgd_ctor, + PTRS_PER_PMD == 1 ? pgd_dtor : NULL); + if (!pgd_cache) + panic("pgtable_cache_init(): Cannot create pgd cache"); } -#endif /* * This function cannot be __init, since exceptions don't work in that @@ -549,28 +598,43 @@ static int do_test_wp_bit(void) void free_initmem(void) { - unsigned long addr; + unsigned long addr, freed = 0;; addr = (unsigned long)(&__init_begin); - for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { + addr = (addr + PAGE_SIZE - 1) & PAGE_MASK; + while(addr < (((unsigned long)(&__init_end)) & PAGE_MASK)) { ClearPageReserved(virt_to_page(addr)); set_page_count(virt_to_page(addr), 1); free_page(addr); totalram_pages++; + freed++; + addr += PAGE_SIZE; } - printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", (&__init_end - &__init_begin) >> 10); + printk(KERN_INFO "Freeing unused kernel memory: %ldk freed\n", + freed*(PAGE_SIZE/1024)); } #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - if (start < end) - printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); - for (; start < end; start += PAGE_SIZE) { + unsigned long freed = 0; + + start = (start + PAGE_SIZE - 1) & PAGE_MASK; + end &= PAGE_MASK; + + if (start >= end) + return; + + while (start < end) { ClearPageReserved(virt_to_page(start)); set_page_count(virt_to_page(start), 1); free_page(start); totalram_pages++; + freed++; + start += PAGE_SIZE; } + + printk(KERN_INFO "Freeing initrd memory: %ldk freed\n", + freed*(PAGE_SIZE/1024)); } #endif diff -prauN linux-2.5.70-bk10/arch/i386/mm/ioremap.c pgcl-2.5.70-bk10-1/arch/i386/mm/ioremap.c --- linux-2.5.70-bk10/arch/i386/mm/ioremap.c 2003-05-26 18:00:26.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/mm/ioremap.c 2003-06-05 09:44:34.000000000 -0700 @@ -30,7 +30,7 @@ static inline void remap_area_pte(pte_t end = PMD_SIZE; if (address >= end) BUG(); - pfn = phys_addr >> PAGE_SHIFT; + pfn = phys_addr >> MMUPAGE_SHIFT; do { if (!pte_none(*pte)) { printk("remap_area_pte: page already exists\n"); @@ -38,7 +38,7 @@ static inline void remap_area_pte(pte_t } set_pte(pte, pfn_pte(pfn, __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | flags))); - address += PAGE_SIZE; + address += MMUPAGE_SIZE; pfn++; pte++; } while (address && (address < end)); @@ -196,7 +196,7 @@ void *ioremap_nocache (unsigned long phy if (phys_addr + size < 
virt_to_phys(high_memory)) { struct page *ppage = virt_to_page(__va(phys_addr)); - unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long npages = (size + MMUPAGE_SIZE - 1) >> MMUPAGE_SHIFT; BUG_ON(phys_addr+size > (unsigned long)high_memory); BUG_ON(phys_addr + size < phys_addr); @@ -216,7 +216,7 @@ void iounmap(void *addr) struct vm_struct *p; if (addr <= high_memory) return; - p = remove_vm_area((void *) (PAGE_MASK & (unsigned long) addr)); + p = remove_vm_area((void *) (MMUPAGE_MASK & (unsigned long) addr)); if (!p) { printk("__iounmap: bad address %p\n", addr); return; @@ -224,7 +224,7 @@ void iounmap(void *addr) if (p->flags && p->phys_addr < virt_to_phys(high_memory)) { change_page_attr(virt_to_page(__va(p->phys_addr)), - p->size >> PAGE_SHIFT, + p->size >> MMUPAGE_SHIFT, PAGE_KERNEL); global_flush_tlb(); } @@ -251,14 +251,14 @@ void __init *bt_ioremap(unsigned long ph /* * Mappings have to be page-aligned */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr) - phys_addr; + offset = phys_addr & ~MMUPAGE_MASK; + phys_addr &= MMUPAGE_MASK; + size = MMUPAGE_ALIGN(last_addr) - phys_addr; /* * Mappings have to fit in the FIX_BTMAP area. */ - nrpages = size >> PAGE_SHIFT; + nrpages = size >> MMUPAGE_SHIFT; if (nrpages > NR_FIX_BTMAPS) return NULL; @@ -268,7 +268,7 @@ void __init *bt_ioremap(unsigned long ph idx = FIX_BTMAP_BEGIN; while (nrpages > 0) { set_fixmap(idx, phys_addr); - phys_addr += PAGE_SIZE; + phys_addr += MMUPAGE_SIZE; --idx; --nrpages; } @@ -285,8 +285,8 @@ void __init bt_iounmap(void *addr, unsig virt_addr = (unsigned long)addr; if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) return; - offset = virt_addr & ~PAGE_MASK; - nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; + offset = virt_addr & ~MMUPAGE_MASK; + nrpages = MMUPAGE_ALIGN(offset + size - 1) >> MMUPAGE_SHIFT; idx = FIX_BTMAP_BEGIN; while (nrpages > 0) { diff -prauN linux-2.5.70-bk10/arch/i386/mm/pageattr.c pgcl-2.5.70-bk10-1/arch/i386/mm/pageattr.c --- linux-2.5.70-bk10/arch/i386/mm/pageattr.c 2003-05-26 18:00:39.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/mm/pageattr.c 2003-06-05 09:44:34.000000000 -0700 @@ -38,8 +38,8 @@ static struct page *split_large_page(uns address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); - for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { - pbase[i] = pfn_pte(addr >> PAGE_SHIFT, + for (i = 0; i < PTRS_PER_PTE; i++, addr += MMUPAGE_SIZE) { + pbase[i] = pfn_pte(addr/MMUPAGE_SIZE, addr == address ? 
prot : PAGE_KERNEL); } return base; @@ -58,19 +58,27 @@ static void flush_kernel_map(void *dummy static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) { + struct page *page; + unsigned long flags; + set_pte_atomic(kpte, pte); /* change init_mm */ -#ifndef CONFIG_X86_PAE - { - struct list_head *l; - spin_lock(&mmlist_lock); - list_for_each(l, &init_mm.mmlist) { - struct mm_struct *mm = list_entry(l, struct mm_struct, mmlist); - pmd_t *pmd = pmd_offset(pgd_offset(mm, address), address); + if (PTRS_PER_PMD > 1) + return; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + int k; + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + pgd_t *pgd; + pmd_t *pmd; + pgd = (pgd_t *)page_address(page) + + PTRS_PER_PGD * k + + pgd_index(address); + pmd = pmd_offset(pgd, address); set_pte_atomic((pte_t *)pmd, pte); - } - spin_unlock(&mmlist_lock); + } } -#endif + spin_unlock_irqrestore(&pgd_lock, flags); } /* @@ -82,7 +90,7 @@ static inline void revert_page(struct pa pte_t *linear = (pte_t *) pmd_offset(pgd_offset(&init_mm, address), address); set_pmd_pte(linear, address, - pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT, + pfn_pte((__pa(address) & LARGE_PAGE_MASK)/MMUPAGE_SIZE, PAGE_KERNEL_LARGE)); } @@ -94,15 +102,14 @@ __change_page_attr(struct page *page, pg struct page *kpte_page; #ifdef CONFIG_HIGHMEM - if (page >= highmem_start_page) - BUG(); + BUG_ON(page >= highmem_start_page); #endif address = (unsigned long)page_address(page); kpte = lookup_address(address); if (!kpte) return -EINVAL; - kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); + kpte_page = virt_to_page(((unsigned long)kpte) & MMUPAGE_MASK); if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { if ((pte_val(*kpte) & _PAGE_PSE) == 0) { pte_t old = *kpte; @@ -159,6 +166,8 @@ int change_page_attr(struct page *page, struct page *fpage; int i; + numpages = (numpages + PAGE_MMUCOUNT - 1)& ~(PAGE_MMUCOUNT-1); + down_write(&init_mm.mmap_sem); for (i = 0; i < numpages; i++, page++) { fpage = NULL; diff -prauN linux-2.5.70-bk10/arch/i386/mm/pgtable.c pgcl-2.5.70-bk10-1/arch/i386/mm/pgtable.c --- linux-2.5.70-bk10/arch/i386/mm/pgtable.c 2003-05-26 18:01:03.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/mm/pgtable.c 2003-06-05 09:48:26.000000000 -0700 @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -98,10 +99,12 @@ void set_pmd_pfn(unsigned long vaddr, un if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */ printk ("set_pmd_pfn: vaddr misaligned\n"); + printk ("vaddr = %lx, pfn = %lx\n", vaddr, pfn); return; /* BUG(); */ } - if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */ + if (pfn & (PMD_SIZE/MMUPAGE_SIZE-1)) { /* pfn is misaligned */ printk ("set_pmd_pfn: pfn misaligned\n"); + printk ("vaddr = %lx, pfn = %lx\n", vaddr, pfn); return; /* BUG(); */ } pgd = swapper_pg_dir + pgd_index(vaddr); @@ -122,11 +125,13 @@ void __set_fixmap (enum fixed_addresses { unsigned long address = __fix_to_virt(idx); + printk("__set_fixmap(%d,%lx)\n", idx, phys); + if (idx >= __end_of_fixed_addresses) { BUG(); return; } - set_pte_pfn(address, phys >> PAGE_SHIFT, flags); + set_pte_pfn(address, phys >> MMUPAGE_SHIFT, flags); } pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) @@ -137,75 +142,111 @@ pte_t *pte_alloc_one_kernel(struct mm_st return pte; } -struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) +void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags) { - struct page *pte; - -#ifdef CONFIG_HIGHPTE - pte = 
alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT, 0); -#else - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0); -#endif - if (pte) - clear_highpage(pte); - return pte; + memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); } -#ifdef CONFIG_X86_PAE +/* + * List of all pgd's needed for non-PAE so it can invalidate entries + * in both cached and uncached pgd's; not needed for PAE since the + * kernel pmd is shared. If PAE were not to share the pmd a similar + * tactic would be needed. This is essentially codepath-based locking + * against pageattr.c; it is the unique case in which a valid change + * of kernel pagetables can't be lazily synchronized by vmalloc faults. + * vmalloc faults work because attached pagetables are never freed. + * If the locking proves to be non-performant, a ticketing scheme with + * checks at dup_mmap(), exec(), and other mmlist addition points + * could be used. The locking scheme was chosen on the basis of + * manfred's recommendations and having no core impact whatsoever. + * -- wli + */ +spinlock_t pgd_lock = SPIN_LOCK_UNLOCKED; +LIST_HEAD(pgd_list); -pgd_t *pgd_alloc(struct mm_struct *mm) +void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) { - int i; - pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL); + unsigned long flags; + struct page *page; - if (pgd) { - for (i = 0; i < USER_PTRS_PER_PGD; i++) { - unsigned long pmd = __get_free_page(GFP_KERNEL); - if (!pmd) - goto out_oom; - clear_page(pmd); - set_pgd(pgd + i, __pgd(1 + __pa(pmd))); - } - memcpy(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + if (PTRS_PER_PMD == 1) { + page = virt_to_page(pgd); + spin_lock_irqsave(&pgd_lock, flags); } - return pgd; -out_oom: - for (i--; i >= 0; i--) - free_page((unsigned long)__va(pgd_val(pgd[i])-1)); - kmem_cache_free(pae_pgd_cachep, pgd); - return NULL; -} -void pgd_free(pgd_t *pgd) -{ - int i; + memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + (PTRS_PER_PGD - USER_PTRS_PER_PGD)*sizeof(pgd_t)); - for (i = 0; i < USER_PTRS_PER_PGD; i++) - free_page((unsigned long)__va(pgd_val(pgd[i])-1)); - kmem_cache_free(pae_pgd_cachep, pgd); -} + if (PTRS_PER_PMD > 1) + return; -#else + /* + * When allocated, a page has a reference count of 1. + * This increases it from that to 2 on the first pgd_ctor() + * call to any part of a page. 
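With several pgds now packed into a single PAGE_SIZE slab page, pgd_list records struct pages rather than individual pgds, so anything walking the list has to fan out to every pgd inside each listed page. A minimal sketch of that fan-out, modelled on the pageattr.c loop earlier in this patch; for_each_pgd_entry() and its callback are illustrative names only, not something the patch adds:

static void for_each_pgd_entry(struct page *page, unsigned long address,
			       void (*fn)(pgd_t *))
{
	int k;

	/* PAGE_MMUCOUNT pgds of PTRS_PER_PGD entries each share this page */
	for (k = 0; k < PAGE_MMUCOUNT; ++k) {
		pgd_t *pgd = (pgd_t *)page_address(page)
				+ PTRS_PER_PGD * k
				+ pgd_index(address);
		fn(pgd);	/* fix up the entry covering 'address' */
	}
}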
+ */ + if (PAGE_MMUCOUNT == 1) + list_add(&page->lru, &pgd_list); + else { + atomic_inc(&page->count); + if (atomic_read(&page->count) == 2) + list_add(&page->lru, &pgd_list); + } + spin_unlock_irqrestore(&pgd_lock, flags); + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); +} + +/* never called when PTRS_PER_PMD > 1 */ +void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) +{ + unsigned long flags; /* can be called from interrupt context */ + struct page *page = virt_to_page(pgd); + + spin_lock_irqsave(&pgd_lock, flags); + if (PAGE_MMUCOUNT == 1) + list_del(&page->lru); + else { + atomic_dec(&page->count); + if (atomic_read(&page->count) == 1) + list_del(&page->lru); + } + spin_unlock_irqrestore(&pgd_lock, flags); +} pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); + int i; + pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); - if (pgd) { - memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); - memcpy(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + if (PTRS_PER_PMD == 1 || !pgd) + return pgd; + + for (i = 0; i < USER_PTRS_PER_PGD; ++i) { + pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + if (!pmd) + goto out_oom; + else + set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); } return pgd; + +out_oom: + for (i--; i >= 0; i--) + kmem_cache_free(pmd_cache, __va(pgd_val(pgd[i])-1)); + kmem_cache_free(pgd_cache, pgd); + return NULL; } void pgd_free(pgd_t *pgd) { - free_page((unsigned long)pgd); -} + int i; -#endif /* CONFIG_X86_PAE */ + /* in the PAE case user pgd entries are overwritten before usage */ + if (PTRS_PER_PMD > 1) + for (i = 0; i < USER_PTRS_PER_PGD; ++i) + kmem_cache_free(pmd_cache, __va(pgd_val(pgd[i])-1)); + /* in the non-PAE case, clear_page_tables() clears user pgd entries */ + kmem_cache_free(pgd_cache, pgd); +} diff -prauN linux-2.5.70-bk10/arch/i386/mm/tlb.c pgcl-2.5.70-bk10-1/arch/i386/mm/tlb.c --- linux-2.5.70-bk10/arch/i386/mm/tlb.c 1969-12-31 16:00:00.000000000 -0800 +++ pgcl-2.5.70-bk10-1/arch/i386/mm/tlb.c 2003-06-05 09:48:37.000000000 -0700 @@ -0,0 +1,133 @@ +/* + * arch/i386/mm/tlb.c + * (C) June 2003 William Irwin, IBM + * Routines for pagetable cacheing and release. + */ +#include +#include +#include +#include +#include + +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); + +void tlb_init(void) +{ + int cpu; + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + int zone; + struct mmu_gather *tlb = &per_cpu(mmu_gathers, cpu); + + for (zone = 0; zone < MAX_ZONE_ID; ++zone) { + INIT_LIST_HEAD(&tlb->active_list[zone]); + INIT_LIST_HEAD(&tlb->ready_list[zone]); + } + } +} + +/* + * When an mmu_gather fills, we must flush the entire mm, in no + * small part because whole-mm flushes are the sole bulk TLB + * invalidation primitive on i386. 
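A rough sketch of the ordering that comment implies, assuming (as the asm-i386/tlb.h side of this patch suggests) that pte pages migrate from the active lists to the ready lists only once the whole mm has been invalidated; flush_and_drain() is an illustrative name, not a function the patch adds:

static void flush_and_drain(struct mmu_gather *tlb)
{
	/* i386 has no ranged bulk flush, so invalidate the whole mm first */
	flush_tlb_mm(tlb_mm(tlb));
	/* only now is it safe to hand the gathered pte pages back */
	tlb_flush_ready(tlb);
}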
+ */ +void tlb_flush_ready(struct mmu_gather *tlb) +{ + int count, zone = 0; + while (tlb->nr_pte_ready >= NR_PTE) { + BUG_ON(zone >= MAX_ZONE_ID); + if (!list_empty(&tlb->ready_list[zone])) { + BUG_ON(!zone_table[zone]); + free_pages_bulk(zone_table[zone], + tlb->ready_count[zone], + &tlb->ready_list[zone], + 0); + tlb->nr_pte_ready -= tlb->ready_count[zone]; + tlb->ready_count[zone] = 0; + BUG_ON(tlb->nr_pte_ready < 0); + BUG_ON(!list_empty(&tlb->ready_list[zone])); + } + zone++; + } + for (count = 0; zone < MAX_ZONE_ID; ++zone) { + BUG_ON(tlb->ready_count[zone] < 0); + count += tlb->ready_count[zone]; + } + BUG_ON(count != tlb->nr_pte_ready); +} + +/* + * oddly declared in pgalloc.h; in general these are TLB-related pmd + * and pte twiddlings. + */ +void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *page) +{ + unsigned long pfn, pmd_off = (unsigned long)pmd; + int k; + + pmd_off = (pmd_off/sizeof(pmd_t)) % PAGE_MMUCOUNT; + pfn = page_to_pfn(page); + pmd -= pmd_off; + + if (PAGE_MMUCOUNT > 1) { + struct page *old_page = NULL; + + if (atomic_read(&page->count) != 1) { + WARN_ON(1); + printk(KERN_DEBUG "bad pte refcount = %d\n", + atomic_read(&page->count)); + } + + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + if (pmd_present(pmd[k]) || !pmd_none(pmd[k])) { + if (old_page) + WARN_ON(old_page != pmd_page(pmd[k])); + else + old_page = pmd_page(pmd[k]); + } + } + + if (!old_page || old_page == page) + atomic_set(&page->count, PAGE_MMUCOUNT); + else { + /* + * old_page->index can legitimately be 0 + * but something's corrupt if it's mapping's wrong + */ + BUG_ON((struct mm_struct *)old_page->mapping != mm); + + /* + * errant callers can potentially do things + * out-of-order + */ + WARN_ON((struct mm_struct *)page->mapping != mm); + /* if (old_page->mapping != mm) + pgtable_add_rmap(page, mm, page->index); */ + pgtable_remove_rmap(page); + put_page(page); + atomic_set(&old_page->count, PAGE_MMUCOUNT); + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + unsigned long long pmdval; + pmdval = page_to_pfn(old_page) + k; + pmdval <<= MMUPAGE_SHIFT; + if (pmd_present(pmd[k]) || !pmd_none(pmd[k])) { + WARN_ON(old_page != pmd_page(pmd[k])); + continue; + } else + set_pmd(&pmd[k], __pmd(_PAGE_TABLE + pmdval)); + } + return; + } + } + + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + unsigned long long pmdval; + pmdval = (unsigned long long)(pfn + k) << MMUPAGE_SHIFT; + if (likely(pmd_none(pmd[k]) || !pmd_present(pmd[k]))) + set_pmd(&pmd[k], __pmd(_PAGE_TABLE + pmdval)); + else { + WARN_ON(1); + printk(KERN_DEBUG "pmdval=%Lx\n", (u64)pmd_val(pmd[k])); + put_page(page); /* a reference will be omitted */ + } + } +} diff -prauN linux-2.5.70-bk10/arch/i386/pci/i386.c pgcl-2.5.70-bk10-1/arch/i386/pci/i386.c --- linux-2.5.70-bk10/arch/i386/pci/i386.c 2003-06-05 05:43:43.000000000 -0700 +++ pgcl-2.5.70-bk10-1/arch/i386/pci/i386.c 2003-06-05 09:44:34.000000000 -0700 @@ -291,7 +291,7 @@ int pci_mmap_page_range(struct pci_dev * /* Write-combine setting is ignored, it is changed via the mtrr * interfaces on this platform. 
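The pmd_populate() above reduces to a simple identity: one PAGE_SIZE pte page backs PAGE_MMUCOUNT consecutive hardware pte tables, so the PAGE_MMUCOUNT pmd slots covering it all point at successive MMUPAGE_SIZE slices of the same page. Schematically (an illustrative restatement, not new code):

	/*
	 * with pfn = page_to_pfn(pte_page), and pmd aligned down to the
	 * first of its PAGE_MMUCOUNT slots (the "pmd -= pmd_off" above):
	 *
	 *	for (k = 0; k < PAGE_MMUCOUNT; k++)
	 *		pmd[k] = __pmd(_PAGE_TABLE + ((pfn + k) << MMUPAGE_SHIFT));
	 */

If another thread populated those slots first with a different pte page, the code appears to keep the incumbent page and drop the newcomer's rmap entry and reference instead.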
*/ - if (remap_page_range(vma, vma->vm_start, vma->vm_pgoff << PAGE_SHIFT, + if (remap_page_range(vma, vma->vm_start, vma->vm_pgoff << MMUPAGE_SHIFT, vma->vm_end - vma->vm_start, vma->vm_page_prot)) return -EAGAIN; diff -prauN linux-2.5.70-bk10/drivers/block/ll_rw_blk.c pgcl-2.5.70-bk10-1/drivers/block/ll_rw_blk.c --- linux-2.5.70-bk10/drivers/block/ll_rw_blk.c 2003-06-05 05:43:44.000000000 -0700 +++ pgcl-2.5.70-bk10-1/drivers/block/ll_rw_blk.c 2003-06-05 09:44:34.000000000 -0700 @@ -234,7 +234,7 @@ void blk_queue_make_request(request_queu **/ void blk_queue_bounce_limit(request_queue_t *q, u64 dma_addr) { - unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT; + unsigned long bounce_pfn = dma_addr >> MMUPAGE_SHIFT; unsigned long mb = dma_addr >> 20; static request_queue_t *last_q; diff -prauN linux-2.5.70-bk10/drivers/char/agp/backend.c pgcl-2.5.70-bk10-1/drivers/char/agp/backend.c --- linux-2.5.70-bk10/drivers/char/agp/backend.c 2003-05-26 18:00:43.000000000 -0700 +++ pgcl-2.5.70-bk10-1/drivers/char/agp/backend.c 2003-06-05 09:44:34.000000000 -0700 @@ -106,7 +106,7 @@ static int agp_find_max(void) { long memory, index, result; - memory = (num_physpages << PAGE_SHIFT) >> 20; + memory = (num_physpages << MMUPAGE_SHIFT) >> 20; index = 1; while ((memory > maxes_table[index].mem) && (index < 8)) @@ -118,7 +118,7 @@ static int agp_find_max(void) (maxes_table[index].mem - maxes_table[index - 1].mem); printk(KERN_INFO PFX "Maximum main memory to use for agp memory: %ldM\n", result); - result = result << (20 - PAGE_SHIFT); + result = result << (20 - MMUPAGE_SHIFT); return result; } @@ -157,7 +157,7 @@ static int agp_backend_initialize(struct } got_gatt = 1; - bridge->key_list = vmalloc(PAGE_SIZE * 4); + bridge->key_list = vmalloc(4*MMUPAGE_SIZE); if (bridge->key_list == NULL) { printk(KERN_ERR PFX "error allocating memory for key lists.\n"); rc = -ENOMEM; @@ -166,7 +166,7 @@ static int agp_backend_initialize(struct got_keylist = 1; /* FIXME vmalloc'd memory not guaranteed contiguous */ - memset(bridge->key_list, 0, PAGE_SIZE * 4); + memset(bridge->key_list, 0, 4*MMUPAGE_SIZE); if (bridge->driver->configure()) { printk(KERN_ERR PFX "error configuring host chipset.\n"); diff -prauN linux-2.5.70-bk10/drivers/char/agp/generic.c pgcl-2.5.70-bk10-1/drivers/char/agp/generic.c --- linux-2.5.70-bk10/drivers/char/agp/generic.c 2003-06-05 05:43:44.000000000 -0700 +++ pgcl-2.5.70-bk10-1/drivers/char/agp/generic.c 2003-06-05 09:44:34.000000000 -0700 @@ -85,7 +85,7 @@ struct agp_memory *agp_create_memory(int kfree(new); return NULL; } - new->memory = vmalloc(PAGE_SIZE * scratch_pages); + new->memory = vmalloc(MMUPAGE_SIZE * scratch_pages); if (new->memory == NULL) { agp_free_key(new->key); @@ -130,7 +130,7 @@ void agp_free_memory(struct agp_memory * } EXPORT_SYMBOL(agp_free_memory); -#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) +#define ENTRIES_PER_PAGE (MMUPAGE_SIZE / sizeof(unsigned long)) /** * agp_allocate_memory - allocate a group of pages of a certain type. 
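For orientation while reading the unit conversions in these hunks, the base constants relate roughly as follows (the generic fallbacks appear in include/asm-generic/page.h further down; the i386 values follow from CONFIG_PAGE_CLUSTER). The helper name pgcl_sanity_checks() is illustrative only:

static inline void pgcl_sanity_checks(void)
{
	/*
	 * e.g. PAGE_MMUSHIFT == 3 gives MMUPAGE_SIZE == 4KB,
	 * PAGE_SIZE == 32KB, PAGE_MMUCOUNT == 8: pfns and vm_pgoff count
	 * 4KB mmupages, while struct page and the page allocator deal
	 * in 32KB pages.
	 */
	BUG_ON(PAGE_MMUCOUNT != (1UL << PAGE_MMUSHIFT));
	BUG_ON(PAGE_SIZE != (MMUPAGE_SIZE << PAGE_MMUSHIFT));
	BUG_ON(PAGE_SHIFT != MMUPAGE_SHIFT + PAGE_MMUSHIFT);
}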
@@ -668,7 +668,7 @@ int agp_generic_create_gatt_table(void) if (table == NULL) return -ENOMEM; - table_end = table + ((PAGE_SIZE * (1 << page_order)) - 1); + table_end = table + ((MMUPAGE_SIZE * (1 << page_order)) - 1); for (page = virt_to_page(table); page <= virt_to_page(table_end); page++) SetPageReserved(page); @@ -678,7 +678,7 @@ int agp_generic_create_gatt_table(void) agp_bridge->driver->cache_flush(); agp_bridge->gatt_table = ioremap_nocache(virt_to_phys(table), - (PAGE_SIZE * (1 << page_order))); + MMUPAGE_SIZE << page_order); agp_bridge->driver->cache_flush(); if (agp_bridge->gatt_table == NULL) { @@ -736,7 +736,7 @@ int agp_generic_free_gatt_table(void) iounmap(agp_bridge->gatt_table); table = (char *) agp_bridge->gatt_table_real; - table_end = table + ((PAGE_SIZE * (1 << page_order)) - 1); + table_end = table + ((MMUPAGE_SIZE * (1 << page_order)) - 1); for (page = virt_to_page(table); page <= virt_to_page(table_end); page++) ClearPageReserved(page); @@ -784,7 +784,7 @@ int agp_generic_insert_memory(struct agp break; } - num_entries -= agp_memory_reserved/PAGE_SIZE; + num_entries -= agp_memory_reserved/MMUPAGE_SIZE; if (num_entries < 0) num_entries = 0; if (type != 0 || mem->type != 0) { diff -prauN linux-2.5.70-bk10/drivers/char/mem.c pgcl-2.5.70-bk10-1/drivers/char/mem.c --- linux-2.5.70-bk10/drivers/char/mem.c 2003-05-26 18:00:40.000000000 -0700 +++ pgcl-2.5.70-bk10-1/drivers/char/mem.c 2003-06-05 09:44:34.000000000 -0700 @@ -43,8 +43,8 @@ static ssize_t do_write_mem(struct file written = 0; #if defined(__sparc__) || (defined(__mc68000__) && defined(CONFIG_MMU)) /* we don't have page 0 mapped on sparc and m68k.. */ - if (realp < PAGE_SIZE) { - unsigned long sz = PAGE_SIZE-realp; + if (realp < MMUPAGE_SIZE) { + unsigned long sz = MMUPAGE_SIZE-realp; if (sz > count) sz = count; /* Hmm. Do something? */ buf+=sz; @@ -80,8 +80,8 @@ static ssize_t read_mem(struct file * fi read = 0; #if defined(__sparc__) || (defined(__mc68000__) && defined(CONFIG_MMU)) /* we don't have page 0 mapped on sparc and m68k.. */ - if (p < PAGE_SIZE) { - unsigned long sz = PAGE_SIZE-p; + if (p < MMUPAGE_SIZE) { + unsigned long sz = MMUPAGE_SIZE-p; if (sz > count) sz = count; if (sz > 0) { @@ -177,7 +177,7 @@ static inline int noncached_address(unsi static int mmap_mem(struct file * file, struct vm_area_struct * vma) { - unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned long offset = vma->vm_pgoff << MMUPAGE_SHIFT; /* * Accessing memory above the top the kernel knows about or @@ -223,8 +223,8 @@ static ssize_t read_kmem(struct file *fi #if defined(__sparc__) || (defined(__mc68000__) && defined(CONFIG_MMU)) /* we don't have page 0 mapped on sparc and m68k.. */ - if (p < PAGE_SIZE && read > 0) { - size_t tmp = PAGE_SIZE - p; + if (p < MMUPAGE_SIZE && read > 0) { + size_t tmp = MMUPAGE_SIZE - p; if (tmp > read) tmp = read; if (clear_user(buf, tmp)) return -EFAULT; @@ -248,8 +248,8 @@ static ssize_t read_kmem(struct file *fi while (count > 0) { int len = count; - if (len > PAGE_SIZE) - len = PAGE_SIZE; + if (len > MMUPAGE_SIZE) + len = MMUPAGE_SIZE; len = vread(kbuf, (char *)p, len); if (!len) break; @@ -298,8 +298,8 @@ static ssize_t write_kmem(struct file * while (count > 0) { int len = count; - if (len > PAGE_SIZE) - len = PAGE_SIZE; + if (len > MMUPAGE_SIZE) + len = MMUPAGE_SIZE; if (len && copy_from_user(kbuf, buf, len)) { free_page((unsigned long)kbuf); return -EFAULT; @@ -409,12 +409,12 @@ static inline size_t read_zero_pagealign /* The shared case is hard. Let's do the conventional zeroing. 
*/ do { - unsigned long unwritten = clear_user(buf, PAGE_SIZE); + unsigned long unwritten = clear_user(buf, MMUPAGE_SIZE); if (unwritten) - return size + unwritten - PAGE_SIZE; + return size + unwritten - MMUPAGE_SIZE; cond_resched(); - buf += PAGE_SIZE; - size -= PAGE_SIZE; + buf += MMUPAGE_SIZE; + size -= MMUPAGE_SIZE; } while (size); return size; @@ -437,23 +437,23 @@ static ssize_t read_zero(struct file * f left = count; /* do we want to be clever? Arbitrary cut-off */ - if (count >= PAGE_SIZE*4) { + if (count >= MMUPAGE_SIZE*4) { unsigned long partial; /* How much left of the page? */ - partial = (PAGE_SIZE-1) & -(unsigned long) buf; + partial = (MMUPAGE_SIZE-1) & -(unsigned long) buf; unwritten = clear_user(buf, partial); written = partial - unwritten; if (unwritten) goto out; left -= partial; buf += partial; - unwritten = read_zero_pagealigned(buf, left & PAGE_MASK); - written += (left & PAGE_MASK) - unwritten; + unwritten = read_zero_pagealigned(buf, left & MMUPAGE_MASK); + written += (left & MMUPAGE_MASK) - unwritten; if (unwritten) goto out; - buf += left & PAGE_MASK; - left &= ~PAGE_MASK; + buf += left & MMUPAGE_MASK; + left &= ~MMUPAGE_MASK; } unwritten = clear_user(buf, left); written += left - unwritten; diff -prauN linux-2.5.70-bk10/drivers/oprofile/buffer_sync.c pgcl-2.5.70-bk10-1/drivers/oprofile/buffer_sync.c --- linux-2.5.70-bk10/drivers/oprofile/buffer_sync.c 2003-05-26 18:00:26.000000000 -0700 +++ pgcl-2.5.70-bk10-1/drivers/oprofile/buffer_sync.c 2003-06-05 09:44:34.000000000 -0700 @@ -247,7 +247,7 @@ static unsigned long lookup_dcookie(stru cookie = fast_get_dcookie(vma->vm_file->f_dentry, vma->vm_file->f_vfsmnt); - *offset = (vma->vm_pgoff << PAGE_SHIFT) + addr - vma->vm_start; + *offset = MMUPAGE_SIZE*vma->vm_pgoff + addr - vma->vm_start; break; } out: diff -prauN linux-2.5.70-bk10/drivers/scsi/qlogicisp.c pgcl-2.5.70-bk10-1/drivers/scsi/qlogicisp.c --- linux-2.5.70-bk10/drivers/scsi/qlogicisp.c 2003-05-26 18:01:00.000000000 -0700 +++ pgcl-2.5.70-bk10-1/drivers/scsi/qlogicisp.c 2003-06-05 09:44:34.000000000 -0700 @@ -1415,7 +1415,7 @@ static int isp1020_init(struct Scsi_Host if ((command & PCI_COMMAND_MEMORY) && ((mem_flags & 1) == 0)) { - mem_base = (u_long) ioremap(mem_base, PAGE_SIZE); + mem_base = (u_long) ioremap(mem_base, MMUPAGE_SIZE); if (!mem_base) { printk("qlogicisp : i/o remapping failed.\n"); goto out_release; diff -prauN linux-2.5.70-bk10/drivers/scsi/sym53c8xx.c pgcl-2.5.70-bk10-1/drivers/scsi/sym53c8xx.c --- linux-2.5.70-bk10/drivers/scsi/sym53c8xx.c 2003-06-05 05:43:55.000000000 -0700 +++ pgcl-2.5.70-bk10-1/drivers/scsi/sym53c8xx.c 2003-06-05 09:44:34.000000000 -0700 @@ -686,7 +686,8 @@ spinlock_t sym53c8xx_lock = SPIN_LOCK_UN #ifndef SCSI_NCR_PCI_MEM_NOT_SUPPORTED static u_long __init remap_pci_mem(u_long base, u_long size) { - u_long page_base = ((u_long) base) & PAGE_MASK; + /* ioremap()/vmalloc() have MMUPAGE_SIZE granularity */ + u_long page_base = ((u_long) base) & MMUPAGE_MASK; u_long page_offs = ((u_long) base) - page_base; u_long page_remapped = (u_long) ioremap(page_base, page_offs+size); @@ -695,8 +696,9 @@ static u_long __init remap_pci_mem(u_lon static void __init unmap_pci_mem(u_long vaddr, u_long size) { + /* iounmap()/vfree() have MMUPAGE_SIZE granularity */ if (vaddr) - iounmap((void *) (vaddr & PAGE_MASK)); + iounmap((void *) (vaddr & MMUPAGE_MASK)); } #endif /* not def SCSI_NCR_PCI_MEM_NOT_SUPPORTED */ diff -prauN linux-2.5.70-bk10/drivers/scsi/sym53c8xx_2/sym_glue.c pgcl-2.5.70-bk10-1/drivers/scsi/sym53c8xx_2/sym_glue.c --- 
linux-2.5.70-bk10/drivers/scsi/sym53c8xx_2/sym_glue.c 2003-06-05 05:43:56.000000000 -0700 +++ pgcl-2.5.70-bk10-1/drivers/scsi/sym53c8xx_2/sym_glue.c 2003-06-05 09:44:34.000000000 -0700 @@ -215,7 +215,7 @@ m_addr_t __vtobus(m_pool_ident_t dev_dma #ifndef SYM_OPT_NO_BUS_MEMORY_MAPPING static u_long __init pci_map_mem(u_long base, u_long size) { - u_long page_base = ((u_long) base) & PAGE_MASK; + u_long page_base = ((u_long) base) & MMUPAGE_MASK; u_long page_offs = ((u_long) base) - page_base; u_long page_remapped = (u_long) ioremap(page_base, page_offs+size); @@ -225,7 +225,7 @@ static u_long __init pci_map_mem(u_long static void __init pci_unmap_mem(u_long vaddr, u_long size) { if (vaddr) - iounmap((void *) (vaddr & PAGE_MASK)); + iounmap((void *) (vaddr & MMUPAGE_MASK)); } #endif diff -prauN linux-2.5.70-bk10/drivers/scsi/sym53c8xx_comm.h pgcl-2.5.70-bk10-1/drivers/scsi/sym53c8xx_comm.h --- linux-2.5.70-bk10/drivers/scsi/sym53c8xx_comm.h 2003-05-26 18:00:40.000000000 -0700 +++ pgcl-2.5.70-bk10-1/drivers/scsi/sym53c8xx_comm.h 2003-06-05 09:44:34.000000000 -0700 @@ -491,7 +491,7 @@ spinlock_t DRIVER_SMP_LOCK = SPIN_LOCK_U #ifndef SCSI_NCR_PCI_MEM_NOT_SUPPORTED static u_long __init remap_pci_mem(u_long base, u_long size) { - u_long page_base = ((u_long) base) & PAGE_MASK; + u_long page_base = ((u_long) base) & MMUPAGE_MASK; u_long page_offs = ((u_long) base) - page_base; u_long page_remapped = (u_long) ioremap(page_base, page_offs+size); @@ -501,7 +501,7 @@ static u_long __init remap_pci_mem(u_lon static void __init unmap_pci_mem(u_long vaddr, u_long size) { if (vaddr) - iounmap((void *) (vaddr & PAGE_MASK)); + iounmap((void *) (vaddr & MMUPAGE_MASK)); } #endif /* not def SCSI_NCR_PCI_MEM_NOT_SUPPORTED */ diff -prauN linux-2.5.70-bk10/fs/aio.c pgcl-2.5.70-bk10-1/fs/aio.c --- linux-2.5.70-bk10/fs/aio.c 2003-06-05 05:43:57.000000000 -0700 +++ pgcl-2.5.70-bk10-1/fs/aio.c 2003-06-05 09:44:34.000000000 -0700 @@ -87,7 +87,7 @@ static void aio_free_ring(struct kioctx long i; for (i=0; inr_pages; i++) - put_page(info->ring_pages[i]); + put_page(pfn_to_page(info->ring_pages[i])); if (info->mmap_size) { down_write(&ctx->mm->mmap_sem); @@ -114,25 +114,25 @@ static int aio_setup_ring(struct kioctx size = sizeof(struct aio_ring); size += sizeof(struct io_event) * nr_events; - nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; + nr_pages = (size + MMUPAGE_SIZE-1) >> MMUPAGE_SHIFT; if (nr_pages < 0) return -EINVAL; info->nr_pages = nr_pages; - nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); + nr_events = (MMUPAGE_SIZE*nr_pages - sizeof(struct aio_ring))/sizeof(struct io_event); info->nr = 0; info->ring_pages = info->internal_pages; if (nr_pages > AIO_RING_PAGES) { - info->ring_pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL); + info->ring_pages = kmalloc(sizeof(unsigned long)*nr_pages, GFP_KERNEL); if (!info->ring_pages) return -ENOMEM; - memset(info->ring_pages, 0, sizeof(struct page *) * nr_pages); + memset(info->ring_pages, 0, sizeof(unsigned long)*nr_pages); } - info->mmap_size = nr_pages * PAGE_SIZE; + info->mmap_size = nr_pages*MMUPAGE_SIZE; dprintk("attempting mmap of %lu bytes\n", info->mmap_size); down_write(&ctx->mm->mmap_sem); info->mmap_base = do_mmap(NULL, 0, info->mmap_size, @@ -161,7 +161,8 @@ static int aio_setup_ring(struct kioctx info->nr = nr_events; /* trusted copy */ - ring = kmap_atomic(info->ring_pages[0], KM_USER0); + ring = kmap_atomic(pfn_to_page(info->ring_pages[0]), KM_USER0) + + (info->ring_pages[0] % PAGE_MMUCOUNT)*MMUPAGE_SIZE; 
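ring_pages[] now holds pfns rather than struct page pointers, and the same open-coded pattern recurs at every access: map the enclosing page, then step to the MMUPAGE_SIZE slice that the pfn names. Pulled out as an illustrative helper (aio_kmap_pfn() is not a function this patch adds):

static void *aio_kmap_pfn(unsigned long pfn, enum km_type km)
{
	char *vaddr = kmap_atomic(pfn_to_page(pfn), km);

	/* offset of this mmupage within its PAGE_SIZE page */
	return vaddr + (pfn % PAGE_MMUCOUNT) * MMUPAGE_SIZE;
}

The matching unmap masks with MMUPAGE_MASK rather than PAGE_MASK, as the put_aio_ring_event() change below shows.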
ring->nr = nr_events; /* user copy */ ring->id = ctx->user_id; ring->head = ring->tail = 0; @@ -178,15 +179,17 @@ static int aio_setup_ring(struct kioctx /* aio_ring_event: returns a pointer to the event at the given index from * kmap_atomic(, km). Release the pointer with put_aio_ring_event(); */ -#define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) -#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) +#define AIO_EVENTS_PER_PAGE (MMUPAGE_SIZE/sizeof(struct io_event)) +#define AIO_EVENTS_FIRST_PAGE ((MMUPAGE_SIZE-sizeof(struct aio_ring))/sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) #define aio_ring_event(info, nr, km) ({ \ unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ struct io_event *__event; \ - __event = kmap_atomic( \ - (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \ + unsigned long pfn; \ + pfn = (info)->ring_pages[pos/AIO_EVENTS_PER_PAGE]; \ + __event = kmap_atomic(pfn_to_page(pfn), km); \ + __event += (pfn % PAGE_MMUCOUNT) * MMUPAGE_SIZE; \ __event += pos % AIO_EVENTS_PER_PAGE; \ __event; \ }) @@ -194,7 +197,7 @@ static int aio_setup_ring(struct kioctx #define put_aio_ring_event(event, km) do { \ struct io_event *__event = (event); \ (void)__event; \ - kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \ + kunmap_atomic((void *)((unsigned long)__event & MMUPAGE_MASK), km); \ } while(0) /* ioctx_alloc @@ -400,7 +403,8 @@ static struct kiocb *__aio_get_req(struc * accept an event from this io. */ spin_lock_irq(&ctx->ctx_lock); - ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0); + ring = kmap_atomic(pfn_to_page(ctx->ring_info.ring_pages[0]), KM_USER0) + + (ctx->ring_info.ring_pages[0] % PAGE_MMUCOUNT)*MMUPAGE_SIZE; if (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)) { list_add(&req->ki_list, &ctx->active_reqs); get_ioctx(ctx); @@ -661,8 +665,8 @@ int aio_complete(struct kiocb *iocb, lon */ spin_lock_irqsave(&ctx->ctx_lock, flags); - ring = kmap_atomic(info->ring_pages[0], KM_IRQ1); - + ring = kmap_atomic(pfn_to_page(info->ring_pages[0]), KM_IRQ1) + + (info->ring_pages[0] % PAGE_MMUCOUNT)*MMUPAGE_SIZE; tail = info->tail; event = aio_ring_event(info, tail, KM_IRQ0); tail = (tail + 1) % info->nr; @@ -717,7 +721,8 @@ static int aio_read_evt(struct kioctx *i unsigned long head; int ret = 0; - ring = kmap_atomic(info->ring_pages[0], KM_USER0); + ring = kmap_atomic(pfn_to_page(info->ring_pages[0]), KM_USER0) + + (info->ring_pages[0] % PAGE_MMUCOUNT)*MMUPAGE_SIZE; dprintk("in aio_read_evt h%lu t%lu m%lu\n", (unsigned long)ring->head, (unsigned long)ring->tail, (unsigned long)ring->nr); diff -prauN linux-2.5.70-bk10/fs/binfmt_elf.c pgcl-2.5.70-bk10-1/fs/binfmt_elf.c --- linux-2.5.70-bk10/fs/binfmt_elf.c 2003-05-26 18:00:39.000000000 -0700 +++ pgcl-2.5.70-bk10-1/fs/binfmt_elf.c 2003-06-05 09:44:34.000000000 -0700 @@ -61,10 +61,10 @@ static int elf_core_dump(long signr, str #define elf_core_dump NULL #endif -#if ELF_EXEC_PAGESIZE > PAGE_SIZE +#if ELF_EXEC_PAGESIZE > MMUPAGE_SIZE # define ELF_MIN_ALIGN ELF_EXEC_PAGESIZE #else -# define ELF_MIN_ALIGN PAGE_SIZE +# define ELF_MIN_ALIGN MMUPAGE_SIZE #endif #define ELF_PAGESTART(_v) ((_v) & ~(unsigned long)(ELF_MIN_ALIGN-1)) @@ -781,9 +781,8 @@ static int load_elf_binary(struct linux_ and some applications "depend" upon this behavior. Since we do not have the power to recompile these, we emulate the SVr4 behavior. Sigh. */ - /* N.B. Shouldn't the size here be PAGE_SIZE?? 
*/ down_write(¤t->mm->mmap_sem); - error = do_mmap(NULL, 0, 4096, PROT_READ | PROT_EXEC, + error = do_mmap(NULL, 0, MMUPAGE_SIZE, PROT_READ | PROT_EXEC, MAP_FIXED | MAP_PRIVATE, 0); up_write(¤t->mm->mmap_sem); } @@ -1370,21 +1369,26 @@ static int elf_core_dump(long signr, str for (addr = vma->vm_start; addr < vma->vm_end; - addr += PAGE_SIZE) { + addr += MMUPAGE_SIZE) { struct page* page; + unsigned long pfn = 0; struct vm_area_struct *vma; if (get_user_pages(current, current->mm, addr, 1, 0, 1, - &page, &vma) <= 0) { - DUMP_SEEK (file->f_pos + PAGE_SIZE); + &pfn, &vma) <= 0) { + DUMP_SEEK (file->f_pos + MMUPAGE_SIZE); } else { + page = pfn_to_page(pfn); if (page == ZERO_PAGE(addr)) { - DUMP_SEEK (file->f_pos + PAGE_SIZE); + DUMP_SEEK (file->f_pos + MMUPAGE_SIZE); } else { void *kaddr; + unsigned long subpfn; + subpfn = pfn % PAGE_MMUCOUNT; flush_cache_page(vma, addr); kaddr = kmap(page); - DUMP_WRITE(kaddr, PAGE_SIZE); + kaddr += subpfn * MMUPAGE_SIZE; + DUMP_WRITE(kaddr, MMUPAGE_SIZE); kunmap(page); } page_cache_release(page); diff -prauN linux-2.5.70-bk10/fs/bio.c pgcl-2.5.70-bk10-1/fs/bio.c --- linux-2.5.70-bk10/fs/bio.c 2003-06-05 05:43:57.000000000 -0700 +++ pgcl-2.5.70-bk10-1/fs/bio.c 2003-06-05 09:44:34.000000000 -0700 @@ -445,12 +445,12 @@ static struct bio *__bio_map_user(struct unsigned long uaddr, unsigned int len, int write_to_vm) { - unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = uaddr >> PAGE_SHIFT; + unsigned long end = (uaddr + len + MMUPAGE_SIZE - 1) >> MMUPAGE_SHIFT; + unsigned long start = uaddr >> MMUPAGE_SHIFT; const int nr_pages = end - start; request_queue_t *q = bdev_get_queue(bdev); int ret, offset, i; - struct page **pages; + unsigned long *pages; struct bio *bio; /* @@ -464,7 +464,7 @@ static struct bio *__bio_map_user(struct if (!bio) return NULL; - pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL); + pages = kmalloc(nr_pages * sizeof(unsigned long), GFP_KERNEL); if (!pages) goto out; @@ -478,9 +478,11 @@ static struct bio *__bio_map_user(struct bio->bi_bdev = bdev; - offset = uaddr & ~PAGE_MASK; + offset = uaddr & ~MMUPAGE_MASK; for (i = 0; i < nr_pages; i++) { - unsigned int bytes = PAGE_SIZE - offset; + unsigned int bytes = MMUPAGE_SIZE - offset; + int suboff = (pages[i] % PAGE_MMUCOUNT)*MMUPAGE_SIZE; + struct page *pg = pfn_to_page(pages[i]); if (len <= 0) break; @@ -491,7 +493,7 @@ static struct bio *__bio_map_user(struct /* * sorry... */ - if (bio_add_page(bio, pages[i], bytes, offset) < bytes) + if (bio_add_page(bio, pg, bytes, offset + suboff) < bytes) break; len -= bytes; @@ -502,7 +504,7 @@ static struct bio *__bio_map_user(struct * release the pages we didn't map into the bio, if any */ while (i < nr_pages) - page_cache_release(pages[i++]); + page_cache_release(pfn_to_page(pages[i++])); kfree(pages); diff -prauN linux-2.5.70-bk10/fs/direct-io.c pgcl-2.5.70-bk10-1/fs/direct-io.c --- linux-2.5.70-bk10/fs/direct-io.c 2003-05-26 18:00:41.000000000 -0700 +++ pgcl-2.5.70-bk10-1/fs/direct-io.c 2003-06-05 09:44:34.000000000 -0700 @@ -35,7 +35,9 @@ /* * How many user pages to map in one call to get_user_pages(). This determines - * the size of a structure on the stack. + * the size of a structure on the stack. But these are mmupages; this + * will _not_ even be able to see a whole PAGE_SIZE area if you make + * PAGE_MMUCOUNT > DIO_PAGES. */ #define DIO_PAGES 64 @@ -49,6 +51,20 @@ * * If blkfactor is zero then the user's request was aligned to the filesystem's * blocksize. 
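As the binfmt_elf hunk above shows, get_user_pages() in this tree hands back mmupage frame numbers instead of struct page pointers, leaving the caller to locate the enclosing page and the sub-page offset itself. A condensed sketch of the calling convention; read_one_user_mmupage() is an illustrative helper with error handling trimmed:

static int read_one_user_mmupage(unsigned long addr, void *buf)
{
	unsigned long pfn;
	struct vm_area_struct *vma;
	struct page *page;
	char *kaddr;

	if (get_user_pages(current, current->mm, addr, 1, 0, 1, &pfn, &vma) <= 0)
		return -EFAULT;

	page = pfn_to_page(pfn);
	kaddr = kmap(page);
	/* copy only the MMUPAGE_SIZE slice the pfn refers to */
	memcpy(buf, kaddr + (pfn % PAGE_MMUCOUNT) * MMUPAGE_SIZE, MMUPAGE_SIZE);
	kunmap(page);
	page_cache_release(page);
	return 0;
}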
+ * + * XXX: + * Okay, I just broke this and I'm not sure how to put it back together. + * Basically the issue is that we're pointed at _pfn's_ only by + * get_user_pages() so the assumption of virtual contiguity doesn't even + * guarantee PAGE_SIZE -aligned physical contiguity. + * + * AFAICT the fixup is to "opportunistically" merge all this stuff together + * into PAGE_SIZE-aligned contiguous bits and either special-case or be + * able to handle the rest as they come. I've left this broken for now. + * I'm relatively fearful of eating stackspace to keep count of the number + * mmupages starting at a given pfn there are while merging. + * + * -- wli */ struct dio { @@ -100,7 +116,7 @@ struct dio { * Page queue. These variables belong to dio_refill_pages() and * dio_get_page(). */ - struct page *pages[DIO_PAGES]; /* page buffer */ + unsigned long pages[DIO_PAGES]; /* page buffer */ unsigned head; /* next page to process */ unsigned tail; /* last valid page + 1 */ int page_errors; /* errno from get_user_pages() */ @@ -155,7 +171,7 @@ static int dio_refill_pages(struct dio * */ if (dio->page_errors == 0) dio->page_errors = ret; - dio->pages[0] = ZERO_PAGE(dio->curr_user_address); + dio->pages[0] = page_to_pfn(ZERO_PAGE(dio->curr_user_address)); dio->head = 0; dio->tail = 1; ret = 0; @@ -163,7 +179,7 @@ static int dio_refill_pages(struct dio * } if (ret >= 0) { - dio->curr_user_address += ret * PAGE_SIZE; + dio->curr_user_address += ret * MMUPAGE_SIZE; dio->curr_page += ret; dio->head = 0; dio->tail = ret; @@ -179,8 +195,13 @@ out: * decent number of pages, less frequently. To provide nicer use of the * L1 cache. */ -static struct page *dio_get_page(struct dio *dio) +static struct page *dio_get_page(struct dio *dio, int *pfoff_in_page, int *page_size) { + int pg_size = MMUPAGE_SIZE; + int pfn, tpfn; + struct page *page; + int i = 0; + if (dio_pages_present(dio) == 0) { int ret; @@ -189,7 +210,33 @@ static struct page *dio_get_page(struct return ERR_PTR(ret); BUG_ON(dio_pages_present(dio) == 0); } - return dio->pages[dio->head++]; + + pfn = dio->pages[dio->head++]; + *pfoff_in_page = (pfn % PAGE_MMUCOUNT) * MMUPAGE_SIZE; + + /* Try to cluster all pfns that belongs to this page together */ + tpfn = pfn + 1; + while (pg_size + *pfoff_in_page < PAGE_SIZE) { + if (dio->head == dio->tail) break; + if (tpfn != dio->pages[dio->head]) break; + tpfn++; + dio->head++; + pg_size += MMUPAGE_SIZE; + i++; + } + + page = pfn_to_page(pfn); + *page_size = pg_size; + + /* + * FIXME - get_user_pages got ref for each pfn, we need to drop + * the extra refs for this page + */ + while (i--) { + page_cache_release(page); + } + + return page; } /* @@ -293,8 +340,9 @@ static void dio_bio_submit(struct dio *d */ static void dio_cleanup(struct dio *dio) { + int a, b; while (dio_pages_present(dio)) - page_cache_release(dio_get_page(dio)); + page_cache_release(dio_get_page(dio, &a, &b)); } /* @@ -686,22 +734,26 @@ static void dio_zero_block(struct dio *d static int do_direct_IO(struct dio *dio) { const unsigned blkbits = dio->blkbits; - const unsigned blocks_per_page = PAGE_SIZE >> blkbits; + unsigned blocks_per_page = PAGE_SIZE >> blkbits; struct page *page; unsigned block_in_page; struct buffer_head *map_bh = &dio->map_bh; int ret = 0; + int page_size; + int pf_pgoff; /* The I/O can start at any block offset within the first page */ block_in_page = dio->first_block_in_page; while (dio->block_in_file < dio->final_block_in_request) { - page = dio_get_page(dio); + page = dio_get_page(dio, &pf_pgoff, &page_size); + if 
(IS_ERR(page)) { ret = PTR_ERR(page); goto out; } + blocks_per_page = page_size >> blkbits; while (block_in_page < blocks_per_page) { unsigned offset_in_page = block_in_page << blkbits; unsigned this_chunk_bytes; /* # of bytes mapped */ @@ -785,7 +837,7 @@ do_holes: * can add to this page */ this_chunk_blocks = dio->blocks_available; - u = (PAGE_SIZE - offset_in_page) >> blkbits; + u = (page_size - offset_in_page) >> blkbits; if (this_chunk_blocks > u) this_chunk_blocks = u; u = dio->final_block_in_request - dio->block_in_file; @@ -795,7 +847,7 @@ do_holes: BUG_ON(this_chunk_bytes == 0); dio->boundary = buffer_boundary(map_bh); - ret = submit_page_section(dio, page, offset_in_page, + ret = submit_page_section(dio, page, pf_pgoff + offset_in_page, this_chunk_bytes, dio->next_block_for_io); if (ret) { page_cache_release(page); @@ -882,7 +934,7 @@ direct_io_worker(int rw, struct kiocb *i bytes = iov[seg].iov_len; /* Index into the first page of the first block */ - dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; + dio->first_block_in_page = (user_addr & ~MMUPAGE_MASK) >> blkbits; dio->final_block_in_request = dio->block_in_file + (bytes >> blkbits); /* Page fetching state */ @@ -891,11 +943,11 @@ direct_io_worker(int rw, struct kiocb *i dio->curr_page = 0; dio->total_pages = 0; - if (user_addr & (PAGE_SIZE-1)) { + if (user_addr & (MMUPAGE_SIZE-1)) { dio->total_pages++; - bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); + bytes -= MMUPAGE_SIZE - (user_addr & (MMUPAGE_SIZE - 1)); } - dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; + dio->total_pages += (bytes + MMUPAGE_SIZE - 1) / MMUPAGE_SIZE; dio->curr_user_address = user_addr; ret = do_direct_IO(dio); diff -prauN linux-2.5.70-bk10/fs/exec.c pgcl-2.5.70-bk10-1/fs/exec.c --- linux-2.5.70-bk10/fs/exec.c 2003-05-26 18:00:38.000000000 -0700 +++ pgcl-2.5.70-bk10-1/fs/exec.c 2003-06-05 09:44:34.000000000 -0700 @@ -285,51 +285,64 @@ int copy_strings_kernel(int argc,char ** * This routine is used to map in a page into an address space: needed by * execve() for the initial stack and environment pages. * - * tsk->mmap_sem is held for writing. + * task->mm->mmap_sem is held for writing. 
*/ -void put_dirty_page(struct task_struct *tsk, struct page *page, - unsigned long address, pgprot_t prot) +void put_dirty_page(struct task_struct *task, struct page *page, int min_subpfn, + unsigned long addr, pgprot_t prot) { - pgd_t * pgd; - pmd_t * pmd; - pte_t * pte; - struct pte_chain *pte_chain; - - if (page_count(page) != 1) - printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", - page, address); - - pgd = pgd_offset(tsk->mm, address); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto out_sig; - spin_lock(&tsk->mm->page_table_lock); - pmd = pmd_alloc(tsk->mm, pgd, address); - if (!pmd) - goto out; - pte = pte_alloc_map(tsk->mm, pmd, address); - if (!pte) - goto out; - if (!pte_none(*pte)) { + unsigned long page_pfn, subpfn; + struct pte_chain *pte_chain = NULL; + struct mm_struct *mm = task->mm; + + page_pfn = page_to_pfn(page); + spin_lock(&mm->page_table_lock); + + addr += MMUPAGE_SIZE * min_subpfn; + for (subpfn = min_subpfn; subpfn < PAGE_MMUCOUNT; ++subpfn, addr += MMUPAGE_SIZE) { + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + unsigned long pfn; + + pgd = pgd_offset(mm, addr); + if (!pte_chain) + pte_chain = pte_chain_alloc(GFP_ATOMIC); + if (!pte_chain) { + spin_unlock(&mm->page_table_lock); + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) + goto out_nolock; + spin_lock(&mm->page_table_lock); + } + pmd = pmd_alloc(mm, pgd, addr); + if (!pmd) + goto out; + pte = pte_alloc_map(mm, pmd, addr); + if (!pte) + goto out; + if (!pte_none(*pte)) { + pte_unmap(pte); + continue; + } + pfn = page_pfn + subpfn; + set_pte(pte, pte_mkdirty(pte_mkwrite(pfn_pte(pfn, prot)))); + pte_chain = page_add_rmap(page, pte, pte_chain); + page_cache_get(page); pte_unmap(pte); - goto out; + task->mm->rss++; } + spin_unlock(&mm->page_table_lock); + pte_chain_free(pte_chain); lru_cache_add_active(page); flush_dcache_page(page); - set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot)))); - pte_chain = page_add_rmap(page, pte, pte_chain); - pte_unmap(pte); - tsk->mm->rss++; - spin_unlock(&tsk->mm->page_table_lock); - /* no need for flush_tlb */ - pte_chain_free(pte_chain); + page_cache_release(page); /* want to add PAGE_MMUCOUNT-1 */ return; out: - spin_unlock(&tsk->mm->page_table_lock); -out_sig: - __free_page(page); - force_sig(SIGKILL, tsk); + spin_unlock(&mm->page_table_lock); +out_nolock: + page_cache_release(page); + force_sig(SIGKILL, task); pte_chain_free(pte_chain); return; } @@ -392,7 +405,8 @@ int setup_arg_pages(struct linux_binprm if (!mpnt) return -ENOMEM; - if (!vm_enough_memory((STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) { + /* must match the length of mpnt below */ + if (!vm_enough_memory((STACK_TOP-(MMUPAGE_MASK&(unsigned long)bprm->p))/MMUPAGE_SIZE)) { kmem_cache_free(vm_area_cachep, mpnt); return -ENOMEM; } @@ -402,10 +416,9 @@ int setup_arg_pages(struct linux_binprm mpnt->vm_mm = mm; #ifdef CONFIG_STACK_GROWSUP mpnt->vm_start = stack_base; - mpnt->vm_end = PAGE_MASK & - (PAGE_SIZE - 1 + (unsigned long) bprm->p); + mpnt->vm_end = MMUPAGE_ALIGN((unsigned long)bprm->p); #else - mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p; + mpnt->vm_start = MMUPAGE_MASK & (unsigned long)bprm->p; mpnt->vm_end = STACK_TOP; #endif mpnt->vm_page_prot = protection_map[VM_STACK_FLAGS & 0x7]; @@ -414,17 +427,23 @@ int setup_arg_pages(struct linux_binprm mpnt->vm_pgoff = 0; mpnt->vm_file = NULL; INIT_LIST_HEAD(&mpnt->shared); - mpnt->vm_private_data = (void *) 0; + mpnt->vm_private_data = NULL; insert_vm_struct(mm, mpnt); - mm->total_vm = (mpnt->vm_end 
- mpnt->vm_start) >> PAGE_SHIFT; + mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> MMUPAGE_SHIFT; } for (i = 0 ; i < MAX_ARG_PAGES ; i++) { struct page *page = bprm->page[i]; if (page) { + int min_subpfn; + + if (mpnt->vm_start <= stack_base) + min_subpfn = 0; + else + min_subpfn = (mpnt->vm_start - stack_base)/MMUPAGE_SIZE; bprm->page[i] = NULL; - put_dirty_page(current, page, stack_base, - mpnt->vm_page_prot); + put_dirty_page(current, page, min_subpfn, stack_base, + mpnt->vm_page_prot); } stack_base += PAGE_SIZE; } @@ -1122,6 +1141,7 @@ out_file: allow_write_access(bprm.file); fput(bprm.file); } + return retval; } diff -prauN linux-2.5.70-bk10/fs/ext2/dir.c pgcl-2.5.70-bk10-1/fs/ext2/dir.c --- linux-2.5.70-bk10/fs/ext2/dir.c 2003-05-26 18:00:40.000000000 -0700 +++ pgcl-2.5.70-bk10-1/fs/ext2/dir.c 2003-06-05 09:44:34.000000000 -0700 @@ -432,15 +432,15 @@ int ext2_add_link (struct dentry *dentry struct inode *dir = dentry->d_parent->d_inode; const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; - unsigned chunk_size = ext2_chunk_size(dir); - unsigned reclen = EXT2_DIR_REC_LEN(namelen); - unsigned short rec_len, name_len; + unsigned long chunk_size = ext2_chunk_size(dir); + unsigned long reclen = EXT2_DIR_REC_LEN(namelen); + unsigned long rec_len, name_len; struct page *page = NULL; ext2_dirent * de; unsigned long npages = dir_pages(dir); unsigned long n; char *kaddr; - unsigned from, to; + unsigned long from, to; int err; /* diff -prauN linux-2.5.70-bk10/fs/file_table.c pgcl-2.5.70-bk10-1/fs/file_table.c --- linux-2.5.70-bk10/fs/file_table.c 2003-05-26 18:00:21.000000000 -0700 +++ pgcl-2.5.70-bk10-1/fs/file_table.c 2003-06-05 09:44:34.000000000 -0700 @@ -279,7 +279,7 @@ void __init files_init(unsigned long mem * Per default don't use more than 10% of our memory for files. */ - n = (mempages * (PAGE_SIZE / 1024)) / 10; + n = (mempages * (MMUPAGE_SIZE / 1024)) / 10; files_stat.max_files = n; if (files_stat.max_files < NR_FILE) files_stat.max_files = NR_FILE; diff -prauN linux-2.5.70-bk10/fs/inode.c pgcl-2.5.70-bk10-1/fs/inode.c --- linux-2.5.70-bk10/fs/inode.c 2003-05-26 18:01:00.000000000 -0700 +++ pgcl-2.5.70-bk10-1/fs/inode.c 2003-06-05 09:44:34.000000000 -0700 @@ -1310,7 +1310,11 @@ void __init inode_init(unsigned long mem for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++) init_waitqueue_head(&i_wait_queue_heads[i].wqh); +#if PAGE_SHIFT <= 14 mempages >>= (14 - PAGE_SHIFT); +#else + mempages <<= PAGE_SHIFT - 14; +#endif mempages *= sizeof(struct hlist_head); for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++) ; diff -prauN linux-2.5.70-bk10/fs/proc/base.c pgcl-2.5.70-bk10-1/fs/proc/base.c --- linux-2.5.70-bk10/fs/proc/base.c 2003-05-26 18:00:41.000000000 -0700 +++ pgcl-2.5.70-bk10-1/fs/proc/base.c 2003-06-05 09:44:34.000000000 -0700 @@ -32,6 +32,7 @@ #include #include #include +#include /* * For hysterical raisins we keep the same inumbers as in the old procfs. 
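Two sizing heuristics in the fs/file_table.c and fs/inode.c hunks above are easy to misread. files_init() keeps MMUPAGE_SIZE because the mempages it is handed presumably still counts 4KB mmupages; and with page clustering PAGE_SHIFT can exceed 14 (32KB pages give PAGE_SHIFT == 15), so inode_init() can no longer shift right by (14 - PAGE_SHIFT) unconditionally. An illustrative restatement of why the #if/#else is needed:

	/*
	 * old code:	mempages >>= (14 - PAGE_SHIFT);
	 * with PAGE_SHIFT == 15 that would be a shift by -1, hence:
	 *
	 *	#if PAGE_SHIFT <= 14	->  mempages >>= 14 - PAGE_SHIFT;
	 *	#else			->  mempages <<= PAGE_SHIFT - 14;
	 */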
@@ -456,29 +457,37 @@ static ssize_t mem_read(struct file * fi size_t count, loff_t *ppos) { struct task_struct *task = proc_task(file->f_dentry->d_inode); - char *page; + char *kbuf; + struct page *page; unsigned long src = *ppos; int ret = -ESRCH; struct mm_struct *mm; - if (!MAY_PTRACE(task)) + if (0 && !MAY_PTRACE(task)) goto out; ret = -ENOMEM; - page = (char *)__get_free_page(GFP_USER); - if (!page) + page = alloc_page(GFP_HIGHUSER); + if (!page) { + printk("alloc_page() failed in mem_read()\n"); goto out; + } + kbuf = kmap(page); ret = 0; mm = get_task_mm(task); - if (!mm) + if (!mm) { + printk("get_task_mm() failed in mem_read()\n"); goto out_free; + } +#if 0 ret = -EIO; if (file->private_data != (void*)((long)current->self_exec_id)) goto out_put; +#endif ret = 0; @@ -486,14 +495,16 @@ static ssize_t mem_read(struct file * fi int this_len, retval; this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; - retval = access_process_vm(task, src, page, this_len, 0); + retval = access_process_vm(task, src, kbuf, this_len, 0); if (!retval) { + printk("access_process_vm() failed in mem_read()\n"); if (!ret) ret = -EIO; break; } - if (copy_to_user(buf, page, retval)) { + if (copy_to_user(buf, kbuf, retval)) { + printk("copy_to_user() failed in mem_read()\n"); ret = -EFAULT; break; } @@ -505,15 +516,17 @@ static ssize_t mem_read(struct file * fi } *ppos = src; -out_put: mmput(mm); out_free: - free_page((unsigned long) page); + kunmap(page); + __free_page(page); out: return ret; } +#if 0 #define mem_write NULL +#endif #ifndef mem_write /* This is a security hazard */ @@ -521,26 +534,28 @@ static ssize_t mem_write(struct file * f size_t count, loff_t *ppos) { int copied = 0; - char *page; + char *kbuf; + struct page *page; struct task_struct *task = proc_task(file->f_dentry->d_inode); unsigned long dst = *ppos; - if (!MAY_PTRACE(task)) + if (0 && !MAY_PTRACE(task)) return -ESRCH; - page = (char *)__get_free_page(GFP_USER); + page = alloc_page(GFP_HIGHUSER); if (!page) return -ENOMEM; + kbuf = kmap(page); while (count > 0) { int this_len, retval; this_len = (count > PAGE_SIZE) ? 
PAGE_SIZE : count; - if (copy_from_user(page, buf, this_len)) { + if (copy_from_user(kbuf, buf, this_len)) { copied = -EFAULT; break; } - retval = access_process_vm(task, dst, page, this_len, 1); + retval = access_process_vm(task, dst, kbuf, this_len, 1); if (!retval) { if (!copied) copied = -EIO; @@ -552,7 +567,8 @@ static ssize_t mem_write(struct file * f count -= retval; } *ppos = dst; - free_page((unsigned long) page); + kunmap(page); + __free_page(page); return copied; } #endif diff -prauN linux-2.5.70-bk10/fs/proc/proc_misc.c pgcl-2.5.70-bk10-1/fs/proc/proc_misc.c --- linux-2.5.70-bk10/fs/proc/proc_misc.c 2003-05-26 18:00:24.000000000 -0700 +++ pgcl-2.5.70-bk10-1/fs/proc/proc_misc.c 2003-06-05 09:44:34.000000000 -0700 @@ -241,7 +241,7 @@ static int meminfo_read_proc(char *page, K(ps.nr_writeback), K(ps.nr_mapped), K(ps.nr_slab), - K(committed), + committed << (MMUPAGE_SHIFT - 10), K(ps.nr_page_table_pages), vmtot, vmi.used, diff -prauN linux-2.5.70-bk10/fs/proc/task_mmu.c pgcl-2.5.70-bk10-1/fs/proc/task_mmu.c --- linux-2.5.70-bk10/fs/proc/task_mmu.c 2003-05-26 18:00:24.000000000 -0700 +++ pgcl-2.5.70-bk10-1/fs/proc/task_mmu.c 2003-06-05 09:44:34.000000000 -0700 @@ -56,7 +56,7 @@ int task_statm(struct mm_struct *mm, int *resident = mm->rss; for (vma = mm->mmap; vma; vma = vma->vm_next) { - int pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + int pages = (vma->vm_end - vma->vm_start) >> MMUPAGE_SHIFT; size += pages; if (is_vm_hugetlb_page(vma)) { diff -prauN linux-2.5.70-bk10/include/asm-alpha/page.h pgcl-2.5.70-bk10-1/include/asm-alpha/page.h --- linux-2.5.70-bk10/include/asm-alpha/page.h 2003-05-26 18:00:45.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-alpha/page.h 2003-06-05 09:44:34.000000000 -0700 @@ -98,6 +98,8 @@ extern __inline__ int get_order(unsigned #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif /* _ALPHA_PAGE_H */ diff -prauN linux-2.5.70-bk10/include/asm-arm/page.h pgcl-2.5.70-bk10-1/include/asm-arm/page.h --- linux-2.5.70-bk10/include/asm-arm/page.h 2003-05-26 18:00:25.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-arm/page.h 2003-06-05 09:44:34.000000000 -0700 @@ -181,6 +181,8 @@ static inline int get_order(unsigned lon #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif diff -prauN linux-2.5.70-bk10/include/asm-cris/page.h pgcl-2.5.70-bk10-1/include/asm-cris/page.h --- linux-2.5.70-bk10/include/asm-cris/page.h 2003-05-26 18:00:27.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-cris/page.h 2003-06-05 09:44:34.000000000 -0700 @@ -109,6 +109,8 @@ extern unsigned long dram_start, dram_en #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif /* _CRIS_PAGE_H */ diff -prauN linux-2.5.70-bk10/include/asm-generic/page.h pgcl-2.5.70-bk10-1/include/asm-generic/page.h --- linux-2.5.70-bk10/include/asm-generic/page.h 1969-12-31 16:00:00.000000000 -0800 +++ pgcl-2.5.70-bk10-1/include/asm-generic/page.h 2003-06-05 09:44:34.000000000 -0700 @@ -0,0 +1,11 @@ +#ifndef _ASM_GENERIC_PAGE_H +#define _ASM_GENERIC_PAGE_H + +#define MMUPAGE_SHIFT PAGE_SHIFT +#define MMUPAGE_SIZE PAGE_SIZE +#define MMUPAGE_MASK PAGE_MASK +#define MMUPAGE_ALIGN(x) PAGE_ALIGN(x) +#define PAGE_MMUSHIFT 0 +#define PAGE_MMUCOUNT 1 + +#endif /* _ASM_GENERIC_PAGE_H */ diff -prauN 
linux-2.5.70-bk10/include/asm-generic/rmap.h pgcl-2.5.70-bk10-1/include/asm-generic/rmap.h --- linux-2.5.70-bk10/include/asm-generic/rmap.h 2003-05-26 18:00:41.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-generic/rmap.h 2003-06-05 09:44:34.000000000 -0700 @@ -15,7 +15,7 @@ * offset of the page table entry within the page table page * * For CONFIG_HIGHPTE, we need to represent the address of a pte in a - * scalar pte_addr_t. The pfn of the pte's page is shifted left by PAGE_SIZE + * scalar pte_addr_t. The pfn of the pte's page is shifted left by MMUPAGE_SIZE * bits and is then ORed with the byte offset of the pte within its page. * * For CONFIG_HIGHMEM4G, the pte_addr_t is 32 bits. 20 for the pfn, 12 for @@ -26,7 +26,15 @@ */ #include -static inline void pgtable_add_rmap(struct page * page, struct mm_struct * mm, unsigned long address) +/* + * This looks bizarre, but it's actually meaningful. + */ +#define MMUPAGES_MAPPED_PER_PTE_PAGE (PTRS_PER_PTE * PAGE_MMUCOUNT) +#define VIRT_AREA_MAPPED_PER_PTE_PAGE \ + (MMUPAGES_MAPPED_PER_PTE_PAGE*MMUPAGE_SIZE) + +static inline void pgtable_add_rmap(struct page *page, struct mm_struct *mm, + unsigned long address) { #ifdef BROKEN_PPC_PTE_ALLOC_ONE /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... ;( */ @@ -35,13 +43,38 @@ static inline void pgtable_add_rmap(stru if (!mem_init_done) return; #endif + + /* rmap's accounting is already set up */ + if (page->mapping) { + /* + * address is presumably large. if smaller, overflow traps + * the error; if larger, check the distance + */ + WARN_ON(address - page->index >= VIRT_AREA_MAPPED_PER_PTE_PAGE); + return; + } + page->mapping = (void *)mm; - page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); + page->index = address & ~(VIRT_AREA_MAPPED_PER_PTE_PAGE - 1); inc_page_state(nr_page_table_pages); } static inline void pgtable_remove_rmap(struct page * page) { + /* we're not down to a unique reference */ + if (PAGE_MMUCOUNT > 1) { + if (atomic_read(&page->count) > 1) + return; + + /* + * A zero reference count should not be possible; + * put_page() should have freed the things outright + * so this essentially means use-after-free is happening. 
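The "bizarre but meaningful" macros above fix the granularity of the pte-page rmap: one pte page now maps PAGE_MMUCOUNT hardware pte tables' worth of user virtualspace, and page->index is aligned to that span. A worked example under one possible configuration (PAE, so PTRS_PER_PTE == 512, with PAGE_MMUCOUNT == 8):

	/*
	 *	MMUPAGES_MAPPED_PER_PTE_PAGE  == 512 * 8    == 4096
	 *	VIRT_AREA_MAPPED_PER_PTE_PAGE == 4096 * 4KB == 16MB
	 *
	 * so pgtable_add_rmap() records a 16MB-aligned page->index, and
	 * ptep_to_address() below recovers the exact user address as
	 *
	 *	page->index + MMUPAGE_SIZE *
	 *		(((unsigned long)ptep / sizeof(pte_t))
	 *			% MMUPAGES_MAPPED_PER_PTE_PAGE)
	 */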
+ */ + else + BUG_ON(atomic_read(&page->count) <= 0); + } + page->mapping = NULL; page->index = 0; dec_page_state(nr_page_table_pages); @@ -55,18 +88,19 @@ static inline struct mm_struct * ptep_to static inline unsigned long ptep_to_address(pte_t * ptep) { - struct page * page = kmap_atomic_to_page(ptep); - unsigned long low_bits; - low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE; - return page->index + low_bits; + struct page *page = kmap_atomic_to_page(ptep); + unsigned long swpage_voff = ((unsigned long)ptep)/sizeof(pte_t); + swpage_voff %= MMUPAGES_MAPPED_PER_PTE_PAGE; + return page->index + MMUPAGE_SIZE*swpage_voff; } #ifdef CONFIG_HIGHPTE static inline pte_addr_t ptep_to_paddr(pte_t *ptep) { - pte_addr_t paddr; - paddr = ((pte_addr_t)page_to_pfn(kmap_atomic_to_page(ptep))) << PAGE_SHIFT; - return paddr + (pte_addr_t)((unsigned long)ptep & ~PAGE_MASK); + unsigned long pfn, subpfn, vaddr = (unsigned long)ptep; + subpfn = (vaddr/MMUPAGE_SIZE) % PAGE_MMUCOUNT; + pfn = page_to_pfn(kmap_atomic_to_page(ptep)) + subpfn; + return MMUPAGE_SIZE*((pte_addr_t)pfn) + (vaddr & ~MMUPAGE_MASK); } #else static inline pte_addr_t ptep_to_paddr(pte_t *ptep) diff -prauN linux-2.5.70-bk10/include/asm-generic/tlb.h pgcl-2.5.70-bk10-1/include/asm-generic/tlb.h --- linux-2.5.70-bk10/include/asm-generic/tlb.h 2003-05-26 18:00:19.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-generic/tlb.h 2003-06-05 09:48:26.000000000 -0700 @@ -46,6 +46,16 @@ struct mmu_gather { /* Users of the generic TLB shootdown code must declare this storage space. */ DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); +static inline struct mm_struct *tlb_mm(struct mmu_gather *tlb) +{ + return tlb->mm; +} + +static inline void tlb_inc_freed(struct mmu_gather *tlb) +{ + tlb->freed++; +} + /* tlb_gather_mmu * Return a pointer to an initialized struct mmu_gather. */ diff -prauN linux-2.5.70-bk10/include/asm-i386/dma-mapping.h pgcl-2.5.70-bk10-1/include/asm-i386/dma-mapping.h --- linux-2.5.70-bk10/include/asm-i386/dma-mapping.h 2003-05-26 18:01:03.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-i386/dma-mapping.h 2003-06-05 09:44:34.000000000 -0700 @@ -51,7 +51,7 @@ dma_map_page(struct device *dev, struct size_t size, enum dma_data_direction direction) { BUG_ON(direction == DMA_NONE); - return (dma_addr_t)(page_to_pfn(page)) * PAGE_SIZE + offset; + return (dma_addr_t)(page_to_pfn(page)) * MMUPAGE_SIZE + offset; } static inline void diff -prauN linux-2.5.70-bk10/include/asm-i386/fixmap.h pgcl-2.5.70-bk10-1/include/asm-i386/fixmap.h --- linux-2.5.70-bk10/include/asm-i386/fixmap.h 2003-05-26 18:00:21.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-i386/fixmap.h 2003-06-05 09:44:34.000000000 -0700 @@ -28,21 +28,55 @@ * addresses. The point is to have a constant address at * compile time, but to set the physical address only * in the boot process. We allocate these special addresses - * from the end of virtual memory (0xfffff000) backwards. + * from the end of virtual memory (-PAGE_SIZE) backwards. * Also this lets us do fail-safe vmalloc(), we * can guarantee that these special addresses and * vmalloc()-ed addresses never overlap. * - * these 'compile-time allocated' memory buffers are - * fixed-size 4k pages. (or larger if used with an increment - * highger than 1) use fixmap_set(idx,phys) to associate - * physical memory with fixmap indices. + * These 'compile-time allocated' memory buffers are + * fixed-size MMUPAGE_SIZE-size pages. Use + * set_fixmap(idx, phys, prot) to associate physical memory with + * fixmap indices. 
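To see what the new ptep_to_address() above computes, here is a userspace restatement of the same arithmetic. Illustrative only; it assumes a non-PAE layout (PTRS_PER_PTE = 1024, 4-byte ptes) and a clustering factor of 4, none of which is taken from the patch itself:

#include <stdio.h>

#define MMUPAGE_SIZE                    4096UL
#define PAGE_MMUCOUNT                   4UL
#define PTRS_PER_PTE                    1024UL
#define MMUPAGES_MAPPED_PER_PTE_PAGE    (PTRS_PER_PTE * PAGE_MMUCOUNT)

/* same arithmetic as ptep_to_address(): the pte's slot number within the
 * clustered pte page, times MMUPAGE_SIZE, added to page->index */
static unsigned long ptep_to_address(unsigned long ptep, unsigned long index)
{
        unsigned long swpage_voff = ptep / sizeof(unsigned int); /* sizeof(pte_t) */

        swpage_voff %= MMUPAGES_MAPPED_PER_PTE_PAGE;
        return index + MMUPAGE_SIZE * swpage_voff;
}

int main(void)
{
        /* hypothetical kmapped pte 0x40 bytes into its pte page, whose
         * page->index was set to 0x08000000 by pgtable_add_rmap() */
        printf("%#lx\n", ptep_to_address(0xffe00040UL, 0x08000000UL));
        return 0;
}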
* * TLB entries of such buffers will not be flushed across * task switches. + * + * Right now we initialize only a single pte table. It can be extended + * easily, subsequent pte tables have to be allocated in one physical + * chunk of RAM. + */ +#define PKMAP_NR(virt) (((virt) - PKMAP_BASE) >> PAGE_SHIFT) +#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) +#define LAST_PKMAP 1024 +#define LAST_PKMAP_MASK (LAST_PKMAP-1) + +/* + * FIXADDR stuff is used by highmem.c for kmapping, and various + * drivers for system devices for their io mappings. + * + * Leave one empty page between vmalloc'ed areas and + * the start of the fixmap. + * + * leave a hole of exactly PAGE_SIZE at the top for CONFIG_HIGHMEM + * this makes things easier on core code; the math works out funny + * and I didn't care enough to conserve PAGE_SIZE - MMUPAGE_SIZE + * worth of virtualspace. */ +#define FIXADDR_TOP (-PAGE_SIZE) +#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << MMUPAGE_SHIFT) +#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE) + +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << MMUPAGE_SHIFT)) +#define __virt_to_fix(x) ((FIXADDR_TOP - ((x) & MMUPAGE_MASK)) >> MMUPAGE_SHIFT) + enum fixed_addresses { - FIX_HOLE, +#ifdef CONFIG_HIGHMEM + /* reserved pte's for temporary kernel mappings */ + FIX_KMAP_BEGIN = 1, + FIX_KMAP_END = FIX_KMAP_BEGIN+((KM_TYPE_NR*NR_CPUS+1)*PAGE_MMUCOUNT)-1, + FIX_PKMAP_BEGIN, + FIX_PKMAP_END = FIX_PKMAP_BEGIN + (LAST_PKMAP+1)*PAGE_MMUCOUNT - 1, +#endif FIX_VSYSCALL, #ifdef CONFIG_X86_LOCAL_APIC FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ @@ -63,10 +97,6 @@ enum fixed_addresses { #ifdef CONFIG_X86_CYCLONE_TIMER FIX_CYCLONE_TIMER, /*cyclone timer register*/ #endif -#ifdef CONFIG_HIGHMEM - FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ - FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, -#endif #ifdef CONFIG_ACPI_BOOT FIX_ACPI_BEGIN, FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, @@ -94,19 +124,6 @@ extern void __set_fixmap (enum fixed_add #define clear_fixmap(idx) \ __set_fixmap(idx, 0, __pgprot(0)) -/* - * used by vmalloc.c. - * - * Leave one empty page between vmalloc'ed areas and - * the start of the fixmap. - */ -#define FIXADDR_TOP (0xfffff000UL) -#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) -#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE) - -#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) -#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) - extern void __this_fixmap_does_not_exist(void); /* @@ -133,8 +150,13 @@ static inline unsigned long fix_to_virt( static inline unsigned long virt_to_fix(const unsigned long vaddr) { - BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); + if (vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START) { + printk("bad vaddr in virt_to_fix 0x%lx\n", vaddr); + BUG(); + } return __virt_to_fix(vaddr); } +#define PKMAP_BASE fix_to_virt(FIX_PKMAP_END) + #endif diff -prauN linux-2.5.70-bk10/include/asm-i386/highmem.h pgcl-2.5.70-bk10-1/include/asm-i386/highmem.h --- linux-2.5.70-bk10/include/asm-i386/highmem.h 2003-05-26 18:00:23.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-i386/highmem.h 2003-06-05 09:44:34.000000000 -0700 @@ -34,23 +34,8 @@ extern pte_t *pkmap_page_table; extern void kmap_init(void); -/* - * Right now we initialize only a single pte table. It can be extended - * easily, subsequent pte tables have to be allocated in one physical - * chunk of RAM. 
- */ -#define PKMAP_BASE (0xff800000UL) -#ifdef CONFIG_X86_PAE -#define LAST_PKMAP 512 -#else -#define LAST_PKMAP 1024 -#endif -#define LAST_PKMAP_MASK (LAST_PKMAP-1) -#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) -#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) - -extern void * FASTCALL(kmap_high(struct page *page)); -extern void FASTCALL(kunmap_high(struct page *page)); +void *FASTCALL(kmap_high(struct page *page)); +void FASTCALL(kunmap_high(struct page *page)); void *kmap(struct page *page); void kunmap(struct page *page); diff -prauN linux-2.5.70-bk10/include/asm-i386/io.h pgcl-2.5.70-bk10-1/include/asm-i386/io.h --- linux-2.5.70-bk10/include/asm-i386/io.h 2003-05-26 18:01:00.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-i386/io.h 2003-06-05 09:44:34.000000000 -0700 @@ -69,7 +69,7 @@ * this function */ -static inline unsigned long virt_to_phys(volatile void * address) +static inline unsigned long virt_to_phys(volatile void *address) { return __pa(address); } @@ -87,7 +87,7 @@ static inline unsigned long virt_to_phys * this function */ -static inline void * phys_to_virt(unsigned long address) +static inline void *phys_to_virt(unsigned long address) { return __va(address); } @@ -95,9 +95,9 @@ static inline void * phys_to_virt(unsign /* * Change "struct page" to physical address. */ -#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) +#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << MMUPAGE_SHIFT) -extern void * __ioremap(unsigned long offset, unsigned long size, unsigned long flags); +void *__ioremap(unsigned long offset, unsigned long size, unsigned long flags); /** * ioremap - map bus memory into CPU space @@ -111,21 +111,33 @@ extern void * __ioremap(unsigned long of * address. */ -static inline void * ioremap (unsigned long offset, unsigned long size) +static inline void *ioremap(unsigned long offset, unsigned long size) { return __ioremap(offset, size, 0); } -extern void * ioremap_nocache (unsigned long offset, unsigned long size); -extern void iounmap(void *addr); +void *ioremap_nocache(unsigned long offset, unsigned long size); +void iounmap(void *addr); + /* * bt_ioremap() and bt_iounmap() are for temporary early boot-time * mappings, before the real ioremap() is functional. * A boot-time mapping is currently limited to at most 16 pages. */ -extern void *bt_ioremap(unsigned long offset, unsigned long size); -extern void bt_iounmap(void *addr, unsigned long size); +void *bt_ioremap(unsigned long offset, unsigned long size); +void bt_iounmap(void *addr, unsigned long size); + +#ifdef CONFIG_BOOT_IOREMAP +/* + * boot_ioremap() is an "even earlier" ioremap, primarily for use + * when the pagetable formats used during early boot differ from + * those used at runtime, e.g. PAE booting off of non-PAE pagetables. + * Don't use this unless you _really_ know what you're doing. + * -- wli + */ +void *boot_ioremap(unsigned long paddr, unsigned long size); +#endif /* CONFIG_BOOT_IOREMAP */ /* * ISA I/O bus memory addresses are 1:1 with the physical address. 
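One point worth spelling out about the io.h hunk above: page_to_phys() now shifts by MMUPAGE_SHIFT because pfns stay in 4KB units throughout, while a struct page describes PAGE_MMUCOUNT consecutive pfns. A sketch with assumed numbers (not taken from the patch):

#include <stdio.h>

#define MMUPAGE_SHIFT   12
#define MMUPAGE_SIZE    (1UL << MMUPAGE_SHIFT)
#define PAGE_MMUCOUNT   4UL     /* assumed CONFIG_PAGE_CLUSTER = 2 */

int main(void)
{
        unsigned long mapnr = 300;                      /* index into mem_map[] */
        unsigned long pfn = mapnr * PAGE_MMUCOUNT;      /* page_to_pfn() */
        unsigned long phys = pfn << MMUPAGE_SHIFT;      /* page_to_phys() */

        printf("mem_map[%lu] covers pfns %lu-%lu, phys %#lx-%#lx\n",
                mapnr, pfn, pfn + PAGE_MMUCOUNT - 1,
                phys, phys + PAGE_MMUCOUNT * MMUPAGE_SIZE - 1);
        return 0;
}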
diff -prauN linux-2.5.70-bk10/include/asm-i386/io_apic.h pgcl-2.5.70-bk10-1/include/asm-i386/io_apic.h --- linux-2.5.70-bk10/include/asm-i386/io_apic.h 2003-05-26 18:00:42.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-i386/io_apic.h 2003-06-05 09:44:34.000000000 -0700 @@ -17,7 +17,7 @@ #define IO_APIC_BASE(idx) \ ((volatile int *)(__fix_to_virt(FIX_IO_APIC_BASE_0 + idx) \ - + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK))) + + (mp_ioapics[idx].mpc_apicaddr & ~MMUPAGE_MASK))) /* * The structure of the IO-APIC: diff -prauN linux-2.5.70-bk10/include/asm-i386/mmzone.h pgcl-2.5.70-bk10-1/include/asm-i386/mmzone.h --- linux-2.5.70-bk10/include/asm-i386/mmzone.h 2003-05-26 18:00:40.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-i386/mmzone.h 2003-06-05 09:44:34.000000000 -0700 @@ -11,6 +11,7 @@ #ifdef CONFIG_DISCONTIGMEM extern struct pglist_data *node_data[]; +extern unsigned long node_start_pfn[], node_end_pfn[]; /* * Following are macros that are specific to this numa platform. @@ -22,18 +23,18 @@ extern struct pglist_data *node_data[]; #define alloc_bootmem_low(x) \ __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0) #define alloc_bootmem_pages(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) + __alloc_bootmem_node(NODE_DATA(0), (x), MMUPAGE_SIZE, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_low_pages(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) + __alloc_bootmem_node(NODE_DATA(0), (x), MMUPAGE_SIZE, 0) #define alloc_bootmem_node(ignore, x) \ __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_pages_node(ignore, x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) + __alloc_bootmem_node(NODE_DATA(0), (x), MMUPAGE_SIZE, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_low_pages_node(ignore, x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) + __alloc_bootmem_node(NODE_DATA(0), (x), MMUPAGE_SIZE, 0) #define node_size(nid) (node_data[nid]->node_size) -#define node_localnr(pfn, nid) ((pfn) - node_data[nid]->node_start_pfn) +#define node_localnr(pfn, nid) (((pfn) - node_data[nid]->node_start_pfn) / PAGE_MMUCOUNT) /* * Following are macros that each numa implmentation must define. @@ -42,25 +43,41 @@ extern struct pglist_data *node_data[]; /* * Given a kernel address, find the home node of the underlying memory. */ -#define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) +#define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> MMUPAGE_SHIFT) /* * Return a pointer to the node data for node n. */ #define NODE_DATA(nid) (node_data[nid]) +/* + * These names clash. I blame mbligh. + */ #define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) + +/* + * pgdat->node_size is calculated from zone_sizes[], which is in + * units of PAGE_SIZE. I don't trust this. 
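The node_localnr() change above is the discontigmem flavor of the same idea: pfn deltas are divided by the clustering factor before they index a node's mem_map. Illustrative sketch with made-up numbers:

#include <stdio.h>

#define PAGE_MMUCOUNT   4UL     /* assumed clustering factor */

int main(void)
{
        unsigned long node_start_pfn = 0x40000; /* hypothetical node base */
        unsigned long pfn = 0x40013;            /* an mmupage pfn on that node */
        unsigned long localnr = (pfn - node_start_pfn) / PAGE_MMUCOUNT;

        /* pfns 0x40010-0x40013 all share node-local mem_map entry 4 */
        printf("pfn %#lx -> node-local mem_map index %lu\n", pfn, localnr);
        return 0;
}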
+ */ #define node_end_pfn(nid) \ ({ \ pg_data_t *__pgdat = NODE_DATA(nid); \ - __pgdat->node_start_pfn + __pgdat->node_size; \ + __pgdat->node_start_pfn + __pgdat->node_size*PAGE_MMUCOUNT; \ }) #define local_mapnr(kvaddr) \ ({ \ - unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \ - (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \ + unsigned long __pfn = __pa(kvaddr) >> MMUPAGE_SHIFT; \ + (__pfn - node_start_pfn(pfn_to_nid(__pfn)))/PAGE_MMUCOUNT; \ +}) + +#define local_pfn(pg) \ +({ \ + struct page *__pg = pg; \ + unsigned long __nr; \ + __nr = (unsigned long)(__pg - page_zone(__pg)->zone_mem_map); \ + __nr*PAGE_MMUCOUNT; \ }) #define kern_addr_valid(kaddr) \ @@ -81,10 +98,9 @@ extern struct pglist_data *node_data[]; ({ \ struct page *__page = pg; \ struct zone *__zone = page_zone(__page); \ - (unsigned long)(__page - __zone->zone_mem_map) \ - + __zone->zone_start_pfn; \ + local_pfn(__page) + __zone->zone_start_pfn; \ }) -#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> MMUPAGE_SHIFT)) /* * pfn_valid should be made as fast as possible, and the current definition * is valid for machines that are NUMA, but still contiguous, which is what diff -prauN linux-2.5.70-bk10/include/asm-i386/numaq.h pgcl-2.5.70-bk10-1/include/asm-i386/numaq.h --- linux-2.5.70-bk10/include/asm-i386/numaq.h 2003-05-26 18:00:40.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-i386/numaq.h 2003-06-05 09:44:34.000000000 -0700 @@ -28,7 +28,7 @@ #ifdef CONFIG_X86_NUMAQ -#define MAX_NUMNODES 8 +#define MAX_NUMNODES 16 extern void get_memcfg_numaq(void); #define get_memcfg_numa() get_memcfg_numaq() @@ -159,7 +159,7 @@ struct sys_cfg_data { static inline unsigned long *get_zholes_size(int nid) { - return 0; + return NULL; } #endif /* CONFIG_X86_NUMAQ */ #endif /* NUMAQ_H */ diff -prauN linux-2.5.70-bk10/include/asm-i386/page.h pgcl-2.5.70-bk10-1/include/asm-i386/page.h --- linux-2.5.70-bk10/include/asm-i386/page.h 2003-05-26 18:00:24.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-i386/page.h 2003-06-05 09:44:34.000000000 -0700 @@ -1,13 +1,36 @@ #ifndef _I386_PAGE_H #define _I386_PAGE_H -/* PAGE_SHIFT determines the page size */ -#define PAGE_SHIFT 12 +#include /* for CONFIG_PAGE_CLUSTER */ + +/* + * One mmupage is represented by one Page Table Entry at the MMU level, + * and corresponds to one page at the user process level: its size is + * the same as param.h EXEC_PAGESIZE (for getpagesize(2) and mmap(2)). + */ +#define MMUPAGE_SHIFT 12 +#define MMUPAGE_SIZE (1 << MMUPAGE_SHIFT) +#define MMUPAGE_MASK (~(MMUPAGE_SIZE-1)) + +/* + * 2**N adjacent mmupages may be clustered to make up one kernel page. + * Reasonable and tested values for PAGE_MMUSHIFT are 0 (4k page), + * 1 (8k page), 2 (16k page), 3 (32k page). Higher values will not + * work without further changes e.g. to unsigned short b_size. + */ +#define PAGE_MMUSHIFT CONFIG_PAGE_CLUSTER +#define PAGE_MMUCOUNT (1 << PAGE_MMUSHIFT) + +/* + * One kernel page is represented by one struct page (see mm.h), + * and is the kernel's principal unit of memory allocation. 
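Restating the comments above in numbers: MMUPAGE_* stays tied to the hardware 4KB pte, while the PAGE_* constants scale with CONFIG_PAGE_CLUSTER. A throwaway table generator, illustrative only:

#include <stdio.h>

#define MMUPAGE_SHIFT   12
#define MMUPAGE_SIZE    (1UL << MMUPAGE_SHIFT)

int main(void)
{
        int cluster;

        for (cluster = 0; cluster <= 3; cluster++) {
                unsigned long page_mmucount = 1UL << cluster;
                unsigned long page_shift = cluster + MMUPAGE_SHIFT;
                unsigned long page_size = 1UL << page_shift;

                printf("PAGE_CLUSTER=%d: PAGE_MMUCOUNT=%lu PAGE_SHIFT=%lu "
                        "PAGE_SIZE=%luKB (MMUPAGE_SIZE stays %luKB)\n",
                        cluster, page_mmucount, page_shift,
                        page_size >> 10, MMUPAGE_SIZE >> 10);
        }
        return 0;
}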
+ */ +#define PAGE_SHIFT (PAGE_MMUSHIFT + MMUPAGE_SHIFT) #define PAGE_SIZE (1UL << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) -#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT) +#define LARGE_PAGE_SIZE (1 << PMD_SHIFT) #ifdef __KERNEL__ #ifndef __ASSEMBLY__ @@ -53,7 +76,7 @@ typedef struct { unsigned long pgd; } pg #define pte_val(x) ((x).pte_low) #define HPAGE_SHIFT 22 #endif -#define PTE_MASK PAGE_MASK +#define PTE_MASK MMUPAGE_MASK #ifdef CONFIG_HUGETLB_PAGE #define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) @@ -76,6 +99,7 @@ typedef struct { unsigned long pgprot; } /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) +#define MMUPAGE_ALIGN(addr) (((addr)+MMUPAGE_SIZE-1)&MMUPAGE_MASK) /* * This handles the memory map.. We could make this a config @@ -123,18 +147,22 @@ static __inline__ int get_order(unsigned #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) #define VMALLOC_RESERVE ((unsigned long)__VMALLOC_RESERVE) -#define MAXMEM (-__PAGE_OFFSET-__VMALLOC_RESERVE) +#define __MAXMEM \ + ((VMALLOC_START-2*MMUPAGE_SIZE-__PAGE_OFFSET) & LARGE_PAGE_MASK) +#define MAXMEM \ + __pa((VMALLOC_START-2*MMUPAGE_SIZE) & LARGE_PAGE_MASK) #define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) -#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) +#define pfn_to_kaddr(pfn) __va(MMUPAGE_SIZE*(pfn)) #ifndef CONFIG_DISCONTIGMEM -#define pfn_to_page(pfn) (mem_map + (pfn)) -#define page_to_pfn(page) ((unsigned long)((page) - mem_map)) -#define pfn_valid(pfn) ((pfn) < max_mapnr) +#define pfn_to_page(pfn) (&mem_map[(pfn)/PAGE_MMUCOUNT]) +#define page_to_mapnr(page) ((unsigned long)((page) - mem_map)) +#define page_to_pfn(page) (PAGE_MMUCOUNT*page_to_mapnr(page)) +#define pfn_valid(pfn) ((pfn) < max_mapnr*PAGE_MMUCOUNT) #endif /* !CONFIG_DISCONTIGMEM */ -#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) +#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr)/MMUPAGE_SIZE) -#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr)/MMUPAGE_SIZE) #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) diff -prauN linux-2.5.70-bk10/include/asm-i386/pci.h pgcl-2.5.70-bk10-1/include/asm-i386/pci.h --- linux-2.5.70-bk10/include/asm-i386/pci.h 2003-05-26 18:00:39.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-i386/pci.h 2003-06-05 09:44:34.000000000 -0700 @@ -67,13 +67,13 @@ pci_dac_page_to_dma(struct pci_dev *pdev static __inline__ struct page * pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr) { - return pfn_to_page(dma_addr >> PAGE_SHIFT); + return pfn_to_page(dma_addr >> MMUPAGE_SHIFT); } static __inline__ unsigned long pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr) { - return (dma_addr & ~PAGE_MASK); + return dma_addr & ~PAGE_MASK; } static __inline__ void diff -prauN linux-2.5.70-bk10/include/asm-i386/pgalloc.h pgcl-2.5.70-bk10-1/include/asm-i386/pgalloc.h --- linux-2.5.70-bk10/include/asm-i386/pgalloc.h 2003-05-26 18:00:21.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-i386/pgalloc.h 2003-06-05 09:48:37.000000000 -0700 @@ -2,54 +2,101 @@ #define _I386_PGALLOC_H #include +#include #include #include #include #include /* for struct page */ +#include /* to make asm-generic/rmap.h happy */ +#include /* for pgtable_remove_rmap() */ +/* + * allocating and freeing a pmd is trivial: the 1-entry pmd is + * 
inside the pgd, so has no extra memory associated with it. + * (In the PAE case we free the pmds as part of the pgd.) + */ + +#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_free(x) do { } while (0) +#define __pmd_free_tlb(tlb,x) do { } while (0) +#define pgd_populate(mm, pmd, pte) BUG() + +#define check_pgt_cache() do { } while (0) #define pmd_populate_kernel(mm, pmd, pte) \ set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))) -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) -{ - set_pmd(pmd, __pmd(_PAGE_TABLE + - ((unsigned long long)page_to_pfn(pte) << - (unsigned long long) PAGE_SHIFT))); -} +struct mmu_gather; + /* * Allocate and free page tables. */ - -extern pgd_t *pgd_alloc(struct mm_struct *); -extern void pgd_free(pgd_t *pgd); - -extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); -extern struct page *pte_alloc_one(struct mm_struct *, unsigned long); +void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *page); +pgd_t *pgd_alloc(struct mm_struct *); +void pgd_free(pgd_t *pgd); +pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); static inline void pte_free_kernel(pte_t *pte) { free_page((unsigned long)pte); } -static inline void pte_free(struct page *pte) +static inline void pte_free(struct page *page) { - __free_page(pte); + put_page(page); } +#include -#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) +static inline struct page *pte_alloc_fresh(void) +{ + struct page *page = alloc_page(GFP_PTE); + if (page) { + clear_highpage(page); + BUG_ON(PagePTE(page)); + SetPagePTE(page); + } + return page; +} -/* - * allocating and freeing a pmd is trivial: the 1-entry pmd is - * inside the pgd, so has no extra memory associated with it. - * (In the PAE case we free the pmds as part of the pgd.) - */ +static inline struct page *pte_alloc_ready(void) +{ + struct mmu_gather *tlb = &per_cpu(mmu_gathers, get_cpu()); + struct page *page; -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) -#define pmd_free(x) do { } while (0) -#define __pmd_free_tlb(tlb,x) do { } while (0) -#define pgd_populate(mm, pmd, pte) BUG() + BUG_ON(tlb->nr_pte_ready < 0); + if (!tlb->nr_pte_ready) { + BUG_ON(tlb->nr_pte_active < 0); + BUG_ON(tlb->nr_nonpte < 0); + page = NULL; + } else { + int zone; + for (zone = MAX_ZONE_ID - 1; zone >= 0; --zone) { + if (!list_empty(&tlb->ready_list[zone])) + break; + } + + BUG_ON(zone < 0); + BUG_ON(list_empty(&tlb->ready_list[zone])); + + page = list_entry(tlb->ready_list[zone].next, struct page, list); + BUG_ON(PagePTE(page)); + SetPagePTE(page); + list_del(&page->list); + atomic_set(&page->count, 1); + tlb->ready_count[zone]--; + tlb->nr_pte_ready--; + BUG_ON(tlb->ready_count[zone] < 0); + BUG_ON(tlb->nr_pte_ready < 0); + } + put_cpu(); + return page; +} -#define check_pgt_cache() do { } while (0) +static inline struct page *pte_alloc_one(struct mm_struct *mm, + unsigned long address) +{ + struct page *page = pte_alloc_ready(); + return page ? 
page : pte_alloc_fresh(); +} #endif /* _I386_PGALLOC_H */ diff -prauN linux-2.5.70-bk10/include/asm-i386/pgtable-2level.h pgcl-2.5.70-bk10-1/include/asm-i386/pgtable-2level.h --- linux-2.5.70-bk10/include/asm-i386/pgtable-2level.h 2003-05-26 18:00:40.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-i386/pgtable-2level.h 2003-06-05 09:44:34.000000000 -0700 @@ -17,6 +17,7 @@ #define PTRS_PER_PTE 1024 +#ifndef __ASSEMBLY__ #define pte_ERROR(e) \ printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low) #define pmd_ERROR(e) \ @@ -49,7 +50,7 @@ static inline int pgd_present(pgd_t pgd) #define set_pgd(pgdptr, pgdval) (*(pgdptr) = pgdval) #define pgd_page(pgd) \ -((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +((unsigned long) __va(pgd_val(pgd) & MMUPAGE_MASK)) static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) { @@ -59,9 +60,11 @@ static inline pmd_t * pmd_offset(pgd_t * #define pte_same(a, b) ((a).pte_low == (b).pte_low) #define pte_page(x) pfn_to_page(pte_pfn(x)) #define pte_none(x) (!(x).pte_low) -#define pte_pfn(x) ((unsigned long)(((x).pte_low >> PAGE_SHIFT))) -#define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) -#define pfn_pmd(pfn, prot) __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) +#define pte_pfn(x) ((unsigned long)(((x).pte_low>>MMUPAGE_SHIFT))) +#define pfn_pte(pfn, prot) __pte(((pfn)<> PAGE_SHIFT) | - (pte.pte_high << (32 - PAGE_SHIFT)); + return (pte.pte_low >> MMUPAGE_SHIFT) | + (pte.pte_high << (32 - MMUPAGE_SHIFT)); } static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { pte_t pte; - pte.pte_high = page_nr >> (32 - PAGE_SHIFT); - pte.pte_low = (page_nr << PAGE_SHIFT) | pgprot_val(pgprot); + pte.pte_high = page_nr >> (32 - MMUPAGE_SHIFT); + pte.pte_low = (page_nr << MMUPAGE_SHIFT) | pgprot_val(pgprot); return pte; } static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) { - return __pmd(((unsigned long long)page_nr << PAGE_SHIFT) | pgprot_val(pgprot)); + return __pmd(((unsigned long long)page_nr << MMUPAGE_SHIFT) | pgprot_val(pgprot)); } /* @@ -123,6 +124,6 @@ static inline pmd_t pfn_pmd(unsigned lon #define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) }) #define PTE_FILE_MAX_BITS 32 -extern struct kmem_cache_s *pae_pgd_cachep; +#endif /* !__ASSEMBLY__ */ #endif /* _I386_PGTABLE_3LEVEL_H */ diff -prauN linux-2.5.70-bk10/include/asm-i386/pgtable.h pgcl-2.5.70-bk10-1/include/asm-i386/pgtable.h --- linux-2.5.70-bk10/include/asm-i386/pgtable.h 2003-05-26 18:00:45.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-i386/pgtable.h 2003-06-05 09:44:34.000000000 -0700 @@ -21,15 +21,28 @@ #include #endif -extern pgd_t swapper_pg_dir[1024]; -extern void paging_init(void); +#include +#include +#include /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. 
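Stepping back to the pgalloc.h/mmu_gather interplay a little earlier: the pagetable preconstruction works by parking just-freed pte pages on a per-cpu active list, migrating them to a ready list once the TLB flush has happened, and letting pte_alloc_one() reuse them before falling back to the page allocator. The sketch below is a deliberately simplified model of that flow (single list, no zones, no per-cpu data, no locking) and is not how the patch is literally structured:

#include <stdio.h>
#include <stdlib.h>

struct pte_page { struct pte_page *next; };

static struct pte_page *active_list, *ready_list;

static void tlb_remove_pte_page(struct pte_page *p)
{
        p->next = active_list;          /* unmapped, but the TLB may still hold it */
        active_list = p;
}

static void tlb_flush_mmu(void)
{
        /* after the flush, formerly active pte pages are safe to reuse */
        while (active_list) {
                struct pte_page *p = active_list;
                active_list = p->next;
                p->next = ready_list;
                ready_list = p;
        }
}

static struct pte_page *pte_alloc_one(void)
{
        if (ready_list) {               /* pte_alloc_ready(): cheap reuse path */
                struct pte_page *p = ready_list;
                ready_list = p->next;
                return p;
        }
        return malloc(sizeof(struct pte_page)); /* pte_alloc_fresh() */
}

int main(void)
{
        struct pte_page *p = pte_alloc_one();
        tlb_remove_pte_page(p);         /* tear down a mapping */
        tlb_flush_mmu();                /* batched TLB flush */
        printf("reused: %s\n", pte_alloc_one() == p ? "yes" : "no");
        return 0;
}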
*/ extern unsigned long empty_zero_page[1024]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) +extern struct page *zero_page; +#define ZERO_PAGE(vaddr) (zero_page) +extern pgd_t swapper_pg_dir[1024]; +extern kmem_cache_t *pgd_cache; +extern kmem_cache_t *pmd_cache; +extern spinlock_t pgd_lock; +extern struct list_head pgd_list; + +void pmd_ctor(void *, kmem_cache_t *, unsigned long); +void pgd_ctor(void *, kmem_cache_t *, unsigned long); +void pgd_dtor(void *, kmem_cache_t *, unsigned long); +void pgtable_cache_init(void); +void paging_init(void); #endif /* !__ASSEMBLY__ */ @@ -38,24 +51,10 @@ extern unsigned long empty_zero_page[102 * implements both the traditional 2-level x86 page tables and the * newer 3-level PAE-mode page tables. */ -#ifndef __ASSEMBLY__ #ifdef CONFIG_X86_PAE # include - -/* - * Need to initialise the X86 PAE caches - */ -extern void pgtable_cache_init(void); - #else # include - -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - -#endif #endif #define PMD_SIZE (1UL << PMD_SHIFT) @@ -82,15 +81,15 @@ extern void pgtable_cache_init(void); * The vmalloc() routines leaves a hole of 4kB between each vmalloced * area for the same reason. ;) */ -#define VMALLOC_OFFSET (8*1024*1024) -#define VMALLOC_START (((unsigned long) high_memory + 2*VMALLOC_OFFSET-1) & \ - ~(VMALLOC_OFFSET-1)) #define VMALLOC_VMADDR(x) ((unsigned long)(x)) -#ifdef CONFIG_HIGHMEM -# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE) -#else -# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE) -#endif +#define VMALLOC_END (FIXADDR_START-2*MMUPAGE_SIZE) + +#define __VMALLOC_START (VMALLOC_END - VMALLOC_RESERVE - 2*MMUPAGE_SIZE) +#define VMALLOC_START \ + (high_memory \ + ? max(__VMALLOC_START, (unsigned long)high_memory) \ + : __VMALLOC_START \ + ) /* * The 4MB page is guessing.. 
Detailed in the infamous "Chapter H" @@ -183,7 +182,7 @@ extern unsigned long pg0[1024]; #define pmd_none(x) (!pmd_val(x)) #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) -#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) +#define pmd_bad(x) ((pmd_val(x) & (~MMUPAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) @@ -238,10 +237,10 @@ static inline pte_t pte_modify(pte_t pte #define page_pte(page) page_pte_prot(page, __pgprot(0)) #define pmd_page_kernel(pmd) \ -((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) +((unsigned long) __va(pmd_val(pmd) & MMUPAGE_MASK)) #ifndef CONFIG_DISCONTIGMEM -#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> MMUPAGE_SHIFT)) #endif /* !CONFIG_DISCONTIGMEM */ #define pmd_large(pmd) \ @@ -283,20 +282,32 @@ static inline pte_t pte_modify(pte_t pte * control the given virtual address */ #define pte_index(address) \ - (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + (((address) >> MMUPAGE_SHIFT) & (PTRS_PER_PTE - 1)) #define pte_offset_kernel(dir, address) \ ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address)) #if defined(CONFIG_HIGHPTE) #define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) + pte_index(address)) +( \ + (pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) \ + + (PTRS_PER_PTE*((pmd_val(*(dir))/MMUPAGE_SIZE)%PAGE_MMUCOUNT)\ + + pte_index(address)) \ +) #define pte_offset_map_nested(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE1) + pte_index(address)) +( \ + (pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE1) \ + + (PTRS_PER_PTE*((pmd_val(*(dir))/MMUPAGE_SIZE)%PAGE_MMUCOUNT)\ + + pte_index(address)) \ +) #define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) #define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) #else #define pte_offset_map(dir, address) \ - ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address)) +( \ + (pte_t *)page_address(pmd_page(*(dir))) \ + + (PTRS_PER_PTE*((pmd_val(*(dir))/MMUPAGE_SIZE)%PAGE_MMUCOUNT)\ + + pte_index(address)) \ +) #define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) #define pte_unmap(pte) do { } while (0) #define pte_unmap_nested(pte) do { } while (0) diff -prauN linux-2.5.70-bk10/include/asm-i386/rmap.h pgcl-2.5.70-bk10-1/include/asm-i386/rmap.h --- linux-2.5.70-bk10/include/asm-i386/rmap.h 2003-05-26 18:00:41.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-i386/rmap.h 2003-06-05 09:44:34.000000000 -0700 @@ -5,10 +5,17 @@ #include #ifdef CONFIG_HIGHPTE +/* + * The byte offset needs to be relative to PAGE_SIZE, the pfn will be + * implicitly truncated to a PAGE_SIZE boundary, the mapping will be + * returned rounded downward, and will need compensation by adding in + * the paddr's offset within the PAGE_SIZE-aligned region to the vaddr + * returned from kmap_atomic(). 
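The extra term in pte_offset_map() above selects which 4KB pte table inside the clustered pte page the pmd actually points at, before the usual pte_index() is applied. Worked through with assumed numbers (non-PAE, clustering factor 4), purely as an illustration:

#include <stdio.h>

#define MMUPAGE_SHIFT   12
#define MMUPAGE_SIZE    (1UL << MMUPAGE_SHIFT)
#define PAGE_MMUCOUNT   4UL
#define PTRS_PER_PTE    1024UL

int main(void)
{
        unsigned long pmd_val = 0x1c003067UL;   /* hypothetical pmd entry */
        unsigned long address = 0x0804a123UL;   /* user virtual address */

        /* which of the PAGE_MMUCOUNT sub-tables the pmd points into */
        unsigned long subtable = (pmd_val / MMUPAGE_SIZE) % PAGE_MMUCOUNT;
        /* index of the pte within that sub-table */
        unsigned long index = (address >> MMUPAGE_SHIFT) & (PTRS_PER_PTE - 1);

        printf("pte slot %lu in the pte page (sub-table %lu, pte_index %lu)\n",
                PTRS_PER_PTE * subtable + index, subtable, index);
        return 0;
}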
+ */ static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) { - unsigned long pfn = (unsigned long)(pte_paddr >> PAGE_SHIFT); - unsigned long off = ((unsigned long)pte_paddr) & ~PAGE_MASK; + unsigned long pfn = (unsigned long)(pte_paddr/MMUPAGE_SIZE); + unsigned long off = (unsigned long)pte_paddr & ~PAGE_MASK; return (pte_t *)((char *)kmap_atomic(pfn_to_page(pfn), KM_PTE2) + off); } diff -prauN linux-2.5.70-bk10/include/asm-i386/setup.h pgcl-2.5.70-bk10-1/include/asm-i386/setup.h --- linux-2.5.70-bk10/include/asm-i386/setup.h 2003-05-26 18:00:42.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-i386/setup.h 2003-06-05 09:44:34.000000000 -0700 @@ -6,15 +6,15 @@ #ifndef _i386_SETUP_H #define _i386_SETUP_H -#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) -#define PFN_DOWN(x) ((x) >> PAGE_SHIFT) -#define PFN_PHYS(x) ((x) << PAGE_SHIFT) +#define PFN_UP(x) (((x) + MMUPAGE_SIZE-1) >> MMUPAGE_SHIFT) +#define PFN_DOWN(x) ((x) >> MMUPAGE_SHIFT) +#define PFN_PHYS(x) ((x) << MMUPAGE_SHIFT) /* * Reserved space for vmalloc and iomap - defined in asm/page.h */ #define MAXMEM_PFN PFN_DOWN(MAXMEM) -#define MAX_NONPAE_PFN (1 << 20) +#define MAX_NONPAE_PFN (1 << (32 - MMUPAGE_SHIFT)) /* * This is set up by the setup-routine at boot-time diff -prauN linux-2.5.70-bk10/include/asm-i386/shmparam.h pgcl-2.5.70-bk10-1/include/asm-i386/shmparam.h --- linux-2.5.70-bk10/include/asm-i386/shmparam.h 2003-05-26 18:00:37.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-i386/shmparam.h 2003-06-05 09:44:34.000000000 -0700 @@ -1,6 +1,6 @@ #ifndef _ASMI386_SHMPARAM_H #define _ASMI386_SHMPARAM_H -#define SHMLBA PAGE_SIZE /* attach addr a multiple of this */ +#define SHMLBA MMUPAGE_SIZE /* attach addr a multiple of this */ #endif /* _ASMI386_SHMPARAM_H */ diff -prauN linux-2.5.70-bk10/include/asm-i386/thread_info.h pgcl-2.5.70-bk10-1/include/asm-i386/thread_info.h --- linux-2.5.70-bk10/include/asm-i386/thread_info.h 2003-06-05 05:43:58.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-i386/thread_info.h 2003-06-05 09:47:56.000000000 -0700 @@ -53,6 +53,7 @@ struct thread_info { #endif #define PREEMPT_ACTIVE 0x4000000 +#define THREAD_SIZE (2*MMUPAGE_SIZE) /* * macros/functions for gaining access to the thread information structure @@ -81,14 +82,13 @@ struct thread_info { static inline struct thread_info *current_thread_info(void) { struct thread_info *ti; - __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~8191UL)); + __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~(THREAD_SIZE - 1))); return ti; } /* thread information allocation */ -#define THREAD_SIZE (2*PAGE_SIZE) -#define alloc_thread_info(tsk) ((struct thread_info *) __get_free_pages(GFP_KERNEL,1)) -#define free_thread_info(ti) free_pages((unsigned long) (ti), 1) +#define alloc_thread_info(tsk) ((struct thread_info *)kmalloc(THREAD_SIZE, GFP_KERNEL)) +#define free_thread_info(ti) kfree(ti) #define get_thread_info(ti) get_task_struct((ti)->task) #define put_thread_info(ti) put_task_struct((ti)->task) @@ -96,7 +96,7 @@ static inline struct thread_info *curren /* how to get the thread information struct from ASM */ #define GET_THREAD_INFO(reg) \ - movl $-8192, reg; \ + movl $~(THREAD_SIZE-1), reg; \ andl %esp, reg #endif diff -prauN linux-2.5.70-bk10/include/asm-i386/tlb.h pgcl-2.5.70-bk10-1/include/asm-i386/tlb.h --- linux-2.5.70-bk10/include/asm-i386/tlb.h 2003-05-26 18:00:27.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-i386/tlb.h 2003-06-05 09:48:37.000000000 -0700 @@ -1,20 +1,201 @@ #ifndef _I386_TLB_H #define _I386_TLB_H +#include +#include +#include +#include 
+#include +#include +#include +#include + +#define __GFP_PTE (GFP_KERNEL|__GFP_REPEAT) +#ifdef CONFIG_HIGHMEM +#define GFP_PTE (__GFP_PTE|__GFP_HIGHMEM) +#else +#define GFP_PTE __GFP_PTE +#endif + +#define PG_PTE PG_arch_1 +#define NR_PTE 128 +#define NR_NONPTE 512 +#define MAX_ZONE_ID (MAX_NUMNODES * MAX_NR_ZONES) + +#define PagePTE(page) test_bit(PG_PTE, &(page)->flags) +#define SetPagePTE(page) set_bit(PG_PTE, &(page)->flags) +#define ClearPagePTE(page) clear_bit(PG_PTE, &(page)->flags) +#define PageZoneID(page) ((page)->flags >> ZONE_SHIFT) + /* * x86 doesn't need any special per-pte or * per-vma handling.. */ -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) +struct vm_area_struct; +struct mmu_gather { + struct mm_struct *mm; -/* - * .. because we flush the whole mm when it - * fills up. - */ -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) + /* number of active ptes needing a TLB flush before reuse */ + int nr_pte_active; -#include + /* whether some ptes were unmapped */ + unsigned int need_flush; -#endif + /* non-zero means full mm flush */ + unsigned int fullmm; + + /* number freed for RSS adjustment */ + unsigned long freed; + + /* number of ready ptes */ + int nr_pte_ready; + + struct list_head active_list[MAX_ZONE_ID], ready_list[MAX_ZONE_ID]; + int active_count[MAX_ZONE_ID], ready_count[MAX_ZONE_ID]; + + int nr_nonpte; + struct page *nonpte[NR_NONPTE]; +}; + +DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); + +void tlb_flush_ready(struct mmu_gather *tlb); +void tlb_init(void); + +static inline void tlb_start_vma(struct mmu_gather *tlb, + struct vm_area_struct *vma) +{ +} + +static inline void tlb_end_vma(struct mmu_gather *tlb, + struct vm_area_struct *vma) +{ +} + +static inline void tlb_inc_freed(struct mmu_gather *tlb) +{ + tlb->freed++; +} + +static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +{ + tlb->need_flush = 1; +} + +static inline void tlb_flush(struct mmu_gather *tlb) +{ + flush_tlb_mm(tlb->mm); +} + +static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, + unsigned int flush) +{ + struct mmu_gather *tlb = &per_cpu(mmu_gathers, get_cpu()); + tlb->mm = mm; + tlb->fullmm = flush; + put_cpu(); + return tlb; +} + +static inline void tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, + unsigned long end) +{ + int zone; + + if (!tlb->need_flush && tlb->nr_nonpte < NR_NONPTE) { + BUG_ON(tlb->nr_nonpte < 0); + BUG_ON(tlb->nr_pte_active < 0); + BUG_ON(tlb->nr_pte_ready < 0); + return; + } + + tlb->need_flush = 0; + tlb_flush(tlb); + BUG_ON(tlb->nr_nonpte < 0); + if (tlb->nr_nonpte) { + free_pages_and_swap_cache(tlb->nonpte, tlb->nr_nonpte); + tlb->nr_nonpte = 0; + } + + for (zone = 0; zone < MAX_ZONE_ID; ++zone) { + if (list_empty(&tlb->active_list[zone])) { + BUG_ON(tlb->active_count[zone]); + continue; + } + + list_splice_init(&tlb->active_list[zone], + &tlb->ready_list[zone]); + BUG_ON(tlb->active_count[zone] < 0); + BUG_ON(tlb->ready_count[zone] < 0); + tlb->ready_count[zone] += tlb->active_count[zone]; + tlb->active_count[zone] = 0; + } + tlb->nr_pte_ready += tlb->nr_pte_active; + tlb->nr_pte_active = 0; + if (tlb->nr_pte_ready >= NR_PTE) + tlb_flush_ready(tlb); +} + +static inline void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, + unsigned long end) +{ + if (tlb->mm->rss >= tlb->freed) + tlb->mm->rss -= tlb->freed; + else + tlb->mm->rss = 0; + tlb_flush_mmu(tlb, start, end); +} + +static inline void 
tlb_remove_nonpte_page(struct mmu_gather *tlb, + struct page *page) +{ + BUG_ON(tlb->nr_nonpte >= NR_NONPTE); + tlb->nonpte[tlb->nr_nonpte] = page; + tlb->nr_nonpte++; + if (tlb->nr_nonpte == NR_NONPTE) + tlb_flush_mmu(tlb, 0, 0); +} + +static inline void tlb_remove_pte_page(struct mmu_gather *tlb, + struct page *page) +{ + int zone; + + if (!atomic_dec_and_test(&page->count)) + return; + + zone = PageZoneID(page); + ClearPagePTE(page); + BUG_ON(tlb->nr_pte_active < 0); + BUG_ON(tlb->active_count[zone] < 0); + tlb->nr_pte_active++; + tlb->active_count[zone]++; + list_add(&page->list, &tlb->active_list[zone]); +} + +static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) +{ + tlb->need_flush = 1; + if (PagePTE(page)) + tlb_remove_pte_page(tlb, page); + else + tlb_remove_nonpte_page(tlb, page); +} + +static inline void pte_free_tlb(struct mmu_gather *tlb, struct page *page) +{ + tlb_remove_page(tlb, page); +} + +static inline void tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *pte, + unsigned long addr) +{ + tlb->need_flush = 1; +} + +static inline struct mm_struct *tlb_mm(struct mmu_gather *tlb) +{ + return tlb->mm; +} + +#endif /* _I386_TLB_H */ diff -prauN linux-2.5.70-bk10/include/asm-i386/tlbflush.h pgcl-2.5.70-bk10-1/include/asm-i386/tlbflush.h --- linux-2.5.70-bk10/include/asm-i386/tlbflush.h 2003-05-26 18:00:59.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-i386/tlbflush.h 2003-06-05 09:48:26.000000000 -0700 @@ -92,8 +92,17 @@ static inline void flush_tlb_mm(struct m static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) { + int k; if (vma->vm_mm == current->active_mm) - __flush_tlb_one(addr); + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + unsigned long vaddr = addr + k*MMUPAGE_SIZE; + if (vaddr < vma->vm_start) + continue; + else if (vaddr >= vma->vm_end) + break; + else + __flush_tlb_one(vaddr); + } } static inline void flush_tlb_range(struct vm_area_struct *vma, @@ -110,10 +119,10 @@ static inline void flush_tlb_range(struc #define local_flush_tlb() \ __flush_tlb() -extern void flush_tlb_all(void); -extern void flush_tlb_current_task(void); -extern void flush_tlb_mm(struct mm_struct *); -extern void flush_tlb_page(struct vm_area_struct *, unsigned long); +void flush_tlb_all(void); +void flush_tlb_current_task(void); +void flush_tlb_mm(struct mm_struct *); +void flush_tlb_page(struct vm_area_struct *, unsigned long); #define flush_tlb() flush_tlb_current_task() diff -prauN linux-2.5.70-bk10/include/asm-ia64/page.h pgcl-2.5.70-bk10-1/include/asm-ia64/page.h --- linux-2.5.70-bk10/include/asm-ia64/page.h 2003-05-26 18:00:24.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-ia64/page.h 2003-06-05 09:44:34.000000000 -0700 @@ -197,4 +197,6 @@ get_order (unsigned long size) (((current->thread.flags & IA64_THREAD_XSTACK) != 0) \ ? 
VM_EXEC : 0)) +#include + #endif /* _ASM_IA64_PAGE_H */ diff -prauN linux-2.5.70-bk10/include/asm-m68k/page.h pgcl-2.5.70-bk10-1/include/asm-m68k/page.h --- linux-2.5.70-bk10/include/asm-m68k/page.h 2003-05-26 18:01:03.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-m68k/page.h 2003-06-05 09:44:34.000000000 -0700 @@ -192,6 +192,8 @@ static inline void *__va(unsigned long x #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif /* _M68K_PAGE_H */ diff -prauN linux-2.5.70-bk10/include/asm-m68knommu/page.h pgcl-2.5.70-bk10-1/include/asm-m68knommu/page.h --- linux-2.5.70-bk10/include/asm-m68knommu/page.h 2003-05-26 18:00:39.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-m68knommu/page.h 2003-06-05 09:44:34.000000000 -0700 @@ -94,6 +94,8 @@ extern unsigned long memory_end; #define virt_addr_valid(kaddr) (((void *)(kaddr) >= (void *)PAGE_OFFSET) && \ ((void *)(kaddr) < (void *)memory_end)) +#include + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff -prauN linux-2.5.70-bk10/include/asm-mips/page.h pgcl-2.5.70-bk10-1/include/asm-mips/page.h --- linux-2.5.70-bk10/include/asm-mips/page.h 2003-05-26 18:00:38.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-mips/page.h 2003-06-05 09:44:34.000000000 -0700 @@ -85,6 +85,8 @@ extern __inline__ int get_order(unsigned #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* defined (__KERNEL__) */ #endif /* __ASM_PAGE_H */ diff -prauN linux-2.5.70-bk10/include/asm-mips64/page.h pgcl-2.5.70-bk10-1/include/asm-mips64/page.h --- linux-2.5.70-bk10/include/asm-mips64/page.h 2003-05-26 18:00:20.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-mips64/page.h 2003-06-05 09:44:34.000000000 -0700 @@ -80,6 +80,8 @@ typedef struct { unsigned long pgprot; } #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* defined (__KERNEL__) */ #endif /* _ASM_PAGE_H */ diff -prauN linux-2.5.70-bk10/include/asm-parisc/page.h pgcl-2.5.70-bk10-1/include/asm-parisc/page.h --- linux-2.5.70-bk10/include/asm-parisc/page.h 2003-05-26 18:00:25.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-parisc/page.h 2003-06-05 09:44:34.000000000 -0700 @@ -112,6 +112,8 @@ extern int npmem_ranges; #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif /* _PARISC_PAGE_H */ diff -prauN linux-2.5.70-bk10/include/asm-ppc/page.h pgcl-2.5.70-bk10-1/include/asm-ppc/page.h --- linux-2.5.70-bk10/include/asm-ppc/page.h 2003-05-26 18:00:27.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-ppc/page.h 2003-06-05 09:44:34.000000000 -0700 @@ -142,5 +142,7 @@ extern __inline__ int get_order(unsigned #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif /* _PPC_PAGE_H */ diff -prauN linux-2.5.70-bk10/include/asm-ppc64/page.h pgcl-2.5.70-bk10-1/include/asm-ppc64/page.h --- linux-2.5.70-bk10/include/asm-ppc64/page.h 2003-05-26 18:00:20.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-ppc64/page.h 2003-06-05 09:44:34.000000000 -0700 @@ -204,5 +204,7 @@ static inline int get_order(unsigned lon #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif /* _PPC64_PAGE_H */ diff -prauN 
linux-2.5.70-bk10/include/asm-s390/page.h pgcl-2.5.70-bk10-1/include/asm-s390/page.h --- linux-2.5.70-bk10/include/asm-s390/page.h 2003-05-26 18:00:24.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-s390/page.h 2003-06-05 09:44:34.000000000 -0700 @@ -181,6 +181,8 @@ typedef struct { unsigned long pgd; } pg #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif /* _S390_PAGE_H */ diff -prauN linux-2.5.70-bk10/include/asm-sh/page.h pgcl-2.5.70-bk10-1/include/asm-sh/page.h --- linux-2.5.70-bk10/include/asm-sh/page.h 2003-05-26 18:00:57.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-sh/page.h 2003-06-05 09:44:34.000000000 -0700 @@ -106,6 +106,8 @@ static __inline__ int get_order(unsigned #endif +#include + #endif /* __KERNEL__ */ #endif /* __ASM_SH_PAGE_H */ diff -prauN linux-2.5.70-bk10/include/asm-sparc/page.h pgcl-2.5.70-bk10-1/include/asm-sparc/page.h --- linux-2.5.70-bk10/include/asm-sparc/page.h 2003-05-26 18:00:27.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-sparc/page.h 2003-06-05 09:44:34.000000000 -0700 @@ -171,6 +171,8 @@ extern __inline__ int get_order(unsigned #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif /* _SPARC_PAGE_H */ diff -prauN linux-2.5.70-bk10/include/asm-sparc64/page.h pgcl-2.5.70-bk10-1/include/asm-sparc64/page.h --- linux-2.5.70-bk10/include/asm-sparc64/page.h 2003-05-26 18:00:39.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-sparc64/page.h 2003-06-05 09:44:34.000000000 -0700 @@ -168,6 +168,8 @@ static __inline__ int get_order(unsigned #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* !(__KERNEL__) */ #endif /* !(_SPARC64_PAGE_H) */ diff -prauN linux-2.5.70-bk10/include/asm-v850/page.h pgcl-2.5.70-bk10-1/include/asm-v850/page.h --- linux-2.5.70-bk10/include/asm-v850/page.h 2003-05-26 18:00:41.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-v850/page.h 2003-06-05 09:44:34.000000000 -0700 @@ -140,6 +140,7 @@ extern __inline__ int get_order (unsigne #define __pa(x) __virt_to_phys ((unsigned long)(x)) #define __va(x) ((void *)__phys_to_virt ((unsigned long)(x))) +#include #endif /* KERNEL */ diff -prauN linux-2.5.70-bk10/include/asm-x86_64/page.h pgcl-2.5.70-bk10-1/include/asm-x86_64/page.h --- linux-2.5.70-bk10/include/asm-x86_64/page.h 2003-05-26 18:00:21.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/asm-x86_64/page.h 2003-06-05 09:44:34.000000000 -0700 @@ -136,6 +136,8 @@ extern __inline__ int get_order(unsigned (test_thread_flag(TIF_IA32) ? 
vm_stack_flags32 : vm_stack_flags) +#include + #endif /* __KERNEL__ */ #endif /* _X86_64_PAGE_H */ diff -prauN linux-2.5.70-bk10/include/linux/aio.h pgcl-2.5.70-bk10-1/include/linux/aio.h --- linux-2.5.70-bk10/include/linux/aio.h 2003-06-05 05:44:00.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/linux/aio.h 2003-06-05 09:44:34.000000000 -0700 @@ -104,13 +104,13 @@ struct aio_ring_info { unsigned long mmap_base; unsigned long mmap_size; - struct page **ring_pages; + unsigned long *ring_pages; spinlock_t ring_lock; long nr_pages; unsigned nr, tail; - struct page *internal_pages[AIO_RING_PAGES]; + unsigned long internal_pages[AIO_RING_PAGES]; /* pfn's */ }; struct kioctx { diff -prauN linux-2.5.70-bk10/include/linux/binfmts.h pgcl-2.5.70-bk10-1/include/linux/binfmts.h --- linux-2.5.70-bk10/include/linux/binfmts.h 2003-05-26 18:00:40.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/linux/binfmts.h 2003-06-05 09:44:34.000000000 -0700 @@ -2,6 +2,7 @@ #define _LINUX_BINFMTS_H #include +#include /* for PAGE_MMUCOUNT */ struct pt_regs; @@ -9,8 +10,15 @@ struct pt_regs; * MAX_ARG_PAGES defines the number of pages allocated for arguments * and envelope for the new program. 32 should suffice, this gives * a maximum env+arg of 128kB w/4KB pages! + * Now that PAGE_SIZE is a software construct and varies wildly, + * MAX_ARG_PAGES should represent a constant size of 128KB. When + * PAGE_SIZE exceeds that, we're in trouble. */ -#define MAX_ARG_PAGES 32 +#if PAGE_MMUCOUNT <= 32 +#define MAX_ARG_PAGES (32/PAGE_MMUCOUNT) +#else +#error PAGE_SIZE too large to enforce MAX_ARG_PAGES! +#endif /* sizeof(linux_binprm->buf) */ #define BINPRM_BUF_SIZE 128 diff -prauN linux-2.5.70-bk10/include/linux/bio.h pgcl-2.5.70-bk10-1/include/linux/bio.h --- linux-2.5.70-bk10/include/linux/bio.h 2003-06-05 05:44:00.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/linux/bio.h 2003-06-05 09:44:34.000000000 -0700 @@ -257,26 +257,20 @@ extern void bio_check_pages_dirty(struct */ extern inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) { - unsigned long addr; + char *addr; /* * might not be a highmem page, but the preempt/irq count * balancing is a lot nicer this way */ local_irq_save(*flags); - addr = (unsigned long) kmap_atomic(bvec->bv_page, KM_BIO_SRC_IRQ); - - if (addr & ~PAGE_MASK) - BUG(); - - return (char *) addr + bvec->bv_offset; + addr = (char *)kmap_atomic(bvec->bv_page, KM_BIO_SRC_IRQ); + return addr + bvec->bv_offset; } extern inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) { - unsigned long ptr = (unsigned long) buffer & PAGE_MASK; - - kunmap_atomic((void *) ptr, KM_BIO_SRC_IRQ); + kunmap_atomic(buffer, KM_BIO_SRC_IRQ); local_irq_restore(*flags); } diff -prauN linux-2.5.70-bk10/include/linux/gfp.h pgcl-2.5.70-bk10-1/include/linux/gfp.h --- linux-2.5.70-bk10/include/linux/gfp.h 2003-05-26 18:00:26.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/linux/gfp.h 2003-06-05 09:48:26.000000000 -0700 @@ -74,8 +74,9 @@ static inline struct page * alloc_pages_ #define alloc_page(gfp_mask) \ alloc_pages_node(numa_node_id(), gfp_mask, 0) -extern unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order)); -extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask)); +unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order)); +unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask)); +int free_pages_bulk(struct zone *zone, int count, struct list_head *list, unsigned int order); #define __get_free_page(gfp_mask) \ 
__get_free_pages((gfp_mask),0) diff -prauN linux-2.5.70-bk10/include/linux/highmem.h pgcl-2.5.70-bk10-1/include/linux/highmem.h --- linux-2.5.70-bk10/include/linux/highmem.h 2003-05-26 18:00:45.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/linux/highmem.h 2003-06-05 09:44:34.000000000 -0700 @@ -81,6 +81,17 @@ static inline void copy_user_highpage(st kunmap_atomic(vto, KM_USER1); } +static inline void copy_user_mmupages(struct page *dst, struct page *src, int offset, int size) +{ + char *vfrom, *vto; + + vfrom = kmap_atomic(src, KM_USER0); + vto = kmap_atomic(dst, KM_USER1); + memcpy(&vto[offset], &vfrom[offset], size); + kunmap_atomic(src, KM_USER0); + kunmap_atomic(dst, KM_USER1); +} + static inline void copy_highpage(struct page *to, struct page *from) { char *vfrom, *vto; diff -prauN linux-2.5.70-bk10/include/linux/ide.h pgcl-2.5.70-bk10-1/include/linux/ide.h --- linux-2.5.70-bk10/include/linux/ide.h 2003-06-05 05:44:00.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/linux/ide.h 2003-06-05 09:44:34.000000000 -0700 @@ -225,7 +225,7 @@ typedef unsigned char byte; /* used ever * allowing each to have about 256 entries (8 bytes each) from this. */ #define PRD_BYTES 8 -#define PRD_ENTRIES (PAGE_SIZE / (2 * PRD_BYTES)) +#define PRD_ENTRIES (MMUPAGE_SIZE / (2 * PRD_BYTES)) /* * Some more useful definitions diff -prauN linux-2.5.70-bk10/include/linux/mm.h pgcl-2.5.70-bk10-1/include/linux/mm.h --- linux-2.5.70-bk10/include/linux/mm.h 2003-05-26 18:00:21.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/linux/mm.h 2003-06-05 09:44:34.000000000 -0700 @@ -73,7 +73,7 @@ struct vm_area_struct { struct vm_operations_struct * vm_ops; /* Information about our backing store: */ - unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE + unsigned long vm_pgoff; /* Offset (within vm_file) in MMUPAGE_SIZE units, *not* PAGE_CACHE_SIZE */ struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ @@ -172,7 +172,8 @@ struct page { atomic_t count; /* Usage count, see below. */ struct list_head list; /* ->mapping has some page lists. */ struct address_space *mapping; /* The inode (or ...) we belong to. */ - unsigned long index; /* Our offset within mapping. */ + unsigned long index; /* Our offset within mapping. + * in PAGE_CACHE_SIZE units. */ struct list_head lru; /* Pageout list, eg. active_list; protected by zone->lru_lock !! 
*/ union { @@ -339,10 +340,19 @@ static inline void set_page_zone(struct page->flags |= zone_num << ZONE_SHIFT; } -static inline void * lowmem_page_address(struct page *page) -{ - return __va( ( (page - page_zone(page)->zone_mem_map) + page_zone(page)->zone_start_pfn) << PAGE_SHIFT); -} + +#if 1 +#define lowmem_page_address(page) __va(page_to_pfn(page)*MMUPAGE_SIZE) +#else + #define lowmem_page_address(page) \ +({ \ + extern unsigned long max_low_pfn; \ + const unsigned long __lpa_pfn = page_to_pfn(page); \ + BUG_ON(max_low_pfn && __lpa_pfn > max_low_pfn); \ + BUG_ON(__lpa_pfn >= (~PAGE_OFFSET+1)/MMUPAGE_SIZE); \ + __va(__lpa_pfn*MMUPAGE_SIZE); \ +}) +#endif #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) #define HASHED_PAGE_VIRTUAL @@ -425,16 +435,16 @@ extern int vmtruncate(struct inode * ino extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); -extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot); +extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot, int subpfn); extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); extern long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long nonblock); -void put_dirty_page(struct task_struct *tsk, struct page *page, - unsigned long address, pgprot_t prot); +void put_dirty_page(struct task_struct *task, struct page *page, + int min_subpfn, unsigned long address, pgprot_t prot); int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, - int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); + int len, int write, int force, unsigned long *pages, struct vm_area_struct **vmas); int __set_page_dirty_buffers(struct page *page); int __set_page_dirty_nobuffers(struct page *page); @@ -519,10 +529,10 @@ static inline unsigned long do_mmap(stru unsigned long flag, unsigned long offset) { unsigned long ret = -EINVAL; - if ((offset + PAGE_ALIGN(len)) < offset) + if ((offset + MMUPAGE_ALIGN(len)) < offset) goto out; - if (!(offset & ~PAGE_MASK)) - ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); + if (!(offset & ~MMUPAGE_MASK)) + ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> MMUPAGE_SHIFT); out: return ret; } @@ -604,8 +614,18 @@ extern struct vm_area_struct *find_exten extern unsigned int nr_used_zone_pages(void); +/* + * Return byte offset from start of page containing virtual address in + * vma, to start of mmupage containing it: 0 if PAGE_MMUSHIFT 0. 
+ */ +static inline unsigned long vma_suboffset(struct vm_area_struct *vma, unsigned long address) +{ + return (address - vma->vm_start + MMUPAGE_SIZE * vma->vm_pgoff) + & (MMUPAGE_MASK - PAGE_MASK); +} + extern struct page * vmalloc_to_page(void *addr); -extern struct page * follow_page(struct mm_struct *mm, unsigned long address, +unsigned long follow_page(struct mm_struct *mm, unsigned long address, int write); extern int remap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long to, unsigned long size, pgprot_t prot); diff -prauN linux-2.5.70-bk10/include/linux/mmzone.h pgcl-2.5.70-bk10-1/include/linux/mmzone.h --- linux-2.5.70-bk10/include/linux/mmzone.h 2003-05-26 18:00:41.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/linux/mmzone.h 2003-06-05 09:44:34.000000000 -0700 @@ -20,7 +20,7 @@ /* Free memory management - zoned buddy allocator. */ #ifndef CONFIG_FORCE_MAX_ZONEORDER -#define MAX_ORDER 11 +#define MAX_ORDER (11 - PAGE_MMUSHIFT) #else #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER #endif diff -prauN linux-2.5.70-bk10/include/linux/pagemap.h pgcl-2.5.70-bk10-1/include/linux/pagemap.h --- linux-2.5.70-bk10/include/linux/pagemap.h 2003-05-26 18:00:20.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/linux/pagemap.h 2003-06-05 09:44:34.000000000 -0700 @@ -23,6 +23,9 @@ #define PAGE_CACHE_MASK PAGE_MASK #define PAGE_CACHE_ALIGN(addr) (((addr)+PAGE_CACHE_SIZE-1)&PAGE_CACHE_MASK) +#define PAGE_CACHE_MMUSHIFT (PAGE_CACHE_SHIFT - MMUPAGE_SHIFT) +#define PAGE_CACHE_MMUCOUNT (PAGE_CACHE_SIZE/MMUPAGE_SIZE) + #define page_cache_get(page) get_page(page) #define page_cache_release(page) put_page(page) void release_pages(struct page **pages, int nr, int cold); @@ -174,40 +177,27 @@ extern void end_page_writeback(struct pa */ static inline int fault_in_pages_writeable(char __user *uaddr, int size) { - int ret; + int ret = 0; + unsigned long addr, end = (unsigned long)uaddr + size - 1; /* * Writing zeroes into userspace here is OK, because we know that if * the zero gets there, we'll be overwriting it. */ - ret = __put_user(0, uaddr); - if (ret == 0) { - char __user *end = uaddr + size - 1; - - /* - * If the page was already mapped, this will get a cache miss - * for sure, so try to avoid doing it. 
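The vma_suboffset() helper above yields the byte offset of the faulting mmupage within its enclosing kernel page, taking the vma's mmupage-granular vm_pgoff into account. A standalone restatement with assumed values (clustering factor 4); only an illustration:

#include <stdio.h>

#define MMUPAGE_SHIFT   12
#define MMUPAGE_SIZE    (1UL << MMUPAGE_SHIFT)
#define MMUPAGE_MASK    (~(MMUPAGE_SIZE - 1))
#define PAGE_SHIFT      (MMUPAGE_SHIFT + 2)     /* PAGE_MMUSHIFT == 2 */
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define PAGE_MASK       (~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long vm_start = 0x08048000UL;  /* hypothetical mapping */
        unsigned long vm_pgoff = 3;             /* in MMUPAGE_SIZE units */
        unsigned long address = 0x0804b6cdUL;

        unsigned long sub = (address - vm_start + MMUPAGE_SIZE * vm_pgoff)
                                & (MMUPAGE_MASK - PAGE_MASK);

        /* result is one of 0x0, 0x1000, 0x2000, 0x3000 */
        printf("suboffset = %#lx\n", sub);
        return 0;
}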
- */ - if (((unsigned long)uaddr & PAGE_MASK) != - ((unsigned long)end & PAGE_MASK)) - ret = __put_user(0, end); - } + for (addr = (unsigned long)uaddr; addr <= MMUPAGE_ALIGN(end); addr += MMUPAGE_SIZE) + if ((ret = __put_user(0, (char *)min(addr, end))) != 0) + break; + return ret; } static inline void fault_in_pages_readable(const char __user *uaddr, int size) { volatile char c; - int ret; + unsigned long addr, end = (unsigned long)uaddr + size - 1; - ret = __get_user(c, (char *)uaddr); - if (ret == 0) { - const char __user *end = uaddr + size - 1; - - if (((unsigned long)uaddr & PAGE_MASK) != - ((unsigned long)end & PAGE_MASK)) - __get_user(c, (char *)end); - } + for (addr = (unsigned long)uaddr; addr <= MMUPAGE_ALIGN(end); addr += MMUPAGE_SIZE) + __get_user(c, (char *)min(addr, end)); } #endif /* _LINUX_PAGEMAP_H */ diff -prauN linux-2.5.70-bk10/include/linux/sched.h pgcl-2.5.70-bk10-1/include/linux/sched.h --- linux-2.5.70-bk10/include/linux/sched.h 2003-06-05 05:44:00.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/linux/sched.h 2003-06-05 09:44:34.000000000 -0700 @@ -195,7 +195,7 @@ struct mm_struct { unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; - unsigned long rss, total_vm, locked_vm; + unsigned long rss, total_vm, locked_vm; /* in MMUPAGE_SIZE units */ unsigned long def_flags; unsigned long cpu_vm_mask; unsigned long swap_address; @@ -618,12 +618,7 @@ static inline int capable(int cap) extern struct mm_struct * mm_alloc(void); /* mmdrop drops the mm and the page tables */ -extern inline void FASTCALL(__mmdrop(struct mm_struct *)); -static inline void mmdrop(struct mm_struct * mm) -{ - if (atomic_dec_and_test(&mm->mm_count)) - __mmdrop(mm); -} +void mmdrop(struct mm_struct * mm); /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); diff -prauN linux-2.5.70-bk10/include/linux/shm.h pgcl-2.5.70-bk10-1/include/linux/shm.h --- linux-2.5.70-bk10/include/linux/shm.h 2003-05-26 18:00:45.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/linux/shm.h 2003-06-05 09:44:34.000000000 -0700 @@ -12,7 +12,7 @@ #define SHMMAX 0x2000000 /* max shared seg size (bytes) */ #define SHMMIN 1 /* min shared seg size (bytes) */ #define SHMMNI 4096 /* max num of segs system wide */ -#define SHMALL (SHMMAX/PAGE_SIZE*(SHMMNI/16)) /* max shm system wide (pages) */ +#define SHMALL (SHMMAX/MMUPAGE_SIZE*(SHMMNI/16)) /* max shm system wide (mmupages) */ #define SHMSEG SHMMNI /* max shared segs per process */ #include diff -prauN linux-2.5.70-bk10/include/linux/sunrpc/svc.h pgcl-2.5.70-bk10-1/include/linux/sunrpc/svc.h --- linux-2.5.70-bk10/include/linux/sunrpc/svc.h 2003-05-26 18:00:40.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/linux/sunrpc/svc.h 2003-06-05 09:44:34.000000000 -0700 @@ -73,7 +73,8 @@ struct svc_serv { * This assumes that the non-page part of an rpc reply will fit * in a page - NFSd ensures this. lockd also has no trouble. 
*/ -#define RPCSVC_MAXPAGES ((RPCSVC_MAXPAYLOAD+PAGE_SIZE-1)/PAGE_SIZE + 1) + +#define RPCSVC_MAXPAGES (2+((RPCSVC_MAXPAYLOAD+PAGE_SIZE-1)/PAGE_SIZE+1)) static inline u32 svc_getu32(struct iovec *iov) { diff -prauN linux-2.5.70-bk10/include/linux/swap.h pgcl-2.5.70-bk10-1/include/linux/swap.h --- linux-2.5.70-bk10/include/linux/swap.h 2003-05-26 18:00:21.000000000 -0700 +++ pgcl-2.5.70-bk10-1/include/linux/swap.h 2003-06-05 09:44:34.000000000 -0700 @@ -45,7 +45,7 @@ static inline int current_is_kswapd(void */ union swap_header { struct { - char reserved[PAGE_SIZE - 10]; + char reserved[MMUPAGE_SIZE - 10]; char magic[10]; /* SWAP-SPACE or SWAPSPACE2 */ } magic; struct { @@ -112,8 +112,8 @@ enum { #define SWAP_CLUSTER_MAX 32 -#define SWAP_MAP_MAX 0x7fff -#define SWAP_MAP_BAD 0x8000 +#define SWAP_MAP_MAX 0xfffe +#define SWAP_MAP_BAD 0xffff /* * The in-memory structure used to track swap areas. diff -prauN linux-2.5.70-bk10/init/main.c pgcl-2.5.70-bk10-1/init/main.c --- linux-2.5.70-bk10/init/main.c 2003-05-26 18:00:25.000000000 -0700 +++ pgcl-2.5.70-bk10-1/init/main.c 2003-06-05 09:44:34.000000000 -0700 @@ -372,6 +372,7 @@ static void rest_init(void) cpu_idle(); } + /* * Activate the first processor. */ @@ -421,9 +422,9 @@ asmlinkage void __init start_kernel(void calibrate_delay(); #ifdef CONFIG_BLK_DEV_INITRD if (initrd_start && !initrd_below_start_ok && - initrd_start < min_low_pfn << PAGE_SHIFT) { + initrd_start < min_low_pfn << MMUPAGE_SHIFT) { printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - " - "disabling it.\n",initrd_start,min_low_pfn << PAGE_SHIFT); + "disabling it.\n",initrd_start,min_low_pfn << MMUPAGE_SHIFT); initrd_start = 0; } #endif diff -prauN linux-2.5.70-bk10/ipc/shm.c pgcl-2.5.70-bk10-1/ipc/shm.c --- linux-2.5.70-bk10/ipc/shm.c 2003-05-26 18:00:40.000000000 -0700 +++ pgcl-2.5.70-bk10-1/ipc/shm.c 2003-06-05 09:44:34.000000000 -0700 @@ -110,7 +110,7 @@ static void shm_open (struct vm_area_str */ static void shm_destroy (struct shmid_kernel *shp) { - shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; + shm_tot -= (shp->shm_segsz + MMUPAGE_SIZE - 1) >> MMUPAGE_SHIFT; shm_rmid (shp->id); shm_unlock(shp); if (!is_file_hugepages(shp->shm_file)) @@ -169,7 +169,7 @@ static int newseg (key_t key, int shmflg { int error; struct shmid_kernel *shp; - int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT; + int numpages = (size + MMUPAGE_SIZE -1) >> MMUPAGE_SHIFT; struct file * file; char name[13]; int id; @@ -717,7 +717,7 @@ long sys_shmat(int shmid, char __user *s * space left for the stack to grow (at least 4 pages). */ if (addr < current->mm->start_stack && - addr > current->mm->start_stack - size - PAGE_SIZE * 5) + addr > current->mm->start_stack - size - MMUPAGE_SIZE * 5) goto invalid; } @@ -775,7 +775,7 @@ asmlinkage long sys_shmdt(char __user *s * otherwise it starts at this address with no hassles. 
*/ if ((vma->vm_ops == &shm_vm_ops || is_vm_hugetlb_page(vma)) && - (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) { + (vma->vm_start - addr)/MMUPAGE_SIZE == vma->vm_pgoff) { size = vma->vm_file->f_dentry->d_inode->i_size; @@ -803,7 +803,7 @@ asmlinkage long sys_shmdt(char __user *s /* finding a matching vma now does not alter retval */ if ((vma->vm_ops == &shm_vm_ops || is_vm_hugetlb_page(vma)) && - (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) + (vma->vm_start - addr)/MMUPAGE_SIZE == vma->vm_pgoff) do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); vma = next; diff -prauN linux-2.5.70-bk10/kernel/fork.c pgcl-2.5.70-bk10-1/kernel/fork.c --- linux-2.5.70-bk10/kernel/fork.c 2003-06-05 05:44:00.000000000 -0700 +++ pgcl-2.5.70-bk10-1/kernel/fork.c 2003-06-05 09:44:34.000000000 -0700 @@ -205,7 +205,7 @@ void __init fork_init(unsigned long memp * value: the thread structures can take up at most half * of memory. */ - max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8; + max_threads = mempages / 8; /* * we need to allow at least 20 threads to boot a system */ @@ -285,7 +285,7 @@ static inline int dup_mmap(struct mm_str if(mpnt->vm_flags & VM_DONTCOPY) continue; if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; + unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> MMUPAGE_SHIFT; if (!vm_enough_memory(len)) goto fail_nomem; charge += len; @@ -406,8 +406,11 @@ struct mm_struct * mm_alloc(void) * is dropped: either by a lazy thread or by * mmput. Free the page directory and the mm. */ -inline void __mmdrop(struct mm_struct *mm) +void mmdrop(struct mm_struct *mm) { + if (!atomic_dec_and_test(&mm->mm_count)) + return; + BUG_ON(mm == &init_mm); mm_free_pgd(mm); destroy_context(mm); diff -prauN linux-2.5.70-bk10/kernel/futex.c pgcl-2.5.70-bk10-1/kernel/futex.c --- linux-2.5.70-bk10/kernel/futex.c 2003-05-26 18:00:26.000000000 -0700 +++ pgcl-2.5.70-bk10-1/kernel/futex.c 2003-06-05 09:44:34.000000000 -0700 @@ -108,15 +108,15 @@ static inline struct page *__pin_page_at static struct page *__pin_page(unsigned long addr) { struct mm_struct *mm = current->mm; - struct page *page, *tmp; + unsigned long pfn, tmp; int err; /* * Do a quick atomic lookup first - this is the fastpath. */ - page = follow_page(mm, addr, 0); - if (likely(page != NULL)) - return __pin_page_atomic(page); + pfn = follow_page(mm, addr, 0); + if (likely(pfn_valid(pfn))) + return __pin_page_atomic(pfn_to_page(pfn)); /* * No luck - need to fault in the page: @@ -126,7 +126,7 @@ repeat_lookup: unlock_futex_mm(); down_read(&mm->mmap_sem); - err = get_user_pages(current, mm, addr, 1, 0, 0, &page, NULL); + err = get_user_pages(current, mm, addr, 1, 0, 0, &pfn, NULL); up_read(&mm->mmap_sem); lock_futex_mm(); @@ -138,12 +138,18 @@ repeat_lookup: * check for races: */ tmp = follow_page(mm, addr, 0); - if (tmp != page) { + + /* + * XXX: this is weird, it can refer to a different pfn in the + * same page. Counts as a race in my book. + */ + if (tmp != pfn) { + struct page *page = pfn_to_page(pfn); put_page(page); goto repeat_lookup; } - return page; + return pfn ? pfn_to_page(pfn) : NULL; } /* @@ -164,6 +170,11 @@ static inline int futex_wake(unsigned lo return -EFAULT; } + /* + * XXX: I broke this. + * This needs to include a suboffset w/in the struct page's area. 
+ * -- wli + */ head = hash_futex(page, offset); list_for_each_safe(i, next, head) { @@ -485,7 +496,7 @@ long do_futex(unsigned long uaddr, int o unsigned long pos_in_page; int ret; - pos_in_page = uaddr % PAGE_SIZE; + pos_in_page = uaddr % MMUPAGE_SIZE; /* Must be "naturally" aligned */ if (pos_in_page % sizeof(u32)) diff -prauN linux-2.5.70-bk10/kernel/ksyms.c pgcl-2.5.70-bk10-1/kernel/ksyms.c --- linux-2.5.70-bk10/kernel/ksyms.c 2003-05-26 18:00:20.000000000 -0700 +++ pgcl-2.5.70-bk10-1/kernel/ksyms.c 2003-06-05 09:44:34.000000000 -0700 @@ -127,7 +127,6 @@ EXPORT_SYMBOL(kmap_high); EXPORT_SYMBOL(kunmap_high); EXPORT_SYMBOL(highmem_start_page); EXPORT_SYMBOL(kmap_prot); -EXPORT_SYMBOL(kmap_pte); #endif #ifdef HASHED_PAGE_VIRTUAL EXPORT_SYMBOL(page_address); diff -prauN linux-2.5.70-bk10/kernel/ptrace.c pgcl-2.5.70-bk10-1/kernel/ptrace.c --- linux-2.5.70-bk10/kernel/ptrace.c 2003-05-26 18:01:03.000000000 -0700 +++ pgcl-2.5.70-bk10-1/kernel/ptrace.c 2003-06-05 09:44:34.000000000 -0700 @@ -155,27 +155,32 @@ int access_process_vm(struct task_struct struct mm_struct *mm; struct vm_area_struct *vma; struct page *page; + unsigned long pfn = 0; void *old_buf = buf; mm = get_task_mm(tsk); - if (!mm) + if (!mm) { + printk("get_task_mm() failed in access_process_vm()\n"); return 0; + } down_read(&mm->mmap_sem); /* ignore errors, just check how much was sucessfully transfered */ while (len) { int bytes, ret, offset; + unsigned long dst_off; void *maddr; - ret = get_user_pages(current, mm, addr, 1, - write, 1, &page, &vma); - if (ret <= 0) + ret = get_user_pages(current, mm, addr, 1, write, 1, &pfn, &vma); + if (ret <= 0) { + printk("get_user_pages() failed in access_process_vm()\n"); break; + } bytes = len; - offset = addr & (PAGE_SIZE-1); - if (bytes > PAGE_SIZE-offset) - bytes = PAGE_SIZE-offset; + offset = addr & ~MMUPAGE_MASK; + if (bytes > MMUPAGE_SIZE-offset) + bytes = MMUPAGE_SIZE-offset; flush_cache_page(vma, addr); @@ -185,12 +190,14 @@ int access_process_vm(struct task_struct * to handle this correctly.
*/ + page = pfn_to_page(pfn); maddr = kmap(page); + dst_off = (pfn % PAGE_MMUCOUNT)*MMUPAGE_SIZE; if (write) { - memcpy(maddr + offset, buf, bytes); + memcpy(maddr + offset + dst_off, buf, bytes); flush_icache_user_range(vma, page, addr, bytes); } else { - memcpy(buf, maddr + offset, bytes); + memcpy(buf, maddr + offset + dst_off, bytes); } kunmap(page); page_cache_release(page); diff -prauN linux-2.5.70-bk10/mm/bootmem.c pgcl-2.5.70-bk10-1/mm/bootmem.c --- linux-2.5.70-bk10/mm/bootmem.c 2003-05-26 18:00:27.000000000 -0700 +++ pgcl-2.5.70-bk10-1/mm/bootmem.c 2003-06-05 09:44:34.000000000 -0700 @@ -33,10 +33,7 @@ unsigned long __init bootmem_bootmap_pag unsigned long mapsize; mapsize = (pages+7)/8; - mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; - mapsize >>= PAGE_SHIFT; - - return mapsize; + return (mapsize + MMUPAGE_SIZE - 1) >> MMUPAGE_SHIFT; } /* @@ -46,14 +43,17 @@ static unsigned long __init init_bootmem unsigned long mapstart, unsigned long start, unsigned long end) { bootmem_data_t *bdata = pgdat->bdata; - unsigned long mapsize = ((end - start)+7)/8; + unsigned long mapsize; pgdat->pgdat_next = pgdat_list; pgdat_list = pgdat; + /* round start down to simplify free_all_bootmem_core() */ + start &= ~(PAGE_MMUCOUNT - 1); + mapsize = ((end - start)+7)/8; mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL); - bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); - bdata->node_boot_start = (start << PAGE_SHIFT); + bdata->node_bootmem_map = phys_to_virt(mapstart << MMUPAGE_SHIFT); + bdata->node_boot_start = (start << MMUPAGE_SHIFT); bdata->node_low_pfn = end; /* @@ -77,10 +77,10 @@ static void __init reserve_bootmem_core( * round up, partially reserved pages are considered * fully reserved. */ - unsigned long sidx = (addr - bdata->node_boot_start)/PAGE_SIZE; + unsigned long sidx = (addr - bdata->node_boot_start)/MMUPAGE_SIZE; unsigned long eidx = (addr + size - bdata->node_boot_start + - PAGE_SIZE-1)/PAGE_SIZE; - unsigned long end = (addr + size + PAGE_SIZE-1)/PAGE_SIZE; + MMUPAGE_SIZE-1)/MMUPAGE_SIZE; + unsigned long end_pfn = (addr + size + MMUPAGE_SIZE-1)/MMUPAGE_SIZE; if (!size) BUG(); @@ -90,13 +90,11 @@ static void __init reserve_bootmem_core( BUG(); if (sidx >= eidx) BUG(); - if ((addr >> PAGE_SHIFT) >= bdata->node_low_pfn) - BUG(); - if (end > bdata->node_low_pfn) + if (end_pfn > bdata->node_low_pfn) BUG(); for (i = sidx; i < eidx; i++) if (test_and_set_bit(i, bdata->node_bootmem_map)) - printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE); + printk("hm, page %08lx reserved twice.\n", i*MMUPAGE_SIZE); } static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) @@ -108,11 +106,11 @@ static void __init free_bootmem_core(boo * considered reserved. */ unsigned long sidx; - unsigned long eidx = (addr + size - bdata->node_boot_start)/PAGE_SIZE; - unsigned long end = (addr + size)/PAGE_SIZE; + unsigned long eidx = (addr + size - bdata->node_boot_start)/MMUPAGE_SIZE; + unsigned long end_pfn = (addr + size)/MMUPAGE_SIZE; if (!size) BUG(); - if (end > bdata->node_low_pfn) + if (end_pfn > bdata->node_low_pfn) BUG(); if (addr < bdata->last_success) @@ -121,8 +119,8 @@ static void __init free_bootmem_core(boo /* * Round up the beginning of the address. 
*/ - start = (addr + PAGE_SIZE-1) / PAGE_SIZE; - sidx = start - (bdata->node_boot_start/PAGE_SIZE); + start = (addr + MMUPAGE_SIZE-1) / MMUPAGE_SIZE; + sidx = start - (bdata->node_boot_start/MMUPAGE_SIZE); for (i = sidx; i < eidx; i++) { if (!test_and_clear_bit(i, bdata->node_bootmem_map)) @@ -154,19 +152,19 @@ __alloc_bootmem_core(struct bootmem_data BUG_ON(!size); BUG_ON(align & (align-1)); - eidx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); + eidx = bdata->node_low_pfn - (bdata->node_boot_start >> MMUPAGE_SHIFT); offset = 0; if (align && (bdata->node_boot_start & (align - 1UL)) != 0) offset = (align - (bdata->node_boot_start & (align - 1UL))); - offset >>= PAGE_SHIFT; + offset >>= MMUPAGE_SHIFT; /* * We try to allocate bootmem pages above 'goal' * first, then we try to allocate lower pages. */ if (goal && (goal >= bdata->node_boot_start) && - ((goal >> PAGE_SHIFT) < bdata->node_low_pfn)) { + ((goal >> MMUPAGE_SHIFT) < bdata->node_low_pfn)) { preferred = goal - bdata->node_boot_start; if (bdata->last_success >= preferred) @@ -174,10 +172,10 @@ __alloc_bootmem_core(struct bootmem_data } else preferred = 0; - preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT; + preferred = ((preferred + align - 1) & ~(align - 1)) >> MMUPAGE_SHIFT; preferred += offset; - areasize = (size+PAGE_SIZE-1)/PAGE_SIZE; - incr = align >> PAGE_SHIFT ? : 1; + areasize = (size+MMUPAGE_SIZE-1)/MMUPAGE_SIZE; + incr = align >> MMUPAGE_SHIFT ? : 1; restart_scan: for (i = preferred; i < eidx; i += incr) { @@ -205,7 +203,7 @@ restart_scan: return NULL; found: - bdata->last_success = start << PAGE_SHIFT; + bdata->last_success = start << MMUPAGE_SHIFT; BUG_ON(start >= eidx); /* @@ -213,30 +211,30 @@ found: * of this allocation's buffer? If yes then we can 'merge' * the previous partial page with this allocation. 
*/ - if (align < PAGE_SIZE && + if (align < MMUPAGE_SIZE && bdata->last_offset && bdata->last_pos+1 == start) { offset = (bdata->last_offset+align-1) & ~(align-1); - BUG_ON(offset > PAGE_SIZE); - remaining_size = PAGE_SIZE-offset; + BUG_ON(offset > MMUPAGE_SIZE); + remaining_size = MMUPAGE_SIZE-offset; if (size < remaining_size) { areasize = 0; /* last_pos unchanged */ bdata->last_offset = offset+size; - ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + + ret = phys_to_virt(bdata->last_pos*MMUPAGE_SIZE + offset + bdata->node_boot_start); } else { remaining_size = size - remaining_size; - areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE; - ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + + areasize = (remaining_size+MMUPAGE_SIZE-1)/MMUPAGE_SIZE; + ret = phys_to_virt(bdata->last_pos*MMUPAGE_SIZE + offset + bdata->node_boot_start); bdata->last_pos = start+areasize-1; bdata->last_offset = remaining_size; } - bdata->last_offset &= ~PAGE_MASK; + bdata->last_offset &= ~MMUPAGE_MASK; } else { bdata->last_pos = start + areasize - 1; - bdata->last_offset = size & ~PAGE_MASK; - ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start); + bdata->last_offset = size & ~MMUPAGE_MASK; + ret = phys_to_virt(start * MMUPAGE_SIZE + bdata->node_boot_start); } /* @@ -253,49 +251,37 @@ static unsigned long __init free_all_boo { struct page *page = pgdat->node_mem_map; bootmem_data_t *bdata = pgdat->bdata; - unsigned long i, count, total = 0; - unsigned long idx; + unsigned long i, total = 0; + unsigned long idx, mapnr, node_low_mapnr; unsigned long *map; - if (!bdata->node_bootmem_map) BUG(); - - count = 0; - idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); + BUG_ON(!bdata->node_bootmem_map); map = bdata->node_bootmem_map; - for (i = 0; i < idx; ) { - unsigned long v = ~map[i / BITS_PER_LONG]; - if (v) { - unsigned long m; - for (m = 1; m && i < idx; m<<=1, page++, i++) { - if (v & m) { - count++; + i = 0; + idx = bdata->node_low_pfn - (bdata->node_boot_start >> MMUPAGE_SHIFT); + + node_low_mapnr = (bdata->node_low_pfn - bdata->node_boot_start/MMUPAGE_SIZE)/PAGE_MMUCOUNT; + for (mapnr = 0; mapnr < node_low_mapnr; ++mapnr) { + int k, should_free = 1; + for (k = 0; k < PAGE_MMUCOUNT; ++k) + if (test_bit(mapnr*PAGE_MMUCOUNT + k, map)) + should_free = 0; + if (should_free) { + page = &pgdat->node_mem_map[mapnr]; ClearPageReserved(page); set_page_count(page, 1); __free_page(page); + ++total; } } - } else { - i+=BITS_PER_LONG; - page += BITS_PER_LONG; - } - } - total += count; /* - * Now free the allocator bitmap itself, it's not - * needed anymore: - */ - page = virt_to_page(bdata->node_bootmem_map); - count = 0; - for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { - count++; - ClearPageReserved(page); - set_page_count(page, 1); - __free_page(page); } - total += count; - bdata->node_bootmem_map = NULL; + * Leak the allocator bitmap; it's not worth saving. 
+ */ + bdata->node_bootmem_map = NULL; + printk("bootmem: freed %lx pages in node %d\n", total, pgdat->node_id); return total; } diff -prauN linux-2.5.70-bk10/mm/filemap.c pgcl-2.5.70-bk10-1/mm/filemap.c --- linux-2.5.70-bk10/mm/filemap.c 2003-06-05 05:44:01.000000000 -0700 +++ pgcl-2.5.70-bk10-1/mm/filemap.c 2003-06-05 09:44:34.000000000 -0700 @@ -944,8 +944,8 @@ struct page * filemap_nopage(struct vm_a unsigned long size, pgoff, endoff; int did_readahead; - pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; - endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; + pgoff = (address - area->vm_start)/MMUPAGE_SIZE + area->vm_pgoff; + endoff = (area->vm_end - area->vm_start + MMUPAGE_SIZE - 1)/MMUPAGE_SIZE + area->vm_pgoff; retry_all: /* @@ -953,15 +953,15 @@ retry_all: * accessible.. */ size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if ((pgoff >= size) && (area->vm_mm == current->mm)) + if ((pgoff/PAGE_CACHE_MMUCOUNT >= size) && (area->vm_mm == current->mm)) return NULL; /* * The "size" of the file, as far as mmap is concerned, isn't bigger * than the mapping */ - if (size > endoff) - size = endoff; + if (size > endoff/PAGE_CACHE_MMUCOUNT) + size = endoff/PAGE_CACHE_MMUCOUNT; did_readahead = 0; @@ -971,23 +971,23 @@ retry_all: */ if (VM_SequentialReadHint(area)) { did_readahead = 1; - page_cache_readahead(mapping, ra, file, pgoff); + page_cache_readahead(mapping, ra, file, pgoff/PAGE_CACHE_MMUCOUNT); } /* * If the offset is outside the mapping size we're off the end * of a privately mapped file, so we need to map a zero page. */ - if ((pgoff < size) && !VM_RandomReadHint(area)) { + if ((pgoff/PAGE_CACHE_MMUCOUNT < size) && !VM_RandomReadHint(area)) { did_readahead = 1; - page_cache_readaround(mapping, ra, file, pgoff); + page_cache_readaround(mapping, ra, file, pgoff/PAGE_CACHE_MMUCOUNT); } /* * Do we have something in the page cache already? */ retry_find: - page = find_get_page(mapping, pgoff); + page = find_get_page(mapping, pgoff/PAGE_CACHE_MMUCOUNT); if (!page) { if (did_readahead) { handle_ra_miss(mapping, ra, pgoff); @@ -1015,7 +1015,7 @@ no_cached_page: * We're only likely to ever get here if MADV_RANDOM is in * effect. */ - error = page_cache_read(file, pgoff); + error = page_cache_read(file, pgoff/PAGE_CACHE_MMUCOUNT); /* * The page we want has now been added to the page cache. @@ -1103,7 +1103,7 @@ static struct page * filemap_getpage(str * Do we have something in the page cache already? */ retry_find: - page = find_get_page(mapping, pgoff); + page = find_get_page(mapping, pgoff/PAGE_CACHE_MMUCOUNT); if (!page) { if (nonblock) return NULL; @@ -1125,7 +1125,7 @@ success: return page; no_cached_page: - error = page_cache_read(file, pgoff); + error = page_cache_read(file, pgoff/PAGE_CACHE_MMUCOUNT); /* * The page we want has now been added to the page cache. 
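A quick illustration of the unit convention the filemap.c changes around this point rely on: pgoff and vm_pgoff are now kept in MMUPAGE units, while the page cache remains indexed in PAGE_CACHE_SIZE units, so lookups divide by PAGE_CACHE_MMUCOUNT and the remainder names the mmupage within the cached page. The sketch below is a standalone userspace demo, not part of the diff; the constants are illustrative stand-ins for the definitions added to pagemap.h (a clustering factor of 2, i.e. 16KB PAGE_SIZE over 4KB mmupages).

	#include <stdio.h>

	#define MMUPAGE_SHIFT		12			/* one hardware pte maps 4KB */
	#define MMUPAGE_SIZE		(1UL << MMUPAGE_SHIFT)
	#define PAGE_MMUSHIFT		2			/* CONFIG_PAGE_CLUSTER=2 => 16KB PAGE_SIZE */
	#define PAGE_CACHE_MMUCOUNT	(1UL << PAGE_MMUSHIFT)	/* mmupages per page cache page */

	int main(void)
	{
		unsigned long pgoff = 13;	/* file offset in mmupages, as in vma->vm_pgoff */
		unsigned long index = pgoff / PAGE_CACHE_MMUCOUNT;	/* page cache index */
		unsigned long subpfn = pgoff % PAGE_CACHE_MMUCOUNT;	/* mmupage within that page */

		printf("pgoff %lu -> cache index %lu, subpfn %lu\n", pgoff, index, subpfn);
		return 0;
	}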
@@ -1216,26 +1216,30 @@ static int filemap_populate(struct vm_ar if (!nonblock) do_page_cache_readahead(mapping, vma->vm_file, - pgoff, len >> PAGE_CACHE_SHIFT); + pgoff/PAGE_MMUCOUNT, len >> PAGE_CACHE_SHIFT); repeat: size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (pgoff + (len >> PAGE_CACHE_SHIFT) > size) + if (pgoff + len/MMUPAGE_SIZE > size) return -EINVAL; page = filemap_getpage(file, pgoff, nonblock); if (!page && !nonblock) return -ENOMEM; if (page) { - err = install_page(mm, vma, addr, page, prot); + /* + * page caches bytes index*PAGE_SIZE to (index+1)*PAGE_SIZE - 1 + * pgoff % PAGE_MMUCOUNT is the subpfn w/in the page + */ + err = install_page(mm, vma, addr, page, prot, pgoff % PAGE_MMUCOUNT); if (err) { page_cache_release(page); return err; } } - len -= PAGE_SIZE; - addr += PAGE_SIZE; + len -= MMUPAGE_SIZE; + addr += MMUPAGE_SIZE; pgoff++; if (len) goto repeat; diff -prauN linux-2.5.70-bk10/mm/fremap.c pgcl-2.5.70-bk10-1/mm/fremap.c --- linux-2.5.70-bk10/mm/fremap.c 2003-05-26 18:00:20.000000000 -0700 +++ pgcl-2.5.70-bk10-1/mm/fremap.c 2003-06-05 09:44:34.000000000 -0700 @@ -53,13 +53,14 @@ static inline int zap_pte(struct mm_stru * previously existing mapping. */ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, struct page *page, pgprot_t prot) + unsigned long addr, struct page *page, pgprot_t prot, int subpfn) { int err = -ENOMEM, flush; pte_t *pte; pgd_t *pgd; pmd_t *pmd; struct pte_chain *pte_chain; + unsigned long pfn = page_to_pfn(page) + subpfn; pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) @@ -79,7 +80,7 @@ int install_page(struct mm_struct *mm, s mm->rss++; flush_icache_page(vma, page); - set_pte(pte, mk_pte(page, prot)); + set_pte(pte, pfn_pte(pfn, prot)); pte_chain = page_add_rmap(page, pte, pte_chain); pte_unmap(pte); if (flush) @@ -127,8 +128,8 @@ long sys_remap_file_pages(unsigned long /* * Sanitize the syscall parameters: */ - start = start & PAGE_MASK; - size = size & PAGE_MASK; + start = start & MMUPAGE_MASK; + size = size & MMUPAGE_MASK; /* Does the address range wrap, or is the span zero-sized? */ if (start + size <= start) @@ -136,7 +137,7 @@ long sys_remap_file_pages(unsigned long /* Can we represent this offset inside this architecture's pte's?
*/ #if PTE_FILE_MAX_BITS < BITS_PER_LONG - if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) + if (pgoff + (size >> MMUPAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) return err; #endif diff -prauN linux-2.5.70-bk10/mm/highmem.c pgcl-2.5.70-bk10-1/mm/highmem.c --- linux-2.5.70-bk10/mm/highmem.c 2003-05-26 18:00:41.000000000 -0700 +++ pgcl-2.5.70-bk10-1/mm/highmem.c 2003-06-05 09:44:34.000000000 -0700 @@ -54,8 +54,6 @@ static int pkmap_count[LAST_PKMAP]; static unsigned int last_pkmap_nr; static spinlock_t kmap_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; -pte_t * pkmap_page_table; - static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); static void flush_all_zero_pkmaps(void) @@ -65,6 +63,8 @@ static void flush_all_zero_pkmaps(void) flush_cache_all(); for (i = 0; i < LAST_PKMAP; i++) { + int j; + unsigned long vaddr = PKMAP_ADDR(i); struct page *page; /* @@ -78,8 +78,14 @@ static void flush_all_zero_pkmaps(void) pkmap_count[i] = 0; /* sanity check */ - if (pte_none(pkmap_page_table[i])) - BUG(); + for (j = 0; j < PAGE_MMUCOUNT; ++j) { + unsigned long addr = vaddr + j*MMUPAGE_SIZE; + pgd_t *pgd = pgd_offset_k(addr); + pmd_t *pmd = pmd_offset(pgd, addr); + pte_t *pte = pte_offset_kernel(pmd, addr); + + BUG_ON(pte_none(*pte)); + } /* * Don't need an atomic fetch-and-clear op here; @@ -88,8 +94,20 @@ static void flush_all_zero_pkmaps(void) * getting the kmap_lock (which is held here). * So no dangers, even with speculative execution. */ - page = pte_page(pkmap_page_table[i]); - pte_clear(&pkmap_page_table[i]); + { + pgd_t *pgd = pgd_offset_k(vaddr); + pmd_t *pmd = pmd_offset(pgd, vaddr); + pte_t *pte = pte_offset_kernel(pmd, vaddr); + page = pte_page(*pte); + } + + for (j = 0; j < PAGE_MMUCOUNT; ++j) { + unsigned long addr = vaddr + j*MMUPAGE_SIZE; + pgd_t *pgd = pgd_offset_k(addr); + pmd_t *pmd = pmd_offset(pgd, addr); + pte_t *pte = pte_offset_kernel(pmd, addr); + pte_clear(pte); + } set_page_address(page, NULL); } @@ -99,7 +117,7 @@ static void flush_all_zero_pkmaps(void) static inline unsigned long map_new_virtual(struct page *page) { unsigned long vaddr; - int count; + int k, count; start: count = LAST_PKMAP; @@ -137,7 +155,15 @@ start: } } vaddr = PKMAP_ADDR(last_pkmap_nr); - set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot)); + WARN_ON(vaddr > __fix_to_virt(FIX_PKMAP_BEGIN)); + WARN_ON(vaddr < __fix_to_virt(FIX_PKMAP_END)); + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + unsigned long addr = vaddr + k * MMUPAGE_SIZE; + pgd_t *pgd = pgd_offset_k(addr); + pmd_t *pmd = pmd_offset(pgd, addr); + pte_t *pte = pte_offset_kernel(pmd, addr); + set_pte(pte, pfn_pte(page_to_pfn(page) + k, kmap_prot)); + } pkmap_count[last_pkmap_nr] = 1; set_page_address(page, (void *)vaddr); @@ -479,12 +505,19 @@ void check_highmem_ptes(void) preempt_disable(); for (type = 0; type < KM_TYPE_NR; type++) { + int k; idx = type + KM_TYPE_NR*smp_processor_id(); - if (!pte_none(*(kmap_pte-idx))) { + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + unsigned long addr = __fix_to_virt(FIX_KMAP_END) + idx*PAGE_SIZE + k*MMUPAGE_SIZE; + pgd_t *pgd = pgd_offset_k(addr); + pmd_t *pmd = pmd_offset(pgd, addr); + pte_t *pte = pte_offset_kernel(pmd, addr); + if (!pte_none(*pte)) { printk("scheduling with KM_TYPE %d held!\n", type); BUG(); } } + } preempt_enable(); } #endif diff -prauN linux-2.5.70-bk10/mm/madvise.c pgcl-2.5.70-bk10-1/mm/madvise.c --- linux-2.5.70-bk10/mm/madvise.c 2003-05-26 18:00:26.000000000 -0700 +++ pgcl-2.5.70-bk10-1/mm/madvise.c 2003-06-05 09:44:34.000000000 -0700 @@ -60,10 +60,12 @@ static long 
madvise_willneed(struct vm_a if (!file) return -EBADF; - start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + start = ((start - vma->vm_start) >> MMUPAGE_SHIFT) + vma->vm_pgoff; if (end > vma->vm_end) end = vma->vm_end; - end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + end = ((end - vma->vm_start) >> MMUPAGE_SHIFT) + vma->vm_pgoff; + start /= PAGE_MMUCOUNT; + end /= PAGE_MMUCOUNT; do_page_cache_readahead(file->f_dentry->d_inode->i_mapping, file, start, max_sane_readahead(end - start)); @@ -170,9 +172,9 @@ asmlinkage long sys_madvise(unsigned lon down_write(¤t->mm->mmap_sem); - if (start & ~PAGE_MASK) + if (start & ~MMUPAGE_MASK) goto out; - len = (len + ~PAGE_MASK) & PAGE_MASK; + len = (len + ~MMUPAGE_MASK) & MMUPAGE_MASK; end = start + len; if (end < start) goto out; diff -prauN linux-2.5.70-bk10/mm/memory.c pgcl-2.5.70-bk10-1/mm/memory.c --- linux-2.5.70-bk10/mm/memory.c 2003-05-26 18:00:39.000000000 -0700 +++ pgcl-2.5.70-bk10-1/mm/memory.c 2003-06-05 09:48:26.000000000 -0700 @@ -127,7 +127,7 @@ static inline void free_one_pgd(struct m */ void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr) { - pgd_t * page_dir = tlb->mm->pgd; + pgd_t * page_dir = tlb_mm(tlb)->pgd; page_dir += first; do { @@ -150,11 +150,32 @@ pte_t * pte_alloc_map(struct mm_struct * /* * Because we dropped the lock, we should re-check the * entry, as somebody else could have populated it.. + * If we raced, we also need to drop all the reference + * counts originally taken with the intent of conferring + * them to all the pte entries spanned by the pte page. */ if (pmd_present(*pmd)) { + if (PAGE_MMUCOUNT > 1) + atomic_sub(PAGE_MMUCOUNT-1, &new->count); pte_free(new); goto out; } +#if 0 + { + int k; + pmd_t *base; + unsigned long addr, __pmd = (unsigned long)pmd; + addr = address & ~(PAGE_MMUCOUNT*PMD_SIZE - 1); + base = pmd - ((__pmd/sizeof(pmd_t)) % PAGE_MMUCOUNT); + for (k = 0; k < PAGE_MMUCOUNT; ++k) + if (!pmd_none(base[k]) || pmd_present(base[k])) + printk(KERN_DEBUG + "redundant pmd instantiation " + "at vaddr 0x%lx pmd = 0x%p\n", + addr + PMD_SIZE*k, + &base[k]); + } +#endif pgtable_add_rmap(new, mm, address); pmd_populate(mm, pmd, new); } @@ -347,7 +368,7 @@ skip_copy_pte_range: src_pte = pte_offset_map_nested(src_pmd, address); cont_copy_pte_range_noset: - address += PAGE_SIZE; + address += MMUPAGE_SIZE; if (address >= end) { pte_unmap_nested(src_pte); pte_unmap(dst_pte); @@ -393,8 +414,8 @@ zap_pte_range(struct mmu_gather *tlb, pm offset = address & ~PMD_MASK; if (offset + size > PMD_SIZE) size = PMD_SIZE - offset; - size &= PAGE_MASK; - for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) { + size &= MMUPAGE_MASK; + for (offset=0; offset < size; ptep++, offset += MMUPAGE_SIZE) { pte_t pte = *ptep; if (pte_none(pte)) continue; @@ -411,15 +432,15 @@ zap_pte_range(struct mmu_gather *tlb, pm if (page->mapping && pte_young(pte) && !PageSwapCache(page)) mark_page_accessed(page); - tlb->freed++; + tlb_inc_freed(tlb); page_remove_rmap(page, ptep); tlb_remove_page(tlb, page); } } } else { + pte_clear(ptep); if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); - pte_clear(ptep); } } pte_unmap(ptep-1); @@ -474,12 +495,12 @@ void unmap_page_range(struct mmu_gather /* Dispose of an entire struct mmu_gather per rescheduling point */ #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) -#define ZAP_BLOCK_SIZE (FREE_PTE_NR * PAGE_SIZE) +#define ZAP_BLOCK_SIZE (FREE_PTE_NR * MMUPAGE_SIZE) #endif /* For UP, 256 pages at a time gives nice low latency */ #if 
!defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) -#define ZAP_BLOCK_SIZE (256 * PAGE_SIZE) +#define ZAP_BLOCK_SIZE (256 * MMUPAGE_SIZE) #endif /* No preempt: go for the best straight-line efficiency */ @@ -518,16 +539,16 @@ int unmap_vmas(struct mmu_gather **tlbp, unsigned long end_addr, unsigned long *nr_accounted) { unsigned long zap_bytes = ZAP_BLOCK_SIZE; - unsigned long tlb_start; /* For tlb_finish_mmu */ + unsigned long tlb_start = 0; /* For tlb_finish_mmu */ int tlb_start_valid = 0; int ret = 0; if (vma) { /* debug. killme. */ if (end_addr <= vma->vm_start) - printk("%s: end_addr(0x%08lx) <= vm_start(0x%08lx)\n", + pr_debug("%s: end_addr(0x%08lx) <= vm_start(0x%08lx)\n", __FUNCTION__, end_addr, vma->vm_start); if (start_addr >= vma->vm_end) - printk("%s: start_addr(0x%08lx) <= vm_end(0x%08lx)\n", + pr_debug("%s: start_addr(0x%08lx) <= vm_end(0x%08lx)\n", __FUNCTION__, start_addr, vma->vm_end); } @@ -543,7 +564,7 @@ int unmap_vmas(struct mmu_gather **tlbp, continue; if (vma->vm_flags & VM_ACCOUNT) - *nr_accounted += (end - start) >> PAGE_SHIFT; + *nr_accounted += (end - start) >> MMUPAGE_SHIFT; ret++; while (start != end) { @@ -565,7 +586,9 @@ int unmap_vmas(struct mmu_gather **tlbp, if ((long)zap_bytes > 0) continue; if (need_resched()) { - tlb_finish_mmu(*tlbp, tlb_start, start); + tlb_finish_mmu(*tlbp, + tlb_start_valid ? tlb_start : 0, + start); cond_resched_lock(&mm->page_table_lock); *tlbp = tlb_gather_mmu(mm, 0); tlb_start_valid = 0; @@ -573,7 +596,7 @@ int unmap_vmas(struct mmu_gather **tlbp, zap_bytes = ZAP_BLOCK_SIZE; } if (vma->vm_next && vma->vm_next->vm_start < vma->vm_end) - printk("%s: VMA list is not sorted correctly!\n", + pr_debug("%s: VMA list is not sorted correctly!\n", __FUNCTION__); } return ret; @@ -612,18 +635,19 @@ void zap_page_range(struct vm_area_struc * Do a quick page-table lookup for a single page. * mm->page_table_lock must be held. */ -struct page * -follow_page(struct mm_struct *mm, unsigned long address, int write) +unsigned long follow_page(struct mm_struct *mm, unsigned long address, int write) { pgd_t *pgd; pmd_t *pmd; pte_t *ptep, pte; unsigned long pfn; - struct vm_area_struct *vma; +#if 0 + struct vm_area_struct *vma; vma = hugepage_vma(mm, address); if (vma) return follow_huge_addr(mm, vma, address, write); +#endif pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || pgd_bad(*pgd)) @@ -632,8 +656,15 @@ follow_page(struct mm_struct *mm, unsign pmd = pmd_offset(pgd, address); if (pmd_none(*pmd)) goto out; + + /* + * hugetlb's still broken in pgcl; not difficult to fix, + * but an unnecessary distraction while it's in flux + */ +#if 0 if (pmd_huge(*pmd)) return follow_huge_pmd(mm, address, pmd, write); +#endif if (pmd_bad(*pmd)) goto out; @@ -647,12 +678,12 @@ follow_page(struct mm_struct *mm, unsign if (!write || (pte_write(pte) && pte_dirty(pte))) { pfn = pte_pfn(pte); if (pfn_valid(pfn)) - return pfn_to_page(pfn); + return pfn; /* pfn_to_page(pfn) */ } } out: - return NULL; + return 0; /* NULL */ } /* @@ -664,14 +695,26 @@ out: static inline struct page *get_page_map(struct page *page) { if (!pfn_valid(page_to_pfn(page))) - return 0; + return NULL; return page; } +static inline unsigned long get_pfn_map(unsigned long pfn) +{ + return pfn_valid(pfn) ? pfn : 0; +} + +/* + * This puppy is handing back MMUPAGE_SIZE -sized slots. + * Callers need auditing. + * This function is a goddamn train wreck. Someone needs to + * janitor the idiot thing for mainline to at very least kill + * the #ifdef FIXADDR_START bullcrap. 
+ */ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, - struct page **pages, struct vm_area_struct **vmas) + unsigned long *pfns, struct vm_area_struct **vmas) { int i; unsigned int flags; @@ -701,46 +744,48 @@ int get_user_pages(struct task_struct *t .vm_page_prot = PAGE_READONLY, .vm_flags = VM_READ | VM_EXEC, }; - unsigned long pg = start & PAGE_MASK; + unsigned long pg = start & MMUPAGE_MASK; pgd_t *pgd; pmd_t *pmd; pte_t *pte; pgd = pgd_offset_k(pg); if (!pgd) - return i ? : -EFAULT; + return i ? i : -EFAULT; pmd = pmd_offset(pgd, pg); if (!pmd) - return i ? : -EFAULT; + return i ? i : -EFAULT; pte = pte_offset_kernel(pmd, pg); if (!pte || !pte_present(*pte) || !pte_user(*pte) || !(write ? pte_write(*pte) : pte_read(*pte))) - return i ? : -EFAULT; - if (pages) { - pages[i] = pte_page(*pte); - get_page(pages[i]); + return i ? i : -EFAULT; + if (pfns) { + pfns[i] = pte_pfn(*pte); + get_page(pfn_to_page(pfns[i])); } if (vmas) vmas[i] = &fixmap_vma; i++; - start += PAGE_SIZE; + start += MMUPAGE_SIZE; len--; continue; } #endif - if (!vma || (pages && (vma->vm_flags & VM_IO)) + if (!vma || (pfns && (vma->vm_flags & VM_IO)) || !(flags & vma->vm_flags)) - return i ? : -EFAULT; + return i ? i : -EFAULT; +#if 0 if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &len, i); continue; } +#endif spin_lock(&mm->page_table_lock); do { - struct page *map; - while (!(map = follow_page(mm, start, write))) { + unsigned long map_pfn; + while (!(map_pfn = follow_page(mm, start, write))) { spin_unlock(&mm->page_table_lock); switch (handle_mm_fault(mm,vma,start,write)) { case VM_FAULT_MINOR: @@ -750,36 +795,50 @@ int get_user_pages(struct task_struct *t tsk->maj_flt++; break; case VM_FAULT_SIGBUS: + if (!i) + printk("get_user_pages(): VM_FAULT_SIGBUS\n"); return i ? i : -EFAULT; case VM_FAULT_OOM: + if (!i) + printk("get_user_pages(): VM_FAULT_OOM\n"); return i ? 
i : -ENOMEM; default: BUG(); } spin_lock(&mm->page_table_lock); } - if (pages) { - pages[i] = get_page_map(map); - if (!pages[i]) { + if (pfns) { + pfns[i] = get_pfn_map(map_pfn); + if (!pfns[i]) { spin_unlock(&mm->page_table_lock); - while (i--) - page_cache_release(pages[i]); + while (i--) { + struct page *map; + map = pfn_to_page(pfns[i]); + page_cache_release(map); + } i = -EFAULT; + printk("get_user_pages(): saw a zero pfn\n"); goto out; } - flush_dcache_page(pages[i]); - if (!PageReserved(pages[i])) - page_cache_get(pages[i]); + if (1) { + struct page *map; + map = pfn_to_page(pfns[i]); + flush_dcache_page(map); + if (!PageReserved(map)) + page_cache_get(map); + } } if (vmas) vmas[i] = vma; i++; - start += PAGE_SIZE; + start += MMUPAGE_SIZE; len--; } while(len && start < vma->vm_end); spin_unlock(&mm->page_table_lock); } while(len); out: + if (i < 0) + pr_debug("get_user_pages() returning an error\n"); return i; } @@ -796,7 +855,7 @@ static void zeromap_pte_range(pte_t * pt pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot)); BUG_ON(!pte_none(*pte)); set_pte(pte, zero_pte); - address += PAGE_SIZE; + address += MMUPAGE_SIZE; pte++; } while (address && (address < end)); } @@ -832,8 +891,7 @@ int zeromap_page_range(struct vm_area_st dir = pgd_offset(mm, address); flush_cache_range(vma, beg, end); - if (address >= end) - BUG(); + BUG_ON(address >= end); spin_lock(&mm->page_table_lock); do { @@ -867,12 +925,12 @@ static inline void remap_pte_range(pte_t end = address + size; if (end > PMD_SIZE) end = PMD_SIZE; - pfn = phys_addr >> PAGE_SHIFT; + pfn = phys_addr >> MMUPAGE_SHIFT; do { BUG_ON(!pte_none(*pte)); if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) set_pte(pte, pfn_pte(pfn, prot)); - address += PAGE_SIZE; + address += MMUPAGE_SIZE; pfn++; pte++; } while (address && (address < end)); @@ -913,8 +971,7 @@ int remap_page_range(struct vm_area_stru phys_addr -= from; dir = pgd_offset(mm, from); flush_cache_range(vma, beg, end); - if (from >= end) - BUG(); + BUG_ON(from >= end); spin_lock(&mm->page_table_lock); do { @@ -951,12 +1008,12 @@ static inline void establish_pte(struct /* * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock */ -static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, - pte_t *page_table) +static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, pte_t *page_table, unsigned long subpfn) { + pte_t pte = pfn_pte(page_to_pfn(new_page) + subpfn, vma->vm_page_prot); invalidate_vcache(address, vma->vm_mm, new_page); flush_cache_page(vma, address); - establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); + establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(pte))); } /* @@ -1004,6 +1061,10 @@ static int do_wp_page(struct mm_struct * int reuse = can_share_swap_page(old_page); unlock_page(old_page); if (reuse) { + /* + * XXX: this should sweep the pagetables to + * prefault all the pte's. This is free, take it. + */ flush_cache_page(vma, address); establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); @@ -1016,6 +1077,8 @@ static int do_wp_page(struct mm_struct * /* * Ok, we need to copy. Oh, well.. + * XXX: This needs to sweep the pagetables in an analogous + * manner to do_anonymous_page(). 
*/ page_cache_get(old_page); spin_unlock(&mm->page_table_lock); @@ -1034,10 +1097,11 @@ static int do_wp_page(struct mm_struct * spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); if (pte_same(*page_table, pte)) { + unsigned long subpfn = pfn & (PAGE_MMUCOUNT-1); if (PageReserved(old_page)) ++mm->rss; page_remove_rmap(old_page, page_table); - break_cow(vma, new_page, address, page_table); + break_cow(vma, new_page, address, page_table, subpfn); pte_chain = page_add_rmap(new_page, page_table, pte_chain); lru_cache_add_active(new_page); @@ -1079,14 +1143,14 @@ static void vmtruncate_list(struct list_ } /* mapping wholly unaffected? */ - len = len >> PAGE_SHIFT; + len = len >> MMUPAGE_SHIFT; diff = pgoff - vma->vm_pgoff; if (diff >= len) continue; /* Ok, partially affected.. */ - start += diff << PAGE_SHIFT; - len = (len - diff) << PAGE_SHIFT; + start += diff << MMUPAGE_SHIFT; + len = (len - diff) << MMUPAGE_SHIFT; zap_page_range(vma, start, len); } } @@ -1108,7 +1172,7 @@ int vmtruncate(struct inode * inode, lof if (inode->i_size < offset) goto do_expand; inode->i_size = offset; - pgoff = (offset + PAGE_SIZE - 1) >> PAGE_SHIFT; + pgoff = (offset + MMUPAGE_SIZE - 1) / MMUPAGE_SIZE; down(&mapping->i_shared_sem); if (unlikely(!list_empty(&mapping->i_mmap))) vmtruncate_list(&mapping->i_mmap, pgoff); @@ -1174,8 +1238,13 @@ static int do_swap_page(struct mm_struct struct page *page; swp_entry_t entry = pte_to_swp_entry(orig_pte); pte_t pte; - int ret = VM_FAULT_MINOR; + int rss, ret = VM_FAULT_MINOR; struct pte_chain *pte_chain = NULL; + unsigned long subpfn, flt_subpfn = swp_offset(entry) % PAGE_MMUCOUNT; + unsigned long pfn, lo_vaddr, hi_vaddr, vaddr; + + lo_vaddr = max(address & PAGE_MASK, vma->vm_start); + hi_vaddr = min(PAGE_ALIGN(address), vma->vm_end); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); @@ -1207,7 +1276,7 @@ static int do_swap_page(struct mm_struct mark_page_accessed(page); pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) { - ret = -ENOMEM; + ret = VM_FAULT_OOM; goto out; } lock_page(page); @@ -1229,24 +1298,90 @@ static int do_swap_page(struct mm_struct /* The page isn't present yet, go ahead with the fault. */ + /* + swap_free(entry); + if (vm_swap_full()) + remove_exclusive_swap_page(page); + */ + + rss = 0; + vaddr = lo_vaddr; + page_table -= (address - lo_vaddr)/MMUPAGE_SIZE; + + flush_icache_page(vma, page); + + pfn = page_to_pfn(page); + + do { + /* already faulted in? 
less work for me */ + if (pte_present(*page_table)) + goto next; + + entry = pte_to_swp_entry(*page_table); + + if (!pte_none(*page_table) && + swp_offset(entry)/PAGE_MMUCOUNT == page->index) { + swap_free(entry); if (vm_swap_full()) remove_exclusive_swap_page(page); + subpfn = swp_offset(entry) % PAGE_MMUCOUNT; + pte = pfn_pte(pfn + subpfn, vma->vm_page_prot); + + } else if (pte_none(*page_table)) { + + subpfn = flt_subpfn + (vaddr - address)/MMUPAGE_SHIFT; + + /* it'd fall outside the page */ + if (subpfn >= PAGE_MMUCOUNT) + goto next; + + pte = pfn_pte(pfn + subpfn, vma->vm_page_prot); + + /* !pte_none() && swp_offset()/PAGE_MMUCOUNT != page->index */ + } else + goto next; - mm->rss++; - pte = mk_pte(page, vma->vm_page_prot); if (write_access && can_share_swap_page(page)) pte = pte_mkdirty(pte_mkwrite(pte)); - unlock_page(page); - flush_icache_page(vma, page); + if (!pte_chain) + pte_chain = pte_chain_alloc(GFP_ATOMIC); + if (!pte_chain) { + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + ret = VM_FAULT_OOM; + spin_lock(&mm->page_table_lock); + mm->rss += rss; + spin_unlock(&mm->page_table_lock); + goto no_mem; + } + spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, vaddr); + } + set_pte(page_table, pte); + ++rss; pte_chain = page_add_rmap(page, page_table, pte_chain); +next: + vaddr += MMUPAGE_SIZE; + page_table++; + } while (vaddr < hi_vaddr); - /* No need to invalidate - it was non-present before */ + unlock_page(page); update_mmu_cache(vma, address, pte); - pte_unmap(page_table); + mm->rss += rss; + pte_unmap(page_table-1); spin_unlock(&mm->page_table_lock); +no_mem: + if (!page) + goto out; + if (!rss) + page_cache_release(page); + else if (rss > 1) + atomic_add(rss - 1, &page->count); out: pte_chain_free(pte_chain); return ret; @@ -1262,66 +1397,200 @@ do_anonymous_page(struct mm_struct *mm, pte_t *page_table, pmd_t *pmd, int write_access, unsigned long addr) { - pte_t entry; - struct page * page = ZERO_PAGE(addr); - struct pte_chain *pte_chain; - int ret; - + struct page *page = NULL; + struct pte_chain *pte_chain = NULL; + unsigned long up_vaddr, dn_vaddr, lo_vaddr, hi_vaddr; + unsigned long pfn, subpfn, dn_subpfn, up_subpfn; + pte_t *ptes[PAGE_MMUCOUNT] = { [0 ... PAGE_MMUCOUNT-1] = NULL }; + pte_t *up_pte, *dn_pte; + int rss, ret = VM_FAULT_MINOR; + + if (write_access) + pr_debug("write fault on 0x%lx\n", addr); + else + pr_debug("read fault on 0x%lx\n", addr); + pr_debug("page_table = 0x%p\n", page_table); + + if (!write_access) + page = ZERO_PAGE(addr); + else { + if (!pte_chain) pte_chain = pte_chain_alloc(GFP_ATOMIC); - if (!pte_chain) { pte_unmap(page_table); spin_unlock(&mm->page_table_lock); + if (!pte_chain) { pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) - goto no_mem; - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, addr); + return VM_FAULT_OOM; + } + page = alloc_page(GFP_HIGHUSER); + if (!page) { + pte_chain_free(pte_chain); + return VM_FAULT_OOM; + } + clear_user_highpage(page, addr); } - /* Read-only mapping of ZERO_PAGE. 
*/ - entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); + lo_vaddr = max(addr & ~(PAGE_MMUCOUNT*PMD_SIZE - 1), vma->vm_start); + hi_vaddr = min(vma->vm_end, (addr + PAGE_MMUCOUNT*PMD_SIZE - 1) + & ~(PAGE_MMUCOUNT*PMD_SIZE - 1)); + dn_subpfn = 0; + up_subpfn = PAGE_MMUCOUNT - 1; + dn_vaddr = addr & MMUPAGE_MASK; + up_vaddr = MMUPAGE_ALIGN(addr + 1); + + pr_debug("vma->vm_start = 0x%lx, vma->vm_end = 0x%lx\n", + vma->vm_start, vma->vm_end); + pr_debug("lo_vaddr = 0x%lx, hi_vaddr = 0x%lx\n", lo_vaddr, hi_vaddr); + pr_debug("dn_vaddr = 0x%lx, up_vaddr = 0x%lx\n", dn_vaddr, up_vaddr); - /* ..except if it's a write access */ if (write_access) { - /* Allocate our own private page. */ - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pr_debug("about to take mm->page_table_lock\n"); + if (spin_is_locked(&mm->page_table_lock)) + printk("hmm, I see a deadlock coming\n"); + spin_lock(&mm->page_table_lock); + } - page = alloc_page(GFP_HIGHUSER); - if (!page) - goto no_mem; - clear_user_highpage(page, addr); + pr_debug("starting PTE search loop\n"); + if (write_access) + page_table = dn_pte = pte_offset_map(pmd, dn_vaddr); + else + dn_pte = page_table; + up_pte = dn_pte + 1; - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, addr); + do { + if (up_vaddr < hi_vaddr && up_subpfn > dn_subpfn) { + if (pte_none(*up_pte)) { + ptes[up_subpfn] = up_pte; + up_subpfn--; + } + up_vaddr += MMUPAGE_SIZE; + up_pte++; + } - if (!pte_none(*page_table)) { - pte_unmap(page_table); - page_cache_release(page); - spin_unlock(&mm->page_table_lock); - ret = VM_FAULT_MINOR; - goto out; + if (dn_vaddr >= lo_vaddr && dn_subpfn < up_subpfn) { + if (pte_none(*dn_pte)) { + ptes[dn_subpfn] = dn_pte; + dn_subpfn++; } - mm->rss++; - entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); - lru_cache_add_active(page); - mark_page_accessed(page); + dn_vaddr -= MMUPAGE_SIZE; + dn_pte--; } + pr_debug("dn_vaddr = 0x%lx, up_vaddr = 0x%lx\n", + dn_vaddr, up_vaddr); + pr_debug("dn_subpfn = 0x%lx, up_subpfn = 0x%lx\n", + dn_subpfn, up_subpfn); + } while ((up_vaddr < hi_vaddr || dn_vaddr >= lo_vaddr) && + up_subpfn > dn_subpfn); + + pr_debug("finishing PTE search loop\n"); + pr_debug("starting PTE instantiation loop\n"); + + pfn = page_to_pfn(page); + rss = 0; + for (subpfn = 0; subpfn < PAGE_MMUCOUNT; ++subpfn) { + pte_t pte; - set_pte(page_table, entry); - /* ignores ZERO_PAGE */ - pte_chain = page_add_rmap(page, page_table, pte_chain); - pte_unmap(page_table); + pr_debug("subpfn = 0x%lx, ptep = 0x%p\n", subpfn, ptes[subpfn]); - /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, addr, entry); - spin_unlock(&mm->page_table_lock); - ret = VM_FAULT_MINOR; - goto out; + if (!ptes[subpfn]) { + pr_debug("pte empty\n"); + continue; + } else if (!pte_none(*ptes[subpfn])) { + pr_debug("pte non-none\n"); + continue; + } -no_mem: - ret = VM_FAULT_OOM; -out: + pte = pfn_pte(pfn + subpfn, vma->vm_page_prot); + if (!write_access) { + pr_debug("setting pte to zero page\n"); + set_pte(ptes[subpfn], pte_wrprotect(pte)); + } else { + pr_debug("setting pte to newly zeroed anonymous page\n"); + if (!pte_chain) + pte_chain = pte_chain_alloc(GFP_ATOMIC); + if (!pte_chain) { + unsigned long vaddr, offset; + int k; + + pr_debug("doing sleeping alloc of pte_chain" + " for non-anonymous page\n"); + + vaddr = ptep_to_address(ptes[subpfn]); + + pr_debug("vaddr = 0x%lx\n", vaddr); + + pte_unmap(ptes[subpfn]); + spin_unlock(&mm->page_table_lock); + pte_chain = 
pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + pr_debug("going to out_oom\n"); + ret = VM_FAULT_OOM; + goto out_oom; + } + spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, vaddr); + + /* is this safe from gcc? NFI */ + if (page_table != ptes[subpfn]) { + pr_debug("(page_table) 0x%p != 0x%p" + " (ptes[subpfn])\n", + page_table, + ptes[subpfn]); + offset = (unsigned long) + (page_table - ptes[subpfn]); + pr_debug("adjusting all ptes by" + " offset 0x%lx\n", + offset); + for (k = subpfn; k < PAGE_MMUCOUNT; ++k) { + pr_debug("pte before 0x%p\n", + ptes[k]); + if (ptes[k]) + ptes[k] += offset; + pr_debug("pte after 0x%p\n", + ptes[k]); + } + } + + /* check for races */ + if (!pte_none(*ptes[subpfn])) { + pr_debug("raced, skipping PTE\n"); + continue; + } + } + pr_debug("setting pte for anonymous zeroed page\n"); + pr_debug("ptep = 0x%p, pte = 0x%Lx\n", + ptes[subpfn], (u64)pte_val(pte)); + set_pte(ptes[subpfn], pte_mkwrite(pte_mkdirty(pte))); + pr_debug("about to page_add_rmap()\n"); + pte_chain = page_add_rmap(page, ptes[subpfn], + pte_chain); + pr_debug("about to update_mmu_cache()\n"); + update_mmu_cache(vma, addr, pte); + rss++; + pr_debug("about to page_cache_get()\n"); + page_cache_get(page); + } + pr_debug("falling through to next subpfn\n"); + } + pr_debug("doing pte_unmap(0x%p)\n", page_table); + pte_unmap(page_table); + pr_debug("adding %d to rss\n", rss); + mm->rss += rss; + spin_unlock(&mm->page_table_lock); + pr_debug("broke out of PTE instantiation loop\n"); +out_oom: + pr_debug("at out_oom\n"); + if (write_access) { + if (rss) { + pr_debug("adding page to LRU\n"); + lru_cache_add_active(page); + mark_page_accessed(page); + } + pr_debug("releasing page\n"); + page_cache_release(page); + } + pr_debug("doing pte_chain_free()\n"); pte_chain_free(pte_chain); return ret; } @@ -1353,12 +1622,12 @@ do_no_page(struct mm_struct *mm, struct pte_unmap(page_table); spin_unlock(&mm->page_table_lock); - new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0); + new_page = vma->vm_ops->nopage(vma, address & MMUPAGE_MASK, 0); /* no page was available -- either SIGBUS or OOM */ if (new_page == NOPAGE_SIGBUS) return VM_FAULT_SIGBUS; - if (new_page == NOPAGE_OOM) + else if (new_page == NOPAGE_OOM) return VM_FAULT_OOM; pte_chain = pte_chain_alloc(GFP_KERNEL); @@ -1392,12 +1661,17 @@ do_no_page(struct mm_struct *mm, struct * an exclusive copy of the page, or this is a shared mapping, * so we can make it writable and dirty to avoid having to * handle that later. + * + * XXX: this should sweep pagetables and prefault */ /* Only go through if we didn't race with anybody else... 
*/ if (pte_none(*page_table)) { + unsigned long pfn; ++mm->rss; flush_icache_page(vma, new_page); - entry = mk_pte(new_page, vma->vm_page_prot); + pfn = page_to_pfn(new_page) + + vma_suboffset(vma, address)/MMUPAGE_SIZE; + entry = pfn_pte(pfn, vma->vm_page_prot); if (write_access) entry = pte_mkwrite(pte_mkdirty(entry)); set_pte(page_table, entry); @@ -1451,7 +1725,7 @@ static int do_file_page(struct mm_struct pte_unmap(pte); spin_unlock(&mm->page_table_lock); - err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0); + err = vma->vm_ops->populate(vma, address & MMUPAGE_MASK, MMUPAGE_SIZE, vma->vm_page_prot, pgoff, 0); if (err == -ENOMEM) return VM_FAULT_OOM; if (err) @@ -1585,11 +1859,9 @@ int make_pages_present(unsigned long add vma = find_vma(current->mm, addr); write = (vma->vm_flags & VM_WRITE) != 0; - if (addr >= end) - BUG(); - if (end > vma->vm_end) - BUG(); - len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE; + BUG_ON(addr >= end); + BUG_ON(end > vma->vm_end); + len = (end+MMUPAGE_SIZE-1)/MMUPAGE_SIZE-addr/MMUPAGE_SIZE; ret = get_user_pages(current, current->mm, addr, len, write, 0, NULL, NULL); return ret == len ? 0 : -1; diff -prauN linux-2.5.70-bk10/mm/mincore.c pgcl-2.5.70-bk10-1/mm/mincore.c --- linux-2.5.70-bk10/mm/mincore.c 2003-05-26 18:01:00.000000000 -0700 +++ pgcl-2.5.70-bk10-1/mm/mincore.c 2003-06-05 09:44:34.000000000 -0700 @@ -29,7 +29,7 @@ static unsigned char mincore_page(struct struct address_space * as = vma->vm_file->f_dentry->d_inode->i_mapping; struct page * page; - page = find_get_page(as, pgoff); + page = find_get_page(as, pgoff/PAGE_CACHE_MMUCOUNT); if (page) { present = PageUptodate(page); page_cache_release(page); @@ -42,41 +42,43 @@ static long mincore_vma(struct vm_area_s unsigned long start, unsigned long end, unsigned char __user * vec) { long error, i, remaining; - unsigned char * tmp; + unsigned char *kaddr; + struct page *page; error = -ENOMEM; if (!vma->vm_file) return error; - start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + start = ((start - vma->vm_start) >> MMUPAGE_SHIFT) + vma->vm_pgoff; if (end > vma->vm_end) end = vma->vm_end; - end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + end = ((end - vma->vm_start) >> MMUPAGE_SHIFT) + vma->vm_pgoff; error = -EAGAIN; - tmp = (unsigned char *) __get_free_page(GFP_KERNEL); - if (!tmp) + page = alloc_page(GFP_HIGHUSER); + if (!page) return error; /* (end - start) is # of pages, and also # of bytes in "vec */ - remaining = (end - start), + remaining = end - start; error = 0; + kaddr = kmap_atomic(page, KM_USER0); for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) { int j = 0; long thispiece = (remaining < PAGE_SIZE) ? 
remaining : PAGE_SIZE; while (j < thispiece) - tmp[j++] = mincore_page(vma, start++); + kaddr[j++] = mincore_page(vma, start++); - if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) { + if (copy_to_user(vec + PAGE_SIZE * i, kaddr, thispiece)) { error = -EFAULT; break; } } - - free_page((unsigned long) tmp); + kunmap_atomic(kaddr, KM_USER0); + __free_page(page); return error; } @@ -116,15 +118,15 @@ asmlinkage long sys_mincore(unsigned lon down_read(¤t->mm->mmap_sem); - if (start & ~PAGE_CACHE_MASK) + if (start & ~MMUPAGE_MASK) goto out; - len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK; + len = (len + ~MMUPAGE_MASK) & MMUPAGE_MASK; end = start + len; if (end < start) goto out; error = -EFAULT; - if (!access_ok(VERIFY_WRITE, (unsigned long) vec, len >> PAGE_SHIFT)) + if (!access_ok(VERIFY_WRITE, (unsigned long) vec, len >> MMUPAGE_SHIFT)) goto out; error = 0; @@ -164,7 +166,7 @@ asmlinkage long sys_mincore(unsigned lon error = mincore_vma(vma, start, vma->vm_end, &vec[index]); if (error) goto out; - index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT; + index += (vma->vm_end - start)/MMUPAGE_SIZE; start = vma->vm_end; vma = vma->vm_next; } diff -prauN linux-2.5.70-bk10/mm/mlock.c pgcl-2.5.70-bk10-1/mm/mlock.c --- linux-2.5.70-bk10/mm/mlock.c 2003-05-26 18:00:21.000000000 -0700 +++ pgcl-2.5.70-bk10-1/mm/mlock.c 2003-06-05 09:44:34.000000000 -0700 @@ -37,7 +37,7 @@ static int mlock_fixup(struct vm_area_st /* * Keep track of amount of locked VM. */ - pages = (end - start) >> PAGE_SHIFT; + pages = (end - start) >> MMUPAGE_SHIFT; if (newflags & VM_LOCKED) { pages = -pages; make_pages_present(start, end); @@ -55,7 +55,7 @@ static int do_mlock(unsigned long start, if (on && !capable(CAP_IPC_LOCK)) return -EPERM; - len = PAGE_ALIGN(len); + len = MMUPAGE_ALIGN(len); end = start + len; if (end < start) return -EINVAL; @@ -101,14 +101,14 @@ asmlinkage long sys_mlock(unsigned long int error = -ENOMEM; down_write(¤t->mm->mmap_sem); - len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); - start &= PAGE_MASK; + len = MMUPAGE_ALIGN(len + (start & ~MMUPAGE_MASK)); + start &= MMUPAGE_MASK; - locked = len >> PAGE_SHIFT; + locked = len >> MMUPAGE_SHIFT; locked += current->mm->locked_vm; lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur; - lock_limit >>= PAGE_SHIFT; + lock_limit >>= MMUPAGE_SHIFT; /* check against resource limits */ if (locked <= lock_limit) @@ -122,8 +122,8 @@ asmlinkage long sys_munlock(unsigned lon int ret; down_write(¤t->mm->mmap_sem); - len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); - start &= PAGE_MASK; + len = MMUPAGE_ALIGN(len + (start & ~MMUPAGE_MASK)); + start &= MMUPAGE_MASK; ret = do_mlock(start, len, 0); up_write(¤t->mm->mmap_sem); return ret; @@ -167,7 +167,7 @@ asmlinkage long sys_mlockall(int flags) goto out; lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur; - lock_limit >>= PAGE_SHIFT; + lock_limit >>= MMUPAGE_SHIFT; ret = -ENOMEM; if (current->mm->total_vm <= lock_limit) diff -prauN linux-2.5.70-bk10/mm/mmap.c pgcl-2.5.70-bk10-1/mm/mmap.c --- linux-2.5.70-bk10/mm/mmap.c 2003-06-05 05:44:01.000000000 -0700 +++ pgcl-2.5.70-bk10-1/mm/mmap.c 2003-06-05 09:48:26.000000000 -0700 @@ -56,7 +56,8 @@ atomic_t vm_committed_space = ATOMIC_INI /* * Check that a process has enough memory to allocate a new virtual * mapping. 1 means there is enough memory for the allocation to - * succeed and 0 implies there is not. + * succeed and 0 implies there is not. the "pages" argument is in + * mmupages. 
* * We currently support three overcommit policies, which are set via the * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-acounting @@ -78,7 +79,7 @@ int vm_enough_memory(long pages) return 1; if (sysctl_overcommit_memory == 0) { - free = get_page_cache_size(); + free = get_page_cache_size(); free += nr_free_pages(); free += nr_swap_pages; @@ -89,6 +90,7 @@ int vm_enough_memory(long pages) * cache and most inode caches should fall into this */ free += atomic_read(&slab_reclaim_pages); + free *= PAGE_MMUCOUNT; /* * Leave the last 3% for root @@ -102,14 +104,14 @@ int vm_enough_memory(long pages) return 0; } - allowed = totalram_pages * sysctl_overcommit_ratio / 100; + allowed = totalram_pages*(sysctl_overcommit_ratio/100); allowed += total_swap_pages; + allowed *= PAGE_MMUCOUNT; if (atomic_read(&vm_committed_space) < allowed) return 1; vm_unacct_memory(pages); - return 0; } @@ -159,8 +161,8 @@ asmlinkage unsigned long sys_brk(unsigne if (brk < mm->end_code) goto out; - newbrk = PAGE_ALIGN(brk); - oldbrk = PAGE_ALIGN(mm->brk); + newbrk = MMUPAGE_ALIGN(brk); + oldbrk = MMUPAGE_ALIGN(mm->brk); if (oldbrk == newbrk) goto set_brk; @@ -177,7 +179,7 @@ asmlinkage unsigned long sys_brk(unsigne goto out; /* Check against existing mmap mappings. */ - if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) + if (find_vma_intersection(mm, oldbrk, newbrk+MMUPAGE_SIZE)) goto out; /* Ok, looks good - let it rip. */ @@ -534,10 +536,10 @@ unsigned long do_mmap_pgoff(struct file if (len > TASK_SIZE) return -EINVAL; - len = PAGE_ALIGN(len); + len = MMUPAGE_ALIGN(len); /* offset overflow? */ - if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) + if ((pgoff + (len >> MMUPAGE_SHIFT)) < pgoff) return -EINVAL; /* Too many mappings? */ @@ -548,7 +550,7 @@ unsigned long do_mmap_pgoff(struct file * that it represents a valid section of the address space. */ addr = get_unmapped_area(file, addr, len, pgoff, flags); - if (addr & ~PAGE_MASK) + if (addr & ~MMUPAGE_MASK) return addr; /* Do simple checking here so the lower-level routines won't have @@ -565,7 +567,7 @@ unsigned long do_mmap_pgoff(struct file } /* mlock MCL_FUTURE? */ if (vm_flags & VM_LOCKED) { - unsigned long locked = mm->locked_vm << PAGE_SHIFT; + unsigned long locked = mm->locked_vm << MMUPAGE_SHIFT; locked += len; if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) return -EAGAIN; @@ -633,7 +635,7 @@ munmap_back: } /* Check against address space limit. */ - if ((mm->total_vm << PAGE_SHIFT) + len + if ((mm->total_vm << MMUPAGE_SHIFT) + len > current->rlim[RLIMIT_AS].rlim_cur) return -ENOMEM; @@ -645,7 +647,7 @@ munmap_back: /* * Private writable mapping: check memory availability */ - charged = len >> PAGE_SHIFT; + charged = len >> MMUPAGE_SHIFT; if (!vm_enough_memory(charged)) return -ENOMEM; vm_flags |= VM_ACCOUNT; @@ -730,9 +732,9 @@ munmap_back: kmem_cache_free(vm_area_cachep, vma); } out: - mm->total_vm += len >> PAGE_SHIFT; + mm->total_vm += len >> MMUPAGE_SHIFT; if (vm_flags & VM_LOCKED) { - mm->locked_vm += len >> PAGE_SHIFT; + mm->locked_vm += len >> MMUPAGE_SHIFT; make_pages_present(addr, addr + len); } if (flags & MAP_POPULATE) { @@ -765,7 +767,7 @@ unacct_error: * Ugly calling convention alert: * Return value with the low bits set means error value, * ie - * if (ret & ~PAGE_MASK) + * if (ret & ~MMUPAGE_MASK) * error = ret; * * This function "knows" that -ENOMEM has the bits set. 
@@ -783,7 +785,7 @@ arch_get_unmapped_area(struct file *filp return -ENOMEM; if (addr) { - addr = PAGE_ALIGN(addr); + addr = MMUPAGE_ALIGN(addr); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && (!vma || addr + len <= vma->vm_start)) @@ -822,7 +824,7 @@ get_unmapped_area(struct file *file, uns if (addr > TASK_SIZE - len) return -ENOMEM; - if (addr & ~PAGE_MASK) + if (addr & ~MMUPAGE_MASK) return -EINVAL; if (file && is_file_hugepages(file)) { /* @@ -928,18 +930,20 @@ int expand_stack(struct vm_area_struct * { unsigned long grow; - if (!(vma->vm_flags & VM_GROWSUP)) + if (!(vma->vm_flags & VM_GROWSUP)) { + printk("bad vma flags in expand_stack()\n"); return -EFAULT; + } /* * vma->vm_start/vm_end cannot change under us because the caller * is required to hold the mmap_sem in read mode. We need to get * the spinlock only before relocating the vma range ourself. */ - address += 4 + PAGE_SIZE - 1; - address &= PAGE_MASK; + address += 4 + MMUPAGE_SIZE - 1; + address &= MMUPAGE_MASK; spin_lock(&vma->vm_mm->page_table_lock); - grow = (address - vma->vm_end) >> PAGE_SHIFT; + grow = (address - vma->vm_end) >> MMUPAGE_SHIFT; /* Overcommit.. */ if (!vm_enough_memory(grow)) { @@ -948,7 +952,7 @@ int expand_stack(struct vm_area_struct * } if (address - vma->vm_start > current->rlim[RLIMIT_STACK].rlim_cur || - ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > + ((vma->vm_mm->total_vm + grow) << MMUPAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur) { spin_unlock(&vma->vm_mm->page_table_lock); vm_unacct_memory(grow); @@ -967,7 +971,7 @@ find_extend_vma(struct mm_struct *mm, un { struct vm_area_struct *vma, *prev; - addr &= PAGE_MASK; + addr &= MMUPAGE_MASK; vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; @@ -991,9 +995,9 @@ int expand_stack(struct vm_area_struct * * is required to hold the mmap_sem in read mode. We need to get * the spinlock only before relocating the vma range ourself. */ - address &= PAGE_MASK; + address &= MMUPAGE_MASK; spin_lock(&vma->vm_mm->page_table_lock); - grow = (vma->vm_start - address) >> PAGE_SHIFT; + grow = (vma->vm_start - address) >> MMUPAGE_SHIFT; /* Overcommit.. */ if (!vm_enough_memory(grow)) { @@ -1002,7 +1006,7 @@ int expand_stack(struct vm_area_struct * } if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur || - ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > + ((vma->vm_mm->total_vm + grow) << MMUPAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur) { spin_unlock(&vma->vm_mm->page_table_lock); vm_unacct_memory(grow); @@ -1023,7 +1027,7 @@ find_extend_vma(struct mm_struct * mm, u struct vm_area_struct * vma; unsigned long start; - addr &= PAGE_MASK; + addr &= MMUPAGE_MASK; vma = find_vma(mm,addr); if (!vma) return NULL; @@ -1060,7 +1064,7 @@ static void free_pgtables(struct mmu_gat unsigned long first = start & PGDIR_MASK; unsigned long last = end + PGDIR_SIZE - 1; unsigned long start_index, end_index; - struct mm_struct *mm = tlb->mm; + struct mm_struct *mm = tlb_mm(tlb); if (!prev) { prev = mm->mmap; @@ -1115,9 +1119,9 @@ static void unmap_vma(struct mm_struct * { size_t len = area->vm_end - area->vm_start; - area->vm_mm->total_vm -= len >> PAGE_SHIFT; + area->vm_mm->total_vm -= len >> MMUPAGE_SHIFT; if (area->vm_flags & VM_LOCKED) - area->vm_mm->locked_vm -= len >> PAGE_SHIFT; + area->vm_mm->locked_vm -= len >> MMUPAGE_SHIFT; /* * Is this a new hole at the lowest possible address? 
*/ @@ -1222,11 +1226,11 @@ int split_vma(struct mm_struct * mm, str if (new_below) { new->vm_end = addr; vma->vm_start = addr; - vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT); + vma->vm_pgoff += ((addr - new->vm_start) >> MMUPAGE_SHIFT); } else { vma->vm_end = addr; new->vm_start = addr; - new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); + new->vm_pgoff += ((addr - vma->vm_start) >> MMUPAGE_SHIFT); } if (new->vm_file) @@ -1249,10 +1253,10 @@ int do_munmap(struct mm_struct *mm, unsi unsigned long end; struct vm_area_struct *mpnt, *prev, *last; - if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) + if ((start & ~MMUPAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; - if ((len = PAGE_ALIGN(len)) == 0) + if ((len = MMUPAGE_ALIGN(len)) == 0) return -EINVAL; /* Find the first overlapping VMA */ @@ -1335,7 +1339,7 @@ unsigned long do_brk(unsigned long addr, unsigned long flags; struct rb_node ** rb_link, * rb_parent; - len = PAGE_ALIGN(len); + len = MMUPAGE_ALIGN(len); if (!len) return addr; @@ -1343,7 +1347,7 @@ unsigned long do_brk(unsigned long addr, * mlock MCL_FUTURE? */ if (mm->def_flags & VM_LOCKED) { - unsigned long locked = mm->locked_vm << PAGE_SHIFT; + unsigned long locked = mm->locked_vm << MMUPAGE_SHIFT; locked += len; if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) return -EAGAIN; @@ -1361,14 +1365,14 @@ unsigned long do_brk(unsigned long addr, } /* Check against address space limits *after* clearing old maps... */ - if ((mm->total_vm << PAGE_SHIFT) + len + if ((mm->total_vm << MMUPAGE_SHIFT) + len > current->rlim[RLIMIT_AS].rlim_cur) return -ENOMEM; if (mm->map_count > MAX_MAP_COUNT) return -ENOMEM; - if (!vm_enough_memory(len >> PAGE_SHIFT)) + if (!vm_enough_memory(len >> MMUPAGE_SHIFT)) return -ENOMEM; flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; @@ -1383,7 +1387,7 @@ unsigned long do_brk(unsigned long addr, */ vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); if (!vma) { - vm_unacct_memory(len >> PAGE_SHIFT); + vm_unacct_memory(len >> MMUPAGE_SHIFT); return -ENOMEM; } @@ -1401,9 +1405,9 @@ unsigned long do_brk(unsigned long addr, vma_link(mm, vma, prev, rb_link, rb_parent); out: - mm->total_vm += len >> PAGE_SHIFT; + mm->total_vm += len >> MMUPAGE_SHIFT; if (flags & VM_LOCKED) { - mm->locked_vm += len >> PAGE_SHIFT; + mm->locked_vm += len >> MMUPAGE_SHIFT; make_pages_present(addr, addr + len); } return addr; diff -prauN linux-2.5.70-bk10/mm/mprotect.c pgcl-2.5.70-bk10-1/mm/mprotect.c --- linux-2.5.70-bk10/mm/mprotect.c 2003-05-26 18:00:38.000000000 -0700 +++ pgcl-2.5.70-bk10-1/mm/mprotect.c 2003-06-05 09:44:34.000000000 -0700 @@ -53,7 +53,7 @@ change_pte_range(pmd_t *pmd, unsigned lo entry = ptep_get_and_clear(pte); set_pte(pte, pte_modify(entry, newprot)); } - address += PAGE_SIZE; + address += MMUPAGE_SIZE; pte++; } while (address && (address < end)); pte_unmap(pte - 1); @@ -174,9 +174,11 @@ mprotect_fixup(struct vm_area_struct *vm */ if (newflags & VM_WRITE) { if (!(vma->vm_flags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { - charged = (end - start) >> PAGE_SHIFT; - if (!vm_enough_memory(charged)) + charged = (end - start) >> MMUPAGE_SHIFT; + if (!vm_enough_memory(charged)) { + printk("mprotect_fixup(): OOM\n"); return -ENOMEM; + } newflags |= VM_ACCOUNT; } } @@ -228,9 +230,9 @@ sys_mprotect(unsigned long start, size_t struct vm_area_struct * vma, * next, * prev; int error = -EINVAL; - if (start & ~PAGE_MASK) + if (start & ~MMUPAGE_MASK) return -EINVAL; - len = PAGE_ALIGN(len); + len = 
MMUPAGE_ALIGN(len); end = start + len; if (end < start) return -EINVAL; diff -prauN linux-2.5.70-bk10/mm/mremap.c pgcl-2.5.70-bk10-1/mm/mremap.c --- linux-2.5.70-bk10/mm/mremap.c 2003-05-26 18:00:40.000000000 -0700 +++ pgcl-2.5.70-bk10-1/mm/mremap.c 2003-06-05 09:44:34.000000000 -0700 @@ -162,7 +162,7 @@ static int move_page_tables(struct vm_ar * only a few pages.. This also makes error recovery easier. */ while (offset) { - offset -= PAGE_SIZE; + offset -= MMUPAGE_SIZE; if (move_one_page(vma, old_addr + offset, new_addr + offset)) goto oops_we_failed; } @@ -177,7 +177,7 @@ static int move_page_tables(struct vm_ar */ oops_we_failed: flush_cache_range(vma, new_addr, new_addr + len); - while ((offset += PAGE_SIZE) < len) + while ((offset += MMUPAGE_SIZE) < len) move_one_page(vma, new_addr + offset, old_addr + offset); zap_page_range(vma, new_addr, len); return -1; @@ -253,7 +253,7 @@ static unsigned long move_vma(struct vm_ INIT_LIST_HEAD(&new_vma->shared); new_vma->vm_start = new_addr; new_vma->vm_end = new_addr+new_len; - new_vma->vm_pgoff += (addr-vma->vm_start) >> PAGE_SHIFT; + new_vma->vm_pgoff += (addr - vma->vm_start) >> MMUPAGE_SHIFT; if (new_vma->vm_file) get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) @@ -287,9 +287,9 @@ static unsigned long move_vma(struct vm_ vma->vm_next->vm_flags |= VM_ACCOUNT; } - current->mm->total_vm += new_len >> PAGE_SHIFT; + current->mm->total_vm += new_len >> MMUPAGE_SHIFT; if (must_fault_in) { - current->mm->locked_vm += new_len >> PAGE_SHIFT; + current->mm->locked_vm += new_len >> MMUPAGE_SHIFT; make_pages_present(fault_in_start, fault_in_end); } return new_addr; @@ -318,15 +318,15 @@ unsigned long do_mremap(unsigned long ad if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) goto out; - if (addr & ~PAGE_MASK) + if (addr & ~MMUPAGE_MASK) goto out; - old_len = PAGE_ALIGN(old_len); - new_len = PAGE_ALIGN(new_len); + old_len = MMUPAGE_ALIGN(old_len); + new_len = MMUPAGE_ALIGN(new_len); /* new_addr is only valid if MREMAP_FIXED is specified */ if (flags & MREMAP_FIXED) { - if (new_addr & ~PAGE_MASK) + if (new_addr & ~MMUPAGE_MASK) goto out; if (!(flags & MREMAP_MAYMOVE)) goto out; @@ -378,19 +378,19 @@ unsigned long do_mremap(unsigned long ad goto out; } if (vma->vm_flags & VM_LOCKED) { - unsigned long locked = current->mm->locked_vm << PAGE_SHIFT; + unsigned long locked = current->mm->locked_vm << MMUPAGE_SHIFT; locked += new_len - old_len; ret = -EAGAIN; if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) goto out; } ret = -ENOMEM; - if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len) + if ((current->mm->total_vm << MMUPAGE_SHIFT) + (new_len - old_len) > current->rlim[RLIMIT_AS].rlim_cur) goto out; if (vma->vm_flags & VM_ACCOUNT) { - charged = (new_len - old_len) >> PAGE_SHIFT; + charged = (new_len - old_len) >> MMUPAGE_SHIFT; if (!vm_enough_memory(charged)) goto out_nc; } @@ -406,7 +406,7 @@ unsigned long do_mremap(unsigned long ad max_addr = vma->vm_next->vm_start; /* can we just expand the current mapping? 
*/ if (max_addr - addr >= new_len) { - int pages = (new_len - old_len) >> PAGE_SHIFT; + int pages = (new_len - old_len) >> MMUPAGE_SHIFT; spin_lock(&vma->vm_mm->page_table_lock); vma->vm_end = addr + new_len; spin_unlock(&vma->vm_mm->page_table_lock); @@ -435,13 +435,13 @@ unsigned long do_mremap(unsigned long ad new_addr = get_unmapped_area(vma->vm_file, 0, new_len, vma->vm_pgoff, map_flags); ret = new_addr; - if (new_addr & ~PAGE_MASK) + if (new_addr & ~MMUPAGE_MASK) goto out; } ret = move_vma(vma, addr, old_len, new_len, new_addr); } out: - if (ret & ~PAGE_MASK) + if (ret & ~MMUPAGE_MASK) vm_unacct_memory(charged); out_nc: return ret; diff -prauN linux-2.5.70-bk10/mm/msync.c pgcl-2.5.70-bk10-1/mm/msync.c --- linux-2.5.70-bk10/mm/msync.c 2003-05-26 18:00:38.000000000 -0700 +++ pgcl-2.5.70-bk10-1/mm/msync.c 2003-06-05 09:44:34.000000000 -0700 @@ -59,7 +59,7 @@ static int filemap_sync_pte_range(pmd_t error = 0; do { error |= filemap_sync_pte(pte, vma, address, flags); - address += PAGE_SIZE; + address += MMUPAGE_SIZE; pte++; } while (address && (address < end)); @@ -174,12 +174,12 @@ asmlinkage long sys_msync(unsigned long down_read(&current->mm->mmap_sem); if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) goto out; - if (start & ~PAGE_MASK) + if (start & ~MMUPAGE_MASK) goto out; if ((flags & MS_ASYNC) && (flags & MS_SYNC)) goto out; error = -ENOMEM; - len = (len + ~PAGE_MASK) & PAGE_MASK; + len = (len + ~MMUPAGE_MASK) & MMUPAGE_MASK; end = start + len; if (end < start) goto out; diff -prauN linux-2.5.70-bk10/mm/page-writeback.c pgcl-2.5.70-bk10-1/mm/page-writeback.c --- linux-2.5.70-bk10/mm/page-writeback.c 2003-06-05 05:44:01.000000000 -0700 +++ pgcl-2.5.70-bk10-1/mm/page-writeback.c 2003-06-05 09:44:34.000000000 -0700 @@ -379,8 +379,8 @@ static void set_ratelimit(void) ratelimit_pages = total_pages / (num_online_cpus() * 32); if (ratelimit_pages < 16) ratelimit_pages = 16; - if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) - ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; + if (ratelimit_pages * PAGE_CACHE_SIZE > PAGE_SIZE * 1024) + ratelimit_pages = (PAGE_SIZE * 1024) / PAGE_CACHE_SIZE; } static int diff -prauN linux-2.5.70-bk10/mm/page_alloc.c pgcl-2.5.70-bk10-1/mm/page_alloc.c --- linux-2.5.70-bk10/mm/page_alloc.c 2003-05-26 18:00:22.000000000 -0700 +++ pgcl-2.5.70-bk10-1/mm/page_alloc.c 2003-06-05 09:48:26.000000000 -0700 @@ -57,7 +57,7 @@ static int zone_balance_max[MAX_NR_ZONES */ static int bad_range(struct zone *zone, struct page *page) { - if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) + if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages*PAGE_MMUCOUNT) return 1; if (page_to_pfn(page) < zone->zone_start_pfn) return 1; @@ -233,7 +233,7 @@ static inline void free_pages_check(cons * And clear the zone's pages_scanned counter, to hold off the "all pages are * pinned" detection logic. */ -static int +int free_pages_bulk(struct zone *zone, int count, struct list_head *list, unsigned int order) { @@ -1175,9 +1175,9 @@ void __init memmap_init_zone(struct page #ifdef WANT_PAGE_VIRTUAL /* The shift won't overflow because ZONE_NORMAL is below 4G.
*/ if (zone != ZONE_HIGHMEM) - set_page_address(page, __va(start_pfn << PAGE_SHIFT)); + set_page_address(page, __va(start_pfn << MMUPAGE_SHIFT)); #endif - start_pfn++; + start_pfn += PAGE_MMUCOUNT; } } @@ -1196,7 +1196,7 @@ static void __init free_area_init_core(s unsigned long *zones_size, unsigned long *zholes_size) { unsigned long i, j; - const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); + const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-PAGE_MMUSHIFT-1); int cpu, nid = pgdat->node_id; struct page *lmem_map = pgdat->node_mem_map; unsigned long zone_start_pfn = pgdat->node_start_pfn; @@ -1255,7 +1255,7 @@ static void __init free_area_init_core(s INIT_LIST_HEAD(&pcp->list); } printk(" %s zone: %lu pages, LIFO batch:%lu\n", - zone_names[j], realsize, batch); + zone_names[j], realsize*PAGE_MMUCOUNT, batch); INIT_LIST_HEAD(&zone->active_list); INIT_LIST_HEAD(&zone->inactive_list); atomic_set(&zone->refill_counter, 0); @@ -1297,7 +1297,7 @@ static void __init free_area_init_core(s memmap_init(lmem_map, size, nid, j, zone_start_pfn); - zone_start_pfn += size; + zone_start_pfn += PAGE_MMUCOUNT*size; lmem_map += size; for (i = 0; ; i++) { @@ -1368,7 +1368,7 @@ struct pglist_data contig_page_data = { void __init free_area_init(unsigned long *zones_size) { free_area_init_node(0, &contig_page_data, NULL, zones_size, - __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); + __pa(PAGE_OFFSET) >> MMUPAGE_SHIFT, NULL); mem_map = contig_page_data.node_mem_map; } #endif @@ -1496,6 +1496,14 @@ static void *vmstat_start(struct seq_fil if (!ps) return ERR_PTR(-ENOMEM); get_full_page_state(ps); + if (PAGE_MMUCOUNT > 1) { + ps->nr_dirty *= PAGE_MMUCOUNT; + ps->nr_writeback *= PAGE_MMUCOUNT; + ps->nr_unstable *= PAGE_MMUCOUNT; + ps->nr_page_table_pages *= PAGE_MMUCOUNT; + ps->nr_mapped *= PAGE_MMUCOUNT; + ps->nr_slab *= PAGE_MMUCOUNT; + } ps->pgpgin /= 2; /* sectors -> kbytes */ ps->pgpgout /= 2; return (unsigned long *)ps + *pos; diff -prauN linux-2.5.70-bk10/mm/page_io.c pgcl-2.5.70-bk10-1/mm/page_io.c --- linux-2.5.70-bk10/mm/page_io.c 2003-05-26 18:00:23.000000000 -0700 +++ pgcl-2.5.70-bk10-1/mm/page_io.c 2003-06-05 09:44:34.000000000 -0700 @@ -32,7 +32,7 @@ get_swap_bio(int gfp_flags, struct page swp_entry_t entry; BUG_ON(!PageSwapCache(page)); - entry.val = page->index; + entry.val = page->index*PAGE_MMUCOUNT; sis = get_swap_info_struct(swp_type(entry)); bio->bi_sector = map_swap_page(sis, swp_offset(entry)) * @@ -103,7 +103,7 @@ int swap_writepage(struct page *page, st ret = -ENOMEM; goto out; } - inc_page_state(pswpout); + mod_page_state(pswpout, PAGE_MMUCOUNT); SetPageWriteback(page); unlock_page(page); submit_bio(WRITE, bio); @@ -124,7 +124,7 @@ int swap_readpage(struct file *file, str ret = -ENOMEM; goto out; } - inc_page_state(pswpin); + mod_page_state(pswpin, PAGE_MMUCOUNT); submit_bio(READ, bio); out: return ret; @@ -152,7 +152,7 @@ int rw_swap_page_sync(int rw, swp_entry_ BUG_ON(page->mapping); page->mapping = &swapper_space; - page->index = entry.val; + page->index = entry.val/PAGE_MMUCOUNT; if (rw == READ) { ret = swap_readpage(NULL, page); diff -prauN linux-2.5.70-bk10/mm/rmap.c pgcl-2.5.70-bk10-1/mm/rmap.c --- linux-2.5.70-bk10/mm/rmap.c 2003-05-26 18:00:58.000000000 -0700 +++ pgcl-2.5.70-bk10-1/mm/rmap.c 2003-06-05 09:44:34.000000000 -0700 @@ -337,7 +337,8 @@ static int try_to_unmap_one(struct page * Store the swap location in the pte. * See handle_pte_fault() ... 
*/ - swp_entry_t entry = { .val = page->index }; + swp_entry_t entry = { .val = page->index*PAGE_MMUCOUNT + + (pte_pfn(pte) % PAGE_MMUCOUNT) }; swap_duplicate(entry); set_pte(ptep, swp_entry_to_pte(entry)); BUG_ON(pte_file(*ptep)); @@ -347,11 +348,11 @@ static int try_to_unmap_one(struct page * If a nonlinear mapping then store the file page offset * in the pte. */ - pgidx = (address - vma->vm_start) >> PAGE_SHIFT; + pgidx = (address - vma->vm_start) >> MMUPAGE_SHIFT; pgidx += vma->vm_pgoff; - pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; + pgidx >>= PAGE_CACHE_SHIFT - MMUPAGE_SHIFT; if (page->index != pgidx) { - set_pte(ptep, pgoff_to_pte(page->index)); + set_pte(ptep, pgoff_to_pte(page->index*PAGE_MMUCOUNT)); BUG_ON(!pte_file(*ptep)); } } diff -prauN linux-2.5.70-bk10/mm/shmem.c pgcl-2.5.70-bk10-1/mm/shmem.c --- linux-2.5.70-bk10/mm/shmem.c 2003-05-26 18:00:39.000000000 -0700 +++ pgcl-2.5.70-bk10-1/mm/shmem.c 2003-06-05 09:44:34.000000000 -0700 @@ -47,7 +47,7 @@ #define SHMEM_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) #define SHMEM_MAX_BYTES ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT) -#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) +#define VM_ACCT(size) (MMUPAGE_ALIGN(size)/MMUPAGE_SIZE) /* Pretend that each entry is of this size in directory's i_size */ #define BOGO_DIRENT_SIZE 20 @@ -71,14 +71,14 @@ static inline struct page *shmem_dir_all /* * The above definition of ENTRIES_PER_PAGE, and the use of * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: - * might be reconsidered if it ever diverges from PAGE_SIZE. + * might be reconsidered if it ever diverges from MMUPAGE_SIZE. */ - return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT); + return alloc_pages(gfp_mask, PAGE_CACHE_MMUSHIFT); } static inline void shmem_dir_free(struct page *page) { - __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT); + __free_pages(page, PAGE_CACHE_MMUSHIFT); } static struct page **shmem_dir_map(struct page *page) @@ -297,7 +297,7 @@ static swp_entry_t *shmem_swp_alloc(stru static const swp_entry_t unswapped = {0}; if (sgp != SGP_WRITE && - ((loff_t) index << PAGE_CACHE_SHIFT) >= inode->i_size) + (loff_t)index*PAGE_CACHE_SIZE >= inode->i_size) return ERR_PTR(-EINVAL); while (!(entry = shmem_swp_entry(info, index, &page))) { @@ -330,7 +330,7 @@ static swp_entry_t *shmem_swp_alloc(stru return ERR_PTR(-ENOMEM); } if (sgp != SGP_WRITE && - ((loff_t) index << PAGE_CACHE_SHIFT) >= inode->i_size) { + (loff_t)index*PAGE_CACHE_SIZE >= inode->i_size) { entry = ERR_PTR(-EINVAL); break; } @@ -383,7 +383,7 @@ static void shmem_truncate(struct inode int freed; inode->i_ctime = inode->i_mtime = CURRENT_TIME; - idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + idx = (inode->i_size + PAGE_CACHE_SIZE - 1)/PAGE_CACHE_SIZE; if (idx >= info->next_index) return; @@ -509,7 +509,7 @@ static int shmem_notify_change(struct de long change = 0; int error; - if ((attr->ia_valid & ATTR_SIZE) && (attr->ia_size <= SHMEM_MAX_BYTES)) { + if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size <= SHMEM_MAX_BYTES) { /* * Account swap file usage based on new file size, * but just let vmtruncate fail on out-of-range sizes. @@ -527,9 +527,9 @@ static int shmem_notify_change(struct de * truncate_partial_page cannnot miss it were * it assigned to swap. 
*/ - if (attr->ia_size & (PAGE_CACHE_SIZE-1)) { + if (attr->ia_size % PAGE_CACHE_SIZE) { (void) shmem_getpage(inode, - attr->ia_size>>PAGE_CACHE_SHIFT, + attr->ia_size/PAGE_CACHE_SIZE, &page, SGP_READ); } } @@ -940,16 +940,14 @@ struct page *shmem_nopage(struct vm_area { struct inode *inode = vma->vm_file->f_dentry->d_inode; struct page *page = NULL; - unsigned long idx; + unsigned long pgoff; int error; - idx = (address - vma->vm_start) >> PAGE_SHIFT; - idx += vma->vm_pgoff; - idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; + pgoff = (address - vma->vm_start)/MMUPAGE_SIZE + vma->vm_pgoff; - error = shmem_getpage(inode, idx, &page, SGP_CACHE); + error = shmem_getpage(inode, pgoff/PAGE_CACHE_MMUCOUNT, &page, SGP_CACHE); if (error) - return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS; + return error == -ENOMEM ? NOPAGE_OOM : NOPAGE_SIGBUS; mark_page_accessed(page); return page; @@ -964,8 +962,8 @@ static int shmem_populate(struct vm_area enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE; unsigned long size; - size = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (pgoff >= size || pgoff + (len >> PAGE_SHIFT) > size) + size = (inode->i_size + MMUPAGE_SIZE - 1)/MMUPAGE_SIZE; + if (pgoff >= size || pgoff + len/MMUPAGE_SIZE > size) return -EINVAL; while ((long) len > 0) { @@ -974,19 +972,19 @@ static int shmem_populate(struct vm_area /* * Will need changing if PAGE_CACHE_SIZE != PAGE_SIZE */ - err = shmem_getpage(inode, pgoff, &page, sgp); + err = shmem_getpage(inode, pgoff/PAGE_CACHE_MMUCOUNT, &page, sgp); if (err) return err; if (page) { mark_page_accessed(page); - err = install_page(mm, vma, addr, page, prot); + err = install_page(mm, vma, addr, page, prot, pgoff % PAGE_CACHE_MMUCOUNT); if (err) { page_cache_release(page); return err; } } - len -= PAGE_SIZE; - addr += PAGE_SIZE; + len -= MMUPAGE_SIZE; + addr += MMUPAGE_SIZE; pgoff++; } return 0; @@ -1155,8 +1153,8 @@ shmem_file_write(struct file *file, cons char *kaddr; int left; - offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ - index = pos >> PAGE_CACHE_SHIFT; + offset = pos % PAGE_CACHE_SIZE; /* Within page */ + index = pos/PAGE_CACHE_SIZE; bytes = PAGE_CACHE_SIZE - offset; if (bytes > count) bytes = count; @@ -1231,18 +1229,18 @@ static void do_shmem_file_read(struct fi struct address_space *mapping = inode->i_mapping; unsigned long index, offset; - index = *ppos >> PAGE_CACHE_SHIFT; - offset = *ppos & ~PAGE_CACHE_MASK; + index = *ppos/PAGE_CACHE_SIZE; + offset = *ppos % PAGE_CACHE_SIZE; for (;;) { struct page *page = NULL; unsigned long end_index, nr, ret; - end_index = inode->i_size >> PAGE_CACHE_SHIFT; + end_index = inode->i_size/PAGE_CACHE_SIZE; if (index > end_index) break; if (index == end_index) { - nr = inode->i_size & ~PAGE_CACHE_MASK; + nr = inode->i_size % PAGE_CACHE_SIZE; if (nr <= offset) break; } @@ -1259,9 +1257,9 @@ static void do_shmem_file_read(struct fi * are called without i_sem protection against truncate */ nr = PAGE_CACHE_SIZE; - end_index = inode->i_size >> PAGE_CACHE_SHIFT; + end_index = inode->i_size/PAGE_CACHE_SIZE; if (index == end_index) { - nr = inode->i_size & ~PAGE_CACHE_MASK; + nr = inode->i_size % PAGE_CACHE_SIZE; if (nr <= offset) { page_cache_release(page); break; diff -prauN linux-2.5.70-bk10/mm/slab.c pgcl-2.5.70-bk10-1/mm/slab.c --- linux-2.5.70-bk10/mm/slab.c 2003-06-05 05:44:01.000000000 -0700 +++ pgcl-2.5.70-bk10-1/mm/slab.c 2003-06-05 09:44:34.000000000 -0700 @@ -592,7 +592,7 @@ void __init kmem_cache_init(void) * Fragmentation resistance on low memory - only use bigger * page orders on 
machines with more than 32MB of memory. */ - if (num_physpages > (32 << 20) >> PAGE_SHIFT) + if (num_physpages > (32 << 20) >> MMUPAGE_SHIFT) slab_break_gfp_order = BREAK_GFP_ORDER_HI; @@ -1014,7 +1014,7 @@ kmem_cache_create (const char *name, siz align = L1_CACHE_BYTES; /* Determine if the slab management is 'on' or 'off' slab. */ - if (size >= (PAGE_SIZE>>3)) + if (size >= PAGE_SIZE/8 || ((flags & SLAB_MUST_HWCACHE_ALIGN) && size >= MMUPAGE_SIZE)) /* * Size is large, assume best to place the slab management obj * off-slab (should allow better packing of objs). diff -prauN linux-2.5.70-bk10/mm/swap.c pgcl-2.5.70-bk10-1/mm/swap.c --- linux-2.5.70-bk10/mm/swap.c 2003-05-26 18:00:38.000000000 -0700 +++ pgcl-2.5.70-bk10-1/mm/swap.c 2003-06-05 09:44:34.000000000 -0700 @@ -379,7 +379,7 @@ void vm_acct_memory(long pages) */ void __init swap_setup(void) { - unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); + unsigned long megs = num_physpages >> (20 - MMUPAGE_SHIFT); /* Use a smaller cluster for small-memory machines */ if (megs < 16) diff -prauN linux-2.5.70-bk10/mm/swap_state.c pgcl-2.5.70-bk10-1/mm/swap_state.c --- linux-2.5.70-bk10/mm/swap_state.c 2003-05-26 18:00:39.000000000 -0700 +++ pgcl-2.5.70-bk10-1/mm/swap_state.c 2003-06-05 09:44:34.000000000 -0700 @@ -77,7 +77,7 @@ static int add_to_swap_cache(struct page INC_CACHE_INFO(noent_race); return -ENOENT; } - error = add_to_page_cache(page, &swapper_space, entry.val, GFP_KERNEL); + error = add_to_page_cache(page, &swapper_space, entry.val/PAGE_MMUCOUNT, GFP_KERNEL); /* * Anon pages are already on the LRU, we don't run lru_cache_add here. */ @@ -149,7 +149,7 @@ int add_to_swap(struct page * page) * Add it to the swap cache and mark it dirty */ err = add_to_page_cache(page, &swapper_space, - entry.val, GFP_ATOMIC); + entry.val/PAGE_MMUCOUNT, GFP_ATOMIC); if (pf_flags & PF_MEMALLOC) current->flags |= PF_MEMALLOC; @@ -188,7 +188,7 @@ void delete_from_swap_cache(struct page BUG_ON(PageWriteback(page)); BUG_ON(PagePrivate(page)); - entry.val = page->index; + entry.val = page->index*PAGE_MMUCOUNT; spin_lock(&swapper_space.page_lock); __delete_from_swap_cache(page); @@ -206,10 +206,10 @@ int move_to_swap_cache(struct page *page spin_lock(&swapper_space.page_lock); spin_lock(&mapping->page_lock); - err = radix_tree_insert(&swapper_space.page_tree, entry.val, page); + err = radix_tree_insert(&swapper_space.page_tree, entry.val/PAGE_MMUCOUNT, page); if (!err) { __remove_from_page_cache(page); - ___add_to_page_cache(page, &swapper_space, entry.val); + ___add_to_page_cache(page, &swapper_space, entry.val/PAGE_MMUCOUNT); } spin_unlock(&mapping->page_lock); @@ -237,7 +237,7 @@ int move_from_swap_cache(struct page *pa BUG_ON(PageWriteback(page)); BUG_ON(PagePrivate(page)); - entry.val = page->index; + entry.val = page->index*PAGE_MMUCOUNT; spin_lock(&swapper_space.page_lock); spin_lock(&mapping->page_lock); @@ -320,7 +320,7 @@ struct page * lookup_swap_cache(swp_entr { struct page *found; - found = find_get_page(&swapper_space, entry.val); + found = find_get_page(&swapper_space, entry.val/PAGE_MMUCOUNT); /* * Unsafe to assert PageSwapCache and mapping on page found: * if SMP nothing prevents swapoff from deleting this page from @@ -351,7 +351,7 @@ struct page * read_swap_cache_async(swp_ * that would confuse statistics: use find_get_page() * directly. 
*/ - found_page = find_get_page(&swapper_space, entry.val); + found_page = find_get_page(&swapper_space, entry.val/PAGE_MMUCOUNT); if (found_page) break; diff -prauN linux-2.5.70-bk10/mm/swapfile.c pgcl-2.5.70-bk10-1/mm/swapfile.c --- linux-2.5.70-bk10/mm/swapfile.c 2003-05-26 18:00:25.000000000 -0700 +++ pgcl-2.5.70-bk10-1/mm/swapfile.c 2003-06-05 09:44:34.000000000 -0700 @@ -41,6 +41,10 @@ struct swap_info_struct swap_info[MAX_SW #define SWAPFILE_CLUSTER 256 +/* + * returns offset into ->swap_map[] array, each entry of which + * tracks PAGE_SIZE (not MMUPAGE_SIZE) + */ static inline int scan_swap_map(struct swap_info_struct *si) { unsigned long offset; @@ -127,7 +131,7 @@ swp_entry_t get_swap_page(void) offset = scan_swap_map(p); swap_device_unlock(p); if (offset) { - entry = swp_entry(type,offset); + entry = swp_entry(type, offset*PAGE_MMUCOUNT); type = swap_info[type].next; if (type < 0 || p->prio != swap_info[type].prio) { @@ -161,15 +165,23 @@ static struct swap_info_struct * swap_in if (!entry.val) goto out; type = swp_type(entry); - if (type >= nr_swapfiles) + if (type >= nr_swapfiles) { + printk(KERN_ERR "bad type %lu beyond nr_swapfiles %u " + "in swap_info_get()\n", type, nr_swapfiles); goto bad_nofile; + } p = & swap_info[type]; if (!(p->flags & SWP_USED)) goto bad_device; offset = swp_offset(entry); - if (offset >= p->max) + + /* + * offset returned by swp_offset() is in MMUPAGE_SIZE units, + * p->max is in PAGE_SIZE units + */ + if (offset >= p->max*PAGE_MMUCOUNT) goto bad_offset; - if (!p->swap_map[offset]) + if (!p->swap_map[offset/PAGE_MMUCOUNT]) goto bad_free; swap_list_lock(); if (p->prio > swap_info[swap_list.next].prio) @@ -179,15 +191,70 @@ static struct swap_info_struct * swap_in bad_free: printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); + WARN_ON(1); goto out; bad_offset: printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); + WARN_ON(1); goto out; bad_device: printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); + WARN_ON(1); goto out; bad_nofile: printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); + WARN_ON(1); + +/* dump pagetables */ +#if 1 + { + struct mm_struct *mm = current->mm; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + unsigned long vaddr; + + if (!mm) { + /* we're dead here anyway, but... 
*/ + printk(KERN_ERR "bug in free_swap_and_cache() " + "with no mm!\n"); + goto out_noscan; + } + + for (vaddr = 0; vaddr < TASK_SIZE; vaddr += PGDIR_SIZE) { + pgd = pgd_offset(mm, vaddr); + printk(KERN_DEBUG "pgd for 0x%lx = 0x%Lx\n", + vaddr, (u64)pgd_val(*pgd)); + } + + if (PTRS_PER_PMD > 1) { + for (vaddr = 0; vaddr < TASK_SIZE; vaddr += PMD_SIZE) { + pgd = pgd_offset(mm, vaddr); + if (pgd_none(*pgd) || !pgd_present(*pgd)) + continue; + pmd = pmd_offset(pgd, vaddr); + printk(KERN_DEBUG "pmd for 0x%lx = 0x%Lx\n", + vaddr, (u64)pmd_val(*pmd)); + } + } + + for (vaddr = 0; vaddr < TASK_SIZE; vaddr += MMUPAGE_SIZE) { + pgd = pgd_offset(mm, vaddr); + if (pgd_none(*pgd) || !pgd_present(*pgd)) + continue; + pmd = pmd_offset(pgd, vaddr); + if (pmd_none(*pmd) || !pmd_present(*pmd)) + continue; + pte = pte_offset_map_nested(pmd, vaddr); + if (!pte_none(*pte) && pte_present(*pte)) + printk(KERN_DEBUG "pte for 0x%lx = 0x%Lx\n", + vaddr, (u64)pte_val(*pte)); + pte_unmap_nested(pte); + } +out_noscan: + ; + } +#endif out: return NULL; } @@ -198,6 +265,9 @@ static void swap_info_put(struct swap_in swap_list_unlock(); } +/* + * offset is entry.val/PAGE_MMUCOUNT + */ static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) { int count = p->swap_map[offset]; @@ -226,11 +296,12 @@ void swap_free(swp_entry_t entry) struct swap_info_struct * p; p = swap_info_get(entry); - if (p) { - swap_entry_free(p, swp_offset(entry)); + if (!p) + return; + + swap_entry_free(p, swp_offset(entry)/PAGE_MMUCOUNT); swap_info_put(p); } -} /* * Check if we're the only user of a swap page, @@ -242,11 +313,11 @@ static int exclusive_swap_page(struct pa struct swap_info_struct * p; swp_entry_t entry; - entry.val = page->index; + entry.val = page->index*PAGE_MMUCOUNT; p = swap_info_get(entry); if (p) { /* Is the only swap cache user the cache itself? */ - if (p->swap_map[swp_offset(entry)] == 1) { + if (p->swap_map[swp_offset(entry)/PAGE_MMUCOUNT] == 1) { /* Recheck the page count with the pagecache lock held.. */ spin_lock(&swapper_space.page_lock); if (page_count(page) - !!PagePrivate(page) == 2) @@ -310,14 +381,14 @@ int remove_exclusive_swap_page(struct pa if (page_count(page) != 2) /* 2: us + cache */ return 0; - entry.val = page->index; + entry.val = page->index*PAGE_MMUCOUNT; p = swap_info_get(entry); if (!p) return 0; /* Is the only swap cache user the cache itself? */ retval = 0; - if (p->swap_map[swp_offset(entry)] == 1) { + if (p->swap_map[swp_offset(entry)/PAGE_MMUCOUNT] == 1) { /* Recheck the page count with the pagecache lock held.. 
*/ spin_lock(&swapper_space.page_lock); if ((page_count(page) == 2) && !PageWriteback(page)) { @@ -348,8 +419,8 @@ void free_swap_and_cache(swp_entry_t ent p = swap_info_get(entry); if (p) { - if (swap_entry_free(p, swp_offset(entry)) == 1) - page = find_trylock_page(&swapper_space, entry.val); + if (swap_entry_free(p, swp_offset(entry)/PAGE_MMUCOUNT) == 1) + page = find_trylock_page(&swapper_space, entry.val/PAGE_MMUCOUNT); swap_info_put(p); } if (page) { @@ -382,9 +453,12 @@ static void unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir, swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) { + unsigned long pfn; + /* vma_suboffset() would be meaningless; these are anonymous */ + pfn = page_to_pfn(page) + (entry.val % PAGE_MMUCOUNT); vma->vm_mm->rss++; get_page(page); - set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); + set_pte(dir, pte_mkold(pfn_pte(pfn, vma->vm_page_prot))); *pte_chainp = page_add_rmap(page, dir, *pte_chainp); swap_free(entry); } @@ -422,7 +496,7 @@ static int unuse_pmd(struct vm_area_stru pte_unmap(pte); return 1; } - address += PAGE_SIZE; + address += MMUPAGE_SIZE; pte++; } while (address && (address < end)); pte_unmap(pte - 1); @@ -572,6 +646,9 @@ static int try_to_unuse(unsigned int typ * child immediately after parent. If we race with dup_mmap(), * we very much want to resolve parent before child, otherwise * we may miss some entries: using last mm would invert that. + * + * The whole of the preceding discussion is bogus now that + * physical scanning is in place. */ start_mm = &init_mm; atomic_inc(&init_mm.mm_users); @@ -601,7 +678,7 @@ static int try_to_unuse(unsigned int typ * page and read the swap into it. */ swap_map = &si->swap_map[i]; - entry = swp_entry(type, i); + entry = swp_entry(type, i*PAGE_MMUCOUNT); page = read_swap_cache_async(entry); if (!page) { /* @@ -710,6 +787,10 @@ static int try_to_unuse(unsigned int typ * we might be resetting SWAP_MAP_MAX too early here. * We know "Undead"s can happen, they're okay, so don't * report them; but do report if we reset SWAP_MAP_MAX. + * + * The whole of the preceding discussion is bogus given + * the new process capacities and there are probably + * resource leaks to fix up here. 
*/ if (*swap_map == SWAP_MAP_MAX) { swap_device_lock(si); @@ -786,6 +867,8 @@ sector_t map_swap_page(struct swap_info_ struct swap_extent *se = sis->curr_swap_extent; struct swap_extent *start_se = se; + offset /= PAGE_MMUCOUNT; + for ( ; ; ) { struct list_head *lh; @@ -992,7 +1075,7 @@ int page_queue_congested(struct page *pa bdi = page->mapping->backing_dev_info; if (PageSwapCache(page)) { - swp_entry_t entry = { .val = page->index }; + swp_entry_t entry = { .val = page->index*PAGE_MMUCOUNT }; struct swap_info_struct *sis; sis = get_swap_info_struct(swp_type(entry)); @@ -1356,20 +1439,20 @@ asmlinkage long sys_swapon(const char __ maxpages = swp_offset(swp_entry(0,~0UL)) - 1; if (maxpages > swap_header->info.last_page) maxpages = swap_header->info.last_page; - p->highest_bit = maxpages - 1; + p->highest_bit = maxpages/PAGE_MMUCOUNT - 1; error = -EINVAL; if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) goto bad_swap; /* OK, set up the swap map and apply the bad block list */ - if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { + if (!(p->swap_map = vmalloc(maxpages/PAGE_MMUCOUNT * sizeof(short)))) { error = -ENOMEM; goto bad_swap; } error = 0; - memset(p->swap_map, 0, maxpages * sizeof(short)); + memset(p->swap_map, 0, maxpages/PAGE_MMUCOUNT * sizeof(short)); for (i=0; i<swap_header->info.nr_badpages; i++) { int page = swap_header->info.badpages[i]; if (page <= 0 || page >= swap_header->info.last_page) @@ -1377,14 +1460,14 @@ asmlinkage long sys_swapon(const char __ else p->swap_map[page] = SWAP_MAP_BAD; } - nr_good_pages = swap_header->info.last_page - + nr_good_pages = (swap_header->info.last_page - swap_header->info.nr_badpages - - 1 /* header page */; + 1)/PAGE_MMUCOUNT /* header page */; if (error) goto bad_swap; } - if (swapfilesize && maxpages > swapfilesize) { + if (swapfilesize && maxpages/PAGE_MMUCOUNT > swapfilesize) { printk(KERN_WARNING "Swap area shorter than signature indicates\n"); error = -EINVAL; @@ -1396,7 +1479,7 @@ asmlinkage long sys_swapon(const char __ goto bad_swap; } p->swap_map[0] = SWAP_MAP_BAD; - p->max = maxpages; + p->max = maxpages/PAGE_MMUCOUNT; p->pages = nr_good_pages; if (setup_swap_extents(p)) @@ -1493,7 +1576,7 @@ int swap_duplicate(swp_entry_t entry) if (type >= nr_swapfiles) goto bad_file; p = type + swap_info; - offset = swp_offset(entry); + offset = swp_offset(entry)/PAGE_MMUCOUNT; swap_device_lock(p); if (offset < p->max && p->swap_map[offset]) { @@ -1513,6 +1596,7 @@ out: bad_file: printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); + BUG(); goto out; } @@ -1534,7 +1618,7 @@ int valid_swaphandles(swp_entry_t entry, if (!page_cluster) /* no readahead */ return 0; - toff = (swp_offset(entry) >> page_cluster) << page_cluster; + toff = (swp_offset(entry)/PAGE_MMUCOUNT) & ~((1UL << page_cluster)-1); if (!toff) /* first page is swap header */ toff++, i--; *offset = toff; diff -prauN linux-2.5.70-bk10/mm/vcache.c pgcl-2.5.70-bk10-1/mm/vcache.c --- linux-2.5.70-bk10/mm/vcache.c 2003-05-26 18:00:26.000000000 -0700 +++ pgcl-2.5.70-bk10-1/mm/vcache.c 2003-06-05 09:44:34.000000000 -0700 @@ -34,7 +34,7 @@ void __attach_vcache(vcache_t *vcache, { struct list_head *hash_head; - address &= PAGE_MASK; + address &= MMUPAGE_MASK; vcache->address = address; vcache->mm = mm; vcache->callback = callback; diff -prauN linux-2.5.70-bk10/mm/vmalloc.c pgcl-2.5.70-bk10-1/mm/vmalloc.c --- linux-2.5.70-bk10/mm/vmalloc.c 2003-05-26 18:00:41.000000000 -0700 +++ pgcl-2.5.70-bk10-1/mm/vmalloc.c 2003-06-05 09:44:34.000000000 -0700 @@ -44,15 +44,12 @@ static void
unmap_area_pte(pmd_t *pmd, u end = PMD_SIZE; do { - pte_t page; - page = ptep_get_and_clear(pte); - address += PAGE_SIZE; - pte++; - if (pte_none(page)) - continue; - if (pte_present(page)) - continue; + if (pte_present(*pte)) + pte_clear(pte); + else if (!pte_none(*pte)) printk(KERN_CRIT "Whee.. Swapped out page in kernel page table\n"); + pte++; + address += MMUPAGE_SIZE; } while (address < end); } @@ -83,56 +80,6 @@ static void unmap_area_pmd(pgd_t *dir, u } while (address < end); } -static int map_area_pte(pte_t *pte, unsigned long address, - unsigned long size, pgprot_t prot, - struct page ***pages) -{ - unsigned long end; - - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - - do { - struct page *page = **pages; - - WARN_ON(!pte_none(*pte)); - if (!page) - return -ENOMEM; - - set_pte(pte, mk_pte(page, prot)); - address += PAGE_SIZE; - pte++; - (*pages)++; - } while (address < end); - return 0; -} - -static int map_area_pmd(pmd_t *pmd, unsigned long address, - unsigned long size, pgprot_t prot, - struct page ***pages) -{ - unsigned long end; - - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - - do { - pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); - if (!pte) - return -ENOMEM; - if (map_area_pte(pte, address, end - address, prot, pages)) - return -ENOMEM; - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address < end); - - return 0; -} - void unmap_vm_area(struct vm_struct *area) { unsigned long address = VMALLOC_VMADDR(area->addr); @@ -149,30 +96,48 @@ void unmap_vm_area(struct vm_struct *are flush_tlb_kernel_range(VMALLOC_VMADDR(area->addr), end); } +#define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t)) +#define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t)) + int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) { unsigned long address = VMALLOC_VMADDR(area->addr); - unsigned long end = address + (area->size-PAGE_SIZE); - pgd_t *dir; + /* don't instantiate PTE's for the guard page */ + unsigned long end = address + area->size - MMUPAGE_SIZE; + unsigned long voffset = 0; + pgd_t *pgd; int err = 0; - dir = pgd_offset_k(address); + pgd = pgd_offset_k(address); spin_lock(&init_mm.page_table_lock); do { - pmd_t *pmd = pmd_alloc(&init_mm, dir, address); + pmd_t *pmd = pmd_alloc(&init_mm, pgd, address); if (!pmd) { err = -ENOMEM; - break; + goto out; } - if (map_area_pmd(pmd, address, end - address, prot, pages)) { + + do { + pte_t *pte = pte_alloc_kernel(&init_mm, pmd, address); + if (!pte) { err = -ENOMEM; - break; + goto out; } - - address = (address + PGDIR_SIZE) & PGDIR_MASK; - dir++; - } while (address && (address < end)); - + do { + unsigned long pfn; + pfn = page_to_pfn((*pages)[voffset/PAGE_SIZE]); + pfn += (voffset/MMUPAGE_SIZE) % PAGE_MMUCOUNT; + set_pte(pte, pfn_pte(pfn, prot)); + ++pte; + address += MMUPAGE_SIZE; + voffset += MMUPAGE_SIZE; + } while (((unsigned long)pte & PTE_TABLE_MASK) && address < end); + ++pmd; + } while (((unsigned long)pmd & PMD_TABLE_MASK) && address < end); + ++pgd; + /* presumably address could wrap to 0, but I doubt it */ + } while (address && address < end); +out: spin_unlock(&init_mm.page_table_lock); flush_cache_all(); return err; @@ -201,7 +166,7 @@ struct vm_struct *get_vm_area(unsigned l /* * We always allocate a guard page. 
*/ - size += PAGE_SIZE; + size += MMUPAGE_SIZE; if (unlikely(!size)) { kfree (area); return NULL; @@ -230,6 +195,9 @@ found: area->phys_addr = 0; write_unlock(&vmlist_lock); + printk("vmalloc, returning [0x%p, 0x%p)\n", + area->addr, ((char *)area->addr) + area->size); + return area; out: @@ -243,17 +211,20 @@ out: * * @addr: base address * - * Search for the kernel VM area starting at @addr, and remove it. + * Search for the kernel VM area containing @addr, and remove it. * This function returns the found VM area, but using it is NOT safe - * on SMP machines. + * on SMP machines; the final removal of an area must be serialized + * externally, and those who allocated the area own it. */ -struct vm_struct *remove_vm_area(void *addr) +struct vm_struct *remove_vm_area(void *__addr) { struct vm_struct **p, *tmp; + unsigned long addr = (unsigned long)__addr; write_lock(&vmlist_lock); - for (p = &vmlist ; (tmp = *p) ;p = &tmp->next) { - if (tmp->addr == addr) + for (p = &vmlist; (tmp = *p); p = &tmp->next) { + unsigned long tmp_addr = (unsigned long)tmp->addr; + if (addr >= tmp_addr && addr - tmp_addr < tmp->size) goto found; } write_unlock(&vmlist_lock); @@ -273,7 +244,7 @@ void __vunmap(void *addr, int deallocate if (!addr) return; - if ((PAGE_SIZE-1) & (unsigned long)addr) { + if ((MMUPAGE_SIZE-1) & (unsigned long)addr) { printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr); return; } @@ -289,8 +260,7 @@ void __vunmap(void *addr, int deallocate int i; for (i = 0; i < area->nr_pages; i++) { - if (unlikely(!area->pages[i])) - BUG(); + BUG_ON(unlikely(!area->pages[i])); __free_page(area->pages[i]); } @@ -349,10 +319,10 @@ void *vmap(struct page **pages, unsigned { struct vm_struct *area; - if (count > num_physpages) + if (PAGE_MMUCOUNT*count > num_physpages) return NULL; - area = get_vm_area((count << PAGE_SHIFT), flags); + area = get_vm_area(PAGE_SIZE*count, flags); if (!area) return NULL; if (map_vm_area(area, prot, &pages)) { @@ -380,16 +350,16 @@ void *__vmalloc(unsigned long size, int struct page **pages; unsigned int nr_pages, array_size, i; - size = PAGE_ALIGN(size); - if (!size || (size >> PAGE_SHIFT) > num_physpages) + size = MMUPAGE_ALIGN(size); + if (!size || (size >> MMUPAGE_SHIFT) > num_physpages) return NULL; area = get_vm_area(size, VM_ALLOC); if (!area) return NULL; - nr_pages = size >> PAGE_SHIFT; - array_size = (nr_pages * sizeof(struct page *)); + nr_pages = PAGE_ALIGN(size)/PAGE_SIZE; + array_size = nr_pages * sizeof(struct page *); area->nr_pages = nr_pages; area->pages = pages = kmalloc(array_size, (gfp_mask & ~__GFP_HIGHMEM)); @@ -460,7 +430,7 @@ long vread(char *buf, char *addr, unsign read_lock(&vmlist_lock); for (tmp = vmlist; tmp; tmp = tmp->next) { vaddr = (char *) tmp->addr; - if (addr >= vaddr + tmp->size - PAGE_SIZE) + if (addr >= vaddr + tmp->size - MMUPAGE_SIZE) continue; while (addr < vaddr) { if (count == 0) @@ -470,7 +440,7 @@ long vread(char *buf, char *addr, unsign addr++; count--; } - n = vaddr + tmp->size - PAGE_SIZE - addr; + n = vaddr + tmp->size - MMUPAGE_SIZE - addr; do { if (count == 0) goto finished; @@ -498,7 +468,7 @@ long vwrite(char *buf, char *addr, unsig read_lock(&vmlist_lock); for (tmp = vmlist; tmp; tmp = tmp->next) { vaddr = (char *) tmp->addr; - if (addr >= vaddr + tmp->size - PAGE_SIZE) + if (addr >= vaddr + tmp->size - MMUPAGE_SIZE) continue; while (addr < vaddr) { if (count == 0) @@ -507,7 +477,7 @@ long vwrite(char *buf, char *addr, unsig addr++; count--; } - n = vaddr + tmp->size - PAGE_SIZE - addr; + n = vaddr + tmp->size - 
MMUPAGE_SIZE - addr; do { if (count == 0) goto finished; diff -prauN linux-2.5.70-bk10/mm/vmscan.c pgcl-2.5.70-bk10-1/mm/vmscan.c --- linux-2.5.70-bk10/mm/vmscan.c 2003-05-26 18:00:24.000000000 -0700 +++ pgcl-2.5.70-bk10-1/mm/vmscan.c 2003-06-05 09:44:34.000000000 -0700 @@ -398,7 +398,7 @@ shrink_list(struct list_head *page_list, #ifdef CONFIG_SWAP if (PageSwapCache(page)) { - swp_entry_t swap = { .val = page->index }; + swp_entry_t swap = { .val = page->index*PAGE_MMUCOUNT }; __delete_from_swap_cache(page); spin_unlock(&mapping->page_lock); swap_free(swap); diff -prauN linux-2.5.70-bk10/net/ipv4/netfilter/ip_conntrack_core.c pgcl-2.5.70-bk10-1/net/ipv4/netfilter/ip_conntrack_core.c --- linux-2.5.70-bk10/net/ipv4/netfilter/ip_conntrack_core.c 2003-05-26 18:00:19.000000000 -0700 +++ pgcl-2.5.70-bk10-1/net/ipv4/netfilter/ip_conntrack_core.c 2003-06-05 09:44:34.000000000 -0700 @@ -1413,9 +1413,9 @@ int __init ip_conntrack_init(void) ip_conntrack_htable_size = hashsize; } else { ip_conntrack_htable_size - = (((num_physpages << PAGE_SHIFT) / 16384) + = (((num_physpages << MMUPAGE_SHIFT) / 16384) / sizeof(struct list_head)); - if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) + if (num_physpages > (1024 * 1024 * 1024 / MMUPAGE_SIZE)) ip_conntrack_htable_size = 8192; if (ip_conntrack_htable_size < 16) ip_conntrack_htable_size = 16; diff -prauN linux-2.5.70-bk10/net/ipv4/tcp.c pgcl-2.5.70-bk10-1/net/ipv4/tcp.c --- linux-2.5.70-bk10/net/ipv4/tcp.c 2003-06-05 05:44:01.000000000 -0700 +++ pgcl-2.5.70-bk10-1/net/ipv4/tcp.c 2003-06-05 09:44:34.000000000 -0700 @@ -2599,9 +2599,9 @@ void __init tcp_init(void) * The methodology is similar to that of the buffer cache. */ if (num_physpages >= (128 * 1024)) - goal = num_physpages >> (21 - PAGE_SHIFT); + goal = num_physpages >> (21 - MMUPAGE_SHIFT); else - goal = num_physpages >> (23 - PAGE_SHIFT); + goal = num_physpages >> (23 - MMUPAGE_SHIFT); for (order = 0; (1UL << order) < goal; order++) ;
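
For reference, the conversions scattered through the diff all follow one convention: MMUPAGE_SIZE is the hardware pte granularity (4KiB on i386), PAGE_SIZE is the clustered software page selected by CONFIG_PAGE_CLUSTER, and PAGE_MMUCOUNT is their ratio. VM bookkeeping such as vma extents, rss, total_vm, locked_vm and swap entry offsets moves to mmupage units, while struct page, page cache indices and swap_map[] stay per PAGE_SIZE, which is why the hunks multiply or divide by PAGE_MMUCOUNT at the boundaries (find_get_page(), swap_info_get(), pfn_pte() in the fault paths, and so on). The standalone sketch below only illustrates that arithmetic; the macro names and the PAGE_CLUSTER=3 (32KiB page) values are assumptions taken from the Kconfig help text, and the authoritative definitions belong to include/asm-i386/page.h, which is not reproduced here.

/*
 * Illustrative sketch only; not part of the patch.  Values and names
 * are assumed as described above.
 */
#include <assert.h>

#define MMUPAGE_SHIFT	12				/* hardware pte maps 4KiB */
#define MMUPAGE_SIZE	(1UL << MMUPAGE_SHIFT)
#define MMUPAGE_MASK	(~(MMUPAGE_SIZE - 1))
#define MMUPAGE_ALIGN(x) (((x) + MMUPAGE_SIZE - 1) & MMUPAGE_MASK)

#define PAGE_MMUSHIFT	3				/* CONFIG_PAGE_CLUSTER */
#define PAGE_MMUCOUNT	(1UL << PAGE_MMUSHIFT)		/* mmupages per struct page */
#define PAGE_SHIFT	(MMUPAGE_SHIFT + PAGE_MMUSHIFT)
#define PAGE_SIZE	(1UL << PAGE_SHIFT)		/* 32KiB software page */

int main(void)
{
	/* vma lengths, rss, total_vm and locked_vm are counted in mmupages */
	unsigned long len = 5 * MMUPAGE_SIZE + 123;
	unsigned long mmupages = MMUPAGE_ALIGN(len) >> MMUPAGE_SHIFT;
	assert(mmupages == 6);

	/*
	 * swap entries carry mmupage-granular offsets, but swap_map[] has
	 * one entry per PAGE_SIZE unit, hence the /PAGE_MMUCOUNT scaling
	 * seen in swap_free(), swap_duplicate() and map_swap_page()
	 */
	unsigned long swp_off = 40;			/* mmupage units */
	unsigned long swap_map_index = swp_off / PAGE_MMUCOUNT;
	assert(swap_map_index == 5);

	/* a pte may point at any mmupage inside a clustered page */
	unsigned long page_pfn = 0x1000;		/* pfn of the page's first mmupage */
	unsigned long suboffset = 3 * MMUPAGE_SIZE;	/* offset within the page */
	unsigned long pte_pfn = page_pfn + suboffset / MMUPAGE_SIZE;
	assert(pte_pfn == 0x1003);

	return 0;
}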