This is a standalone patch implementing a top-down vma allocation policy, similar to what Red Hat uses in conjunction with execshield (execshield itself being very large and having unclear performance implications for various low-level issues). It's also simplified in that it doesn't attempt to manage the ->free_area_cache, which the core VM doesn't understand how to set up for such a policy without lots of hooks. The total impact on core code is one #ifdef in fs/binfmt_elf.c. The primary focus of this patch, as it was originally written, was to compact virtualspace to reduce pagetable space requirements on smaller systems. Workloads interested in compact virtualspace layouts for the purpose of using large mappings may also find it useful. Without any intervention whatsoever (e.g. setting /proc/$PID/task_unmapped_base), all userspace programs receive the benefits of reduced pagetable space and the ability to mmap()/malloc() a whopping 2.9GB in a 3:1 split. Statically-linked programs using 256KB of stackspace or less receive the additional benefit of requiring only one pagetable page per process. The downside is that this virtualspace layout creates a 128MB limit on the userspace stack. Just as arrangements such as the one this patch provides can be explicitly set up by userspace, so can such stack arrangements. Or, alternatively, this top-down policy could be made a "personality", which is Linux's internal representation of an ABI, and the mainline behavior made a separate one. Tags in the ELF executable would then choose which policy to use at load time. The stack limit is primarily of interest to Fortran runtime systems, and it's not clear it's universal among them. It's never been seen to affect practical usage. There is an "oddity" with GLIBC_BUFFER: this workaround for glibc behavior was described to me by jejb as something he encountered during his PA-RISC support work. Without it, glibc's thread exit algorithms are nonfunctional. It's irrelevant to all other cases.
This has been in regular use as part of -wli since sometime before August (my guess as to when GLIBC_BUFFER bits went in). It's not had issues in some time. vs. 2.6.0-test9-bk24 diff -prauN linux-2.6.0-test9-bk24/arch/i386/Kconfig topdown-2.6.0-test9-bk24-1/arch/i386/Kconfig --- linux-2.6.0-test9-bk24/arch/i386/Kconfig 2003-10-25 11:43:01.000000000 -0700 +++ topdown-2.6.0-test9-bk24-1/arch/i386/Kconfig 2003-11-19 15:58:45.000000000 -0800 @@ -1217,6 +1217,15 @@ config FRAME_POINTER If you don't debug the kernel, you can say N, but we may not be able to solve problems without frame pointers. +config MMAP_TOPDOWN + bool "Top-down vma allocation" + help + Say Y here to have the kernel change its vma allocation policy + to allocate vma's from the top of the address space down, and + to shove the stack low so as to conserve virtualspace. This is + risky because various apps, including a number of versions of + ld.so, depend on the kernel's bottom-up behavior. + config X86_EXTRA_IRQS bool depends on X86_LOCAL_APIC || X86_VOYAGER diff -prauN linux-2.6.0-test9-bk24/arch/i386/mm/pgtable.c topdown-2.6.0-test9-bk24-1/arch/i386/mm/pgtable.c --- linux-2.6.0-test9-bk24/arch/i386/mm/pgtable.c 2003-10-25 11:45:07.000000000 -0700 +++ topdown-2.6.0-test9-bk24-1/arch/i386/mm/pgtable.c 2003-11-19 15:58:55.000000000 -0800 @@ -237,3 +237,60 @@ void pgd_free(pgd_t *pgd) /* in the non-PAE case, clear_page_tables() clears user pgd entries */ kmem_cache_free(pgd_cache, pgd); } + +#define GLIBC_BUFFER (32*1024*1024) + +/* + * This is total crap; it needs to use the free area cache to mitigate + * catastrophic O(n) search with many vmas. 
+ */ +unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma, *prev; + + len = PAGE_ALIGN(len); + addr = PAGE_ALIGN(addr); + + if (len > TASK_SIZE) + return -ENOMEM; + + if (addr) { + struct vm_area_struct *vma; + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) + goto out; + } + + if (!mm->mmap) { + if (len > TASK_SIZE - GLIBC_BUFFER) + addr = TASK_SIZE - len; + else + addr = TASK_SIZE - GLIBC_BUFFER - len; + goto out; + } + + addr = -ENOMEM; + for (prev = NULL, vma = mm->mmap; vma; prev = vma, vma = vma->vm_next) { + unsigned long lo, hi; + lo = prev ? prev->vm_end : 0; + hi = vma->vm_start; + if (hi - lo >= len && (addr == -ENOMEM || addr < hi - len)) + addr = hi - len; + } + /* + * We're at the last one; let's try the top, but only if nothing + * else can be found (to respect GLIBC_BUFFER). + */ + if (prev && TASK_SIZE - prev->vm_end >= len) { + if (TASK_SIZE - GLIBC_BUFFER - prev->vm_end >= len) + addr = TASK_SIZE - GLIBC_BUFFER - len; + else if (addr == -ENOMEM) + addr = TASK_SIZE - len; + } +out: + return addr; +} diff -prauN linux-2.6.0-test9-bk24/fs/binfmt_elf.c topdown-2.6.0-test9-bk24-1/fs/binfmt_elf.c --- linux-2.6.0-test9-bk24/fs/binfmt_elf.c 2003-10-25 11:43:32.000000000 -0700 +++ topdown-2.6.0-test9-bk24-1/fs/binfmt_elf.c 2003-11-19 15:58:45.000000000 -0800 @@ -7,6 +7,7 @@ * Tools". * * Copyright 1993, 1994: Eric Youngdale (ericy@cais.com). 
+ * Top-down vma allocation support, William Irwin, IBM, 2003 */ #include @@ -329,8 +330,13 @@ static unsigned long load_elf_interp(str if (retval < 0) goto out_close; +#ifndef CONFIG_MMAP_TOPDOWN eppnt = elf_phdata; for (i=0; ie_phnum; i++, eppnt++) { +#else + eppnt = &elf_phdata[interp_elf_ex->e_phnum - 1]; + for (i = interp_elf_ex->e_phnum - 1; i >= 0; --i, --eppnt) { +#endif if (eppnt->p_type == PT_LOAD) { int elf_type = MAP_PRIVATE | MAP_DENYWRITE; int elf_prot = 0; @@ -344,7 +350,8 @@ static unsigned long load_elf_interp(str if (interp_elf_ex->e_type == ET_EXEC || load_addr_set) elf_type |= MAP_FIXED; - map_addr = elf_map(interpreter, load_addr + vaddr, eppnt, elf_prot, elf_type); + map_addr = load_addr_set ? load_addr + vaddr : 0; + map_addr = elf_map(interpreter, map_addr, eppnt, elf_prot, elf_type); if (BAD_ADDR(map_addr)) goto out_close; diff -prauN linux-2.6.0-test9-bk24/include/asm-i386/a.out.h topdown-2.6.0-test9-bk24-1/include/asm-i386/a.out.h --- linux-2.6.0-test9-bk24/include/asm-i386/a.out.h 2003-10-25 11:43:27.000000000 -0700 +++ topdown-2.6.0-test9-bk24-1/include/asm-i386/a.out.h 2003-11-19 16:01:08.000000000 -0800 @@ -19,7 +19,16 @@ struct exec #ifdef __KERNEL__ +/* + * Typical ELF load address is 0x8048000, which is 128MB + 288KB. + * Shoving the stack very close to it lets smaller programs fit in + * a single pagetable page's worth of virtualspace. 
+ */ +#ifdef CONFIG_MMAP_TOPDOWN +#define STACK_TOP ((128 << 20) + (256 << 10)) +#else #define STACK_TOP TASK_SIZE +#endif #endif diff -prauN linux-2.6.0-test9-bk24/include/asm-i386/pgtable.h topdown-2.6.0-test9-bk24-1/include/asm-i386/pgtable.h --- linux-2.6.0-test9-bk24/include/asm-i386/pgtable.h 2003-10-25 11:44:13.000000000 -0700 +++ topdown-2.6.0-test9-bk24-1/include/asm-i386/pgtable.h 2003-11-19 15:58:45.000000000 -0800 @@ -25,6 +25,10 @@ #include #include +#ifdef CONFIG_MMAP_TOPDOWN +#define HAVE_ARCH_UNMAPPED_AREA +#endif + /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc..