diff -prauN linux-2.6.0-test1/Documentation/filesystems/Locking wli-2.6.0-test1-37/Documentation/filesystems/Locking --- linux-2.6.0-test1/Documentation/filesystems/Locking 2003-07-13 20:37:32.000000000 -0700 +++ wli-2.6.0-test1-37/Documentation/filesystems/Locking 2003-07-14 08:52:52.000000000 -0700 @@ -186,7 +186,7 @@ currently-in-progress I/O. If the filesystem is not called for "sync" and it determines that it would need to block against in-progress I/O to be able to start new I/O against the page the filesystem shoud redirty the page (usually with -__set_page_dirty_nobuffers()), then unlock the page and return zero. +set_page_dirty_nobuffers()), then unlock the page and return zero. This may also be done to avoid internal deadlocks, but rarely. If the filesytem is called for sync then it must wait on any diff -prauN linux-2.6.0-test1/Documentation/vm/locking wli-2.6.0-test1-37/Documentation/vm/locking --- linux-2.6.0-test1/Documentation/vm/locking 2003-07-13 20:32:34.000000000 -0700 +++ wli-2.6.0-test1-37/Documentation/vm/locking 2003-07-14 08:45:37.000000000 -0700 @@ -66,7 +66,7 @@ in some cases it is not really needed. E expand_stack(), it is hard to come up with a destructive scenario without having the vmlist protection in this case. -The page_table_lock nests with the inode i_shared_sem and the kmem cache +The page_table_lock nests with the inode i_shared_lock and the kmem cache c_spinlock spinlocks. This is okay, since the kmem code asks for pages after dropping c_spinlock. The page_table_lock also nests with pagecache_lock and pagemap_lru_lock spinlocks, and no code asks for memory with these locks diff -prauN linux-2.6.0-test1/Makefile wli-2.6.0-test1-37/Makefile --- linux-2.6.0-test1/Makefile 2003-07-13 20:35:55.000000000 -0700 +++ wli-2.6.0-test1-37/Makefile 2003-07-14 08:58:44.000000000 -0700 @@ -214,7 +214,7 @@ NOSTDINC_FLAGS = -nostdinc -iwithprefix CPPFLAGS := -D__KERNEL__ -Iinclude CFLAGS := $(CPPFLAGS) -Wall -Wstrict-prototypes -Wno-trigraphs -O2 \ - -fno-strict-aliasing -fno-common + -fno-strict-aliasing -fno-common -g AFLAGS := -D__ASSEMBLY__ $(CPPFLAGS) export VERSION PATCHLEVEL SUBLEVEL EXTRAVERSION KERNELRELEASE ARCH \ diff -prauN linux-2.6.0-test1/arch/alpha/kernel/smp.c wli-2.6.0-test1-37/arch/alpha/kernel/smp.c --- linux-2.6.0-test1/arch/alpha/kernel/smp.c 2003-07-13 20:32:32.000000000 -0700 +++ wli-2.6.0-test1-37/arch/alpha/kernel/smp.c 2003-07-14 06:31:09.000000000 -0700 @@ -71,7 +71,7 @@ static int smp_secondary_alive __initdat /* Which cpus ids came online. 
*/ unsigned long cpu_present_mask; -volatile unsigned long cpu_online_map; +cpumask_t cpu_online_map; /* cpus reported in the hwrpb */ static unsigned long hwrpb_cpu_present_mask __initdata = 0; @@ -132,7 +132,7 @@ smp_callin(void) { int cpuid = hard_smp_processor_id(); - if (test_and_set_bit(cpuid, &cpu_online_map)) { + if (cpu_test_and_set(cpuid, cpu_online_map)) { printk("??, cpu 0x%x already present??\n", cpuid); BUG(); } @@ -575,8 +575,8 @@ smp_prepare_boot_cpu(void) /* * Mark the boot cpu (current cpu) as both present and online */ - set_bit(smp_processor_id(), &cpu_present_mask); - set_bit(smp_processor_id(), &cpu_online_map); + cpu_set(smp_processor_id(), cpu_present_mask); + cpu_set(smp_processor_id(), cpu_online_map); } int __devinit diff -prauN linux-2.6.0-test1/arch/alpha/mm/remap.c wli-2.6.0-test1-37/arch/alpha/mm/remap.c --- linux-2.6.0-test1/arch/alpha/mm/remap.c 2003-07-13 20:29:27.000000000 -0700 +++ wli-2.6.0-test1-37/arch/alpha/mm/remap.c 2003-07-14 06:49:00.000000000 -0700 @@ -73,7 +73,7 @@ __alpha_remap_area_pages(unsigned long a spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_kernel(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.6.0-test1/arch/arm/mach-arc/mm.c wli-2.6.0-test1-37/arch/arm/mach-arc/mm.c --- linux-2.6.0-test1/arch/arm/mach-arc/mm.c 2003-07-13 20:33:46.000000000 -0700 +++ wli-2.6.0-test1-37/arch/arm/mach-arc/mm.c 2003-07-14 06:49:00.000000000 -0700 @@ -66,7 +66,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm goto no_pgd; /* - * This lock is here just to satisfy pmd_alloc and pte_lock + * This lock is here just to satisfy pmd_alloc_map() and pte_lock */ spin_lock(&mm->page_table_lock); @@ -74,13 +74,15 @@ pgd_t *get_pgd_slow(struct mm_struct *mm * On ARM, first page must always be allocated since it contains * the machine vectors. */ - new_pmd = pmd_alloc(mm, new_pgd, 0); + new_pmd = pmd_alloc_map(mm, new_pgd, 0); if (!new_pmd) goto no_pmd; - new_pte = pte_alloc_map(mm, new_pmd, 0); - if (!new_pte) + new_pte = pte_alloc_map(mm, new_pgd, &new_pmd, 0); + if (!new_pte) { + pmd_unmap(new_pmd); goto no_pte; + } init_pgd = pgd_offset_k(0); init_pmd = pmd_offset(init_pgd, 0); @@ -88,6 +90,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm set_pte(new_pte, *init_pte); pte_unmap_nested(init_pte); pte_unmap(new_pte); + pmd_unmap(new_pmd); /* * most of the page table entries are zeroed diff -prauN linux-2.6.0-test1/arch/arm/mm/consistent.c wli-2.6.0-test1-37/arch/arm/mm/consistent.c --- linux-2.6.0-test1/arch/arm/mm/consistent.c 2003-07-13 20:36:49.000000000 -0700 +++ wli-2.6.0-test1-37/arch/arm/mm/consistent.c 2003-07-14 06:49:00.000000000 -0700 @@ -325,7 +325,7 @@ static int __init consistent_init(void) do { pgd = pgd_offset(&init_mm, CONSISTENT_BASE); - pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE); + pmd = pmd_alloc_kernel(&init_mm, pgd, CONSISTENT_BASE); if (!pmd) { printk(KERN_ERR "consistent_init: out of pmd tables\n"); return -ENOMEM; diff -prauN linux-2.6.0-test1/arch/arm/mm/fault-armv.c wli-2.6.0-test1-37/arch/arm/mm/fault-armv.c --- linux-2.6.0-test1/arch/arm/mm/fault-armv.c 2003-07-13 20:31:52.000000000 -0700 +++ wli-2.6.0-test1-37/arch/arm/mm/fault-armv.c 2003-07-14 08:52:52.000000000 -0700 @@ -187,19 +187,22 @@ void __flush_dcache_page(struct page *pa __cpuc_flush_dcache_page(page_address(page)); - if (!page->mapping) + if (!page_mapping(page)) return; /* * With a VIVT cache, we need to also write back * and invalidate any user data. 
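
This patch systematically converts the kernel's CPU bitmasks from bare unsigned long words manipulated with set_bit()/test_and_set_bit() to the opaque cpumask_t type and its accessors (cpu_set(), cpu_isset(), cpu_test_and_set(), cpus_and(), ...), lifting the NR_CPUS <= BITS_PER_LONG limit; the alpha smp.c hunks above are representative. Below is a minimal userspace model of the idea -- illustrative only, the real definitions live in include/linux/cpumask.h and use atomic bitops:

#include <stdio.h>
#include <string.h>

#define NR_CPUS		255
#define BITS_PER_LONG	(8 * sizeof(unsigned long))
#define MASK_WORDS	((NR_CPUS + BITS_PER_LONG - 1) / BITS_PER_LONG)

/* model of cpumask_t: a fixed-size array of words, not one long */
typedef struct { unsigned long bits[MASK_WORDS]; } cpumask_t;

static void cpu_set(int cpu, cpumask_t *mask)
{
	mask->bits[cpu / BITS_PER_LONG] |= 1UL << (cpu % BITS_PER_LONG);
}

static int cpu_isset(int cpu, const cpumask_t *mask)
{
	return (mask->bits[cpu / BITS_PER_LONG] >> (cpu % BITS_PER_LONG)) & 1;
}

/* non-atomic model of cpu_test_and_set(); the kernel's is atomic */
static int cpu_test_and_set(int cpu, cpumask_t *mask)
{
	int old = cpu_isset(cpu, mask);
	cpu_set(cpu, mask);
	return old;
}

int main(void)
{
	cpumask_t online;

	memset(&online, 0, sizeof(online));
	/* bit 64 lands outside the first word; "1UL << 64" would not work */
	if (!cpu_test_and_set(64, &online))
		printf("cpu 64 came online\n");
	printf("cpu_isset(64): %d\n", cpu_isset(64, &online));
	return 0;
}

The kernel macros take the mask by name rather than by pointer; the pointers here just keep the model plain C.
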
*/ - list_for_each(l, &page->mapping->i_mmap_shared) { + list_for_each_rcu(l, &page_mapping(page)->i_mmap_shared) { struct vm_area_struct *mpnt; unsigned long off; mpnt = list_entry(l, struct vm_area_struct, shared); + if (mpnt->vm_flags & VM_DEAD) + continue; + /* * If this VMA is not in our MM, we can ignore it. */ @@ -230,12 +233,15 @@ make_coherent(struct vm_area_struct *vma * space, then we need to handle them specially to maintain * cache coherency. */ - list_for_each(l, &page->mapping->i_mmap_shared) { + list_for_each_rcu(l, &page_mapping(page)->i_mmap_shared) { struct vm_area_struct *mpnt; unsigned long off; mpnt = list_entry(l, struct vm_area_struct, shared); + if (mpnt->vm_flags & VM_DEAD) + continue; + /* * If this VMA is not in our MM, we can ignore it. * Note that we intentionally don't mask out the VMA @@ -288,7 +294,7 @@ void update_mmu_cache(struct vm_area_str if (!pfn_valid(pfn)) return; page = pfn_to_page(pfn); - if (page->mapping) { + if (page_mapping(page)) { int dirty = test_and_clear_bit(PG_dcache_dirty, &page->flags); if (dirty) diff -prauN linux-2.6.0-test1/arch/arm/mm/ioremap.c wli-2.6.0-test1-37/arch/arm/mm/ioremap.c --- linux-2.6.0-test1/arch/arm/mm/ioremap.c 2003-07-13 20:35:17.000000000 -0700 +++ wli-2.6.0-test1-37/arch/arm/mm/ioremap.c 2003-07-14 06:49:00.000000000 -0700 @@ -95,7 +95,7 @@ static int remap_area_pages(unsigned lon spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_kernel(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.6.0-test1/arch/arm/mm/minicache.c wli-2.6.0-test1-37/arch/arm/mm/minicache.c --- linux-2.6.0-test1/arch/arm/mm/minicache.c 2003-07-13 20:36:33.000000000 -0700 +++ wli-2.6.0-test1-37/arch/arm/mm/minicache.c 2003-07-14 06:49:00.000000000 -0700 @@ -57,7 +57,7 @@ static int __init minicache_init(void) pmd_t *pmd; pgd = pgd_offset_k(minicache_address); - pmd = pmd_alloc(&init_mm, pgd, minicache_address); + pmd = pmd_alloc_kernel(&init_mm, pgd, minicache_address); if (!pmd) BUG(); minicache_pte = pte_alloc_kernel(&init_mm, pmd, minicache_address); diff -prauN linux-2.6.0-test1/arch/arm/mm/mm-armv.c wli-2.6.0-test1-37/arch/arm/mm/mm-armv.c --- linux-2.6.0-test1/arch/arm/mm/mm-armv.c 2003-07-13 20:35:51.000000000 -0700 +++ wli-2.6.0-test1-37/arch/arm/mm/mm-armv.c 2003-07-14 06:49:00.000000000 -0700 @@ -131,7 +131,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm if (vectors_base() == 0) { /* - * This lock is here just to satisfy pmd_alloc and pte_lock + * This lock is here just to satisfy pmd_alloc_map() and pte_lock */ spin_lock(&mm->page_table_lock); @@ -139,20 +139,22 @@ pgd_t *get_pgd_slow(struct mm_struct *mm * On ARM, first page must always be allocated since it * contains the machine vectors. 
*/ - new_pmd = pmd_alloc(mm, new_pgd, 0); + new_pmd = pmd_alloc_map(mm, new_pgd, 0); if (!new_pmd) goto no_pmd; - new_pte = pte_alloc_map(mm, new_pmd, 0); - if (!new_pte) + new_pte = pte_alloc_map(mm, new_pgd, &new_pmd, 0); + if (!new_pte) { + pmd_unmap(new_pmd); goto no_pte; + } init_pmd = pmd_offset(init_pgd, 0); init_pte = pte_offset_map_nested(init_pmd, 0); set_pte(new_pte, *init_pte); pte_unmap_nested(init_pte); pte_unmap(new_pte); - + pmd_unmap(new_pmd); spin_unlock(&mm->page_table_lock); } diff -prauN linux-2.6.0-test1/arch/arm26/mm/mm-memc.c wli-2.6.0-test1-37/arch/arm26/mm/mm-memc.c --- linux-2.6.0-test1/arch/arm26/mm/mm-memc.c 2003-07-13 20:38:47.000000000 -0700 +++ wli-2.6.0-test1-37/arch/arm26/mm/mm-memc.c 2003-07-14 06:49:00.000000000 -0700 @@ -79,7 +79,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm goto no_pgd; /* - * This lock is here just to satisfy pmd_alloc and pte_lock + * This lock is here just to satisfy pmd_alloc_kernel() and pte_lock * FIXME: I bet we could avoid taking it pretty much altogether */ spin_lock(&mm->page_table_lock); @@ -88,7 +88,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm * On ARM, first page must always be allocated since it contains * the machine vectors. */ - new_pmd = pmd_alloc(mm, new_pgd, 0); + new_pmd = pmd_alloc_kernel(mm, new_pgd, 0); if (!new_pmd) goto no_pmd; diff -prauN linux-2.6.0-test1/arch/cris/mm/ioremap.c wli-2.6.0-test1-37/arch/cris/mm/ioremap.c --- linux-2.6.0-test1/arch/cris/mm/ioremap.c 2003-07-13 20:30:40.000000000 -0700 +++ wli-2.6.0-test1-37/arch/cris/mm/ioremap.c 2003-07-14 06:49:00.000000000 -0700 @@ -78,7 +78,7 @@ static int remap_area_pages(unsigned lon spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_kernel(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.6.0-test1/arch/i386/Kconfig wli-2.6.0-test1-37/arch/i386/Kconfig --- linux-2.6.0-test1/arch/i386/Kconfig 2003-07-13 20:30:48.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/Kconfig 2003-07-14 09:33:21.000000000 -0700 @@ -397,6 +397,11 @@ config X86_OOSTORE depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 default y +config X86_CMOV + bool + depends on M686 || MPENTIUMII || MPENTIUMIII || MPENTIUM4 || MK8 || MCRUSOE + default y + config HUGETLB_PAGE bool "Huge TLB Page Support" help @@ -438,9 +443,9 @@ config SMP If you don't know what to do here, say N. config NR_CPUS - int "Maximum number of CPUs (2-32)" + int "Maximum number of CPUs (2-255)" depends on SMP - default "32" + default "8" help This allows you to specify the maximum number of CPUs which this kernel will support. The maximum supported value is 32 and the @@ -723,6 +728,26 @@ config HIGHPTE low memory. Setting this option will put user-space page table entries in high memory. +config HIGHPMD + bool "Allocate 2nd-level pagetables from highmem" + depends on HIGHMEM64G && HIGHPTE + help + The VM uses one lowmem-allocated pmd entry for each pagetable + page of physical memory allocated, and preallocates them all + for 12KB of per-process lowmem overhead. For systems with + extreme amounts of highmem, this cannot be tolerated. Setting + this option will put userspace 2nd-level pagetables in highmem. + +config 4K_STACK + bool "Use smaller 4k per-task stacks" + help + This option will shrink the kernel's per-task stack from 8k to + 4k. This will greatly increase your chance of overflowing it. + But, if you use the per-cpu interrupt stacks as well, your chances + go way down. 
Also try the CONFIG_X86_STACK_CHECK overflow + detection. It is much more reliable than the currently in-kernel + version. + config MATH_EMULATION bool "Math emulation" ---help--- @@ -1368,6 +1393,34 @@ config FRAME_POINTER If you don't debug the kernel, you can say N, but we may not be able to solve problems without frame pointers. +config X86_STACK_CHECK + bool "Detect stack overflows" + depends on FRAME_POINTER + help + Say Y here to have the kernel attempt to detect when the per-task + kernel stack overflows. This is much more robust checking than + the above overflow check, which will only occasionally detect + an overflow. The level of guarantee here is much greater. + + Some older versions of gcc don't handle the -p option correctly. + Kernprof is affected by the same problem, which is described here: + http://oss.sgi.com/projects/kernprof/faq.html#Q9 + + Basically, if you get oopses in __free_pages_ok during boot when + you have this turned on, you need to fix gcc. The Redhat 2.96 + version and gcc-3.x seem to work. + + If not debugging a stack overflow problem, say N + +config MMAP_TOPDOWN + bool "Top-down vma allocation" + help + Say Y here to have the kernel change its vma allocation policy + to allocate vma's from the top of the address space down, and + to shove the stack low so as to conserve virtualspace. This is + risky because various apps, including a number of versions of + ld.so, depend on the kernel's bottom-up behavior. + config X86_EXTRA_IRQS bool depends on X86_LOCAL_APIC || X86_VOYAGER diff -prauN linux-2.6.0-test1/arch/i386/Makefile wli-2.6.0-test1-37/arch/i386/Makefile --- linux-2.6.0-test1/arch/i386/Makefile 2003-07-13 20:32:42.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/Makefile 2003-07-14 08:40:19.000000000 -0700 @@ -85,6 +85,10 @@ mcore-$(CONFIG_X86_ES7000) := mach-es700 # default subarch .h files mflags-y += -Iinclude/asm-i386/mach-default +ifdef CONFIG_X86_STACK_CHECK +CFLAGS += -p +endif + head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o libs-y += arch/i386/lib/ diff -prauN linux-2.6.0-test1/arch/i386/boot/compressed/misc.c wli-2.6.0-test1-37/arch/i386/boot/compressed/misc.c --- linux-2.6.0-test1/arch/i386/boot/compressed/misc.c 2003-07-13 20:35:12.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/boot/compressed/misc.c 2003-07-14 08:40:19.000000000 -0700 @@ -379,3 +379,7 @@ asmlinkage int decompress_kernel(struct if (high_loaded) close_output_buffer_if_we_run_high(mv); return high_loaded; } + +/* We don't actually check for stack overflows this early. */ +__asm__(".globl mcount ; mcount: ret\n"); + diff -prauN linux-2.6.0-test1/arch/i386/kernel/apic.c wli-2.6.0-test1-37/arch/i386/kernel/apic.c --- linux-2.6.0-test1/arch/i386/kernel/apic.c 2003-07-13 20:39:27.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/kernel/apic.c 2003-07-14 08:39:52.000000000 -0700 @@ -1037,7 +1037,8 @@ inline void smp_local_timer_interrupt(st * interrupt as well. Thus we cannot inline the local irq ... ] */ -void smp_apic_timer_interrupt(struct pt_regs regs) +struct pt_regs * IRQHANDLER(smp_apic_timer_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_apic_timer_interrupt(struct pt_regs* regs) { int cpu = smp_processor_id(); @@ -1057,14 +1058,16 @@ void smp_apic_timer_interrupt(struct pt_ * interrupt lock, which is the WrongThing (tm) to do. 
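
CONFIG_X86_STACK_CHECK (added above) works by building the kernel with -p, so gcc plants a call to mcount() in every function prologue; the mcount stub added to entry.S later in this patch measures how far the stack pointer has sunk within its THREAD_SIZE-aligned stack, compares that against STACK_WARN, and switches to a dedicated overflow stack to report. The dummy "mcount: ret" added to the decompressor's misc.c exists only because that file is built with the same CFLAGS but runs long before any of this machinery is usable. The test itself, rendered in C as a sketch -- stack_bytes_left() and mcount_check() are names invented here, the real check is the open-coded assembly in entry.S, and THREAD_SIZE/STACK_WARN come from this patch's thread_info.h changes:

/* The kernel stack is THREAD_SIZE bytes and THREAD_SIZE-aligned, so
 * masking the stack pointer gives its offset above the stack's base,
 * i.e. how much room remains (thread_info lives at that base). */
static inline unsigned long stack_bytes_left(void)
{
	unsigned long esp;

	__asm__("movl %%esp, %0" : "=r" (esp));
	return esp & (THREAD_SIZE - 1);
}

/* what ENTRY(mcount) checks before deciding to raise the alarm */
static void mcount_check(void)
{
	if (stack_bytes_left() <= STACK_WARN) {
		/* the real stub switches to stack_overflow_stack, then
		 * calls stack_overflow(esp, eip) -- see process.c below */
	}
}
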
*/ irq_enter(); - smp_local_timer_interrupt(®s); + smp_local_timer_interrupt(regs); irq_exit(); + return regs; } /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -asmlinkage void smp_spurious_interrupt(void) +struct pt_regs * IRQHANDLER(smp_spurious_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_spurious_interrupt(struct pt_regs* regs) { unsigned long v; @@ -1082,13 +1085,15 @@ asmlinkage void smp_spurious_interrupt(v printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never happen.\n", smp_processor_id()); irq_exit(); + return regs; } /* * This interrupt should never happen with our APIC/SMP architecture */ -asmlinkage void smp_error_interrupt(void) +struct pt_regs * IRQHANDLER(smp_error_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_error_interrupt(struct pt_regs* regs) { unsigned long v, v1; @@ -1113,6 +1118,7 @@ asmlinkage void smp_error_interrupt(void printk (KERN_INFO "APIC error on CPU%d: %02lx(%02lx)\n", smp_processor_id(), v , v1); irq_exit(); + return regs; } /* @@ -1137,7 +1143,7 @@ int __init APIC_init_uniprocessor (void) connect_bsp_APIC(); - phys_cpu_present_map = 1 << boot_cpu_physical_apicid; + phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); setup_local_APIC(); diff -prauN linux-2.6.0-test1/arch/i386/kernel/apm.c wli-2.6.0-test1-37/arch/i386/kernel/apm.c --- linux-2.6.0-test1/arch/i386/kernel/apm.c 2003-07-13 20:29:30.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/kernel/apm.c 2003-07-14 06:35:26.000000000 -0700 @@ -506,32 +506,20 @@ static void apm_error(char *str, int err * Lock APM functionality to physical CPU 0 */ -#ifdef CONFIG_SMP - -static unsigned long apm_save_cpus(void) +static cpumask_t apm_save_cpus(void) { - unsigned long x = current->cpus_allowed; + cpumask_t x = current->cpus_allowed; /* Some bioses don't like being called from CPU != 0 */ - set_cpus_allowed(current, 1UL << 0); + set_cpus_allowed(current, cpumask_of_cpu(0)); BUG_ON(smp_processor_id() != 0); return x; } -static inline void apm_restore_cpus(unsigned long mask) +static inline void apm_restore_cpus(cpumask_t mask) { set_cpus_allowed(current, mask); } -#else - -/* - * No CPU lockdown needed on a uniprocessor - */ - -#define apm_save_cpus() 0 -#define apm_restore_cpus(x) (void)(x) - -#endif /* * These are the actual BIOS calls. Depending on APM_ZERO_SEGS and @@ -593,7 +581,7 @@ static u8 apm_bios_call(u32 func, u32 eb { APM_DECL_SEGS unsigned long flags; - unsigned long cpus; + cpumask_t cpus; int cpu; struct desc_struct save_desc_40; @@ -635,7 +623,7 @@ static u8 apm_bios_call_simple(u32 func, u8 error; APM_DECL_SEGS unsigned long flags; - unsigned long cpus; + cpumask_t cpus; int cpu; struct desc_struct save_desc_40; @@ -913,7 +901,7 @@ static void apm_power_off(void) */ #ifdef CONFIG_SMP /* Some bioses don't like being called from CPU != 0 */ - set_cpus_allowed(current, 1UL << 0); + set_cpus_allowed(current, cpumask_of_cpu(0)); BUG_ON(smp_processor_id() != 0); #endif if (apm_info.realmode_power_off) @@ -1704,7 +1692,7 @@ static int apm(void *unused) * Some bioses don't like being called from CPU != 0. * Method suggested by Ingo Molnar. 
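
The APM conversion keeps the existing save/pin/restore pattern around every BIOS call; only the type changes, and the old #ifdef CONFIG_SMP stubs can be deleted outright, presumably because cpumask_of_cpu(0) and set_cpus_allowed() are well-defined on uniprocessor builds as well. The usage shape, as apm_bios_call() above employs it (sketch, using only functions from the hunks above):

	cpumask_t cpus;

	cpus = apm_save_cpus();		/* pin to CPU 0; some BIOSes insist */
	/* ... issue the APM BIOS call with the 0x40 segment set up ... */
	apm_restore_cpus(cpus);		/* put the old affinity back */
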
*/ - set_cpus_allowed(current, 1UL << 0); + set_cpus_allowed(current, cpumask_of_cpu(0)); BUG_ON(smp_processor_id() != 0); #endif diff -prauN linux-2.6.0-test1/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c wli-2.6.0-test1-37/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c --- linux-2.6.0-test1/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c 2003-07-13 20:38:00.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c 2003-07-14 06:38:10.000000000 -0700 @@ -53,10 +53,10 @@ static int stock_freq; static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate) { u32 l, h; - unsigned long cpus_allowed; + cpumask_t cpus_allowed; struct cpufreq_freqs freqs; int hyperthreading = 0; - int affected_cpu_map = 0; + cpumask_t affected_cpu_map = 0; int sibling = 0; if (!cpu_online(cpu) || (newstate > DC_DISABLE) || @@ -67,16 +67,16 @@ static int cpufreq_p4_setdc(unsigned int cpus_allowed = current->cpus_allowed; /* only run on CPU to be set, or on its sibling */ - affected_cpu_map = 1 << cpu; + affected_cpu_map = cpumask_of_cpu(cpu); #ifdef CONFIG_X86_HT hyperthreading = ((cpu_has_ht) && (smp_num_siblings == 2)); if (hyperthreading) { sibling = cpu_sibling_map[cpu]; - affected_cpu_map |= (1 << sibling); + cpus_or(affected_cpu_map, affected_cpu_map, cpumask_of_cpu(sibling)); } #endif set_cpus_allowed(current, affected_cpu_map); - BUG_ON(!(affected_cpu_map & (1 << smp_processor_id()))); + BUG_ON(!cpu_isset(smp_processor_id(), affected_cpu_map)); /* get current state */ rdmsr(MSR_IA32_THERM_CONTROL, l, h); diff -prauN linux-2.6.0-test1/arch/i386/kernel/cpu/mcheck/p4.c wli-2.6.0-test1-37/arch/i386/kernel/cpu/mcheck/p4.c --- linux-2.6.0-test1/arch/i386/kernel/cpu/mcheck/p4.c 2003-07-13 20:35:16.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/kernel/cpu/mcheck/p4.c 2003-07-14 08:39:52.000000000 -0700 @@ -61,11 +61,13 @@ static void intel_thermal_interrupt(stru /* Thermal interrupt handler for this CPU setup */ static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; -asmlinkage void smp_thermal_interrupt(struct pt_regs regs) +struct pt_regs * IRQHANDLER(smp_thermal_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_thermal_interrupt(struct pt_regs* regs) { irq_enter(); vendor_thermal_interrupt(®s); irq_exit(); + return regs; } /* P4/Xeon Thermal regulation detect and init */ diff -prauN linux-2.6.0-test1/arch/i386/kernel/cpu/proc.c wli-2.6.0-test1-37/arch/i386/kernel/cpu/proc.c --- linux-2.6.0-test1/arch/i386/kernel/cpu/proc.c 2003-07-13 20:37:33.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/kernel/cpu/proc.c 2003-07-14 06:31:09.000000000 -0700 @@ -60,7 +60,7 @@ static int show_cpuinfo(struct seq_file int fpu_exception; #ifdef CONFIG_SMP - if (!(cpu_online_map & (1<f_dentry->d_inode->i_rdev); struct cpuinfo_x86 *c = &(cpu_data)[cpu]; - if ( !(cpu_online_map & (1UL << cpu)) ) + if (!cpu_online(cpu)) return -ENXIO; /* No such CPU */ if ( c->cpuid_level < 0 ) return -EIO; /* CPUID not supported */ diff -prauN linux-2.6.0-test1/arch/i386/kernel/entry.S wli-2.6.0-test1-37/arch/i386/kernel/entry.S --- linux-2.6.0-test1/arch/i386/kernel/entry.S 2003-07-13 20:32:29.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/kernel/entry.S 2003-07-14 08:40:19.000000000 -0700 @@ -160,7 +160,7 @@ do_lcall: movl %eax,EFLAGS(%ebp) # movl %edx,EIP(%ebp) # Now we move them to their "normal" places movl %ecx,CS(%ebp) # - andl $-8192, %ebp # GET_THREAD_INFO + GET_THREAD_INFO_WITH_ESP(%ebp) # GET_THREAD_INFO movl TI_EXEC_DOMAIN(%ebp), %edx # Get the execution domain call 
*4(%edx) # Call the lcall7 handler for the domain addl $4, %esp @@ -394,17 +394,78 @@ ENTRY(irq_entries_start) vector=vector+1 .endr + +# lets play optimizing compiler... +#ifdef CONFIG_X86_CMOV +#define COND_MOVE cmovnz %esi,%esp; +#else +#define COND_MOVE \ + jz 1f; \ + mov %esi,%esp; \ +1: +#endif + +# These macros will switch you to, and from a per-cpu interrupt stack +# They take the pt_regs arg and move it from the normal place on the +# stack to %eax. Any handler function can retrieve it using regparm(1). +# The handlers are expected to return the stack to switch back to in +# the same register. +# +# This means that the irq handlers need to return their arg +# +# SWITCH_TO_IRQSTACK clobbers %ebx, %ecx, %edx, %esi +# old stack gets put in %eax + +.macro SWITCH_TO_IRQSTACK + GET_THREAD_INFO(%ebx); + movl TI_IRQ_STACK(%ebx),%ecx; + movl TI_TASK(%ebx),%edx; + movl %esp,%eax; + + # %ecx+THREAD_SIZE is next stack -4 keeps us in the right one + leal (THREAD_SIZE-4)(%ecx),%esi; + + # is there a valid irq_stack? + testl %ecx,%ecx; + COND_MOVE; + + # update the task pointer in the irq stack + GET_THREAD_INFO(%esi); + movl %edx,TI_TASK(%esi); + + # update the preempt count in the irq stack + movl TI_PRE_COUNT(%ebx),%ecx; + movl %ecx,TI_PRE_COUNT(%esi); +.endm + +# copy flags from the irq stack back into the task's thread_info +# %esi is saved over the irq handler call and contains the irq stack's +# thread_info pointer +# %eax was returned from the handler, as described above +# %ebx contains the original thread_info pointer + +.macro RESTORE_FROM_IRQSTACK + movl %eax,%esp; + movl TI_FLAGS(%esi),%eax; + movl $0,TI_FLAGS(%esi); + LOCK orl %eax,TI_FLAGS(%ebx); +.endm + ALIGN common_interrupt: SAVE_ALL + SWITCH_TO_IRQSTACK call do_IRQ + RESTORE_FROM_IRQSTACK jmp ret_from_intr #define BUILD_INTERRUPT(name, nr) \ ENTRY(name) \ pushl $nr-256; \ SAVE_ALL \ - call smp_/**/name; \ + SWITCH_TO_IRQSTACK; \ + call smp_/**/name; \ + RESTORE_FROM_IRQSTACK; \ jmp ret_from_intr; /* The include is where all of the SMP etc. interrupts come from */ @@ -604,6 +665,61 @@ ENTRY(spurious_interrupt_bug) pushl $do_spurious_interrupt_bug jmp error_code + +#ifdef CONFIG_X86_STACK_CHECK +.data + .globl stack_overflowed +stack_overflowed: + .long 0 +.text + +ENTRY(mcount) + push %eax + movl $(THREAD_SIZE - 1),%eax + andl %esp,%eax + cmpl $STACK_WARN,%eax /* more than half the stack is used*/ + jle 1f +2: + popl %eax + ret +1: + lock; btsl $0,stack_overflowed + jc 2b + + # switch to overflow stack + movl %esp,%eax + movl $(stack_overflow_stack + THREAD_SIZE - 4),%esp + + pushf + cli + pushl %eax + + # push eip then esp of error for stack_overflow_panic + pushl 4(%eax) + pushl %eax + + # update the task pointer and cpu in the overflow stack's thread_info. 
+ GET_THREAD_INFO_WITH_ESP(%eax) + movl TI_TASK(%eax),%ebx + movl %ebx,stack_overflow_stack+TI_TASK + movl TI_CPU(%eax),%ebx + movl %ebx,stack_overflow_stack+TI_CPU + + call stack_overflow + + # pop off call arguments + addl $8,%esp + + popl %eax + popf + movl %eax,%esp + popl %eax + movl $0,stack_overflowed + ret + +#warning stack check enabled +#endif + .data ENTRY(sys_call_table) .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ diff -prauN linux-2.6.0-test1/arch/i386/kernel/head.S wli-2.6.0-test1-37/arch/i386/kernel/head.S --- linux-2.6.0-test1/arch/i386/kernel/head.S 2003-07-13 20:30:38.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/kernel/head.S 2003-07-14 08:36:18.000000000 -0700 @@ -16,6 +16,7 @@ #include #include #include +#include #define OLD_CL_MAGIC_ADDR 0x90020 #define OLD_CL_MAGIC 0xA33F @@ -325,7 +326,7 @@ rp_sidt: ret ENTRY(stack_start) - .long init_thread_union+8192 + .long init_thread_union+THREAD_SIZE .long __BOOT_DS /* This is the default interrupt "handler" :-) */ diff -prauN linux-2.6.0-test1/arch/i386/kernel/i386_ksyms.c wli-2.6.0-test1-37/arch/i386/kernel/i386_ksyms.c --- linux-2.6.0-test1/arch/i386/kernel/i386_ksyms.c 2003-07-13 20:39:21.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/kernel/i386_ksyms.c 2003-07-14 08:40:19.000000000 -0700 @@ -209,3 +209,8 @@ EXPORT_SYMBOL(kmap_atomic_to_page); EXPORT_SYMBOL(edd); EXPORT_SYMBOL(eddnr); #endif + +#ifdef CONFIG_X86_STACK_CHECK +extern void mcount(void); +EXPORT_SYMBOL(mcount); +#endif diff -prauN linux-2.6.0-test1/arch/i386/kernel/init_task.c wli-2.6.0-test1-37/arch/i386/kernel/init_task.c --- linux-2.6.0-test1/arch/i386/kernel/init_task.c 2003-07-13 20:38:44.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/kernel/init_task.c 2003-07-14 10:07:15.000000000 -0700 @@ -8,12 +8,21 @@ #include #include +static struct fs_dirs init_dirs; static struct fs_struct init_fs = INIT_FS; static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +union thread_union init_irq_union + __attribute__((__section__(".data.init_task"))); + +#ifdef CONFIG_X86_STACK_CHECK +union thread_union stack_overflow_stack + __attribute__((__section__(".data.init_task"))); +#endif + /* * Initial thread structure. * diff -prauN linux-2.6.0-test1/arch/i386/kernel/io_apic.c wli-2.6.0-test1-37/arch/i386/kernel/io_apic.c --- linux-2.6.0-test1/arch/i386/kernel/io_apic.c 2003-07-13 20:32:40.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/kernel/io_apic.c 2003-07-14 06:33:08.000000000 -0700 @@ -249,14 +249,14 @@ static void clear_IO_APIC (void) clear_IO_APIC_pin(apic, pin); } -static void set_ioapic_affinity (unsigned int irq, unsigned long cpu_mask) +static void set_ioapic_affinity(unsigned int irq, cpumask_t cpumask) { unsigned long flags; int pin; struct irq_pin_list *entry = irq_2_pin + irq; unsigned int apicid_value; - apicid_value = cpu_mask_to_apicid(cpu_mask); + apicid_value = cpu_mask_to_apicid(mk_cpumask_const(cpumask)); /* Prepare to do the io_apic_write */ apicid_value = apicid_value << 24; spin_lock_irqsave(&ioapic_lock, flags); @@ -286,9 +286,9 @@ static void set_ioapic_affinity (unsigne # define Dprintk(x...) 
# endif -extern unsigned long irq_affinity[NR_IRQS]; +extern cpumask_t irq_affinity[NR_IRQS]; -static int __cacheline_aligned pending_irq_balance_cpumask[NR_IRQS]; +static cpumask_t __cacheline_aligned pending_irq_balance_cpumask[NR_IRQS]; #define IRQBALANCE_CHECK_ARCH -999 static int irqbalance_disabled = IRQBALANCE_CHECK_ARCH; @@ -307,8 +307,7 @@ struct irq_cpu_info { #define IDLE_ENOUGH(cpu,now) \ (idle_cpu(cpu) && ((now) - irq_stat[(cpu)].idle_timestamp > 1)) -#define IRQ_ALLOWED(cpu,allowed_mask) \ - ((1 << cpu) & (allowed_mask)) +#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) #define CPU_TO_PACKAGEINDEX(i) \ ((physical_balance && i > cpu_sibling_map[i]) ? cpu_sibling_map[i] : i) @@ -320,7 +319,7 @@ struct irq_cpu_info { long balanced_irq_interval = MAX_BALANCED_IRQ_INTERVAL; -static unsigned long move(int curr_cpu, unsigned long allowed_mask, +static unsigned long move(int curr_cpu, cpumask_t allowed_mask, unsigned long now, int direction) { int search_idle = 1; @@ -350,20 +349,20 @@ inside: static inline void balance_irq(int cpu, int irq) { unsigned long now = jiffies; - unsigned long allowed_mask; + cpumask_t allowed_mask; unsigned int new_cpu; if (irqbalance_disabled) return; - allowed_mask = cpu_online_map & irq_affinity[irq]; + cpus_and(allowed_mask, cpu_online_map, irq_affinity[irq]); new_cpu = move(cpu, allowed_mask, now, 1); if (cpu != new_cpu) { irq_desc_t *desc = irq_desc + irq; unsigned long flags; spin_lock_irqsave(&desc->lock, flags); - pending_irq_balance_cpumask[irq] = 1 << new_cpu; + pending_irq_balance_cpumask[irq] = cpumask_of_cpu(new_cpu); spin_unlock_irqrestore(&desc->lock, flags); } } @@ -399,8 +398,7 @@ static void do_irq_balance(void) int tmp_loaded, first_attempt = 1; unsigned long tmp_cpu_irq; unsigned long imbalance = 0; - unsigned long allowed_mask; - unsigned long target_cpu_mask; + cpumask_t allowed_mask, target_cpu_mask, tmp; for (i = 0; i < NR_CPUS; i++) { int package_index; @@ -549,10 +547,11 @@ tryanotherirq: CPU_IRQ(cpu_sibling_map[min_loaded])) min_loaded = cpu_sibling_map[min_loaded]; - allowed_mask = cpu_online_map & irq_affinity[selected_irq]; - target_cpu_mask = 1 << min_loaded; + cpus_and(allowed_mask, cpu_online_map, irq_affinity[selected_irq]); + target_cpu_mask = cpumask_of_cpu(min_loaded); + cpus_and(tmp, target_cpu_mask, allowed_mask); - if (target_cpu_mask & allowed_mask) { + if (!cpus_empty(tmp)) { irq_desc_t *desc = irq_desc + selected_irq; unsigned long flags; @@ -560,7 +559,8 @@ tryanotherirq: selected_irq, min_loaded); /* mark for change destination */ spin_lock_irqsave(&desc->lock, flags); - pending_irq_balance_cpumask[selected_irq] = 1 << min_loaded; + pending_irq_balance_cpumask[selected_irq] = + cpumask_of_cpu(min_loaded); spin_unlock_irqrestore(&desc->lock, flags); /* Since we made a change, come back sooner to * check for more variation. @@ -591,8 +591,9 @@ int balanced_irq(void *unused) daemonize("kirqd"); /* push everything to CPU 0 to give us a starting point. */ - for (i = 0 ; i < NR_IRQS ; i++) - pending_irq_balance_cpumask[i] = 1; + for (i = 0 ; i < NR_IRQS ; i++) { + pending_irq_balance_cpumask[i] = cpumask_of_cpu(0); + } repeat: set_current_state(TASK_INTERRUPTIBLE); @@ -611,7 +612,9 @@ static int __init balanced_irq_init(void { int i; struct cpuinfo_x86 *c; + cpumask_t tmp; + cpus_shift_right(tmp, cpu_online_map, 2); c = &boot_cpu_data; /* When not overwritten by the command line ask subarchitecture. 
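
The irq-balancing code's eligibility test and its physical-package heuristic both need rewriting because a cpumask_t supports neither & nor >> directly. IRQ_ALLOWED becomes cpu_isset() on a cpus_and() of the online map and the irq's affinity, and the old "cpu_online_map >> 2" (true when any CPU numbered >= 2 is online, which with two HT siblings per package implies a second package) becomes a cpus_shift_right() into a scratch mask tested with cpus_empty() -- computed above, used just below. The two converted tests side by side, as a fragment-style sketch:

	cpumask_t allowed, tmp;

	/* IRQ_ALLOWED, as converted: may this irq be routed to this cpu? */
	cpus_and(allowed, cpu_online_map, irq_affinity[irq]);
	if (cpu_isset(cpu, allowed)) {
		/* cpu is online and permitted by /proc/irq/N/smp_affinity */
	}

	/* the old "cpu_online_map >> 2": any cpu numbered >= 2 online? */
	cpus_shift_right(tmp, cpu_online_map, 2);
	if (!cpus_empty(tmp))
		physical_balance = 1;
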
*/ if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH) @@ -628,7 +631,7 @@ static int __init balanced_irq_init(void * Enable physical balance only if more than 1 physical processor * is present */ - if (smp_num_siblings > 1 && cpu_online_map >> 2) + if (smp_num_siblings > 1 && !cpus_empty(tmp)) physical_balance = 1; for (i = 0; i < NR_CPUS; i++) { @@ -667,14 +670,14 @@ static int __init irqbalance_disable(cha __setup("noirqbalance", irqbalance_disable); -static void set_ioapic_affinity (unsigned int irq, unsigned long mask); +static void set_ioapic_affinity(unsigned int irq, cpumask_t mask); static inline void move_irq(int irq) { /* note - we hold the desc->lock */ - if (unlikely(pending_irq_balance_cpumask[irq])) { + if (unlikely(!cpus_empty(pending_irq_balance_cpumask[irq]))) { set_ioapic_affinity(irq, pending_irq_balance_cpumask[irq]); - pending_irq_balance_cpumask[irq] = 0; + cpus_clear(pending_irq_balance_cpumask[irq]); } } @@ -822,7 +825,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, * we need to reprogram the ioredtbls to cater for the cpus which have come online * so mask in all cases should simply be TARGET_CPUS */ -void __init setup_ioapic_dest (unsigned long mask) +void __init setup_ioapic_dest(cpumask_t mask) { int pin, ioapic, irq, irq_entry; @@ -1598,7 +1601,7 @@ void disable_IO_APIC(void) static void __init setup_ioapic_ids_from_mpc(void) { union IO_APIC_reg_00 reg_00; - unsigned long phys_id_present_map; + physid_mask_t phys_id_present_map; int apic; int i; unsigned char old_id; @@ -1608,6 +1611,10 @@ static void __init setup_ioapic_ids_from /* This gets done during IOAPIC enumeration for ACPI. */ return; + /* + * This is broken; anything with a real cpu count has to + * circumvent this idiocy regardless. + */ phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); /* @@ -1639,18 +1646,20 @@ static void __init setup_ioapic_ids_from mp_ioapics[apic].mpc_apicid)) { printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", apic, mp_ioapics[apic].mpc_apicid); - for (i = 0; i < 0xf; i++) - if (!(phys_id_present_map & (1 << i))) + for (i = 0; i < APIC_BROADCAST_ID; i++) + if (!physid_isset(i, phys_id_present_map)) break; - if (i >= 0xf) + if (i >= APIC_BROADCAST_ID) panic("Max APIC ID exceeded!\n"); printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", i); - phys_id_present_map |= 1 << i; + physid_set(i, phys_id_present_map); mp_ioapics[apic].mpc_apicid = i; } else { + physid_mask_t tmp; + tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid); printk("Setting %d in the phys_id_present_map\n", mp_ioapics[apic].mpc_apicid); - phys_id_present_map |= apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid); + physids_or(phys_id_present_map, phys_id_present_map, tmp); } @@ -2220,7 +2229,8 @@ late_initcall(io_apic_bug_finalize); int __init io_apic_get_unique_id (int ioapic, int apic_id) { union IO_APIC_reg_00 reg_00; - static unsigned long apic_id_map = 0; + static physid_mask_t apic_id_map = PHYSID_MASK_NONE; + physid_mask_t tmp; unsigned long flags; int i = 0; @@ -2233,8 +2243,8 @@ int __init io_apic_get_unique_id (int io * advantage of new APIC bus architecture. 
*/ - if (!apic_id_map) - apic_id_map = phys_cpu_present_map; + if (physids_empty(apic_id_map)) + apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(ioapic, 0); @@ -2266,7 +2276,8 @@ int __init io_apic_get_unique_id (int io apic_id = i; } - apic_id_map |= apicid_to_cpu_present(apic_id); + tmp = apicid_to_cpu_present(apic_id); + physids_or(apic_id_map, apic_id_map, tmp); if (reg_00.bits.ID != apic_id) { reg_00.bits.ID = apic_id; diff -prauN linux-2.6.0-test1/arch/i386/kernel/irq.c wli-2.6.0-test1-37/arch/i386/kernel/irq.c --- linux-2.6.0-test1/arch/i386/kernel/irq.c 2003-07-13 20:30:01.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/kernel/irq.c 2003-07-14 08:41:02.000000000 -0700 @@ -45,8 +45,6 @@ #include #include - - /* * Linux has a controller-independent x86 interrupt architecture. * every controller has a 'controller-template', that is used @@ -403,7 +401,8 @@ void enable_irq(unsigned int irq) * SMP cross-CPU interrupts have their own specific * handlers). */ -asmlinkage unsigned int do_IRQ(struct pt_regs regs) +struct pt_regs * IRQHANDLER(do_IRQ(struct pt_regs *regs)); +struct pt_regs * do_IRQ(struct pt_regs *regs) { /* * We ack quickly, we don't want the irq controller @@ -415,7 +414,7 @@ asmlinkage unsigned int do_IRQ(struct pt * 0 return value means that this irq is already being * handled by some other CPU. (or is disabled) */ - int irq = regs.orig_eax & 0xff; /* high bits used in ret_from_ code */ + int irq = regs->orig_eax & 0xff; /* high bits used in ret_from_ code */ irq_desc_t *desc = irq_desc + irq; struct irqaction * action; unsigned int status; @@ -428,7 +427,7 @@ asmlinkage unsigned int do_IRQ(struct pt long esp; __asm__ __volatile__("andl %%esp,%0" : - "=r" (esp) : "0" (8191)); + "=r" (esp) : "0" (THREAD_SIZE - 1)); if (unlikely(esp < (sizeof(struct thread_info) + 1024))) { printk("do_IRQ: stack overflow: %ld\n", esp - sizeof(struct thread_info)); @@ -481,7 +480,7 @@ asmlinkage unsigned int do_IRQ(struct pt irqreturn_t action_ret; spin_unlock(&desc->lock); - action_ret = handle_IRQ_event(irq, ®s, action); + action_ret = handle_IRQ_event(irq, regs, action); spin_lock(&desc->lock); if (!noirqdebug) note_interrupt(irq, desc, action_ret); @@ -501,7 +500,7 @@ out: irq_exit(); - return 1; + return regs; } /** @@ -889,13 +888,13 @@ int setup_irq(unsigned int irq, struct i static struct proc_dir_entry * root_irq_dir; static struct proc_dir_entry * irq_dir [NR_IRQS]; -#define HEX_DIGITS 8 +#define HEX_DIGITS (2*sizeof(cpumask_t)) -static unsigned int parse_hex_value (const char __user *buffer, - unsigned long count, unsigned long *ret) +static unsigned int parse_hex_value(const char __user *buffer, + unsigned long count, cpumask_t *ret) { - unsigned char hexnum [HEX_DIGITS]; - unsigned long value; + unsigned char hexnum[HEX_DIGITS]; + cpumask_t value = CPU_MASK_NONE; int i; if (!count) @@ -909,10 +908,10 @@ static unsigned int parse_hex_value (con * Parse the first 8 characters as a hex string, any non-hex char * is end-of-string. '00e1', 'e1', '00E1', 'E1' are all the same. */ - value = 0; for (i = 0; i < count; i++) { unsigned int c = hexnum[i]; + int k; switch (c) { case '0' ... 
'9': c -= '0'; break; @@ -921,7 +920,10 @@ static unsigned int parse_hex_value (con default: goto out; } - value = (value << 4) | c; + cpus_shift_left(value, value, 4); + for (k = 0; k < 4; ++k) + if (test_bit(k, (unsigned long *)&c)) + cpu_set(k, value); } out: *ret = value; @@ -930,22 +932,35 @@ out: #ifdef CONFIG_SMP -static struct proc_dir_entry * smp_affinity_entry [NR_IRQS]; +static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; + +cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; -unsigned long irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = ~0UL }; -static int irq_affinity_read_proc (char *page, char **start, off_t off, +static int irq_affinity_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { + int k, len; + cpumask_t tmp = irq_affinity[(long)data]; + if (count < HEX_DIGITS+1) return -EINVAL; - return sprintf (page, "%08lx\n", irq_affinity[(long)data]); + + len = 0; + for (k = 0; k < sizeof(cpumask_t)/sizeof(u16); ++k) { + int j = sprintf(page, "%04hx", (u16)cpus_coerce(tmp)); + len += j; + page += j; + cpus_shift_right(tmp, tmp, 16); + } + len += sprintf(page, "\n"); + return len; } -static int irq_affinity_write_proc (struct file *file, const char __user *buffer, +static int irq_affinity_write_proc(struct file *file, const char __user *buffer, unsigned long count, void *data) { - int irq = (long) data, full_count = count, err; - unsigned long new_value; + int irq = (long)data, full_count = count, err; + cpumask_t new_value, tmp; if (!irq_desc[irq].handler->set_affinity) return -EIO; @@ -957,11 +972,13 @@ static int irq_affinity_write_proc (stru * way to make the system unusable accidentally :-) At least * one online CPU still has to be targeted. */ - if (!(new_value & cpu_online_map)) + cpus_and(tmp, new_value, cpu_online_map); + if (cpus_empty(tmp)) return -EINVAL; irq_affinity[irq] = new_value; - irq_desc[irq].handler->set_affinity(irq, new_value); + irq_desc[irq].handler->set_affinity(irq, + cpumask_of_cpu(first_cpu(new_value))); return full_count; } @@ -980,8 +997,9 @@ static int prof_cpu_mask_read_proc (char static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer, unsigned long count, void *data) { - unsigned long *mask = (unsigned long *) data, full_count = count, err; - unsigned long new_value; + cpumask_t *mask = (cpumask_t *)data; + unsigned long full_count = count, err; + cpumask_t new_value; err = parse_hex_value(buffer, count, &new_value); if (err) diff -prauN linux-2.6.0-test1/arch/i386/kernel/ldt.c wli-2.6.0-test1-37/arch/i386/kernel/ldt.c --- linux-2.6.0-test1/arch/i386/kernel/ldt.c 2003-07-13 20:36:33.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/kernel/ldt.c 2003-07-14 06:31:09.000000000 -0700 @@ -56,9 +56,11 @@ static int alloc_ldt(mm_context_t *pc, i if (reload) { #ifdef CONFIG_SMP + cpumask_t mask; preempt_disable(); load_LDT(pc); - if (current->mm->cpu_vm_mask != (1 << smp_processor_id())) + mask = cpumask_of_cpu(smp_processor_id()); + if (!cpus_equal(current->mm->cpu_vm_mask, mask)) smp_call_function(flush_ldt, 0, 1, 1); preempt_enable(); #else diff -prauN linux-2.6.0-test1/arch/i386/kernel/mpparse.c wli-2.6.0-test1-37/arch/i386/kernel/mpparse.c --- linux-2.6.0-test1/arch/i386/kernel/mpparse.c 2003-07-13 20:32:34.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/kernel/mpparse.c 2003-07-14 06:33:08.000000000 -0700 @@ -71,7 +71,7 @@ unsigned int boot_cpu_logical_apicid = - static unsigned int __initdata num_processors; /* Bitmask of physically existing CPUs */ -unsigned 
long phys_cpu_present_map; +physid_mask_t phys_cpu_present_map; u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; @@ -106,6 +106,7 @@ static struct mpc_config_translation *tr void __init MP_processor_info (struct mpc_config_processor *m) { int ver, apicid; + physid_mask_t tmp; if (!(m->mpc_cpuflag & CPU_ENABLED)) return; @@ -176,7 +177,8 @@ void __init MP_processor_info (struct mp } ver = m->mpc_apicver; - phys_cpu_present_map |= apicid_to_cpu_present(apicid); + tmp = apicid_to_cpu_present(apicid); + physids_or(phys_cpu_present_map, phys_cpu_present_map, tmp); /* * Validate version diff -prauN linux-2.6.0-test1/arch/i386/kernel/msr.c wli-2.6.0-test1-37/arch/i386/kernel/msr.c --- linux-2.6.0-test1/arch/i386/kernel/msr.c 2003-07-13 20:33:45.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/kernel/msr.c 2003-07-14 06:31:09.000000000 -0700 @@ -242,7 +242,7 @@ static int msr_open(struct inode *inode, int cpu = minor(file->f_dentry->d_inode->i_rdev); struct cpuinfo_x86 *c = &(cpu_data)[cpu]; - if ( !(cpu_online_map & (1UL << cpu)) ) + if (!cpu_online(cpu)) return -ENXIO; /* No such CPU */ if ( !cpu_has(c, X86_FEATURE_MSR) ) return -EIO; /* MSR not supported */ diff -prauN linux-2.6.0-test1/arch/i386/kernel/process.c wli-2.6.0-test1-37/arch/i386/kernel/process.c --- linux-2.6.0-test1/arch/i386/kernel/process.c 2003-07-13 20:28:51.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/kernel/process.c 2003-07-14 08:41:02.000000000 -0700 @@ -160,7 +160,25 @@ static int __init idle_setup (char *str) __setup("idle=", idle_setup); -void show_regs(struct pt_regs * regs) +void stack_overflow(unsigned long esp, unsigned long eip) +{ + int panicing = ((esp&(THREAD_SIZE-1)) <= STACK_PANIC); + + printk( "esp: 0x%lx masked: 0x%lx STACK_PANIC:0x%x %d %d\n", + esp, (esp&(THREAD_SIZE-1)), STACK_PANIC, (((esp&(THREAD_SIZE-1)) <= STACK_PANIC)), panicing ); + + if (panicing) + print_symbol("stack overflow from %s\n", eip); + else + print_symbol("excessive stack use from %s\n", eip); + printk("esp: %p\n", (void *)esp); + show_trace(NULL, (void *)esp); + + if (panicing) + panic("stack overflow\n"); +} + +asmlinkage void show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; @@ -449,6 +467,7 @@ struct task_struct * __switch_to(struct /* never put a printk in __switch_to... 
printk() calls wake_up*() indirectly */ + next_p->thread_info->irq_stack = prev_p->thread_info->irq_stack; unlazy_fpu(prev_p); /* diff -prauN linux-2.6.0-test1/arch/i386/kernel/reboot.c wli-2.6.0-test1-37/arch/i386/kernel/reboot.c --- linux-2.6.0-test1/arch/i386/kernel/reboot.c 2003-07-13 20:32:33.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/kernel/reboot.c 2003-07-14 06:31:09.000000000 -0700 @@ -226,7 +226,7 @@ void machine_restart(char * __unused) if its not, default to the BSP */ if ((reboot_cpu == -1) || (reboot_cpu > (NR_CPUS -1)) || - !(phys_cpu_present_map & (1< */ -static volatile unsigned long flush_cpumask; +static volatile cpumask_t flush_cpumask; static struct mm_struct * flush_mm; static unsigned long flush_va; static spinlock_t tlbstate_lock = SPIN_LOCK_UNLOCKED; @@ -255,7 +258,7 @@ static inline void leave_mm (unsigned lo { if (cpu_tlbstate[cpu].state == TLBSTATE_OK) BUG(); - clear_bit(cpu, &cpu_tlbstate[cpu].active_mm->cpu_vm_mask); + cpu_clear(cpu, cpu_tlbstate[cpu].active_mm->cpu_vm_mask); load_cr3(swapper_pg_dir); } @@ -265,7 +268,7 @@ static inline void leave_mm (unsigned lo * [cpu0: the cpu that switches] * 1) switch_mm() either 1a) or 1b) * 1a) thread switch to a different mm - * 1a1) clear_bit(cpu, &old_mm->cpu_vm_mask); + * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); * Stop ipi delivery for the old mm. This is not synchronized with * the other cpus, but smp_invalidate_interrupt ignore flush ipis * for the wrong mm, and in the worst case we perform a superflous @@ -275,7 +278,7 @@ static inline void leave_mm (unsigned lo * was in lazy tlb mode. * 1a3) update cpu_tlbstate[].active_mm * Now cpu0 accepts tlb flushes for the new mm. - * 1a4) set_bit(cpu, &new_mm->cpu_vm_mask); + * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); * Now the other cpus will send tlb flush ipis. * 1a4) change cr3. * 1b) thread switch without mm change @@ -305,13 +308,14 @@ static inline void leave_mm (unsigned lo * 2) Leave the mm if we are in the lazy tlb mode. 
*/ -asmlinkage void smp_invalidate_interrupt (void) +struct pt_regs * IRQHANDLER(smp_invalidate_interrupt(struct pt_regs *regs)); +struct pt_regs * smp_invalidate_interrupt(struct pt_regs *regs) { unsigned long cpu; cpu = get_cpu(); - if (!test_bit(cpu, &flush_cpumask)) + if (!cpu_isset(cpu, flush_cpumask)) goto out; /* * This was a BUG() but until someone can quote me the @@ -332,15 +336,18 @@ asmlinkage void smp_invalidate_interrupt leave_mm(cpu); } ack_APIC_irq(); - clear_bit(cpu, &flush_cpumask); - + smp_mb__before_clear_bit(); + cpu_clear(cpu, flush_cpumask); + smp_mb__after_clear_bit(); out: put_cpu_no_resched(); + return regs; } -static void flush_tlb_others (unsigned long cpumask, struct mm_struct *mm, +static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, unsigned long va) { + cpumask_t tmp; /* * A couple of (to be removed) sanity checks: * @@ -348,14 +355,12 @@ static void flush_tlb_others (unsigned l * - current CPU must not be in mask * - mask must exist :) */ - if (!cpumask) - BUG(); - if ((cpumask & cpu_online_map) != cpumask) - BUG(); - if (cpumask & (1 << smp_processor_id())) - BUG(); - if (!mm) - BUG(); + BUG_ON(cpus_empty(cpumask)); + + cpus_and(tmp, cpumask, cpu_online_map); + BUG_ON(!cpus_equal(cpumask, tmp)); + BUG_ON(cpu_isset(smp_processor_id(), cpumask)); + BUG_ON(!mm); /* * i'm not happy about this global shared spinlock in the @@ -367,15 +372,26 @@ static void flush_tlb_others (unsigned l flush_mm = mm; flush_va = va; +#if NR_CPUS <= BITS_PER_LONG atomic_set_mask(cpumask, &flush_cpumask); +#else + { + int k; + unsigned long *flush_mask = (unsigned long *)&flush_cpumask; + unsigned long *cpu_mask = (unsigned long *)&cpumask; + for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k) + atomic_set_mask(cpu_mask[k], &flush_mask[k]); + } +#endif /* * We have to send the IPI only to * CPUs affected. */ send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR); - while (flush_cpumask) - /* nothing. lockup detection does not belong here */; + while (!cpus_empty(flush_cpumask)) + /* nothing. 
lockup detection does not belong here */ + mb(); flush_mm = NULL; flush_va = 0; @@ -385,23 +401,25 @@ static void flush_tlb_others (unsigned l void flush_tlb_current_task(void) { struct mm_struct *mm = current->mm; - unsigned long cpu_mask; + cpumask_t cpu_mask; preempt_disable(); - cpu_mask = mm->cpu_vm_mask & ~(1UL << smp_processor_id()); + cpu_mask = mm->cpu_vm_mask; + cpu_clear(smp_processor_id(), cpu_mask); local_flush_tlb(); - if (cpu_mask) + if (!cpus_empty(cpu_mask)) flush_tlb_others(cpu_mask, mm, FLUSH_ALL); preempt_enable(); } void flush_tlb_mm (struct mm_struct * mm) { - unsigned long cpu_mask; + cpumask_t cpu_mask; preempt_disable(); - cpu_mask = mm->cpu_vm_mask & ~(1UL << smp_processor_id()); + cpu_mask = mm->cpu_vm_mask; + cpu_clear(smp_processor_id(), cpu_mask); if (current->active_mm == mm) { if (current->mm) @@ -409,7 +427,7 @@ void flush_tlb_mm (struct mm_struct * mm else leave_mm(smp_processor_id()); } - if (cpu_mask) + if (!cpus_empty(cpu_mask)) flush_tlb_others(cpu_mask, mm, FLUSH_ALL); preempt_enable(); @@ -418,10 +436,11 @@ void flush_tlb_mm (struct mm_struct * mm void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) { struct mm_struct *mm = vma->vm_mm; - unsigned long cpu_mask; + cpumask_t cpu_mask; preempt_disable(); - cpu_mask = mm->cpu_vm_mask & ~(1UL << smp_processor_id()); + cpu_mask = mm->cpu_vm_mask; + cpu_clear(smp_processor_id(), cpu_mask); if (current->active_mm == mm) { if(current->mm) @@ -430,7 +449,7 @@ void flush_tlb_page(struct vm_area_struc leave_mm(smp_processor_id()); } - if (cpu_mask) + if (!cpus_empty(cpu_mask)) flush_tlb_others(cpu_mask, mm, va); preempt_enable(); @@ -457,7 +476,7 @@ void flush_tlb_all(void) */ void smp_send_reschedule(int cpu) { - send_IPI_mask(1 << cpu, RESCHEDULE_VECTOR); + send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); } /* @@ -533,7 +552,7 @@ static void stop_this_cpu (void * dummy) /* * Remove this CPU: */ - clear_bit(smp_processor_id(), &cpu_online_map); + cpu_clear(smp_processor_id(), cpu_online_map); local_irq_disable(); disable_local_APIC(); if (cpu_data[smp_processor_id()].hlt_works_ok) @@ -559,12 +578,15 @@ void smp_send_stop(void) * all the work is done automatically when * we return from the interrupt. 
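
Every IPI and irq entry point in this patch changes from "asmlinkage void handler(struct pt_regs regs)" to a function that receives a struct pt_regs * and returns one. The register frame can no longer be read as a by-value argument once the handler runs on the per-CPU interrupt stack, so the entry.S macros earlier in the patch hand the old stack's pt_regs over in %eax (hence regparm(1), per the comments there) and use the returned pointer as the stack to switch back to; a handler that does no stack juggling of its own simply returns its argument, which is why every converted handler ends in "return regs;". The shape of a converted handler -- smp_example_interrupt is a name invented here, and the IRQHANDLER macro itself is defined elsewhere in the patch, presumably as a regparm(1) attribute:

struct pt_regs *IRQHANDLER(smp_example_interrupt(struct pt_regs *regs));
struct pt_regs *smp_example_interrupt(struct pt_regs *regs)
{
	irq_enter();
	/* ... the handler body, using regs instead of &regs ... */
	irq_exit();
	return regs;	/* the stack RESTORE_FROM_IRQSTACK switches back to */
}
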
*/ -asmlinkage void smp_reschedule_interrupt(void) +struct pt_regs *IRQHANDLER(smp_reschedule_interrupt(struct pt_regs *)); +struct pt_regs *smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); + return regs; } -asmlinkage void smp_call_function_interrupt(void) +struct pt_regs *IRQHANDLER(smp_call_function_interrupt(struct pt_regs *)); +struct pt_regs *smp_call_function_interrupt(struct pt_regs *regs) { void (*func) (void *info) = call_data->func; void *info = call_data->info; @@ -588,5 +610,6 @@ asmlinkage void smp_call_function_interr mb(); atomic_inc(&call_data->finished); } + return regs; } diff -prauN linux-2.6.0-test1/arch/i386/kernel/smpboot.c wli-2.6.0-test1-37/arch/i386/kernel/smpboot.c --- linux-2.6.0-test1/arch/i386/kernel/smpboot.c 2003-07-13 20:34:03.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/kernel/smpboot.c 2003-07-14 09:29:09.000000000 -0700 @@ -61,16 +61,21 @@ static int __initdata smp_b_stepping; int smp_num_siblings = 1; int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */ -/* Bitmask of currently online CPUs */ -unsigned long cpu_online_map; +/* bitmap of online cpus */ +cpumask_t cpu_online_map; -static volatile unsigned long cpu_callin_map; -volatile unsigned long cpu_callout_map; -static unsigned long smp_commenced_mask; +static volatile cpumask_t cpu_callin_map; +volatile cpumask_t cpu_callout_map; +static cpumask_t smp_commenced_mask; /* Per CPU bogomips and other parameters */ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; +/* Per CPU interrupt stacks */ +extern union thread_union init_irq_union; +union thread_union *irq_stacks[NR_CPUS] __cacheline_aligned = + { &init_irq_union, }; + /* Set when the idlers are all forked */ int smp_threads_ready; @@ -268,7 +273,7 @@ static void __init synchronize_tsc_bp (v sum = 0; for (i = 0; i < NR_CPUS; i++) { - if (test_bit(i, &cpu_callout_map)) { + if (cpu_isset(i, cpu_callout_map)) { t0 = tsc_values[i]; sum += t0; } @@ -277,7 +282,7 @@ static void __init synchronize_tsc_bp (v sum = 0; for (i = 0; i < NR_CPUS; i++) { - if (!test_bit(i, &cpu_callout_map)) + if (!cpu_isset(i, cpu_callout_map)) continue; delta = tsc_values[i] - avg; if (delta < 0) @@ -353,7 +358,7 @@ void __init smp_callin(void) */ phys_id = GET_APIC_ID(apic_read(APIC_ID)); cpuid = smp_processor_id(); - if (test_bit(cpuid, &cpu_callin_map)) { + if (cpu_isset(cpuid, cpu_callin_map)) { printk("huh, phys CPU#%d, CPU#%d already present??\n", phys_id, cpuid); BUG(); @@ -376,7 +381,7 @@ void __init smp_callin(void) /* * Has the boot CPU finished it's STARTUP sequence? */ - if (test_bit(cpuid, &cpu_callout_map)) + if (cpu_isset(cpuid, cpu_callout_map)) break; rep_nop(); } @@ -417,7 +422,7 @@ void __init smp_callin(void) /* * Allow the master to continue. */ - set_bit(cpuid, &cpu_callin_map); + cpu_set(cpuid, cpu_callin_map); /* * Synchronize the TSC with the BP @@ -442,7 +447,7 @@ int __init start_secondary(void *unused) */ cpu_init(); smp_callin(); - while (!test_bit(smp_processor_id(), &smp_commenced_mask)) + while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) rep_nop(); setup_secondary_APIC_clock(); if (nmi_watchdog == NMI_IO_APIC) { @@ -456,7 +461,7 @@ int __init start_secondary(void *unused) * the local TLBs too. 
*/ local_flush_tlb(); - set_bit(smp_processor_id(), &cpu_online_map); + cpu_set(smp_processor_id(), cpu_online_map); wmb(); return cpu_idle(); } @@ -499,16 +504,16 @@ static struct task_struct * __init fork_ #ifdef CONFIG_NUMA /* which logical CPUs are on which nodes */ -volatile unsigned long node_2_cpu_mask[MAX_NR_NODES] = - { [0 ... MAX_NR_NODES-1] = 0 }; +cpumask_t node_2_cpu_mask[MAX_NUMNODES] = + { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE }; /* which node each logical CPU is on */ -volatile int cpu_2_node[NR_CPUS] = { [0 ... NR_CPUS-1] = 0 }; +int cpu_2_node[NR_CPUS] = { [0 ... NR_CPUS-1] = 0 }; /* set up a mapping between cpu and node. */ static inline void map_cpu_to_node(int cpu, int node) { printk("Mapping cpu %d to node %d\n", cpu, node); - node_2_cpu_mask[node] |= (1 << cpu); + cpu_set(cpu, node_2_cpu_mask[node]); cpu_2_node[cpu] = node; } @@ -518,8 +523,8 @@ static inline void unmap_cpu_to_node(int int node; printk("Unmapping cpu %d from all nodes\n", cpu); - for (node = 0; node < MAX_NR_NODES; node ++) - node_2_cpu_mask[node] &= ~(1 << cpu); + for (node = 0; node < MAX_NUMNODES; node ++) + cpu_clear(cpu, node_2_cpu_mask[node]); cpu_2_node[cpu] = -1; } #else /* !CONFIG_NUMA */ @@ -770,7 +775,25 @@ wakeup_secondary_cpu(int phys_apicid, un } #endif /* WAKE_SECONDARY_VIA_INIT */ -extern unsigned long cpu_initialized; +static void __init setup_irq_stack(task_t *task, int cpu) +{ + unsigned long stack; + + stack = __get_free_pages(GFP_KERNEL, THREAD_ORDER); + if (!task) + panic("Cannot allocate irq stack\n"); + irq_stacks[cpu] = (void *)stack; + memset(irq_stacks[cpu], 0, THREAD_SIZE); + irq_stacks[cpu]->thread_info.cpu = cpu; + irq_stacks[cpu]->thread_info.preempt_count = 1; + task->thread_info->irq_stack = &irq_stacks[cpu]->thread_info; + /* + * If we want to make the irq stack more than one unit + * deep, we can chain them off the irq_stack pointer here. + */ +} + +extern cpumask_t cpu_initialized; static int __init do_boot_cpu(int apicid) /* @@ -793,6 +816,7 @@ static int __init do_boot_cpu(int apicid idle = fork_by_hand(); if (IS_ERR(idle)) panic("failed fork for CPU %d", cpu); + setup_irq_stack(idle, cpu); wake_up_forked_process(idle); /* @@ -836,19 +860,19 @@ static int __init do_boot_cpu(int apicid * allow APs to start initializing. */ Dprintk("Before Callout %d.\n", cpu); - set_bit(cpu, &cpu_callout_map); + cpu_set(cpu, cpu_callout_map); Dprintk("After Callout %d.\n", cpu); /* * Wait 5s total for a response */ for (timeout = 0; timeout < 50000; timeout++) { - if (test_bit(cpu, &cpu_callin_map)) + if (cpu_isset(cpu, cpu_callin_map)) break; /* It has booted */ udelay(100); } - if (test_bit(cpu, &cpu_callin_map)) { + if (cpu_isset(cpu, cpu_callin_map)) { /* number CPUs logically, starting from 1 (BSP is 0) */ Dprintk("OK.\n"); printk("CPU%d: ", cpu); @@ -869,8 +893,8 @@ static int __init do_boot_cpu(int apicid if (boot_error) { /* Try to put things back the way they were before ... 
*/ unmap_cpu_to_logical_apicid(cpu); - clear_bit(cpu, &cpu_callout_map); /* was set here (do_boot_cpu()) */ - clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ + cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ + cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */ cpucount--; } @@ -957,7 +981,7 @@ static void __init smp_boot_cpus(unsigne if (!smp_found_config) { printk(KERN_NOTICE "SMP motherboard not detected.\n"); smpboot_clear_io_apic_irqs(); - phys_cpu_present_map = 1; + phys_cpu_present_map = physid_mask_of_physid(0); if (APIC_init_uniprocessor()) printk(KERN_NOTICE "Local APIC not detected." " Using dummy APIC emulation.\n"); @@ -973,7 +997,7 @@ static void __init smp_boot_cpus(unsigne if (!check_phys_apicid_present(boot_cpu_physical_apicid)) { printk("weird, boot CPU (#%d) not listed by the BIOS.\n", boot_cpu_physical_apicid); - phys_cpu_present_map |= (1 << hard_smp_processor_id()); + cpu_set(hard_smp_processor_id(), phys_cpu_present_map); } /* @@ -984,7 +1008,7 @@ static void __init smp_boot_cpus(unsigne boot_cpu_physical_apicid); printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n"); smpboot_clear_io_apic_irqs(); - phys_cpu_present_map = 1; + phys_cpu_present_map = physid_mask_of_physid(0); return; } @@ -997,7 +1021,7 @@ static void __init smp_boot_cpus(unsigne smp_found_config = 0; printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n"); smpboot_clear_io_apic_irqs(); - phys_cpu_present_map = 1; + phys_cpu_present_map = physid_mask_of_physid(0); return; } @@ -1017,10 +1041,10 @@ static void __init smp_boot_cpus(unsigne * bits 0-3 are quad0, 4-7 are quad1, etc. A perverse twist on the * clustered apic ID. */ - Dprintk("CPU present map: %lx\n", phys_cpu_present_map); + Dprintk("CPU present map: %lx\n", cpus_coerce(phys_cpu_present_map)); kicked = 1; - for (bit = 0; kicked < NR_CPUS && bit < BITS_PER_LONG; bit++) { + for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) { apicid = cpu_present_to_apicid(bit); /* * Don't even attempt to start the boot CPU! 
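
setup_irq_stack() above gives each secondary CPU's idle task a dedicated THREAD_SIZE interrupt stack whose embedded thread_info carries the cpu number and a nonzero preempt_count (so code running on it never looks preemptible); __switch_to(), earlier in the patch, propagates the irq_stack pointer across context switches. One apparent slip: the NULL check after __get_free_pages() tests task, which cannot be NULL on that path, where it presumably means the freshly allocated stack. A corrected sketch, otherwise identical to the hunk above:

static void __init setup_irq_stack(task_t *task, int cpu)
{
	unsigned long stack;

	stack = __get_free_pages(GFP_KERNEL, THREAD_ORDER);
	if (!stack)			/* the hunk above tests "!task" */
		panic("Cannot allocate irq stack\n");

	irq_stacks[cpu] = (void *)stack;
	memset(irq_stacks[cpu], 0, THREAD_SIZE);
	irq_stacks[cpu]->thread_info.cpu = cpu;
	irq_stacks[cpu]->thread_info.preempt_count = 1;
	/* interrupts on this cpu will run on this stack from now on */
	task->thread_info->irq_stack = &irq_stacks[cpu]->thread_info;
}
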
@@ -1055,7 +1079,7 @@ static void __init smp_boot_cpus(unsigne
	} else {
		unsigned long bogosum = 0;
		for (cpu = 0; cpu < NR_CPUS; cpu++)
-			if (cpu_callout_map & (1<<cpu))
+			if (cpu_isset(cpu, cpu_callout_map))
				bogosum += cpu_data[cpu].loops_per_jiffy;
[...]
diff -prauN linux-2.6.0-test1/arch/i386/kernel/vm86.c wli-2.6.0-test1-37/arch/i386/kernel/vm86.c
@@ static void mark_screen_rdonly(struct task_struct * tsk) @@
 {
+	struct mm_struct *mm = tsk->mm;
	pgd_t *pgd;
	pmd_t *pmd;
	pte_t *pte, *mapped;
	int i;

	preempt_disable();
-	spin_lock(&tsk->mm->page_table_lock);
-	pgd = pgd_offset(tsk->mm, 0xA0000);
+	spin_lock(&mm->page_table_lock);
+	pgd = pgd_offset(mm, 0xA0000);
	if (pgd_none(*pgd))
		goto out;
	if (pgd_bad(*pgd)) {
@@ -144,23 +145,26 @@ static void mark_screen_rdonly(struct ta
		pgd_clear(pgd);
		goto out;
	}
-	pmd = pmd_offset(pgd, 0xA0000);
-	if (pmd_none(*pmd))
+	pmd = pmd_offset_map(pgd, 0xA0000);
+	if (pmd_none(*pmd)) {
+		pmd_unmap(pmd);
		goto out;
-	if (pmd_bad(*pmd)) {
+	} else if (pmd_bad(*pmd)) {
		pmd_ERROR(*pmd);
		pmd_clear(pmd);
+		pmd_unmap(pmd);
		goto out;
	}
	pte = mapped = pte_offset_map(pmd, 0xA0000);
	for (i = 0; i < 32; i++) {
		if (pte_present(*pte))
-			set_pte(pte, pte_wrprotect(*pte));
+			vm_ptep_set_wrprotect(mm, pte);
		pte++;
	}
	pte_unmap(mapped);
+	pmd_unmap(pmd);
 out:
-	spin_unlock(&tsk->mm->page_table_lock);
+	spin_unlock(&mm->page_table_lock);
	preempt_enable();
	flush_tlb();
}
diff -prauN linux-2.6.0-test1/arch/i386/mach-generic/bigsmp.c wli-2.6.0-test1-37/arch/i386/mach-generic/bigsmp.c
--- linux-2.6.0-test1/arch/i386/mach-generic/bigsmp.c	2003-07-13 20:32:31.000000000 -0700
+++ wli-2.6.0-test1-37/arch/i386/mach-generic/bigsmp.c	2003-07-14 06:31:09.000000000 -0700
@@ -3,6 +3,9 @@
 * Drives the local APIC in "clustered mode".
 */
 #define APIC_DEFINITION 1
+#include
+#include
+#include
 #include
 #include
 #include
diff -prauN linux-2.6.0-test1/arch/i386/mach-generic/default.c wli-2.6.0-test1-37/arch/i386/mach-generic/default.c
--- linux-2.6.0-test1/arch/i386/mach-generic/default.c	2003-07-13 20:29:59.000000000 -0700
+++ wli-2.6.0-test1-37/arch/i386/mach-generic/default.c	2003-07-14 06:31:09.000000000 -0700
@@ -2,6 +2,9 @@
 * Default generic APIC driver. This handles upto 8 CPUs.
 */
 #define APIC_DEFINITION 1
+#include
+#include
+#include
 #include
 #include
 #include
diff -prauN linux-2.6.0-test1/arch/i386/mach-generic/probe.c wli-2.6.0-test1-37/arch/i386/mach-generic/probe.c
--- linux-2.6.0-test1/arch/i386/mach-generic/probe.c	2003-07-13 20:29:30.000000000 -0700
+++ wli-2.6.0-test1-37/arch/i386/mach-generic/probe.c	2003-07-14 06:31:09.000000000 -0700
@@ -3,6 +3,9 @@
 *
 * Generic x86 APIC driver probe layer.
 */
+#include
+#include
+#include
 #include
 #include
 #include
diff -prauN linux-2.6.0-test1/arch/i386/mach-generic/summit.c wli-2.6.0-test1-37/arch/i386/mach-generic/summit.c
--- linux-2.6.0-test1/arch/i386/mach-generic/summit.c	2003-07-13 20:38:42.000000000 -0700
+++ wli-2.6.0-test1-37/arch/i386/mach-generic/summit.c	2003-07-14 06:31:09.000000000 -0700
@@ -2,6 +2,9 @@
 * APIC driver for the IBM "Summit" chipset.
 */
 #define APIC_DEFINITION 1
+#include
+#include
+#include
 #include
 #include
 #include
diff -prauN linux-2.6.0-test1/arch/i386/mach-visws/mpparse.c wli-2.6.0-test1-37/arch/i386/mach-visws/mpparse.c
--- linux-2.6.0-test1/arch/i386/mach-visws/mpparse.c	2003-07-13 20:32:28.000000000 -0700
+++ wli-2.6.0-test1-37/arch/i386/mach-visws/mpparse.c	2003-07-14 06:31:09.000000000 -0700
@@ -26,7 +26,7 @@ unsigned int boot_cpu_physical_apicid =
 unsigned int boot_cpu_logical_apicid = -1U;

 /* Bitmask of physically existing CPUs */
-unsigned long phys_cpu_present_map;
+cpumask_t phys_cpu_present_map;

 /*
@@ -38,6 +38,7 @@ unsigned long phys_cpu_present_map;
 void __init MP_processor_info (struct mpc_config_processor *m)
 {
	int ver, logical_apicid;
+	cpumask_t apic_cpus;

	if (!(m->mpc_cpuflag & CPU_ENABLED))
		return;
@@ -62,7 +63,8 @@ void __init MP_processor_info (struct mp
	}
	ver = m->mpc_apicver;

-	phys_cpu_present_map |= apicid_to_cpu_present(m->mpc_apicid);
+	apic_cpus = apicid_to_cpu_present(m->mpc_apicid);
+	cpus_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
	/*
	 * Validate version
	 */
diff -prauN linux-2.6.0-test1/arch/i386/mach-voyager/voyager_smp.c wli-2.6.0-test1-37/arch/i386/mach-voyager/voyager_smp.c
--- linux-2.6.0-test1/arch/i386/mach-voyager/voyager_smp.c	2003-07-13 20:32:44.000000000 -0700
+++ wli-2.6.0-test1-37/arch/i386/mach-voyager/voyager_smp.c	2003-07-14 06:31:09.000000000 -0700
@@ -75,15 +75,15 @@ static int voyager_extended_cpus = 1;
 int smp_found_config = 0;

 /* Used for the invalidate map that's also checked in the spinlock */
-volatile unsigned long smp_invalidate_needed;
+static volatile unsigned long smp_invalidate_needed;

 /* Bitmask of currently online CPUs - used by setup.c for
    /proc/cpuinfo, visible externally but still physical */
-unsigned long cpu_online_map = 0;
+cpumask_t cpu_online_map = CPU_MASK_NONE;

 /* Bitmask of CPUs present in the system - exported by i386_syms.c, used
 * by scheduler but indexed physically */
-unsigned long phys_cpu_present_map = 0;
+cpumask_t phys_cpu_present_map = CPU_MASK_NONE;

 /* estimate of time used to flush the SMP-local cache - used in
 * processor affinity calculations */
@@ -108,7 +108,7 @@ static void enable_local_vic_irq(unsigne
 static void disable_local_vic_irq(unsigned int irq);
 static void before_handle_vic_irq(unsigned int irq);
 static void after_handle_vic_irq(unsigned int irq);
-static void set_vic_irq_affinity(unsigned int irq, unsigned long mask);
+static void set_vic_irq_affinity(unsigned int irq, cpumask_t mask);
 static void ack_vic_irq(unsigned int irq);
 static void vic_enable_cpi(void);
 static void do_boot_cpu(__u8 cpuid);
@@ -128,13 +128,12 @@ send_one_QIC_CPI(__u8 cpu, __u8 cpi)
 static inline void
 send_QIC_CPI(__u32 cpuset, __u8 cpi)
 {
-	int mask;
-	__u8 cpu;
+	int cpu;

-	for_each_cpu(cpu, mask) {
+	for_each_cpu(cpu, mk_cpumask_const(cpu_online_map)) {
		if(cpuset & (1<<cpu))
[...]
-	clear_bit(cpu, &cpu_tlbstate[cpu].active_mm->cpu_vm_mask);
+	cpu_clear(cpu, cpu_tlbstate[cpu].active_mm->cpu_vm_mask);
	load_cr3(swapper_pg_dir);
 }

@@ -878,7 +876,7 @@ smp_invalidate_interrupt(void)
 {
	__u8 cpu = get_cpu();

-	if(!test_bit(cpu, &smp_invalidate_needed))
+	if (!(smp_invalidate_needed & (1UL << cpu)))
		goto out;
	/* This will flood messages.
 Don't uncomment unless you see
	 * Problems with cross cpu invalidation
@@ -895,7 +893,7 @@ smp_invalidate_interrupt(void)
		} else
			leave_mm(cpu);
	}
-	clear_bit(cpu, &smp_invalidate_needed);
+	smp_invalidate_needed &= ~(1UL << cpu);
 out:
	put_cpu_no_resched();
 }
@@ -912,7 +910,7 @@ flush_tlb_others (unsigned long cpumask,
	if (!cpumask)
		BUG();
-	if ((cpumask & cpu_online_map) != cpumask)
+	if ((cpumask & cpus_coerce(cpu_online_map)) != cpumask)
		BUG();
	if (cpumask & (1 << smp_processor_id()))
		BUG();
@@ -954,7 +952,7 @@ flush_tlb_current_task(void)

	preempt_disable();

-	cpu_mask = mm->cpu_vm_mask & ~(1 << smp_processor_id());
+	cpu_mask = cpus_coerce(mm->cpu_vm_mask) & ~(1 << smp_processor_id());
	local_flush_tlb();
	if (cpu_mask)
		flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
@@ -970,7 +968,7 @@ flush_tlb_mm (struct mm_struct * mm)

	preempt_disable();

-	cpu_mask = mm->cpu_vm_mask & ~(1 << smp_processor_id());
+	cpu_mask = cpus_coerce(mm->cpu_vm_mask) & ~(1 << smp_processor_id());

	if (current->active_mm == mm) {
		if (current->mm)
@@ -991,7 +989,7 @@ void flush_tlb_page(struct vm_area_struc

	preempt_disable();

-	cpu_mask = mm->cpu_vm_mask & ~(1 << smp_processor_id());
+	cpu_mask = cpus_coerce(mm->cpu_vm_mask) & ~(1 << smp_processor_id());

	if (current->active_mm == mm) {
		if(current->mm)
			__flush_tlb_one(va);
@@ -1033,7 +1031,7 @@ static void
 smp_stop_cpu_function(void *dummy)
 {
	VDEBUG(("VOYAGER SMP: CPU%d is STOPPING\n", smp_processor_id()));
-	clear_bit(smp_processor_id(), &cpu_online_map);
+	cpu_clear(smp_processor_id(), cpu_online_map);
	local_irq_disable();
	for(;;)
		__asm__("hlt");
@@ -1100,7 +1098,7 @@ smp_call_function (void (*func) (void *i
			int wait)
 {
	struct call_data_struct data;
-	__u32 mask = cpu_online_map;
+	__u32 mask = cpus_coerce(cpu_online_map);

	mask &= ~(1<<smp_processor_id());
[...]
+	if (node >= numnodes)
+		return;
+
+	if (!node) {
+		vaddr = (unsigned long)alloc_bootmem(PER_CPU_PAGES*PAGE_SIZE);
+		__per_cpu_offset[cpu] = vaddr - (unsigned long)__per_cpu_start;
+	} else {
+		int k;
+		vaddr = (unsigned long)node_remap_start_vaddr[node];
+		for (k = 0, cpu_in_node = 0; k < cpu; ++k)
+			if (cpu_isset(k, nodemask))
+				++cpu_in_node;
+		__per_cpu_offset[cpu] = vaddr + PAGE_SIZE*MEM_MAP_SIZE(node)
+				+ PAGE_SIZE*PFN_UP(sizeof(pg_data_t))
+				+ PAGE_SIZE*PER_CPU_PAGES*cpu_in_node
+				- (unsigned long)__per_cpu_start;
+	}
+	memcpy(RELOC_HIDE((char *)__per_cpu_start, __per_cpu_offset[cpu]),
+		__per_cpu_start,
+		PER_CPU_PAGES*PAGE_SIZE);
+}
+
+void __init setup_per_cpu_areas(void)
+{
+	int cpu;
+	for (cpu = 0; cpu < NR_CPUS; ++cpu)
+		allocate_per_cpu_pages(cpu);
+}
+
+
 /*
 * Allocate memory for the pg_data_t via a crude pre-bootmem method
 * We ought to relocate these onto their own node later on during boot.
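The allocate_per_cpu_pages() hunk above parcels each CPU's copy of the static per-CPU data out of node-local memory and records its displacement from the master copy in __per_cpu_offset[cpu]; a per-CPU reference is then just "address of the master copy plus the owning CPU's offset". A standalone sketch of that displacement arithmetic follows; one malloc'd block per CPU stands in for the node-local remap, and the raw pointer subtraction is the same trick the kernel's RELOC_HIDE() exists to hide from the optimizer (strictly undefined in ISO C, glossed over here):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NR_CPUS 4
#define PER_CPU_BYTES 4096	/* stands in for PER_CPU_PAGES*PAGE_SIZE */

/* master copy of the "per-CPU section": here, a single counter */
static long master_counter;
static char *percpu_block[NR_CPUS];
static long __per_cpu_offset[NR_CPUS];

/* per_cpu(var, cpu): the master address displaced by the owner's offset */
#define per_cpu(var, cpu) \
	(*(long *)((char *)&(var) + __per_cpu_offset[cpu]))

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		percpu_block[cpu] = malloc(PER_CPU_BYTES);
		/* seed each replica from the master copy, as the memcpy() above does */
		memcpy(percpu_block[cpu], &master_counter, sizeof(master_counter));
		__per_cpu_offset[cpu] = percpu_block[cpu] - (char *)&master_counter;
	}
	per_cpu(master_counter, 2) = 42;
	printf("%ld %ld\n", per_cpu(master_counter, 2), per_cpu(master_counter, 0));
	return 0;
}

The NUMA-aware branch in the patch does nothing more than compute that same offset so it lands inside the owning node's remapped kernel virtual area instead of in bootmem.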
@@ -203,13 +243,11 @@ static unsigned long calculate_numa_rema unsigned long size, reserve_pages = 0; for (nid = 1; nid < numnodes; nid++) { - /* calculate the size of the mem_map needed in bytes */ - size = (node_end_pfn[nid] - node_start_pfn[nid] + 1) - * sizeof(struct page) + sizeof(pg_data_t); - /* convert size to large (pmd size) pages, rounding up */ - size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; - /* now the roundup is correct, convert to PAGE_SIZE pages */ - size = size * PTRS_PER_PTE; + /* calculate the size of the mem_map needed in pages */ + size = MEM_MAP_SIZE(nid) + PFN_UP(sizeof(pg_data_t)) + + PER_CPU_PAGES*MAX_NODE_CPUS; + /* round up to nearest pmd boundary */ + size = (size + PTRS_PER_PTE - 1) & ~(PTRS_PER_PTE - 1); printk("Reserving %ld pages of KVA for lmem_map of node %d\n", size, nid); node_remap_size[nid] = size; diff -prauN linux-2.6.0-test1/arch/i386/mm/fault.c wli-2.6.0-test1-37/arch/i386/mm/fault.c --- linux-2.6.0-test1/arch/i386/mm/fault.c 2003-07-13 20:28:54.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/mm/fault.c 2003-07-14 06:49:00.000000000 -0700 @@ -247,6 +247,13 @@ no_context: printk(" printing eip:\n"); printk("%08lx\n", regs->eip); asm("movl %%cr3,%0":"=r" (page)); +#ifdef CONFIG_HIGHPMD /* Oh boy. Error reporting is going to blow major goats. */ + printk(KERN_ALERT "%%cr3 = 0x%lx\n", page); + /* Mask off flag bits. It should end up 32B-aligned. */ + page &= ~(PTRS_PER_PGD*sizeof(pgd_t) - 1); + printk(KERN_ALERT "*pdpte = 0x%Lx\n", + pgd_val(((pgd_t *)__va(page))[address >> PGDIR_SHIFT])); +#else /* !CONFIG_HIGHPMD */ page = ((unsigned long *) __va(page))[address >> 22]; printk(KERN_ALERT "*pde = %08lx\n", page); /* @@ -262,7 +269,8 @@ no_context: page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; printk(KERN_ALERT "*pte = %08lx\n", page); } -#endif +#endif /* !CONFIG_HIGHPTE */ +#endif /* CONFIG_HIGHPMD */ die("Oops", regs, error_code); bust_spinlocks(0); do_exit(SIGKILL); @@ -330,8 +338,8 @@ vmalloc_fault: * and redundant with the set_pmd() on non-PAE. */ - pmd = pmd_offset(pgd, address); - pmd_k = pmd_offset(pgd_k, address); + pmd = pmd_offset_kernel(pgd, address); + pmd_k = pmd_offset_kernel(pgd_k, address); if (!pmd_present(*pmd_k)) goto no_context; set_pmd(pmd, *pmd_k); diff -prauN linux-2.6.0-test1/arch/i386/mm/highmem.c wli-2.6.0-test1-37/arch/i386/mm/highmem.c --- linux-2.6.0-test1/arch/i386/mm/highmem.c 2003-07-13 20:34:37.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/mm/highmem.c 2003-07-14 08:29:24.000000000 -0700 @@ -1,22 +1,5 @@ #include -void *kmap(struct page *page) -{ - might_sleep(); - if (page < highmem_start_page) - return page_address(page); - return kmap_high(page); -} - -void kunmap(struct page *page) -{ - if (in_interrupt()) - BUG(); - if (page < highmem_start_page) - return; - kunmap_high(page); -} - /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -25,40 +8,39 @@ void kunmap(struct page *page) * However when holding an atomic kmap is is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. 
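The rewritten __kmap_atomic() below exploits the fact that each (cpu, km_type) pair owns a fixed virtual slot: if the slot's pte already maps the requested page, both set_pte() and the costly __flush_tlb_one() can be skipped, and a flush is needed only when a previously present pte is being replaced. A standalone model of that reuse check, with a plain integer standing in for the pte:

#include <stdio.h>

#define KM_TYPE_NR 16
#define NR_CPUS 2

/* one software "pte" per (cpu, type) fixmap slot; 0 models pte_none() */
static unsigned long kmap_slot[NR_CPUS * KM_TYPE_NR];
static unsigned long tlb_flushes;

static void *kmap_atomic_model(unsigned long pfn, int type, int cpu)
{
	int idx = type + KM_TYPE_NR * cpu;
	unsigned long old = kmap_slot[idx];
	unsigned long new = pfn + 1;		/* +1 so "present" is nonzero */

	if (old != new) {			/* pte_same() fails: rewrite the slot */
		kmap_slot[idx] = new;
		if (old)			/* only a present old pte needs a flush */
			tlb_flushes++;
	}
	return &kmap_slot[idx];			/* stands in for the slot's fixed vaddr */
}

int main(void)
{
	kmap_atomic_model(100, 0, 0);	/* slot was empty: write, no flush */
	kmap_atomic_model(100, 0, 0);	/* same page again: no write, no flush */
	kmap_atomic_model(200, 0, 0);	/* new page: write plus one flush */
	printf("flushes: %lu\n", tlb_flushes);	/* prints 1 */
	return 0;
}

This is why the patch can also drop the unconditional pte_none() expectation of the old code outside of CONFIG_DEBUG_HIGHMEM: a slot is now allowed to stay populated between uses.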
*/ -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page, enum km_type type, unsigned long vaddr) { enum fixed_addresses idx; - unsigned long vaddr; + unsigned long offset = KM_TYPE_NR*smp_processor_id(); + pte_t old_pte, pte, *kpte; - inc_preempt_count(); - if (page < highmem_start_page) - return page_address(page); - - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + idx = type + offset; + vaddr -= PAGE_SIZE*offset; + kpte = kmap_pte - idx; + old_pte = *kpte; #ifdef CONFIG_DEBUG_HIGHMEM - if (!pte_none(*(kmap_pte-idx))) - BUG(); + BUG_ON(!pte_none(old_pte)); #endif - set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); - __flush_tlb_one(vaddr); - - return (void*) vaddr; + pte = mk_pte(page, kmap_prot); + if (!pte_same(old_pte, pte)) { + set_pte(kpte, pte); + if (!pte_none(old_pte)) + __flush_tlb_one(vaddr); + } + return (void *)vaddr; } -void kunmap_atomic(void *kvaddr, enum km_type type) -{ #ifdef CONFIG_DEBUG_HIGHMEM - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); +void __kunmap_atomic(void *kvaddr, enum km_type type, unsigned long vaddr) +{ + unsigned long offset = KM_TYPE_NR*smp_processor_id(); + unsigned long uvaddr = (unsigned long) kvaddr & PAGE_MASK; + enum fixed_addresses idx; - if (vaddr < FIXADDR_START) { // FIXME - dec_preempt_count(); - return; - } + idx = type + offset; + vaddr -= PAGE_SIZE*offset; - if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)) - BUG(); + BUG_ON(uvaddr != vaddr); /* * force other mappings to Oops if they'll try to access @@ -66,21 +48,5 @@ void kunmap_atomic(void *kvaddr, enum km */ pte_clear(kmap_pte-idx); __flush_tlb_one(vaddr); -#endif - - dec_preempt_count(); } - -struct page *kmap_atomic_to_page(void *ptr) -{ - unsigned long idx, vaddr = (unsigned long)ptr; - pte_t *pte; - - if (vaddr < FIXADDR_START) - return virt_to_page(ptr); - - idx = virt_to_fix(vaddr); - pte = kmap_pte - (idx - FIX_KMAP_BEGIN); - return pte_page(*pte); -} - +#endif diff -prauN linux-2.6.0-test1/arch/i386/mm/hugetlbpage.c wli-2.6.0-test1-37/arch/i386/mm/hugetlbpage.c --- linux-2.6.0-test1/arch/i386/mm/hugetlbpage.c 2003-07-13 20:38:02.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/mm/hugetlbpage.c 2003-07-14 08:52:52.000000000 -0700 @@ -87,8 +87,8 @@ static pte_t *huge_pte_alloc(struct mm_s pmd_t *pmd = NULL; pgd = pgd_offset(mm, addr); - pmd = pmd_alloc(mm, pgd, addr); - return (pte_t *) pmd; + pmd = pmd_alloc_map(mm, pgd, addr); + return (pte_t *)pmd; } static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) @@ -97,11 +97,13 @@ static pte_t *huge_pte_offset(struct mm_ pmd_t *pmd = NULL; pgd = pgd_offset(mm, addr); - pmd = pmd_offset(pgd, addr); - return (pte_t *) pmd; + pmd = pmd_offset_map_nested(pgd, addr); + return (pte_t *)pmd; } -static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page, pte_t * page_table, int write_access) +static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, + struct page *page, pte_t * page_table, + unsigned long addr, int write_access) { pte_t entry; @@ -114,6 +116,7 @@ static void set_huge_pte(struct mm_struc entry = pte_mkyoung(entry); mk_pte_huge(entry); set_pte(page_table, entry); + vm_account_huge_inc(vma, *page_table, addr); } /* @@ -145,6 +148,8 @@ int copy_hugetlb_page_range(struct mm_st ptepage = pte_page(entry); get_page(ptepage); set_pte(dst_pte, entry); + pmd_unmap(dst_pte); + pmd_unmap_nested(src_pte); 
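With page tables allowed in highmem (CONFIG_HIGHPTE, and CONFIG_HIGHPMD in this patch), a pmd page may itself need a temporary kernel mapping before it can be walked, which is why every pmd_offset_map()/pmd_offset_map_nested() in these hunks is paired with a pmd_unmap()/pmd_unmap_nested() on every exit path, early returns included. A schematic of that discipline, with stub stand-ins for the patch's mapping API (the real functions kmap the pmd page and return its mapped address):

#include <stdio.h>

typedef struct { unsigned long v; } pmd_t;
static pmd_t pmd_storage;

static pmd_t *pmd_offset_map(unsigned long addr)
{
	printf("map pmd for %#lx\n", addr);	/* real code: kmap_atomic() */
	return &pmd_storage;
}

static void pmd_unmap(pmd_t *pmd)
{
	(void)pmd;
	printf("unmap pmd\n");			/* real code: kunmap_atomic() */
}

static int pmd_none(const pmd_t *pmd)
{
	return pmd->v == 0;
}

/* the discipline: every path out of the walk unmaps what it mapped */
static int walk_one(unsigned long addr)
{
	pmd_t *pmd = pmd_offset_map(addr);

	if (pmd_none(pmd)) {
		pmd_unmap(pmd);		/* the early exit still unmaps */
		return -1;
	}
	/* ... use the pmd ... */
	pmd_unmap(pmd);			/* and so does the normal exit */
	return 0;
}

int main(void)
{
	pmd_storage.v = 0;
	walk_one(0xA0000);		/* takes the pmd_none() exit */
	return 0;
}

Forgetting one unmap leaks an atomic kmap slot, which is exactly the class of bug the added pmd_unmap() calls in the hugetlb and vm86 hunks are guarding against.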
dst->rss += (HPAGE_SIZE / PAGE_SIZE); addr += HPAGE_SIZE; } @@ -182,6 +187,7 @@ follow_hugetlb_page(struct mm_struct *mm get_page(page); pages[i] = page; + pmd_unmap_nested(pte); } if (vmas) @@ -271,6 +277,7 @@ follow_huge_pmd(struct mm_struct *mm, un page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); get_page(page); } + pmd_unmap(pmd); return page; } #endif @@ -278,7 +285,7 @@ follow_huge_pmd(struct mm_struct *mm, un void free_huge_page(struct page *page) { BUG_ON(page_count(page)); - BUG_ON(page->mapping); + BUG_ON(page_mapping(page)); INIT_LIST_HEAD(&page->list); @@ -314,6 +321,8 @@ void unmap_hugepage_range(struct vm_area page = pte_page(*pte); huge_page_release(page); pte_clear(pte); + vm_account_huge_dec(vma, *pte, address); + pmd_unmap_nested(pte); } mm->rss -= (end - start) >> PAGE_SHIFT; flush_tlb_range(vma, start, end); @@ -348,8 +357,10 @@ int hugetlb_prefault(struct address_spac ret = -ENOMEM; goto out; } - if (!pte_none(*pte)) + if (!pte_none(*pte)) { + pmd_unmap(pte); continue; + } idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); @@ -358,16 +369,19 @@ int hugetlb_prefault(struct address_spac page = alloc_hugetlb_page(); if (!page) { ret = -ENOMEM; + pmd_unmap(pte); goto out; } ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); unlock_page(page); if (ret) { free_huge_page(page); + pmd_unmap(pte); goto out; } } - set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE); + set_huge_pte(mm, vma, page, pte, addr, vma->vm_flags & VM_WRITE); + pmd_unmap(pte); } out: spin_unlock(&mm->page_table_lock); diff -prauN linux-2.6.0-test1/arch/i386/mm/init.c wli-2.6.0-test1-37/arch/i386/mm/init.c --- linux-2.6.0-test1/arch/i386/mm/init.c 2003-07-13 20:36:43.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/mm/init.c 2003-07-14 07:23:47.000000000 -0700 @@ -59,10 +59,10 @@ static pmd_t * __init one_md_table_init( #ifdef CONFIG_X86_PAE pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - if (pmd_table != pmd_offset(pgd, 0)) + if (pmd_table != pmd_offset_kernel(pgd, 0)) BUG(); #else - pmd_table = pmd_offset(pgd, 0); + pmd_table = pmd_offset_kernel(pgd, 0); #endif return pmd_table; @@ -113,7 +113,7 @@ static void __init page_table_range_init if (pgd_none(*pgd)) one_md_table_init(pgd); - pmd = pmd_offset(pgd, vaddr); + pmd = pmd_offset_kernel(pgd, vaddr); for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { if (pmd_none(*pmd)) one_page_table_init(pmd); @@ -194,7 +194,7 @@ pte_t *kmap_pte; pgprot_t kmap_prot; #define kmap_get_fixmap_pte(vaddr) \ - pte_offset_kernel(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)) + pte_offset_kernel(pmd_offset_kernel(pgd_offset_k(vaddr), (vaddr)), (vaddr)) void __init kmap_init(void) { @@ -218,7 +218,7 @@ void __init permanent_kmaps_init(pgd_t * page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); pgd = swapper_pg_dir + pgd_index(vaddr); - pmd = pmd_offset(pgd, vaddr); + pmd = pmd_offset_kernel(pgd, vaddr); pte = pte_offset_kernel(pmd, vaddr); pkmap_page_table = pte; } @@ -465,7 +465,7 @@ void __init mem_init(void) /* this will put all low memory onto the freelists */ totalram_pages += __free_all_bootmem(); - + tlb_init(); reservedpages = 0; for (tmp = 0; tmp < max_low_pfn; tmp++) /* @@ -512,20 +512,19 @@ void __init mem_init(void) #endif } -#ifdef CONFIG_X86_PAE -struct kmem_cache_s *pae_pgd_cachep; +kmem_cache_t *pgd_cache; void __init pgtable_cache_init(void) { - /* - * PAE pgds must be 16-byte aligned: - */ - 
pae_pgd_cachep = kmem_cache_create("pae_pgd", 32, 0, - SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, NULL, NULL); - if (!pae_pgd_cachep) - panic("init_pae(): Cannot alloc pae_pgd SLAB cache"); + pgd_cache = kmem_cache_create("pgd", + PTRS_PER_PGD*sizeof(pgd_t), + 0, + SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, + pgd_ctor, + PTRS_PER_PMD == 1 ? pgd_dtor : NULL); + if (!pgd_cache) + panic("pagetable_cache_init(): Cannot create pgd cache"); } -#endif /* * This function cannot be __init, since exceptions don't work in that diff -prauN linux-2.6.0-test1/arch/i386/mm/ioremap.c wli-2.6.0-test1-37/arch/i386/mm/ioremap.c --- linux-2.6.0-test1/arch/i386/mm/ioremap.c 2003-07-13 20:32:28.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/mm/ioremap.c 2003-07-14 06:49:00.000000000 -0700 @@ -82,7 +82,7 @@ static int remap_area_pages(unsigned lon spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_kernel(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.6.0-test1/arch/i386/mm/pageattr.c wli-2.6.0-test1-37/arch/i386/mm/pageattr.c --- linux-2.6.0-test1/arch/i386/mm/pageattr.c 2003-07-13 20:33:41.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/mm/pageattr.c 2003-07-14 07:24:15.000000000 -0700 @@ -23,7 +23,7 @@ static inline pte_t *lookup_address(unsi pmd_t *pmd; if (pgd_none(*pgd)) return NULL; - pmd = pmd_offset(pgd, address); + pmd = pmd_offset_kernel(pgd, address); if (pmd_none(*pmd)) return NULL; if (pmd_large(*pmd)) @@ -67,19 +67,22 @@ static void flush_kernel_map(void *dummy static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) { + struct page *page; + unsigned long flags; + set_pte_atomic(kpte, pte); /* change init_mm */ -#ifndef CONFIG_X86_PAE - { - struct list_head *l; - spin_lock(&mmlist_lock); - list_for_each(l, &init_mm.mmlist) { - struct mm_struct *mm = list_entry(l, struct mm_struct, mmlist); - pmd_t *pmd = pmd_offset(pgd_offset(mm, address), address); - set_pte_atomic((pte_t *)pmd, pte); - } - spin_unlock(&mmlist_lock); + if (PTRS_PER_PMD > 1) + return; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pmd_t *pmd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + pmd = pmd_offset_kernel(pgd, address); + set_pte_atomic((pte_t *)pmd, pte); } -#endif + spin_unlock_irqrestore(&pgd_lock, flags); } /* @@ -89,7 +92,7 @@ static void set_pmd_pte(pte_t *kpte, uns static inline void revert_page(struct page *kpte_page, unsigned long address) { pte_t *linear = (pte_t *) - pmd_offset(pgd_offset(&init_mm, address), address); + pmd_offset_kernel(pgd_offset_k(address), address); set_pmd_pte(linear, address, pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); diff -prauN linux-2.6.0-test1/arch/i386/mm/pgtable.c wli-2.6.0-test1-37/arch/i386/mm/pgtable.c --- linux-2.6.0-test1/arch/i386/mm/pgtable.c 2003-07-13 20:39:36.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/mm/pgtable.c 2003-07-14 09:33:21.000000000 -0700 @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -69,7 +70,7 @@ static void set_pte_pfn(unsigned long va BUG(); return; } - pmd = pmd_offset(pgd, vaddr); + pmd = pmd_offset_kernel(pgd, vaddr); if (pmd_none(*pmd)) { BUG(); return; @@ -109,7 +110,7 @@ void set_pmd_pfn(unsigned long vaddr, un printk ("set_pmd_pfn: pgd_none\n"); return; /* BUG(); */ } - pmd = pmd_offset(pgd, vaddr); + pmd = pmd_offset_kernel(pgd, vaddr); set_pmd(pmd, pfn_pmd(pfn, flags)); /* * It's enough to flush 
this one mapping. @@ -137,75 +138,253 @@ pte_t *pte_alloc_one_kernel(struct mm_st return pte; } +void tlb_init(void) +{ + int cpu; + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + int zone; + struct mmu_gather *tlb = &per_cpu(mmu_gathers, cpu); + for (zone = 0; zone < MAX_ZONE_ID; ++zone) { + INIT_LIST_HEAD(&tlb->active_list[zone]); + INIT_LIST_HEAD(&tlb->ready_list[zone]); + } + } +} + +static inline struct page *pte_alloc_fresh(int gfp_mask) +{ + struct page *page = alloc_page(gfp_mask); + if (page) { + clear_highpage(page); + if (TestSetPagePTE(page)) + BUG(); + } + return page; +} + +static inline int zone_high(struct zone *zone) +{ + if (!zone) + return 1; + else + return zone - zone->zone_pgdat->node_zones >= ZONE_HIGHMEM; +} + +static inline struct page *pte_alloc_ready(int gfp_flags) +{ + struct mmu_gather *tlb = &per_cpu(mmu_gathers, get_cpu()); + unsigned long flags; + struct page *page = NULL; + + smp_local_irq_save(flags); + if (tlb->nr_pte_ready) { + int z; + for (z = MAX_ZONE_ID - 1; z >= 0; --z) { + struct zone *zone = zone_table[z]; + if (!(gfp_flags & __GFP_HIGHMEM) && zone_high(zone)) + continue; + if (!list_empty(&tlb->ready_list[z])) + break; + } + page = list_entry(tlb->ready_list[z].next, struct page, list); + if (TestSetPagePTE(page)) + BUG(); + list_del(&page->list); + tlb->ready_count[z]--; + tlb->nr_pte_ready--; + } + smp_local_irq_restore(flags); + put_cpu(); + return page; +} + struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { - struct page *pte; + struct page *page = pte_alloc_ready(GFP_PTE); + return page ? page : pte_alloc_fresh(GFP_PTE); +} -#ifdef CONFIG_HIGHPTE - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT, 0); -#else - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0); -#endif - if (pte) - clear_highpage(pte); - return pte; +static inline struct page *__pmd_alloc_one(void) +{ + struct page *page = pte_alloc_ready(GFP_PMD); + return page ? page : pte_alloc_fresh(GFP_PMD); } -#ifdef CONFIG_X86_PAE +LIST_HEAD(pgd_list); +spinlock_t pgd_lock = SPIN_LOCK_UNLOCKED; + +void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) +{ + unsigned long flags; + + if (PTRS_PER_PMD == 1) + spin_lock_irqsave(&pgd_lock, flags); + + memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + (PTRS_PER_PGD - USER_PTRS_PER_PGD)*sizeof(pgd_t)); + + if (PTRS_PER_PMD > 1) + return; + + list_add(&virt_to_page(pgd)->lru, &pgd_list); + spin_unlock_irqrestore(&pgd_lock, flags); + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); +} + +void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) +{ + unsigned long flags; + + spin_lock_irqsave(&pgd_lock, flags); + list_del(&virt_to_page(pgd)->lru); + spin_unlock_irqrestore(&pgd_lock, flags); +} pgd_t *pgd_alloc(struct mm_struct *mm) { int i; - pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL); + pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); - if (pgd) { - for (i = 0; i < USER_PTRS_PER_PGD; i++) { - unsigned long pmd = __get_free_page(GFP_KERNEL); - if (!pmd) - goto out_oom; - clear_page(pmd); - set_pgd(pgd + i, __pgd(1 + __pa(pmd))); - } - memcpy(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + if (PTRS_PER_PMD == 1 || !pgd) + return pgd; + + for (i = 0; i < USER_PTRS_PER_PGD; i++) { + struct page *pmd = __pmd_alloc_one(); + if (!pmd) + goto out_oom; + set_pgd(&pgd[i], __pgd(1ULL | (u64)page_to_pfn(pmd) << PAGE_SHIFT)); } + return pgd; + + /* + * This looks unusual. 
 pte_free() is actually a convenient wrapper
+	 * for queueing up preconstructed pmd and/or pte pages. The cases
+	 * fall through to just queueing them in the per-cpu lists.
+	 */
 out_oom:
	for (i--; i >= 0; i--)
-		free_page((unsigned long)__va(pgd_val(pgd[i])-1));
-	kmem_cache_free(pae_pgd_cachep, pgd);
+		pte_free(pgd_page(pgd[i]));
+	kmem_cache_free(pgd_cache, pgd);
	return NULL;
 }

+
 void pgd_free(pgd_t *pgd)
 {
-	int i;
-
-	for (i = 0; i < USER_PTRS_PER_PGD; i++)
-		free_page((unsigned long)__va(pgd_val(pgd[i])-1));
-	kmem_cache_free(pae_pgd_cachep, pgd);
+	if (PTRS_PER_PMD > 1) {
+		int i;
+		for (i = 0; i < USER_PTRS_PER_PGD; i++)
+			pte_free(pgd_page(pgd[i]));
+	}
+	kmem_cache_free(pgd_cache, pgd);
 }

-#else
-
-pgd_t *pgd_alloc(struct mm_struct *mm)
+static void shrink_cpu_pagetable_cache(void *__gfp_mask)
 {
-	pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
+	int cpu, zone, high, gfp_mask = (int)__gfp_mask;
+	unsigned long flags;
+	struct mmu_gather *tlb;
+
+	high = !!(gfp_mask & __GFP_HIGHMEM);
+	cpu = get_cpu();
+	tlb = &per_cpu(mmu_gathers, cpu);
+	smp_local_irq_save(flags);
+
+	if (tlb->nr_pte_active || tlb->nr_nonpte)
+		tlb_flush(tlb);
+
+	if (tlb->nr_nonpte) {
+		free_pages_and_swap_cache(tlb->nonpte, tlb->nr_nonpte);
+		tlb->nr_nonpte = 0;
+	}

-	if (pgd) {
-		memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
-		memcpy(pgd + USER_PTRS_PER_PGD,
-			swapper_pg_dir + USER_PTRS_PER_PGD,
-			(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
+	if (tlb->nr_pte_active) {
+		for (zone = 0; zone < MAX_ZONE_ID; ++zone) {
+			if (!high && zone_high(zone_table[zone]))
+				continue;
+			if (!tlb->active_count[zone])
+				continue;
+
+			list_splice_init(&tlb->active_list[zone], &tlb->ready_list[zone]);
+			tlb->ready_count[zone] += tlb->active_count[zone];
+			tlb->active_count[zone] = 0;
+		}
+		tlb->nr_pte_ready += tlb->nr_pte_active;
+		tlb->nr_pte_active = 0;
	}
-	return pgd;
+
+	for (zone = 0; zone < MAX_ZONE_ID; ++zone) {
+		struct page *head;
+
+		if (list_empty(&tlb->ready_list[zone]))
+			continue;
+		if (!high && zone_high(zone_table[zone]))
+			continue;
+
+		head = list_entry(tlb->ready_list[zone].next, struct page, list);
+		list_del_init(&head->list);
+		list_splice_init(&tlb->ready_list[zone], &head->list);
+		head->private = tlb->ready_count[zone];
+		tlb->nr_pte_ready -= tlb->ready_count[zone];
+		tlb->ready_count[zone] = 0;
+		free_pages_bulk(zone_table[zone], head, 0);
+	}
+
+	smp_local_irq_restore(flags);
+	put_cpu();
 }

-void pgd_free(pgd_t *pgd)
+void shrink_pagetable_cache(int gfp_mask)
 {
-	free_page((unsigned long)pgd);
+	BUG_ON(irqs_disabled());
+
+	preempt_disable();
+
+	/* disables interrupts appropriately internally */
+	shrink_cpu_pagetable_cache((void *)gfp_mask);
+
+	smp_call_function(shrink_cpu_pagetable_cache, (void *)gfp_mask, 1, 1);
+	preempt_enable();
 }
-#endif /* CONFIG_X86_PAE */

+unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
+					unsigned long len, unsigned long pgoff,
+					unsigned long flags)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma, *prev;
+	len = PAGE_ALIGN(len);
+	addr = PAGE_ALIGN(addr);
+
+	if (len > TASK_SIZE)
+		return -ENOMEM;
+
+	if (addr) {
+		struct vm_area_struct *vma;
+		vma = find_vma(mm, addr);
+		if (TASK_SIZE - len >= addr && (!vma || addr + len <= vma->vm_start))
+			goto out;
+	}
+
+	if (!mm->mmap) {
+		addr = TASK_SIZE - len;
+		goto out;
+	}
+
+	addr = -ENOMEM;
+	for (prev = NULL, vma = mm->mmap; vma; prev = vma, vma = vma->vm_next) {
+		unsigned long lo, hi;
+		lo = prev ?
prev->vm_end : 0; + hi = vma->vm_start; + if (hi - lo >= len && (addr == -ENOMEM || addr < hi - len)) + addr = hi - len; + } + /* we're at the last one; let's try the top */ + if (prev && TASK_SIZE - prev->vm_end >= len) + addr = TASK_SIZE - len; +out: + return addr; +} diff -prauN linux-2.6.0-test1/arch/i386/pci/numa.c wli-2.6.0-test1-37/arch/i386/pci/numa.c --- linux-2.6.0-test1/arch/i386/pci/numa.c 2003-07-13 20:31:50.000000000 -0700 +++ wli-2.6.0-test1-37/arch/i386/pci/numa.c 2003-07-17 02:37:50.000000000 -0700 @@ -115,7 +115,7 @@ static int __init pci_numa_init(void) return 0; pci_root_bus = pcibios_scan_root(0); - if (numnodes > 1) { + if (0 && numnodes > 1) { for (quad = 1; quad < numnodes; ++quad) { printk("Scanning PCI bus %d for quad %d\n", QUADLOCAL2BUS(quad,0), quad); diff -prauN linux-2.6.0-test1/arch/ia64/ia32/binfmt_elf32.c wli-2.6.0-test1-37/arch/ia64/ia32/binfmt_elf32.c --- linux-2.6.0-test1/arch/ia64/ia32/binfmt_elf32.c 2003-07-13 20:38:02.000000000 -0700 +++ wli-2.6.0-test1-37/arch/ia64/ia32/binfmt_elf32.c 2003-07-14 07:33:22.000000000 -0700 @@ -202,7 +202,8 @@ ia32_setup_arg_pages (struct linux_binpr struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current, page, stack_base, PAGE_COPY); + put_dirty_page(current, mpnt, page, + stack_base, PAGE_COPY); } stack_base += PAGE_SIZE; } diff -prauN linux-2.6.0-test1/arch/ia64/kernel/iosapic.c wli-2.6.0-test1-37/arch/ia64/kernel/iosapic.c --- linux-2.6.0-test1/arch/ia64/kernel/iosapic.c 2003-07-13 20:34:02.000000000 -0700 +++ wli-2.6.0-test1-37/arch/ia64/kernel/iosapic.c 2003-07-14 06:31:09.000000000 -0700 @@ -274,7 +274,7 @@ unmask_irq (unsigned int irq) static void -iosapic_set_affinity (unsigned int irq, unsigned long mask) +iosapic_set_affinity (unsigned int irq, cpumask_t mask) { #ifdef CONFIG_SMP unsigned long flags; @@ -287,12 +287,10 @@ iosapic_set_affinity (unsigned int irq, irq &= (~IA64_IRQ_REDIRECTED); vec = irq_to_vector(irq); - mask &= cpu_online_map; - - if (!mask || vec >= IA64_NUM_VECTORS) + if (cpus_empty(mask) || vec >= IA64_NUM_VECTORS) return; - dest = cpu_physical_id(ffz(~mask)); + dest = cpu_physical_id(first_cpu(mask)); rte_index = iosapic_intr_info[vec].rte_index; addr = iosapic_intr_info[vec].addr; diff -prauN linux-2.6.0-test1/arch/ia64/kernel/irq.c wli-2.6.0-test1-37/arch/ia64/kernel/irq.c --- linux-2.6.0-test1/arch/ia64/kernel/irq.c 2003-07-13 20:37:32.000000000 -0700 +++ wli-2.6.0-test1-37/arch/ia64/kernel/irq.c 2003-07-14 06:31:09.000000000 -0700 @@ -898,13 +898,14 @@ int setup_irq(unsigned int irq, struct i static struct proc_dir_entry * root_irq_dir; static struct proc_dir_entry * irq_dir [NR_IRQS]; -#define HEX_DIGITS 8 +#define HEX_DIGITS (2*sizeof(cpumask_t)) -static unsigned int parse_hex_value (const char *buffer, - unsigned long count, unsigned long *ret) +static unsigned int parse_hex_value(const char *buffer, + unsigned long count, cpumask_t *ret) { - unsigned char hexnum [HEX_DIGITS]; - unsigned long value, i; + unsigned char hexnum[HEX_DIGITS]; + cpumask_t value = CPU_MASK_NONE; + unsigned long i; if (!count) return -EINVAL; @@ -917,10 +918,9 @@ static unsigned int parse_hex_value (con * Parse the first 8 characters as a hex string, any non-hex char * is end-of-string. '00e1', 'e1', '00E1', 'E1' are all the same. */ - value = 0; - for (i = 0; i < count; i++) { unsigned int c = hexnum[i]; + int k; switch (c) { case '0' ... 
 '9': c -= '0'; break;
@@ -929,7 +929,10 @@ static unsigned int parse_hex_value (con
			default:
				goto out;
		}
-		value = (value << 4) | c;
+		cpus_shift_left(value, value, 4);
+		for (k = 0; k < 4; ++k)
+			if (test_bit(k, (unsigned long *)&c))
+				cpu_set(k, value);
	}
 out:
	*ret = value;
@@ -940,12 +943,15 @@ out:

 static struct proc_dir_entry * smp_affinity_entry [NR_IRQS];

-static unsigned long irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = ~0UL };
+static cpumask_t irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
+
 static char irq_redir [NR_IRQS]; // = { [0 ... NR_IRQS-1] = 1 };

 void set_irq_affinity_info (unsigned int irq, int hwid, int redir)
 {
-	unsigned long mask = 1UL<<cpu_logical_id(hwid);
[...]
	irq_desc[irq].handler->set_affinity(irq | (redir? IA64_IRQ_REDIRECTED : 0), new_value);
@@ -1003,18 +1021,28 @@ static int irq_affinity_write_proc (stru
 static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
			int count, int *eof, void *data)
 {
-	unsigned long *mask = (unsigned long *) data;
+	cpumask_t *mask = (cpumask_t *)data;
+	cpumask_t tmp = *mask;
+	int k, len = 0;
+
	if (count < HEX_DIGITS+1)
		return -EINVAL;
-	return sprintf (page, "%08lx\n", *mask);
+
+	for (k = 0; k < sizeof(cpumask_t)/sizeof(u16); ++k) {
+		int j = sprintf(page, "%04hx", (u16)cpus_coerce(tmp));
+		len += j;
+		page += j;
+		cpus_shift_right(tmp, tmp, 16);
+	}
+	len += sprintf(page, "\n");
+	return len;
 }

 static int prof_cpu_mask_write_proc (struct file *file, const char *buffer,
					unsigned long count, void *data)
 {
-	unsigned long *mask = (unsigned long *) data;
-	int full_count = count, err;
-	unsigned long new_value;
+	cpumask_t *mask = (cpumask_t *)data;
+	unsigned long full_count = count, err;
+	cpumask_t new_value;

	err = parse_hex_value(buffer, count, &new_value);
	if (err)
@@ -1058,7 +1086,7 @@ static void register_irq_proc (unsigned
 #endif
 }

-unsigned long prof_cpu_mask = -1;
+cpumask_t prof_cpu_mask = CPU_MASK_ALL;

 void init_irq_proc (void)
 {
diff -prauN linux-2.6.0-test1/arch/ia64/kernel/perfmon.c wli-2.6.0-test1-37/arch/ia64/kernel/perfmon.c
--- linux-2.6.0-test1/arch/ia64/kernel/perfmon.c	2003-07-13 20:38:06.000000000 -0700
+++ wli-2.6.0-test1-37/arch/ia64/kernel/perfmon.c	2003-07-14 06:31:09.000000000 -0700
@@ -221,14 +221,6 @@

 #define PFM_REG_RETFLAG_SET(flags, val) do { flags &= ~PFM_REG_RETFL_MASK; flags |= (val); } while(0)

-#ifdef CONFIG_SMP
-#define PFM_CPU_ONLINE_MAP	cpu_online_map
-#define cpu_is_online(i)	(PFM_CPU_ONLINE_MAP & (1UL << i))
-#else
-#define PFM_CPU_ONLINE_MAP	1UL
-#define cpu_is_online(i)	(i==0)
-#endif
-
 /*
 * cmp0 must be the value of pmc0
 */
@@ -5354,7 +5346,7 @@ pfm_proc_info(char *page)
	p += sprintf(p, "ovfl_mask : 0x%lx\n", pmu_conf.ovfl_val);

	for(i=0; i < NR_CPUS; i++) {
-		if (cpu_is_online(i) == 0) continue;
+		if (cpu_online(i) == 0) continue;
		p += sprintf(p, "CPU%-2d overflow intrs : %lu\n", i, pfm_stats[i].pfm_ovfl_intr_count);
		p += sprintf(p, "CPU%-2d overflow cycles : %lu\n", i, pfm_stats[i].pfm_ovfl_intr_cycles);
		p += sprintf(p, "CPU%-2d overflow min : %lu\n", i, pfm_stats[i].pfm_ovfl_intr_cycles_min);
@@ -5372,7 +5364,7 @@ pfm_proc_info(char *page)
		p += sprintf(p, "CPU%-2d activations : %lu\n", i, pfm_get_cpu_data(pmu_activation_number,i));
	}

-	if (hweight64(PFM_CPU_ONLINE_MAP) == 1)
+	if (num_online_cpus() == 1)
	{
		psr = pfm_get_psr();
		ia64_srlz_d();
diff -prauN linux-2.6.0-test1/arch/ia64/kernel/setup.c wli-2.6.0-test1-37/arch/ia64/kernel/setup.c
--- linux-2.6.0-test1/arch/ia64/kernel/setup.c	2003-07-13 20:30:43.000000000 -0700
+++ wli-2.6.0-test1-37/arch/ia64/kernel/setup.c	2003-07-14 06:31:09.000000000
-0700 @@ -558,7 +558,7 @@ static void * c_start (struct seq_file *m, loff_t *pos) { #ifdef CONFIG_SMP - while (*pos < NR_CPUS && !(cpu_online_map & (1UL << *pos))) + while (*pos < NR_CPUS && !cpu_isset(*pos, cpu_online_map)) ++*pos; #endif return *pos < NR_CPUS ? cpu_data(*pos) : NULL; diff -prauN linux-2.6.0-test1/arch/ia64/kernel/smp.c wli-2.6.0-test1-37/arch/ia64/kernel/smp.c --- linux-2.6.0-test1/arch/ia64/kernel/smp.c 2003-07-13 20:38:53.000000000 -0700 +++ wli-2.6.0-test1-37/arch/ia64/kernel/smp.c 2003-07-14 06:31:09.000000000 -0700 @@ -81,7 +81,7 @@ stop_this_cpu (void) /* * Remove this CPU: */ - clear_bit(smp_processor_id(), &cpu_online_map); + cpu_clear(smp_processor_id(), cpu_online_map); max_xtp(); local_irq_disable(); cpu_halt(); diff -prauN linux-2.6.0-test1/arch/ia64/kernel/smpboot.c wli-2.6.0-test1-37/arch/ia64/kernel/smpboot.c --- linux-2.6.0-test1/arch/ia64/kernel/smpboot.c 2003-07-13 20:31:58.000000000 -0700 +++ wli-2.6.0-test1-37/arch/ia64/kernel/smpboot.c 2003-07-14 06:31:09.000000000 -0700 @@ -79,13 +79,13 @@ int cpucount; task_t *task_for_booting_cpu; /* Bitmask of currently online CPUs */ -volatile unsigned long cpu_online_map; -unsigned long phys_cpu_present_map; +cpumask_t cpu_online_map; +cpumask_t phys_cpu_present_map; /* which logical CPU number maps to which CPU (physical APIC ID) */ volatile int ia64_cpu_to_sapicid[NR_CPUS]; -static volatile unsigned long cpu_callin_map; +static volatile cpumask_t cpu_callin_map; struct smp_boot_data smp_boot_data __initdata; @@ -282,7 +282,7 @@ smp_callin (void) cpuid = smp_processor_id(); phys_id = hard_smp_processor_id(); - if (test_and_set_bit(cpuid, &cpu_online_map)) { + if (cpu_test_and_set(cpuid, cpu_online_map)) { printk(KERN_ERR "huh, phys CPU#0x%x, CPU#0x%x already present??\n", phys_id, cpuid); BUG(); @@ -327,7 +327,7 @@ smp_callin (void) /* * Allow the master to continue. 
 */
-	set_bit(cpuid, &cpu_callin_map);
+	cpu_set(cpuid, cpu_callin_map);

	Dprintk("Stack on CPU %d at about %p\n",cpuid, &cpuid);
 }
@@ -391,19 +391,19 @@ do_boot_cpu (int sapicid, int cpu)
	 */
	Dprintk("Waiting on callin_map ...");
	for (timeout = 0; timeout < 100000; timeout++) {
-		if (test_bit(cpu, &cpu_callin_map))
+		if (cpu_isset(cpu, cpu_callin_map))
			break;  /* It has booted */
		udelay(100);
	}
	Dprintk("\n");

-	if (test_bit(cpu, &cpu_callin_map)) {
+	if (cpu_isset(cpu, cpu_callin_map)) {
		/* number CPUs logically, starting from 1 (BSP is 0) */
		printk(KERN_INFO "CPU%d: CPU has booted.\n", cpu);
	} else {
		printk(KERN_ERR "Processor 0x%x/0x%x is stuck.\n", cpu, sapicid);
		ia64_cpu_to_sapicid[cpu] = -1;
-		clear_bit(cpu, &cpu_online_map);  /* was set in smp_callin() */
+		cpu_clear(cpu, cpu_online_map);  /* was set in smp_callin() */
		return -EINVAL;
	}
	return 0;
@@ -446,13 +446,14 @@ smp_build_cpu_map (void)
		ia64_cpu_to_sapicid[cpu] = -1;

	ia64_cpu_to_sapicid[0] = boot_cpu_id;
-	phys_cpu_present_map = 1;
+	cpus_clear(phys_cpu_present_map);
+	cpu_set(0, phys_cpu_present_map);

	for (cpu = 1, i = 0; i < smp_boot_data.cpu_count; i++) {
		sapicid = smp_boot_data.cpu_phys_id[i];
		if (sapicid == boot_cpu_id)
			continue;
-		phys_cpu_present_map |= (1UL << cpu);
+		cpu_set(cpu, phys_cpu_present_map);
		ia64_cpu_to_sapicid[cpu] = sapicid;
		cpu++;
	}
@@ -463,7 +464,7 @@ smp_build_cpu_map (void)
 /* on which node is each logical CPU (one cacheline even for 64 CPUs) */
 volatile char cpu_to_node_map[NR_CPUS] __cacheline_aligned;
 /* which logical CPUs are on which nodes */
-volatile unsigned long node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned;
+volatile cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned;

 /*
 * Build cpu to node mapping and initialize the per node cpu masks.
@@ -474,7 +475,7 @@ build_cpu_to_node_map (void)
	int cpu, i, node;

	for(node=0; node<MAX_NUMNODES; node++)
[...]
		if (node >= 0)
-			node_to_cpu_mask[node] |= (1UL << cpu);
+			cpu_set(cpu, node_to_cpu_mask[node]);
	}
 }
@@ -515,8 +516,8 @@ smp_prepare_cpus (unsigned int max_cpus)

	/*
	 * We have the boot CPU online for sure.
 */
-	set_bit(0, &cpu_online_map);
-	set_bit(0, &cpu_callin_map);
+	cpu_set(0, cpu_online_map);
+	cpu_set(0, cpu_callin_map);

	local_cpu_data->loops_per_jiffy = loops_per_jiffy;
	ia64_cpu_to_sapicid[0] = boot_cpu_id;
@@ -531,15 +532,18 @@ smp_prepare_cpus (unsigned int max_cpus)
	 */
	if (!max_cpus) {
		printk(KERN_INFO "SMP mode deactivated.\n");
-		cpu_online_map = phys_cpu_present_map = 1;
+		cpus_clear(cpu_online_map);
+		cpus_clear(phys_cpu_present_map);
+		cpu_set(0, cpu_online_map);
+		cpu_set(0, phys_cpu_present_map);
		return;
	}
 }

 void __devinit smp_prepare_boot_cpu(void)
 {
-	set_bit(smp_processor_id(), &cpu_online_map);
-	set_bit(smp_processor_id(), &cpu_callin_map);
+	cpu_set(smp_processor_id(), cpu_online_map);
+	cpu_set(smp_processor_id(), cpu_callin_map);
 }

 void
diff -prauN linux-2.6.0-test1/arch/ia64/kernel/time.c wli-2.6.0-test1-37/arch/ia64/kernel/time.c
--- linux-2.6.0-test1/arch/ia64/kernel/time.c	2003-07-13 20:31:50.000000000 -0700
+++ wli-2.6.0-test1-37/arch/ia64/kernel/time.c	2003-07-14 06:31:09.000000000 -0700
@@ -40,13 +40,13 @@ unsigned long last_cli_ip;
 static void
 do_profile (unsigned long ip)
 {
-	extern unsigned long prof_cpu_mask;
+	extern cpumask_t prof_cpu_mask;
	extern char _stext;

	if (!prof_buffer)
		return;

-	if (!((1UL << smp_processor_id()) & prof_cpu_mask))
+	if (!cpu_isset(smp_processor_id(), prof_cpu_mask))
		return;

	ip -= (unsigned long) &_stext;
diff -prauN linux-2.6.0-test1/arch/ia64/mm/hugetlbpage.c wli-2.6.0-test1-37/arch/ia64/mm/hugetlbpage.c
--- linux-2.6.0-test1/arch/ia64/mm/hugetlbpage.c	2003-07-13 20:34:32.000000000 -0700
+++ wli-2.6.0-test1-37/arch/ia64/mm/hugetlbpage.c	2003-07-14 08:52:52.000000000 -0700
@@ -60,9 +60,9 @@ huge_pte_alloc (struct mm_struct *mm, un
	pte_t *pte = NULL;

	pgd = pgd_offset(mm, taddr);
-	pmd = pmd_alloc(mm, pgd, taddr);
+	pmd = pmd_alloc_map(mm, pgd, taddr);
	if (pmd)
-		pte = pte_alloc_map(mm, pmd, taddr);
+		pte = pte_alloc_map(mm, pgd, &pmd, taddr);
	return pte;
 }
@@ -223,7 +223,7 @@ follow_huge_pmd(struct mm_struct *mm, un
 void free_huge_page(struct page *page)
 {
	BUG_ON(page_count(page));
-	BUG_ON(page->mapping);
+	BUG_ON(page_mapping(page));

	INIT_LIST_HEAD(&page->list);
diff -prauN linux-2.6.0-test1/arch/ia64/mm/init.c wli-2.6.0-test1-37/arch/ia64/mm/init.c
--- linux-2.6.0-test1/arch/ia64/mm/init.c	2003-07-13 20:33:41.000000000 -0700
+++ wli-2.6.0-test1-37/arch/ia64/mm/init.c	2003-07-14 06:49:00.000000000 -0700
@@ -286,10 +286,10 @@ put_kernel_page (struct page *page, unsi

	spin_lock(&init_mm.page_table_lock);
	{
-		pmd = pmd_alloc(&init_mm, pgd, address);
+		pmd = pmd_alloc_kernel(&init_mm, pgd, address);
		if (!pmd)
			goto out;
-		pte = pte_alloc_map(&init_mm, pmd, address);
+		pte = pte_alloc_map(&init_mm, pgd, &pmd, address);
		if (!pte)
			goto out;
		if (!pte_none(*pte)) {
diff -prauN linux-2.6.0-test1/arch/m68k/kernel/head.S wli-2.6.0-test1-37/arch/m68k/kernel/head.S
--- linux-2.6.0-test1/arch/m68k/kernel/head.S	2003-07-13 20:39:25.000000000 -0700
+++ wli-2.6.0-test1-37/arch/m68k/kernel/head.S	2003-07-14 06:49:00.000000000 -0700
@@ -110,7 +110,7 @@
 *
 * These routines are used by other mmu routines to get a pointer into
 * a table, if necessary a new table is allocated. These routines are working
- * basically like pmd_alloc() and pte_alloc() in <mm/memory.c>. The root
+ * basically like pmd_alloc_map() and pte_alloc_map() in <mm/memory.c>. The root
 * table needs of course only to be allocated once in mmu_get_root_table_entry,
 * so that here also some mmu specific initialization is done.
The second page
 * at the start of the kernel (the first page is unmapped later) is used for
diff -prauN linux-2.6.0-test1/arch/m68k/mm/kmap.c wli-2.6.0-test1-37/arch/m68k/mm/kmap.c
--- linux-2.6.0-test1/arch/m68k/mm/kmap.c	2003-07-13 20:38:48.000000000 -0700
+++ wli-2.6.0-test1-37/arch/m68k/mm/kmap.c	2003-07-14 06:49:00.000000000 -0700
@@ -189,7 +189,7 @@ void *__ioremap(unsigned long physaddr,
		printk ("\npa=%#lx va=%#lx ", physaddr, virtaddr);
 #endif
		pgd_dir = pgd_offset_k(virtaddr);
-		pmd_dir = pmd_alloc(&init_mm, pgd_dir, virtaddr);
+		pmd_dir = pmd_alloc_kernel(&init_mm, pgd_dir, virtaddr);
		if (!pmd_dir) {
			printk("ioremap: no mem for pmd_dir\n");
			return NULL;
diff -prauN linux-2.6.0-test1/arch/m68k/sun3x/dvma.c wli-2.6.0-test1-37/arch/m68k/sun3x/dvma.c
--- linux-2.6.0-test1/arch/m68k/sun3x/dvma.c	2003-07-13 20:30:48.000000000 -0700
+++ wli-2.6.0-test1-37/arch/m68k/sun3x/dvma.c	2003-07-14 06:49:00.000000000 -0700
@@ -102,7 +102,7 @@ inline int dvma_map_cpu(unsigned long ka
		pmd_t *pmd;
		unsigned long end2;

-		if((pmd = pmd_alloc(&init_mm, pgd, vaddr)) == NULL) {
+		if((pmd = pmd_alloc_kernel(&init_mm, pgd, vaddr)) == NULL) {
			ret = -ENOMEM;
			goto out;
		}
diff -prauN linux-2.6.0-test1/arch/mips/kernel/irixioctl.c wli-2.6.0-test1-37/arch/mips/kernel/irixioctl.c
--- linux-2.6.0-test1/arch/mips/kernel/irixioctl.c	2003-07-13 20:36:31.000000000 -0700
+++ wli-2.6.0-test1-37/arch/mips/kernel/irixioctl.c	2003-07-14 09:45:14.000000000 -0700
@@ -13,6 +13,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -35,7 +36,7 @@ static struct tty_struct *get_tty(int fd
	struct file *filp;
	struct tty_struct *ttyp = NULL;

-	spin_lock(&current->files->file_lock);
+	rcu_read_lock();
	filp = fcheck(fd);
	if(filp && filp->private_data) {
		ttyp = (struct tty_struct *) filp->private_data;
@@ -43,7 +44,7 @@ static struct tty_struct *get_tty(int fd
		if(ttyp->magic != TTY_MAGIC)
			ttyp =NULL;
	}
-	spin_unlock(&current->files->file_lock);
+	rcu_read_unlock();
	return ttyp;
 }
diff -prauN linux-2.6.0-test1/arch/mips/kernel/irq.c wli-2.6.0-test1-37/arch/mips/kernel/irq.c
--- linux-2.6.0-test1/arch/mips/kernel/irq.c	2003-07-13 20:30:43.000000000 -0700
+++ wli-2.6.0-test1-37/arch/mips/kernel/irq.c	2003-07-14 06:31:09.000000000 -0700
@@ -861,20 +861,30 @@ out:

 static struct proc_dir_entry * smp_affinity_entry [NR_IRQS];

-static unsigned long irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = ~0UL };
+static cpumask_t irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };

 static int irq_affinity_read_proc (char *page, char **start, off_t off,
			int count, int *eof, void *data)
 {
+	int len = 0, k;
+	cpumask_t tmp = irq_affinity[(long)data];
+
	if (count < HEX_DIGITS+1)
		return -EINVAL;
-	return sprintf (page, "%08lx\n", irq_affinity[(long)data]);
+	for (k = 0; k < sizeof(cpumask_t)/sizeof(u16); ++k) {
+		int j = sprintf(page, "%04hx", cpus_coerce(tmp));
+		len += j;
+		page += j;
+		cpus_shift_right(tmp, tmp, 16);
+	}
+	len += sprintf(page, "\n");
+	return len;
 }

 static int irq_affinity_write_proc (struct file *file, const char *buffer,
					unsigned long count, void *data)
 {
	int irq = (long) data, full_count = count, err;
-	unsigned long new_value;
+	cpumask_t new_value, tmp;

	if (!irq_desc[irq].handler->set_affinity)
		return -EIO;
@@ -886,7 +896,8 @@ static int irq_affinity_write_proc (stru
	 * way to make the system unusable accidentally :-) At least
	 * one online CPU still has to be targeted.
	 */
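Each of these irq_affinity_write_proc() conversions enforces the same invariant: a mask written through /proc must still intersect cpu_online_map, or an interrupt could end up targeted at no CPU at all. Standalone, the check is two mask operations into a scratch mask so that the caller's value survives intact; cpus_and() and cpus_empty() below are local models of the patch's helpers over a two-word mask:

#include <stdio.h>

#define MASK_WORDS 2
typedef struct { unsigned long bits[MASK_WORDS]; } cpumask_t;

static void cpus_and(cpumask_t *dst, const cpumask_t *a, const cpumask_t *b)
{
	int i;
	for (i = 0; i < MASK_WORDS; i++)
		dst->bits[i] = a->bits[i] & b->bits[i];
}

static int cpus_empty(const cpumask_t *m)
{
	int i;
	for (i = 0; i < MASK_WORDS; i++)
		if (m->bits[i])
			return 0;
	return 1;
}

/* accept the new mask only if it targets at least one online CPU */
static int affinity_ok(const cpumask_t *new_value, const cpumask_t *online)
{
	cpumask_t tmp;
	cpus_and(&tmp, new_value, online);
	return !cpus_empty(&tmp);
}

int main(void)
{
	cpumask_t online = { { 0x3, 0 } };	/* CPUs 0 and 1 online */
	cpumask_t bad    = { { 0x0, 0x1 } };	/* only CPU 64, which is offline */
	printf("%d\n", affinity_ok(&bad, &online));	/* prints 0: rejected */
	return 0;
}

Note that the intersection is computed into tmp and new_value is what actually gets stored, so a mask naming both online and offline CPUs is still accepted as written.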
-	if (!(new_value & cpu_online_map))
+	cpus_and(tmp, new_value, cpu_online_map);
+	if (cpus_empty(tmp))
		return -EINVAL;

	irq_affinity[irq] = new_value;
@@ -900,17 +911,28 @@ static int irq_affinity_write_proc (stru
 static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
			int count, int *eof, void *data)
 {
-	unsigned long *mask = (unsigned long *) data;
+	int len = 0, k;
+	cpumask_t *mask = (cpumask_t *)data, tmp;
+
	if (count < HEX_DIGITS+1)
		return -EINVAL;
-	return sprintf (page, "%08lx\n", *mask);
+	tmp = *mask;
+
+	for (k = 0; k < sizeof(cpumask_t)/sizeof(u16); ++k) {
+		int j = sprintf(page, "%04hx", cpus_coerce(tmp));
+		len += j;
+		page += j;
+		cpus_shift_right(tmp, tmp, 16);
+	}
+	len += sprintf(page, "\n");
+	return len;
 }

 static int prof_cpu_mask_write_proc (struct file *file, const char *buffer,
					unsigned long count, void *data)
 {
-	unsigned long *mask = (unsigned long *) data, full_count = count, err;
-	unsigned long new_value;
+	cpumask_t *mask = (cpumask_t *)data, new_value;
+	unsigned long full_count = count, err;

	err = parse_hex_value(buffer, count, &new_value);
	if (err)
diff -prauN linux-2.6.0-test1/arch/mips/kernel/proc.c wli-2.6.0-test1-37/arch/mips/kernel/proc.c
--- linux-2.6.0-test1/arch/mips/kernel/proc.c	2003-07-13 20:32:28.000000000 -0700
+++ wli-2.6.0-test1-37/arch/mips/kernel/proc.c	2003-07-14 06:31:09.000000000 -0700
@@ -81,7 +81,7 @@ static int show_cpuinfo(struct seq_file
	char fmt [64];

 #ifdef CONFIG_SMP
-	if (!CPUMASK_TSTB(cpu_online_map, n))
+	if (!cpu_isset(n, cpu_online_map))
		return 0;
 #endif
diff -prauN linux-2.6.0-test1/arch/mips/kernel/smp.c wli-2.6.0-test1-37/arch/mips/kernel/smp.c
--- linux-2.6.0-test1/arch/mips/kernel/smp.c	2003-07-13 20:31:20.000000000 -0700
+++ wli-2.6.0-test1-37/arch/mips/kernel/smp.c	2003-07-14 06:31:09.000000000 -0700
@@ -146,7 +146,7 @@ asmlinkage void start_secondary(void)
	cpu_data[cpu].udelay_val = loops_per_jiffy;
	prom_smp_finish();
	printk("Slave cpu booted successfully\n");
-	CPUMASK_SETB(cpu_online_map, cpu);
+	cpu_set(cpu, cpu_online_map);
	atomic_inc(&cpus_booted);
	cpu_idle();
 }
@@ -250,7 +250,7 @@ static void stop_this_cpu(void *dummy)
	/*
	 * Remove this CPU:
	 */
-	clear_bit(smp_processor_id(), &cpu_online_map);
+	cpu_clear(smp_processor_id(), cpu_online_map);
	local_irq_enable();	/* May need to service _machine_restart IPI */
	for (;;);	/* Wait if available.
*/ } diff -prauN linux-2.6.0-test1/arch/mips/mm/ioremap.c wli-2.6.0-test1-37/arch/mips/mm/ioremap.c --- linux-2.6.0-test1/arch/mips/mm/ioremap.c 2003-07-13 20:34:03.000000000 -0700 +++ wli-2.6.0-test1-37/arch/mips/mm/ioremap.c 2003-07-14 06:49:00.000000000 -0700 @@ -81,7 +81,7 @@ static int remap_area_pages(unsigned lon spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_kernel(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.6.0-test1/arch/mips/sgi-ip27/ip27-init.c wli-2.6.0-test1-37/arch/mips/sgi-ip27/ip27-init.c --- linux-2.6.0-test1/arch/mips/sgi-ip27/ip27-init.c 2003-07-13 20:38:38.000000000 -0700 +++ wli-2.6.0-test1-37/arch/mips/sgi-ip27/ip27-init.c 2003-07-14 06:31:09.000000000 -0700 @@ -481,7 +481,7 @@ static int __init do_boot_cpu(int cpu, i */ __cpu_number_map[cpu] = num_cpus; __cpu_logical_map[num_cpus] = cpu; - CPUMASK_SETB(cpu_online_map, cpu); + cpu_set(cpu, cpu_online_map); /* * Wait this cpu to start up and initialize its hub, diff -prauN linux-2.6.0-test1/arch/mips/sibyte/cfe/smp.c wli-2.6.0-test1-37/arch/mips/sibyte/cfe/smp.c --- linux-2.6.0-test1/arch/mips/sibyte/cfe/smp.c 2003-07-13 20:37:18.000000000 -0700 +++ wli-2.6.0-test1-37/arch/mips/sibyte/cfe/smp.c 2003-07-14 06:31:09.000000000 -0700 @@ -63,7 +63,7 @@ void prom_setup_smp(void) /* Use CFE to find out how many CPUs are available */ for (i=1; icpu = 0; cpu_data[0].udelay_val = loops_per_jiffy; cpu_data[0].asid_cache = ASID_FIRST_VERSION; - CPUMASK_CLRALL(cpu_online_map); - CPUMASK_SETB(cpu_online_map, 0); + cpus_clear(cpu_online_map); + cpu_set(0, cpu_online_map); atomic_set(&cpus_booted, 1); /* Master CPU is already booted... */ smp_tune_scheduling(); diff -prauN linux-2.6.0-test1/arch/mips64/kernel/irq.c wli-2.6.0-test1-37/arch/mips64/kernel/irq.c --- linux-2.6.0-test1/arch/mips64/kernel/irq.c 2003-07-13 20:33:48.000000000 -0700 +++ wli-2.6.0-test1-37/arch/mips64/kernel/irq.c 2003-07-14 06:31:09.000000000 -0700 @@ -818,13 +818,13 @@ EXPORT_SYMBOL(probe_irq_mask); static struct proc_dir_entry * root_irq_dir; static struct proc_dir_entry * irq_dir [NR_IRQS]; -#define HEX_DIGITS 8 +#define HEX_DIGITS (2*sizeof(cpumask_t)) static unsigned int parse_hex_value (const char *buffer, - unsigned long count, unsigned long *ret) + unsigned long count, cpumask_t *ret) { unsigned char hexnum [HEX_DIGITS]; - unsigned long value; + cpumask_t value = CPU_MASK_NONE; int i; if (!count) @@ -838,10 +838,9 @@ static unsigned int parse_hex_value (con * Parse the first 8 characters as a hex string, any non-hex char * is end-of-string. '00e1', 'e1', '00E1', 'E1' are all the same. */ - value = 0; for (i = 0; i < count; i++) { - unsigned int c = hexnum[i]; + unsigned int k, c = hexnum[i]; switch (c) { case '0' ... '9': c -= '0'; break; @@ -850,7 +849,10 @@ static unsigned int parse_hex_value (con default: goto out; } - value = (value << 4) | c; + cpus_shift_left(value, value, 4); + for (k = 0; k < 4; ++k) + if (c & (1 << k)) + cpu_set(k, value); } out: *ret = value; @@ -861,20 +863,31 @@ out: static struct proc_dir_entry * smp_affinity_entry [NR_IRQS]; -static unsigned long irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = ~0UL }; +static cpumask_t irq_affinity [NR_IRQS] = { [0 ... 
NR_IRQS-1] = CPU_MASK_ALL }; static int irq_affinity_read_proc (char *page, char **start, off_t off, int count, int *eof, void *data) { + int k, len = 0; + cpumask_t tmp = irq_affinity[(long)data]; + if (count < HEX_DIGITS+1) return -EINVAL; - return sprintf (page, "%08lx\n", irq_affinity[(long)data]); + + for (k = 0; k < sizeof(cpumask_t)/sizeof(u16); ++k) { + int j = sprintf(page, "%04hx", cpus_coerce(tmp)); + len += j; + page += j; + cpus_shift_right(tmp, tmp, 16); + } + len += sprintf(page, "\n"); + return len; } static int irq_affinity_write_proc (struct file *file, const char *buffer, unsigned long count, void *data) { int irq = (long) data, full_count = count, err; - unsigned long new_value; + cpumask_t new_value, tmp; if (!irq_desc[irq].handler->set_affinity) return -EIO; @@ -886,7 +899,8 @@ static int irq_affinity_write_proc (stru * way to make the system unusable accidentally :-) At least * one online CPU still has to be targeted. */ - if (!(new_value & cpu_online_map)) + cpus_and(tmp, new_value, cpu_online_map); + if (cpus_empty(tmp)) return -EINVAL; irq_affinity[irq] = new_value; @@ -900,17 +914,28 @@ static int irq_affinity_write_proc (stru static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, int count, int *eof, void *data) { - unsigned long *mask = (unsigned long *) data; + int k, len = 0; + cpumask_t *mask = (cpumask_t *)data, tmp; + if (count < HEX_DIGITS+1) return -EINVAL; - return sprintf (page, "%08lx\n", *mask); + + tmp = *mask; + for (k = 0; k < sizeof(cpumask_t)/sizeof(u16); ++k) { + int j = sprintf(page, "%04hx", cpus_coerce(tmp)); + len += j; + page += j; + cpus_shift_right(tmp, tmp, 16); + } + len += sprintf(page, "\n"); + return len; } static int prof_cpu_mask_write_proc (struct file *file, const char *buffer, unsigned long count, void *data) { - unsigned long *mask = (unsigned long *) data, full_count = count, err; - unsigned long new_value; + unsigned long full_count = count, err; + cpumask_t new_value, *mask = (cpumask_t *)data; err = parse_hex_value(buffer, count, &new_value); if (err) diff -prauN linux-2.6.0-test1/arch/mips64/kernel/proc.c wli-2.6.0-test1-37/arch/mips64/kernel/proc.c --- linux-2.6.0-test1/arch/mips64/kernel/proc.c 2003-07-13 20:38:38.000000000 -0700 +++ wli-2.6.0-test1-37/arch/mips64/kernel/proc.c 2003-07-14 06:31:09.000000000 -0700 @@ -81,7 +81,7 @@ static int show_cpuinfo(struct seq_file char fmt [64]; #ifdef CONFIG_SMP - if (!CPUMASK_TSTB(cpu_online_map, n)) + if (!cpu_isset(n, cpu_online_map)) return 0; #endif diff -prauN linux-2.6.0-test1/arch/mips64/kernel/smp.c wli-2.6.0-test1-37/arch/mips64/kernel/smp.c --- linux-2.6.0-test1/arch/mips64/kernel/smp.c 2003-07-13 20:36:37.000000000 -0700 +++ wli-2.6.0-test1-37/arch/mips64/kernel/smp.c 2003-07-14 06:31:09.000000000 -0700 @@ -146,7 +146,7 @@ asmlinkage void start_secondary(void) cpu_data[cpu].udelay_val = loops_per_jiffy; prom_smp_finish(); printk("Slave cpu booted successfully\n"); - CPUMASK_SETB(cpu_online_map, cpu); + cpu_set(cpu, cpu_online_map); atomic_inc(&cpus_booted); cpu_idle(); } @@ -250,7 +250,7 @@ static void stop_this_cpu(void *dummy) /* * Remove this CPU: */ - clear_bit(smp_processor_id(), &cpu_online_map); + cpu_clear(smp_processor_id(), cpu_online_map); local_irq_enable(); /* May need to service _machine_restart IPI */ for (;;); /* Wait if available. 
*/ } diff -prauN linux-2.6.0-test1/arch/parisc/kernel/cache.c wli-2.6.0-test1-37/arch/parisc/kernel/cache.c --- linux-2.6.0-test1/arch/parisc/kernel/cache.c 2003-07-13 20:33:46.000000000 -0700 +++ wli-2.6.0-test1-37/arch/parisc/kernel/cache.c 2003-07-14 08:52:52.000000000 -0700 @@ -64,7 +64,7 @@ update_mmu_cache(struct vm_area_struct * { struct page *page = pte_page(pte); - if (VALID_PAGE(page) && page->mapping && + if (VALID_PAGE(page) && page_mapping(page) && test_bit(PG_dcache_dirty, &page->flags)) { flush_kernel_dcache_page(page_address(page)); @@ -230,14 +230,16 @@ void __flush_dcache_page(struct page *pa flush_kernel_dcache_page(page_address(page)); - if (!page->mapping) + if (!page_mapping(page)) return; - list_for_each(l, &page->mapping->i_mmap_shared) { + list_for_each_rcu(l, &page_mapping(page)->i_mmap_shared) { struct vm_area_struct *mpnt; unsigned long off; mpnt = list_entry(l, struct vm_area_struct, shared); + if (mpnt->vm_flags & VM_DEAD) + continue; /* * If this VMA is not in our MM, we can ignore it. diff -prauN linux-2.6.0-test1/arch/parisc/kernel/pci-dma.c wli-2.6.0-test1-37/arch/parisc/kernel/pci-dma.c --- linux-2.6.0-test1/arch/parisc/kernel/pci-dma.c 2003-07-13 20:38:02.000000000 -0700 +++ wli-2.6.0-test1-37/arch/parisc/kernel/pci-dma.c 2003-07-14 06:49:00.000000000 -0700 @@ -133,7 +133,7 @@ static inline int map_uncached_pages(uns do { pmd_t *pmd; - pmd = pmd_alloc(NULL, dir, vaddr); + pmd = pmd_alloc_kernel(NULL, dir, vaddr); if (!pmd) return -ENOMEM; if (map_pmd_uncached(pmd, vaddr, end - vaddr, &paddr)) diff -prauN linux-2.6.0-test1/arch/parisc/kernel/smp.c wli-2.6.0-test1-37/arch/parisc/kernel/smp.c --- linux-2.6.0-test1/arch/parisc/kernel/smp.c 2003-07-13 20:38:51.000000000 -0700 +++ wli-2.6.0-test1-37/arch/parisc/kernel/smp.c 2003-07-14 06:31:09.000000000 -0700 @@ -62,14 +62,14 @@ volatile struct task_struct *smp_init_cu static volatile int smp_commenced = 0; /* Set when the idlers are all forked */ static volatile int cpu_now_booting = 0; /* track which CPU is booting */ -volatile unsigned long cpu_online_map = 0; /* Bitmap of online CPUs */ -#define IS_LOGGED_IN(cpunum) (test_bit(cpunum, (atomic_t *)&cpu_online_map)) +cpumask_t cpu_online_map = CPU_MASK_NONE; /* Bitmap of online CPUs */ +#define IS_LOGGED_IN(cpunum) (cpu_isset(cpunum, cpu_online_map)) int smp_num_cpus = 1; int smp_threads_ready = 0; unsigned long cache_decay_ticks; static int max_cpus = -1; /* Command line */ -unsigned long cpu_present_mask; +cpumask_t cpu_present_mask; struct smp_call_struct { void (*func) (void *info); @@ -139,7 +139,7 @@ halt_processor(void) #else /* REVISIT : redirect I/O Interrupts to another CPU? */ /* REVISIT : does PM *know* this CPU isn't available? */ - clear_bit(smp_processor_id(), (void *)&cpu_online_map); + cpu_clear(smp_processor_id(), cpu_online_map); local_irq_disable(); for (;;) ; @@ -443,7 +443,7 @@ smp_cpu_init(int cpunum) mb(); /* Well, support 2.4 linux scheme as well. 
*/ - if (test_and_set_bit(cpunum, (unsigned long *) (&cpu_online_map))) + if (cpu_test_and_set(cpunum, cpu_online_map)) { extern void machine_halt(void); /* arch/parisc.../process.c */ @@ -624,13 +624,14 @@ void __init smp_boot_cpus(void) printk(KERN_DEBUG "SMP: bootstrap CPU ID is %d\n",bootstrap_processor); init_task.thread_info->cpu = bootstrap_processor; current->thread_info->cpu = bootstrap_processor; - cpu_online_map = 1 << bootstrap_processor; /* Mark Boostrap processor as present */ + /* Mark Bootstrap processor as present */ + cpu_online_map = cpumask_of_cpu(bootstrap_processor); current->active_mm = &init_mm; #ifdef ENTRY_SYS_CPUS cpu_data[0].state = STATE_RUNNING; #endif - cpu_present_mask = 1UL << bootstrap_processor; + cpu_present_mask = cpumask_of_cpu(bootstrap_processor); /* Nothing to do when told not to. */ if (max_cpus == 0) { @@ -709,8 +710,8 @@ void __init smp_prepare_cpus(unsigned in void __devinit smp_prepare_boot_cpu(void) { - set_bit(smp_processor_id(), &cpu_online_map); - set_bit(smp_processor_id(), &cpu_present_mask); + cpu_set(smp_processor_id(), cpu_online_map); + cpu_set(smp_processor_id(), cpu_present_mask); } int __devinit __cpu_up(unsigned int cpu) diff -prauN linux-2.6.0-test1/arch/parisc/mm/ioremap.c wli-2.6.0-test1-37/arch/parisc/mm/ioremap.c --- linux-2.6.0-test1/arch/parisc/mm/ioremap.c 2003-07-13 20:32:43.000000000 -0700 +++ wli-2.6.0-test1-37/arch/parisc/mm/ioremap.c 2003-07-14 06:49:00.000000000 -0700 @@ -77,7 +77,7 @@ static int remap_area_pages(unsigned lon spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(dir, address); + pmd = pmd_alloc_kernel(dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.6.0-test1/arch/ppc/kernel/irq.c wli-2.6.0-test1-37/arch/ppc/kernel/irq.c --- linux-2.6.0-test1/arch/ppc/kernel/irq.c 2003-07-13 20:37:18.000000000 -0700 +++ wli-2.6.0-test1-37/arch/ppc/kernel/irq.c 2003-07-14 06:31:09.000000000 -0700 @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -565,29 +566,41 @@ static struct proc_dir_entry *irq_dir[NR static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; #ifdef CONFIG_IRQ_ALL_CPUS -#define DEFAULT_CPU_AFFINITY 0xffffffff +#define DEFAULT_CPU_AFFINITY CPU_MASK_ALL #else -#define DEFAULT_CPU_AFFINITY 0x00000001 +#define DEFAULT_CPU_AFFINITY cpumask_of_cpu(0) #endif -unsigned int irq_affinity [NR_IRQS] = +cpumask_t irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = DEFAULT_CPU_AFFINITY }; -#define HEX_DIGITS 8 +#define HEX_DIGITS (2*sizeof(cpumask_t)) static int irq_affinity_read_proc (char *page, char **start, off_t off, int count, int *eof, void *data) { + cpumask_t tmp = irq_affinity[(long)data]; + int k, len = 0; + if (count < HEX_DIGITS+1) return -EINVAL; - return sprintf (page, "%08x\n", irq_affinity[(int)data]); + + for (k = 0; k < sizeof(cpumask_t)/sizeof(u16); ++k) { + int j = sprintf(page, "%04hx", (u16)cpus_coerce(tmp)); + len += j; + page += j; + cpus_shift_right(tmp, tmp, 16); + } + + len += sprintf(page, "\n"); + return len; } static unsigned int parse_hex_value (const char __user *buffer, - unsigned long count, unsigned long *ret) + unsigned long count, cpumask_t *ret) { unsigned char hexnum [HEX_DIGITS]; - unsigned long value; + cpumask_t value = CPU_MASK_NONE; int i; if (!count) @@ -601,10 +614,9 @@ static unsigned int parse_hex_value (con * Parse the first 8 characters as a hex string, any non-hex char * is end-of-string. '00e1', 'e1', '00E1', 'E1' are all the same.
*/ - value = 0; - for (i = 0; i < count; i++) { unsigned int c = hexnum[i]; + int k; switch (c) { case '0' ... '9': c -= '0'; break; @@ -613,7 +625,10 @@ static unsigned int parse_hex_value (con default: goto out; } - value = (value << 4) | c; + cpus_shift_left(value, value, 4); + for (k = 0; k < 4; ++k) + if (c & (1 << k)) + cpu_set(k, value); } out: *ret = value; @@ -624,7 +639,7 @@ static int irq_affinity_write_proc (stru unsigned long count, void *data) { int irq = (int) data, full_count = count, err; - unsigned long new_value; + cpumask_t new_value, tmp; if (!irq_desc[irq].handler->set_affinity) return -EIO; @@ -641,7 +656,8 @@ static int irq_affinity_write_proc (stru * are actually logical cpu #'s then we have no problem. * -- Cort */ - if (!(new_value & cpu_online_map)) + cpus_and(tmp, new_value, cpu_online_map); + if (cpus_empty(tmp)) return -EINVAL; irq_affinity[irq] = new_value; @@ -653,17 +669,27 @@ static int irq_affinity_write_proc (stru static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, int count, int *eof, void *data) { - unsigned long *mask = (unsigned long *) data; + cpumask_t mask = *(cpumask_t *)data; + int k, len = 0; + if (count < HEX_DIGITS+1) return -EINVAL; - return sprintf (page, "%08lx\n", *mask); + + for (k = 0; k < sizeof(cpumask_t)/sizeof(u16); ++k) { + int j = sprintf(page, "%04hx", (u16)cpus_coerce(mask)); + len += j; + page += j; + cpus_shift_right(mask, mask, 16); + } + len += sprintf(page, "\n"); + return len; } static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer, unsigned long count, void *data) { - unsigned long *mask = (unsigned long *) data, full_count = count, err; - unsigned long new_value; + cpumask_t *mask = (cpumask_t *)data; + unsigned long full_count = count, err; + cpumask_t new_value; err = parse_hex_value(buffer, count, &new_value); if (err) diff -prauN linux-2.6.0-test1/arch/ppc/kernel/setup.c wli-2.6.0-test1-37/arch/ppc/kernel/setup.c --- linux-2.6.0-test1/arch/ppc/kernel/setup.c 2003-07-13 20:38:36.000000000 -0700 +++ wli-2.6.0-test1-37/arch/ppc/kernel/setup.c 2003-07-14 06:31:09.000000000 -0700 @@ -159,7 +159,7 @@ int show_cpuinfo(struct seq_file *m, voi } #ifdef CONFIG_SMP - if (!(cpu_online_map & (1 << i))) + if (!cpu_online(i)) return 0; pvr = cpu_data[i].pvr; lpj = cpu_data[i].loops_per_jiffy; diff -prauN linux-2.6.0-test1/arch/ppc/kernel/smp.c wli-2.6.0-test1-37/arch/ppc/kernel/smp.c --- linux-2.6.0-test1/arch/ppc/kernel/smp.c 2003-07-13 20:38:36.000000000 -0700 +++ wli-2.6.0-test1-37/arch/ppc/kernel/smp.c 2003-07-14 06:31:09.000000000 -0700 @@ -47,7 +47,7 @@ atomic_t ipi_sent; DEFINE_PER_CPU(unsigned int, prof_multiplier); DEFINE_PER_CPU(unsigned int, prof_counter); unsigned long cache_decay_ticks = HZ/100; -unsigned long cpu_online_map = 1UL; +cpumask_t cpu_online_map = cpumask_of_cpu(0); unsigned long cpu_possible_map = 1UL; int smp_hw_index[NR_CPUS]; struct thread_info *secondary_ti; @@ -361,8 +361,8 @@ void __init smp_prepare_cpus(unsigned in void __devinit smp_prepare_boot_cpu(void) { - set_bit(smp_processor_id(), &cpu_online_map); - set_bit(smp_processor_id(), &cpu_possible_map); + cpu_set(smp_processor_id(), cpu_online_map); + cpu_set(smp_processor_id(), cpu_possible_map); } int __init setup_profiling_timer(unsigned int multiplier) @@ -444,7 +444,7 @@ int __cpu_up(unsigned int cpu) printk("Processor %d found.\n", cpu); smp_ops->give_timebase(); - set_bit(cpu, &cpu_online_map); + cpu_set(cpu, cpu_online_map); return 0; } diff -prauN linux-2.6.0-test1/arch/ppc/mm/init.c
wli-2.6.0-test1-37/arch/ppc/mm/init.c --- linux-2.6.0-test1/arch/ppc/mm/init.c 2003-07-13 20:31:53.000000000 -0700 +++ wli-2.6.0-test1-37/arch/ppc/mm/init.c 2003-07-14 08:52:52.000000000 -0700 @@ -472,14 +472,14 @@ void __init mem_init(void) printk(KERN_INFO "AGP special page: 0x%08lx\n", agp_special_page); #endif - /* Make sure all our pagetable pages have page->mapping + /* Make sure all our pagetable pages have page_mapping(page) and page->index set correctly. */ for (addr = KERNELBASE; addr != 0; addr += PGDIR_SIZE) { struct page *pg; pmd_t *pmd = pmd_offset(pgd_offset_k(addr), addr); if (pmd_present(*pmd)) { pg = pmd_page(*pmd); - pg->mapping = (void *) &init_mm; + set_page_mapping(pg, &init_mm); pg->index = addr; } } diff -prauN linux-2.6.0-test1/arch/ppc64/Kconfig wli-2.6.0-test1-37/arch/ppc64/Kconfig --- linux-2.6.0-test1/arch/ppc64/Kconfig 2003-07-13 20:34:40.000000000 -0700 +++ wli-2.6.0-test1-37/arch/ppc64/Kconfig 2003-07-14 06:31:09.000000000 -0700 @@ -93,7 +93,7 @@ config IRQ_ALL_CPUS CPU. config NR_CPUS - int "Maximum number of CPUs (2-64)" + int "Maximum number of CPUs (2-128)" depends on SMP default "32" diff -prauN linux-2.6.0-test1/arch/ppc64/kernel/htab.c wli-2.6.0-test1-37/arch/ppc64/kernel/htab.c --- linux-2.6.0-test1/arch/ppc64/kernel/htab.c 2003-07-13 20:34:43.000000000 -0700 +++ wli-2.6.0-test1-37/arch/ppc64/kernel/htab.c 2003-07-14 06:31:09.000000000 -0700 @@ -377,6 +377,7 @@ int hash_page(unsigned long ea, unsigned int ret; int user_region = 0; int local = 0; + cpumask_t tmp; /* Check for invalid addresses. */ if (!IS_VALID_EA(ea)) @@ -431,7 +432,8 @@ int hash_page(unsigned long ea, unsigned */ spin_lock(&mm->page_table_lock); - if (user_region && (mm->cpu_vm_mask == (1 << smp_processor_id()))) + tmp = cpumask_of_cpu(smp_processor_id()); + if (user_region && cpus_equal(mm->cpu_vm_mask, tmp)) local = 1; ptep = find_linux_pte(pgdir, ea); diff -prauN linux-2.6.0-test1/arch/ppc64/kernel/irq.c wli-2.6.0-test1-37/arch/ppc64/kernel/irq.c --- linux-2.6.0-test1/arch/ppc64/kernel/irq.c 2003-07-13 20:33:46.000000000 -0700 +++ wli-2.6.0-test1-37/arch/ppc64/kernel/irq.c 2003-07-14 06:31:09.000000000 -0700 @@ -603,26 +603,37 @@ static struct proc_dir_entry * irq_dir [ static struct proc_dir_entry * smp_affinity_entry [NR_IRQS]; #ifdef CONFIG_IRQ_ALL_CPUS -unsigned long irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = -1UL}; +cpumask_t irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; #else /* CONFIG_IRQ_ALL_CPUS */ -unsigned long irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = 0x0}; +cpumask_t irq_affinity [NR_IRQS] = { [0 ... 
NR_IRQS-1] = CPU_MASK_NONE }; #endif /* CONFIG_IRQ_ALL_CPUS */ -#define HEX_DIGITS 16 +#define HEX_DIGITS (2*sizeof(cpumask_t)) static int irq_affinity_read_proc (char *page, char **start, off_t off, int count, int *eof, void *data) { + int k, len = 0; + cpumask_t tmp = irq_affinity[(long)data]; + if (count < HEX_DIGITS+1) return -EINVAL; - return sprintf(page, "%16lx\n", irq_affinity[(long)data]); + + for (k = 0; k < sizeof(cpumask_t) / sizeof(u16); ++k) { + int j = sprintf(page, "%04hx", (u16)cpus_coerce(tmp)); + len += j; + page += j; + cpus_shift_right(tmp, tmp, 16); + } + len += sprintf(page, "\n"); + return len; } static unsigned int parse_hex_value (const char *buffer, - unsigned long count, unsigned long *ret) + unsigned long count, cpumask_t *ret) { unsigned char hexnum [HEX_DIGITS]; - unsigned long value; + cpumask_t value = CPU_MASK_NONE; int i; if (!count) @@ -636,10 +647,10 @@ static unsigned int parse_hex_value (con * Parse the first 16 characters as a hex string, any non-hex char * is end-of-string. '00e1', 'e1', '00E1', 'E1' are all the same. */ - value = 0; for (i = 0; i < count; i++) { unsigned int c = hexnum[i]; + int k; switch (c) { case '0' ... '9': c -= '0'; break; @@ -648,7 +659,11 @@ static unsigned int parse_hex_value (con default: goto out; } - value = (value << 4) | c; + cpus_shift_left(value, value, 4); + for (k = 0; k < 4; ++k) + if (c & (1 << k)) + cpu_set(k, value); + } out: *ret = value; @@ -659,7 +674,7 @@ static int irq_affinity_write_proc (stru unsigned long count, void *data) { int irq = (long)data, full_count = count, err; - unsigned long new_value; + cpumask_t new_value, tmp; if (!irq_desc[irq].handler->set_affinity) return -EIO; @@ -671,7 +686,8 @@ static int irq_affinity_write_proc (stru * way to make the system unusable accidentally :-) At least * one online CPU still has to be targeted.
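
The write side reverses the read loop: parse_hex_value() consumes one hex digit at a time, shifting the accumulated mask left by four bits (one hex digit encodes four mask bits) and then setting bits 0-3 from the digit, so masks wider than a long parse correctly. A runnable user-space model of that loop, again with invented mask_t helpers standing in for cpus_shift_left() and cpu_set():

	#include <stdint.h>
	#include <stdio.h>

	typedef struct { uint64_t w[2]; } mask_t;	/* stand-in for cpumask_t */

	static void mask_shl(mask_t *m, int n)		/* like cpus_shift_left() */
	{
		m->w[1] = (m->w[1] << n) | (m->w[0] >> (64 - n));
		m->w[0] <<= n;
	}

	static void mask_set(mask_t *m, int bit)	/* like cpu_set() */
	{
		m->w[bit / 64] |= (uint64_t)1 << (bit % 64);
	}

	/* Accepts the same '00e1' == 'e1' == '00E1' forms the kernel comment promises. */
	static mask_t parse_mask(const char *s)
	{
		mask_t v = { { 0, 0 } };
		int k;

		for (; *s; s++) {
			int c = *s;
			if (c >= '0' && c <= '9')      c -= '0';
			else if (c >= 'a' && c <= 'f') c -= 'a' - 10;
			else if (c >= 'A' && c <= 'F') c -= 'A' - 10;
			else break;		/* any non-hex char ends the string */
			mask_shl(&v, 4);	/* one hex digit == 4 mask bits */
			for (k = 0; k < 4; k++)
				if (c & (1 << k))
					mask_set(&v, k);
		}
		return v;
	}

	int main(void)
	{
		mask_t m = parse_mask("e1");
		printf("%016llx%016llx\n", (unsigned long long)m.w[1],
		       (unsigned long long)m.w[0]);	/* ...00e1: CPUs 0, 5, 6, 7 */
		return 0;
	}
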
*/ - if (!(new_value & cpu_online_map)) + cpus_and(tmp, new_value, cpu_online_map); + if (cpus_empty(tmp)) return -EINVAL; irq_affinity[irq] = new_value; @@ -692,8 +708,9 @@ static int prof_cpu_mask_read_proc (char static int prof_cpu_mask_write_proc (struct file *file, const char *buffer, unsigned long count, void *data) { - unsigned long *mask = (unsigned long *) data, full_count = count, err; - unsigned long new_value; + cpumask_t *mask = (cpumask_t *)data; + unsigned long full_count = count, err; + cpumask_t new_value; err = parse_hex_value(buffer, count, &new_value); if (err) diff -prauN linux-2.6.0-test1/arch/ppc64/kernel/open_pic.c wli-2.6.0-test1-37/arch/ppc64/kernel/open_pic.c --- linux-2.6.0-test1/arch/ppc64/kernel/open_pic.c 2003-07-13 20:34:43.000000000 -0700 +++ wli-2.6.0-test1-37/arch/ppc64/kernel/open_pic.c 2003-07-14 06:31:09.000000000 -0700 @@ -46,7 +46,7 @@ static int broken_ipi_registers; OpenPIC_SourcePtr ISU[OPENPIC_MAX_ISU]; static void openpic_end_irq(unsigned int irq_nr); -static void openpic_set_affinity(unsigned int irq_nr, unsigned long cpumask); +static void openpic_set_affinity(unsigned int irq_nr, cpumask_t cpumask); struct hw_interrupt_type open_pic = { " OpenPIC ", @@ -505,7 +505,7 @@ static void openpic_set_spurious(u_int v void openpic_init_processor(u_int cpumask) { openpic_write(&OpenPIC->Global.Processor_Initialization, - cpumask & cpu_online_map); + cpumask & cpus_coerce(cpu_online_map)); } #ifdef CONFIG_SMP @@ -539,7 +539,7 @@ void openpic_cause_IPI(u_int ipi, u_int CHECK_THIS_CPU; check_arg_ipi(ipi); openpic_write(&OpenPIC->THIS_CPU.IPI_Dispatch(ipi), - cpumask & cpu_online_map); + cpumask & cpus_coerce(cpu_online_map)); } void openpic_request_IPIs(void) @@ -625,7 +625,7 @@ static void __init openpic_maptimer(u_in { check_arg_timer(timer); openpic_write(&OpenPIC->Global.Timer[timer].Destination, - cpumask & cpu_online_map); + cpumask & cpus_coerce(cpu_online_map)); } @@ -746,9 +746,12 @@ static void openpic_end_irq(unsigned int openpic_eoi(); } -static void openpic_set_affinity(unsigned int irq_nr, unsigned long cpumask) +static void openpic_set_affinity(unsigned int irq_nr, cpumask_t cpumask) { - openpic_mapirq(irq_nr - open_pic_irq_offset, cpumask & cpu_online_map); + cpumask_t tmp; + + cpus_and(tmp, cpumask, cpu_online_map); + openpic_mapirq(irq_nr - open_pic_irq_offset, cpus_coerce(tmp)); } #ifdef CONFIG_SMP diff -prauN linux-2.6.0-test1/arch/ppc64/kernel/open_pic.h wli-2.6.0-test1-37/arch/ppc64/kernel/open_pic.h --- linux-2.6.0-test1/arch/ppc64/kernel/open_pic.h 2003-07-13 20:30:41.000000000 -0700 +++ wli-2.6.0-test1-37/arch/ppc64/kernel/open_pic.h 2003-07-14 06:31:09.000000000 -0700 @@ -13,6 +13,7 @@ #define _PPC64_KERNEL_OPEN_PIC_H #include +#include #define OPENPIC_SIZE 0x40000 diff -prauN linux-2.6.0-test1/arch/ppc64/kernel/pacaData.c wli-2.6.0-test1-37/arch/ppc64/kernel/pacaData.c --- linux-2.6.0-test1/arch/ppc64/kernel/pacaData.c 2003-07-13 20:36:43.000000000 -0700 +++ wli-2.6.0-test1-37/arch/ppc64/kernel/pacaData.c 2003-07-14 06:31:09.000000000 -0700 @@ -134,5 +134,71 @@ struct paca_struct paca[NR_CPUS] __page_ PACAINITDATA(61, 0, 0, 0, 0), PACAINITDATA(62, 0, 0, 0, 0), PACAINITDATA(63, 0, 0, 0, 0), +#if NR_CPUS > 64 + PACAINITDATA(64, 0, 0, 0, 0), + PACAINITDATA(65, 0, 0, 0, 0), + PACAINITDATA(66, 0, 0, 0, 0), + PACAINITDATA(67, 0, 0, 0, 0), + PACAINITDATA(68, 0, 0, 0, 0), + PACAINITDATA(69, 0, 0, 0, 0), + PACAINITDATA(70, 0, 0, 0, 0), + PACAINITDATA(71, 0, 0, 0, 0), + PACAINITDATA(72, 0, 0, 0, 0), + PACAINITDATA(73, 0, 0, 0, 0), + 
PACAINITDATA(74, 0, 0, 0, 0), + PACAINITDATA(75, 0, 0, 0, 0), + PACAINITDATA(76, 0, 0, 0, 0), + PACAINITDATA(77, 0, 0, 0, 0), + PACAINITDATA(78, 0, 0, 0, 0), + PACAINITDATA(79, 0, 0, 0, 0), + PACAINITDATA(80, 0, 0, 0, 0), + PACAINITDATA(81, 0, 0, 0, 0), + PACAINITDATA(82, 0, 0, 0, 0), + PACAINITDATA(83, 0, 0, 0, 0), + PACAINITDATA(84, 0, 0, 0, 0), + PACAINITDATA(85, 0, 0, 0, 0), + PACAINITDATA(86, 0, 0, 0, 0), + PACAINITDATA(87, 0, 0, 0, 0), + PACAINITDATA(88, 0, 0, 0, 0), + PACAINITDATA(89, 0, 0, 0, 0), + PACAINITDATA(90, 0, 0, 0, 0), + PACAINITDATA(91, 0, 0, 0, 0), + PACAINITDATA(92, 0, 0, 0, 0), + PACAINITDATA(93, 0, 0, 0, 0), + PACAINITDATA(94, 0, 0, 0, 0), + PACAINITDATA(95, 0, 0, 0, 0), + PACAINITDATA(96, 0, 0, 0, 0), + PACAINITDATA(97, 0, 0, 0, 0), + PACAINITDATA(98, 0, 0, 0, 0), + PACAINITDATA(99, 0, 0, 0, 0), + PACAINITDATA(100, 0, 0, 0, 0), + PACAINITDATA(101, 0, 0, 0, 0), + PACAINITDATA(102, 0, 0, 0, 0), + PACAINITDATA(103, 0, 0, 0, 0), + PACAINITDATA(104, 0, 0, 0, 0), + PACAINITDATA(105, 0, 0, 0, 0), + PACAINITDATA(106, 0, 0, 0, 0), + PACAINITDATA(107, 0, 0, 0, 0), + PACAINITDATA(108, 0, 0, 0, 0), + PACAINITDATA(109, 0, 0, 0, 0), + PACAINITDATA(110, 0, 0, 0, 0), + PACAINITDATA(111, 0, 0, 0, 0), + PACAINITDATA(112, 0, 0, 0, 0), + PACAINITDATA(113, 0, 0, 0, 0), + PACAINITDATA(114, 0, 0, 0, 0), + PACAINITDATA(115, 0, 0, 0, 0), + PACAINITDATA(116, 0, 0, 0, 0), + PACAINITDATA(117, 0, 0, 0, 0), + PACAINITDATA(118, 0, 0, 0, 0), + PACAINITDATA(119, 0, 0, 0, 0), + PACAINITDATA(120, 0, 0, 0, 0), + PACAINITDATA(121, 0, 0, 0, 0), + PACAINITDATA(122, 0, 0, 0, 0), + PACAINITDATA(123, 0, 0, 0, 0), + PACAINITDATA(124, 0, 0, 0, 0), + PACAINITDATA(125, 0, 0, 0, 0), + PACAINITDATA(126, 0, 0, 0, 0), + PACAINITDATA(127, 0, 0, 0, 0), +#endif #endif }; diff -prauN linux-2.6.0-test1/arch/ppc64/kernel/prom.c wli-2.6.0-test1-37/arch/ppc64/kernel/prom.c --- linux-2.6.0-test1/arch/ppc64/kernel/prom.c 2003-07-13 20:36:37.000000000 -0700 +++ wli-2.6.0-test1-37/arch/ppc64/kernel/prom.c 2003-07-14 06:31:09.000000000 -0700 @@ -1134,7 +1134,7 @@ prom_init(unsigned long r3, unsigned lon _prom->cpu = (int)(unsigned long)getprop_rval; _xPaca[_prom->cpu].active = 1; #ifdef CONFIG_SMP - RELOC(cpu_online_map) = 1UL << _prom->cpu; + cpu_set(_prom->cpu, RELOC(cpu_online_map)); #endif RELOC(boot_cpuid) = _prom->cpu; diff -prauN linux-2.6.0-test1/arch/ppc64/kernel/rtasd.c wli-2.6.0-test1-37/arch/ppc64/kernel/rtasd.c --- linux-2.6.0-test1/arch/ppc64/kernel/rtasd.c 2003-07-13 20:32:34.000000000 -0700 +++ wli-2.6.0-test1-37/arch/ppc64/kernel/rtasd.c 2003-07-14 06:31:09.000000000 -0700 @@ -225,7 +225,7 @@ repeat: continue; DEBUG("scheduling on %d\n", cpu); - set_cpus_allowed(current, 1UL << cpu); + set_cpus_allowed(current, cpumask_of_cpu(cpu)); DEBUG("watchdog scheduled on cpu %d\n", smp_processor_id()); do { diff -prauN linux-2.6.0-test1/arch/ppc64/kernel/setup.c wli-2.6.0-test1-37/arch/ppc64/kernel/setup.c --- linux-2.6.0-test1/arch/ppc64/kernel/setup.c 2003-07-13 20:34:02.000000000 -0700 +++ wli-2.6.0-test1-37/arch/ppc64/kernel/setup.c 2003-07-14 06:31:09.000000000 -0700 @@ -256,7 +256,7 @@ static int show_cpuinfo(struct seq_file return 0; } - if (!(cpu_online_map & (1UL << cpu_id))) + if (!cpu_online(cpu_id)) return 0; #ifdef CONFIG_SMP diff -prauN linux-2.6.0-test1/arch/ppc64/kernel/smp.c wli-2.6.0-test1-37/arch/ppc64/kernel/smp.c --- linux-2.6.0-test1/arch/ppc64/kernel/smp.c 2003-07-13 20:36:41.000000000 -0700 +++ wli-2.6.0-test1-37/arch/ppc64/kernel/smp.c 2003-07-14 06:31:09.000000000 -0700 @@ -53,7 +53,7 @@ int 
smp_threads_ready; unsigned long cache_decay_ticks; /* initialised so it doesn't end up in bss */ -unsigned long cpu_online_map = 0; +cpumask_t cpu_online_map = CPU_MASK_NONE; static struct smp_ops_t *smp_ops; @@ -570,7 +570,7 @@ void __init smp_prepare_cpus(unsigned in void __devinit smp_prepare_boot_cpu(void) { - set_bit(smp_processor_id(), &cpu_online_map); + cpu_set(smp_processor_id(), cpu_online_map); /* FIXME: what about cpu_possible()? */ } @@ -631,7 +631,7 @@ int __devinit __cpu_up(unsigned int cpu) if (smp_ops->give_timebase) smp_ops->give_timebase(); - set_bit(cpu, &cpu_online_map); + cpu_set(cpu, cpu_online_map); return 0; } diff -prauN linux-2.6.0-test1/arch/ppc64/kernel/xics.c wli-2.6.0-test1-37/arch/ppc64/kernel/xics.c --- linux-2.6.0-test1/arch/ppc64/kernel/xics.c 2003-07-13 20:34:41.000000000 -0700 +++ wli-2.6.0-test1-37/arch/ppc64/kernel/xics.c 2003-07-14 06:31:09.000000000 -0700 @@ -33,7 +33,7 @@ void xics_enable_irq(u_int irq); void xics_disable_irq(u_int irq); void xics_mask_and_ack_irq(u_int irq); void xics_end_irq(u_int irq); -void xics_set_affinity(unsigned int irq_nr, unsigned long cpumask); +void xics_set_affinity(unsigned int irq_nr, cpumask_t cpumask); struct hw_interrupt_type xics_pic = { " XICS ", @@ -508,7 +508,7 @@ nextnode: ppc64_boot_msg(0x21, "XICS Done"); } -void xics_set_affinity(unsigned int virq, unsigned long cpumask) +void xics_set_affinity(unsigned int virq, cpumask_t cpumask) { irq_desc_t *desc = irq_desc + virq; unsigned int irq; @@ -516,6 +516,8 @@ void xics_set_affinity(unsigned int virq long status; unsigned long xics_status[2]; unsigned long newmask; + cpumask_t allcpus = CPU_MASK_ALL; + cpumask_t tmp = CPU_MASK_NONE; virq -= XICS_IRQ_OFFSET; irq = virt_irq_to_real(virq); @@ -533,12 +535,13 @@ void xics_set_affinity(unsigned int virq } /* For the moment only implement delivery to all cpus or one cpu */ - if (cpumask == -1UL) { + if (cpus_equal(cpumask, allcpus)) { newmask = default_distrib_server; } else { - if (!(cpumask & cpu_online_map)) + cpus_and(tmp, cpu_online_map, cpumask); + if (cpus_empty(tmp)) goto out; - newmask = find_first_bit(&cpumask, 8*sizeof(unsigned long)); + newmask = first_cpu(cpumask); } status = rtas_call(ibm_set_xive, 3, 1, NULL, diff -prauN linux-2.6.0-test1/arch/ppc64/mm/init.c wli-2.6.0-test1-37/arch/ppc64/mm/init.c --- linux-2.6.0-test1/arch/ppc64/mm/init.c 2003-07-13 20:34:43.000000000 -0700 +++ wli-2.6.0-test1-37/arch/ppc64/mm/init.c 2003-07-14 06:49:00.000000000 -0700 @@ -211,7 +211,7 @@ static void map_io_page(unsigned long ea if (mem_init_done) { spin_lock(&ioremap_mm.page_table_lock); pgdp = pgd_offset_i(ea); - pmdp = pmd_alloc(&ioremap_mm, pgdp, ea); + pmdp = pmd_alloc_kernel(&ioremap_mm, pgdp, ea); ptep = pte_alloc_kernel(&ioremap_mm, pmdp, ea); pa = absolute_to_phys(pa); @@ -253,7 +253,7 @@ flush_tlb_mm(struct mm_struct *mm) __flush_tlb_range(mm, mp->vm_start, mp->vm_end); /* XXX are there races with checking cpu_vm_mask? - Anton */ - mm->cpu_vm_mask = 0; + cpus_clear(mm->cpu_vm_mask); spin_unlock(&mm->page_table_lock); } @@ -270,6 +270,7 @@ flush_tlb_page(struct vm_area_struct *vm pte_t *ptep; pte_t pte; int local = 0; + cpumask_t tmp; switch( REGION_ID(vmaddr) ) { case VMALLOC_REGION_ID: @@ -283,7 +284,8 @@ flush_tlb_page(struct vm_area_struct *vm context = vma->vm_mm->context; /* XXX are there races with checking cpu_vm_mask? 
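
Several of the ppc64 flush paths above repeat one test: a flush may stay local when the mm has only ever been visible on the calling CPU. With a multi-word mask the old equality against (1 << cpu) becomes cpus_equal() against cpumask_of_cpu(). The test could be factored as the helper below; this is a sketch using only operations introduced by this patch, and mm_is_local() is an invented name, shown only to spell out the idiom.

	/* Sketch: true when only the current CPU has touched this mm, so a
	 * local TLB flush suffices.  The callers above hold
	 * mm->page_table_lock, which is what makes reading cpu_vm_mask
	 * safe here. */
	static inline int mm_is_local(struct mm_struct *mm)
	{
		cpumask_t tmp = cpumask_of_cpu(smp_processor_id());

		return cpus_equal(mm->cpu_vm_mask, tmp);
	}
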
- Anton */ - if (vma->vm_mm->cpu_vm_mask == (1 << smp_processor_id())) + tmp = cpumask_of_cpu(smp_processor_id()); + if (cpus_equal(vma->vm_mm->cpu_vm_mask, tmp)) local = 1; break; @@ -319,6 +321,7 @@ __flush_tlb_range(struct mm_struct *mm, struct ppc64_tlb_batch *batch = &ppc64_tlb_batch[smp_processor_id()]; unsigned long i = 0; int local = 0; + cpumask_t tmp; switch(REGION_ID(start)) { case VMALLOC_REGION_ID: @@ -332,7 +335,8 @@ __flush_tlb_range(struct mm_struct *mm, context = mm->context; /* XXX are there races with checking cpu_vm_mask? - Anton */ - if (mm->cpu_vm_mask == (1 << smp_processor_id())) + tmp = cpumask_of_cpu(smp_processor_id()); + if (cpus_equal(mm->cpu_vm_mask, tmp)) local = 1; break; @@ -698,6 +702,7 @@ void update_mmu_cache(struct vm_area_str void *pgdir; pte_t *ptep; int local = 0; + cpumask_t tmp; /* handle i-cache coherency */ if (!(cur_cpu_spec->cpu_features & CPU_FTR_NOEXECUTE)) { @@ -723,7 +728,8 @@ void update_mmu_cache(struct vm_area_str ptep = find_linux_pte(pgdir, ea); vsid = get_vsid(vma->vm_mm->context, ea); - if (vma->vm_mm->cpu_vm_mask == (1 << smp_processor_id())) + tmp = cpumask_of_cpu(smp_processor_id()); + if (cpus_equal(vma->vm_mm->cpu_vm_mask, tmp)) local = 1; __hash_page(ea, pte_val(pte) & (_PAGE_USER|_PAGE_RW), vsid, ptep, diff -prauN linux-2.6.0-test1/arch/s390/kernel/compat_exec.c wli-2.6.0-test1-37/arch/s390/kernel/compat_exec.c --- linux-2.6.0-test1/arch/s390/kernel/compat_exec.c 2003-07-13 20:37:32.000000000 -0700 +++ wli-2.6.0-test1-37/arch/s390/kernel/compat_exec.c 2003-07-14 07:33:22.000000000 -0700 @@ -81,7 +81,8 @@ int setup_arg_pages32(struct linux_binpr struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current,page,stack_base,PAGE_COPY); + put_dirty_page(current, mpnt, page, + stack_base, PAGE_COPY); } stack_base += PAGE_SIZE; } diff -prauN linux-2.6.0-test1/arch/s390/kernel/setup.c wli-2.6.0-test1-37/arch/s390/kernel/setup.c --- linux-2.6.0-test1/arch/s390/kernel/setup.c 2003-07-13 20:30:37.000000000 -0700 +++ wli-2.6.0-test1-37/arch/s390/kernel/setup.c 2003-07-14 06:45:57.000000000 -0700 @@ -53,7 +53,7 @@ struct { unsigned long addr, size, type; #define CHUNK_READ_WRITE 0 #define CHUNK_READ_ONLY 1 int cpus_initialized = 0; -unsigned long cpu_initialized = 0; +static cpumask_t cpu_initialized; volatile int __cpu_logical_map[NR_CPUS]; /* logical cpu to cpu address */ /* @@ -83,7 +83,7 @@ void __devinit cpu_init (void) int nr = smp_processor_id(); int addr = hard_smp_processor_id(); - if (test_and_set_bit(nr,&cpu_initialized)) { + if (cpu_test_and_set(nr,cpu_initialized)) { printk("CPU#%d ALREADY INITIALIZED!!!!!!!!!\n", nr); for (;;) local_irq_enable(); } @@ -562,7 +562,7 @@ static int show_cpuinfo(struct seq_file num_online_cpus(), loops_per_jiffy/(500000/HZ), (loops_per_jiffy/(5000/HZ))%100); } - if (cpu_online_map & (1 << n)) { + if (cpu_online(n)) { #ifdef CONFIG_SMP if (smp_processor_id() == n) cpuinfo = &S390_lowcore.cpu_data; diff -prauN linux-2.6.0-test1/arch/s390/kernel/smp.c wli-2.6.0-test1-37/arch/s390/kernel/smp.c --- linux-2.6.0-test1/arch/s390/kernel/smp.c 2003-07-13 20:28:55.000000000 -0700 +++ wli-2.6.0-test1-37/arch/s390/kernel/smp.c 2003-07-14 06:44:42.000000000 -0700 @@ -51,8 +51,8 @@ struct _lowcore *lowcore_ptr[NR_CPUS]; cycles_t cacheflush_time=0; int smp_threads_ready=0; /* Set when the idlers are all forked. 
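
cpu_test_and_set() atomically sets a CPU's bit in a mask and returns whether it was already set, so it doubles as a one-shot per-CPU initialization guard, as in the s390 cpu_init() conversion above. The shape of that idiom, as a sketch (my_cpu_init() is an invented name, not a kernel interface):

	static cpumask_t cpu_initialized;	/* mirrors the s390 guard above */

	void my_cpu_init(void)
	{
		int nr = smp_processor_id();

		if (cpu_test_and_set(nr, cpu_initialized)) {
			printk("CPU#%d already initialized!\n", nr);
			return;
		}
		/* first (and only) initialization for this cpu goes here */
	}
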
*/ -volatile unsigned long cpu_online_map; -volatile unsigned long cpu_possible_map; +cpumask_t cpu_online_map; +cpumask_t cpu_possible_map; unsigned long cache_decay_ticks = 0; /* @@ -200,14 +200,14 @@ void smp_send_stop(void) /* * Reboot, halt and power_off routines for SMP. */ -static volatile unsigned long cpu_restart_map; +static cpumask_t cpu_restart_map; static void do_machine_restart(void * __unused) { - clear_bit(smp_processor_id(), &cpu_restart_map); + cpu_clear(smp_processor_id(), cpu_restart_map); if (smp_processor_id() == 0) { /* Wait for all other cpus to enter do_machine_restart. */ - while (cpu_restart_map != 0); + while (!cpus_empty(cpu_restart_map)); /* Store status of other cpus. */ do_store_status(); /* @@ -427,7 +427,7 @@ void __init smp_check_cpus(unsigned int if (signal_processor(num_cpus, sigp_sense) == sigp_not_operational) continue; - set_bit(num_cpus, &cpu_possible_map); + cpu_set(num_cpus, cpu_possible_map); num_cpus++; } printk("Detected %d CPU's\n",(int) num_cpus); @@ -452,7 +452,7 @@ int __devinit start_secondary(void *cpuv pfault_init(); #endif /* Mark this cpu as online */ - set_bit(smp_processor_id(), &cpu_online_map); + cpu_set(smp_processor_id(), cpu_online_map); /* Switch on interrupts */ local_irq_enable(); /* Print info about this processor */ @@ -558,8 +558,8 @@ void __init smp_prepare_cpus(unsigned in void __devinit smp_prepare_boot_cpu(void) { - set_bit(smp_processor_id(), &cpu_online_map); - set_bit(smp_processor_id(), &cpu_possible_map); + cpu_set(smp_processor_id(), cpu_online_map); + cpu_set(smp_processor_id(), cpu_possible_map); } void smp_cpus_done(unsigned int max_cpus) diff -prauN linux-2.6.0-test1/arch/s390/mm/ioremap.c wli-2.6.0-test1-37/arch/s390/mm/ioremap.c --- linux-2.6.0-test1/arch/s390/mm/ioremap.c 2003-07-13 20:30:47.000000000 -0700 +++ wli-2.6.0-test1-37/arch/s390/mm/ioremap.c 2003-07-14 06:49:00.000000000 -0700 @@ -83,7 +83,7 @@ static int remap_area_pages(unsigned lon spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_kernel(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.6.0-test1/arch/sh/mm/ioremap.c wli-2.6.0-test1-37/arch/sh/mm/ioremap.c --- linux-2.6.0-test1/arch/sh/mm/ioremap.c 2003-07-13 20:35:12.000000000 -0700 +++ wli-2.6.0-test1-37/arch/sh/mm/ioremap.c 2003-07-14 06:49:00.000000000 -0700 @@ -45,7 +45,7 @@ static inline void remap_area_pte(pte_t } while (address && (address < end)); } -static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, +static inline int remap_area_pmd(pgd_t *pgd, pmd_t * pmd, unsigned long address, unsigned long size, unsigned long phys_addr, unsigned long flags) { unsigned long end; @@ -83,11 +83,11 @@ int remap_area_pages(unsigned long addre spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_map(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; - if (remap_area_pmd(pmd, address, end - address, + if (remap_area_pmd(dir, pmd, address, end - address, phys_addr + address, flags)) break; error = 0; diff -prauN linux-2.6.0-test1/arch/sparc/mm/generic.c wli-2.6.0-test1-37/arch/sparc/mm/generic.c --- linux-2.6.0-test1/arch/sparc/mm/generic.c 2003-07-13 20:37:58.000000000 -0700 +++ wli-2.6.0-test1-37/arch/sparc/mm/generic.c 2003-07-14 06:49:00.000000000 -0700 @@ -67,7 +67,7 @@ static inline void io_remap_pte_range(pt } while (address < end); } -static inline int io_remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned 
long size, +static inline int io_remap_pmd_range(pgd_t *pgd, pmd_t * pmd, unsigned long address, unsigned long size, unsigned long offset, pgprot_t prot, int space) { unsigned long end; @@ -78,7 +78,7 @@ static inline int io_remap_pmd_range(pmd end = PGDIR_SIZE; offset -= address; do { - pte_t * pte = pte_alloc_map(current->mm, pmd, address); + pte_t * pte = pte_alloc_map(current->mm, pgd, &pmd, address); if (!pte) return -ENOMEM; io_remap_pte_range(pte, address, end - address, address + offset, prot, space); @@ -103,11 +103,11 @@ int io_remap_page_range(struct vm_area_s spin_lock(&mm->page_table_lock); while (from < end) { - pmd_t *pmd = pmd_alloc(current->mm, dir, from); + pmd_t *pmd = pmd_alloc_map(current->mm, dir, from); error = -ENOMEM; if (!pmd) break; - error = io_remap_pmd_range(pmd, from, end - from, offset + from, prot, space); + error = io_remap_pmd_range(pgd, pmd, from, end - from, offset + from, prot, space); if (error) break; from = (from + PGDIR_SIZE) & PGDIR_MASK; diff -prauN linux-2.6.0-test1/arch/sparc/mm/srmmu.c wli-2.6.0-test1-37/arch/sparc/mm/srmmu.c --- linux-2.6.0-test1/arch/sparc/mm/srmmu.c 2003-07-13 20:34:42.000000000 -0700 +++ wli-2.6.0-test1-37/arch/sparc/mm/srmmu.c 2003-07-14 06:49:00.000000000 -0700 @@ -2188,7 +2188,7 @@ void __init ld_mmu_srmmu(void) BTFIXUPSET_CALL(pte_pfn, srmmu_pte_pfn, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(pmd_page, srmmu_pmd_page, BTFIXUPCALL_NORM); - BTFIXUPSET_CALL(pgd_page, srmmu_pgd_page, BTFIXUPCALL_NORM); + BTFIXUPSET_CALL(__pgd_page, srmmu_pgd_page, BTFIXUPCALL_NORM); BTFIXUPSET_SETHI(none_mask, 0xF0000000); @@ -2220,7 +2220,7 @@ void __init ld_mmu_srmmu(void) BTFIXUPSET_CALL(pte_alloc_one_kernel, srmmu_pte_alloc_one_kernel, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(pte_alloc_one, srmmu_pte_alloc_one, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(free_pmd_fast, srmmu_pmd_free, BTFIXUPCALL_NORM); - BTFIXUPSET_CALL(pmd_alloc_one, srmmu_pmd_alloc_one, BTFIXUPCALL_NORM); + BTFIXUPSET_CALL(__pmd_alloc_one, srmmu_pmd_alloc_one, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(free_pgd_fast, srmmu_free_pgd_fast, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(get_pgd_fast, srmmu_get_pgd_fast, BTFIXUPCALL_NORM); diff -prauN linux-2.6.0-test1/arch/sparc/mm/sun4c.c wli-2.6.0-test1-37/arch/sparc/mm/sun4c.c --- linux-2.6.0-test1/arch/sparc/mm/sun4c.c 2003-07-13 20:36:43.000000000 -0700 +++ wli-2.6.0-test1-37/arch/sparc/mm/sun4c.c 2003-07-14 06:49:00.000000000 -0700 @@ -2211,7 +2211,7 @@ void __init ld_mmu_sun4c(void) BTFIXUPSET_CALL(pte_alloc_one_kernel, sun4c_pte_alloc_one_kernel, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(pte_alloc_one, sun4c_pte_alloc_one, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(free_pmd_fast, sun4c_free_pmd_fast, BTFIXUPCALL_NOP); - BTFIXUPSET_CALL(pmd_alloc_one, sun4c_pmd_alloc_one, BTFIXUPCALL_RETO0); + BTFIXUPSET_CALL(__pmd_alloc_one, sun4c_pmd_alloc_one, BTFIXUPCALL_RETO0); BTFIXUPSET_CALL(free_pgd_fast, sun4c_free_pgd_fast, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(get_pgd_fast, sun4c_get_pgd_fast, BTFIXUPCALL_NORM); @@ -2252,5 +2252,5 @@ void __init ld_mmu_sun4c(void) /* These should _never_ get called with two level tables. 
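
The remap conversions above also change the page-table walking contract: pte_alloc_map() now takes the pgd and a pmd_t ** so it can re-establish the pmd mapping behind the caller's back, and every successfully mapped pmd must be released with pmd_unmap(). A sketch of a full walk under the new convention, using only calls that appear in this patch; map_one_page() is an invented name and the error handling is abbreviated.

	/* Sketch: establish one pte under the highpmd calling convention. */
	static int map_one_page(struct mm_struct *mm, unsigned long addr)
	{
		pgd_t *pgd = pgd_offset(mm, addr);
		pmd_t *pmd;
		pte_t *pte;

		spin_lock(&mm->page_table_lock);
		pmd = pmd_alloc_map(mm, pgd, addr);
		if (!pmd)
			goto fail;
		pte = pte_alloc_map(mm, pgd, &pmd, addr);
		if (!pte) {
			pmd_unmap(pmd);		/* pmd is still mapped on failure */
			goto fail;
		}
		/* ... set_pte(pte, ...) ... */
		pte_unmap(pte);
		pmd_unmap(pmd);			/* pair every pmd_alloc_map() */
		spin_unlock(&mm->page_table_lock);
		return 0;
	fail:
		spin_unlock(&mm->page_table_lock);
		return -ENOMEM;
	}

Kernel-only mappings take the separate pmd_alloc_kernel() path seen in the ioremap hunks, which does not return a mapped pmd.
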
*/ BTFIXUPSET_CALL(pgd_set, sun4c_pgd_set, BTFIXUPCALL_NOP); - BTFIXUPSET_CALL(pgd_page, sun4c_pgd_page, BTFIXUPCALL_RETO0); + BTFIXUPSET_CALL(__pgd_page, sun4c_pgd_page, BTFIXUPCALL_RETO0); } diff -prauN linux-2.6.0-test1/arch/sparc64/kernel/irq.c wli-2.6.0-test1-37/arch/sparc64/kernel/irq.c --- linux-2.6.0-test1/arch/sparc64/kernel/irq.c 2003-07-13 20:38:05.000000000 -0700 +++ wli-2.6.0-test1-37/arch/sparc64/kernel/irq.c 2003-07-14 06:43:52.000000000 -0700 @@ -110,6 +110,10 @@ static void register_irq_proc (unsigned action->flags |= __irq_ino(irq) << 48; #define get_ino_in_irqaction(action) (action->flags >> 48) +#if NR_CPUS > 64 +#error irqaction embedded smp affinity does not work with > 64 cpus, FIXME +#endif + #define put_smpaff_in_irqaction(action, smpaff) (action)->mask = (smpaff) #define get_smpaff_in_irqaction(action) ((action)->mask) @@ -658,11 +662,11 @@ static inline void redirect_intr(int cpu * Just Do It. */ struct irqaction *ap = bp->irq_info; - unsigned long cpu_mask = get_smpaff_in_irqaction(ap); + cpumask_t cpu_mask = { .mask[0] = get_smpaff_in_irqaction(ap) }; unsigned int buddy, ticks; - cpu_mask &= cpu_online_map; - if (cpu_mask == 0) + cpus_and(cpu_mask, cpu_mask, cpu_online_map); + if (cpus_empty(cpu_mask)) cpu_mask = cpu_online_map; if (this_is_starfire != 0 || @@ -677,7 +681,7 @@ static inline void redirect_intr(int cpu buddy = 0; ticks = 0; - while ((cpu_mask & (1UL << buddy)) == 0) { + while (!cpu_isset(buddy, cpu_mask)) { if (++buddy >= NR_CPUS) buddy = 0; if (++ticks > NR_CPUS) { diff -prauN linux-2.6.0-test1/arch/sparc64/kernel/smp.c wli-2.6.0-test1-37/arch/sparc64/kernel/smp.c --- linux-2.6.0-test1/arch/sparc64/kernel/smp.c 2003-07-13 20:28:53.000000000 -0700 +++ wli-2.6.0-test1-37/arch/sparc64/kernel/smp.c 2003-07-14 08:52:52.000000000 -0700 @@ -46,12 +46,11 @@ cpuinfo_sparc cpu_data[NR_CPUS]; /* Please don't make this stuff initdata!!! 
--DaveM */ static unsigned char boot_cpu_id; -atomic_t sparc64_num_cpus_online = ATOMIC_INIT(0); -unsigned long cpu_online_map = 0; +cpumask_t cpu_online_map = CPU_MASK_NONE; atomic_t sparc64_num_cpus_possible = ATOMIC_INIT(0); -unsigned long phys_cpu_present_map = 0; -static unsigned long smp_commenced_mask; -static unsigned long cpu_callout_map; +cpumask_t phys_cpu_present_map = CPU_MASK_NONE; +static cpumask_t smp_commenced_mask; +static cpumask_t cpu_callout_map; void smp_info(struct seq_file *m) { @@ -151,11 +150,10 @@ void __init smp_callin(void) atomic_inc(&init_mm.mm_count); current->active_mm = &init_mm; - while (!test_bit(cpuid, &smp_commenced_mask)) + while (!cpu_isset(cpuid, smp_commenced_mask)) membar("#LoadLoad"); - set_bit(cpuid, &cpu_online_map); - atomic_inc(&sparc64_num_cpus_online); + cpu_set(cpuid, cpu_online_map); } void cpu_panic(void) @@ -334,7 +332,7 @@ static int __devinit smp_boot_one_cpu(un if (linux_cpus[no].mid == cpu) break; cpu_new_thread = p->thread_info; - set_bit(cpu, &cpu_callout_map); + cpu_set(cpu, cpu_callout_map); prom_startcpu(linux_cpus[no].prom_node, entry, cookie); for (timeout = 0; timeout < 5000000; timeout++) { if (callin_flag) @@ -346,7 +344,7 @@ static int __devinit smp_boot_one_cpu(un ret = 0; } else { printk("Processor %d is stuck.\n", cpu); - clear_bit(cpu, &cpu_callout_map); + cpu_clear(cpu, cpu_callout_map); ret = -ENODEV; } cpu_new_thread = NULL; @@ -420,17 +418,17 @@ again: } } -static __inline__ void spitfire_xcall_deliver(u64 data0, u64 data1, u64 data2, unsigned long mask) +static __inline__ void spitfire_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mask) { u64 pstate; int i; __asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate)); for (i = 0; i < NR_CPUS; i++) { - if (mask & (1UL << i)) { + if (cpu_isset(i, mask)) { spitfire_xcall_helper(data0, data1, data2, pstate, i); - mask &= ~(1UL << i); - if (!mask) + cpu_clear(i, mask); + if (cpus_empty(mask)) break; } } @@ -443,12 +441,12 @@ static __inline__ void spitfire_xcall_de #if NR_CPUS > 32 #error Fixup cheetah_xcall_deliver Dave... #endif -static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, unsigned long mask) +static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mask) { u64 pstate; int nack_busy_id; - if (!mask) + if (cpus_empty(mask)) return; __asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate)); @@ -469,11 +467,11 @@ retry: nack_busy_id = 0; { - unsigned long work_mask = mask; + cpumask_t work_mask = mask; int i; for (i = 0; i < NR_CPUS; i++) { - if (work_mask & (1UL << i)) { + if (cpu_isset(i, work_mask)) { u64 target = (i << 14) | 0x70; target |= (nack_busy_id++ << 24); @@ -482,8 +480,8 @@ retry: "membar #Sync\n\t" : /* no outputs */ : "r" (target), "i" (ASI_INTR_W)); - work_mask &= ~(1UL << i); - if (!work_mask) + cpu_clear(i, work_mask); + if (cpus_empty(work_mask)) break; } } @@ -518,7 +516,7 @@ retry: printk("CPU[%d]: mondo stuckage result[%016lx]\n", smp_processor_id(), dispatch_stat); } else { - unsigned long work_mask = mask; + cpumask_t work_mask = mask; int i, this_busy_nack = 0; /* Delay some random time with interrupts enabled @@ -530,13 +528,13 @@ retry: * NACK us. 
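
The xcall delivery loops above all follow one pattern: scan CPU ids, clear each visited CPU out of a scratch copy of the mask, and stop as soon as the copy drains rather than always walking all NR_CPUS bits. A single-word user-space model of that early-exit scan (cpumask_t collapsed to one unsigned long purely for brevity):

	#include <stdio.h>

	#define NR_CPUS 64

	static int mask_test(unsigned long m, int cpu) { return (m >> cpu) & 1; }

	int main(void)
	{
		unsigned long mask = 0x13;	/* CPUs 0, 1 and 4 */
		int i;

		for (i = 0; i < NR_CPUS; i++) {
			if (mask_test(mask, i)) {
				printf("deliver to cpu %d\n", i);
				mask &= ~(1UL << i);	/* like cpu_clear() */
				if (!mask)		/* like cpus_empty() */
					break;		/* stop early, mask drained */
			}
		}
		return 0;
	}
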
*/ for (i = 0; i < NR_CPUS; i++) { - if (work_mask & (1UL << i)) { + if (cpu_isset(i, work_mask)) { if ((dispatch_stat & (0x2 << this_busy_nack)) == 0) - mask &= ~(1UL << i); + cpu_clear(i, mask); this_busy_nack += 2; - work_mask &= ~(1UL << i); - if (!work_mask) + cpu_clear(i, work_mask); + if (cpus_empty(work_mask)) break; } } @@ -549,12 +547,12 @@ retry: /* Send cross call to all processors mentioned in MASK * except self. */ -static void smp_cross_call_masked(unsigned long *func, u32 ctx, u64 data1, u64 data2, unsigned long mask) +static void smp_cross_call_masked(unsigned long *func, u32 ctx, u64 data1, u64 data2, cpumask_t mask) { u64 data0 = (((u64)ctx)<<32 | (((u64)func) & 0xffffffff)); - mask &= cpu_online_map; - mask &= ~(1UL< PAGE_SIZE) __flush_dcache_page(page->virtual, ((tlb_type == spitfire) && - page->mapping != NULL)); + page_mapping(page) != NULL)); #else - if (page->mapping != NULL && + if (page_mapping(page) != NULL && tlb_type == spitfire) __flush_icache_page(__pa(page->virtual)); #endif @@ -685,20 +684,20 @@ static __inline__ void __local_flush_dca void smp_flush_dcache_page_impl(struct page *page, int cpu) { - unsigned long mask = 1UL << cpu; + cpumask_t mask = cpumask_of_cpu(cpu); #ifdef CONFIG_DEBUG_DCFLUSH atomic_inc(&dcpage_flushes); #endif if (cpu == smp_processor_id()) { __local_flush_dcache_page(page); - } else if ((cpu_online_map & mask) != 0) { + } else if (cpu_online(cpu)) { u64 data0; if (tlb_type == spitfire) { data0 = ((u64)&xcall_flush_dcache_page_spitfire); - if (page->mapping != NULL) + if (page_mapping(page) != NULL) data0 |= ((u64)1 << 32); spitfire_xcall_deliver(data0, __pa(page->virtual), @@ -719,17 +718,18 @@ void smp_flush_dcache_page_impl(struct p void flush_dcache_page_all(struct mm_struct *mm, struct page *page) { - unsigned long mask = cpu_online_map & ~(1UL << smp_processor_id()); + cpumask_t mask = cpu_online_map; + cpu_clear(smp_processor_id(), mask); u64 data0; #ifdef CONFIG_DEBUG_DCFLUSH atomic_inc(&dcpage_flushes); #endif - if (mask == 0UL) + if (cpus_empty(mask)) goto flush_self; if (tlb_type == spitfire) { data0 = ((u64)&xcall_flush_dcache_page_spitfire); - if (page->mapping != NULL) + if (page_mapping(page) != NULL) data0 |= ((u64)1 << 32); spitfire_xcall_deliver(data0, __pa(page->virtual), @@ -750,9 +750,9 @@ void flush_dcache_page_all(struct mm_str void smp_receive_signal(int cpu) { - unsigned long mask = 1UL << cpu; + cpumask_t mask = cpumask_of_cpu(cpu); - if ((cpu_online_map & mask) != 0) { + if (cpu_online(cpu)) { u64 data0 = (((u64)&xcall_receive_signal) & 0xffffffff); if (tlb_type == spitfire) @@ -854,7 +854,7 @@ void smp_flush_tlb_mm(struct mm_struct * if (atomic_read(&mm->mm_users) == 1) { /* See smp_flush_tlb_page for info about this. */ - mm->cpu_vm_mask = (1UL << cpu); + mm->cpu_vm_mask = cpumask_of_cpu(cpu); goto local_flush_and_out; } @@ -877,7 +877,7 @@ void smp_flush_tlb_range(struct mm_struc end = PAGE_ALIGN(end); if (mm == current->active_mm && atomic_read(&mm->mm_users) == 1) { - mm->cpu_vm_mask = (1UL << cpu); + mm->cpu_vm_mask = cpumask_of_cpu(cpu); goto local_flush_and_out; } @@ -921,14 +921,16 @@ void smp_flush_tlb_page(struct mm_struct * is almost certain that all TLB entries for this * context will be replaced by the time that happens. 
*/ - mm->cpu_vm_mask = (1UL << cpu); + mm->cpu_vm_mask = cpumask_of_cpu(cpu); goto local_flush_and_out; } else { + cpumask_t this_cpu_mask = cpumask_of_cpu(cpu); + /* By virtue of running under the mm->page_table_lock, * and mmu_context.h:switch_mm doing the same, the * following operation is safe. */ - if (mm->cpu_vm_mask == (1UL << cpu)) + if (cpus_equal(mm->cpu_vm_mask, this_cpu_mask)) goto local_flush_and_out; } @@ -939,7 +941,7 @@ void smp_flush_tlb_page(struct mm_struct smp_cross_call_masked(&xcall_flush_tlb_page, ctx, page, 0, mm->cpu_vm_mask); - if (!(mm->cpu_vm_mask & (1UL << cpu))) + if (!cpu_isset(cpu, mm->cpu_vm_mask)) return; local_flush_and_out: @@ -1122,8 +1124,7 @@ void __init smp_tick_init(void) prom_halt(); } - atomic_inc(&sparc64_num_cpus_online); - set_bit(boot_cpu_id, &cpu_online_map); + cpu_set(boot_cpu_id, cpu_online_map); prom_cpu_nodes[boot_cpu_id] = linux_cpus[0].prom_node; prof_counter(boot_cpu_id) = prof_multiplier(boot_cpu_id) = 1; } @@ -1241,16 +1242,14 @@ void __init smp_prepare_cpus(unsigned in for (i = 0; i < linux_num_cpus; i++) { if (linux_cpus[i].mid < max_cpus) { - set_bit(linux_cpus[i].mid, - &phys_cpu_present_map); + cpu_set(linux_cpus[i].mid, phys_cpu_present_map); atomic_inc(&sparc64_num_cpus_possible); } } if (atomic_read(&sparc64_num_cpus_possible) > max_cpus) { for (i = linux_num_cpus - 1; i >= 0; i--) { if (linux_cpus[i].mid != boot_cpu_id) { - clear_bit(linux_cpus[i].mid, - &phys_cpu_present_map); + cpu_clear(linux_cpus[i].mid, phys_cpu_present_map); atomic_dec(&sparc64_num_cpus_possible); if (atomic_read(&sparc64_num_cpus_possible) <= max_cpus) break; @@ -1263,8 +1262,8 @@ void __init smp_prepare_cpus(unsigned in void __devinit smp_prepare_boot_cpu(void) { - set_bit(smp_processor_id(), &cpu_online_map); - set_bit(smp_processor_id(), &phys_cpu_present_map); + cpu_set(smp_processor_id(), cpu_online_map); + cpu_set(smp_processor_id(), phys_cpu_present_map); } int __devinit __cpu_up(unsigned int cpu) @@ -1272,10 +1271,10 @@ int __devinit __cpu_up(unsigned int cpu) int ret = smp_boot_one_cpu(cpu); if (!ret) { - set_bit(cpu, &smp_commenced_mask); - while (!test_bit(cpu, &cpu_online_map)) + cpu_set(cpu, smp_commenced_mask); + while (!cpu_isset(cpu, cpu_online_map)) mb(); - if (!test_bit(cpu, &cpu_online_map)) { + if (!cpu_isset(cpu, cpu_online_map)) { ret = -ENODEV; } else { smp_synchronize_one_tick(cpu); diff -prauN linux-2.6.0-test1/arch/sparc64/kernel/sparc64_ksyms.c wli-2.6.0-test1-37/arch/sparc64/kernel/sparc64_ksyms.c --- linux-2.6.0-test1/arch/sparc64/kernel/sparc64_ksyms.c 2003-07-13 20:37:22.000000000 -0700 +++ wli-2.6.0-test1-37/arch/sparc64/kernel/sparc64_ksyms.c 2003-07-14 06:43:52.000000000 -0700 @@ -147,7 +147,6 @@ EXPORT_SYMBOL(cpu_data); /* CPU online map and active count. 
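
The __cpu_up()/smp_callin() pair above is a two-way handshake over two masks: the boot CPU publishes the commence bit and polls for the online bit, while the new CPU does the mirror image. Sketched below with invented function names, keeping the same barriers the sparc64 code uses; this is not standalone code.

	/* Boot CPU side: release the new cpu, wait for it to log in. */
	int my_cpu_up(unsigned int cpu)
	{
		cpu_set(cpu, smp_commenced_mask);
		while (!cpu_isset(cpu, cpu_online_map))
			mb();				/* refetch the mask */
		return 0;
	}

	/* New CPU side: wait for permission, then declare ourselves online. */
	void my_callin(int cpuid)
	{
		while (!cpu_isset(cpuid, smp_commenced_mask))
			membar("#LoadLoad");
		cpu_set(cpuid, cpu_online_map);
	}
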
*/ EXPORT_SYMBOL(cpu_online_map); -EXPORT_SYMBOL(sparc64_num_cpus_online); EXPORT_SYMBOL(phys_cpu_present_map); EXPORT_SYMBOL(sparc64_num_cpus_possible); diff -prauN linux-2.6.0-test1/arch/sparc64/kernel/us2e_cpufreq.c wli-2.6.0-test1-37/arch/sparc64/kernel/us2e_cpufreq.c --- linux-2.6.0-test1/arch/sparc64/kernel/us2e_cpufreq.c 2003-07-13 20:36:38.000000000 -0700 +++ wli-2.6.0-test1-37/arch/sparc64/kernel/us2e_cpufreq.c 2003-07-14 06:43:52.000000000 -0700 @@ -232,15 +232,16 @@ static unsigned long estar_to_divisor(un static void us2e_set_cpu_divider_index(unsigned int cpu, unsigned int index) { - unsigned long new_bits, new_freq, cpus_allowed; + unsigned long new_bits, new_freq; unsigned long clock_tick, divisor, old_divisor, estar; + cpumask_t cpus_allowed; struct cpufreq_freqs freqs; if (!cpu_online(cpu)) return; cpus_allowed = current->cpus_allowed; - set_cpus_allowed(current, (1UL << cpu)); + set_cpus_allowed(current, cpumask_of_cpu(cpu)); new_freq = clock_tick = sparc64_get_clock_tick(cpu); new_bits = index_to_estar_mode(index); diff -prauN linux-2.6.0-test1/arch/sparc64/kernel/us3_cpufreq.c wli-2.6.0-test1-37/arch/sparc64/kernel/us3_cpufreq.c --- linux-2.6.0-test1/arch/sparc64/kernel/us3_cpufreq.c 2003-07-13 20:29:11.000000000 -0700 +++ wli-2.6.0-test1-37/arch/sparc64/kernel/us3_cpufreq.c 2003-07-14 06:43:52.000000000 -0700 @@ -78,14 +78,15 @@ static unsigned long get_current_freq(un static void us3_set_cpu_divider_index(unsigned int cpu, unsigned int index) { - unsigned long new_bits, new_freq, reg, cpus_allowed; + unsigned long new_bits, new_freq, reg; + cpumask_t cpus_allowed; struct cpufreq_freqs freqs; if (!cpu_online(cpu)) return; cpus_allowed = current->cpus_allowed; - set_cpus_allowed(current, (1UL << cpu)); + set_cpus_allowed(current, cpumask_of_cpu(cpu)); new_freq = sparc64_get_clock_tick(cpu); switch (index) { diff -prauN linux-2.6.0-test1/arch/sparc64/mm/generic.c wli-2.6.0-test1-37/arch/sparc64/mm/generic.c --- linux-2.6.0-test1/arch/sparc64/mm/generic.c 2003-07-13 20:37:22.000000000 -0700 +++ wli-2.6.0-test1-37/arch/sparc64/mm/generic.c 2003-07-14 06:49:00.000000000 -0700 @@ -85,7 +85,7 @@ static inline void io_remap_pte_range(pt } while (address < end); } -static inline int io_remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, +static inline int io_remap_pmd_range(pgd_t *pgd, pmd_t * pmd, unsigned long address, unsigned long size, unsigned long offset, pgprot_t prot, int space) { unsigned long end; @@ -96,7 +96,7 @@ static inline int io_remap_pmd_range(pmd end = PGDIR_SIZE; offset -= address; do { - pte_t * pte = pte_alloc_map(current->mm, pmd, address); + pte_t * pte = pte_alloc_map(current->mm, pgd, &pmd, address); if (!pte) return -ENOMEM; io_remap_pte_range(pte, address, end - address, address + offset, prot, space); @@ -122,11 +122,11 @@ int io_remap_page_range(struct vm_area_s spin_lock(&mm->page_table_lock); while (from < end) { - pmd_t *pmd = pmd_alloc(current->mm, dir, from); + pmd_t *pmd = pmd_alloc_map(current->mm, dir, from); error = -ENOMEM; if (!pmd) break; - error = io_remap_pmd_range(pmd, from, end - from, offset + from, prot, space); + error = io_remap_pmd_range(pgd, pmd, from, end - from, offset + from, prot, space); if (error) break; from = (from + PGDIR_SIZE) & PGDIR_MASK; diff -prauN linux-2.6.0-test1/arch/sparc64/mm/hugetlbpage.c wli-2.6.0-test1-37/arch/sparc64/mm/hugetlbpage.c --- linux-2.6.0-test1/arch/sparc64/mm/hugetlbpage.c 2003-07-13 20:35:56.000000000 -0700 +++ wli-2.6.0-test1-37/arch/sparc64/mm/hugetlbpage.c 2003-07-14 
08:52:52.000000000 -0700 @@ -74,8 +74,8 @@ static struct page *alloc_hugetlb_page(v static void free_hugetlb_page(struct page *page) { spin_lock(&htlbpage_lock); - if ((page->mapping != NULL) && (page_count(page) == 2)) { - struct inode *inode = page->mapping->host; + if ((page_mapping(page) != NULL) && (page_count(page) == 2)) { + struct inode *inode = page_mapping(page)->host; int i; ClearPageDirty(page); @@ -107,9 +107,11 @@ static pte_t *huge_pte_alloc_map(struct pgd = pgd_offset(mm, addr); if (pgd) { - pmd = pmd_alloc(mm, pgd, addr); - if (pmd) - pte = pte_alloc_map(mm, pmd, addr); + pmd = pmd_alloc_map(mm, pgd, addr); + if (pmd) { + pte = pte_alloc_map(mm, pgd, &pmd, addr); + pmd_unmap(pmd); + } } return pte; } @@ -122,9 +124,11 @@ static pte_t *huge_pte_offset_map(struct pgd = pgd_offset(mm, addr); if (pgd) { - pmd = pmd_offset(pgd, addr); - if (pmd) + pmd = pmd_offset_map(pgd, addr); + if (pmd) { pte = pte_offset_map(pmd, addr); + pmd_unmap(pmd); + } } return pte; } diff -prauN linux-2.6.0-test1/arch/sparc64/mm/init.c wli-2.6.0-test1-37/arch/sparc64/mm/init.c --- linux-2.6.0-test1/arch/sparc64/mm/init.c 2003-07-13 20:32:44.000000000 -0700 +++ wli-2.6.0-test1-37/arch/sparc64/mm/init.c 2003-07-14 08:52:52.000000000 -0700 @@ -129,9 +129,9 @@ __inline__ void flush_dcache_page_impl(s #if (L1DCACHE_SIZE > PAGE_SIZE) __flush_dcache_page(page->virtual, ((tlb_type == spitfire) && - page->mapping != NULL)); + page_mapping(page) != NULL)); #else - if (page->mapping != NULL && + if (page_mapping(page) != NULL && tlb_type == spitfire) __flush_icache_page(__pa(page->virtual)); #endif @@ -193,7 +193,7 @@ void update_mmu_cache(struct vm_area_str pfn = pte_pfn(pte); if (pfn_valid(pfn) && - (page = pfn_to_page(pfn), page->mapping) && + (page = pfn_to_page(pfn), page_mapping(page)) && ((pg_flags = page->flags) & (1UL << PG_dcache_dirty))) { int cpu = ((pg_flags >> 24) & (NR_CPUS - 1UL)); @@ -217,9 +217,9 @@ void flush_dcache_page(struct page *page int dirty = test_bit(PG_dcache_dirty, &page->flags); int dirty_cpu = dcache_dirty_cpu(page); - if (page->mapping && - list_empty(&page->mapping->i_mmap) && - list_empty(&page->mapping->i_mmap_shared)) { + if (page_mapping(page) && + list_empty(&page_mapping(page)->i_mmap) && + list_empty(&page_mapping(page)->i_mmap_shared)) { if (dirty) { if (dirty_cpu == smp_processor_id()) return; @@ -227,7 +227,7 @@ void flush_dcache_page(struct page *page } set_dcache_dirty(page); } else { - /* We could delay the flush for the !page->mapping + /* We could delay the flush for the !page_mapping(page) * case too. But that case is for exec env/arg * pages and those are %99 certainly going to get * faulted into the tlb (and thus flushed) anyways. 
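
The cpufreq helpers above briefly pin the calling task to the CPU being reprogrammed. With cpumask_t the save/pin/restore sequence looks like the following sketch; on_one_cpu() is an invented name, and the calls shown are the ones the us2e/us3 drivers use.

	/* Sketch: remember the caller's mask, bind to the target cpu,
	 * do the per-cpu work, then put the old mask back. */
	static void on_one_cpu(unsigned int cpu)
	{
		cpumask_t cpus_allowed = current->cpus_allowed;

		set_cpus_allowed(current, cpumask_of_cpu(cpu));
		/* ... work that must execute on 'cpu' ... */
		set_cpus_allowed(current, cpus_allowed);
	}
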
@@ -269,7 +269,7 @@ static inline void flush_cache_pte_range if (!pfn_valid(pfn)) continue; page = pfn_to_page(pfn); - if (PageReserved(page) || !page->mapping) + if (PageReserved(page) || !page_mapping(page)) continue; pgaddr = (unsigned long) page_address(page); uaddr = address + offset; diff -prauN linux-2.6.0-test1/arch/sparc64/mm/ultra.S wli-2.6.0-test1-37/arch/sparc64/mm/ultra.S --- linux-2.6.0-test1/arch/sparc64/mm/ultra.S 2003-07-13 20:36:38.000000000 -0700 +++ wli-2.6.0-test1-37/arch/sparc64/mm/ultra.S 2003-07-14 08:52:52.000000000 -0700 @@ -615,7 +615,7 @@ xcall_flush_dcache_page_cheetah: /* %g1 .globl xcall_flush_dcache_page_spitfire xcall_flush_dcache_page_spitfire: /* %g1 == physical page address %g7 == kernel page virtual address - %g5 == (page->mapping != NULL) */ + %g5 == (page_mapping(page) != NULL) */ #if (L1DCACHE_SIZE > PAGE_SIZE) srlx %g1, (13 - 2), %g1 ! Form tag comparitor sethi %hi(L1DCACHE_SIZE), %g3 ! D$ size == 16K diff -prauN linux-2.6.0-test1/arch/sparc64/solaris/ioctl.c wli-2.6.0-test1-37/arch/sparc64/solaris/ioctl.c --- linux-2.6.0-test1/arch/sparc64/solaris/ioctl.c 2003-07-13 20:34:42.000000000 -0700 +++ wli-2.6.0-test1-37/arch/sparc64/solaris/ioctl.c 2003-07-14 09:45:14.000000000 -0700 @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -292,15 +293,15 @@ static inline int solaris_sockmod(unsign { struct inode *ino; /* I wonder which of these tests are superfluous... --patrik */ - spin_lock(&current->files->file_lock); + rcu_read_lock(); if (! current->files->fd[fd] || ! current->files->fd[fd]->f_dentry || ! (ino = current->files->fd[fd]->f_dentry->d_inode) || ! ino->i_sock) { - spin_unlock(&current->files->file_lock); + rcu_read_unlock(); return TBADF; } - spin_unlock(&current->files->file_lock); + rcu_read_unlock(); switch (cmd & 0xff) { case 109: /* SI_SOCKPARAMS */ diff -prauN linux-2.6.0-test1/arch/sparc64/solaris/timod.c wli-2.6.0-test1-37/arch/sparc64/solaris/timod.c --- linux-2.6.0-test1/arch/sparc64/solaris/timod.c 2003-07-13 20:33:11.000000000 -0700 +++ wli-2.6.0-test1-37/arch/sparc64/solaris/timod.c 2003-07-14 09:45:14.000000000 -0700 @@ -145,9 +145,14 @@ static struct T_primsg *timod_mkctl(int static void timod_wake_socket(unsigned int fd) { struct socket *sock; + struct file *filp; SOLD("wakeing socket"); - sock = SOCKET_I(current->files->fd[fd]->f_dentry->d_inode); + if (!( filp = fcheck(fd))) { + SOLD("BAD FD"); + return; + } + sock = SOCKET_I(filp->f_dentry->d_inode); wake_up_interruptible(&sock->wait); read_lock(&sock->sk->sk_callback_lock); if (sock->fasync_list && !test_bit(SOCK_ASYNC_WAITDATA, &sock->flags)) @@ -159,9 +164,14 @@ static void timod_wake_socket(unsigned i static void timod_queue(unsigned int fd, struct T_primsg *it) { struct sol_socket_struct *sock; + struct file *filp; SOLD("queuing primsg"); - sock = (struct sol_socket_struct *)current->files->fd[fd]->private_data; + if (!( filp = fcheck(fd))) { + SOLD("BAD FD"); + return; + } + sock = (struct sol_socket_struct *)filp->private_data; it->next = sock->pfirst; sock->pfirst = it; if (!sock->plast) @@ -173,9 +183,14 @@ static void timod_queue(unsigned int fd, static void timod_queue_end(unsigned int fd, struct T_primsg *it) { struct sol_socket_struct *sock; + struct file *filp; SOLD("queuing primsg at end"); - sock = (struct sol_socket_struct *)current->files->fd[fd]->private_data; + if (!( filp = fcheck(fd))) { + SOLD("BAD FD"); + return; + } + sock = (struct sol_socket_struct *)filp->private_data; it->next = NULL; if (sock->plast) sock->plast->next = it; @@ -353,7
+368,10 @@ int timod_putmsg(unsigned int fd, char * (int (*)(int, unsigned long *))SYS(socketcall); int (*sys_sendto)(int, void *, size_t, unsigned, struct sockaddr *, int) = (int (*)(int, void *, size_t, unsigned, struct sockaddr *, int))SYS(sendto); - filp = current->files->fd[fd]; + + if (!(filp = fcheck(fd))) + return -EBADF; + ino = filp->f_dentry->d_inode; sock = (struct sol_socket_struct *)filp->private_data; SOLD("entry"); @@ -634,7 +652,10 @@ int timod_getmsg(unsigned int fd, char * SOLD("entry"); SOLDD(("%u %p %d %p %p %d %p %d\n", fd, ctl_buf, ctl_maxlen, ctl_len, data_buf, data_maxlen, data_len, *flags_p)); - filp = current->files->fd[fd]; + + if (!(filp = fcheck(fd))) + return -EBADF; + ino = filp->f_dentry->d_inode; sock = (struct sol_socket_struct *)filp->private_data; SOLDD(("%p %p\n", sock->pfirst, sock->pfirst ? sock->pfirst->next : NULL)); @@ -850,7 +871,7 @@ asmlinkage int solaris_getmsg(unsigned i lock_kernel(); if(fd >= NR_OPEN) goto out; - filp = current->files->fd[fd]; + filp = fcheck(fd); if(!filp) goto out; ino = filp->f_dentry->d_inode; @@ -917,7 +938,7 @@ asmlinkage int solaris_putmsg(unsigned i lock_kernel(); if(fd >= NR_OPEN) goto out; - filp = current->files->fd[fd]; + filp = fcheck(fd); if(!filp) goto out; ino = filp->f_dentry->d_inode; diff -prauN linux-2.6.0-test1/arch/um/kernel/irq.c wli-2.6.0-test1-37/arch/um/kernel/irq.c --- linux-2.6.0-test1/arch/um/kernel/irq.c 2003-07-13 20:37:17.000000000 -0700 +++ wli-2.6.0-test1-37/arch/um/kernel/irq.c 2003-07-14 06:31:09.000000000 -0700 @@ -565,9 +565,9 @@ static struct proc_dir_entry * smp_affin /* These are read and written as longs, so a read won't see a partial write * even during a race. */ -static unsigned long irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = ~0UL }; +static cpumask_t irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; -#define HEX_DIGITS 8 +#define HEX_DIGITS (2*sizeof(cpumask_t)) static int irq_affinity_read_proc (char *page, char **start, off_t off, int count, int *eof, void *data) @@ -578,10 +578,10 @@ static int irq_affinity_read_proc (char } static unsigned int parse_hex_value (const char *buffer, - unsigned long count, unsigned long *ret) + unsigned long count, cpumask_t *ret) { unsigned char hexnum [HEX_DIGITS]; - unsigned long value; + cpumask_t value = CPU_MASK_NONE; int i; if (!count) @@ -595,10 +595,9 @@ static unsigned int parse_hex_value (con * Parse the first 8 characters as a hex string, any non-hex char * is end-of-string. '00e1', 'e1', '00E1', 'E1' are all the same. */ - value = 0; for (i = 0; i < count; i++) { - unsigned int c = hexnum[i]; + unsigned int k, c = hexnum[i]; switch (c) { case '0' ... '9': c -= '0'; break; @@ -607,7 +606,10 @@ static int parse_hex_value (con default: goto out; } - value = (value << 4) | c; + cpus_shift_left(value, value, 4); + for (k = 0; k < 4; ++k) + if (c & (1 << k)) + cpu_set(k, value); } out: *ret = value; @@ -618,7 +620,7 @@ static int irq_affinity_write_proc (stru unsigned long count, void *data) { int irq = (long) data, full_count = count, err; - unsigned long new_value; + cpumask_t new_value, tmp; if (!irq_desc[irq].handler->set_affinity) return -EIO; @@ -631,7 +633,8 @@ static int irq_affinity_write_proc (stru * way to make the system unusable accidentally :-) At least * one online CPU still has to be targeted.
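
The Solaris emulation fixes above stop dereferencing current->files->fd[fd] directly: fcheck() performs the bounds and liveness checks and returns NULL for a dead descriptor, which the callers turn into -EBADF or an early return. The pattern, as a sketch (use_fd() is an invented name):

	/* Sketch of the descriptor lookup pattern adopted above. */
	static int use_fd(unsigned int fd)
	{
		struct file *filp = fcheck(fd);	/* NULL if fd is not open */

		if (!filp)
			return -EBADF;
		/* ... filp->f_dentry->d_inode etc., as in timod_putmsg() ... */
		return 0;
	}
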
*/ - if (!(new_value & cpu_online_map)) + cpus_and(tmp, new_value, cpu_online_map); + if (cpus_empty(tmp)) return -EINVAL; #endif @@ -644,17 +647,27 @@ static int irq_affinity_write_proc (stru static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, int count, int *eof, void *data) { - unsigned long *mask = (unsigned long *) data; + cpumask_t tmp, *mask = (cpumask_t *) data; + int k, len = 0; + if (count < HEX_DIGITS+1) return -EINVAL; - return sprintf (page, "%08lx\n", *mask); + tmp = *mask; + for (k = 0; k < sizeof(cpumask_t)/sizeof(u16); ++k) { + int j = sprintf(page, "%04hx", (u16)cpus_coerce(tmp)); + len += j; + page += j; + cpus_shift_right(tmp, tmp, 16); + } + len += sprintf(page, "\n"); + return len; } static int prof_cpu_mask_write_proc (struct file *file, const char *buffer, unsigned long count, void *data) { - unsigned long *mask = (unsigned long *) data, full_count = count, err; - unsigned long new_value; + cpumask_t *mask = (cpumask_t *)data, new_value; + unsigned long full_count = count, err; err = parse_hex_value(buffer, count, &new_value); if (err) @@ -693,7 +706,7 @@ static void register_irq_proc (unsigned } /* Read and written as a long */ -unsigned long prof_cpu_mask = -1; +cpumask_t prof_cpu_mask = CPU_MASK_ALL; void __init init_irq_proc (void) { diff -prauN linux-2.6.0-test1/arch/um/kernel/skas/process_kern.c wli-2.6.0-test1-37/arch/um/kernel/skas/process_kern.c --- linux-2.6.0-test1/arch/um/kernel/skas/process_kern.c 2003-07-13 20:31:59.000000000 -0700 +++ wli-2.6.0-test1-37/arch/um/kernel/skas/process_kern.c 2003-07-14 06:31:09.000000000 -0700 @@ -152,7 +152,7 @@ static int start_kernel_proc(void *unuse cpu_tasks[0].pid = pid; cpu_tasks[0].task = current; #ifdef CONFIG_SMP - cpu_online_map = 1; + cpu_online_map = cpumask_of_cpu(0); #endif start_kernel(); return(0); } diff -prauN linux-2.6.0-test1/arch/um/kernel/smp.c wli-2.6.0-test1-37/arch/um/kernel/smp.c --- linux-2.6.0-test1/arch/um/kernel/smp.c 2003-07-13 20:31:57.000000000 -0700 +++ wli-2.6.0-test1-37/arch/um/kernel/smp.c 2003-07-14 06:31:09.000000000 -0700 @@ -5,9 +5,6 @@ #include "linux/config.h" -/* CPU online map, set by smp_boot_cpus */ -unsigned long cpu_online_map = 1; - #ifdef CONFIG_SMP #include "linux/sched.h" @@ -24,6 +21,9 @@ unsigned long cpu_online_map = 1; #include "irq_user.h" #include "os.h" +/* CPU online map, set by smp_boot_cpus */ +cpumask_t cpu_online_map = cpumask_of_cpu(0); + /* Per CPU bogomips and other parameters * The only piece used here is the ipi pipe, which is set before SMP is * started and never changed.
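
Every irq_affinity_write_proc() conversion in this patch validates the new mask the same way: intersect it with cpu_online_map and refuse a result that targets no online CPU, since that would leave the IRQ undeliverable. Factored out, the check is just the following (affinity_ok() is an invented name for illustration):

	/* Sketch: reject an affinity mask that targets no online CPU. */
	static int affinity_ok(cpumask_t new_value)
	{
		cpumask_t tmp;

		cpus_and(tmp, new_value, cpu_online_map);
		return !cpus_empty(tmp);
	}
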
@@ -104,8 +104,8 @@ void smp_send_stop(void) printk("done\n"); } -static unsigned long smp_commenced_mask; -static volatile unsigned long smp_callin_map = 0; +static cpumask_t smp_commenced_mask; +static cpumask_t smp_callin_map = CPU_MASK_NONE; static int idle_proc(void *cpup) { @@ -120,15 +120,15 @@ static int idle_proc(void *cpup) current->thread.mode.tt.extern_pid); wmb(); - if (test_and_set_bit(cpu, &smp_callin_map)) { + if (cpu_test_and_set(cpu, smp_callin_map)) { printk("huh, CPU#%d already present??\n", cpu); BUG(); } - while (!test_bit(cpu, &smp_commenced_mask)) + while (!cpu_isset(cpu, smp_commenced_mask)) cpu_relax(); - set_bit(cpu, &cpu_online_map); + cpu_set(cpu, cpu_online_map); default_idle(); return(0); } @@ -159,8 +159,8 @@ void smp_prepare_cpus(unsigned int maxcp unsigned long waittime; int err, cpu; - set_bit(0, &cpu_online_map); - set_bit(0, &smp_callin_map); + cpu_set(0, cpu_online_map); + cpu_set(0, smp_callin_map); err = os_pipe(cpu_data[0].ipi_pipe, 1, 1); if(err) panic("CPU#0 failed to create IPI pipe, errno = %d", -err); @@ -177,10 +177,10 @@ void smp_prepare_cpus(unsigned int maxcp unhash_process(idle); waittime = 200000000; - while (waittime-- && !test_bit(cpu, &smp_callin_map)) + while (waittime-- && !cpu_isset(cpu, smp_callin_map)) cpu_relax(); - if (test_bit(cpu, &smp_callin_map)) + if (cpu_isset(cpu, smp_callin_map)) printk("done\n"); else printk("failed\n"); } @@ -188,13 +188,13 @@ void smp_prepare_cpus(unsigned int maxcp void smp_prepare_boot_cpu(void) { - set_bit(smp_processor_id(), &cpu_online_map); + cpu_set(smp_processor_id(), cpu_online_map); } int __cpu_up(unsigned int cpu) { - set_bit(cpu, &smp_commenced_mask); - while (!test_bit(cpu, &cpu_online_map)) + cpu_set(cpu, smp_commenced_mask); + while (!cpu_isset(cpu, cpu_online_map)) mb(); return(0); } @@ -271,7 +271,7 @@ int smp_call_function(void (*_func)(void for (i=0;i<NR_CPUS;i++) if ((i != current->thread_info->cpu) && - test_bit(i, &cpu_online_map)) + cpu_isset(i, cpu_online_map)) write(cpu_data[i].ipi_pipe[1], "C", 1); while (atomic_read(&scf_started) != cpus) diff -prauN linux-2.6.0-test1/arch/um/kernel/tt/process_kern.c wli-2.6.0-test1-37/arch/um/kernel/tt/process_kern.c --- linux-2.6.0-test1/arch/um/kernel/tt/process_kern.c 2003-07-13 20:37:22.000000000 -0700 +++ wli-2.6.0-test1-37/arch/um/kernel/tt/process_kern.c 2003-07-14 06:31:09.000000000 -0700 @@ -419,7 +419,7 @@ static int start_kernel_proc(void *unuse cpu_tasks[0].pid = pid; cpu_tasks[0].task = current; #ifdef CONFIG_SMP - cpu_online_map = 1; + cpu_online_map = cpumask_of_cpu(0); #endif if(debug) os_stop_process(pid); start_kernel(); diff -prauN linux-2.6.0-test1/arch/um/kernel/um_arch.c wli-2.6.0-test1-37/arch/um/kernel/um_arch.c --- linux-2.6.0-test1/arch/um/kernel/um_arch.c 2003-07-13 20:37:14.000000000 -0700 +++ wli-2.6.0-test1-37/arch/um/kernel/um_arch.c 2003-07-14 06:31:09.000000000 -0700 @@ -57,7 +57,7 @@ static int show_cpuinfo(struct seq_file index = (struct cpuinfo_um *)v - cpu_data; #ifdef CONFIG_SMP - if (!(cpu_online_map & (1 << index))) + if (!cpu_online(index)) return 0; #endif diff -prauN linux-2.6.0-test1/arch/x86_64/ia32/ia32_binfmt.c wli-2.6.0-test1-37/arch/x86_64/ia32/ia32_binfmt.c --- linux-2.6.0-test1/arch/x86_64/ia32/ia32_binfmt.c 2003-07-13 20:29:28.000000000 -0700 +++ wli-2.6.0-test1-37/arch/x86_64/ia32/ia32_binfmt.c 2003-07-14 07:33:22.000000000 -0700 @@ -367,7 +367,8 @@ int setup_arg_pages(struct linux_binprm struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current,page,stack_base,PAGE_COPY_EXEC); +
put_dirty_page(current, mpnt, page, + stack_base, PAGE_COPY_EXEC); } stack_base += PAGE_SIZE; } diff -prauN linux-2.6.0-test1/arch/x86_64/ia32/syscall32.c wli-2.6.0-test1-37/arch/x86_64/ia32/syscall32.c --- linux-2.6.0-test1/arch/x86_64/ia32/syscall32.c 2003-07-13 20:35:16.000000000 -0700 +++ wli-2.6.0-test1-37/arch/x86_64/ia32/syscall32.c 2003-07-14 06:49:00.000000000 -0700 @@ -29,12 +29,15 @@ char *syscall32_page; and let it be handled by generic VM */ int map_syscall32(struct mm_struct *mm, unsigned long address) { + pgd_t *pgd; + pmd_t *pmd; pte_t *pte; int err = 0; down_read(&mm->mmap_sem); spin_lock(&mm->page_table_lock); - pmd_t *pmd = pmd_alloc(mm, pgd_offset(mm, address), address); - if (pmd && (pte = pte_alloc_map(mm, pmd, address)) != NULL) { + pgd = pgd_offset(mm, address); + pmd = pmd_alloc_map(mm, pgd, address); + if (pmd && (pte = pte_alloc_map(mm, pgd, &pmd, address)) != NULL) { if (pte_none(*pte)) { set_pte(pte, mk_pte(virt_to_page(syscall32_page), diff -prauN linux-2.6.0-test1/arch/x86_64/kernel/apic.c wli-2.6.0-test1-37/arch/x86_64/kernel/apic.c --- linux-2.6.0-test1/arch/x86_64/kernel/apic.c 2003-07-13 20:30:48.000000000 -0700 +++ wli-2.6.0-test1-37/arch/x86_64/kernel/apic.c 2003-07-14 06:31:09.000000000 -0700 @@ -298,8 +298,8 @@ void __init setup_local_APIC (void) * Double-check whether this APIC is really registered. * This is meaningless in clustered apic mode, so we skip it. */ - if (!clustered_apic_mode && - !test_bit(GET_APIC_ID(apic_read(APIC_ID)), &phys_cpu_present_map)) + if (!clustered_apic_mode && + !cpu_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map)) BUG(); /* @@ -997,7 +997,7 @@ int __init APIC_init_uniprocessor (void) connect_bsp_APIC(); - phys_cpu_present_map = 1; + phys_cpu_present_map = cpumask_of_cpu(0); apic_write_around(APIC_ID, boot_cpu_id); setup_local_APIC(); diff -prauN linux-2.6.0-test1/arch/x86_64/kernel/io_apic.c wli-2.6.0-test1-37/arch/x86_64/kernel/io_apic.c --- linux-2.6.0-test1/arch/x86_64/kernel/io_apic.c 2003-07-13 20:34:31.000000000 -0700 +++ wli-2.6.0-test1-37/arch/x86_64/kernel/io_apic.c 2003-07-14 06:31:09.000000000 -0700 @@ -1014,7 +1014,7 @@ void disable_IO_APIC(void) static void __init setup_ioapic_ids_from_mpc (void) { union IO_APIC_reg_00 reg_00; - unsigned long phys_id_present_map = phys_cpu_present_map; + cpumask_t phys_id_present_map = phys_cpu_present_map; int apic; int i; unsigned char old_id; @@ -1047,22 +1047,22 @@ static void __init setup_ioapic_ids_from * system must have a unique ID or we get lots of nice * 'stuck on smp_invalidate_needed IPI wait' messages. */ - if (phys_id_present_map & (1 << mp_ioapics[apic].mpc_apicid)) { + if (cpu_isset(mp_ioapics[apic].mpc_apicid, phys_id_present_map)) { printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", apic, mp_ioapics[apic].mpc_apicid); for (i = 0; i < 0xf; i++) - if (!(phys_id_present_map & (1 << i))) + if (!cpu_isset(i, phys_id_present_map)) break; if (i >= 0xf) panic("Max APIC ID exceeded!\n"); printk(KERN_ERR "... fixing up to %d. 
(tell your hw vendor)\n", i); - phys_id_present_map |= 1 << i; + cpu_set(i, phys_id_present_map); mp_ioapics[apic].mpc_apicid = i; } else { printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid); - phys_id_present_map |= 1 << mp_ioapics[apic].mpc_apicid; + cpu_set(mp_ioapics[apic].mpc_apicid, phys_id_present_map); } diff -prauN linux-2.6.0-test1/arch/x86_64/kernel/irq.c wli-2.6.0-test1-37/arch/x86_64/kernel/irq.c --- linux-2.6.0-test1/arch/x86_64/kernel/irq.c 2003-07-13 20:33:46.000000000 -0700 +++ wli-2.6.0-test1-37/arch/x86_64/kernel/irq.c 2003-07-14 06:31:09.000000000 -0700 @@ -792,13 +792,13 @@ int setup_irq(unsigned int irq, struct i static struct proc_dir_entry * root_irq_dir; static struct proc_dir_entry * irq_dir [NR_IRQS]; -#define HEX_DIGITS 8 +#define HEX_DIGITS (2*sizeof(cpumask_t)) static unsigned int parse_hex_value (const char *buffer, - unsigned long count, unsigned long *ret) + unsigned long count, cpumask_t *ret) { unsigned char hexnum [HEX_DIGITS]; - unsigned long value; + cpumask_t value = CPU_MASK_NONE; unsigned i; if (!count) @@ -812,10 +812,9 @@ static unsigned int parse_hex_value (con * Parse the first 8 characters as a hex string, any non-hex char * is end-of-string. '00e1', 'e1', '00E1', 'E1' are all the same. */ - value = 0; for (i = 0; i < count; i++) { - unsigned int c = hexnum[i]; + unsigned int k, c = hexnum[i]; switch (c) { case '0' ... '9': c -= '0'; break; @@ -824,7 +823,10 @@ static unsigned int parse_hex_value (con default: goto out; } - value = (value << 4) | c; + cpus_shift_left(value, value, 4); + for (k = 0; k < 4; ++k) + if (c & (1 << k)) + cpu_set(k, value); } out: *ret = value; @@ -835,20 +837,31 @@ out: static struct proc_dir_entry * smp_affinity_entry [NR_IRQS]; -static unsigned long irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = ~0UL }; +static cpumask_t irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; static int irq_affinity_read_proc (char *page, char **start, off_t off, int count, int *eof, void *data) { + int k, len = 0; + cpumask_t tmp = irq_affinity[(long)data]; + if (count < HEX_DIGITS+1) return -EINVAL; - return sprintf (page, "%08lx\n", irq_affinity[(long)data]); + + for (k = 0; k < sizeof(cpumask_t)/sizeof(u16); ++k) { + int j = sprintf(page, "%04hx", (u16)cpus_coerce(tmp)); + len += j; + page += j; + cpus_shift_right(tmp, tmp, 16); + } + len += sprintf(page, "\n"); + return len; } static int irq_affinity_write_proc (struct file *file, const char *buffer, unsigned long count, void *data) { int irq = (long) data, full_count = count, err; - unsigned long new_value; + cpumask_t tmp, new_value; if (!irq_desc[irq].handler->set_affinity) return -EIO; @@ -860,7 +873,8 @@ static int irq_affinity_write_proc (stru * way to make the system unusable accidentally :-) At least * one online CPU still has to be targeted. */
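Spelled out, the validity test that comment describes becomes the following under cpumask_t (a self-contained restatement of the two lines in the next hunk, for illustration only; it is not code from the patch):

	/* Sketch: an affinity mask must target at least one online CPU. */
	static int affinity_targets_online_cpu(cpumask_t new_value)
	{
		cpumask_t tmp;

		cpus_and(tmp, new_value, cpu_online_map);
		return !cpus_empty(tmp);
	}

The old open-coded test (new_value & cpu_online_map) cannot survive the conversion: once cpumask_t is wider than one machine word there is no scalar value to test, so every mask predicate has to go through accessors such as cpus_empty().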
- if (!(new_value & cpu_online_map)) + cpus_and(tmp, new_value, cpu_online_map); + if (cpus_empty(tmp)) return -EINVAL; irq_affinity[irq] = new_value; @@ -874,17 +888,26 @@ static int irq_affinity_write_proc (stru static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, int count, int *eof, void *data) { - unsigned long *mask = (unsigned long *) data; + cpumask_t tmp, *mask = (cpumask_t *) data; + int k, len = 0; if (count < HEX_DIGITS+1) return -EINVAL; - return sprintf (page, "%08lx\n", *mask); + + tmp = *mask; + for (k = 0; k < sizeof(cpumask_t)/sizeof(u16); ++k) { + int j = sprintf(page, "%04hx", (u16)cpus_coerce(tmp)); + len += j; + page += j; + cpus_shift_right(tmp, tmp, 16); + } + len += sprintf(page, "\n"); + return len; } static int prof_cpu_mask_write_proc (struct file *file, const char *buffer, unsigned long count, void *data) { - unsigned long *mask = (unsigned long *) data, full_count = count, err; - unsigned long new_value; + unsigned long full_count = count, err; + cpumask_t new_value, *mask = (cpumask_t *)data; err = parse_hex_value(buffer, count, &new_value); if (err) diff -prauN linux-2.6.0-test1/arch/x86_64/kernel/mpparse.c wli-2.6.0-test1-37/arch/x86_64/kernel/mpparse.c --- linux-2.6.0-test1/arch/x86_64/kernel/mpparse.c 2003-07-13 20:32:39.000000000 -0700 +++ wli-2.6.0-test1-37/arch/x86_64/kernel/mpparse.c 2003-07-14 06:31:09.000000000 -0700 @@ -65,7 +65,7 @@ unsigned int boot_cpu_id = -1U; static unsigned int num_processors = 0; /* Bitmask of physically existing CPUs */ -unsigned long phys_cpu_present_map = 0; +cpumask_t phys_cpu_present_map = CPU_MASK_NONE; /* ACPI MADT entry parsing functions */ #ifdef CONFIG_ACPI_BOOT @@ -124,7 +124,7 @@ static void __init MP_processor_info (st } ver = m->mpc_apicver; - phys_cpu_present_map |= 1 << m->mpc_apicid; + cpu_set(m->mpc_apicid, phys_cpu_present_map); /* * Validate version */ diff -prauN linux-2.6.0-test1/arch/x86_64/kernel/msr.c wli-2.6.0-test1-37/arch/x86_64/kernel/msr.c --- linux-2.6.0-test1/arch/x86_64/kernel/msr.c 2003-07-13 20:30:50.000000000 -0700 +++ wli-2.6.0-test1-37/arch/x86_64/kernel/msr.c 2003-07-14 06:31:10.000000000 -0700 @@ -242,7 +242,7 @@ static int msr_open(struct inode *inode, int cpu = minor(file->f_dentry->d_inode->i_rdev); struct cpuinfo_x86 *c = &(cpu_data)[cpu]; - if ( !(cpu_online_map & (1UL << cpu)) ) + if (!cpu_online(cpu)) return -ENXIO; /* No such CPU */ if ( !cpu_has(c, X86_FEATURE_MSR) ) return -EIO; /* MSR not supported */ diff -prauN linux-2.6.0-test1/arch/x86_64/kernel/reboot.c wli-2.6.0-test1-37/arch/x86_64/kernel/reboot.c --- linux-2.6.0-test1/arch/x86_64/kernel/reboot.c 2003-07-13 20:37:22.000000000 -0700 +++ wli-2.6.0-test1-37/arch/x86_64/kernel/reboot.c 2003-07-14 06:31:10.000000000 -0700 @@ -110,7 +110,7 @@ static void smp_halt(void) } /* Wait for all other CPUs to have run smp_stop_cpu */ - while (cpu_online_map) + while (!cpus_empty(cpu_online_map)) rep_nop(); } #endif diff -prauN linux-2.6.0-test1/arch/x86_64/kernel/setup.c wli-2.6.0-test1-37/arch/x86_64/kernel/setup.c --- linux-2.6.0-test1/arch/x86_64/kernel/setup.c 2003-07-13 20:29:29.000000000 -0700 +++ wli-2.6.0-test1-37/arch/x86_64/kernel/setup.c 2003-07-14 06:31:10.000000000 -0700 @@ -643,7 +643,7 @@ static int show_cpuinfo(struct seq_file #ifdef CONFIG_SMP - if (!(cpu_online_map & (1<<(c-cpu_data)))) + if (!cpu_online(c-cpu_data)) return 0; #endif diff -prauN linux-2.6.0-test1/arch/x86_64/kernel/smp.c wli-2.6.0-test1-37/arch/x86_64/kernel/smp.c --- linux-2.6.0-test1/arch/x86_64/kernel/smp.c 2003-07-13
20:31:58.000000000 -0700 +++ wli-2.6.0-test1-37/arch/x86_64/kernel/smp.c 2003-07-14 06:31:10.000000000 -0700 @@ -230,9 +230,10 @@ out: put_cpu_no_resched(); } -static void flush_tlb_others (unsigned long cpumask, struct mm_struct *mm, +static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, unsigned long va) { + cpumask_t tmp; /* * A couple of (to be removed) sanity checks: * @@ -240,12 +241,10 @@ static void flush_tlb_others (unsigned l * - current CPU must not be in mask * - mask must exist :) */ - if (!cpumask) - BUG(); - if ((cpumask & cpu_online_map) != cpumask) - BUG(); - if (cpumask & (1 << smp_processor_id())) - BUG(); + BUG_ON(cpus_empty(cpumask)); + cpus_and(tmp, cpumask, cpu_online_map); + BUG_ON(!cpus_equal(tmp, cpumask)); + BUG_ON(cpu_isset(smp_processor_id(), cpumask)); if (!mm) BUG(); @@ -430,7 +429,7 @@ void smp_stop_cpu(void) /* * Remove this CPU: */ - clear_bit(smp_processor_id(), &cpu_online_map); + cpu_clear(smp_processor_id(), cpu_online_map); local_irq_disable(); disable_local_APIC(); local_irq_enable(); @@ -491,8 +490,8 @@ int slow_smp_processor_id(void) unsigned long sp = (unsigned long)&stack_location; int offset = 0, cpu; - for (offset = 0; (cpu_online_map >> offset); offset = cpu + 1) { - cpu = ffz(~(cpu_online_map >> offset)); + for (offset = 0; next_cpu(cpu_online_map, offset) < NR_CPUS; offset = cpu + 1) { + cpu = next_cpu(cpu_online_map, offset); if (sp >= (u64)cpu_pda[cpu].irqstackptr - IRQSTACKSIZE && sp <= (u64)cpu_pda[cpu].irqstackptr) diff -prauN linux-2.6.0-test1/arch/x86_64/kernel/smpboot.c wli-2.6.0-test1-37/arch/x86_64/kernel/smpboot.c --- linux-2.6.0-test1/arch/x86_64/kernel/smpboot.c 2003-07-13 20:39:31.000000000 -0700 +++ wli-2.6.0-test1-37/arch/x86_64/kernel/smpboot.c 2003-07-14 06:31:10.000000000 -0700 @@ -54,11 +54,11 @@ #include /* Bitmask of currently online CPUs */ -unsigned long cpu_online_map = 1; +cpumask_t cpu_online_map = cpumask_of_cpu(0); -static volatile unsigned long cpu_callin_map; -volatile unsigned long cpu_callout_map; -static unsigned long smp_commenced_mask; +static cpumask_t cpu_callin_map; +cpumask_t cpu_callout_map; +static cpumask_t smp_commenced_mask; /* Per CPU bogomips and other parameters */ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; @@ -174,7 +174,7 @@ static void __init synchronize_tsc_bp (v sum = 0; for (i = 0; i < NR_CPUS; i++) { - if (test_bit(i, &cpu_callout_map)) { + if (cpu_isset(i, cpu_callout_map)) { t0 = tsc_values[i]; sum += t0; } @@ -183,7 +183,7 @@ static void __init synchronize_tsc_bp (v sum = 0; for (i = 0; i < NR_CPUS; i++) { - if (!test_bit(i, &cpu_callout_map)) + if (!cpu_isset(i, cpu_callout_map)) continue; delta = tsc_values[i] - avg; @@ -258,7 +258,7 @@ void __init smp_callin(void) */ phys_id = GET_APIC_ID(apic_read(APIC_ID)); cpuid = smp_processor_id(); - if (test_and_set_bit(cpuid, &cpu_callin_map)) { + if (cpu_test_and_set(cpuid, cpu_callin_map)) { panic("smp_callin: phys CPU#%d, CPU#%d already present??\n", phys_id, cpuid); } @@ -280,7 +280,7 @@ void __init smp_callin(void) /* * Has the boot CPU finished it's STARTUP sequence? */ - if (test_bit(cpuid, &cpu_callout_map)) + if (cpu_isset(cpuid, cpu_callout_map)) break; rep_nop(); } @@ -320,7 +320,7 @@ void __init smp_callin(void) /* * Allow the master to continue. 
*/ - set_bit(cpuid, &cpu_callin_map); + cpu_set(cpuid, cpu_callin_map); /* * Synchronize the TSC with the BP @@ -348,7 +348,7 @@ void __init start_secondary(void) barrier(); Dprintk("cpu %d: waiting for commence\n", smp_processor_id()); - while (!test_bit(smp_processor_id(), &smp_commenced_mask)) + while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) rep_nop(); Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); @@ -372,7 +372,7 @@ void __init start_secondary(void) local_flush_tlb(); Dprintk("cpu %d eSetting cpu_online_map\n", smp_processor_id()); - set_bit(smp_processor_id(), &cpu_online_map); + cpu_set(smp_processor_id(), cpu_online_map); wmb(); cpu_idle(); @@ -630,19 +630,19 @@ static void __init do_boot_cpu (int apic * allow APs to start initializing. */ Dprintk("Before Callout %d.\n", cpu); - set_bit(cpu, &cpu_callout_map); + cpu_set(cpu, cpu_callout_map); Dprintk("After Callout %d.\n", cpu); /* * Wait 5s total for a response */ for (timeout = 0; timeout < 50000; timeout++) { - if (test_bit(cpu, &cpu_callin_map)) + if (cpu_isset(cpu, cpu_callin_map)) break; /* It has booted */ udelay(100); } - if (test_bit(cpu, &cpu_callin_map)) { + if (cpu_isset(cpu, cpu_callin_map)) { /* number CPUs logically, starting from 1 (BSP is 0) */ Dprintk("OK.\n"); printk(KERN_INFO "CPU%d: ", cpu); @@ -663,7 +663,7 @@ } } if (boot_error) { - clear_bit(cpu, &cpu_callout_map); /* was set here (do_boot_cpu()) */ + cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ cpucount--; } @@ -734,10 +734,10 @@ static void __init smp_boot_cpus(unsigne current_thread_info()->cpu = 0; smp_tune_scheduling(); - if (!test_bit(hard_smp_processor_id(), &phys_cpu_present_map)) { + if (!cpu_isset(hard_smp_processor_id(), phys_cpu_present_map)) { printk("weird, boot CPU (#%d) not listed by the BIOS.\n", hard_smp_processor_id()); - phys_cpu_present_map |= (1 << hard_smp_processor_id()); + cpu_set(hard_smp_processor_id(), phys_cpu_present_map); } /* @@ -747,8 +747,8 @@ static void __init smp_boot_cpus(unsigne if (!smp_found_config) { printk(KERN_NOTICE "SMP motherboard not detected.\n"); io_apic_irqs = 0; - cpu_online_map = phys_cpu_present_map = 1; - phys_cpu_present_map = 1; + cpu_online_map = cpumask_of_cpu(0); + phys_cpu_present_map = cpumask_of_cpu(0); if (APIC_init_uniprocessor()) printk(KERN_NOTICE "Local APIC not detected." " Using dummy APIC emulation.\n"); @@ -759,10 +759,10 @@ static void __init smp_boot_cpus(unsigne * Should not be necessary because the MP table should list the boot * CPU too, but we do it for the sake of robustness anyway. */ - if (!test_bit(boot_cpu_id, &phys_cpu_present_map)) { + if (!cpu_isset(boot_cpu_id, phys_cpu_present_map)) { printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n", boot_cpu_id); - phys_cpu_present_map |= (1 << hard_smp_processor_id()); + cpu_set(hard_smp_processor_id(), phys_cpu_present_map); } /* @@ -773,8 +773,8 @@ static void __init smp_boot_cpus(unsigne boot_cpu_id); printk(KERN_ERR "... forcing use of dummy APIC emulation.
(tell your hw vendor)\n"); io_apic_irqs = 0; - cpu_online_map = phys_cpu_present_map = 1; - phys_cpu_present_map = 1; + cpu_online_map = cpumask_of_cpu(0); + phys_cpu_present_map = cpumask_of_cpu(0); disable_apic = 1; goto smp_done; } @@ -788,8 +788,8 @@ static void __init smp_boot_cpus(unsigne smp_found_config = 0; printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n"); io_apic_irqs = 0; - cpu_online_map = phys_cpu_present_map = 1; - phys_cpu_present_map = 1; + cpu_online_map = cpumask_of_cpu(0); + phys_cpu_present_map = cpumask_of_cpu(0); disable_apic = 1; goto smp_done; } @@ -812,7 +812,7 @@ static void __init smp_boot_cpus(unsigne if (apicid == boot_cpu_id) continue; - if (!(phys_cpu_present_map & (1 << apicid))) + if (!cpu_isset(apicid, phys_cpu_present_map)) continue; if ((max_cpus >= 0) && (max_cpus <= cpucount+1)) continue; @@ -848,7 +848,7 @@ static void __init smp_boot_cpus(unsigne } else { unsigned long bogosum = 0; for (cpu = 0; cpu < NR_CPUS; cpu++) - if (cpu_callout_map & (1<<cpu)) + if (cpu_isset(cpu, cpu_callout_map)) bogosum += cpu_data[cpu].loops_per_jiffy; diff -prauN linux-2.6.0-test1/drivers/base/node.c wli-2.6.0-test1-37/drivers/base/node.c --- linux-2.6.0-test1/drivers/base/node.c +++ wli-2.6.0-test1-37/drivers/base/node.c @@ #include #include +#include #include static struct sysdev_class node_class = { @@ -17,7 +18,17 @@ static ssize_t node_read_cpumap(struct sys_device * dev, char * buf) { struct node *node_dev = to_node(dev); - return sprintf(buf,"%lx\n",node_dev->cpumap); + cpumask_t tmp = node_dev->cpumap; + int k, len = 0; + + for (k = 0; k < sizeof(cpumask_t)/sizeof(u16); ++k) { + int j = sprintf(buf, "%04hx", (u16)cpus_coerce(tmp)); + len += j; + buf += j; + cpus_shift_right(tmp, tmp, 16); + } + len += sprintf(buf, "\n"); + return len; } static SYSDEV_ATTR(cpumap,S_IRUGO,node_read_cpumap,NULL); diff -prauN linux-2.6.0-test1/drivers/char/drm/drm_memory.h wli-2.6.0-test1-37/drivers/char/drm/drm_memory.h --- linux-2.6.0-test1/drivers/char/drm/drm_memory.h 2003-07-13 20:31:20.000000000 -0700 +++ wli-2.6.0-test1-37/drivers/char/drm/drm_memory.h 2003-07-14 06:49:00.000000000 -0700 @@ -128,7 +128,7 @@ static inline unsigned long drm_follow_page (void *vaddr) { pgd_t *pgd = pgd_offset_k((unsigned long) vaddr); - pmd_t *pmd = pmd_offset(pgd, (unsigned long) vaddr); + pmd_t *pmd = pmd_offset_kernel(pgd, (unsigned long)vaddr); pte_t *ptep = pte_offset_kernel(pmd, (unsigned long) vaddr); return pte_pfn(*ptep) << PAGE_SHIFT; } diff -prauN linux-2.6.0-test1/drivers/char/tty_io.c wli-2.6.0-test1-37/drivers/char/tty_io.c --- linux-2.6.0-test1/drivers/char/tty_io.c 2003-07-13 20:34:49.000000000 -0700 +++ wli-2.6.0-test1-37/drivers/char/tty_io.c 2003-07-14 10:20:40.000000000 -0700 @@ -159,11 +159,22 @@ static struct tty_struct *alloc_tty_stru return tty; } -static inline void free_tty_struct(struct tty_struct *tty) +static void __free_tty_struct(void *tty) { kfree(tty); } +/* + * When invalidating file->f_container we need to wait for a + * grace period to be sure no dangling reference occurs. Until + * the end of the grace period there will be readers who will + * see file->f_container pointing to our tty_struct.
+ */ +static inline void free_tty_struct(struct tty_struct *tty) +{ + call_rcu(&tty->rcu, __free_tty_struct, tty); +} + #define TTY_NUMBER(tty) ((tty)->index + (tty)->driver->name_base) char *tty_name(struct tty_struct *tty, char *buf) @@ -202,13 +213,13 @@ static int check_tty_count(struct tty_st { #ifdef CHECK_TTY_COUNT struct list_head *p; - int count = 0; + int cpu, count = 0; - file_list_lock(); - list_for_each(p, &tty->tty_files) { - count++; - } - file_list_unlock(); + file_list_lock_all(tty->tty_file_lists); + for (cpu = 0; cpu < NR_CPUS; ++cpu) + list_for_each(p, &tty->tty_file_lists[cpu].list) + count++; + file_list_unlock_all(tty->tty_file_lists); if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_SLAVE && tty->link && tty->link->count) @@ -406,7 +417,7 @@ void do_tty_hangup(void *data) struct file *filp, *f = NULL; struct task_struct *p; struct pid *pid; - int closecount = 0, n; + int closecount = 0, n, cpu; if (!tty) return; @@ -424,20 +435,22 @@ void do_tty_hangup(void *data) fput(f); check_tty_count(tty, "do_tty_hangup"); - file_list_lock(); - list_for_each_entry(filp, &tty->tty_files, f_list) { - if (IS_CONSOLE_DEV(filp->f_dentry->d_inode->i_rdev) || - IS_SYSCONS_DEV(filp->f_dentry->d_inode->i_rdev)) { - cons_filp = filp; - continue; + file_list_lock_all(tty->tty_file_lists); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + list_for_each_entry(filp, &tty->tty_file_lists[cpu].list, f_list) { + if (IS_CONSOLE_DEV(filp->f_dentry->d_inode->i_rdev) || + IS_SYSCONS_DEV(filp->f_dentry->d_inode->i_rdev)) { + cons_filp = filp; + continue; + } + if (filp->f_op != &tty_fops) + continue; + closecount++; + tty_fasync(-1, filp, 0); /* can't block */ + filp->f_op = &hung_up_tty_fops; } - if (filp->f_op != &tty_fops) - continue; - closecount++; - tty_fasync(-1, filp, 0); /* can't block */ - filp->f_op = &hung_up_tty_fops; } - file_list_unlock(); + file_list_unlock_all(tty->tty_file_lists); /* FIXME! What are the locking issues here? This may me overdoing things.. * this question is especially important now that we've removed the irqlock. 
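The grace-period comment above is the crux of the f_container scheme: release_dev() and __do_SAK() below walk file lists under rcu_read_lock() rather than a spinlock, so a tty_struct may still be referenced by readers after its last file is gone and may only be kfree()d once a grace period has elapsed. Reduced to its essentials the idiom is (an illustrative sketch, not from the patch; struct foo is hypothetical, and the three-argument call_rcu() is the variant this kernel generation provides):

	struct foo {
		struct rcu_head rcu;
		/* ... payload ... */
	};

	static void __free_foo(void *p)
	{
		kfree(p);	/* runs only after pre-existing readers finish */
	}

	static void free_foo(struct foo *f)
	{
		call_rcu(&f->rcu, __free_foo, f);
	}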
*/ @@ -1008,6 +1021,7 @@ static void release_mem(struct tty_struc { struct tty_struct *o_tty; struct termios *tp; + int cpu; if ((o_tty = tty->link) != NULL) { o_tty->driver->ttys[idx] = NULL; @@ -1018,9 +1032,15 @@ static void release_mem(struct tty_struc } o_tty->magic = 0; o_tty->driver->refcount--; - file_list_lock(); - list_del(&o_tty->tty_files); - file_list_unlock(); + file_list_lock_all(o_tty->tty_file_lists); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + struct file *file, *save; + list_for_each_entry_safe(file, save, &o_tty->tty_file_lists[cpu].list, f_list) { + list_del_init(&file->f_list); + file->f_container = NULL; + } + } + file_list_unlock_all(o_tty->tty_file_lists); free_tty_struct(o_tty); } @@ -1032,9 +1052,15 @@ static void release_mem(struct tty_struc } tty->magic = 0; tty->driver->refcount--; - file_list_lock(); - list_del(&tty->tty_files); - file_list_unlock(); + file_list_lock_all(tty->tty_file_lists); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + struct file *file, *save; + list_for_each_entry_safe(file, save, &tty->tty_file_lists[cpu].list, f_list) { + list_del_init(&file->f_list); + file->f_container = NULL; + } + } + file_list_unlock_all(tty->tty_file_lists); module_put(tty->driver->owner); free_tty_struct(tty); } @@ -1053,6 +1079,7 @@ static void release_dev(struct file * fi int pty_master, tty_closing, o_tty_closing, do_sleep; int idx; char buf[64]; + struct file_list *container; tty = (struct tty_struct *)filp->private_data; if (tty_paranoia_check(tty, filp->f_dentry->d_inode->i_rdev, "release_dev")) @@ -1207,7 +1234,16 @@ static void release_dev(struct file * fi * - do_tty_hangup no longer sees this file descriptor as * something that needs to be handled for hangups. */ - file_kill(filp); + rcu_read_lock(); + smp_read_barrier_depends(); + container = filp->f_container; + if (container) { + spin_lock(&container->lock); + list_del_init(&filp->f_list); + filp->f_container = NULL; + spin_unlock(&container->lock); + } + rcu_read_unlock(); filp->private_data = NULL; /* @@ -1302,9 +1338,10 @@ static int tty_open(struct inode * inode struct tty_struct *tty; int noctty, retval; struct tty_driver *driver; - int index; + int cpu, index; kdev_t device; unsigned short saved_flags; + struct file_list *container; saved_flags = filp->f_flags; retry_open: @@ -1372,7 +1409,19 @@ got_driver: } filp->private_data = tty; - file_move(filp, &tty->tty_files); + cpu = get_cpu(); + smp_read_barrier_depends(); + container = filp->f_container; + if (container) { + spin_lock(&container->lock); + list_del(&filp->f_list); + spin_unlock(&container->lock); + } + container = filp->f_container = &tty->tty_file_lists[cpu]; + spin_lock(&container->lock); + list_add(&filp->f_list, &container->list); + spin_unlock(&container->lock); + put_cpu(); check_tty_count(tty, "tty_open"); if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER) @@ -1911,7 +1960,7 @@ static void __do_SAK(void *arg) } task_lock(p); if (p->files) { - spin_lock(&p->files->file_lock); + rcu_read_lock(); for (i=0; i < p->files->max_fds; i++) { filp = fcheck_files(p->files, i); if (filp && (filp->f_op == &tty_fops) && @@ -1923,7 +1972,7 @@ static void __do_SAK(void *arg) break; } } - spin_unlock(&p->files->file_lock); + rcu_read_unlock(); } task_unlock(p); } @@ -2070,7 +2119,7 @@ static void initialize_tty_struct(struct sema_init(&tty->atomic_read, 1); sema_init(&tty->atomic_write, 1); spin_lock_init(&tty->read_lock); - INIT_LIST_HEAD(&tty->tty_files); + file_list_init(tty->tty_file_lists); 
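The overall effect of these tty_io.c changes is to retire the global file_list_lock for tty files: each tty now carries NR_CPUS spinlocked lists (the patch's struct file_list, with ->lock and ->list members), so open and release of unrelated ttys no longer contend on a single lock. The price is that whole-list operations must take every per-CPU lock and walk every list, as check_tty_count(), do_tty_hangup() and release_mem() now do; schematically (an illustrative helper, not code from the patch):

	static int tty_file_count(struct tty_struct *tty)
	{
		struct list_head *p;
		int cpu, n = 0;

		file_list_lock_all(tty->tty_file_lists);
		for (cpu = 0; cpu < NR_CPUS; ++cpu)
			list_for_each(p, &tty->tty_file_lists[cpu].list)
				n++;
		file_list_unlock_all(tty->tty_file_lists);
		return n;
	}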
INIT_WORK(&tty->SAK_work, NULL, NULL); } diff -prauN linux-2.6.0-test1/drivers/input/input.c wli-2.6.0-test1-37/drivers/input/input.c --- linux-2.6.0-test1/drivers/input/input.c 2003-07-13 20:34:40.000000000 -0700 +++ wli-2.6.0-test1-37/drivers/input/input.c 2003-07-14 10:07:15.000000000 -0700 @@ -346,7 +346,7 @@ static void input_call_hotplug(char *ver printk(KERN_ERR "input.c: calling hotplug from interrupt\n"); return; } - if (!current->fs->root) { + if (!current->fs->dirs->root) { printk(KERN_WARNING "input.c: calling hotplug without valid filesystem\n"); return; } diff -prauN linux-2.6.0-test1/drivers/s390/char/sclp.c wli-2.6.0-test1-37/drivers/s390/char/sclp.c --- linux-2.6.0-test1/drivers/s390/char/sclp.c 2003-07-13 20:35:16.000000000 -0700 +++ wli-2.6.0-test1-37/drivers/s390/char/sclp.c 2003-07-14 06:31:10.000000000 -0700 @@ -468,17 +468,17 @@ static struct sclp_register sclp_state_c * SCLP quiesce event handler */ #ifdef CONFIG_SMP -static volatile unsigned long cpu_quiesce_map; +static cpumask_t cpu_quiesce_map; static void do_load_quiesce_psw(void * __unused) { psw_t quiesce_psw; - clear_bit(smp_processor_id(), &cpu_quiesce_map); + cpu_clear(smp_processor_id(), cpu_quiesce_map); if (smp_processor_id() == 0) { /* Wait for all other cpus to enter do_load_quiesce_psw */ - while (cpu_quiesce_map != 0); + while (!cpus_empty(cpu_quiesce_map)); /* Quiesce the last cpu with the special psw */ quiesce_psw.mask = PSW_BASE_BITS | PSW_MASK_WAIT; quiesce_psw.addr = 0xfff; diff -prauN linux-2.6.0-test1/fs/adfs/inode.c wli-2.6.0-test1-37/fs/adfs/inode.c --- linux-2.6.0-test1/fs/adfs/inode.c 2003-07-13 20:32:34.000000000 -0700 +++ wli-2.6.0-test1-37/fs/adfs/inode.c 2003-07-14 08:52:52.000000000 -0700 @@ -64,7 +64,7 @@ static int adfs_readpage(struct file *fi static int adfs_prepare_write(struct file *file, struct page *page, unsigned int from, unsigned int to) { return cont_prepare_write(page, from, to, adfs_get_block, - &ADFS_I(page->mapping->host)->mmu_private); + &ADFS_I(page_mapping(page)->host)->mmu_private); } static sector_t _adfs_bmap(struct address_space *mapping, sector_t block) diff -prauN linux-2.6.0-test1/fs/affs/file.c wli-2.6.0-test1-37/fs/affs/file.c --- linux-2.6.0-test1/fs/affs/file.c 2003-07-13 20:34:02.000000000 -0700 +++ wli-2.6.0-test1-37/fs/affs/file.c 2003-07-14 08:52:52.000000000 -0700 @@ -418,7 +418,7 @@ static int affs_readpage(struct file *fi static int affs_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { return cont_prepare_write(page, from, to, affs_get_block, - &AFFS_I(page->mapping->host)->mmu_private); + &AFFS_I(page_mapping(page)->host)->mmu_private); } static sector_t _affs_bmap(struct address_space *mapping, sector_t block) { @@ -507,7 +507,7 @@ affs_file_write(struct file *file, const static int affs_do_readpage_ofs(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct super_block *sb = inode->i_sb; struct buffer_head *bh; char *data; @@ -615,7 +615,7 @@ out: static int affs_readpage_ofs(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; u32 to; int err; @@ -635,7 +635,7 @@ affs_readpage_ofs(struct file *file, str static int affs_prepare_write_ofs(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; u32 size, offset; u32 
tmp; int err = 0; @@ -676,7 +676,7 @@ static int affs_prepare_write_ofs(struct static int affs_commit_write_ofs(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct super_block *sb = inode->i_sb; struct buffer_head *bh, *prev_bh; char *data; diff -prauN linux-2.6.0-test1/fs/affs/symlink.c wli-2.6.0-test1-37/fs/affs/symlink.c --- linux-2.6.0-test1/fs/affs/symlink.c 2003-07-13 20:31:22.000000000 -0700 +++ wli-2.6.0-test1-37/fs/affs/symlink.c 2003-07-14 08:52:52.000000000 -0700 @@ -20,7 +20,7 @@ static int affs_symlink_readpage(struct file *file, struct page *page) { struct buffer_head *bh; - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; char *link = kmap(page); struct slink_front *lf; int err; diff -prauN linux-2.6.0-test1/fs/afs/file.c wli-2.6.0-test1-37/fs/afs/file.c --- linux-2.6.0-test1/fs/afs/file.c 2003-07-13 20:36:42.000000000 -0700 +++ wli-2.6.0-test1-37/fs/afs/file.c 2003-07-14 08:52:52.000000000 -0700 @@ -75,7 +75,7 @@ static int afs_file_readpage(struct file afs_vnode_t *vnode; int ret; - inode = page->mapping->host; + inode = page_mapping(page)->host; _enter("{%lu},{%lu}",inode->i_ino,page->index); diff -prauN linux-2.6.0-test1/fs/attr.c wli-2.6.0-test1-37/fs/attr.c --- linux-2.6.0-test1/fs/attr.c 2003-07-13 20:30:42.000000000 -0700 +++ wli-2.6.0-test1-37/fs/attr.c 2003-07-14 09:54:35.000000000 -0700 @@ -21,6 +21,7 @@ int inode_change_ok(struct inode *inode, { int retval = -EPERM; unsigned int ia_valid = attr->ia_valid; + task_t *task = current; /* If force is set do it anyway. */ if (ia_valid & ATTR_FORCE) @@ -28,7 +29,7 @@ int inode_change_ok(struct inode *inode, /* Make sure a caller can chown. */ if ((ia_valid & ATTR_UID) && - (current->fsuid != inode->i_uid || + (task->fsuid != inode->i_uid || attr->ia_uid != inode->i_uid) && !capable(CAP_CHOWN)) goto error; @@ -40,7 +41,7 @@ int inode_change_ok(struct inode *inode, /* Make sure a caller can chmod. */ if (ia_valid & ATTR_MODE) { - if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + if ((task->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) goto error; /* Also check the setgid bit! */ if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : @@ -50,7 +51,7 @@ int inode_change_ok(struct inode *inode, /* Check for setting the inode time. */ if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) { - if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER)) + if (task->fsuid != inode->i_uid && !capable(CAP_FOWNER)) goto error; } fine: diff -prauN linux-2.6.0-test1/fs/binfmt_elf.c wli-2.6.0-test1-37/fs/binfmt_elf.c --- linux-2.6.0-test1/fs/binfmt_elf.c 2003-07-13 20:33:48.000000000 -0700 +++ wli-2.6.0-test1-37/fs/binfmt_elf.c 2003-07-14 09:54:35.000000000 -0700 @@ -7,6 +7,7 @@ * Tools". * * Copyright 1993, 1994: Eric Youngdale (ericy@cais.com). 
+ * Top-down vma allocation support, William Irwin, IBM, 2003 */ #include <linux/module.h> @@ -45,7 +46,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs); static int load_elf_library(struct file*); -static unsigned long elf_map (struct file *, unsigned long, struct elf_phdr *, int, int); +static unsigned long elf_map(struct mm_struct *, struct file *, unsigned long, struct elf_phdr *, int, int); extern int dump_fpu (struct pt_regs *, elf_fpregset_t *); #ifndef elf_addr_t @@ -82,13 +83,13 @@ static struct linux_binfmt elf_format = #define BAD_ADDR(x) ((unsigned long)(x) > TASK_SIZE) -static void set_brk(unsigned long start, unsigned long end) +static void set_brk(struct mm_struct *mm, unsigned long start, unsigned long end) { start = ELF_PAGEALIGN(start); end = ELF_PAGEALIGN(end); if (end > start) do_brk(start, end - start); - current->mm->start_brk = current->mm->brk = end; + mm->start_brk = mm->brk = end; } @@ -162,7 +163,7 @@ create_elf_tables(struct linux_binprm *b */ if (smp_num_siblings > 1) - STACK_ALLOC(p, ((current->pid % 64) << 7)); + STACK_ALLOC(p, ((tsk->pid % 64) << 7)); #endif u_platform = (elf_addr_t *) STACK_ALLOC(p, len); __copy_to_user(u_platform, k_platform, len); @@ -230,7 +231,7 @@ create_elf_tables(struct linux_binprm *b } /* Populate argv and envp */ - p = current->mm->arg_start; + p = tsk->mm->arg_start; while (argc-- > 0) { size_t len; __put_user((elf_addr_t)p, argv++); @@ -240,7 +241,7 @@ create_elf_tables(struct linux_binprm *b p += len; } __put_user(0, argv); - current->mm->arg_end = current->mm->env_start = p; + tsk->mm->arg_end = tsk->mm->env_start = p; while (envc-- > 0) { size_t len; __put_user((elf_addr_t)p, envp++); @@ -250,7 +251,7 @@ create_elf_tables(struct linux_binprm *b p += len; } __put_user(0, envp); - current->mm->env_end = p; + tsk->mm->env_end = p; /* Put the elf_info on the stack in the right place. */ sp = (elf_addr_t *)envp + 1; @@ -259,16 +260,16 @@ create_elf_tables(struct linux_binprm *b #ifndef elf_map -static unsigned long elf_map(struct file *filep, unsigned long addr, - struct elf_phdr *eppnt, int prot, int type) +static unsigned long elf_map(struct mm_struct *mm, struct file *filep, + unsigned long addr, struct elf_phdr *eppnt, int prot, int type) { unsigned long map_addr; - down_write(&current->mm->mmap_sem); + down_write(&mm->mmap_sem); map_addr = do_mmap(filep, ELF_PAGESTART(addr), eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr), prot, type, eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr)); - up_write(&current->mm->mmap_sem); + up_write(&mm->mmap_sem); return(map_addr); } @@ -290,6 +291,7 @@ static unsigned long load_elf_interp(str unsigned long last_bss = 0, elf_bss = 0; unsigned long error = ~0UL; int retval, i, size; + struct mm_struct *mm = current->mm; /* First of all, some simple consistency checks */ if (interp_elf_ex->e_type != ET_EXEC && @@ -323,8 +325,13 @@ static unsigned long load_elf_interp(str if (retval < 0) goto out_close; +#ifndef CONFIG_MMAP_TOPDOWN eppnt = elf_phdata; for (i=0; i<interp_elf_ex->e_phnum; i++, eppnt++) { +#else + eppnt = &elf_phdata[interp_elf_ex->e_phnum - 1]; + for (i = interp_elf_ex->e_phnum - 1; i >= 0; --i, --eppnt) { +#endif if (eppnt->p_type == PT_LOAD) { int elf_type = MAP_PRIVATE | MAP_DENYWRITE; int elf_prot = 0; @@ -338,7 +345,8 @@ static unsigned long load_elf_interp(str if (interp_elf_ex->e_type == ET_EXEC || load_addr_set) elf_type |= MAP_FIXED; - map_addr = elf_map(interpreter, load_addr + vaddr, eppnt, elf_prot, elf_type); + map_addr = load_addr_set ?
load_addr + vaddr : 0; + map_addr = elf_map(mm, interpreter, map_addr, eppnt, elf_prot, elf_type); if (BAD_ADDR(map_addr)) goto out_close; @@ -393,11 +401,12 @@ static unsigned long load_aout_interp(st unsigned long text_data, elf_entry = ~0UL; char * addr; loff_t offset; + struct mm_struct *mm = current->mm; - current->mm->end_code = interp_ex->a_text; + mm->end_code = interp_ex->a_text; text_data = interp_ex->a_text + interp_ex->a_data; - current->mm->end_data = text_data; - current->mm->brk = interp_ex->a_bss + text_data; + mm->end_data = text_data; + mm->brk = interp_ex->a_bss + text_data; switch (N_MAGIC(*interp_ex)) { case OMAGIC: @@ -460,6 +469,7 @@ static int load_elf_binary(struct linux_ struct elfhdr interp_elf_ex; struct exec interp_ex; char passed_fileno[6]; + struct mm_struct *mm; /* Get the exec-header */ elf_ex = *((struct elfhdr *) bprm->buf); @@ -620,12 +630,13 @@ static int load_elf_binary(struct linux_ retval = flush_old_exec(bprm); if (retval) goto out_free_dentry; + mm = current->mm; /* OK, This is the point of no return */ - current->mm->start_data = 0; - current->mm->end_data = 0; - current->mm->end_code = 0; - current->mm->mmap = NULL; + mm->start_data = 0; + mm->end_data = 0; + mm->end_code = 0; + mm->mmap = NULL; current->flags &= ~PF_FORKNOEXEC; /* Do this immediately, since STACK_TOP as used in setup_arg_pages @@ -642,7 +653,7 @@ static int load_elf_binary(struct linux_ goto out_free_dentry; } - current->mm->start_stack = bprm->p; + mm->start_stack = bprm->p; /* Now we do a little grungy work by mmaping the ELF image into the correct location in memory. At this point, we assume that @@ -662,7 +673,7 @@ static int load_elf_binary(struct linux_ /* There was a PT_LOAD segment with p_memsz > p_filesz before this one. Map anonymous pages, if needed, and clear the area. */ - set_brk (elf_bss + load_bias, elf_brk + load_bias); + set_brk(mm, elf_bss + load_bias, elf_brk + load_bias); nbyte = ELF_PAGEOFFSET(elf_bss); if (nbyte) { nbyte = ELF_MIN_ALIGN - nbyte; @@ -688,7 +699,7 @@ static int load_elf_binary(struct linux_ load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); } - error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, elf_prot, elf_flags); + error = elf_map(mm, bprm->file, load_bias + vaddr, elf_ppnt, elf_prot, elf_flags); if (BAD_ADDR(error)) continue; @@ -775,7 +786,7 @@ static int load_elf_binary(struct linux_ /* Calling set_brk effectively mmaps the pages that we need * for the bss and break sections */ - set_brk(elf_bss, elf_brk); + set_brk(current->mm, elf_bss, elf_brk); padzero(elf_bss); @@ -839,6 +850,7 @@ static int load_elf_library(struct file unsigned long elf_bss, bss, len; int retval, error, i, j; struct elfhdr elf_ex; + struct mm_struct *mm = current->mm; error = -ENOEXEC; retval = kernel_read(file, 0, (char *) &elf_ex, sizeof(elf_ex)); @@ -876,7 +888,7 @@ static int load_elf_library(struct file while (elf_phdata->p_type != PT_LOAD) elf_phdata++; /* Now use mmap to map the library into memory. 
*/ - down_write(¤t->mm->mmap_sem); + down_write(&mm->mmap_sem); error = do_mmap(file, ELF_PAGESTART(elf_phdata->p_vaddr), (elf_phdata->p_filesz + @@ -885,7 +897,7 @@ static int load_elf_library(struct file MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, (elf_phdata->p_offset - ELF_PAGEOFFSET(elf_phdata->p_vaddr))); - up_write(¤t->mm->mmap_sem); + up_write(&mm->mmap_sem); if (error != ELF_PAGESTART(elf_phdata->p_vaddr)) goto out_free_ph; diff -prauN linux-2.6.0-test1/fs/bio.c wli-2.6.0-test1-37/fs/bio.c --- linux-2.6.0-test1/fs/bio.c 2003-07-13 20:35:12.000000000 -0700 +++ wli-2.6.0-test1-37/fs/bio.c 2003-07-14 09:54:35.000000000 -0700 @@ -374,6 +374,8 @@ static struct bio *__bio_map_user(struct int ret, offset, i; struct page **pages; struct bio *bio; + task_t *task = current; + struct mm_struct *mm = task->mm; /* * transfer and buffer must be aligned to at least hardsector @@ -390,10 +392,10 @@ static struct bio *__bio_map_user(struct if (!pages) goto out; - down_read(¤t->mm->mmap_sem); - ret = get_user_pages(current, current->mm, uaddr, nr_pages, + down_read(&mm->mmap_sem); + ret = get_user_pages(task, mm, uaddr, nr_pages, write_to_vm, 0, pages, NULL); - up_read(¤t->mm->mmap_sem); + up_read(&mm->mmap_sem); if (ret < nr_pages) goto out; diff -prauN linux-2.6.0-test1/fs/buffer.c wli-2.6.0-test1-37/fs/buffer.c --- linux-2.6.0-test1/fs/buffer.c 2003-07-13 20:34:42.000000000 -0700 +++ wli-2.6.0-test1-37/fs/buffer.c 2003-07-17 03:04:00.000000000 -0700 @@ -46,7 +46,7 @@ static void invalidate_bh_lrus(void); /* * Hashed waitqueue_head's for wait_on_buffer() */ -#define BH_WAIT_TABLE_ORDER 7 +#define BH_WAIT_TABLE_ORDER 12 static struct bh_wait_queue_head { wait_queue_head_t wqh; } ____cacheline_aligned_in_smp bh_wait_queue_heads[1<i_mapping; - struct address_space *buffer_mapping = bh->b_page->mapping; + struct address_space *buffer_mapping = page_mapping(bh->b_page); mark_buffer_dirty(bh); if (!mapping->assoc_mapping) { @@ -813,19 +813,10 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * * FIXME: may need to call ->reservepage here as well. That's rather up to the * address_space though. - * - * For now, we treat swapper_space specially. It doesn't use the normal - * block a_ops. */ -int __set_page_dirty_buffers(struct page *page) +int set_page_dirty_buffers(struct page *page) { - struct address_space * const mapping = page->mapping; - int ret = 0; - - if (mapping == NULL) { - SetPageDirty(page); - goto out; - } + struct address_space * const mapping = page_mapping(page); spin_lock(&mapping->private_lock); if (page_has_buffers(page)) { @@ -843,21 +834,19 @@ int __set_page_dirty_buffers(struct page spin_unlock(&mapping->private_lock); if (!TestSetPageDirty(page)) { - spin_lock(&mapping->page_lock); - if (page->mapping) { /* Race with truncate? */ + mapping_wrlock(&mapping->page_lock); + if (page_mapping(page)) { /* Race with truncate? */ if (!mapping->backing_dev_info->memory_backed) inc_page_state(nr_dirty); list_del(&page->list); list_add(&page->list, &mapping->dirty_pages); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } - -out: - return ret; + return 0; } -EXPORT_SYMBOL(__set_page_dirty_buffers); +EXPORT_SYMBOL(set_page_dirty_buffers); /* * Write out and wait upon a list of buffers. @@ -1229,7 +1218,7 @@ __getblk_slow(struct block_device *bdev, * address_space's dirty_pages list and then attach the address_space's * inode to its superblock's dirty inode list. * - * mark_buffer_dirty() is atomic. 
It takes bh->b_page->mapping->private_lock, + * mark_buffer_dirty() is atomic. It takes page_mapping(bh->b_page)->private_lock, * mapping->page_lock and the global inode_lock. */ void mark_buffer_dirty(struct buffer_head *bh) @@ -1237,7 +1226,7 @@ void mark_buffer_dirty(struct buffer_hea if (!buffer_uptodate(bh)) buffer_error(); if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh)) - __set_page_dirty_nobuffers(bh->b_page); + set_page_dirty_nobuffers(bh->b_page); } /* @@ -1265,7 +1254,7 @@ void __bforget(struct buffer_head *bh) { clear_buffer_dirty(bh); if (!list_empty(&bh->b_assoc_buffers)) { - struct address_space *buffer_mapping = bh->b_page->mapping; + struct address_space *buffer_mapping = page_mapping(bh->b_page); spin_lock(&buffer_mapping->private_lock); list_del_init(&bh->b_assoc_buffers); @@ -1552,7 +1541,7 @@ static inline void discard_buffer(struct */ int try_to_release_page(struct page *page, int gfp_mask) { - struct address_space * const mapping = page->mapping; + struct address_space * const mapping = page_mapping(page); if (!PageLocked(page)) BUG(); @@ -1618,7 +1607,7 @@ EXPORT_SYMBOL(block_invalidatepage); /* * We attach and possibly dirty the buffers atomically wrt - * __set_page_dirty_buffers() via private_lock. try_to_free_buffers + * set_page_dirty_buffers() via private_lock. try_to_free_buffers * is already excluded via the page lock. */ void create_empty_buffers(struct page *page, @@ -1635,7 +1624,7 @@ void create_empty_buffers(struct page *p } while (bh); tail->b_this_page = head; - spin_lock(&page->mapping->private_lock); + spin_lock(&page_mapping(page)->private_lock); if (PageUptodate(page) || PageDirty(page)) { bh = head; do { @@ -1647,7 +1636,7 @@ void create_empty_buffers(struct page *p } while (bh != head); } __set_page_buffers(page, head); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page_mapping(page)->private_lock); } EXPORT_SYMBOL(create_empty_buffers); @@ -1731,12 +1720,12 @@ static int __block_write_full_page(struc } /* - * Be very careful. We have no exclusion from __set_page_dirty_buffers + * Be very careful. We have no exclusion from set_page_dirty_buffers * here, and the (potentially unmapped) buffers may become dirty at * any time. If a buffer becomes dirty here after we've inspected it * then we just miss that fact, and the page stays dirty. * - * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; + * Buffers outside i_size may be dirtied by set_page_dirty_buffers; * handle that here by just cleaning them. 
*/ @@ -1787,7 +1776,7 @@ static int __block_write_full_page(struc lock_buffer(bh); } else { if (test_set_buffer_locked(bh)) { - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); continue; } } @@ -2040,7 +2029,7 @@ static int __block_commit_write(struct i */ int block_read_full_page(struct page *page, get_block_t *get_block) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; sector_t iblock, lblock; struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; unsigned int blocksize; @@ -2180,7 +2169,7 @@ out: int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, loff_t *bytes) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); struct inode *inode = mapping->host; struct page *new_page; unsigned long pgpos; @@ -2262,7 +2251,7 @@ out: int block_prepare_write(struct page *page, unsigned from, unsigned to, get_block_t *get_block) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int err = __block_prepare_write(inode, page, from, to, get_block); if (err) ClearPageUptodate(page); @@ -2271,7 +2260,7 @@ int block_prepare_write(struct page *pag int block_commit_write(struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; __block_commit_write(inode,page,from,to); return 0; } @@ -2279,7 +2268,7 @@ int block_commit_write(struct page *page int generic_commit_write(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; __block_commit_write(inode,page,from,to); /* @@ -2300,7 +2289,7 @@ int generic_commit_write(struct file *fi int nobh_prepare_write(struct page *page, unsigned from, unsigned to, get_block_t *get_block) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; const unsigned blkbits = inode->i_blkbits; const unsigned blocksize = 1 << blkbits; struct buffer_head map_bh; @@ -2434,7 +2423,7 @@ EXPORT_SYMBOL(nobh_prepare_write); int nobh_commit_write(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; set_page_dirty(page); @@ -2568,7 +2557,7 @@ out: int block_write_full_page(struct page *page, get_block_t *get_block, struct writeback_control *wbc) { - struct inode * const inode = page->mapping->host; + struct inode * const inode = page_mapping(page)->host; loff_t i_size = i_size_read(inode); const unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; unsigned offset; @@ -2744,9 +2733,9 @@ void sync_dirty_buffer(struct buffer_hea static void check_ttfb_buffer(struct page *page, struct buffer_head *bh) { if (!buffer_uptodate(bh) && !buffer_req(bh)) { - if (PageUptodate(page) && page->mapping + if (PageUptodate(page) && page_mapping(page) && buffer_mapped(bh) /* discard_buffer */ - && S_ISBLK(page->mapping->host->i_mode)) + && S_ISBLK(page_mapping(page)->host->i_mode)) { buffer_error(); } @@ -2768,7 +2757,7 @@ static void check_ttfb_buffer(struct pag * * The same applies to regular filesystem pages: if all the buffers are * clean then we set the page clean and proceed. 
To do that, we require - * total exclusion from __set_page_dirty_buffers(). That is obtained with + * total exclusion from set_page_dirty_buffers(). That is obtained with * private_lock. * * try_to_free_buffers() is non-blocking. @@ -2815,7 +2804,7 @@ failed: int try_to_free_buffers(struct page *page) { - struct address_space * const mapping = page->mapping; + struct address_space * const mapping = page_mapping(page); struct buffer_head *buffers_to_free = NULL; int ret = 0; diff -prauN linux-2.6.0-test1/fs/cifs/file.c wli-2.6.0-test1-37/fs/cifs/file.c --- linux-2.6.0-test1/fs/cifs/file.c 2003-07-13 20:28:53.000000000 -0700 +++ wli-2.6.0-test1-37/fs/cifs/file.c 2003-07-14 08:52:52.000000000 -0700 @@ -410,14 +410,14 @@ cifs_write(struct file * file, const cha static int cifs_partialpagewrite(struct page *page,unsigned from, unsigned to) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT; char * write_data; int rc = -EFAULT; int bytes_written = 0; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct cifsInodeInfo *cifsInode; struct cifsFileInfo *open_file = NULL; struct list_head *tmp; @@ -529,7 +529,7 @@ cifs_commit_write(struct file *file, str { int xid; int rc = 0; - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; loff_t position = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; struct cifsFileInfo *open_file; struct cifs_sb_info *cifs_sb; @@ -583,7 +583,7 @@ cifs_sync_page(struct page *page) int rc = 0; cFYI(1,("sync page %p",page)); - mapping = page->mapping; + mapping = page_mapping(page); if (!mapping) return 0; inode = mapping->host; diff -prauN linux-2.6.0-test1/fs/coda/symlink.c wli-2.6.0-test1-37/fs/coda/symlink.c --- linux-2.6.0-test1/fs/coda/symlink.c 2003-07-13 20:31:55.000000000 -0700 +++ wli-2.6.0-test1-37/fs/coda/symlink.c 2003-07-14 08:52:52.000000000 -0700 @@ -24,7 +24,7 @@ static int coda_symlink_filler(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int error; struct coda_inode_info *cii; unsigned int len = PAGE_SIZE; diff -prauN linux-2.6.0-test1/fs/cramfs/inode.c wli-2.6.0-test1-37/fs/cramfs/inode.c --- linux-2.6.0-test1/fs/cramfs/inode.c 2003-07-13 20:29:29.000000000 -0700 +++ wli-2.6.0-test1-37/fs/cramfs/inode.c 2003-07-14 08:52:52.000000000 -0700 @@ -400,7 +400,7 @@ static struct dentry * cramfs_lookup(str static int cramfs_readpage(struct file *file, struct page * page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; u32 maxblock, bytes_filled; void *pgdata; diff -prauN linux-2.6.0-test1/fs/dcache.c wli-2.6.0-test1-37/fs/dcache.c --- linux-2.6.0-test1/fs/dcache.c 2003-07-13 20:30:42.000000000 -0700 +++ wli-2.6.0-test1-37/fs/dcache.c 2003-07-14 10:07:15.000000000 -0700 @@ -1353,10 +1353,14 @@ char * d_path(struct dentry *dentry, str char *res; struct vfsmount *rootmnt; struct dentry *root; - read_lock(¤t->fs->lock); - rootmnt = mntget(current->fs->rootmnt); - root = dget(current->fs->root); - read_unlock(¤t->fs->lock); + struct fs_struct *fs = current->fs; + struct fs_dirs *dirs; + + rcu_read_lock(); /* fs->lock */ + dirs = fs->dirs; + rootmnt = mntget(dirs->rootmnt); + root = dget(dirs->root); + rcu_read_unlock(); /* fs->lock */ spin_lock(&dcache_lock); res = __d_path(dentry, 
vfsmnt, root, rootmnt, buf, buflen); spin_unlock(&dcache_lock); @@ -1389,16 +1393,19 @@ asmlinkage long sys_getcwd(char __user * struct vfsmount *pwdmnt, *rootmnt; struct dentry *pwd, *root; char *page = (char *) __get_free_page(GFP_USER); + struct fs_struct *fs = current->fs; + struct fs_dirs *dirs; if (!page) return -ENOMEM; - read_lock(¤t->fs->lock); - pwdmnt = mntget(current->fs->pwdmnt); - pwd = dget(current->fs->pwd); - rootmnt = mntget(current->fs->rootmnt); - root = dget(current->fs->root); - read_unlock(¤t->fs->lock); + rcu_read_lock(); /* fs->lock */ + dirs = fs->dirs; + pwdmnt = mntget(dirs->pwdmnt); + pwd = dget(dirs->pwd); + rootmnt = mntget(dirs->rootmnt); + root = dget(dirs->root); + rcu_read_unlock(); /* fs->lock */ error = -ENOENT; /* Has the current directory has been unlinked? */ @@ -1620,6 +1627,12 @@ extern void chrdev_init(void); void __init vfs_caches_init(unsigned long mempages) { + init_task.fs->dirs = kmalloc(sizeof(struct fs_dirs), GFP_KERNEL); + if (!init_task.fs->dirs) + panic("Cannot create init_task.fs->dirs!\n"); + else + memset(init_task.fs->dirs, 0, sizeof(struct fs_dirs)); + names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0, SLAB_HWCACHE_ALIGN, NULL, NULL); diff -prauN linux-2.6.0-test1/fs/direct-io.c wli-2.6.0-test1-37/fs/direct-io.c --- linux-2.6.0-test1/fs/direct-io.c 2003-07-13 20:34:42.000000000 -0700 +++ wli-2.6.0-test1-37/fs/direct-io.c 2003-07-14 09:54:35.000000000 -0700 @@ -133,19 +133,21 @@ static int dio_refill_pages(struct dio * { int ret; int nr_pages; + task_t *task = current; + struct mm_struct *mm = task->mm; nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES); - down_read(¤t->mm->mmap_sem); + down_read(&mm->mmap_sem); ret = get_user_pages( - current, /* Task for fault acounting */ - current->mm, /* whose pages? */ + task, /* Task for fault acounting */ + mm, /* whose pages? */ dio->curr_user_address, /* Where from? */ nr_pages, /* How many pages? */ dio->rw == READ, /* Write to memory? */ 0, /* force (?) 
*/ &dio->pages[0], NULL); /* vmas */ - up_read(¤t->mm->mmap_sem); + up_read(&mm->mmap_sem); if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) { /* diff -prauN linux-2.6.0-test1/fs/dquot.c wli-2.6.0-test1-37/fs/dquot.c --- linux-2.6.0-test1/fs/dquot.c 2003-07-13 20:36:06.000000000 -0700 +++ wli-2.6.0-test1-37/fs/dquot.c 2003-07-14 10:20:40.000000000 -0700 @@ -525,17 +525,25 @@ static int dqinit_needed(struct inode *i /* This routine is guarded by dqptr_sem semaphore */ static void add_dquot_ref(struct super_block *sb, int type) { - struct list_head *p; + int cpu; + struct file *filp; + restart: - file_list_lock(); - list_for_each(p, &sb->s_files) { - struct file *filp = list_entry(p, struct file, f_list); - struct inode *inode = filp->f_dentry->d_inode; - if (filp->f_mode & FMODE_WRITE && dqinit_needed(inode, type)) { - struct vfsmount *mnt = mntget(filp->f_vfsmnt); - struct dentry *dentry = dget(filp->f_dentry); - file_list_unlock(); + file_list_lock_all(sb->s_file_lists); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + list_for_each_entry(filp, &sb->s_file_lists[cpu].list, f_list) { + struct inode *inode = filp->f_dentry->d_inode; + struct vfsmount *mnt; + struct dentry *dentry; + + if (!(filp->f_mode & FMODE_WRITE) || + !dqinit_needed(inode, type)) + continue; + + mnt = mntget(filp->f_vfsmnt); + dentry = dget(filp->f_dentry); + file_list_unlock_all(sb->s_file_lists); sb->dq_op->initialize(inode, type); dput(dentry); mntput(mnt); @@ -543,7 +551,7 @@ restart: goto restart; } } - file_list_unlock(); + file_list_unlock_all(sb->s_file_lists); } /* Return 0 if dqput() won't block (note that 1 doesn't necessarily mean blocking) */ diff -prauN linux-2.6.0-test1/fs/efs/symlink.c wli-2.6.0-test1-37/fs/efs/symlink.c --- linux-2.6.0-test1/fs/efs/symlink.c 2003-07-13 20:34:43.000000000 -0700 +++ wli-2.6.0-test1-37/fs/efs/symlink.c 2003-07-14 08:52:52.000000000 -0700 @@ -16,7 +16,7 @@ static int efs_symlink_readpage(struct f { char *link = kmap(page); struct buffer_head * bh; - struct inode * inode = page->mapping->host; + struct inode * inode = page_mapping(page)->host; efs_block_t size = inode->i_size; int err; diff -prauN linux-2.6.0-test1/fs/exec.c wli-2.6.0-test1-37/fs/exec.c --- linux-2.6.0-test1/fs/exec.c 2003-07-13 20:32:44.000000000 -0700 +++ wli-2.6.0-test1-37/fs/exec.c 2003-07-14 09:59:54.000000000 -0700 @@ -44,7 +44,7 @@ #include #include #include -#include +#include #include #include @@ -185,6 +185,26 @@ static int count(char __user * __user * return i; } +static inline size_t exec_copy_from_user(struct page *page, + unsigned long offset, + const char __user *buf, + unsigned bytes) +{ + int left; + char *kaddr; + + kaddr = kmap_atomic(page, KM_USER0); + left = __copy_from_user(kaddr + offset, buf, bytes); + kunmap_atomic(kaddr, KM_USER0); + + if (left) { + kaddr = kmap(page); + left = __copy_from_user(kaddr + offset, buf, bytes); + kunmap(page); + } + return left; +} + /* * 'copy_strings()' copies argument/environment strings from user * memory to free pages in kernel mem. 
These are in a format ready @@ -192,8 +212,6 @@ static int count(char __user * __user * */ int copy_strings(int argc,char __user * __user * argv, struct linux_binprm *bprm) { - struct page *kmapped_page = NULL; - char *kaddr = NULL; int ret; while (argc-- > 0) { @@ -220,6 +238,7 @@ int copy_strings(int argc,char __user * int i, new, err; int offset, bytes_to_copy; struct page *page; + char *kaddr = NULL; offset = pos % PAGE_SIZE; i = pos/PAGE_SIZE; @@ -235,22 +254,26 @@ int copy_strings(int argc,char __user * new = 1; } - if (page != kmapped_page) { - if (kmapped_page) - kunmap(kmapped_page); - kmapped_page = page; - kaddr = kmap(kmapped_page); - } + bytes_to_copy = PAGE_SIZE - offset; + + if ((new && offset) || bytes_to_copy > len) + kaddr = kmap_atomic(page, KM_USER0); + if (new && offset) memset(kaddr, 0, offset); - bytes_to_copy = PAGE_SIZE - offset; + if (bytes_to_copy > len) { bytes_to_copy = len; if (new) memset(kaddr+offset+len, 0, PAGE_SIZE-offset-len); } - err = copy_from_user(kaddr+offset, str, bytes_to_copy); + + if (kaddr) + kunmap_atomic(kaddr, KM_USER0); + + fault_in_pages_readable(str, bytes_to_copy); + err = exec_copy_from_user(page, offset, str, bytes_to_copy); if (err) { ret = -EFAULT; goto out; @@ -263,8 +286,6 @@ int copy_strings(int argc,char __user * } ret = 0; out: - if (kmapped_page) - kunmap(kmapped_page); return ret; } @@ -286,52 +307,48 @@ int copy_strings_kernel(int argc,char ** * This routine is used to map in a page into an address space: needed by * execve() for the initial stack and environment pages. * - * tsk->mmap_sem is held for writing. + * The caller should hold task->mm->mmap_sem for writing. */ -void put_dirty_page(struct task_struct *tsk, struct page *page, - unsigned long address, pgprot_t prot) +void put_dirty_page(task_t *task, struct vm_area_struct *vma, + struct page *page, unsigned long address, pgprot_t prot) { - pgd_t * pgd; - pmd_t * pmd; - pte_t * pte; - struct pte_chain *pte_chain; + struct mm_struct *mm = task->mm; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; if (page_count(page) != 1) printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", page, address); - pgd = pgd_offset(tsk->mm, address); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto out_sig; - spin_lock(&tsk->mm->page_table_lock); - pmd = pmd_alloc(tsk->mm, pgd, address); + pgd = pgd_offset(mm, address); + spin_lock(&mm->page_table_lock); + pmd = pmd_alloc_map(mm, pgd, address); if (!pmd) goto out; - pte = pte_alloc_map(tsk->mm, pmd, address); + pte = pte_alloc_map(mm, pgd, &pmd, address); if (!pte) goto out; if (!pte_none(*pte)) { pte_unmap(pte); + pmd_unmap(pmd); goto out; } + mm->rss++; lru_cache_add_active(page); flush_dcache_page(page); - set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot)))); - pte_chain = page_add_rmap(page, pte, pte_chain); + vm_set_pte(vma, pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot))), address); + page_add_rmap(page, vma, address, 1); pte_unmap(pte); - tsk->mm->rss++; - spin_unlock(&tsk->mm->page_table_lock); + pmd_unmap(pmd); + spin_unlock(&mm->page_table_lock); /* no need for flush_tlb */ - pte_chain_free(pte_chain); return; out: - spin_unlock(&tsk->mm->page_table_lock); -out_sig: + spin_unlock(&mm->page_table_lock); __free_page(page); - force_sig(SIGKILL, tsk); - pte_chain_free(pte_chain); + force_sig(SIGKILL, task); return; } @@ -339,7 +356,8 @@ int setup_arg_pages(struct linux_binprm { unsigned long stack_base; struct vm_area_struct *mpnt; - struct mm_struct *mm = current->mm; + task_t *task = current; + struct mm_struct 
*mm = task->mm; int i; #ifdef CONFIG_STACK_GROWSUP @@ -373,7 +391,7 @@ int setup_arg_pages(struct linux_binprm /* Adjust bprm->p to point to the end of the strings. */ bprm->p = PAGE_SIZE * i - offset; - stack_base = STACK_TOP - current->rlim[RLIMIT_STACK].rlim_max; + stack_base = STACK_TOP - task->rlim[RLIMIT_STACK].rlim_max; mm->arg_start = stack_base; /* zero pages that were copied above */ @@ -424,7 +442,7 @@ int setup_arg_pages(struct linux_binprm struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current, page, stack_base, + put_dirty_page(task, mpnt, page, stack_base, mpnt->vm_page_prot); } stack_base += PAGE_SIZE; @@ -541,6 +559,7 @@ static inline int de_thread(struct task_ struct signal_struct *newsig, *oldsig = tsk->signal; struct sighand_struct *newsighand, *oldsighand = tsk->sighand; spinlock_t *lock = &oldsighand->siglock; + task_t *cur = current; int count; /* @@ -577,7 +596,7 @@ static inline int de_thread(struct task_ init_sigpending(&newsig->shared_pending); } - if (thread_group_empty(current)) + if (thread_group_empty(cur)) goto no_thread_group; /* @@ -599,19 +618,19 @@ static inline int de_thread(struct task_ return -EAGAIN; } oldsig->group_exit = 1; - zap_other_threads(current); + zap_other_threads(cur); read_unlock(&tasklist_lock); /* * Account for the thread group leader hanging around: */ count = 2; - if (current->pid == current->tgid) + if (cur->pid == cur->tgid) count = 1; while (atomic_read(&oldsig->count) > count) { - oldsig->group_exit_task = current; + oldsig->group_exit_task = cur; oldsig->notify_count = count; - __set_current_state(TASK_UNINTERRUPTIBLE); + __set_task_state(cur, TASK_UNINTERRUPTIBLE); spin_unlock_irq(lock); schedule(); spin_lock_irq(lock); @@ -623,8 +642,8 @@ static inline int de_thread(struct task_ * do is to wait for the thread group leader to become inactive, * and to assume its PID: */ - if (current->pid != current->tgid) { - struct task_struct *leader = current->group_leader, *parent; + if (cur->pid != cur->tgid) { + struct task_struct *leader = cur->group_leader, *parent; struct dentry *proc_dentry1, *proc_dentry2; unsigned long state, ptrace; @@ -637,14 +656,14 @@ static inline int de_thread(struct task_ yield(); spin_lock(&leader->proc_lock); - spin_lock(&current->proc_lock); - proc_dentry1 = proc_pid_unhash(current); + spin_lock(&cur->proc_lock); + proc_dentry1 = proc_pid_unhash(cur); proc_dentry2 = proc_pid_unhash(leader); write_lock_irq(&tasklist_lock); - if (leader->tgid != current->tgid) + if (leader->tgid != cur->tgid) BUG(); - if (current->pid == current->tgid) + if (cur->pid == cur->tgid) BUG(); /* * An exec() starts a new thread group with the @@ -655,33 +674,33 @@ static inline int de_thread(struct task_ ptrace = leader->ptrace; parent = leader->parent; - ptrace_unlink(current); + ptrace_unlink(cur); ptrace_unlink(leader); - remove_parent(current); + remove_parent(cur); remove_parent(leader); - switch_exec_pids(leader, current); + switch_exec_pids(leader, cur); - current->parent = current->real_parent = leader->real_parent; + cur->parent = cur->real_parent = leader->real_parent; leader->parent = leader->real_parent = child_reaper; - current->group_leader = current; + cur->group_leader = cur; leader->group_leader = leader; - add_parent(current, current->parent); + add_parent(cur, cur->parent); add_parent(leader, leader->parent); if (ptrace) { - current->ptrace = ptrace; - __ptrace_link(current, parent); + cur->ptrace = ptrace; + __ptrace_link(cur, parent); } - list_del(&current->tasks); -
list_add_tail(&current->tasks, &init_task.tasks); - current->exit_signal = SIGCHLD; + list_del(&cur->tasks); + list_add_tail(&cur->tasks, &init_task.tasks); + cur->exit_signal = SIGCHLD; state = leader->state; write_unlock_irq(&tasklist_lock); spin_unlock(&leader->proc_lock); - spin_unlock(&current->proc_lock); + spin_unlock(&cur->proc_lock); proc_pid_flush(proc_dentry1); proc_pid_flush(proc_dentry2); @@ -696,12 +715,12 @@ no_thread_group: spin_lock(&oldsighand->siglock); spin_lock(&newsighand->siglock); - if (current == oldsig->curr_target) - oldsig->curr_target = next_thread(current); + if (cur == oldsig->curr_target) + oldsig->curr_target = next_thread(cur); if (newsig) - current->signal = newsig; - current->sighand = newsighand; - init_sigpending(&current->pending); + cur->signal = newsig; + cur->sighand = newsighand; + init_sigpending(&cur->pending); recalc_sigpending(); spin_unlock(&newsighand->siglock); @@ -714,9 +733,9 @@ no_thread_group: if (atomic_dec_and_test(&oldsighand->count)) kmem_cache_free(sighand_cachep, oldsighand); - if (!thread_group_empty(current)) + if (!thread_group_empty(cur)) BUG(); - if (current->tgid != current->pid) + if (cur->tgid != cur->pid) BUG(); return 0; } @@ -758,12 +777,13 @@ int flush_old_exec(struct linux_binprm * { char * name; int i, ch, retval; + task_t *task = current; /* * Make sure we have a private signal table and that * we are unassociated from the previous thread group. */ - retval = de_thread(current); + retval = de_thread(task); if (retval) goto out; @@ -778,34 +798,34 @@ int flush_old_exec(struct linux_binprm * /* This is the point of no return */ - current->sas_ss_sp = current->sas_ss_size = 0; + task->sas_ss_sp = task->sas_ss_size = 0; - if (current->euid == current->uid && current->egid == current->gid) - current->mm->dumpable = 1; + if (task->euid == task->uid && task->egid == task->gid) + task->mm->dumpable = 1; name = bprm->filename; for (i=0; (ch = *(name++)) != '\0';) { if (ch == '/') i = 0; else if (i < 15) - current->comm[i++] = ch; + task->comm[i++] = ch; } - current->comm[i] = '\0'; + task->comm[i] = '\0'; flush_thread(); - if (bprm->e_uid != current->euid || bprm->e_gid != current->egid || + if (bprm->e_uid != task->euid || bprm->e_gid != task->egid || permission(bprm->file->f_dentry->d_inode,MAY_READ, NULL)) - current->mm->dumpable = 0; + task->mm->dumpable = 0; /* An exec changes our domain. We are no longer part of the thread group */ - current->self_exec_id++; + task->self_exec_id++; - flush_signal_handlers(current, 0); - flush_old_files(current->files); - exit_itimers(current); + flush_signal_handlers(task, 0); + flush_old_files(task->files); + exit_itimers(task); return 0; @@ -831,6 +851,7 @@ int prepare_binprm(struct linux_binprm * int mode; struct inode * inode = bprm->file->f_dentry->d_inode; int retval; + task_t *task = current; mode = inode->i_mode; /* @@ -842,8 +863,8 @@ int prepare_binprm(struct linux_binprm * if (bprm->file->f_op == NULL) return -EACCES; - bprm->e_uid = current->euid; - bprm->e_gid = current->egid; + bprm->e_uid = task->euid; + bprm->e_gid = task->egid; if(!(bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID)) { /* Set-uid?
*/ @@ -886,25 +907,26 @@ int prepare_binprm(struct linux_binprm * void compute_creds(struct linux_binprm *bprm) { - task_lock(current); - if (bprm->e_uid != current->uid || bprm->e_gid != current->gid) { - current->mm->dumpable = 0; + task_t *task = current; + task_lock(task); + if (bprm->e_uid != task->uid || bprm->e_gid != task->gid) { + task->mm->dumpable = 0; - if (must_not_trace_exec(current) - || atomic_read(&current->fs->count) > 1 - || atomic_read(&current->files->count) > 1 - || atomic_read(&current->sighand->count) > 1) { + if (must_not_trace_exec(task) + || atomic_read(&task->fs->count) > 1 + || atomic_read(&task->files->count) > 1 + || atomic_read(&task->sighand->count) > 1) { if(!capable(CAP_SETUID)) { - bprm->e_uid = current->uid; - bprm->e_gid = current->gid; + bprm->e_uid = task->uid; + bprm->e_gid = task->gid; } } } - current->suid = current->euid = current->fsuid = bprm->e_uid; - current->sgid = current->egid = current->fsgid = bprm->e_gid; + task->suid = task->euid = task->fsuid = bprm->e_uid; + task->sgid = task->egid = task->fsgid = bprm->e_gid; - task_unlock(current); + task_unlock(task); security_bprm_compute_creds(bprm); } @@ -940,6 +962,8 @@ int search_binary_handler(struct linux_b { int try,retval=0; struct linux_binfmt *fmt; + task_t *task = current; + #ifdef __alpha__ /* handle /sbin/loader.. */ { @@ -998,7 +1022,7 @@ int search_binary_handler(struct linux_b if (bprm->file) fput(bprm->file); bprm->file = NULL; - current->did_exec = 1; + task->did_exec = 1; return retval; } read_lock(&binfmt_lock); @@ -1125,13 +1149,14 @@ out_file: int set_binfmt(struct linux_binfmt *new) { - struct linux_binfmt *old = current->binfmt; + task_t *task = current; + struct linux_binfmt *old = task->binfmt; if (new) { if (!try_module_get(new->module)) return -1; } - current->binfmt = new; + task->binfmt = new; if (old) module_put(old->module); return 0; diff -prauN linux-2.6.0-test1/fs/ext2/dir.c wli-2.6.0-test1-37/fs/ext2/dir.c --- linux-2.6.0-test1/fs/ext2/dir.c 2003-07-13 20:34:40.000000000 -0700 +++ wli-2.6.0-test1-37/fs/ext2/dir.c 2003-07-14 08:52:52.000000000 -0700 @@ -64,10 +64,10 @@ ext2_last_byte(struct inode *inode, unsi static int ext2_commit_chunk(struct page *page, unsigned from, unsigned to) { - struct inode *dir = page->mapping->host; + struct inode *dir = page_mapping(page)->host; int err = 0; dir->i_version++; - page->mapping->a_ops->commit_write(NULL, page, from, to); + page_mapping(page)->a_ops->commit_write(NULL, page, from, to); if (IS_DIRSYNC(dir)) err = write_one_page(page, 1); else @@ -77,7 +77,7 @@ static int ext2_commit_chunk(struct page static void ext2_check_page(struct page *page) { - struct inode *dir = page->mapping->host; + struct inode *dir = page_mapping(page)->host; struct super_block *sb = dir->i_sb; unsigned chunk_size = ext2_chunk_size(dir); char *kaddr = page_address(page); @@ -412,7 +412,7 @@ void ext2_set_link(struct inode *dir, st int err; lock_page(page); - err = page->mapping->a_ops->prepare_write(NULL, page, from, to); + err = page_mapping(page)->a_ops->prepare_write(NULL, page, from, to); if (err) BUG(); de->inode = cpu_to_le32(inode->i_ino); @@ -495,7 +495,7 @@ int ext2_add_link (struct dentry *dentry got_it: from = (char*)de - (char*)page_address(page); to = from + rec_len; - err = page->mapping->a_ops->prepare_write(NULL, page, from, to); + err = page_mapping(page)->a_ops->prepare_write(NULL, page, from, to); if (err) goto out_unlock; if (de->inode) { @@ -528,7 +528,7 @@ out_unlock: */ int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page *
page ) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); struct inode *inode = mapping->host; char *kaddr = page_address(page); unsigned from = ((char*)dir - kaddr) & ~(ext2_chunk_size(inode)-1); diff -prauN linux-2.6.0-test1/fs/ext3/inode.c wli-2.6.0-test1-37/fs/ext3/inode.c --- linux-2.6.0-test1/fs/ext3/inode.c 2003-07-13 20:35:56.000000000 -0700 +++ wli-2.6.0-test1-37/fs/ext3/inode.c 2003-07-14 09:10:59.000000000 -0700 @@ -1083,7 +1083,7 @@ static int do_journal_get_write_access(h static int ext3_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int ret, needed_blocks = ext3_writepage_trans_blocks(inode); handle_t *handle; @@ -1138,7 +1138,7 @@ static int ext3_ordered_commit_write(str unsigned from, unsigned to) { handle_t *handle = ext3_journal_current_handle(); - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int ret = 0, ret2; ret = walk_page_buffers(handle, page_buffers(page), @@ -1167,7 +1167,7 @@ static int ext3_writeback_commit_write(s unsigned from, unsigned to) { handle_t *handle = ext3_journal_current_handle(); - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int ret = 0, ret2; loff_t new_i_size; @@ -1185,7 +1185,7 @@ static int ext3_journalled_commit_write( struct page *page, unsigned from, unsigned to) { handle_t *handle = ext3_journal_current_handle(); - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int ret = 0, ret2; int partial = 0; loff_t pos; @@ -1340,7 +1340,7 @@ static int journal_dirty_data_fn(handle_ static int ext3_ordered_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct buffer_head *page_bufs; handle_t *handle = NULL; int ret = 0; @@ -1400,7 +1400,7 @@ static int ext3_ordered_writepage(struct return ret; out_fail: - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); unlock_page(page); return ret; } @@ -1408,7 +1408,7 @@ out_fail: static int ext3_writeback_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; handle_t *handle = NULL; int ret = 0; int err; @@ -1429,7 +1429,7 @@ static int ext3_writeback_writepage(stru return ret; out_fail: - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); unlock_page(page); return ret; } @@ -1437,7 +1437,7 @@ out_fail: static int ext3_journalled_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; handle_t *handle = NULL; int ret = 0; int err; @@ -1485,7 +1485,7 @@ out: return ret; no_write: - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); out_unlock: unlock_page(page); goto out; @@ -1505,7 +1505,7 @@ ext3_readpages(struct file *file, struct static int ext3_invalidatepage(struct page *page, unsigned long offset) { - journal_t *journal = EXT3_JOURNAL(page->mapping->host); + journal_t *journal = EXT3_JOURNAL(page_mapping(page)->host); /* * If it's a full truncate we just forget about the pending dirtying @@ -1518,7 +1518,7 @@ static int ext3_invalidatepage(struct pa static int ext3_releasepage(struct page *page, int wait) { - journal_t *journal = 
EXT3_JOURNAL(page->mapping->host); + journal_t *journal = EXT3_JOURNAL(page_mapping(page)->host); WARN_ON(PageChecked(page)); return journal_try_to_free_buffers(journal, page, wait); @@ -1604,7 +1604,7 @@ out: static int ext3_journalled_set_page_dirty(struct page *page) { SetPageChecked(page); - return __set_page_dirty_nobuffers(page); + return set_page_dirty_nobuffers(page); } static struct address_space_operations ext3_ordered_aops = { diff -prauN linux-2.6.0-test1/fs/fat/inode.c wli-2.6.0-test1-37/fs/fat/inode.c --- linux-2.6.0-test1/fs/fat/inode.c 2003-07-13 20:33:47.000000000 -0700 +++ wli-2.6.0-test1-37/fs/fat/inode.c 2003-07-14 08:52:52.000000000 -0700 @@ -1070,7 +1070,7 @@ fat_prepare_write(struct file *file, str { kmap(page); return cont_prepare_write(page,from,to,fat_get_block, - &MSDOS_I(page->mapping->host)->mmu_private); + &MSDOS_I(page_mapping(page)->host)->mmu_private); } static int diff -prauN linux-2.6.0-test1/fs/fcntl.c wli-2.6.0-test1-37/fs/fcntl.c --- linux-2.6.0-test1/fs/fcntl.c 2003-07-13 20:35:55.000000000 -0700 +++ wli-2.6.0-test1-37/fs/fcntl.c 2003-07-14 09:45:14.000000000 -0700 @@ -36,9 +36,9 @@ static inline int get_close_on_exec(unsi { struct files_struct *files = current->files; int res; - spin_lock(&files->file_lock); + rcu_read_lock(); res = FD_ISSET(fd, files->close_on_exec); - spin_unlock(&files->file_lock); + rcu_read_unlock(); return res; } @@ -140,7 +140,7 @@ static int dupfd(struct file *file, unsi if (fd >= 0) { FD_SET(fd, files->open_fds); FD_CLR(fd, files->close_on_exec); - spin_unlock(&files->file_lock); + spin_unlock(&files->file_lock); fd_install(fd, file); } else { spin_unlock(&files->file_lock); @@ -185,6 +185,7 @@ asmlinkage long sys_dup2(unsigned int ol goto out_fput; files->fd[newfd] = file; + wmb(); FD_SET(newfd, files->open_fds); FD_CLR(newfd, files->close_on_exec); spin_unlock(&files->file_lock); diff -prauN linux-2.6.0-test1/fs/file.c wli-2.6.0-test1-37/fs/file.c --- linux-2.6.0-test1/fs/file.c 2003-07-13 20:30:37.000000000 -0700 +++ wli-2.6.0-test1-37/fs/file.c 2003-07-14 09:45:14.000000000 -0700 @@ -14,7 +14,20 @@ #include #include +#include +struct rcu_fd_array { + struct rcu_head rh; + struct file **array; + int nfds; +}; + +struct rcu_fd_set { + struct rcu_head rh; + fd_set *openset; + fd_set *execset; + int nfds; +}; /* * Allocate an fd array, using kmalloc or vmalloc. @@ -49,6 +62,13 @@ void free_fd_array(struct file **array, vfree(array); } +static void fd_array_callback(void *arg) +{ + struct rcu_fd_array *a = (struct rcu_fd_array *) arg; + free_fd_array(a->array, a->nfds); + kfree(arg); +} + /* * Expand the fd array in the files_struct. Called with the files * spinlock held for write. 
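fd_array_callback() above is the reclamation half of the RCU handoff: the old fd array may only be freed after a grace period, once no lockless reader can still hold the stale files->fd pointer. The update side, sketched with this kernel's three-argument call_rcu() and the names from the patch (old_max stands for the saved files->max_fds):

	/* the new, larger array is already visible to readers here */
	arg->array = old_fds;	/* what to free */
	arg->nfds = old_max;	/* how big it was */
	call_rcu(&arg->rh, fd_array_callback, arg);
	/*
	 * fd_array_callback() runs only after every CPU has passed
	 * through a quiescent state, so no reader can still be
	 * walking old_fds when free_fd_array() finally runs.
	 */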
@@ -56,8 +76,9 @@ void free_fd_array(struct file **array, int expand_fd_array(struct files_struct *files, int nr) { - struct file **new_fds; - int error, nfds; + struct file **new_fds = NULL; + int error, nfds = 0; + struct rcu_fd_array *arg = NULL; error = -EMFILE; @@ -89,18 +110,17 @@ int expand_fd_array(struct files_struct error = -ENOMEM; new_fds = alloc_fd_array(nfds); + arg = (struct rcu_fd_array *) kmalloc(sizeof(*arg), GFP_ATOMIC); + spin_lock(&files->file_lock); - if (!new_fds) + if (!new_fds || !arg) goto out; /* Copy the existing array and install the new pointer */ if (nfds > files->max_fds) { - struct file **old_fds; - int i; - - old_fds = xchg(&files->fd, new_fds); - i = xchg(&files->max_fds, nfds); + struct file **old_fds = files->fd; + int i = files->max_fds; /* Don't copy/clear the array if we are creating a new fd array for fork() */ @@ -109,19 +129,34 @@ int expand_fd_array(struct files_struct /* clear the remainder of the array */ memset(&new_fds[i], 0, (nfds-i) * sizeof(struct file *)); - - spin_unlock(&files->file_lock); - free_fd_array(old_fds, i); - spin_lock(&files->file_lock); } + + wmb(); + files->fd = new_fds; + wmb(); + files->max_fds = nfds; + + if (i) { + arg->array = old_fds; + arg->nfds = i; + call_rcu(&arg->rh, fd_array_callback, arg); + } else + kfree(arg); } else { /* Somebody expanded the array while we slept ... */ spin_unlock(&files->file_lock); free_fd_array(new_fds, nfds); + kfree(arg); spin_lock(&files->file_lock); } - error = 0; + + return 0; out: + if (new_fds) + free_fd_array(new_fds, nfds); + if (arg) + kfree(arg); + return error; } @@ -153,6 +188,14 @@ void free_fdset(fd_set *array, int num) vfree(array); } +static void fd_set_callback (void *arg) +{ + struct rcu_fd_set *a = (struct rcu_fd_set *) arg; + free_fdset(a->openset, a->nfds); + free_fdset(a->execset, a->nfds); + kfree(arg); +} + /* * Expand the fdset in the files_struct. Called with the files spinlock * held for write. @@ -161,6 +204,7 @@ int expand_fdset(struct files_struct *fi { fd_set *new_openset = 0, *new_execset = 0; int error, nfds = 0; + struct rcu_fd_set *arg = NULL; error = -EMFILE; if (files->max_fdset >= NR_OPEN || nr >= NR_OPEN) @@ -183,35 +227,43 @@ int expand_fdset(struct files_struct *fi error = -ENOMEM; new_openset = alloc_fdset(nfds); new_execset = alloc_fdset(nfds); + arg = (struct rcu_fd_set *) kmalloc(sizeof(*arg), GFP_ATOMIC); spin_lock(&files->file_lock); - if (!new_openset || !new_execset) + if (!new_openset || !new_execset || !arg) goto out; error = 0; /* Copy the existing tables and install the new pointers */ if (nfds > files->max_fdset) { - int i = files->max_fdset / (sizeof(unsigned long) * 8); - int count = (nfds - files->max_fdset) / 8; + fd_set * old_openset = files->open_fds; + fd_set * old_execset = files->close_on_exec; + int old_nfds = files->max_fdset; + int i = old_nfds / (sizeof(unsigned long) * 8); + int count = (nfds - old_nfds) / 8; /* * Don't copy the entire array if the current fdset is * not yet initialised. 
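The wmb() pair in expand_fd_array() above (and the matching pair in expand_fdset() just below) orders the publication: the copied contents become visible before the new pointers, and the new pointers before the enlarged limit. A sketch of the lockless reader this pairs with; this illustrates the ordering only and is not the patch's exact fcheck() code:

	struct file *lookup(struct files_struct *files, unsigned int fd)
	{
		struct file *file = NULL;

		rcu_read_lock();
		if (fd < files->max_fds) {
			/*
			 * Pairs with the writer's wmb(): a reader that
			 * observes the enlarged max_fds must also observe
			 * the new, fully-copied fd array.
			 */
			rmb();
			file = files->fd[fd];
			smp_read_barrier_depends();	/* before using *file */
		}
		rcu_read_unlock();
		return file;
	}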
*/ if (i) { - memcpy (new_openset, files->open_fds, files->max_fdset/8); - memcpy (new_execset, files->close_on_exec, files->max_fdset/8); + memcpy (new_openset, old_openset, old_nfds/8); + memcpy (new_execset, old_execset, old_nfds/8); memset (&new_openset->fds_bits[i], 0, count); memset (&new_execset->fds_bits[i], 0, count); } - nfds = xchg(&files->max_fdset, nfds); - new_openset = xchg(&files->open_fds, new_openset); - new_execset = xchg(&files->close_on_exec, new_execset); - spin_unlock(&files->file_lock); - free_fdset (new_openset, nfds); - free_fdset (new_execset, nfds); - spin_lock(&files->file_lock); + wmb(); + files->open_fds = new_openset; + files->close_on_exec = new_execset; + wmb(); + files->max_fdset = nfds; + + arg->openset = old_openset; + arg->execset = old_execset; + arg->nfds = nfds; + call_rcu(&arg->rh, fd_set_callback, arg); + return 0; } /* Somebody expanded the array while we slept ... */ @@ -222,6 +274,8 @@ out: free_fdset(new_openset, nfds); if (new_execset) free_fdset(new_execset, nfds); + if (arg) + kfree(arg); spin_lock(&files->file_lock); return error; } diff -prauN linux-2.6.0-test1/fs/file_table.c wli-2.6.0-test1-37/fs/file_table.c --- linux-2.6.0-test1/fs/file_table.c 2003-07-13 20:29:30.000000000 -0700 +++ wli-2.6.0-test1-37/fs/file_table.c 2003-07-14 10:20:40.000000000 -0700 @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -23,8 +24,6 @@ struct files_stat_struct files_stat = { }; /* public *and* exported. Not pretty! */ -spinlock_t __cacheline_aligned_in_smp files_lock = SPIN_LOCK_UNLOCKED; - static spinlock_t filp_count_lock = SPIN_LOCK_UNLOCKED; /* slab constructors and destructors are called from arbitrary @@ -155,6 +154,7 @@ void __fput(struct file *file) struct dentry *dentry = file->f_dentry; struct vfsmount *mnt = file->f_vfsmnt; struct inode *inode = dentry->d_inode; + struct file_list *container; /* * The function eventpoll_release() should be the first called @@ -173,7 +173,16 @@ void __fput(struct file *file) put_write_access(inode); file->f_dentry = NULL; file->f_vfsmnt = NULL; - file_kill(file); + rcu_read_lock(); + smp_read_barrier_depends(); + container = file->f_container; + if (container) { + spin_lock(&container->lock); + list_del_init(&file->f_list); + file->f_container = NULL; + spin_unlock(&container->lock); + } + rcu_read_unlock(); file_free(file); dput(dentry); mntput(mnt); @@ -182,13 +191,24 @@ void __fput(struct file *file) struct file *fget(unsigned int fd) { struct file *file; - struct files_struct *files = current->files; - spin_lock(&files->file_lock); + rcu_read_lock(); file = fcheck(fd); - if (file) + if (file) { get_file(file); - spin_unlock(&files->file_lock); + + /* Before returning, check again whether someone (currently only + * sys_close()) has cleared the fd_array entry in the meantime. If + * so, our get_file() may have kept that closer's final fput() from + * dropping the last reference; do the fput() on its behalf.
+ */ + + if (!(fcheck(fd))) { + fput(file); + return NULL; + } + } + rcu_read_unlock(); return file; } @@ -208,13 +228,13 @@ struct file *fget_light(unsigned int fd, if (likely((atomic_read(&files->count) == 1))) { file = fcheck(fd); } else { - spin_lock(&files->file_lock); + rcu_read_lock(); /* files->file_lock */ file = fcheck(fd); if (file) { get_file(file); *fput_needed = 1; } - spin_unlock(&files->file_lock); + rcu_read_unlock(); /* files->file_lock */ } return file; } @@ -222,53 +242,50 @@ struct file *fget_light(unsigned int fd, void put_filp(struct file *file) { - if (atomic_dec_and_test(&file->f_count)) { - security_file_free(file); - file_kill(file); - file_free(file); - } -} + struct file_list *container; -void file_move(struct file *file, struct list_head *list) -{ - if (!list) + if (!atomic_dec_and_test(&file->f_count)) return; - file_list_lock(); - list_move(&file->f_list, list); - file_list_unlock(); -} -void file_kill(struct file *file) -{ - if (!list_empty(&file->f_list)) { - file_list_lock(); + security_file_free(file); + rcu_read_lock(); + smp_read_barrier_depends(); + container = file->f_container; + if (container) { + spin_lock(&container->lock); list_del_init(&file->f_list); - file_list_unlock(); + file->f_container = NULL; + spin_unlock(&container->lock); } + rcu_read_unlock(); + file_free(file); } int fs_may_remount_ro(struct super_block *sb) { - struct list_head *p; + int cpu; /* Check that no files are currently opened for writing. */ - file_list_lock(); - list_for_each(p, &sb->s_files) { - struct file *file = list_entry(p, struct file, f_list); - struct inode *inode = file->f_dentry->d_inode; - - /* File with pending delete? */ - if (inode->i_nlink == 0) - goto too_bad; - - /* Writeable file? */ - if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE)) - goto too_bad; + file_list_lock_all(sb->s_file_lists); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + struct file *file; + list_for_each_entry(file, &sb->s_file_lists[cpu].list, f_list) { + struct inode *inode = file->f_dentry->d_inode; + + /* File with pending delete? */ + if (inode->i_nlink == 0) + goto too_bad; + + /* Writeable file? */ + if (S_ISREG(inode->i_mode) && + (file->f_mode & FMODE_WRITE)) + goto too_bad; + } } - file_list_unlock(); + file_list_unlock_all(sb->s_file_lists); return 1; /* Tis' cool bro.
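fs_may_remount_ro() and add_dquot_ref() both walk sb->s_file_lists[cpu] for every CPU, so "lock the whole set" helpers are needed for these rare whole-superblock scans, while the common open/close path touches only one CPU-local lock. A plausible shape for the structures and helpers this code assumes; the real definitions live elsewhere in the patchset:

	struct file_list {
		spinlock_t	lock;
		struct list_head list;
	};

	static void file_list_lock_all(struct file_list *lists)
	{
		int cpu;
		/* always 0..NR_CPUS-1, so two lock-all callers can't ABBA-deadlock */
		for (cpu = 0; cpu < NR_CPUS; ++cpu)
			spin_lock(&lists[cpu].lock);
	}

	static void file_list_unlock_all(struct file_list *lists)
	{
		int cpu;
		for (cpu = 0; cpu < NR_CPUS; ++cpu)
			spin_unlock(&lists[cpu].lock);
	}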
*/ too_bad: - file_list_unlock(); + file_list_unlock_all(sb->s_file_lists); return 0; } diff -prauN linux-2.6.0-test1/fs/freevxfs/vxfs_immed.c wli-2.6.0-test1-37/fs/freevxfs/vxfs_immed.c --- linux-2.6.0-test1/fs/freevxfs/vxfs_immed.c 2003-07-13 20:35:56.000000000 -0700 +++ wli-2.6.0-test1-37/fs/freevxfs/vxfs_immed.c 2003-07-14 08:52:52.000000000 -0700 @@ -122,7 +122,7 @@ vxfs_immed_follow_link(struct dentry *dp static int vxfs_immed_readpage(struct file *fp, struct page *pp) { - struct vxfs_inode_info *vip = VXFS_INO(pp->mapping->host); + struct vxfs_inode_info *vip = VXFS_INO(page_mapping(pp)->host); u_int64_t offset = pp->index << PAGE_CACHE_SHIFT; caddr_t kaddr; diff -prauN linux-2.6.0-test1/fs/fs-writeback.c wli-2.6.0-test1-37/fs/fs-writeback.c --- linux-2.6.0-test1/fs/fs-writeback.c 2003-07-13 20:39:32.000000000 -0700 +++ wli-2.6.0-test1-37/fs/fs-writeback.c 2003-07-14 08:33:37.000000000 -0700 @@ -150,10 +150,10 @@ __sync_single_inode(struct inode *inode, * read speculatively by this cpu before &= ~I_DIRTY -- mikulas */ - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); if (wait || !wbc->for_kupdate || list_empty(&mapping->io_pages)) list_splice_init(&mapping->dirty_pages, &mapping->io_pages); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); spin_unlock(&inode_lock); do_writepages(mapping, wbc); diff -prauN linux-2.6.0-test1/fs/hfs/inode.c wli-2.6.0-test1-37/fs/hfs/inode.c --- linux-2.6.0-test1/fs/hfs/inode.c 2003-07-13 20:39:23.000000000 -0700 +++ wli-2.6.0-test1-37/fs/hfs/inode.c 2003-07-14 08:52:52.000000000 -0700 @@ -240,7 +240,7 @@ static int hfs_readpage(struct file *fil static int hfs_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { return cont_prepare_write(page,from,to,hfs_get_block, - &HFS_I(page->mapping->host)->mmu_private); + &HFS_I(page_mapping(page)->host)->mmu_private); } static sector_t hfs_bmap(struct address_space *mapping, sector_t block) { diff -prauN linux-2.6.0-test1/fs/hpfs/file.c wli-2.6.0-test1-37/fs/hpfs/file.c --- linux-2.6.0-test1/fs/hpfs/file.c 2003-07-13 20:38:48.000000000 -0700 +++ wli-2.6.0-test1-37/fs/hpfs/file.c 2003-07-14 08:52:52.000000000 -0700 @@ -109,7 +109,7 @@ static int hpfs_readpage(struct file *fi static int hpfs_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { return cont_prepare_write(page,from,to,hpfs_get_block, - &hpfs_i(page->mapping->host)->mmu_private); + &hpfs_i(page_mapping(page)->host)->mmu_private); } static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block) { diff -prauN linux-2.6.0-test1/fs/hpfs/namei.c wli-2.6.0-test1-37/fs/hpfs/namei.c --- linux-2.6.0-test1/fs/hpfs/namei.c 2003-07-13 20:33:39.000000000 -0700 +++ wli-2.6.0-test1-37/fs/hpfs/namei.c 2003-07-14 08:52:52.000000000 -0700 @@ -449,7 +449,7 @@ int hpfs_rmdir(struct inode *dir, struct int hpfs_symlink_readpage(struct file *file, struct page *page) { char *link = kmap(page); - struct inode *i = page->mapping->host; + struct inode *i = page_mapping(page)->host; struct fnode *fnode; struct buffer_head *bh; int err; diff -prauN linux-2.6.0-test1/fs/hugetlbfs/inode.c wli-2.6.0-test1-37/fs/hugetlbfs/inode.c --- linux-2.6.0-test1/fs/hugetlbfs/inode.c 2003-07-13 20:37:57.000000000 -0700 +++ wli-2.6.0-test1-37/fs/hugetlbfs/inode.c 2003-07-14 08:45:37.000000000 -0700 @@ -296,12 +296,15 @@ hugetlb_vmtruncate_list(struct list_head { struct vm_area_struct *vma; - list_for_each_entry(vma, list, shared) { + list_for_each_entry_rcu(vma, list, shared) { 
unsigned long h_vm_pgoff; unsigned long v_length; unsigned long h_length; unsigned long v_offset; + if (vma->vm_flags & VM_DEAD) + continue; + h_vm_pgoff = vma->vm_pgoff << (HPAGE_SHIFT - PAGE_SHIFT); v_length = vma->vm_end - vma->vm_start; h_length = v_length >> HPAGE_SHIFT; @@ -346,12 +349,12 @@ static int hugetlb_vmtruncate(struct ino pgoff = offset >> HPAGE_SHIFT; inode->i_size = offset; - down(&mapping->i_shared_sem); + rcu_read_lock(); /* mapping->i_shared_lock */ if (!list_empty(&mapping->i_mmap)) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); if (!list_empty(&mapping->i_mmap_shared)) hugetlb_vmtruncate_list(&mapping->i_mmap_shared, pgoff); - up(&mapping->i_shared_sem); + rcu_read_unlock(); /* mapping->i_shared_lock */ truncate_hugepages(mapping, offset); return 0; } diff -prauN linux-2.6.0-test1/fs/inode.c wli-2.6.0-test1-37/fs/inode.c --- linux-2.6.0-test1/fs/inode.c 2003-07-13 20:38:53.000000000 -0700 +++ wli-2.6.0-test1-37/fs/inode.c 2003-07-17 15:01:26.000000000 -0700 @@ -182,8 +182,8 @@ void inode_init_once(struct inode *inode INIT_LIST_HEAD(&inode->i_devices); sema_init(&inode->i_sem, 1); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); - spin_lock_init(&inode->i_data.page_lock); - init_MUTEX(&inode->i_data.i_shared_sem); + mapping_rwlock_init(&inode->i_data.page_lock); + spin_lock_init(&inode->i_data.i_shared_lock); INIT_LIST_HEAD(&inode->i_data.private_list); spin_lock_init(&inode->i_data.private_lock); INIT_LIST_HEAD(&inode->i_data.i_mmap); @@ -1225,7 +1225,7 @@ void remove_dquot_ref(struct super_block * Hashed waitqueues for wait_on_inode(). The table is pretty small - the * kernel doesn't lock many inodes at the same time. */ -#define I_WAIT_TABLE_ORDER 3 +#define I_WAIT_TABLE_ORDER 12 static struct i_wait_queue_head { wait_queue_head_t wqh; } ____cacheline_aligned_in_smp i_wait_queue_heads[1<<I_WAIT_TABLE_ORDER]; f_dentry); if (dd && dd->dd_fset) { - int (*cache_ioctl)(struct inode *, struct file *, unsigned int, unsigned long ) = filter_c2cdfops(dd->dd_fset->fset_cache->cache_filter)->ioctl; + int (*cache_ioctl)(struct inode *, struct file *, unsigned int, unsigned long); + cache_ioctl = filter_c2cdfops(dd->dd_fset->fset_cache->cache_filter)->ioctl; rc = -ENOTTY; if (cache_ioctl) rc = cache_ioctl(inode, file, cmd, arg); @@ -903,47 +905,49 @@ int presto_ioctl(struct inode *inode, st return -EPERM; } - memset(buf, 0, sizeof(buf)); - - if (izo_ioctl_getdata(buf, buf + 1024, (void *)arg)) { + /* allocate a zero'd buffer for data */ + PRESTO_ALLOC(buf, bufsz); + if (!buf) { + EXIT; + return -ENOMEM; + } + + if (izo_ioctl_getdata(buf, buf + bufsz, (void *)arg)) { CERROR("intermezzo ioctl: data error\n"); - return -EINVAL; + rc = -EINVAL; + goto done; } data = (struct izo_ioctl_data *)buf; switch(cmd) { case IZO_IOC_REINTKML: { - int rc; int cperr; rc = kml_reint_rec(file, data); - EXIT; cperr = copy_to_user((char *)arg, data, sizeof(*data)); if (cperr) { CERROR("WARNING: cperr %d\n", cperr); rc = -EFAULT; } - return rc; + goto done; } case IZO_IOC_GET_RCVD: { struct izo_rcvd_rec rec; struct presto_file_set *fset; - int rc; fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; - } + rc = -ENODEV; + goto done; + } + rc = izo_rcvd_get(&rec, fset, data->ioc_uuid); - if (rc < 0) { - EXIT; - return rc; - } + if (rc < 0) + goto done; - EXIT; - return copy_to_user((char *)arg, &rec, sizeof(rec))? -EFAULT : 0; + rc = copy_to_user((char *)arg, &rec, sizeof(rec))?
-EFAULT : 0; + goto done; } case IZO_IOC_REPSTATUS: { @@ -952,12 +956,11 @@ int presto_ioctl(struct inode *inode, st struct izo_rcvd_rec rec; struct presto_file_set *fset; int minor; - int rc; fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } minor = presto_f2m(fset); @@ -966,13 +969,11 @@ int presto_ioctl(struct inode *inode, st rc = izo_repstatus(fset, client_kmlsize, lr_client, &rec); - if (rc < 0) { - EXIT; - return rc; - } + if (rc < 0) + goto done; - EXIT; - return copy_to_user((char *)arg, &rec, sizeof(rec))? -EFAULT : 0; + rc = copy_to_user((char *)arg, &rec, sizeof(rec))? -EFAULT : 0; + goto done; } case IZO_IOC_GET_CHANNEL: { @@ -980,30 +981,28 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } data->ioc_dev = fset->fset_cache->cache_psdev->uc_minor; CDEBUG(D_PSDEV, "CHANNEL %d\n", data->ioc_dev); - EXIT; - return copy_to_user((char *)arg, data, sizeof(*data))? -EFAULT : 0; + rc = copy_to_user((char *)arg, data, sizeof(*data))? -EFAULT : 0; + goto done; } case IZO_IOC_SET_IOCTL_UID: izo_authorized_uid = data->ioc_uid; - EXIT; - return 0; + rc = 0; + goto done; case IZO_IOC_SET_PID: rc = izo_psdev_setpid(data->ioc_dev); - EXIT; - return rc; + goto done; case IZO_IOC_SET_CHANNEL: rc = izo_psdev_setchannel(file, data->ioc_dev); - EXIT; - return rc; + goto done; case IZO_IOC_GET_KML_SIZE: { struct presto_file_set *fset; @@ -1011,14 +1010,14 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } kmlsize = presto_kml_offset(fset) + fset->fset_kml_logical_off; - EXIT; - return copy_to_user((char *)arg, &kmlsize, sizeof(kmlsize))?-EFAULT : 0; + rc = copy_to_user((char *)arg, &kmlsize, sizeof(kmlsize))?-EFAULT : 0; + goto done; } case IZO_IOC_PURGE_FILE_DATA: { @@ -1026,37 +1025,37 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } rc = izo_purge_file(fset, data->ioc_inlbuf1); - EXIT; - return rc; + goto done; } case IZO_IOC_GET_FILEID: { rc = izo_get_fileid(file, data); - EXIT; if (rc) - return rc; - return copy_to_user((char *)arg, data, sizeof(*data))? -EFAULT : 0; + goto done; + + rc = copy_to_user((char *)arg, data, sizeof(*data))? -EFAULT : 0; + goto done; } case IZO_IOC_SET_FILEID: { rc = izo_set_fileid(file, data); - EXIT; if (rc) - return rc; - return copy_to_user((char *)arg, data, sizeof(*data))? -EFAULT : 0; + goto done; + + rc = copy_to_user((char *)arg, data, sizeof(*data))? 
-EFAULT : 0; + goto done; } case IZO_IOC_ADJUST_LML: { struct lento_vfs_context *info; info = (struct lento_vfs_context *)data->ioc_inlbuf1; rc = presto_adjust_lml(file, info); - EXIT; - return rc; + goto done; } case IZO_IOC_CONNECT: { @@ -1065,16 +1064,15 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } minor = presto_f2m(fset); rc = izo_upc_connect(minor, data->ioc_ino, data->ioc_generation, data->ioc_uuid, data->ioc_flags); - EXIT; - return rc; + goto done; } case IZO_IOC_GO_FETCH_KML: { @@ -1083,15 +1081,14 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } minor = presto_f2m(fset); rc = izo_upc_go_fetch_kml(minor, fset->fset_name, data->ioc_uuid, data->ioc_kmlsize); - EXIT; - return rc; + goto done; } case IZO_IOC_REVOKE_PERMIT: @@ -1099,26 +1096,23 @@ int presto_ioctl(struct inode *inode, st rc = izo_revoke_permit(file->f_dentry, data->ioc_uuid); else rc = izo_revoke_permit(file->f_dentry, NULL); - EXIT; - return rc; + goto done; case IZO_IOC_CLEAR_FSET: rc = izo_clear_fsetroot(file->f_dentry); - EXIT; - return rc; + goto done; case IZO_IOC_CLEAR_ALL_FSETS: { struct presto_file_set *fset; fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } rc = izo_clear_all_fsetroots(fset->fset_cache); - EXIT; - return rc; + goto done; } case IZO_IOC_SET_FSET: @@ -1128,9 +1122,7 @@ int presto_ioctl(struct inode *inode, st rc = presto_set_fsetroot_from_ioc(file->f_dentry, data->ioc_inlbuf1, data->ioc_flags); - EXIT; - return rc; - + goto done; case IZO_IOC_MARK: { int res = 0; /* resulting flags - returned to user */ @@ -1186,16 +1178,16 @@ int presto_ioctl(struct inode *inode, st } if (error) { - EXIT; - return error; + rc = error; + goto done; } data->ioc_mark_what = res; CDEBUG(D_DOWNCALL, "mark inode: %ld, and: %x, or: %x, what %x\n", file->f_dentry->d_inode->i_ino, data->ioc_and_flag, data->ioc_or_flag, data->ioc_mark_what); - EXIT; - return copy_to_user((char *)arg, data, sizeof(*data))? -EFAULT : 0; + rc = copy_to_user((char *)arg, data, sizeof(*data))? -EFAULT : 0; + goto done; } #if 0 case IZO_IOC_CLIENT_MAKE_BRANCH: { @@ -1204,16 +1196,15 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } minor = presto_f2m(fset); rc = izo_upc_client_make_branch(minor, fset->fset_name, data->ioc_inlbuf1, data->ioc_inlbuf2); - EXIT; - return rc; + goto done; } #endif case IZO_IOC_SERVER_MAKE_BRANCH: { @@ -1222,14 +1213,14 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } minor = presto_f2m(fset); izo_upc_server_make_branch(minor, data->ioc_inlbuf1); - EXIT; - return 0; + rc = 0; + goto done; } case IZO_IOC_SET_KMLSIZE: { struct presto_file_set *fset; @@ -1238,38 +1229,33 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } minor = presto_f2m(fset); rc = izo_upc_set_kmlsize(minor, fset->fset_name, data->ioc_uuid, data->ioc_kmlsize); - if (rc != 0) { - EXIT; - return rc; - } + if (rc != 0) + goto done; rc = izo_rcvd_get(&rec, fset, data->ioc_uuid); if (rc == -EINVAL) { /* We don't know anything about this uuid yet; no * worries. 
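Each IZO_IOC_* case above now funnels through a single done: label instead of returning directly, which is what lets the ioctl buffer move from the stack to the heap without leaking it on any path. The idiom in miniature, as runnable plain C with illustrative names:

	#include <errno.h>
	#include <stdlib.h>

	#define BUFSZ 4096

	static int handle_a(char *buf) { return 0; }	/* stand-ins for */
	static int handle_b(char *buf) { return 0; }	/* the real cases */

	int do_op(int cmd)
	{
		char *buf = malloc(BUFSZ);
		int rc;

		if (!buf)
			return -ENOMEM;

		switch (cmd) {
		case 0:
			rc = handle_a(buf);
			goto done;
		case 1:
			rc = handle_b(buf);
			goto done;
		default:
			rc = -EINVAL;
			goto done;
		}
	done:
		free(buf);	/* single cleanup point: no early return can leak */
		return rc;
	}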
*/ memset(&rec, 0, sizeof(rec)); - } else if (rc <= 0) { + } else if (rc <= 0) { /* do we really want to return 0 if rc == 0 here? */ CERROR("InterMezzo: error reading last_rcvd: %d\n", rc); - EXIT; - return rc; + goto done; } rec.lr_remote_offset = data->ioc_kmlsize; rc = izo_rcvd_write(fset, &rec); if (rc <= 0) { CERROR("InterMezzo: error writing last_rcvd: %d\n", rc); - EXIT; - return rc; + goto done; } - EXIT; - return rc; + goto done; } case IZO_IOC_BRANCH_UNDO: { struct presto_file_set *fset; @@ -1277,15 +1263,14 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } minor = presto_f2m(fset); rc = izo_upc_branch_undo(minor, fset->fset_name, data->ioc_inlbuf1); - EXIT; - return rc; + goto done; } case IZO_IOC_BRANCH_REDO: { struct presto_file_set *fset; @@ -1293,28 +1278,33 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } minor = presto_f2m(fset); rc = izo_upc_branch_redo(minor, fset->fset_name, data->ioc_inlbuf1); - EXIT; - return rc; + goto done; } case TCGETS: - EXIT; - return -EINVAL; + rc = -EINVAL; + goto done; default: EXIT; - return -EINVAL; - + rc = -EINVAL; + goto done; + } + + rc = 0; + + done: + PRESTO_FREE(buf, bufsz); EXIT; - return 0; + return rc; } struct file_operations presto_dir_fops = { diff -prauN linux-2.6.0-test1/fs/intermezzo/journal.c wli-2.6.0-test1-37/fs/intermezzo/journal.c --- linux-2.6.0-test1/fs/intermezzo/journal.c 2003-07-13 20:29:20.000000000 -0700 +++ wli-2.6.0-test1-37/fs/intermezzo/journal.c 2003-07-18 09:38:27.000000000 -0700 @@ -1239,12 +1239,16 @@ int presto_write_kml_logical_offset(stru return izo_rcvd_write(fset, &rec); } +/* we are called from presto_finish_kml_truncate, which is called */ +/* with fset->fset_kml.fd_lock held. 
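The constraint stated in the surrounding comment (the caller already holds fset->fset_kml.fd_lock) shapes the rest of the function: an allocation that can sleep, such as PRESTO_ALLOC or kmalloc with GFP_KERNEL, could schedule away while the spinlock is held. The general rule, sketched; alloc_for_context() is an illustrative helper, not a kernel API:

	static void *alloc_for_context(size_t len, int may_sleep)
	{
		/*
		 * GFP_KERNEL may block to reclaim memory, so it is only
		 * legal in process context with no spinlocks held.
		 * GFP_ATOMIC never blocks: it dips into reserves and
		 * returns NULL rather than sleeping, so callers must
		 * always handle the failure case.
		 */
		return kmalloc(len, may_sleep ? GFP_KERNEL : GFP_ATOMIC);
	}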
Allocations must be GFP_ATOMIC */ struct file * presto_copy_kml_tail(struct presto_file_set *fset, unsigned long int start) { struct file *f; int len; loff_t read_off, write_off, bytes; + char* buf; + size_t bufsz; ENTRY; @@ -1258,21 +1262,31 @@ struct file * presto_copy_kml_tail(struc write_off = 0; read_off = start; bytes = fset->fset_kml.fd_offset - start; - while (bytes > 0) { - char buf[4096]; - int toread; - if (bytes > sizeof(buf)) - toread = sizeof(buf); - else - toread = bytes; + bufsz = bytes; + /* can't use PRESTO_ALLOC - allocation must be atomic */ + buf = kmalloc(bufsz, GFP_ATOMIC); + if (!buf) { + CERROR("IZO: out of memory at %s:%d (trying to " + "allocate %d)\n", __FILE__, __LINE__, + bufsz); + filp_close(f, NULL); + EXIT; + return ERR_PTR(-ENOMEM); + } + + presto_kmem_inc(buf, bufsz); + memset(buf, 0, bufsz); - len = presto_fread(fset->fset_kml.fd_file, buf, toread, + while (bytes > 0) { + len = presto_fread(fset->fset_kml.fd_file, buf, bufsz, &read_off); if (len <= 0) break; if (presto_fwrite(f, buf, len, &write_off) != len) { + kfree(buf); + presto_kmem_dec(buf, bufsz); filp_close(f, NULL); EXIT; return ERR_PTR(-EIO); @@ -1280,7 +1294,9 @@ struct file * presto_copy_kml_tail(struc bytes -= len; } - + + kfree(buf); + presto_kmem_dec(buf, bufsz); EXIT; return f; } @@ -1589,11 +1605,12 @@ int presto_get_fileid(int minor, struct { int opcode = KML_OPCODE_GET_FILEID; struct rec_info rec; - char *buffer, *path, *logrecord, record[4096]; /*include path*/ + char *buffer, *path, *logrecord, *record; /*include path*/ struct dentry *root; __u32 uid, gid, pathlen; int error, size; struct kml_suffix *suffix; + size_t record_size; ENTRY; @@ -1609,9 +1626,13 @@ int presto_get_fileid(int minor, struct size_round(le32_to_cpu(pathlen)) + sizeof(struct kml_suffix); + record_size = max(4096, size); + error = -ENOMEM; + PRESTO_ALLOC(record, record_size); + if (!record) + goto free_buffer; + CDEBUG(D_FILE, "kml size: %d\n", size); - if ( size > sizeof(record) ) - CERROR("InterMezzo: BUFFER OVERFLOW in %s!\n", __FUNCTION__); memset(&rec, 0, sizeof(rec)); rec.is_kml = 1; @@ -1632,6 +1653,9 @@ int presto_get_fileid(int minor, struct size_round(le32_to_cpu(pathlen)), path, fset->fset_name); + PRESTO_FREE(record, record_size); + + free_buffer: BUFF_FREE(buffer); EXIT; return error; diff -prauN linux-2.6.0-test1/fs/isofs/rock.c wli-2.6.0-test1-37/fs/isofs/rock.c --- linux-2.6.0-test1/fs/isofs/rock.c 2003-07-13 20:34:37.000000000 -0700 +++ wli-2.6.0-test1-37/fs/isofs/rock.c 2003-07-14 08:52:52.000000000 -0700 @@ -430,7 +430,7 @@ int parse_rock_ridge_inode(struct iso_di static int rock_ridge_symlink_readpage(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; char *link = kmap(page); unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); unsigned char bufbits = ISOFS_BUFFER_BITS(inode); diff -prauN linux-2.6.0-test1/fs/jbd/commit.c wli-2.6.0-test1-37/fs/jbd/commit.c --- linux-2.6.0-test1/fs/jbd/commit.c 2003-07-13 20:37:32.000000000 -0700 +++ wli-2.6.0-test1-37/fs/jbd/commit.c 2003-07-14 08:52:52.000000000 -0700 @@ -60,7 +60,7 @@ static void release_buffer_page(struct b page = bh->b_page; if (!page) goto nope; - if (page->mapping) + if (page_mapping(page)) goto nope; /* OK, it's a truncated page */ diff -prauN linux-2.6.0-test1/fs/jbd/journal.c wli-2.6.0-test1-37/fs/jbd/journal.c --- linux-2.6.0-test1/fs/jbd/journal.c 2003-07-13 20:30:48.000000000 -0700 +++ wli-2.6.0-test1-37/fs/jbd/journal.c 2003-07-14 08:52:52.000000000
-0700 @@ -1680,7 +1680,7 @@ repeat: } else { J_ASSERT_BH(bh, (atomic_read(&bh->b_count) > 0) || - (bh->b_page && bh->b_page->mapping)); + (bh->b_page && page_mapping(bh->b_page))); if (!new_jh) { jbd_unlock_bh_journal_head(bh); diff -prauN linux-2.6.0-test1/fs/jffs/inode-v23.c wli-2.6.0-test1-37/fs/jffs/inode-v23.c --- linux-2.6.0-test1/fs/jffs/inode-v23.c 2003-07-13 20:35:12.000000000 -0700 +++ wli-2.6.0-test1-37/fs/jffs/inode-v23.c 2003-07-14 08:52:52.000000000 -0700 @@ -744,7 +744,7 @@ jffs_do_readpage_nolock(struct file *fil void *buf; unsigned long read_len; int result; - struct inode *inode = (struct inode*)page->mapping->host; + struct inode *inode = (struct inode*)page_mapping(page)->host; struct jffs_file *f = (struct jffs_file *)inode->u.generic_ip; struct jffs_control *c = (struct jffs_control *)inode->i_sb->s_fs_info; int r; diff -prauN linux-2.6.0-test1/fs/jffs2/file.c wli-2.6.0-test1-37/fs/jffs2/file.c --- linux-2.6.0-test1/fs/jffs2/file.c 2003-07-13 20:39:22.000000000 -0700 +++ wli-2.6.0-test1-37/fs/jffs2/file.c 2003-07-14 08:52:52.000000000 -0700 @@ -266,18 +266,18 @@ int jffs2_do_readpage_unlock(struct inod int jffs2_readpage (struct file *filp, struct page *pg) { - struct jffs2_inode_info *f = JFFS2_INODE_INFO(pg->mapping->host); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(page_mapping(pg)->host); int ret; down(&f->sem); - ret = jffs2_do_readpage_unlock(pg->mapping->host, pg); + ret = jffs2_do_readpage_unlock(page_mapping(pg)->host, pg); up(&f->sem); return ret; } int jffs2_prepare_write (struct file *filp, struct page *pg, unsigned start, unsigned end) { - struct inode *inode = pg->mapping->host; + struct inode *inode = page_mapping(pg)->host; struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); uint32_t pageofs = pg->index << PAGE_CACHE_SHIFT; int ret = 0; @@ -362,7 +362,7 @@ int jffs2_commit_write (struct file *fil /* Actually commit the write from the page cache page we're looking at. * For now, we write the full page out each time. 
It sucks, but it's simple */ - struct inode *inode = pg->mapping->host; + struct inode *inode = page_mapping(pg)->host; struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); struct jffs2_raw_inode *ri; diff -prauN linux-2.6.0-test1/fs/libfs.c wli-2.6.0-test1-37/fs/libfs.c --- linux-2.6.0-test1/fs/libfs.c 2003-07-13 20:28:55.000000000 -0700 +++ wli-2.6.0-test1-37/fs/libfs.c 2003-07-14 08:52:52.000000000 -0700 @@ -325,7 +325,7 @@ int simple_prepare_write(struct file *fi int simple_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; /* diff -prauN linux-2.6.0-test1/fs/minix/dir.c wli-2.6.0-test1-37/fs/minix/dir.c --- linux-2.6.0-test1/fs/minix/dir.c 2003-07-13 20:36:43.000000000 -0700 +++ wli-2.6.0-test1-37/fs/minix/dir.c 2003-07-14 08:52:52.000000000 -0700 @@ -47,9 +47,9 @@ static inline unsigned long dir_pages(st static int dir_commit_chunk(struct page *page, unsigned from, unsigned to) { - struct inode *dir = (struct inode *)page->mapping->host; + struct inode *dir = (struct inode *)page_mapping(page)->host; int err = 0; - page->mapping->a_ops->commit_write(NULL, page, from, to); + page_mapping(page)->a_ops->commit_write(NULL, page, from, to); if (IS_DIRSYNC(dir)) err = write_one_page(page, 1); else @@ -240,7 +240,7 @@ int minix_add_link(struct dentry *dentry got_it: from = (char*)de - (char*)page_address(page); to = from + sbi->s_dirsize; - err = page->mapping->a_ops->prepare_write(NULL, page, from, to); + err = page_mapping(page)->a_ops->prepare_write(NULL, page, from, to); if (err) goto out_unlock; memcpy (de->name, name, namelen); @@ -260,7 +260,7 @@ out_unlock: int minix_delete_entry(struct minix_dir_entry *de, struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); struct inode *inode = (struct inode*)mapping->host; char *kaddr = page_address(page); unsigned from = (char*)de - kaddr; @@ -364,14 +364,14 @@ not_empty: void minix_set_link(struct minix_dir_entry *de, struct page *page, struct inode *inode) { - struct inode *dir = (struct inode*)page->mapping->host; + struct inode *dir = (struct inode*)page_mapping(page)->host; struct minix_sb_info *sbi = minix_sb(dir->i_sb); unsigned from = (char *)de-(char*)page_address(page); unsigned to = from + sbi->s_dirsize; int err; lock_page(page); - err = page->mapping->a_ops->prepare_write(NULL, page, from, to); + err = page_mapping(page)->a_ops->prepare_write(NULL, page, from, to); if (err == 0) { de->inode = inode->i_ino; err = dir_commit_chunk(page, from, to); diff -prauN linux-2.6.0-test1/fs/mpage.c wli-2.6.0-test1-37/fs/mpage.c --- linux-2.6.0-test1/fs/mpage.c 2003-07-13 20:31:58.000000000 -0700 +++ wli-2.6.0-test1-37/fs/mpage.c 2003-07-14 08:52:52.000000000 -0700 @@ -129,7 +129,7 @@ mpage_alloc(struct block_device *bdev, static void map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct buffer_head *page_bh, *head; int block = 0; @@ -209,7 +209,7 @@ static struct bio * do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, sector_t *last_block_in_bio, get_block_t get_block) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; const unsigned blkbits = 
inode->i_blkbits; const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; @@ -388,7 +388,7 @@ static struct bio * mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block, sector_t *last_block_in_bio, int *ret, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; const unsigned blkbits = inode->i_blkbits; unsigned long end_index; const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; @@ -415,7 +415,7 @@ mpage_writepage(struct bio *bio, struct if (!buffer_mapped(bh)) { /* * unmapped dirty buffers are created by - * __set_page_dirty_buffers -> mmapped data + * set_page_dirty_buffers -> mmapped data */ if (buffer_dirty(bh)) goto confused; @@ -561,7 +561,7 @@ alloc_new: confused: if (bio) bio = mpage_bio_submit(WRITE, bio); - *ret = page->mapping->a_ops->writepage(page, wbc); + *ret = page_mapping(page)->a_ops->writepage(page, wbc); out: return bio; } @@ -625,7 +625,7 @@ mpage_writepages(struct address_space *m if (get_block == NULL) writepage = mapping->a_ops->writepage; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); while (!list_empty(&mapping->io_pages) && !done) { struct page *page = list_entry(mapping->io_pages.prev, struct page, list); @@ -645,12 +645,12 @@ mpage_writepages(struct address_space *m list_add(&page->list, &mapping->locked_pages); page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); /* * At this point we hold neither mapping->page_lock nor * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even + * invalidated (changing page_mapping(page) to NULL), or even * swizzled back from swapper_space to tmpfs file mapping. 
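mpage_writepages() above takes mapping->page_lock through mapping_wrlock()/mapping_wrunlock(), which, together with mapping_rwlock_init() in the fs/inode.c hunk earlier, replaces the plain spin_lock of the page cache lock. A plausible sketch of what these wrappers expand to, assuming page_lock becomes an rwlock_t so lookups can take the lock shared while insert/remove paths like this one take it exclusive; the real definitions are elsewhere in the patchset:

	/* assumed: struct address_space { ... rwlock_t page_lock; ... } */
	#define mapping_rwlock_init(l)	rwlock_init(l)
	#define mapping_rdlock(l)	read_lock(l)
	#define mapping_rdunlock(l)	read_unlock(l)
	#define mapping_wrlock(l)	write_lock(l)
	#define mapping_wrunlock(l)	write_unlock(l)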
*/ @@ -659,7 +659,7 @@ mpage_writepages(struct address_space *m if (wbc->sync_mode != WB_SYNC_NONE) wait_on_page_writeback(page); - if (page->mapping == mapping && !PageWriteback(page) && + if (page_mapping(page) == mapping && !PageWriteback(page) && test_clear_page_dirty(page)) { if (writepage) { ret = (*writepage)(page, wbc); @@ -677,12 +677,12 @@ mpage_writepages(struct address_space *m unlock_page(page); } page_cache_release(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); } /* * Leave any remaining dirty pages on ->io_pages */ - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); if (bio) mpage_bio_submit(WRITE, bio); return ret; } diff -prauN linux-2.6.0-test1/fs/namei.c wli-2.6.0-test1-37/fs/namei.c --- linux-2.6.0-test1/fs/namei.c 2003-07-13 20:32:42.000000000 -0700 +++ wli-2.6.0-test1-37/fs/namei.c 2003-07-14 10:09:07.000000000 -0700 @@ -395,19 +395,21 @@ static struct dentry * real_lookup(struc static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd) { int err = -ELOOP; - if (current->link_count >= 5) + task_t *task = current; + + if (task->link_count >= 5) goto loop; - if (current->total_link_count >= 40) + if (task->total_link_count >= 40) goto loop; cond_resched(); err = security_inode_follow_link(dentry, nd); if (err) goto loop; - current->link_count++; - current->total_link_count++; + task->link_count++; + task->total_link_count++; update_atime(dentry->d_inode); err = dentry->d_inode->i_op->follow_link(dentry, nd); - current->link_count--; + task->link_count--; return err; loop: path_release(nd); @@ -478,17 +480,22 @@ int follow_down(struct vfsmount **mnt, s static inline void follow_dotdot(struct vfsmount **mnt, struct dentry **dentry) { + task_t *task = current; + struct fs_struct *fs = task->fs; + while(1) { struct vfsmount *parent; struct dentry *old = *dentry; + struct fs_dirs *dirs; - read_lock(&current->fs->lock); - if (*dentry == current->fs->root && - *mnt == current->fs->rootmnt) { - read_unlock(&current->fs->lock); + rcu_read_lock(); /* task->fs->lock */ + dirs = fs->dirs; + if (*dentry == dirs->root && + *mnt == dirs->rootmnt) { + rcu_read_unlock(); /* task->fs->lock */ break; } - read_unlock(&current->fs->lock); + rcu_read_unlock(); /* task->fs->lock */ spin_lock(&dcache_lock); if (*dentry != (*mnt)->mnt_root) { *dentry = dget((*dentry)->d_parent); @@ -758,11 +765,14 @@ int path_walk(const char * name, struct /* returns 1 if everything is done */ static int __emul_lookup_dentry(const char *name, struct nameidata *nd) { + struct fs_struct *fs = current->fs; + if (path_walk(name, nd)) return 0; /* something went wrong... */ if (!nd->dentry->d_inode || S_ISDIR(nd->dentry->d_inode->i_mode)) { struct nameidata nd_root; + struct fs_dirs *dirs; /* * NAME was not found in alternate root or it's a directory.
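do_follow_link() above bounds symlink traversal with two counters: link_count caps the nesting depth at 5 (links whose targets are themselves links), while total_link_count caps at 40 the links followed across one whole path walk. The same guard in a runnable userspace illustration; resolve_link() is illustrative, and relative targets are deliberately not rebased against their directory:

	#include <stdio.h>
	#include <unistd.h>

	static int depth, total;	/* cf. link_count / total_link_count */

	int resolve_link(const char *path, char *out, size_t outlen)
	{
		char target[4096];
		ssize_t n;
		int rc;

		if (depth >= 5 || total >= 40)
			return -1;	/* the kernel returns -ELOOP here */

		n = readlink(path, target, sizeof(target) - 1);
		if (n < 0) {		/* not a symlink: resolution is done */
			snprintf(out, outlen, "%s", path);
			return 0;
		}
		target[n] = '\0';	/* readlink() does not NUL-terminate */

		depth++;		/* nesting guard: released on unwind */
		total++;		/* whole-walk guard: never released */
		rc = resolve_link(target, out, outlen);
		depth--;
		return rc;
	}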
Try to find * it in the normal root: @@ -770,10 +780,11 @@ static int __emul_lookup_dentry(const ch nd_root.last_type = LAST_ROOT; nd_root.flags = nd->flags; memcpy(&nd_root.intent, &nd->intent, sizeof(nd_root.intent)); - read_lock(&current->fs->lock); - nd_root.mnt = mntget(current->fs->rootmnt); - nd_root.dentry = dget(current->fs->root); - read_unlock(&current->fs->lock); + rcu_read_lock(); /* task->fs->lock */ + dirs = fs->dirs; + nd_root.mnt = mntget(dirs->rootmnt); + nd_root.dentry = dget(dirs->root); + rcu_read_unlock(); /* task->fs->lock */ if (path_walk(name, &nd_root)) return 1; if (nd_root.dentry->d_inode) { @@ -788,14 +799,20 @@ static int __emul_lookup_dentry(const ch return 1; } -void set_fs_altroot(void) +int set_fs_altroot(void) { char *emul = __emul_prefix(); struct nameidata nd; struct vfsmount *mnt = NULL, *oldmnt; struct dentry *dentry = NULL, *olddentry; + struct fs_struct *fs = current->fs; + struct fs_dirs *old, *new; int err; + new = kmalloc(sizeof(struct fs_dirs), GFP_KERNEL); + if (!new) + return -ENOMEM; + if (!emul) goto set_it; err = path_lookup(emul, LOOKUP_FOLLOW|LOOKUP_DIRECTORY|LOOKUP_NOALT, &nd); @@ -804,34 +821,43 @@ void set_fs_altroot(void) dentry = nd.dentry; } set_it: - write_lock(&current->fs->lock); - oldmnt = current->fs->altrootmnt; - olddentry = current->fs->altroot; - current->fs->altrootmnt = mnt; - current->fs->altroot = dentry; - write_unlock(&current->fs->lock); - if (olddentry) { - dput(olddentry); - mntput(oldmnt); - } + spin_lock(&fs->lock); + old = fs->dirs; + memcpy(new, old, sizeof(struct fs_dirs)); + oldmnt = old->altrootmnt; + olddentry = old->altroot; + new->altrootmnt = mnt; + new->altroot = dentry; + fs->dirs = new; + spin_unlock(&fs->lock); + + if (olddentry) + release_fs_dirs_altroot(old); + else + free_fs_dirs(old); + return 0; } /* SMP-safe */ static inline int walk_init_root(const char *name, struct nameidata *nd) { - read_lock(&current->fs->lock); - if (current->fs->altroot && !(nd->flags & LOOKUP_NOALT)) { - nd->mnt = mntget(current->fs->altrootmnt); - nd->dentry = dget(current->fs->altroot); - read_unlock(&current->fs->lock); + struct fs_struct *fs = current->fs; + struct fs_dirs *dirs; + + rcu_read_lock(); /* fs->lock */ + dirs = fs->dirs; + if (dirs->altroot && !(nd->flags & LOOKUP_NOALT)) { + nd->mnt = mntget(dirs->altrootmnt); + nd->dentry = dget(dirs->altroot); + rcu_read_unlock(); /* fs->lock */ if (__emul_lookup_dentry(name,nd)) return 0; - read_lock(&current->fs->lock); + rcu_read_lock(); /* fs->lock */ } - nd->mnt = mntget(current->fs->rootmnt); - nd->dentry = dget(current->fs->root); - read_unlock(&current->fs->lock); + nd->mnt = mntget(dirs->rootmnt); + nd->dentry = dget(dirs->root); + rcu_read_unlock(); /* fs->lock */ return 1; } @@ -839,26 +865,29 @@ int path_lookup(const char *name, unsign { nd->last_type = LAST_ROOT; /* if there are only slashes... 
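*/ nd->flags = flags; + task_t *task = current; + struct fs_struct *fs = task->fs; + struct fs_dirs *dirs; - read_lock(&current->fs->lock); + rcu_read_lock(); /* fs->lock */ + dirs = fs->dirs; if (*name=='/') { - if (current->fs->altroot && !(nd->flags & LOOKUP_NOALT)) { - nd->mnt = mntget(current->fs->altrootmnt); - nd->dentry = dget(current->fs->altroot); - read_unlock(&current->fs->lock); + if (dirs->altroot && !(nd->flags & LOOKUP_NOALT)) { + nd->mnt = mntget(dirs->altrootmnt); + nd->dentry = dget(dirs->altroot); + rcu_read_unlock(); /* fs->lock */ if (__emul_lookup_dentry(name,nd)) return 0; - read_lock(&current->fs->lock); + rcu_read_lock(); /* fs->lock */ } - nd->mnt = mntget(current->fs->rootmnt); - nd->dentry = dget(current->fs->root); - } - else{ - nd->mnt = mntget(current->fs->pwdmnt); - nd->dentry = dget(current->fs->pwd); + nd->mnt = mntget(dirs->rootmnt); + nd->dentry = dget(dirs->root); + } else { + nd->mnt = mntget(dirs->pwdmnt); + nd->dentry = dget(dirs->pwd); } - read_unlock(&current->fs->lock); - current->total_link_count = 0; + rcu_read_unlock(); /* fs->lock */ + task->total_link_count = 0; return link_path_walk(name, nd); } @@ -966,11 +995,13 @@ int __user_walk(const char __user *name, */ static inline int check_sticky(struct inode *dir, struct inode *inode) { + task_t *task = current; + if (!(dir->i_mode & S_ISVTX)) return 0; - if (inode->i_uid == current->fsuid) + if (inode->i_uid == task->fsuid) return 0; - if (dir->i_uid == current->fsuid) + if (dir->i_uid == task->fsuid) return 0; return !capable(CAP_FOWNER); }

This path_lookup() hunk completes the pattern used throughout fs/namei.c: readers no longer take fs->lock, they snapshot the new fs->dirs pointer under rcu_read_lock(), and writers publish an entire replacement structure. A condensed, illustrative sketch of both sides (not part of the patch), using only names the patch introduces -- release_fs_dirs_root() and free_fs_dirs() are defined in the fs/namespace.c hunks below:

	/* Reader: snapshot fs->dirs, pin what is needed, drop the RCU lock.
	 * The old fs_dirs keeps its own dentry/vfsmount references until a
	 * grace period has elapsed, so the dget()/mntget() here are safe. */
	rcu_read_lock();			/* replaces read_lock(&fs->lock) */
	dirs = fs->dirs;			/* load the pointer exactly once */
	mnt = mntget(dirs->rootmnt);
	dentry = dget(dirs->root);
	rcu_read_unlock();

	/* Writer: copy, modify, publish; never update the old copy in place. */
	new = kmalloc(sizeof(struct fs_dirs), GFP_KERNEL);
	if (!new)
		return -ENOMEM;
	spin_lock(&fs->lock);		/* fs->lock now serializes writers only */
	old = fs->dirs;
	memcpy(new, old, sizeof(struct fs_dirs));
	new->rootmnt = mntget(mnt);
	new->root = dget(dentry);
	fs->dirs = new;			/* readers see old or new, never a mix */
	spin_unlock(&fs->lock);
	release_fs_dirs_root(old);	/* deferred dput()/mntput() via call_rcu() */

The deferred release is routed through a workqueue (dirs_schedule_work() below) instead of running directly in the RCU callback, since dput() and mntput() may sleep while RCU callbacks run in softirq context.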
diff -prauN linux-2.6.0-test1/fs/namespace.c wli-2.6.0-test1-37/fs/namespace.c --- linux-2.6.0-test1/fs/namespace.c 2003-07-13 20:35:52.000000000 -0700 +++ wli-2.6.0-test1-37/fs/namespace.c 2003-07-14 10:20:40.000000000 -0700 @@ -21,6 +21,7 @@ #include #include #include +#include #include extern int __init init_rootfs(void); @@ -31,6 +32,7 @@ spinlock_t vfsmount_lock __cacheline_ali static struct list_head *mount_hashtable; static int hash_mask, hash_bits; static kmem_cache_t *mnt_cache; +seqlock_t mnt_move_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) { @@ -71,6 +73,11 @@ void free_vfsmnt(struct vfsmount *mnt) /* * Now, lookup_mnt increments the ref count before returning * the vfsmount struct. + * + * lookup_mnt can be done without taking any lock, as now we + * do synchronize_kernel() while removing vfsmount struct + * from mnt_hash list. rcu_read_(un)lock is required for + * pre-emptive kernels. */ struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) { @@ -78,7 +85,7 @@ struct vfsmount *lookup_mnt(struct vfsmo struct list_head * tmp = head; struct vfsmount *p, *found = NULL; - spin_lock(&vfsmount_lock); + rcu_read_lock(); for (;;) { tmp = tmp->next; p = NULL; @@ -90,7 +97,7 @@ struct vfsmount *lookup_mnt(struct vfsmo break; } } - spin_unlock(&vfsmount_lock); + rcu_read_unlock(); return found; } @@ -107,10 +114,19 @@ static void detach_mnt(struct vfsmount * { old_nd->dentry = mnt->mnt_mountpoint; old_nd->mnt = mnt->mnt_parent; + + /* remove from the hash_list, before other things */ + list_del_rcu(&mnt->mnt_hash); + spin_unlock(&vfsmount_lock); + + /* There could be existing users doing lookup_mnt, let + * them finish their work. 
+ */ + synchronize_kernel(); + spin_lock(&vfsmount_lock); mnt->mnt_parent = mnt; mnt->mnt_mountpoint = mnt->mnt_root; list_del_init(&mnt->mnt_child); - list_del_init(&mnt->mnt_hash); old_nd->dentry->d_mounted--; } @@ -118,7 +134,7 @@ static void attach_mnt(struct vfsmount * { mnt->mnt_parent = mntget(nd->mnt); mnt->mnt_mountpoint = dget(nd->dentry); - list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry)); + list_add_rcu(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry)); list_add_tail(&mnt->mnt_child, &nd->mnt->mnt_mounts); nd->dentry->d_mounted++; } @@ -318,7 +334,7 @@ static int do_umount(struct vfsmount *mn * /reboot - static binary that would close all descriptors and * call reboot(9). Then init(8) could umount root and exec /reboot. */ - if (mnt == current->fs->rootmnt && !(flags & MNT_DETACH)) { + if (mnt == current->fs->dirs->rootmnt && !(flags & MNT_DETACH)) { /* * Special case for "unmounting" root ... * we just try to remount it readonly. @@ -630,8 +646,10 @@ static int do_move_mount(struct nameidat goto out2; err = 0; + write_seqlock(&mnt_move_lock); detach_mnt(old_nd.mnt, &parent_nd); attach_mnt(old_nd.mnt, nd); + write_sequnlock(&mnt_move_lock); out2: spin_unlock(&vfsmount_lock); out1: @@ -788,6 +806,7 @@ int copy_namespace(int flags, struct tas struct namespace *new_ns; struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL; struct fs_struct *fs = tsk->fs; + struct fs_dirs *olddirs, *newdirs; if (!namespace) return 0; @@ -806,6 +825,12 @@ int copy_namespace(int flags, struct tas if (!new_ns) goto out; + newdirs = kmalloc(sizeof(struct fs_dirs), GFP_KERNEL); + if (!newdirs) { + kfree(new_ns); + goto out; + } + atomic_set(&new_ns->count, 1); init_rwsem(&new_ns->sem); new_ns->root = NULL; @@ -821,27 +846,31 @@ int copy_namespace(int flags, struct tas /* Second pass: switch the tsk->fs->* elements */ if (fs) { struct vfsmount *p, *q; - write_lock(&fs->lock); + spin_lock(&fs->lock); + olddirs = fs->dirs; + memcpy(newdirs, olddirs, sizeof(struct fs_dirs)); p = namespace->root; q = new_ns->root; while (p) { - if (p == fs->rootmnt) { + if (p == olddirs->rootmnt) { rootmnt = p; - fs->rootmnt = mntget(q); + newdirs->rootmnt = mntget(q); } - if (p == fs->pwdmnt) { + if (p == olddirs->pwdmnt) { pwdmnt = p; - fs->pwdmnt = mntget(q); + newdirs->pwdmnt = mntget(q); } - if (p == fs->altrootmnt) { + if (p == olddirs->altrootmnt) { altrootmnt = p; - fs->altrootmnt = mntget(q); + newdirs->altrootmnt = mntget(q); } p = next_mnt(p, namespace->root); q = next_mnt(q, new_ns->root); } - write_unlock(&fs->lock); + fs->dirs = newdirs; + spin_unlock(&fs->lock); + free_fs_dirs(olddirs); } up_write(&tsk->namespace->sem); @@ -904,48 +933,139 @@ out1: return retval; } +static void dirs_schedule_work(void *__dirs) +{ + struct fs_dirs *dirs = __dirs; + schedule_work(&dirs->work); +} + +static void __release_fs_dirs_altroot(void *__dirs) +{ + struct fs_dirs *dirs = __dirs; + dput(dirs->altroot); + mntput(dirs->altrootmnt); + kfree(dirs); +} + +void release_fs_dirs_altroot(struct fs_dirs *dirs) +{ + INIT_WORK(&dirs->work, __release_fs_dirs_altroot, dirs); + call_rcu(&dirs->rcu, dirs_schedule_work, dirs); +} + +static void __release_fs_dirs_root(void *__dirs) +{ + struct fs_dirs *dirs = __dirs; + dput(dirs->root); + mntput(dirs->rootmnt); + kfree(dirs); +} + +void release_fs_dirs_root(struct fs_dirs *dirs) +{ + INIT_WORK(&dirs->work, __release_fs_dirs_root, dirs); + call_rcu(&dirs->rcu, dirs_schedule_work, dirs); +} + +static void __release_fs_dirs_pwd(void *__dirs) +{ + 
struct fs_dirs *dirs = __dirs; + dput(dirs->pwd); + mntput(dirs->pwdmnt); + kfree(dirs); +} + +void release_fs_dirs_pwd(struct fs_dirs *dirs) +{ + INIT_WORK(&dirs->work, __release_fs_dirs_pwd, dirs); + call_rcu(&dirs->rcu, dirs_schedule_work, dirs); +} + +static void __free_fs_dirs(void *dirs) +{ + kfree(dirs); +} + +void free_fs_dirs(struct fs_dirs *dirs) +{ + call_rcu(&dirs->rcu, __free_fs_dirs, dirs); +} + /* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. * It can block. Requires the big lock held. */ -void set_fs_root(struct fs_struct *fs, struct vfsmount *mnt, - struct dentry *dentry) +static inline int __set_fs_root(struct fs_struct *fs, struct vfsmount *mnt, + struct dentry *dentry, int gfp_mask) { struct dentry *old_root; struct vfsmount *old_rootmnt; - write_lock(&fs->lock); - old_root = fs->root; - old_rootmnt = fs->rootmnt; - fs->rootmnt = mntget(mnt); - fs->root = dget(dentry); - write_unlock(&fs->lock); - if (old_root) { - dput(old_root); - mntput(old_rootmnt); - } + struct fs_dirs *old, *new; + + new = kmalloc(sizeof(struct fs_dirs), gfp_mask); + if (!new) + return -ENOMEM; + + spin_lock(&fs->lock); + old = fs->dirs; + memcpy(new, old, sizeof(struct fs_dirs)); + old_root = old->root; + old_rootmnt = old->rootmnt; + new->rootmnt = mntget(mnt); + new->root = dget(dentry); + fs->dirs = new; + spin_unlock(&fs->lock); + + if (old_root) + release_fs_dirs_root(old); + else + free_fs_dirs(old); + + return 0; +} + +int set_fs_root(struct fs_struct *fs, struct vfsmount *mnt, + struct dentry *dentry) +{ + return __set_fs_root(fs, mnt, dentry, GFP_KERNEL); } /* * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values. * It can block. Requires the big lock held. */ -void set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, - struct dentry *dentry) +static inline int __set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, + struct dentry *dentry, int gfp_mask) { struct dentry *old_pwd; struct vfsmount *old_pwdmnt; + struct fs_dirs *old, *new; - write_lock(&fs->lock); - old_pwd = fs->pwd; - old_pwdmnt = fs->pwdmnt; - fs->pwdmnt = mntget(mnt); - fs->pwd = dget(dentry); - write_unlock(&fs->lock); - - if (old_pwd) { - dput(old_pwd); - mntput(old_pwdmnt); - } + new = kmalloc(sizeof(struct fs_dirs), gfp_mask); + if (!new) + return -ENOMEM; + + spin_lock(&fs->lock); + old = fs->dirs; + memcpy(new, old, sizeof(struct fs_dirs)); + old_pwd = old->pwd; + old_pwdmnt = old->pwdmnt; + new->pwdmnt = mntget(mnt); + new->pwd = dget(dentry); + fs->dirs = new; + spin_unlock(&fs->lock); + + if (old_pwd) + release_fs_dirs_pwd(old); + else + free_fs_dirs(old); + return 0; +} + +int set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, + struct dentry *dentry) +{ + return __set_fs_pwd(fs, mnt, dentry, GFP_KERNEL); } static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd) @@ -960,10 +1080,18 @@ static void chroot_fs_refs(struct nameid if (fs) { atomic_inc(&fs->count); task_unlock(p); - if (fs->root==old_nd->dentry&&fs->rootmnt==old_nd->mnt) - set_fs_root(fs, new_nd->mnt, new_nd->dentry); - if (fs->pwd==old_nd->dentry&&fs->pwdmnt==old_nd->mnt) - set_fs_pwd(fs, new_nd->mnt, new_nd->dentry); + if (fs->dirs->root==old_nd->dentry&&fs->dirs->rootmnt==old_nd->mnt) + if (__set_fs_root(fs, new_nd->mnt, new_nd->dentry, GFP_ATOMIC)) { + dump_stack(); + show_free_areas(); + panic("set_fs_root() failed!\n"); + } + if (fs->dirs->pwd==old_nd->dentry&&fs->dirs->pwdmnt==old_nd->mnt) + if (__set_fs_pwd(fs, new_nd->mnt, new_nd->dentry, GFP_ATOMIC)) { + dump_stack(); + 
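show_free_areas(); + panic("set_fs_pwd() failed!\n"); + } put_fs_struct(fs); } else task_unlock(p); @@ -989,6 +1117,7 @@ asmlinkage long sys_pivot_root(const cha struct vfsmount *tmp; struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd; int error; + struct fs_dirs *dirs; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1012,10 +1141,11 @@ asmlinkage long sys_pivot_root(const cha goto out1; } - read_lock(&current->fs->lock); - user_nd.mnt = mntget(current->fs->rootmnt); - user_nd.dentry = dget(current->fs->root); - read_unlock(&current->fs->lock); + rcu_read_lock(); /* current->fs->lock */ + dirs = current->fs->dirs; + user_nd.mnt = mntget(dirs->rootmnt); + user_nd.dentry = dget(dirs->root); + rcu_read_unlock(); /* current->fs->lock */ down_write(&current->namespace->sem); down(&old_nd.dentry->d_inode->i_sem); error = -EINVAL; @@ -1050,10 +1180,12 @@ asmlinkage long sys_pivot_root(const cha goto out3; } else if (!is_subdir(old_nd.dentry, new_nd.dentry)) goto out3; + write_seqlock(&mnt_move_lock); detach_mnt(new_nd.mnt, &parent_nd); detach_mnt(user_nd.mnt, &root_parent); attach_mnt(user_nd.mnt, &old_nd); attach_mnt(new_nd.mnt, &root_parent); + write_sequnlock(&mnt_move_lock); spin_unlock(&vfsmount_lock); chroot_fs_refs(&user_nd, &new_nd); security_sb_post_pivotroot(&user_nd, &new_nd); @@ -1101,8 +1233,16 @@ static void __init init_mount_tree(void) } while_each_thread(g, p); read_unlock(&tasklist_lock); - set_fs_pwd(current->fs, namespace->root, namespace->root->mnt_root); - set_fs_root(current->fs, namespace->root, namespace->root->mnt_root); + if (set_fs_pwd(current->fs, namespace->root, namespace->root->mnt_root)) { + dump_stack(); + show_free_areas(); + panic("set_fs_pwd() failed!\n"); + } + if (set_fs_root(current->fs, namespace->root, namespace->root->mnt_root)) { + dump_stack(); + show_free_areas(); + panic("set_fs_root() failed!\n"); + } } void __init mnt_init(unsigned long mempages)

Both mount-tree reshapings, do_move_mount() above and sys_pivot_root() here, are now bracketed by write_seqlock()/write_sequnlock() on mnt_move_lock, so lockless walkers of mnt_parent chains can detect a concurrent move and retry instead of serializing on vfsmount_lock. The reader side appears in proc_check_root() in the fs/proc/base.c hunks below; its essential shape, as an illustrative sketch (mnt_reachable() is a made-up name):

	/* Decide whether mnt lies underneath our_vfsmnt, retrying the whole
	 * walk whenever a mount move raced with it. */
	static int mnt_reachable(struct vfsmount *mnt, struct vfsmount *our_vfsmnt)
	{
		struct vfsmount *p;
		unsigned long seq;
		int ok;

		do {
			seq = read_seqbegin(&mnt_move_lock);
			ok = 1;
			for (p = mnt; p != our_vfsmnt; p = p->mnt_parent) {
				if (p == p->mnt_parent) {	/* reached the absolute root */
					ok = 0;
					break;
				}
			}
		} while (read_seqretry(&mnt_move_lock, seq));
		return ok;
	}

The walk dereferences only mnt_parent pointers and takes no intermediate references; it leans on the RCU-deferred unhashing that detach_mnt() gained above to keep those pointers valid while the walk is in flight.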
diff -prauN linux-2.6.0-test1/fs/ncpfs/symlink.c wli-2.6.0-test1-37/fs/ncpfs/symlink.c --- linux-2.6.0-test1/fs/ncpfs/symlink.c 2003-07-13 20:38:43.000000000 -0700 +++ wli-2.6.0-test1-37/fs/ncpfs/symlink.c 2003-07-14 08:52:52.000000000 -0700 @@ -43,7 +43,7 @@ static int ncp_symlink_readpage(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int error, length, len; char *link, *rawlink; char *buf = kmap(page); diff -prauN linux-2.6.0-test1/fs/nfs/file.c wli-2.6.0-test1-37/fs/nfs/file.c --- linux-2.6.0-test1/fs/nfs/file.c 2003-07-13 20:35:53.000000000 -0700 +++ wli-2.6.0-test1-37/fs/nfs/file.c 2003-07-14 08:52:52.000000000 -0700 @@ -212,7 +212,7 @@ static int nfs_commit_write(struct file struct address_space_operations nfs_file_aops = { .readpage = nfs_readpage, .readpages = nfs_readpages, - .set_page_dirty = __set_page_dirty_nobuffers, + .set_page_dirty = set_page_dirty_nobuffers, .writepage = nfs_writepage, .writepages = nfs_writepages, .prepare_write = nfs_prepare_write, diff -prauN linux-2.6.0-test1/fs/nfs/read.c wli-2.6.0-test1-37/fs/nfs/read.c --- linux-2.6.0-test1/fs/nfs/read.c 2003-07-13 20:28:52.000000000 -0700 +++ wli-2.6.0-test1-37/fs/nfs/read.c 2003-07-14 08:52:52.000000000 -0700 @@ -300,7 +300,7 @@ nfs_readpage_result(struct rpc_task *tas int nfs_readpage(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int error; dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", @@ -341,14 +341,14 @@ static int readpage_sync_filler(void *data, struct page 
*page) { struct nfs_readdesc *desc = (struct nfs_readdesc *)data; - return nfs_readpage_sync(desc->filp, page->mapping->host, page); + return nfs_readpage_sync(desc->filp, page_mapping(page)->host, page); } static int readpage_async_filler(void *data, struct page *page) { struct nfs_readdesc *desc = (struct nfs_readdesc *)data; - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct nfs_page *new; nfs_wb_page(inode, page); diff -prauN linux-2.6.0-test1/fs/nfs/write.c wli-2.6.0-test1-37/fs/nfs/write.c --- linux-2.6.0-test1/fs/nfs/write.c 2003-07-13 20:36:42.000000000 -0700 +++ wli-2.6.0-test1-37/fs/nfs/write.c 2003-07-14 08:52:52.000000000 -0700 @@ -224,7 +224,7 @@ nfs_writepage_async(struct file *file, s int nfs_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; unsigned long end_index; unsigned offset = PAGE_CACHE_SIZE; loff_t i_size = i_size_read(inode); @@ -629,7 +629,7 @@ nfs_strategy(struct inode *inode) int nfs_flush_incompatible(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct rpc_cred *cred = nfs_file_cred(file); struct nfs_page *req; int status = 0; @@ -660,7 +660,7 @@ int nfs_updatepage(struct file *file, struct page *page, unsigned int offset, unsigned int count) { struct dentry *dentry = file->f_dentry; - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct nfs_page *req; loff_t end; int status = 0; diff -prauN linux-2.6.0-test1/fs/ntfs/aops.c wli-2.6.0-test1-37/fs/ntfs/aops.c --- linux-2.6.0-test1/fs/ntfs/aops.c 2003-07-13 20:36:47.000000000 -0700 +++ wli-2.6.0-test1-37/fs/ntfs/aops.c 2003-07-14 08:52:52.000000000 -0700 @@ -55,7 +55,7 @@ static void ntfs_end_buffer_async_read(s int page_uptodate = 1; page = bh->b_page; - ni = NTFS_I(page->mapping->host); + ni = NTFS_I(page_mapping(page)->host); if (likely(uptodate)) { s64 file_ofs; @@ -176,7 +176,7 @@ static int ntfs_read_block(struct page * int i, nr; unsigned char blocksize_bits; - ni = NTFS_I(page->mapping->host); + ni = NTFS_I(page_mapping(page)->host); vol = ni->vol; blocksize_bits = VFS_I(ni)->i_blkbits; @@ -359,7 +359,7 @@ int ntfs_readpage(struct file *file, str return 0; } - ni = NTFS_I(page->mapping->host); + ni = NTFS_I(page_mapping(page)->host); if (NInoNonResident(ni)) { /* @@ -473,7 +473,7 @@ static int ntfs_write_block(struct page BOOL need_end_writeback; unsigned char blocksize_bits; - vi = page->mapping->host; + vi = page_mapping(page)->host; ni = NTFS_I(vi); vol = ni->vol; @@ -500,9 +500,9 @@ static int ntfs_write_block(struct page * buffer's dirty state as-is. */ // FIXME: Once Andrew's -EAGAIN patch goes in, remove the - // __set_page_dirty_nobuffers(page) and return -EAGAIN instead + // set_page_dirty_nobuffers(page) and return -EAGAIN instead // of zero. - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); unlock_page(page); return 0; } @@ -519,12 +519,12 @@ static int ntfs_write_block(struct page iblock = ni->initialized_size >> blocksize_bits; /* - * Be very careful. We have no exclusion from __set_page_dirty_buffers + * Be very careful. We have no exclusion from set_page_dirty_buffers * here, and the (potentially unmapped) buffers may become dirty at * any time. If a buffer becomes dirty here after we've inspected it * then we just miss that fact, and the page stays dirty. 
* - * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; + * Buffers outside i_size may be dirtied by set_page_dirty_buffers; * handle that here by just cleaning them. */ @@ -579,7 +579,7 @@ static int ntfs_write_block(struct page // Update initialized size in the attribute and // in the inode. // Again, for each page do: - // __set_page_dirty_buffers(); + // set_page_dirty_buffers(); // page_cache_release() // We don't need to wait on the writes. // Update iblock. @@ -734,9 +734,9 @@ lock_retry_remap: * leave its buffer's dirty state as-is. */ // FIXME: Once Andrew's -EAGAIN patch goes in, remove - // the __set_page_dirty_nobuffers(page) and set err to + // the set_page_dirty_nobuffers(page) and set err to // -EAGAIN instead of zero. - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); err = 0; } else SetPageError(page); @@ -805,7 +805,7 @@ static int ntfs_writepage(struct page *p BUG_ON(!PageLocked(page)); - vi = page->mapping->host; + vi = page_mapping(page)->host; /* Is the page fully outside i_size? (truncate in progress) */ if (unlikely(page->index >= (vi->i_size + PAGE_CACHE_SIZE - 1) >> @@ -987,9 +987,9 @@ err_out: * buffer's dirty state as-is. */ // FIXME: Once Andrew's -EAGAIN patch goes in, remove the - // __set_page_dirty_nobuffers(page) and set err to -EAGAIN + // set_page_dirty_nobuffers(page) and set err to -EAGAIN // instead of zero. - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); err = 0; } else { ntfs_error(vi->i_sb, "Resident attribute write failed with " @@ -1024,7 +1024,7 @@ static int ntfs_prepare_nonresident_writ BOOL is_retry; unsigned char blocksize_bits; - vi = page->mapping->host; + vi = page_mapping(page)->host; ni = NTFS_I(vi); vol = ni->vol; @@ -1125,7 +1125,7 @@ static int ntfs_prepare_nonresident_writ // Update initialized size in the attribute and // in the inode. // Again, for each page do: - // __set_page_dirty_buffers(); + // set_page_dirty_buffers(); // page_cache_release() // We don't need to wait on the writes. // Update iblock. @@ -1361,7 +1361,7 @@ err_out: * ntfs_prepare_write - prepare a page for receiving data * * This is called from generic_file_write() with i_sem held on the inode - * (@page->mapping->host). The @page is locked and kmap()ped so page_address() + * (@page_mapping(page)->host). The @page is locked and kmap()ped so page_address() * can simply be used. The source data has not yet been copied into the @page. * * Need to extend the attribute/fill in holes if necessary, create blocks and @@ -1382,7 +1382,7 @@ err_out: static int ntfs_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *vi = page->mapping->host; + struct inode *vi = page_mapping(page)->host; ntfs_inode *ni = NTFS_I(vi); ntfs_debug("Entering for inode %li, attribute type 0x%x, page index " @@ -1491,7 +1491,7 @@ static int ntfs_commit_nonresident_write unsigned int block_start, block_end, blocksize; BOOL partial; - vi = page->mapping->host; + vi = page_mapping(page)->host; ntfs_debug("Entering for inode %li, attribute type 0x%x, page index " "0x%lx, from = %u, to = %u.", vi->i_ino, @@ -1547,7 +1547,7 @@ static int ntfs_commit_nonresident_write * ntfs_commit_write - commit the received data * * This is called from generic_file_write() with i_sem held on the inode - * (@page->mapping->host). The @page is locked and kmap()ped so page_address() + * (@page_mapping(page)->host). The @page is locked and kmap()ped so page_address() * can simply be used. 
The source data has already been copied into the @page. * * Need to mark modified blocks dirty so they get written out later when @@ -1585,7 +1585,7 @@ static int ntfs_commit_write(struct file u32 attr_len, bytes; int err; - vi = page->mapping->host; + vi = page_mapping(page)->host; ni = NTFS_I(vi); ntfs_debug("Entering for inode %li, attribute type 0x%x, page index " @@ -1758,7 +1758,7 @@ err_out: * Put the page on mapping->dirty_pages, but leave its * buffer's dirty state as-is. */ - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); err = 0; } else ntfs_error(vi->i_sb, "Page is not uptodate. Written " diff -prauN linux-2.6.0-test1/fs/ntfs/compress.c wli-2.6.0-test1-37/fs/ntfs/compress.c --- linux-2.6.0-test1/fs/ntfs/compress.c 2003-07-13 20:34:31.000000000 -0700 +++ wli-2.6.0-test1-37/fs/ntfs/compress.c 2003-07-14 08:52:52.000000000 -0700 @@ -209,7 +209,7 @@ return_error: /* Second stage: finalize completed pages. */ if (nr_completed_pages > 0) { struct page *page = dest_pages[completed_pages[0]]; - ntfs_inode *ni = NTFS_I(page->mapping->host); + ntfs_inode *ni = NTFS_I(page_mapping(page)->host); for (i = 0; i < nr_completed_pages; i++) { int di = completed_pages[i]; @@ -467,7 +467,7 @@ return_overflow: */ int ntfs_read_compressed_block(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); ntfs_inode *ni = NTFS_I(mapping->host); ntfs_volume *vol = ni->vol; struct super_block *sb = vol->sb; diff -prauN linux-2.6.0-test1/fs/open.c wli-2.6.0-test1-37/fs/open.c --- linux-2.6.0-test1/fs/open.c 2003-07-13 20:29:30.000000000 -0700 +++ wli-2.6.0-test1-37/fs/open.c 2003-07-14 10:20:40.000000000 -0700 @@ -442,17 +442,18 @@ asmlinkage long sys_access(const char __ struct nameidata nd; int old_fsuid, old_fsgid; kernel_cap_t old_cap; + task_t *task = current; int res; if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ return -EINVAL; - old_fsuid = current->fsuid; - old_fsgid = current->fsgid; - old_cap = current->cap_effective; + old_fsuid = task->fsuid; + old_fsgid = task->fsgid; + old_cap = task->cap_effective; - current->fsuid = current->uid; - current->fsgid = current->gid; + task->fsuid = task->uid; + task->fsgid = task->gid; /* * Clear the capabilities if we switch to a non-root user @@ -462,10 +463,10 @@ asmlinkage long sys_access(const char __ * value below. We should hold task_capabilities_lock, * but we cannot because user_path_walk can sleep. 
*/ - if (current->uid) - cap_clear(current->cap_effective); + if (task->uid) + cap_clear(task->cap_effective); else - current->cap_effective = current->cap_permitted; + task->cap_effective = task->cap_permitted; res = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); if (!res) { @@ -477,9 +478,9 @@ asmlinkage long sys_access(const char __ path_release(&nd); } - current->fsuid = old_fsuid; - current->fsgid = old_fsgid; - current->cap_effective = old_cap; + task->fsuid = old_fsuid; + task->fsgid = old_fsgid; + task->cap_effective = old_cap; return res; } @@ -497,7 +498,7 @@ asmlinkage long sys_chdir(const char __u if (error) goto dput_and_out; - set_fs_pwd(current->fs, nd.mnt, nd.dentry); + error = set_fs_pwd(current->fs, nd.mnt, nd.dentry); dput_and_out: path_release(&nd); @@ -528,7 +529,7 @@ asmlinkage long sys_fchdir(unsigned int error = permission(inode, MAY_EXEC, NULL); if (!error) - set_fs_pwd(current->fs, mnt, dentry); + error = set_fs_pwd(current->fs, mnt, dentry); out_putf: fput(file); out: @@ -552,9 +553,10 @@ asmlinkage long sys_chroot(const char __ if (!capable(CAP_SYS_CHROOT)) goto dput_and_out; - set_fs_root(current->fs, nd.mnt, nd.dentry); - set_fs_altroot(); - error = 0; + error = set_fs_root(current->fs, nd.mnt, nd.dentry); + if (error) + goto dput_and_out; + error = set_fs_altroot(); dput_and_out: path_release(&nd); out: @@ -741,7 +743,8 @@ struct file *dentry_open(struct dentry * { struct file * f; struct inode *inode; - int error; + struct file_list *container; + int cpu, error; error = -ENFILE; f = get_empty_filp(); @@ -761,7 +764,12 @@ struct file *dentry_open(struct dentry * f->f_vfsmnt = mnt; f->f_pos = 0; f->f_op = fops_get(inode->i_fop); - file_move(f, &inode->i_sb->s_files); + cpu = get_cpu(); + container = f->f_container = &inode->i_sb->s_file_lists[cpu]; + spin_lock(&container->lock); + list_add(&f->f_list, &container->list); + spin_unlock(&container->lock); + put_cpu(); if (f->f_op && f->f_op->open) { error = f->f_op->open(inode,f); @@ -785,7 +793,16 @@ cleanup_all: fops_put(f->f_op); if (f->f_mode & FMODE_WRITE) put_write_access(inode); - file_kill(f); + rcu_read_lock(); + smp_read_barrier_depends(); + container = f->f_container; + if (container) { + spin_lock(&container->lock); + list_del_init(&f->f_list); + f->f_container = NULL; + spin_unlock(&container->lock); + } + rcu_read_unlock(); f->f_dentry = NULL; f->f_vfsmnt = NULL; cleanup_file: @@ -801,7 +818,8 @@ cleanup_dentry: */ int get_unused_fd(void) { - struct files_struct * files = current->files; + task_t *task = current; + struct files_struct *files = task->files; int fd, error; error = -EMFILE; @@ -816,7 +834,7 @@ repeat: * N.B. For clone tasks sharing a files structure, this test * will limit the total number of files that can be opened. */ - if (fd >= current->rlim[RLIMIT_NOFILE].rlim_cur) + if (fd >= task->rlim[RLIMIT_NOFILE].rlim_cur) goto out; /* Do we need to expand the fdset array? 
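*/ @@ -976,7 +994,9 @@ asmlinkage long sys_close(unsigned int f filp = files->fd[fd]; if (!filp) goto out_unlock; - files->fd[fd] = NULL; + files->fd[fd] = NULL; + /* Need to make it consistent with open_fds in __put_unused_fd() */ + wmb(); FD_CLR(fd, files->close_on_exec); __put_unused_fd(files, fd); spin_unlock(&files->file_lock);

The wmb() added to sys_close() orders the NULL store into files->fd[fd] ahead of __put_unused_fd() clearing the descriptor's bit in open_fds. The consumers are the lockless fd-table readers this patch introduces (the RCU-protected fcheck_files() callers in fs/proc/base.c and the read_barrier_depends() in fs/select.c further down): anyone who observes the open_fds bit already cleared must not then read a stale struct file pointer out of the array. Schematically, with the reader half hedged since the matching barrier sits in code outside this excerpt:

	/* Writer (sys_close(), under files->file_lock): */
	files->fd[fd] = NULL;
	wmb();				/* publish the NULL before the slot frees */
	FD_CLR(fd, files->open_fds);	/* done inside __put_unused_fd() */

	/* Lockless observer (assumed shape): check the bitmap first, then
	 * order the array read after it, so a cleared bit is never paired
	 * with the stale pointer. */
	if (!FD_ISSET(fd, files->open_fds)) {
		rmb();			/* pairs with the wmb() above */
		/* files->fd[fd] now reads as NULL */
	}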
diff -prauN linux-2.6.0-test1/fs/pipe.c wli-2.6.0-test1-37/fs/pipe.c --- linux-2.6.0-test1/fs/pipe.c 2003-07-13 20:34:40.000000000 -0700 +++ wli-2.6.0-test1-37/fs/pipe.c 2003-07-14 09:54:35.000000000 -0700 @@ -507,6 +507,7 @@ static struct dentry_operations pipefs_d static struct inode * get_pipe_inode(void) { + task_t *task = current; struct inode *inode = new_inode(pipe_mnt->mnt_sb); if (!inode) @@ -525,8 +526,8 @@ static struct inode * get_pipe_inode(voi */ inode->i_state = I_DIRTY; inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; - inode->i_uid = current->fsuid; - inode->i_gid = current->fsgid; + inode->i_uid = task->fsuid; + inode->i_gid = task->fsgid; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_blksize = PAGE_SIZE; return inode; } diff -prauN linux-2.6.0-test1/fs/proc/array.c wli-2.6.0-test1-37/fs/proc/array.c --- linux-2.6.0-test1/fs/proc/array.c 2003-07-13 20:35:12.000000000 -0700 +++ wli-2.6.0-test1-37/fs/proc/array.c 2003-07-14 09:22:36.000000000 -0700 @@ -283,7 +283,7 @@ int proc_pid_status(struct task_struct * return buffer - orig; } -extern unsigned long task_vsize(struct mm_struct *); +unsigned long task_vsize(struct mm_struct *); int proc_pid_stat(struct task_struct *task, char * buffer) { unsigned long vsize, eip, esp, wchan; @@ -307,11 +307,9 @@ int proc_pid_stat(struct task_struct *ta } task_unlock(task); if (mm) { - down_read(&mm->mmap_sem); vsize = task_vsize(mm); eip = KSTK_EIP(task); esp = KSTK_ESP(task); - up_read(&mm->mmap_sem); } wchan = get_wchan(task); @@ -388,20 +386,20 @@ int proc_pid_stat(struct task_struct *ta return res; } -extern int task_statm(struct mm_struct *, int *, int *, int *, int *); +int task_statm(struct mm_struct *, int *, int *, int *, int *, int *, int *); int proc_pid_statm(struct task_struct *task, char *buffer) { - int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0; + int size, resident, shared, text, lib, data, dirty; struct mm_struct *mm = get_task_mm(task); - if (mm) { - down_read(&mm->mmap_sem); - size = task_statm(mm, &shared, &text, &data, &resident); - up_read(&mm->mmap_sem); - + if (!mm) + size = resident = shared = text = lib = data = dirty = 0; + else { + size = task_statm(mm, &shared, &text, &lib, &data, + &resident, &dirty); mmput(mm); } return sprintf(buffer,"%d %d %d %d %d %d %d\n", - size, resident, shared, text, lib, data, 0); + size, resident, shared, text, lib, data, dirty); } diff -prauN linux-2.6.0-test1/fs/proc/base.c wli-2.6.0-test1-37/fs/proc/base.c --- linux-2.6.0-test1/fs/proc/base.c 2003-07-13 20:35:15.000000000 -0700 +++ wli-2.6.0-test1-37/fs/proc/base.c 2003-07-14 10:07:15.000000000 -0700 @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -136,16 +137,16 @@ static int proc_fd_link(struct inode *in atomic_inc(&files->count); task_unlock(task); if (files) { - spin_lock(&files->file_lock); + rcu_read_lock(); file = fcheck_files(files, fd); if (file) { *mnt = mntget(file->f_vfsmnt); *dentry = dget(file->f_dentry); - spin_unlock(&files->file_lock); + rcu_read_unlock(); put_files_struct(files); return 0; } - spin_unlock(&files->file_lock); + rcu_read_unlock(); put_files_struct(files); } return -ENOENT; } @@ -188,10 
+189,12 @@ static int proc_cwd_link(struct inode *i atomic_inc(&fs->count); task_unlock(proc_task(inode)); if (fs) { - read_lock(&fs->lock); - *mnt = mntget(fs->pwdmnt); - *dentry = dget(fs->pwd); - read_unlock(&fs->lock); + struct fs_dirs *dirs; + rcu_read_lock(); /* fs->lock */ + dirs = fs->dirs; + *mnt = mntget(dirs->pwdmnt); + *dentry = dget(dirs->pwd); + rcu_read_unlock(); /* fs->lock */ result = 0; put_fs_struct(fs); } @@ -208,10 +211,12 @@ static int proc_root_link(struct inode * atomic_inc(&fs->count); task_unlock(proc_task(inode)); if (fs) { - read_lock(&fs->lock); - *mnt = mntget(fs->rootmnt); - *dentry = dget(fs->root); - read_unlock(&fs->lock); + struct fs_dirs *dirs; + rcu_read_lock(); /* fs->lock */ + dirs = fs->dirs; + *mnt = mntget(dirs->rootmnt); + *dentry = dget(dirs->root); + rcu_read_unlock(); /* fs->lock */ result = 0; put_fs_struct(fs); } @@ -299,27 +304,29 @@ static int proc_check_root(struct inode struct dentry *de, *base, *root; struct vfsmount *our_vfsmnt, *vfsmnt, *mnt; int res = 0; + unsigned long seq; + struct fs_struct *fs = current->fs; + struct fs_dirs *dirs; if (proc_root_link(inode, &root, &vfsmnt)) /* Ewww... */ return -ENOENT; - read_lock(&current->fs->lock); - our_vfsmnt = mntget(current->fs->rootmnt); - base = dget(current->fs->root); - read_unlock(&current->fs->lock); - - spin_lock(&vfsmount_lock); - de = root; - mnt = vfsmnt; - - while (vfsmnt != our_vfsmnt) { - if (vfsmnt == vfsmnt->mnt_parent) { - spin_unlock(&vfsmount_lock); - goto out; + rcu_read_lock(); /* current->fs->lock */ + dirs = fs->dirs; + our_vfsmnt = mntget(dirs->rootmnt); + base = dget(dirs->root); + rcu_read_unlock(); /* current->fs->lock */ + + do { + seq = read_seqbegin(&mnt_move_lock); + de = root; + mnt = vfsmnt; + while (vfsmnt != our_vfsmnt) { + if (vfsmnt == vfsmnt->mnt_parent) + goto out; + de = vfsmnt->mnt_mountpoint; + vfsmnt = vfsmnt->mnt_parent; } - de = vfsmnt->mnt_mountpoint; - vfsmnt = vfsmnt->mnt_parent; - } - spin_unlock(&vfsmount_lock); + } while (read_seqretry(&mnt_move_lock, seq)); if (!is_subdir(de, base)) goto out; @@ -638,8 +645,6 @@ static int proc_pid_readlink(struct dent struct dentry *de; struct vfsmount *mnt = NULL; - lock_kernel(); - if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE)) goto out; error = proc_check_root(inode); @@ -654,7 +659,6 @@ static int proc_pid_readlink(struct dent dput(de); mntput(mnt); out: - unlock_kernel(); return error; } @@ -705,7 +709,7 @@ static int proc_readfd(struct file * fil task_unlock(p); if (!files) goto out; - spin_lock(&files->file_lock); + rcu_read_lock(); for (fd = filp->f_pos-2; fd < files->max_fds; fd++, filp->f_pos++) { @@ -713,7 +717,7 @@ static int proc_readfd(struct file * fil if (!fcheck_files(files, fd)) continue; - spin_unlock(&files->file_lock); + rcu_read_unlock(); j = NUMBUF; i = fd; @@ -725,12 +729,12 @@ static int proc_readfd(struct file * fil ino = fake_ino(pid, PROC_PID_FD_DIR + fd); if (filldir(dirent, buf+j, NUMBUF-j, fd+2, ino, DT_LNK) < 0) { - spin_lock(&files->file_lock); + rcu_read_lock(); break; } - spin_lock(&files->file_lock); + rcu_read_lock(); } - spin_unlock(&files->file_lock); + rcu_read_unlock(); put_files_struct(files); } out: @@ -885,13 +889,13 @@ static int pid_fd_revalidate(struct dent atomic_inc(&files->count); task_unlock(task); if (files) { - spin_lock(&files->file_lock); + rcu_read_lock(); if (fcheck_files(files, fd)) { - spin_unlock(&files->file_lock); + rcu_read_unlock(); put_files_struct(files); return 1; } - spin_unlock(&files->file_lock); + rcu_read_unlock(); 
put_files_struct(files); } d_drop(dentry); @@ -988,7 +992,7 @@ static struct dentry *proc_lookupfd(stru if (!files) goto out_unlock; inode->i_mode = S_IFLNK; - spin_lock(&files->file_lock); + rcu_read_lock(); file = fcheck_files(files, fd); if (!file) goto out_unlock2; @@ -996,7 +1000,7 @@ static struct dentry *proc_lookupfd(stru inode->i_mode |= S_IRUSR | S_IXUSR; if (file->f_mode & 2) inode->i_mode |= S_IWUSR | S_IXUSR; - spin_unlock(&files->file_lock); + rcu_read_unlock(); put_files_struct(files); inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; @@ -1006,7 +1010,7 @@ static struct dentry *proc_lookupfd(stru return NULL; out_unlock2: - spin_unlock(&files->file_lock); + rcu_read_unlock(); put_files_struct(files); out_unlock: iput(inode); @@ -1390,62 +1394,37 @@ out: } #define PROC_NUMBUF 10 -#define PROC_MAXPIDS 20 - -/* - * Get a few pid's to return for filldir - we need to hold the - * tasklist lock while doing this, and we must release it before - * we actually do the filldir itself, so we use a temp buffer.. - */ -static int get_pid_list(int index, unsigned int *pids) -{ - struct task_struct *p; - int nr_pids = 0; - - index--; - read_lock(&tasklist_lock); - for_each_process(p) { - int pid = p->pid; - if (!pid_alive(p)) - continue; - if (--index >= 0) - continue; - pids[nr_pids] = pid; - nr_pids++; - if (nr_pids >= PROC_MAXPIDS) - break; - } - read_unlock(&tasklist_lock); - return nr_pids; -} int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int pid_array[PROC_MAXPIDS]; char buf[PROC_NUMBUF]; unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; - unsigned int nr_pids, i; + int pid; if (!nr) { ino_t ino = fake_ino(0,PROC_PID_INO); if (filldir(dirent, "self", 4, filp->f_pos, ino, DT_LNK) < 0) return 0; filp->f_pos++; - nr++; + nr = 1; } + pid = nr - 1; + for (;;) { + unsigned long i, j; + ino_t ino; - nr_pids = get_pid_list(nr, pid_array); - - for (i = 0; i < nr_pids; i++) { - int pid = pid_array[i]; - ino_t ino = fake_ino(pid,PROC_PID_INO); - unsigned long j = PROC_NUMBUF; + pid = find_next_pid(pid); + if (pid < 0) + break; - do buf[--j] = '0' + (pid % 10); while (pid/=10); + i = pid; + j = PROC_NUMBUF; + do buf[--j] = '0' + (i % 10); while (i/=10); + ino = fake_ino(pid, PROC_PID_INO); if (filldir(dirent, buf+j, PROC_NUMBUF-j, filp->f_pos, ino, DT_DIR) < 0) break; - filp->f_pos++; + filp->f_pos = pid + 1 + FIRST_PROCESS_ENTRY; } return 0; } diff -prauN linux-2.6.0-test1/fs/proc/generic.c wli-2.6.0-test1-37/fs/proc/generic.c --- linux-2.6.0-test1/fs/proc/generic.c 2003-07-13 20:39:23.000000000 -0700 +++ wli-2.6.0-test1-37/fs/proc/generic.c 2003-07-14 10:16:55.000000000 -0700 @@ -487,29 +487,31 @@ static int proc_register(struct proc_dir */ static void proc_kill_inodes(struct proc_dir_entry *de) { - struct list_head *p; + int cpu; struct super_block *sb = proc_mnt->mnt_sb; /* * Actually it's a partial revoke(). 
*/ - file_list_lock(); - list_for_each(p, &sb->s_files) { - struct file * filp = list_entry(p, struct file, f_list); - struct dentry * dentry = filp->f_dentry; - struct inode * inode; - struct file_operations *fops; - - if (dentry->d_op != &proc_dentry_operations) - continue; - inode = dentry->d_inode; - if (PDE(inode) != de) - continue; - fops = filp->f_op; - filp->f_op = NULL; - fops_put(fops); + file_list_lock_all(sb->s_file_lists); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + struct file *filp; + list_for_each_entry(filp, &sb->s_file_lists[cpu].list, f_list) { + struct dentry * dentry = filp->f_dentry; + struct inode * inode; + struct file_operations *fops; + + if (dentry->d_op != &proc_dentry_operations) + continue; + inode = dentry->d_inode; + if (PDE(inode) != de) + continue; + fops = filp->f_op; + filp->f_op = NULL; + fops_put(fops); + } } - file_list_unlock(); + file_list_unlock_all(sb->s_file_lists); } static struct proc_dir_entry *proc_create(struct proc_dir_entry **parent, diff -prauN linux-2.6.0-test1/fs/proc/proc_misc.c wli-2.6.0-test1-37/fs/proc/proc_misc.c --- linux-2.6.0-test1/fs/proc/proc_misc.c 2003-07-13 20:30:43.000000000 -0700 +++ wli-2.6.0-test1-37/fs/proc/proc_misc.c 2003-07-14 08:52:52.000000000 -0700 @@ -200,6 +200,7 @@ static int meminfo_read_proc(char *page, "SwapFree: %8lu kB\n" "Dirty: %8lu kB\n" "Writeback: %8lu kB\n" + "Deferred: %8lu kB\n" "Mapped: %8lu kB\n" "Slab: %8lu kB\n" "Committed_AS: %8u kB\n" @@ -210,8 +211,8 @@ static int meminfo_read_proc(char *page, K(i.totalram), K(i.freeram), K(i.bufferram), - K(get_page_cache_size()-total_swapcache_pages-i.bufferram), - K(total_swapcache_pages), + K(get_page_cache_size() - i.bufferram - ps.nr_swapcache), + K(ps.nr_swapcache), K(active), K(inactive), K(i.totalhigh), @@ -222,6 +223,7 @@ static int meminfo_read_proc(char *page, K(i.freeswap), K(ps.nr_dirty), K(ps.nr_writeback), + K(nr_deferred_pages()), K(ps.nr_mapped), K(ps.nr_slab), K(committed), diff -prauN linux-2.6.0-test1/fs/proc/task_mmu.c wli-2.6.0-test1-37/fs/proc/task_mmu.c --- linux-2.6.0-test1/fs/proc/task_mmu.c 2003-07-13 20:30:47.000000000 -0700 +++ wli-2.6.0-test1-37/fs/proc/task_mmu.c 2003-07-14 07:33:22.000000000 -0700 @@ -5,27 +5,6 @@ char *task_mem(struct mm_struct *mm, char *buffer) { - unsigned long data = 0, stack = 0, exec = 0, lib = 0; - struct vm_area_struct *vma; - - down_read(&mm->mmap_sem); - for (vma = mm->mmap; vma; vma = vma->vm_next) { - unsigned long len = (vma->vm_end - vma->vm_start) >> 10; - if (!vma->vm_file) { - data += len; - if (vma->vm_flags & VM_GROWSDOWN) - stack += len; - continue; - } - if (vma->vm_flags & VM_WRITE) - continue; - if (vma->vm_flags & VM_EXEC) { - exec += len; - if (vma->vm_flags & VM_EXECUTABLE) - continue; - lib += len; - } - } buffer += sprintf(buffer, "VmSize:\t%8lu kB\n" "VmLck:\t%8lu kB\n" @@ -37,9 +16,10 @@ char *task_mem(struct mm_struct *mm, cha mm->total_vm << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), mm->rss << (PAGE_SHIFT-10), - data - stack, stack, - exec - lib, lib); - up_read(&mm->mmap_sem); + (mm->data - mm->stack) << (PAGE_SHIFT-10), + mm->stack << (PAGE_SHIFT-10), + mm->text << (PAGE_SHIFT-10), + mm->lib << (PAGE_SHIFT-10)); return buffer; } @@ -49,30 +29,15 @@ unsigned long task_vsize(struct mm_struc } int task_statm(struct mm_struct *mm, int *shared, int *text, - int *data, int *resident) + int *lib, int *data, int *resident, int *dirty) { - struct vm_area_struct *vma; - int size = 0; - + *shared = mm->shared; + *text = mm->text; + *lib = mm->lib; + *data = mm->data; + *dirty = 
mm->dirty; *resident = mm->rss; - for (vma = mm->mmap; vma; vma = vma->vm_next) { - int pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; - - size += pages; - if (is_vm_hugetlb_page(vma)) { - if (!(vma->vm_flags & VM_DONTCOPY)) - *shared += pages; - continue; - } - if (vma->vm_flags & VM_SHARED || !list_empty(&vma->shared)) - *shared += pages; - if (vma->vm_flags & VM_EXECUTABLE) - *text += pages; - else - *data += pages; - } - - return size; + return mm->total_vm; } static int show_map(struct seq_file *m, void *v) diff -prauN linux-2.6.0-test1/fs/proc/task_nommu.c wli-2.6.0-test1-37/fs/proc/task_nommu.c --- linux-2.6.0-test1/fs/proc/task_nommu.c 2003-07-13 20:34:42.000000000 -0700 +++ wli-2.6.0-test1-37/fs/proc/task_nommu.c 2003-07-14 09:22:36.000000000 -0700 @@ -67,19 +67,23 @@ unsigned long task_vsize(struct mm_struc struct mm_tblock_struct *tbp; unsigned long vsize = 0; + down_read(&mm->mmap_sem); for (tbp = &mm->context.tblock; tbp; tbp = tbp->next) { if (tbp->rblock) vsize += kobjsize(tbp->rblock->kblock); } - + up_read(&mm->mmap_sem); return vsize; } int task_statm(struct mm_struct *mm, int *shared, int *text, - int *data, int *resident) + int *lib, int *data, int *resident, int *dirty) { struct mm_tblock_struct *tbp; - int size = kobjsize(mm); + int size; + + down_read(&mm->mmap_sem); + size = kobjsize(mm); for (tbp = &mm->context.tblock; tbp; tbp = tbp->next) { if (tbp->next) @@ -92,8 +96,9 @@ int task_statm(struct mm_struct *mm, int size += (*text = mm->end_code - mm->start_code); size += (*data = mm->start_stack - mm->start_data); - + *shared = *lib = *dirty = 0; *resident = size; + up_read(&mm->mmap_sem); return size; } diff -prauN linux-2.6.0-test1/fs/qnx4/inode.c wli-2.6.0-test1-37/fs/qnx4/inode.c --- linux-2.6.0-test1/fs/qnx4/inode.c 2003-07-13 20:37:31.000000000 -0700 +++ wli-2.6.0-test1-37/fs/qnx4/inode.c 2003-07-14 08:52:52.000000000 -0700 @@ -434,7 +434,7 @@ static int qnx4_readpage(struct file *fi static int qnx4_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { - struct qnx4_inode_info *qnx4_inode = qnx4_i(page->mapping->host); + struct qnx4_inode_info *qnx4_inode = qnx4_i(page_mapping(page)->host); return cont_prepare_write(page, from, to, qnx4_get_block, &qnx4_inode->mmu_private); } diff -prauN linux-2.6.0-test1/fs/reiserfs/inode.c wli-2.6.0-test1-37/fs/reiserfs/inode.c --- linux-2.6.0-test1/fs/reiserfs/inode.c 2003-07-13 20:32:39.000000000 -0700 +++ wli-2.6.0-test1-37/fs/reiserfs/inode.c 2003-07-14 08:52:52.000000000 -0700 @@ -1998,7 +1998,7 @@ static void lock_buffer_for_writepage(st lock_buffer(bh); } else { if (test_set_buffer_locked(bh)) { - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); return; } } @@ -2017,7 +2017,7 @@ static void lock_buffer_for_writepage(st * code to handle reiserfs tails. 
*/ static int reiserfs_write_full_page(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host ; + struct inode *inode = page_mapping(page)->host ; unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT ; int error = 0; unsigned long block ; @@ -2170,7 +2170,7 @@ static int reiserfs_readpage (struct fil static int reiserfs_writepage (struct page * page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host ; + struct inode *inode = page_mapping(page)->host ; reiserfs_wait_on_write_block(inode->i_sb) ; return reiserfs_write_full_page(page, wbc) ; } @@ -2178,7 +2178,7 @@ static int reiserfs_writepage (struct pa int reiserfs_prepare_write(struct file *f, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host ; + struct inode *inode = page_mapping(page)->host ; reiserfs_wait_on_write_block(inode->i_sb) ; fix_tail_page_for_writing(page) ; return block_prepare_write(page, from, to, reiserfs_get_block) ; @@ -2191,7 +2191,7 @@ static sector_t reiserfs_aop_bmap(struct static int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host ; + struct inode *inode = page_mapping(page)->host ; loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; int ret ; @@ -2282,7 +2282,7 @@ void i_attrs_to_sd_attrs( struct inode * */ static int reiserfs_releasepage(struct page *page, int unused_gfp_flags) { - struct inode *inode = page->mapping->host ; + struct inode *inode = page_mapping(page)->host ; struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ; struct buffer_head *head ; struct buffer_head *bh ; diff -prauN linux-2.6.0-test1/fs/romfs/inode.c wli-2.6.0-test1-37/fs/romfs/inode.c --- linux-2.6.0-test1/fs/romfs/inode.c 2003-07-13 20:35:56.000000000 -0700 +++ wli-2.6.0-test1-37/fs/romfs/inode.c 2003-07-14 08:52:52.000000000 -0700 @@ -414,7 +414,7 @@ out: unlock_kernel(); static int romfs_readpage(struct file *file, struct page * page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; unsigned long offset, avail, readlen; void *buf; int result = -EIO; diff -prauN linux-2.6.0-test1/fs/select.c wli-2.6.0-test1-37/fs/select.c --- linux-2.6.0-test1/fs/select.c 2003-07-13 20:29:27.000000000 -0700 +++ wli-2.6.0-test1-37/fs/select.c 2003-07-14 09:45:14.000000000 -0700 @@ -20,6 +20,7 @@ #include /* for STICKY_TIMEOUTS */ #include #include +#include #include @@ -126,13 +127,16 @@ void __pollwait(struct file *filp, wait_ static int max_select_fd(unsigned long n, fd_set_bits *fds) { unsigned long *open_fds; + fd_set *open_fdset; unsigned long set; int max; /* handle last in-complete long-word first */ set = ~(~0UL << (n & (__NFDBITS-1))); n /= __NFDBITS; - open_fds = current->files->open_fds->fds_bits+n; + open_fdset = current->files->open_fds; + read_barrier_depends(); + open_fds = open_fdset->fds_bits+n; max = 0; if (set) { set &= BITS(fds, n); @@ -179,9 +183,9 @@ int do_select(int n, fd_set_bits *fds, l int retval, i; long __timeout = *timeout; - spin_lock(&current->files->file_lock); + rcu_read_lock(); retval = max_select_fd(n, fds); - spin_unlock(&current->files->file_lock); + rcu_read_unlock(); if (retval < 0) return retval; diff -prauN linux-2.6.0-test1/fs/smbfs/file.c wli-2.6.0-test1-37/fs/smbfs/file.c --- linux-2.6.0-test1/fs/smbfs/file.c 2003-07-13 20:37:33.000000000 -0700 +++ wli-2.6.0-test1-37/fs/smbfs/file.c 2003-07-14 08:52:52.000000000 -0700 @@ -172,7 +172,7 @@ smb_writepage_sync(struct 
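inode *inode, static int smb_writepage(struct page *page, struct writeback_control *wbc) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); struct inode *inode; unsigned long end_index; unsigned offset = PAGE_CACHE_SIZE; diff -prauN linux-2.6.0-test1/fs/super.c wli-2.6.0-test1-37/fs/super.c --- linux-2.6.0-test1/fs/super.c 2003-07-13 20:33:47.000000000 -0700 +++ wli-2.6.0-test1-37/fs/super.c 2003-07-14 10:16:55.000000000 -0700 @@ -62,7 +62,7 @@ static struct super_block *alloc_super(v } INIT_LIST_HEAD(&s->s_dirty); INIT_LIST_HEAD(&s->s_io); - INIT_LIST_HEAD(&s->s_files); + file_list_init(s->s_file_lists); INIT_LIST_HEAD(&s->s_instances); INIT_HLIST_HEAD(&s->s_anon); init_rwsem(&s->s_umount); @@ -432,13 +432,15 @@ out: static void mark_files_ro(struct super_block *sb) { struct file *f; + int cpu; - file_list_lock(); - list_for_each_entry(f, &sb->s_files, f_list) { - if (S_ISREG(f->f_dentry->d_inode->i_mode) && file_count(f)) - f->f_mode &= ~FMODE_WRITE; - } - file_list_unlock(); + file_list_lock_all(sb->s_file_lists); + for (cpu = 0; cpu < NR_CPUS; ++cpu) + list_for_each_entry(f, &sb->s_file_lists[cpu].list, f_list) + if (S_ISREG(f->f_dentry->d_inode->i_mode) && + file_count(f)) + f->f_mode &= ~FMODE_WRITE; + file_list_unlock_all(sb->s_file_lists); } /**

The per-superblock file list becomes per-CPU here, completing the dentry_open() and proc_kill_inodes() changes earlier: opens touch only the local CPU's lock, while the rare whole-list walkers (mark_files_ro() above) lock every per-CPU list. The struct file_list declaration itself lives in a header hunk outside this excerpt; the layout implied by its users would be:

	/* Inferred layout; the real declaration is in the include/linux/fs.h
	 * part of the patch, which is not shown here. */
	struct file_list {
		spinlock_t lock;
		struct list_head list;
	};

	/* Open path (dentry_open() above): add to this CPU's list only. */
	cpu = get_cpu();
	container = f->f_container = &inode->i_sb->s_file_lists[cpu];
	spin_lock(&container->lock);
	list_add(&f->f_list, &container->list);
	spin_unlock(&container->lock);
	put_cpu();

Because a file can be removed on a different CPU from the one that opened it, removal goes through the stored f->f_container pointer rather than recomputing the CPU index, which is why the close path re-reads f_container under rcu_read_lock().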
diff -prauN linux-2.6.0-test1/fs/sysv/dir.c wli-2.6.0-test1-37/fs/sysv/dir.c --- linux-2.6.0-test1/fs/sysv/dir.c 2003-07-13 20:32:31.000000000 -0700 +++ wli-2.6.0-test1-37/fs/sysv/dir.c 2003-07-14 08:52:52.000000000 -0700 @@ -39,10 +39,10 @@ static inline unsigned long dir_pages(st static int dir_commit_chunk(struct page *page, unsigned from, unsigned to) { - struct inode *dir = (struct inode *)page->mapping->host; + struct inode *dir = (struct inode *)page_mapping(page)->host; int err = 0; - page->mapping->a_ops->commit_write(NULL, page, from, to); + page_mapping(page)->a_ops->commit_write(NULL, page, from, to); if (IS_DIRSYNC(dir)) err = write_one_page(page, 1); else @@ -225,7 +225,7 @@ got_it: from = (char*)de - (char*)page_address(page); to = from + SYSV_DIRSIZE; lock_page(page); - err = page->mapping->a_ops->prepare_write(NULL, page, from, to); + err = page_mapping(page)->a_ops->prepare_write(NULL, page, from, to); if (err) goto out_unlock; memcpy (de->name, name, namelen); @@ -245,7 +245,7 @@ out_unlock: int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); struct inode *inode = (struct inode*)mapping->host; char *kaddr = (char*)page_address(page); unsigned from = (char*)de - kaddr; @@ -347,13 +347,13 @@ not_empty: void sysv_set_link(struct sysv_dir_entry *de, struct page *page, struct inode *inode) { - struct inode *dir = (struct inode*)page->mapping->host; + struct inode *dir = (struct inode*)page_mapping(page)->host; unsigned from = (char *)de-(char*)page_address(page); unsigned to = from + SYSV_DIRSIZE; int err; lock_page(page); - err = page->mapping->a_ops->prepare_write(NULL, page, from, to); + err = page_mapping(page)->a_ops->prepare_write(NULL, page, from, to); if (err) BUG(); de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); diff -prauN linux-2.6.0-test1/fs/udf/file.c wli-2.6.0-test1-37/fs/udf/file.c --- linux-2.6.0-test1/fs/udf/file.c 2003-07-13 20:34:43.000000000 -0700 +++ wli-2.6.0-test1-37/fs/udf/file.c 2003-07-14 08:52:52.000000000 -0700 @@ -46,7 +46,7 @@ static int udf_adinicb_readpage(struct file *file, struct page * page) { - struct 
inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; char *kaddr; if (!PageLocked(page)) @@ -64,7 +64,7 @@ static int udf_adinicb_readpage(struct f static int udf_adinicb_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; char *kaddr; if (!PageLocked(page)) @@ -87,7 +87,7 @@ static int udf_adinicb_prepare_write(str static int udf_adinicb_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; char *kaddr = page_address(page); memcpy(UDF_I_DATA(inode) + UDF_I_LENEATTR(inode) + offset, diff -prauN linux-2.6.0-test1/fs/udf/symlink.c wli-2.6.0-test1-37/fs/udf/symlink.c --- linux-2.6.0-test1/fs/udf/symlink.c 2003-07-13 20:31:58.000000000 -0700 +++ wli-2.6.0-test1-37/fs/udf/symlink.c 2003-07-14 08:52:52.000000000 -0700 @@ -80,7 +80,7 @@ static void udf_pc_to_char(struct super_ static int udf_symlink_filler(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct buffer_head *bh = NULL; char *symlink; int err = -EIO; diff -prauN linux-2.6.0-test1/fs/xfs/linux/xfs_aops.c wli-2.6.0-test1-37/fs/xfs/linux/xfs_aops.c --- linux-2.6.0-test1/fs/xfs/linux/xfs_aops.c 2003-07-13 20:37:58.000000000 -0700 +++ wli-2.6.0-test1-37/fs/xfs/linux/xfs_aops.c 2003-07-14 08:52:52.000000000 -0700 @@ -210,7 +210,7 @@ probe_unwritten_page( if (PageWriteback(page)) goto out; - if (page->mapping && page_has_buffers(page)) { + if (page_mapping(page) && page_has_buffers(page)) { struct buffer_head *bh, *head; unsigned long p_offset = 0; @@ -257,7 +257,7 @@ probe_unmapped_page( if (PageWriteback(page)) goto out; - if (page->mapping && PageDirty(page)) { + if (page_mapping(page) && PageDirty(page)) { if (page_has_buffers(page)) { struct buffer_head *bh, *head; @@ -337,7 +337,7 @@ probe_delalloc_page( if (PageWriteback(page)) goto out; - if (page->mapping && page_has_buffers(page)) { + if (page_mapping(page) && page_has_buffers(page)) { struct buffer_head *bh, *head; int acceptable = 0; @@ -621,7 +621,7 @@ page_state_convert( int startio, int unmapped) /* also implies page uptodate */ { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct buffer_head *bh_arr[MAX_BUF_PER_PAGE], *bh, *head; page_buf_bmap_t *mp, map; unsigned long p_offset = 0, end_index; @@ -1032,7 +1032,7 @@ linvfs_writepage( int error; int need_trans; int delalloc, unmapped, unwritten; - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; /* * We need a transaction if: diff -prauN linux-2.6.0-test1/include/asm-alpha/pgalloc.h wli-2.6.0-test1-37/include/asm-alpha/pgalloc.h --- linux-2.6.0-test1/include/asm-alpha/pgalloc.h 2003-07-13 20:28:55.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-alpha/pgalloc.h 2003-07-14 06:49:00.000000000 -0700 @@ -24,9 +24,9 @@ pmd_populate_kernel(struct mm_struct *mm } static inline void -pgd_populate(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmd) +pgd_populate(struct mm_struct *mm, pgd_t *pgd, struct page *pmd) { - pgd_set(pgd, pmd); + pgd_set(pgd, page_address(pmd)); } extern pgd_t *pgd_alloc(struct mm_struct *mm); @@ -37,19 +37,29 @@ pgd_free(pgd_t *pgd) free_page((unsigned long)pgd); } -static inline pmd_t * +static inline struct page * pmd_alloc_one(struct mm_struct *mm, unsigned long address) { - 
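pmd_t *ret = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); - if (ret) - clear_page(ret); - return ret; + struct page *page = alloc_page(GFP_KERNEL|__GFP_REPEAT); + if (page) + clear_highpage(page); + return page; +} + +static inline pmd_t * +pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) +{ + struct page *page = pmd_alloc_one(mm, addr); + if (page) + return page_address(page); + else + return NULL; } static inline void -pmd_free(pmd_t *pmd) +pmd_free(struct page *pmd) { - free_page((unsigned long)pmd); + __free_page(pmd); } extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr);

pmd_alloc_one() now returns a struct page instead of a kernel virtual address, which is what lets pmds be allocated from highmem on architectures that can map them transiently; kernel page tables keep a direct pointer through pmd_alloc_one_kernel(). The caller-side discipline this implies (see the pmd_offset_map()/pmd_unmap() macro family in the pgtable.h hunks that follow) looks roughly like the sketch below for a user page-table walk; it assumes mm->page_table_lock is held and that 'entry' is a pte the caller has built:

	pmd_t *pmd;
	pte_t *pte;

	pmd = pmd_alloc_map(mm, pgd, addr);		/* allocate + temporarily map */
	if (!pmd)
		return -ENOMEM;
	pte = pte_alloc_map(mm, pgd, &pmd, addr);	/* may remap the pmd, hence &pmd */
	if (!pte) {
		pmd_unmap(pmd);				/* map/unmap must balance */
		return -ENOMEM;
	}
	set_pte(pte, entry);
	pte_unmap(pte);
	pmd_unmap(pmd);					/* a no-op on alpha, where pmds
							   never live in highmem */

Kernel-address walks use pmd_alloc_kernel()/pmd_offset_kernel() instead and skip the map/unmap pairing entirely.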
diff -prauN linux-2.6.0-test1/include/asm-alpha/pgtable.h wli-2.6.0-test1-37/include/asm-alpha/pgtable.h --- linux-2.6.0-test1/include/asm-alpha/pgtable.h 2003-07-13 20:32:33.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-alpha/pgtable.h 2003-07-14 06:49:00.000000000 -0700 @@ -229,9 +229,11 @@ pmd_page_kernel(pmd_t pmd) #define pmd_page(pmd) (mem_map + ((pmd_val(pmd) & _PFN_MASK) >> 32)) #endif -extern inline unsigned long pgd_page(pgd_t pgd) +extern inline unsigned long __pgd_page(pgd_t pgd) { return PAGE_OFFSET + ((pgd_val(pgd) & _PFN_MASK) >> (32-PAGE_SHIFT)); } +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) + extern inline int pte_none(pte_t pte) { return !pte_val(pte); } extern inline int pte_present(pte_t pte) { return pte_val(pte) & _PAGE_VALID; } extern inline void pte_clear(pte_t *ptep) { pte_val(*ptep) = 0; } @@ -280,9 +282,15 @@ extern inline pte_t pte_mkyoung(pte_t pt /* Find an entry in the second-level page table.. */ extern inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) { - return (pmd_t *) pgd_page(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PAGE - 1)); + return (pmd_t *)__pgd_page(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PAGE - 1)); } +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table.. 
*/ extern inline pte_t * pte_offset_kernel(pmd_t * dir, unsigned long address) { diff -prauN linux-2.6.0-test1/include/asm-alpha/smp.h wli-2.6.0-test1-37/include/asm-alpha/smp.h --- linux-2.6.0-test1/include/asm-alpha/smp.h 2003-07-13 20:38:05.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-alpha/smp.h 2003-07-14 06:31:10.000000000 -0700 @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -44,27 +45,12 @@ extern struct cpuinfo_alpha cpu_data[NR_ #define hard_smp_processor_id() __hard_smp_processor_id() #define smp_processor_id() (current_thread_info()->cpu) -extern unsigned long cpu_present_mask; -extern volatile unsigned long cpu_online_map; +extern cpumask_t cpu_present_mask; +extern cpumask_t cpu_online_map; extern int smp_num_cpus; -#define cpu_possible(cpu) (cpu_present_mask & (1UL << (cpu))) -#define cpu_online(cpu) (cpu_online_map & (1UL << (cpu))) - -static inline int -num_online_cpus(void) -{ - return hweight64(cpu_online_map); -} - -extern inline int -any_online_cpu(unsigned int mask) -{ - if (mask & cpu_online_map) - return __ffs(mask & cpu_online_map); - - return -1; -} +#define cpu_possible(cpu) cpu_isset(cpu, cpu_present_mask) +#define cpu_online(cpu) cpu_isset(cpu, cpu_online_map) extern int smp_call_function_on_cpu(void (*func) (void *info), void *info,int retry, int wait, unsigned long cpu); diff -prauN linux-2.6.0-test1/include/asm-arm/pgalloc.h wli-2.6.0-test1-37/include/asm-arm/pgalloc.h --- linux-2.6.0-test1/include/asm-arm/pgalloc.h 2003-07-13 20:38:36.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-arm/pgalloc.h 2003-07-14 06:49:00.000000000 -0700 @@ -16,7 +16,8 @@ /* * Since we have only two-level page tables, these are trivial */ -#define pmd_alloc_one(mm,addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm,addr) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm,addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(pmd) do { } while (0) #define pgd_populate(mm,pmd,pte) BUG() diff -prauN linux-2.6.0-test1/include/asm-arm/pgtable.h wli-2.6.0-test1-37/include/asm-arm/pgtable.h --- linux-2.6.0-test1/include/asm-arm/pgtable.h 2003-07-13 20:32:28.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-arm/pgtable.h 2003-07-14 06:49:00.000000000 -0700 @@ -125,6 +125,11 @@ extern struct page *empty_zero_page; /* Find an entry in the second-level page table.. */ #define pmd_offset(dir, addr) ((pmd_t *)(dir)) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) /* Find an entry in the third-level page table.. */ #define __pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) diff -prauN linux-2.6.0-test1/include/asm-arm/proc-armv/cache.h wli-2.6.0-test1-37/include/asm-arm/proc-armv/cache.h --- linux-2.6.0-test1/include/asm-arm/proc-armv/cache.h 2003-07-13 20:33:50.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-arm/proc-armv/cache.h 2003-07-14 08:52:52.000000000 -0700 @@ -246,8 +246,8 @@ flush_cache_page(struct vm_area_struct * * flush_dcache_page is used when the kernel has written to the page * cache page at virtual address page->virtual.
* - * If this page isn't mapped (ie, page->mapping = NULL), or it has - * userspace mappings (page->mapping->i_mmap or page->mapping->i_mmap_shared) + * If this page isn't mapped (ie, page_mapping(page) = NULL), or it has + * userspace mappings (page_mapping(page)->i_mmap or page_mapping(page)->i_mmap_shared) * then we _must_ always clean + invalidate the dcache entries associated * with the kernel mapping. * @@ -262,7 +262,7 @@ extern void __flush_dcache_page(struct p static inline void flush_dcache_page(struct page *page) { - if (page->mapping && !mapping_mapped(page->mapping)) + if (page_mapping(page) && !mapping_mapped(page_mapping(page))) set_bit(PG_dcache_dirty, &page->flags); else __flush_dcache_page(page); diff -prauN linux-2.6.0-test1/include/asm-arm26/pgalloc.h wli-2.6.0-test1-37/include/asm-arm26/pgalloc.h --- linux-2.6.0-test1/include/asm-arm26/pgalloc.h 2003-07-13 20:39:28.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-arm26/pgalloc.h 2003-07-14 06:49:52.000000000 -0700 @@ -55,7 +55,8 @@ pmd_populate_kernel(struct mm_struct *mm * is thrown away. It just cant be zero. -IM */ -#define pmd_alloc_one(mm,addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm,addr) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm,addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(pmd) do { } while (0) #define pgd_populate(mm,pmd,pte) BUG() diff -prauN linux-2.6.0-test1/include/asm-arm26/pgtable.h wli-2.6.0-test1-37/include/asm-arm26/pgtable.h --- linux-2.6.0-test1/include/asm-arm26/pgtable.h 2003-07-13 20:30:40.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-arm26/pgtable.h 2003-07-14 06:49:00.000000000 -0700 @@ -99,7 +99,7 @@ extern struct page *empty_zero_page; * on arm26 we have no 2nd level page table. we simulate this by removing the * PMD. * - * pgd_none is 0 to prevernt pmd_alloc() calling __pmd_alloc(). This causes it + * pgd_none is 0 to prevent pmd_alloc_map() calling __pmd_alloc(). This causes it * to return pmd_offset(pgd,addr) which is a pointer to the pgd (IOW, a no-op).
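To make the folded-pmd case concrete, here is a minimal sketch of a walk using the mapped-pmd API this patch introduces; pmd_alloc_map(), the four-argument pte_alloc_map(), pte_unmap() and pmd_unmap() are the patch's own names, while the surrounding locking and control flow are illustrative only:

	pmd_t *pmd;
	pte_t *pte;

	spin_lock(&mm->page_table_lock);
	pmd = pmd_alloc_map(mm, pgd, addr);	/* may map the pmd page */
	if (pmd) {
		pte = pte_alloc_map(mm, pgd, &pmd, addr);
		if (pte) {
			/* ... operate on *pte ... */
			pte_unmap(pte);
		}
		pmd_unmap(pmd);		/* no-op on two-level architectures */
	}
	spin_unlock(&mm->page_table_lock);

On arm26 and the other two-level architectures converted here, pmd_offset_map() degenerates to a cast of the pgd pointer and pmd_unmap() to nothing, so the sketch compiles down to the old two-level walk.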
* * however, to work this way, whilst we are allocating 32 pgds, containing 32 @@ -134,7 +134,7 @@ extern struct page *empty_zero_page; #define _PMD_PRESENT (0x01) -/* These definitions allow us to optimise out stuff like pmd_alloc() */ +/* These definitions allow us to optimise out stuff like pmd_alloc_map() */ #define pgd_none(pgd) (0) #define pgd_bad(pgd) (0) #define pgd_present(pgd) (1) @@ -189,6 +189,12 @@ extern struct page *empty_zero_page; #define pte_unmap(pte) do { } while (0) #define pte_unmap_nested(pte) do { } while (0) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + #define _PAGE_PRESENT 0x01 #define _PAGE_READONLY 0x02 diff -prauN linux-2.6.0-test1/include/asm-arm26/rmap.h wli-2.6.0-test1-37/include/asm-arm26/rmap.h --- linux-2.6.0-test1/include/asm-arm26/rmap.h 2003-07-13 20:33:13.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-arm26/rmap.h 2003-07-14 08:52:52.000000000 -0700 @@ -14,14 +14,14 @@ static inline void pgtable_add_rmap(struct page *page, struct mm_struct * mm, unsigned long address) { - page->mapping = (void *)mm; + set_page_mapping(page, mm); page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); inc_page_state(nr_page_table_pages); } static inline void pgtable_remove_rmap(struct page *page) { - page->mapping = NULL; + set_page_mapping(page, NULL); page->index = 0; dec_page_state(nr_page_table_pages); } @@ -29,7 +29,7 @@ static inline void pgtable_remove_rmap(s static inline struct mm_struct * ptep_to_mm(pte_t * ptep) { struct page * page = virt_to_page(ptep); - return (struct mm_struct *)page->mapping; + return (struct mm_struct *)page_mapping(page); } /* The page table takes half of the page */ diff -prauN linux-2.6.0-test1/include/asm-cris/pgalloc.h wli-2.6.0-test1-37/include/asm-cris/pgalloc.h --- linux-2.6.0-test1/include/asm-cris/pgalloc.h 2003-07-13 20:37:26.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-cris/pgalloc.h 2003-07-14 06:50:25.000000000 -0700 @@ -57,7 +57,8 @@ extern inline void pte_free(struct page * the pgd will always be present.. */ -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm, addr) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm, addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(x) do { } while (0) #define __pmd_free_tlb(tlb,x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() diff -prauN linux-2.6.0-test1/include/asm-cris/pgtable.h wli-2.6.0-test1-37/include/asm-cris/pgtable.h --- linux-2.6.0-test1/include/asm-cris/pgtable.h 2003-07-13 20:29:21.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-cris/pgtable.h 2003-07-14 06:49:00.000000000 -0700 @@ -281,6 +281,12 @@ extern inline pmd_t * pmd_offset(pgd_t * return (pmd_t *) dir; } +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table..
*/ #define __pte_offset(address) \ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) diff -prauN linux-2.6.0-test1/include/asm-generic/cpumask_array.h wli-2.6.0-test1-37/include/asm-generic/cpumask_array.h --- linux-2.6.0-test1/include/asm-generic/cpumask_array.h 1969-12-31 16:00:00.000000000 -0800 +++ wli-2.6.0-test1-37/include/asm-generic/cpumask_array.h 2003-07-14 06:31:10.000000000 -0700 @@ -0,0 +1,41 @@ +#ifndef __ASM_GENERIC_CPUMASK_ARRAY_H +#define __ASM_GENERIC_CPUMASK_ARRAY_H + +#define CPU_ARRAY_SIZE BITS_TO_LONGS(NR_CPUS) + +#define cpu_set(cpu, map) set_bit(cpu, (map).mask) +#define cpu_clear(cpu, map) clear_bit(cpu, (map).mask) +#define cpu_isset(cpu, map) test_bit(cpu, (map).mask) +#define cpu_test_and_set(cpu, map) test_and_set_bit(cpu, (map).mask) + +#define cpus_and(dst,src1,src2) bitmap_and((dst).mask,(src1).mask, (src2).mask, NR_CPUS) +#define cpus_or(dst,src1,src2) bitmap_or((dst).mask, (src1).mask, (src2).mask, NR_CPUS) +#define cpus_clear(map) bitmap_clear((map).mask, NR_CPUS) +#define cpus_complement(map) bitmap_complement((map).mask, NR_CPUS) +#define cpus_equal(map1, map2) bitmap_equal((map1).mask, (map2).mask, NR_CPUS) +#define cpus_empty(map) bitmap_empty((map).mask, NR_CPUS) +#define cpus_weight(map) bitmap_weight((map).mask, NR_CPUS) +#define cpus_shift_right(d, s, n) bitmap_shift_right((d).mask, (s).mask, n, NR_CPUS) +#define cpus_shift_left(d, s, n) bitmap_shift_left((d).mask, (s).mask, n, NR_CPUS) +#define first_cpu(map) find_first_bit((map).mask, NR_CPUS) +#define next_cpu(cpu, map) find_next_bit((map).mask, NR_CPUS, cpu) + +/* only ever use this for things that are _never_ used on large boxen */ +#define cpus_coerce(map) ((map).mask[0]) +#define cpus_promote(map) ({ cpumask_t __cpu_mask = CPU_MASK_NONE;\ + __cpu_mask.mask[0] = map; \ + __cpu_mask; \ + }) +#define cpumask_of_cpu(cpu) ({ cpumask_t __cpu_mask = CPU_MASK_NONE;\ + cpu_set(cpu, __cpu_mask); \ + __cpu_mask; \ + }) +#define any_online_cpu(map) find_first_bit((map).mask, NR_CPUS) + +/* + * um, these need to be usable as static initializers + */ +#define CPU_MASK_ALL { {[0 ... CPU_ARRAY_SIZE-1] = ~0UL} } +#define CPU_MASK_NONE { {[0 ...
CPU_ARRAY_SIZE-1] = 0UL} } + +#endif /* __ASM_GENERIC_CPUMASK_ARRAY_H */ diff -prauN linux-2.6.0-test1/include/asm-generic/cpumask_const_reference.h wli-2.6.0-test1-37/include/asm-generic/cpumask_const_reference.h --- linux-2.6.0-test1/include/asm-generic/cpumask_const_reference.h 1969-12-31 16:00:00.000000000 -0800 +++ wli-2.6.0-test1-37/include/asm-generic/cpumask_const_reference.h 2003-07-14 06:31:10.000000000 -0700 @@ -0,0 +1,29 @@ +#ifndef __ASM_GENERIC_CPUMASK_CONST_REFERENCE_H +#define __ASM_GENERIC_CPUMASK_CONST_REFERENCE_H + +struct cpumask_ref { + const cpumask_t *val; +}; + +typedef const struct cpumask_ref cpumask_const_t; + +#define mk_cpumask_const(map) ((cpumask_const_t){ &(map) }) +#define cpu_isset_const(cpu, map) cpu_isset(cpu, *(map).val) + +#define cpus_and_const(dst,src1,src2) cpus_and(dst,*(src1).val,*(src2).val) +#define cpus_or_const(dst,src1,src2) cpus_or(dst,*(src1).val,*(src2).val) + +#define cpus_equal_const(map1, map2) cpus_equal(*(map1).val, *(map2).val) + +#define cpus_copy_const(map1, map2) bitmap_copy((map1).mask, (map2).val->mask, NR_CPUS) + +#define cpus_empty_const(map) cpus_empty(*(map).val) +#define cpus_weight_const(map) cpus_weight(*(map).val) +#define first_cpu_const(map) first_cpu(*(map).val) +#define next_cpu_const(cpu, map) next_cpu(cpu, *(map).val) + +/* only ever use this for things that are _never_ used on large boxen */ +#define cpus_coerce_const(map) cpus_coerce(*(map).val) +#define any_online_cpu_const(map) any_online_cpu(*(map).val) + +#endif /* __ASM_GENERIC_CPUMASK_CONST_REFERENCE_H */ diff -prauN linux-2.6.0-test1/include/asm-generic/cpumask_const_value.h wli-2.6.0-test1-37/include/asm-generic/cpumask_const_value.h --- linux-2.6.0-test1/include/asm-generic/cpumask_const_value.h 1969-12-31 16:00:00.000000000 -0800 +++ wli-2.6.0-test1-37/include/asm-generic/cpumask_const_value.h 2003-07-14 06:31:10.000000000 -0700 @@ -0,0 +1,21 @@ +#ifndef __ASM_GENERIC_CPUMASK_CONST_VALUE_H +#define __ASM_GENERIC_CPUMASK_CONST_VALUE_H + +typedef const cpumask_t cpumask_const_t; + +#define mk_cpumask_const(map) ((cpumask_const_t)(map)) +#define cpu_isset_const(cpu, map) cpu_isset(cpu, map) +#define cpus_and_const(dst,src1,src2) cpus_and(dst, src1, src2) +#define cpus_or_const(dst,src1,src2) cpus_or(dst, src1, src2) +#define cpus_equal_const(map1, map2) cpus_equal(map1, map2) +#define cpus_empty_const(map) cpus_empty(map) +#define cpus_copy_const(map1, map2) do { map1 = (cpumask_t)map2; } while (0) +#define cpus_weight_const(map) cpus_weight(map) +#define first_cpu_const(map) first_cpu(map) +#define next_cpu_const(cpu, map) next_cpu(cpu, map) + +/* only ever use this for things that are _never_ used on large boxen */ +#define cpus_coerce_const(map) cpus_coerce(map) +#define any_online_cpu_const(map) any_online_cpu(map) + +#endif /* __ASM_GENERIC_CPUMASK_CONST_VALUE_H */ diff -prauN linux-2.6.0-test1/include/asm-generic/cpumask_up.h wli-2.6.0-test1-37/include/asm-generic/cpumask_up.h --- linux-2.6.0-test1/include/asm-generic/cpumask_up.h 1969-12-31 16:00:00.000000000 -0800 +++ wli-2.6.0-test1-37/include/asm-generic/cpumask_up.h 2003-07-14 06:31:10.000000000 -0700 @@ -0,0 +1,60 @@ +#ifndef __ASM_GENERIC_CPUMASK_UP_H +#define __ASM_GENERIC_CPUMASK_UP_H + +#define CPU_ARRAY_SIZE BITS_TO_LONGS(NR_CPUS) + +#define cpus_coerce(map) ((map).mask[0]) + +#define cpu_set(cpu, map) do { cpus_coerce(map) = 1UL; } while (0) +#define cpu_clear(cpu, map) do { cpus_coerce(map) = 0UL; } while (0) +#define cpu_isset(cpu, map) (cpus_coerce(map) != 0UL) +#define 
cpu_test_and_set(cpu, map) test_and_set_bit(0, (map).mask) + +#define cpus_and(dst, src1, src2) \ + do { \ + if (cpus_coerce(src1) && cpus_coerce(src2)) \ + cpus_coerce(dst) = 1UL; \ + else \ + cpus_coerce(dst) = 0UL; \ + } while (0) + +#define cpus_or(dst, src1, src2) \ + do { \ + if (cpus_coerce(src1) || cpus_coerce(src2)) \ + cpus_coerce(dst) = 1UL; \ + else \ + cpus_coerce(dst) = 0UL; \ + } while (0) + +#define cpus_clear(map) do { cpus_coerce(map) = 0UL; } while (0) + +#define cpus_complement(map) \ + do { \ + cpus_coerce(map) = !cpus_coerce(map); \ + } while (0) + +#define cpus_equal(map1, map2) (cpus_coerce(map1) == cpus_coerce(map2)) +#define cpus_empty(map) (cpus_coerce(map) == 0UL) +#define cpus_weight(map) (cpus_coerce(map) ? 1UL : 0UL) +#define cpus_shift_right(d, s, n) do { cpus_coerce(d) = 0UL; } while (0) +#define cpus_shift_left(d, s, n) do { cpus_coerce(d) = 0UL; } while (0) +#define first_cpu(map) (cpus_coerce(map) ? 0 : 1) +#define next_cpu(cpu, map) 1 + +/* only ever use this for things that are _never_ used on large boxen */ +#define cpus_promote(map) \ + ({ \ + cpumask_t __tmp__; \ + cpus_coerce(__tmp__) = map; \ + __tmp__; \ + }) +#define cpumask_of_cpu(cpu) cpus_promote(1) +#define any_online_cpu(map) (cpus_coerce(map) ? 0 : 1) + +/* + * um, these need to be usable as static initializers + */ +#define CPU_MASK_ALL { {[0 ... CPU_ARRAY_SIZE-1] = 1UL} } +#define CPU_MASK_NONE { {[0 ... CPU_ARRAY_SIZE-1] = 0UL} } + +#endif /* __ASM_GENERIC_CPUMASK_UP_H */ diff -prauN linux-2.6.0-test1/include/asm-generic/percpu.h wli-2.6.0-test1-37/include/asm-generic/percpu.h --- linux-2.6.0-test1/include/asm-generic/percpu.h 2003-07-13 20:32:28.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-generic/percpu.h 2003-07-14 09:29:09.000000000 -0700 @@ -28,8 +28,8 @@ static inline void percpu_modcopy(void * #define DEFINE_PER_CPU(type, name) \ __typeof__(type) name##__per_cpu -#define per_cpu(var, cpu) ((void)cpu, var##__per_cpu) -#define __get_cpu_var(var) var##__per_cpu +#define per_cpu(var, cpu) ( (void)(cpu), var##__per_cpu ) +#define __get_cpu_var(var) var##__per_cpu #endif /* SMP */ diff -prauN linux-2.6.0-test1/include/asm-generic/rmap.h wli-2.6.0-test1-37/include/asm-generic/rmap.h --- linux-2.6.0-test1/include/asm-generic/rmap.h 2003-07-13 20:35:14.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-generic/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,90 +0,0 @@ -#ifndef _GENERIC_RMAP_H -#define _GENERIC_RMAP_H -/* - * linux/include/asm-generic/rmap.h - * - * Architecture dependent parts of the reverse mapping code, - * this version should work for most architectures with a - * 'normal' page table layout. - * - * We use the struct page of the page table page to find out - * the process and full address of a page table entry: - * - page->mapping points to the process' mm_struct - * - page->index has the high bits of the address - * - the lower bits of the address are calculated from the - * offset of the page table entry within the page table page - * - * For CONFIG_HIGHPTE, we need to represent the address of a pte in a - * scalar pte_addr_t. The pfn of the pte's page is shifted left by PAGE_SIZE - * bits and is then ORed with the byte offset of the pte within its page. - * - * For CONFIG_HIGHMEM4G, the pte_addr_t is 32 bits. 20 for the pfn, 12 for - * the offset. - * - * For CONFIG_HIGHMEM64G, the pte_addr_t is 64 bits. 52 for the pfn, 12 for - * the offset. 
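The encoding just described is easier to see with concrete numbers; an illustrative example (values invented) for 4KiB pages:

	/* pte page at pfn 0x12345, pte at byte offset 0x2a8 within it: */
	pte_addr_t paddr = ((pte_addr_t)0x12345 << PAGE_SHIFT) | 0x2a8;
	/* = 0x123452a8: the low 12 bits recover the byte offset, the
	 * remaining bits the pfn, as ptep_to_paddr() below computes. */

Note that the comment says the pfn is shifted left by PAGE_SIZE bits, but the code (and this example) shifts by PAGE_SHIFT, i.e. 12 for 4KiB pages.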
- */ -#include - -static inline void pgtable_add_rmap(struct page * page, struct mm_struct * mm, unsigned long address) -{ -#ifdef BROKEN_PPC_PTE_ALLOC_ONE - /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... ;( */ - extern int mem_init_done; - - if (!mem_init_done) - return; -#endif - page->mapping = (void *)mm; - page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); - inc_page_state(nr_page_table_pages); -} - -static inline void pgtable_remove_rmap(struct page * page) -{ - page->mapping = NULL; - page->index = 0; - dec_page_state(nr_page_table_pages); -} - -static inline struct mm_struct * ptep_to_mm(pte_t * ptep) -{ - struct page * page = kmap_atomic_to_page(ptep); - return (struct mm_struct *) page->mapping; -} - -static inline unsigned long ptep_to_address(pte_t * ptep) -{ - struct page * page = kmap_atomic_to_page(ptep); - unsigned long low_bits; - low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE; - return page->index + low_bits; -} - -#ifdef CONFIG_HIGHPTE -static inline pte_addr_t ptep_to_paddr(pte_t *ptep) -{ - pte_addr_t paddr; - paddr = ((pte_addr_t)page_to_pfn(kmap_atomic_to_page(ptep))) << PAGE_SHIFT; - return paddr + (pte_addr_t)((unsigned long)ptep & ~PAGE_MASK); -} -#else -static inline pte_addr_t ptep_to_paddr(pte_t *ptep) -{ - return (pte_addr_t)ptep; -} -#endif - -#ifndef CONFIG_HIGHPTE -static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) -{ - return (pte_t *)pte_paddr; -} - -static inline void rmap_ptep_unmap(pte_t *pte) -{ - return; -} -#endif - -#endif /* _GENERIC_RMAP_H */ diff -prauN linux-2.6.0-test1/include/asm-h8300/pgtable.h wli-2.6.0-test1-37/include/asm-h8300/pgtable.h --- linux-2.6.0-test1/include/asm-h8300/pgtable.h 2003-07-13 20:33:13.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-h8300/pgtable.h 2003-07-14 06:49:00.000000000 -0700 @@ -15,6 +15,11 @@ typedef pte_t *pte_addr_t; #define pgd_clear(pgdp) #define kern_addr_valid(addr) (1) #define pmd_offset(a, b) ((void *)0) +#define pmd_offset_kernel(a,b) pmd_offset(a,b) +#define pmd_offset_map(a,b) pmd_offset(a,b) +#define pmd_offset_map_nested(a,b) pmd_offset(a,b) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) #define PAGE_NONE __pgprot(0) /* these mean nothing to NO_MM */ #define PAGE_SHARED __pgprot(0) /* these mean nothing to NO_MM */ diff -prauN linux-2.6.0-test1/include/asm-i386/a.out.h wli-2.6.0-test1-37/include/asm-i386/a.out.h --- linux-2.6.0-test1/include/asm-i386/a.out.h 2003-07-13 20:33:10.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/a.out.h 2003-07-14 09:33:21.000000000 -0700 @@ -19,7 +19,11 @@ struct exec #ifdef __KERNEL__ +#ifdef CONFIG_MMAP_TOPDOWN +#define STACK_TOP (128 << 20) +#else #define STACK_TOP TASK_SIZE +#endif #endif diff -prauN linux-2.6.0-test1/include/asm-i386/atomic.h wli-2.6.0-test1-37/include/asm-i386/atomic.h --- linux-2.6.0-test1/include/asm-i386/atomic.h 2003-07-13 20:32:29.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/atomic.h 2003-07-14 06:31:10.000000000 -0700 @@ -193,7 +193,7 @@ __asm__ __volatile__(LOCK "andl %0,%1" \ #define atomic_set_mask(mask, addr) \ __asm__ __volatile__(LOCK "orl %0,%1" \ -: : "r" (mask),"m" (*addr) : "memory") +: : "r" (mask),"m" (*(addr)) : "memory") /* Atomic operations are already serializing on x86 */ #define smp_mb__before_atomic_dec() barrier() diff -prauN linux-2.6.0-test1/include/asm-i386/bitops.h wli-2.6.0-test1-37/include/asm-i386/bitops.h --- linux-2.6.0-test1/include/asm-i386/bitops.h 2003-07-13 20:35:52.000000000 -0700 +++ 
wli-2.6.0-test1-37/include/asm-i386/bitops.h 2003-07-14 06:31:10.000000000 -0700 @@ -270,7 +270,7 @@ static __inline__ int variable_test_bit( * Returns the bit-number of the first zero bit, not the number of the byte * containing a bit. */ -static __inline__ int find_first_zero_bit(unsigned long * addr, unsigned size) +static __inline__ int find_first_zero_bit(const unsigned long *addr, unsigned size) { int d0, d1, d2; int res; @@ -302,7 +302,7 @@ static __inline__ int find_first_zero_bi * Returns the bit-number of the first set bit, not the number of the byte * containing a bit. */ -static __inline__ int find_first_bit(unsigned long * addr, unsigned size) +static __inline__ int find_first_bit(const unsigned long *addr, unsigned size) { int d0, d1; int res; @@ -328,7 +328,7 @@ static __inline__ int find_first_bit(uns * @offset: The bitnumber to start searching at * @size: The maximum size to search */ -static __inline__ int find_next_zero_bit(unsigned long * addr, int size, int offset) +static __inline__ int find_next_zero_bit(const unsigned long *addr, int size, int offset) { unsigned long * p = ((unsigned long *) addr) + (offset >> 5); int set = 0, bit = offset & 31, res; @@ -361,9 +361,9 @@ static __inline__ int find_next_zero_bit * @offset: The bitnumber to start searching at * @size: The maximum size to search */ -static __inline__ int find_next_bit(unsigned long *addr, int size, int offset) +static __inline__ int find_next_bit(const unsigned long *addr, int size, int offset) { - unsigned long * p = addr + (offset >> 5); + const unsigned long *p = addr + (offset >> 5); int set = 0, bit = offset & 31, res; if (bit) { @@ -430,7 +430,7 @@ static __inline__ unsigned long __ffs(un * unlikely to be set. It's guaranteed that at least one of the 140 * bits is cleared. */ -static inline int sched_find_first_bit(unsigned long *b) +static inline int sched_find_first_bit(const unsigned long *b) { if (unlikely(b[0])) return __ffs(b[0]); diff -prauN linux-2.6.0-test1/include/asm-i386/genapic.h wli-2.6.0-test1-37/include/asm-i386/genapic.h --- linux-2.6.0-test1/include/asm-i386/genapic.h 2003-07-13 20:34:42.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/genapic.h 2003-07-14 06:33:08.000000000 -0700 @@ -1,13 +1,13 @@ #ifndef _ASM_GENAPIC_H #define _ASM_GENAPIC_H 1 -/* +/* * Generic APIC driver interface. - * - * An straight forward mapping of the APIC related parts of the + * + * A straightforward mapping of the APIC-related parts of the * x86 subarchitecture interface to a dynamic object. - * - * This is used by the "generic" x86 subarchitecture. + * + * This is used by the "generic" x86 subarchitecture. * * Copyright 2003 Andi Kleen, SuSE Labs.
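Every hook in the struct below that used to traffic in raw unsigned long bitmaps now takes or returns the typed masks, and a subarchitecture fills in the table accordingly. A hedged sketch of what a flat-mode implementation might look like (the flat_* names are assumptions; the bodies mirror the mach-default versions later in this patch):

	static cpumask_t flat_target_cpus(void)
	{
		return cpu_online_map;			/* now a cpumask_t */
	}

	static unsigned int flat_cpu_mask_to_apicid(cpumask_const_t cpumask)
	{
		return cpus_coerce_const(cpumask);	/* flat mode: mask bits are APIC id bits */
	}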
*/ @@ -22,23 +22,23 @@ struct genapic { int (*probe)(void); int (*apic_id_registered)(void); - unsigned long (*target_cpus)(void); + cpumask_t (*target_cpus)(void); int int_delivery_mode; int int_dest_mode; int apic_broadcast_id; int esr_disable; - unsigned long (*check_apicid_used)(unsigned long bitmap, int apicid); + unsigned long (*check_apicid_used)(physid_mask_t bitmap, int apicid); unsigned long (*check_apicid_present)(int apicid); int no_balance_irq; void (*init_apic_ldr)(void); - unsigned long (*ioapic_phys_id_map)(unsigned long map); + physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map); void (*clustered_apic_check)(void); int (*multi_timer_check)(int apic, int irq); int (*apicid_to_node)(int logical_apicid); int (*cpu_to_logical_apicid)(int cpu); int (*cpu_present_to_apicid)(int mps_cpu); - unsigned long (*apicid_to_cpu_present)(int phys_apicid); + physid_mask_t (*apicid_to_cpu_present)(int phys_apicid); int (*mpc_apic_id)(struct mpc_config_processor *m, struct mpc_config_translation *t); void (*setup_portio_remap)(void); @@ -59,11 +59,11 @@ struct genapic { int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); unsigned (*get_apic_id)(unsigned long x); - unsigned long apic_id_mask; - unsigned int (*cpu_mask_to_apicid)(unsigned long cpumask); + unsigned long apic_id_mask; + unsigned int (*cpu_mask_to_apicid)(cpumask_const_t cpumask); /* ipi */ - void (*send_IPI_mask)(int mask, int vector); + void (*send_IPI_mask)(cpumask_t mask, int vector); void (*send_IPI_allbutself)(int vector); void (*send_IPI_all)(int vector); }; diff -prauN linux-2.6.0-test1/include/asm-i386/highmem.h wli-2.6.0-test1-37/include/asm-i386/highmem.h --- linux-2.6.0-test1/include/asm-i386/highmem.h 2003-07-13 20:30:38.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/highmem.h 2003-07-14 08:29:24.000000000 -0700 @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -39,7 +40,12 @@ extern void kmap_init(void); * easily, subsequent pte tables have to be allocated in one physical * chunk of RAM. 
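Nothing changes for callers when PKMAP_BASE moves or kmap_atomic() is inlined below; the usual pairing (illustrative) still applies:

	void *dst = kmap_atomic(page, KM_USER0);	/* lowmem pages short-circuit */
	memcpy(dst, src, PAGE_SIZE);			/* src assumed mapped elsewhere */
	kunmap_atomic(dst, KM_USER0);

With the new inline fast path, a lowmem page reduces this to lowmem_page_address() bracketed by the preempt-count operations; only highmem pages touch a fixmap slot.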
*/ -#define PKMAP_BASE (0xff800000UL) +#if NR_CPUS <= 32 +#define PKMAP_BASE (0xff400000UL) +#else +#define PKMAP_BASE (0xfe800000UL) +#endif + #ifdef CONFIG_X86_PAE #define LAST_PKMAP 512 #else @@ -49,14 +55,63 @@ extern void kmap_init(void); #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void * FASTCALL(kmap_high(struct page *page)); -extern void FASTCALL(kunmap_high(struct page *page)); +void *FASTCALL(kmap_high(struct page *page)); +void FASTCALL(kunmap_high(struct page *page)); + +void *FASTCALL(__kmap_atomic(struct page *page, enum km_type type, unsigned long vaddr)); + +#define page_is_low(page) ((page) < highmem_start_page) +#define addr_is_low(addr) ((unsigned long)(addr) < FIXADDR_START) + +static inline void *kmap(struct page *page) +{ + might_sleep(); + if (page_is_low(page)) + return lowmem_page_address(page); + else + return kmap_high(page); +} + +static inline void kunmap(struct page *page) +{ + BUG_ON(in_interrupt()); + if (!page_is_low(page)) + kunmap_high(page); +} + +static inline void *kmap_atomic(struct page *page, enum km_type type) +{ + inc_preempt_count(); + if (page_is_low(page)) + return lowmem_page_address(page); + else + return __kmap_atomic(page, type, __fix_to_virt(FIX_KMAP_BEGIN + type)); +} + +#ifdef CONFIG_DEBUG_HIGHMEM +void FASTCALL(__kunmap_atomic(void *kvaddr, enum km_type type, unsigned long vaddr)); +#else +static inline void __kunmap_atomic(void *kvaddr, enum km_type idx, unsigned long vaddr) +{ +} +#endif -void *kmap(struct page *page); -void kunmap(struct page *page); -void *kmap_atomic(struct page *page, enum km_type type); -void kunmap_atomic(void *kvaddr, enum km_type type); -struct page *kmap_atomic_to_page(void *ptr); +static inline void kunmap_atomic(void *kvaddr, enum km_type type) +{ + if (!addr_is_low(kvaddr)) + __kunmap_atomic(kvaddr, type, __fix_to_virt(FIX_KMAP_BEGIN + type)); + dec_preempt_count(); +} + +static inline struct page *kmap_atomic_to_page(void *vaddr) +{ + if (addr_is_low(vaddr)) + return virt_to_page(vaddr); + else { + unsigned long idx = virt_to_fix((unsigned long)vaddr); + return pte_page(*(kmap_pte - (idx - FIX_KMAP_BEGIN))); + } +} #endif /* __KERNEL__ */ diff -prauN linux-2.6.0-test1/include/asm-i386/hw_irq.h wli-2.6.0-test1-37/include/asm-i386/hw_irq.h --- linux-2.6.0-test1/include/asm-i386/hw_irq.h 2003-07-13 20:38:53.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/hw_irq.h 2003-07-14 06:31:10.000000000 -0700 @@ -30,33 +30,33 @@ extern int irq_vector[NR_IRQS]; extern void (*interrupt[NR_IRQS])(void); #ifdef CONFIG_SMP -extern asmlinkage void reschedule_interrupt(void); -extern asmlinkage void invalidate_interrupt(void); -extern asmlinkage void call_function_interrupt(void); +asmlinkage void reschedule_interrupt(void); +asmlinkage void invalidate_interrupt(void); +asmlinkage void call_function_interrupt(void); #endif #ifdef CONFIG_X86_LOCAL_APIC -extern asmlinkage void apic_timer_interrupt(void); -extern asmlinkage void error_interrupt(void); -extern asmlinkage void spurious_interrupt(void); -extern asmlinkage void thermal_interrupt(struct pt_regs); +asmlinkage void apic_timer_interrupt(void); +asmlinkage void error_interrupt(void); +asmlinkage void spurious_interrupt(void); +asmlinkage void thermal_interrupt(struct pt_regs); #endif -extern void mask_irq(unsigned int irq); -extern void unmask_irq(unsigned int irq); -extern void disable_8259A_irq(unsigned int irq); -extern void enable_8259A_irq(unsigned int irq); -extern int 
i8259A_irq_pending(unsigned int irq); -extern void make_8259A_irq(unsigned int irq); -extern void init_8259A(int aeoi); -extern void FASTCALL(send_IPI_self(int vector)); -extern void init_VISWS_APIC_irqs(void); -extern void setup_IO_APIC(void); -extern void disable_IO_APIC(void); -extern void print_IO_APIC(void); -extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn); -extern void send_IPI(int dest, int vector); -extern void setup_ioapic_dest(unsigned long mask); +void mask_irq(unsigned int irq); +void unmask_irq(unsigned int irq); +void disable_8259A_irq(unsigned int irq); +void enable_8259A_irq(unsigned int irq); +int i8259A_irq_pending(unsigned int irq); +void make_8259A_irq(unsigned int irq); +void init_8259A(int aeoi); +void FASTCALL(send_IPI_self(int vector)); +void init_VISWS_APIC_irqs(void); +void setup_IO_APIC(void); +void disable_IO_APIC(void); +void print_IO_APIC(void); +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn); +void send_IPI(int dest, int vector); +void setup_ioapic_dest(cpumask_t mask); extern unsigned long io_apic_irqs; diff -prauN linux-2.6.0-test1/include/asm-i386/kmap_types.h wli-2.6.0-test1-37/include/asm-i386/kmap_types.h --- linux-2.6.0-test1/include/asm-i386/kmap_types.h 2003-07-13 20:36:38.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/kmap_types.h 2003-07-14 06:49:00.000000000 -0700 @@ -17,14 +17,16 @@ D(3) KM_USER0, D(4) KM_USER1, D(5) KM_BIO_SRC_IRQ, D(6) KM_BIO_DST_IRQ, -D(7) KM_PTE0, -D(8) KM_PTE1, -D(9) KM_PTE2, -D(10) KM_IRQ0, -D(11) KM_IRQ1, -D(12) KM_SOFTIRQ0, -D(13) KM_SOFTIRQ1, -D(14) KM_TYPE_NR +D(7) KM_PMD0, +D(8) KM_PMD1, +D(9) KM_PTE0, +D(10) KM_PTE1, +D(11) KM_PTE2, +D(12) KM_IRQ0, +D(13) KM_IRQ1, +D(14) KM_SOFTIRQ0, +D(15) KM_SOFTIRQ1, +D(16) KM_TYPE_NR }; #undef D diff -prauN linux-2.6.0-test1/include/asm-i386/linkage.h wli-2.6.0-test1-37/include/asm-i386/linkage.h --- linux-2.6.0-test1/include/asm-i386/linkage.h 2003-07-13 20:39:32.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/linkage.h 2003-07-14 08:39:52.000000000 -0700 @@ -3,6 +3,7 @@ #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) #define FASTCALL(x) x __attribute__((regparm(3))) +#define IRQHANDLER(x) x __attribute__((regparm(1))) #ifdef CONFIG_X86_ALIGNMENT_16 #define __ALIGN .align 16,0x90 diff -prauN linux-2.6.0-test1/include/asm-i386/mach-bigsmp/mach_apic.h wli-2.6.0-test1-37/include/asm-i386/mach-bigsmp/mach_apic.h --- linux-2.6.0-test1/include/asm-i386/mach-bigsmp/mach_apic.h 2003-07-13 20:38:35.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/mach-bigsmp/mach_apic.h 2003-07-14 06:33:08.000000000 -0700 @@ -20,7 +20,7 @@ static inline int apic_id_registered(voi } #define APIC_DFR_VALUE (APIC_DFR_CLUSTER) -static inline unsigned long target_cpus(void) +static inline cpumask_t target_cpus(void) { return cpu_online_map; } @@ -29,14 +29,15 @@ static inline unsigned long target_cpus( #define INT_DELIVERY_MODE dest_LowestPrio #define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */ -#define APIC_BROADCAST_ID (0x0f) -static inline unsigned long check_apicid_used(unsigned long bitmap, int apicid) -{ +#define APIC_BROADCAST_ID (0xff) +static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) +{ return 0; -} +} + static inline unsigned long check_apicid_present(int bit) { - return (phys_cpu_present_map & (1 << bit)); + return physid_isset(bit, phys_cpu_present_map); } #define apicid_cluster(apicid) (apicid & 0xF0) @@ -88,9 +89,9 @@ static inline int cpu_present_to_apicid( return (int) 
bios_cpu_apicid[mps_cpu]; } -static inline unsigned long apicid_to_cpu_present(int phys_apicid) +static inline physid_mask_t apicid_to_cpu_present(int phys_apicid) { - return (1ul << phys_apicid); + return physid_mask_of_physid(phys_apicid); } extern volatile u8 cpu_2_logical_apicid[]; @@ -108,13 +109,13 @@ static inline int mpc_apic_id(struct mpc (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, m->mpc_apicver); - return (m->mpc_apicid); + return m->mpc_apicid; } -static inline ulong ioapic_phys_id_map(ulong phys_map) +static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map) { /* For clustered we don't have a good way to do this yet - hack */ - return (0x0F); + return physids_promote(0xFUL); } #define WAKE_SECONDARY_VIA_INIT @@ -132,25 +133,25 @@ static inline int check_phys_apicid_pres return (1); } -static inline unsigned int cpu_mask_to_apicid (unsigned long cpumask) +static inline unsigned int cpu_mask_to_apicid(cpumask_const_t cpumask) { int num_bits_set; int cpus_found = 0; int cpu; int apicid; - num_bits_set = hweight32(cpumask); + num_bits_set = cpus_weight_const(cpumask); /* Return id to all */ - if (num_bits_set == 32) + if (num_bits_set == NR_CPUS) return (int) 0xFF; /* * The cpus in the mask must all be on the apic cluster. If are not * on the same apicid cluster return default value of TARGET_CPUS. */ - cpu = ffs(cpumask)-1; + cpu = first_cpu_const(cpumask); apicid = cpu_to_logical_apicid(cpu); while (cpus_found < num_bits_set) { - if (cpumask & (1 << cpu)) { + if (cpu_isset_const(cpu, cpumask)) { int new_apicid = cpu_to_logical_apicid(cpu); if (apicid_cluster(apicid) != apicid_cluster(new_apicid)){ diff -prauN linux-2.6.0-test1/include/asm-i386/mach-bigsmp/mach_ipi.h wli-2.6.0-test1-37/include/asm-i386/mach-bigsmp/mach_ipi.h --- linux-2.6.0-test1/include/asm-i386/mach-bigsmp/mach_ipi.h 2003-07-13 20:33:13.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/mach-bigsmp/mach_ipi.h 2003-07-14 06:31:10.000000000 -0700 @@ -1,18 +1,19 @@ #ifndef __ASM_MACH_IPI_H #define __ASM_MACH_IPI_H -inline void send_IPI_mask_sequence(int mask, int vector); +inline void send_IPI_mask_sequence(cpumask_t mask, int vector); -static inline void send_IPI_mask(int mask, int vector) +static inline void send_IPI_mask(cpumask_t mask, int vector) { send_IPI_mask_sequence(mask, vector); } static inline void send_IPI_allbutself(int vector) { - unsigned long mask = cpu_online_map & ~(1 << smp_processor_id()); + cpumask_t mask = cpu_online_map; + cpu_clear(smp_processor_id(), mask); - if (mask) + if (!cpus_empty(mask)) send_IPI_mask(mask, vector); } diff -prauN linux-2.6.0-test1/include/asm-i386/mach-default/mach_apic.h wli-2.6.0-test1-37/include/asm-i386/mach-default/mach_apic.h --- linux-2.6.0-test1/include/asm-i386/mach-default/mach_apic.h 2003-07-13 20:39:27.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/mach-default/mach_apic.h 2003-07-14 06:33:08.000000000 -0700 @@ -5,12 +5,12 @@ #define APIC_DFR_VALUE (APIC_DFR_FLAT) -static inline unsigned long target_cpus(void) +static inline cpumask_t target_cpus(void) { #ifdef CONFIG_SMP return cpu_online_map; #else - return 1; + return cpumask_of_cpu(0); #endif } #define TARGET_CPUS (target_cpus()) @@ -21,16 +21,20 @@ static inline unsigned long target_cpus( #define INT_DELIVERY_MODE dest_LowestPrio #define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */ +/* + * this isn't really broadcast, just a (potentially inaccurate) upper + * bound for valid physical APIC id's + */ #define 
APIC_BROADCAST_ID 0x0F -static inline unsigned long check_apicid_used(unsigned long bitmap, int apicid) -{ - return (bitmap & (1UL << apicid)); -} +static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) +{ + return physid_isset(apicid, bitmap); +} -static inline unsigned long check_apicid_present(int bit) +static inline unsigned long check_apicid_present(int bit) { - return (phys_cpu_present_map & (1UL << bit)); + return physid_isset(bit, phys_cpu_present_map); } /* @@ -50,7 +54,7 @@ static inline void init_apic_ldr(void) apic_write_around(APIC_LDR, val); } -static inline unsigned long ioapic_phys_id_map(unsigned long phys_map) +static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map) { return phys_map; } @@ -82,9 +86,9 @@ static inline int cpu_present_to_apicid( return mps_cpu; } -static inline unsigned long apicid_to_cpu_present(int phys_apicid) +static inline physid_mask_t apicid_to_cpu_present(int phys_apicid) { - return (1ul << phys_apicid); + return physid_mask_of_physid(phys_apicid); } static inline int mpc_apic_id(struct mpc_config_processor *m, @@ -104,18 +108,17 @@ static inline void setup_portio_remap(vo static inline int check_phys_apicid_present(int boot_cpu_physical_apicid) { - return test_bit(boot_cpu_physical_apicid, &phys_cpu_present_map); + return physid_isset(boot_cpu_physical_apicid, phys_cpu_present_map); } static inline int apic_id_registered(void) { - return (test_bit(GET_APIC_ID(apic_read(APIC_ID)), - &phys_cpu_present_map)); + return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map); } -static inline unsigned int cpu_mask_to_apicid (unsigned long cpumask) +static inline unsigned int cpu_mask_to_apicid(cpumask_const_t cpumask) { - return cpumask; + return cpus_coerce_const(cpumask); } static inline void enable_apic_mode(void) diff -prauN linux-2.6.0-test1/include/asm-i386/mach-default/mach_ipi.h wli-2.6.0-test1-37/include/asm-i386/mach-default/mach_ipi.h --- linux-2.6.0-test1/include/asm-i386/mach-default/mach_ipi.h 2003-07-13 20:37:36.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/mach-default/mach_ipi.h 2003-07-14 06:31:10.000000000 -0700 @@ -1,10 +1,10 @@ #ifndef __ASM_MACH_IPI_H #define __ASM_MACH_IPI_H -inline void send_IPI_mask_bitmask(int mask, int vector); +inline void send_IPI_mask_bitmask(cpumask_t mask, int vector); inline void __send_IPI_shortcut(unsigned int shortcut, int vector); -static inline void send_IPI_mask(int mask, int vector) +static inline void send_IPI_mask(cpumask_t mask, int vector) { send_IPI_mask_bitmask(mask, vector); } diff -prauN linux-2.6.0-test1/include/asm-i386/mach-es7000/mach_apic.h wli-2.6.0-test1-37/include/asm-i386/mach-es7000/mach_apic.h --- linux-2.6.0-test1/include/asm-i386/mach-es7000/mach_apic.h 2003-07-13 20:29:27.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/mach-es7000/mach_apic.h 2003-07-14 06:33:08.000000000 -0700 @@ -11,12 +11,12 @@ static inline int apic_id_registered(voi return (1); } -static inline unsigned long target_cpus(void) +static inline cpumask_t target_cpus(void) { #if defined CONFIG_ES7000_CLUSTERED_APIC - return (0xff); + return CPU_MASK_ALL; #else - return (bios_cpu_apicid[smp_processor_id()]); + return cpumask_of_cpu(bios_cpu_apicid[smp_processor_id()]); #endif } #define TARGET_CPUS (target_cpus()) @@ -40,13 +40,13 @@ static inline unsigned long target_cpus( #define APIC_BROADCAST_ID (0xff) -static inline unsigned long check_apicid_used(unsigned long bitmap, int apicid) +static inline unsigned long 
check_apicid_used(physid_mask_t bitmap, int apicid) { return 0; } static inline unsigned long check_apicid_present(int bit) { - return (phys_cpu_present_map & (1 << bit)); + return physid_isset(bit, phys_cpu_present_map); } #define apicid_cluster(apicid) (apicid & 0xF0) @@ -88,7 +88,7 @@ static inline void clustered_apic_check( int apic = bios_cpu_apicid[smp_processor_id()]; printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", (apic_version[apic] == 0x14) ? - "Physical Cluster" : "Logical Cluster", nr_ioapics, TARGET_CPUS); + "Physical Cluster" : "Logical Cluster", nr_ioapics, cpus_coerce(TARGET_CPUS)); } static inline int multi_timer_check(int apic, int irq) @@ -110,10 +110,13 @@ static inline int cpu_present_to_apicid( return (int) bios_cpu_apicid[mps_cpu]; } -static inline unsigned long apicid_to_cpu_present(int phys_apicid) +static inline physid_mask_t apicid_to_cpu_present(int phys_apicid) { - static int cpu = 0; - return (1ul << cpu++); + static int id = 0; + physid_mask_t mask; + mask = physid_mask_of_physid(id); + ++id; + return mask; } extern volatile u8 cpu_2_logical_apicid[]; @@ -123,7 +126,7 @@ static inline int cpu_to_logical_apicid( return (int)cpu_2_logical_apicid[cpu]; } -static inline int mpc_apic_id(struct mpc_config_processor *m, int quad) +static inline int mpc_apic_id(struct mpc_config_processor *m, struct mpc_config_translation *unused) { printk("Processor #%d %ld:%ld APIC version %d\n", m->mpc_apicid, @@ -133,10 +136,10 @@ static inline int mpc_apic_id(struct mpc return (m->mpc_apicid); } -static inline ulong ioapic_phys_id_map(ulong phys_map) +static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map) { /* For clustered we don't have a good way to do this yet - hack */ - return (0xff); + return physids_promote(0xff); } @@ -151,32 +154,30 @@ static inline int check_phys_apicid_pres return (1); } -static inline unsigned int cpu_mask_to_apicid (unsigned long cpumask) +static inline unsigned int cpu_mask_to_apicid(cpumask_const_t cpumask) { int num_bits_set; int cpus_found = 0; int cpu; int apicid; - if (cpumask == TARGET_CPUS) - return cpumask; - num_bits_set = hweight32(cpumask); + num_bits_set = cpus_weight_const(cpumask); /* Return id to all */ - if (num_bits_set == 32) - return TARGET_CPUS; + if (num_bits_set == NR_CPUS) + return 0xFF; /* * The cpus in the mask must all be on the apic cluster. If are not * on the same apicid cluster return default value of TARGET_CPUS. 
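From the caller's side, the converted cpu_mask_to_apicid() implementations differ only in taking a cpumask_const_t; an existing mask is wrapped with mk_cpumask_const() (hedged sketch using this patch's names):

	cpumask_t mask = cpumask_of_cpu(cpu);
	int apicid = cpu_mask_to_apicid(mk_cpumask_const(mask));

Depending on whether the architecture picks the cpumask_const_value or cpumask_const_reference representation above, the wrapper passes the mask by value or by pointer without the caller changing.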
*/ - cpu = ffs(cpumask)-1; + cpu = first_cpu_const(cpumask); apicid = cpu_to_logical_apicid(cpu); while (cpus_found < num_bits_set) { - if (cpumask & (1 << cpu)) { + if (cpu_isset_const(cpu, cpumask)) { int new_apicid = cpu_to_logical_apicid(cpu); if (apicid_cluster(apicid) != apicid_cluster(new_apicid)){ printk ("%s: Not a valid mask!\n",__FUNCTION__); - return TARGET_CPUS; + return 0xFF; } apicid = new_apicid; cpus_found++; diff -prauN linux-2.6.0-test1/include/asm-i386/mach-es7000/mach_ipi.h wli-2.6.0-test1-37/include/asm-i386/mach-es7000/mach_ipi.h --- linux-2.6.0-test1/include/asm-i386/mach-es7000/mach_ipi.h 2003-07-13 20:32:34.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/mach-es7000/mach_ipi.h 2003-07-14 06:31:10.000000000 -0700 @@ -1,18 +1,19 @@ #ifndef __ASM_MACH_IPI_H #define __ASM_MACH_IPI_H -static inline void send_IPI_mask_sequence(int mask, int vector); +static inline void send_IPI_mask_sequence(cpumask_t mask, int vector); -static inline void send_IPI_mask(int mask, int vector) +static inline void send_IPI_mask(cpumask_t mask, int vector) { send_IPI_mask_sequence(mask, vector); } static inline void send_IPI_allbutself(int vector) { - unsigned long mask = cpu_online_map & ~(1 << smp_processor_id()); - - if (mask) + cpumask_t mask = cpumask_of_cpu(smp_processor_id()); + cpus_complement(mask); + cpus_and(mask, mask, cpu_online_map); + if (!cpus_empty(mask)) send_IPI_mask(mask, vector); } diff -prauN linux-2.6.0-test1/include/asm-i386/mach-numaq/mach_apic.h wli-2.6.0-test1-37/include/asm-i386/mach-numaq/mach_apic.h --- linux-2.6.0-test1/include/asm-i386/mach-numaq/mach_apic.h 2003-07-13 20:33:38.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/mach-numaq/mach_apic.h 2003-07-17 02:36:57.000000000 -0700 @@ -6,7 +6,13 @@ #define APIC_DFR_VALUE (APIC_DFR_CLUSTER) -#define TARGET_CPUS (~0UL) +static inline cpumask_t target_cpus(void) +{ + cpumask_t tmp = CPU_MASK_ALL; + return tmp; +} + +#define TARGET_CPUS (target_cpus()) #define NO_BALANCE_IRQ (1) #define esr_disable (1) @@ -15,13 +21,13 @@ #define INT_DEST_MODE 0 /* physical delivery on LOCAL quad */ #define APIC_BROADCAST_ID 0x0F -#define check_apicid_used(bitmap, apicid) ((bitmap) & (1 << (apicid))) -#define check_apicid_present(bit) (phys_cpu_present_map & (1 << bit)) +#define check_apicid_used(bitmap, apicid) physid_isset(apicid, bitmap) +#define check_apicid_present(bit) physid_isset(bit, phys_cpu_present_map) #define apicid_cluster(apicid) (apicid & 0xF0) static inline int apic_id_registered(void) { - return (1); + return 1; } static inline void init_apic_ldr(void) @@ -31,6 +37,7 @@ static inline void init_apic_ldr(void) static inline void clustered_apic_check(void) { + nr_ioapics = min(2, nr_ioapics); printk("Enabling APIC mode: %s. 
Using %d I/O APICs\n", "NUMA-Q", nr_ioapics); } @@ -41,13 +48,13 @@ static inline void clustered_apic_check( */ static inline int multi_timer_check(int apic, int irq) { - return (apic != 0 && irq == 0); + return apic != 0 && irq == 0; } -static inline ulong ioapic_phys_id_map(ulong phys_map) +static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map) { /* We don't have a good way to do this yet - hack */ - return 0xf; + return physids_promote(0xFUL); } /* Mapping from cpu number to logical apicid */ @@ -59,22 +66,25 @@ static inline int cpu_to_logical_apicid( static inline int cpu_present_to_apicid(int mps_cpu) { - return ( ((mps_cpu/4)*16) + (1<<(mps_cpu%4)) ); + return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3)); } static inline int generate_logical_apicid(int quad, int phys_apicid) { - return ( (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1) ); + return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1); } static inline int apicid_to_node(int logical_apicid) { - return (logical_apicid >> 4); + return logical_apicid >> 4; } -static inline unsigned long apicid_to_cpu_present(int logical_apicid) +static inline physid_mask_t apicid_to_cpu_present(int logical_apicid) { - return ( (logical_apicid&0xf) << (4*apicid_to_node(logical_apicid)) ); + int node = apicid_to_node(logical_apicid); + int cpu = __ffs(logical_apicid & 0xf); + + return physid_mask_of_physid(cpu + 4*node); } static inline int mpc_apic_id(struct mpc_config_processor *m, @@ -115,7 +125,7 @@ static inline void enable_apic_mode(void * We use physical apicids here, not logical, so just return the default * physical broadcast to stop people from breaking us */ -static inline unsigned int cpu_mask_to_apicid (unsigned long cpumask) +static inline unsigned int cpu_mask_to_apicid(cpumask_const_t cpumask) { return (int) 0xF; } diff -prauN linux-2.6.0-test1/include/asm-i386/mach-numaq/mach_ipi.h wli-2.6.0-test1-37/include/asm-i386/mach-numaq/mach_ipi.h --- linux-2.6.0-test1/include/asm-i386/mach-numaq/mach_ipi.h 2003-07-13 20:36:42.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/mach-numaq/mach_ipi.h 2003-07-14 06:31:10.000000000 -0700 @@ -1,18 +1,19 @@ #ifndef __ASM_MACH_IPI_H #define __ASM_MACH_IPI_H -static inline void send_IPI_mask_sequence(int mask, int vector); +static inline void send_IPI_mask_sequence(cpumask_t, int vector); -static inline void send_IPI_mask(int mask, int vector) +static inline void send_IPI_mask(cpumask_t mask, int vector) { send_IPI_mask_sequence(mask, vector); } static inline void send_IPI_allbutself(int vector) { - unsigned long mask = cpu_online_map & ~(1 << smp_processor_id()); + cpumask_t mask = cpu_online_map; + cpu_clear(smp_processor_id(), mask); - if (mask) + if (!cpus_empty(mask)) send_IPI_mask(mask, vector); } diff -prauN linux-2.6.0-test1/include/asm-i386/mach-summit/mach_apic.h wli-2.6.0-test1-37/include/asm-i386/mach-summit/mach_apic.h --- linux-2.6.0-test1/include/asm-i386/mach-summit/mach_apic.h 2003-07-13 20:35:51.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/mach-summit/mach_apic.h 2003-07-14 06:33:08.000000000 -0700 @@ -18,17 +18,18 @@ static inline unsigned long xapic_phys_t #define APIC_DFR_VALUE (APIC_DFR_CLUSTER) -static inline unsigned long target_cpus(void) +static inline cpumask_t target_cpus(void) { - return (~0UL); + cpumask_t tmp = CPU_MASK_ALL; + return tmp; } #define TARGET_CPUS (target_cpus()) #define INT_DELIVERY_MODE (dest_Fixed) #define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */ -#define APIC_BROADCAST_ID (0x0F) -static 
inline unsigned long check_apicid_used(unsigned long bitmap, int apicid) +#define APIC_BROADCAST_ID (0xFF) +static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) { return 0; } @@ -72,7 +73,7 @@ static inline void clustered_apic_check( static inline int apicid_to_node(int logical_apicid) { - return (logical_apicid >> 5); /* 2 clusterids per CEC */ + return logical_apicid >> 5; /* 2 clusterids per CEC */ } /* Mapping from cpu number to logical apicid */ @@ -87,15 +88,15 @@ static inline int cpu_present_to_apicid( return (int) bios_cpu_apicid[mps_cpu]; } -static inline ulong ioapic_phys_id_map(ulong phys_map) +static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_id_map) { /* For clustered we don't have a good way to do this yet - hack */ - return 0x0F; + return physids_promote(0x0F); } -static inline unsigned long apicid_to_cpu_present(int apicid) +static inline physid_mask_t apicid_to_cpu_present(int apicid) { - return 1; + return physid_mask_of_physid(0); } static inline int mpc_apic_id(struct mpc_config_processor *m, @@ -122,25 +123,25 @@ static inline void enable_apic_mode(void { } -static inline unsigned int cpu_mask_to_apicid (unsigned long cpumask) +static inline unsigned int cpu_mask_to_apicid(cpumask_const_t cpumask) { int num_bits_set; int cpus_found = 0; int cpu; int apicid; - num_bits_set = hweight32(cpumask); + num_bits_set = cpus_weight_const(cpumask); /* Return id to all */ - if (num_bits_set == 32) + if (num_bits_set == NR_CPUS) return (int) 0xFF; /* * The cpus in the mask must all be on the apic cluster. If are not * on the same apicid cluster return default value of TARGET_CPUS. */ - cpu = ffs(cpumask)-1; + cpu = first_cpu_const(cpumask); apicid = cpu_to_logical_apicid(cpu); while (cpus_found < num_bits_set) { - if (cpumask & (1 << cpu)) { + if (cpu_isset_const(cpu, cpumask)) { int new_apicid = cpu_to_logical_apicid(cpu); if (apicid_cluster(apicid) != apicid_cluster(new_apicid)){ diff -prauN linux-2.6.0-test1/include/asm-i386/mach-summit/mach_ipi.h wli-2.6.0-test1-37/include/asm-i386/mach-summit/mach_ipi.h --- linux-2.6.0-test1/include/asm-i386/mach-summit/mach_ipi.h 2003-07-13 20:31:22.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/mach-summit/mach_ipi.h 2003-07-14 06:31:10.000000000 -0700 @@ -1,18 +1,19 @@ #ifndef __ASM_MACH_IPI_H #define __ASM_MACH_IPI_H -inline void send_IPI_mask_sequence(int mask, int vector); +inline void send_IPI_mask_sequence(cpumask_t mask, int vector); -static inline void send_IPI_mask(int mask, int vector) +static inline void send_IPI_mask(cpumask_t mask, int vector) { send_IPI_mask_sequence(mask, vector); } static inline void send_IPI_allbutself(int vector) { - unsigned long mask = cpu_online_map & ~(1 << smp_processor_id()); + cpumask_t mask = cpu_online_map; + cpu_clear(smp_processor_id(), mask); - if (mask) + if (!cpus_empty(mask)) send_IPI_mask(mask, vector); } diff -prauN linux-2.6.0-test1/include/asm-i386/mach-visws/mach_apic.h wli-2.6.0-test1-37/include/asm-i386/mach-visws/mach_apic.h --- linux-2.6.0-test1/include/asm-i386/mach-visws/mach_apic.h 2003-07-13 20:29:22.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/mach-visws/mach_apic.h 2003-07-14 06:33:08.000000000 -0700 @@ -12,17 +12,16 @@ #ifdef CONFIG_SMP #define TARGET_CPUS cpu_online_map #else - #define TARGET_CPUS 0x01 + #define TARGET_CPUS cpumask_of_cpu(0) #endif #define APIC_BROADCAST_ID 0x0F -#define check_apicid_used(bitmap, apicid) (bitmap & (1 << apicid)) -#define check_apicid_present(bit) (phys_cpu_present_map & (1 << 
bit)) +#define check_apicid_used(bitmap, apicid) physid_isset(apicid, bitmap) +#define check_apicid_present(bit) physid_isset(bit, phys_cpu_present_map) static inline int apic_id_registered(void) { - return (test_bit(GET_APIC_ID(apic_read(APIC_ID)), - &phys_cpu_present_map)); + return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map); } /* @@ -61,9 +60,9 @@ static inline int cpu_present_to_apicid( return mps_cpu; } -static inline unsigned long apicid_to_cpu_present(int apicid) +static inline physid_mask_t apicid_to_cpu_present(int apicid) { - return (1ul << apicid); + return physid_mask_of_physid(apicid); } #define WAKE_SECONDARY_VIA_INIT @@ -78,11 +77,11 @@ static inline void enable_apic_mode(void static inline int check_phys_apicid_present(int boot_cpu_physical_apicid) { - return test_bit(boot_cpu_physical_apicid, &phys_cpu_present_map); + return physid_isset(boot_cpu_physical_apicid, phys_cpu_present_map); } -static inline unsigned int cpu_mask_to_apicid (unsigned long cpumask) +static inline unsigned int cpu_mask_to_apicid(cpumask_const_t cpumask) { - return cpumask; + return cpus_coerce_const(cpumask); } #endif /* __ASM_MACH_APIC_H */ diff -prauN linux-2.6.0-test1/include/asm-i386/mmu_context.h wli-2.6.0-test1-37/include/asm-i386/mmu_context.h --- linux-2.6.0-test1/include/asm-i386/mmu_context.h 2003-07-13 20:28:53.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/mmu_context.h 2003-07-14 06:31:10.000000000 -0700 @@ -31,12 +31,12 @@ static inline void switch_mm(struct mm_s if (likely(prev != next)) { /* stop flush ipis for the previous mm */ - clear_bit(cpu, &prev->cpu_vm_mask); + cpu_clear(cpu, prev->cpu_vm_mask); #ifdef CONFIG_SMP cpu_tlbstate[cpu].state = TLBSTATE_OK; cpu_tlbstate[cpu].active_mm = next; #endif - set_bit(cpu, &next->cpu_vm_mask); + cpu_set(cpu, next->cpu_vm_mask); /* Re-load page tables */ load_cr3(next->pgd); @@ -52,7 +52,7 @@ static inline void switch_mm(struct mm_s cpu_tlbstate[cpu].state = TLBSTATE_OK; BUG_ON(cpu_tlbstate[cpu].active_mm != next); - if (!test_and_set_bit(cpu, &next->cpu_vm_mask)) { + if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { /* We were in lazy tlb mode and leave_mm disabled * tlb flush IPI delivery. We must reload %cr3. 
*/ diff -prauN linux-2.6.0-test1/include/asm-i386/mpspec.h wli-2.6.0-test1-37/include/asm-i386/mpspec.h --- linux-2.6.0-test1/include/asm-i386/mpspec.h 2003-07-13 20:28:53.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/mpspec.h 2003-07-14 06:33:08.000000000 -0700 @@ -1,6 +1,7 @@ #ifndef __ASM_MPSPEC_H #define __ASM_MPSPEC_H +#include #include #include @@ -11,7 +12,6 @@ extern int quad_local_to_mp_bus_id [NR_C extern int mp_bus_id_to_pci_bus [MAX_MP_BUSSES]; extern unsigned int boot_cpu_physical_apicid; -extern unsigned long phys_cpu_present_map; extern int smp_found_config; extern void find_smp_config (void); extern void get_smp_config (void); @@ -41,5 +41,49 @@ extern void mp_config_ioapic_for_sci(int extern void mp_parse_prt (void); #endif /*CONFIG_ACPI_BOOT*/ +#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS) + +struct physid_mask +{ + unsigned long mask[PHYSID_ARRAY_SIZE]; +}; + +typedef struct physid_mask physid_mask_t; + +#define physid_set(physid, map) set_bit(physid, (map).mask) +#define physid_clear(physid, map) clear_bit(physid, (map).mask) +#define physid_isset(physid, map) test_bit(physid, (map).mask) +#define physid_test_and_set(physid, map) test_and_set_bit(physid, (map).mask) + +#define physids_and(dst, src1, src2) bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS) +#define physids_or(dst, src1, src2) bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS) +#define physids_clear(map) bitmap_clear((map).mask, MAX_APICS) +#define physids_complement(map) bitmap_complement((map).mask, MAX_APICS) +#define physids_empty(map) bitmap_empty((map).mask, MAX_APICS) +#define physids_equal(map1, map2) bitmap_equal((map1).mask, (map2).mask, MAX_APICS) +#define physids_weight(map) bitmap_weight((map).mask, MAX_APICS) +#define physids_shift_right(d, s, n) bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS) +#define physids_shift_left(d, s, n) bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS) +#define physids_coerce(map) ((map).mask[0]) + +#define physids_promote(physids) \ + ({ \ + physid_mask_t __physid_mask = PHYSID_MASK_NONE; \ + __physid_mask.mask[0] = physids; \ + __physid_mask; \ + }) + +#define physid_mask_of_physid(physid) \ + ({ \ + physid_mask_t __physid_mask = PHYSID_MASK_NONE; \ + physid_set(physid, __physid_mask); \ + __physid_mask; \ + }) + +#define PHYSID_MASK_ALL { {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} } +#define PHYSID_MASK_NONE { {[0 ... 
PHYSID_ARRAY_SIZE-1] = 0UL} } + +extern physid_mask_t phys_cpu_present_map; + #endif diff -prauN linux-2.6.0-test1/include/asm-i386/numaq.h wli-2.6.0-test1-37/include/asm-i386/numaq.h --- linux-2.6.0-test1/include/asm-i386/numaq.h 2003-07-13 20:34:40.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/numaq.h 2003-07-14 09:29:09.000000000 -0700 @@ -28,7 +28,8 @@ #ifdef CONFIG_X86_NUMAQ -#define MAX_NUMNODES 8 +#define MAX_NUMNODES 16 +#define MAX_NODE_CPUS 4 extern void get_memcfg_numaq(void); #define get_memcfg_numa() get_memcfg_numaq() @@ -159,7 +160,7 @@ struct sys_cfg_data { static inline unsigned long *get_zholes_size(int nid) { - return 0; + return NULL; } #endif /* CONFIG_X86_NUMAQ */ #endif /* NUMAQ_H */ diff -prauN linux-2.6.0-test1/include/asm-i386/page.h wli-2.6.0-test1-37/include/asm-i386/page.h --- linux-2.6.0-test1/include/asm-i386/page.h 2003-07-13 20:30:48.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/page.h 2003-07-14 08:36:18.000000000 -0700 @@ -3,7 +3,11 @@ /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 -#define PAGE_SIZE (1UL << PAGE_SHIFT) +#ifndef __ASSEMBLY__ +#define PAGE_SIZE (1UL << PAGE_SHIFT) +#else +#define PAGE_SIZE (1 << PAGE_SHIFT) +#endif #define PAGE_MASK (~(PAGE_SIZE-1)) #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) diff -prauN linux-2.6.0-test1/include/asm-i386/percpu.h wli-2.6.0-test1-37/include/asm-i386/percpu.h --- linux-2.6.0-test1/include/asm-i386/percpu.h 2003-07-13 20:38:35.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/percpu.h 2003-07-14 09:29:09.000000000 -0700 @@ -3,4 +3,9 @@ #include +#ifdef CONFIG_NUMA +#undef __GENERIC_PER_CPU +void setup_per_cpu_areas(void); +#endif + #endif /* __ARCH_I386_PERCPU__ */ diff -prauN linux-2.6.0-test1/include/asm-i386/pgalloc.h wli-2.6.0-test1-37/include/asm-i386/pgalloc.h --- linux-2.6.0-test1/include/asm-i386/pgalloc.h 2003-07-13 20:29:59.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/pgalloc.h 2003-07-14 07:10:48.000000000 -0700 @@ -31,25 +31,36 @@ static inline void pte_free_kernel(pte_t free_page((unsigned long)pte); } -static inline void pte_free(struct page *pte) -{ - __free_page(pte); -} - - -#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) - /* * allocating and freeing a pmd is trivial: the 1-entry pmd is * inside the pgd, so has no extra memory associated with it. * (In the PAE case we free the pmds as part of the pgd.) 
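For illustration, the physid_mask_t accessors above read like this at a call site (physid_example() is a made-up name, and the printk is only there to show physids_weight()):

	static void physid_example(void)
	{
		physid_mask_t map, id3;

		map = physid_mask_of_physid(0);	/* only id 0 set */
		id3 = physid_mask_of_physid(3);
		physids_or(map, map, id3);	/* ids 0 and 3 now set */
		if (physid_isset(3, map))
			printk("%d apic ids present\n", physids_weight(map));
	}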
*/ -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm, addr) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm, addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(x) do { } while (0) #define __pmd_free_tlb(tlb,x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() #define check_pgt_cache() do { } while (0) +#include + +static inline void pte_free(struct page *page) +{ + struct mmu_gather *tlb = &per_cpu(mmu_gathers, get_cpu()); + tlb_remove_page(tlb, page); + put_cpu(); +} + +static inline void pte_free_tlb(struct mmu_gather *tlb, struct page *page) +{ + tlb_remove_page(tlb, page); +} + +static inline void pmd_free_tlb(struct mmu_gather *tlb, struct page *page) +{ +} + #endif /* _I386_PGALLOC_H */ diff -prauN linux-2.6.0-test1/include/asm-i386/pgtable-2level.h wli-2.6.0-test1-37/include/asm-i386/pgtable-2level.h --- linux-2.6.0-test1/include/asm-i386/pgtable-2level.h 2003-07-13 20:34:39.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/pgtable-2level.h 2003-07-14 06:49:00.000000000 -0700 @@ -48,13 +48,15 @@ static inline int pgd_present(pgd_t pgd) #define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval) #define set_pgd(pgdptr, pgdval) (*(pgdptr) = pgdval) -#define pgd_page(pgd) \ -((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) + +#define pmd_offset_map(pgd, addr) ({ (pmd_t *)(pgd); }) +#define pmd_offset_map_nested(pgd, addr) pmd_offset_map(pgd, addr) +#define pmd_offset_kernel(pgd, addr) pmd_offset_map(pgd, addr) + +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) -static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) -{ - return (pmd_t *) dir; -} #define ptep_get_and_clear(xp) __pte(xchg(&(xp)->pte_low, 0)) #define pte_same(a, b) ((a).pte_low == (b).pte_low) #define pte_page(x) pfn_to_page(pte_pfn(x)) diff -prauN linux-2.6.0-test1/include/asm-i386/pgtable-3level.h wli-2.6.0-test1-37/include/asm-i386/pgtable-3level.h --- linux-2.6.0-test1/include/asm-i386/pgtable-3level.h 2003-07-13 20:37:17.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/pgtable-3level.h 2003-07-14 07:23:47.000000000 -0700 @@ -64,12 +64,32 @@ static inline void set_pte(pte_t *ptep, */ static inline void pgd_clear (pgd_t * pgd) { } -#define pgd_page(pgd) \ -((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +static inline unsigned long pgd_pfn(pgd_t pgd) +{ + return pgd_val(pgd) >> PAGE_SHIFT; +} + +#define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd)) + +#define pmd_offset_kernel(pgd, addr) \ + ((pmd_t *)__va(pgd_val(*(pgd)) & PAGE_MASK) + pmd_index(addr)) /* Find an entry in the second-level page table.. 
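pte_free() above no longer returns the page to the allocator outright: it queues it on this cpu's mmu_gather, with get_cpu()/put_cpu() pinning the cpu across the per-cpu access. The same pattern as a standalone sketch (queue_pte_page() is an illustrative name):

	static void queue_pte_page(struct page *page)
	{
		struct mmu_gather *tlb = &per_cpu(mmu_gathers, get_cpu());
		tlb_remove_page(tlb, page);	/* preemption is off here */
		put_cpu();
	}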
*/ -#define pmd_offset(dir, address) ((pmd_t *) pgd_page(*(dir)) + \ - pmd_index(address)) +#ifdef CONFIG_HIGHPMD +#define __pmd_offset(pgd, addr, type) \ + ((pmd_t *)kmap_atomic(pgd_page(*(pgd)), type) + pmd_index(addr)) +#define __pmd_unmap(pmd, type) kunmap_atomic(pmd, type) +#else +#define __pmd_offset(pgd, addr, type) \ + ((pmd_t *)__va(pgd_val(*(pgd)) & PAGE_MASK) + pmd_index(addr)) +#define __pmd_unmap(pmd, type) do { } while (0) +#endif + +#define pmd_offset_map(pgd, addr) __pmd_offset(pgd, addr, KM_PMD0) +#define pmd_offset_map_nested(pgd, addr) __pmd_offset(pgd, addr, KM_PMD1) + +#define pmd_unmap(pmd) __pmd_unmap(pmd, KM_PMD0) +#define pmd_unmap_nested(pmd) __pmd_unmap(pmd, KM_PMD1) static inline pte_t ptep_get_and_clear(pte_t *ptep) { @@ -123,6 +143,4 @@ static inline pmd_t pfn_pmd(unsigned lon #define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) }) #define PTE_FILE_MAX_BITS 32 -extern struct kmem_cache_s *pae_pgd_cachep; - #endif /* _I386_PGTABLE_3LEVEL_H */ diff -prauN linux-2.6.0-test1/include/asm-i386/pgtable.h wli-2.6.0-test1-37/include/asm-i386/pgtable.h --- linux-2.6.0-test1/include/asm-i386/pgtable.h 2003-07-13 20:36:42.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/pgtable.h 2003-07-14 09:33:21.000000000 -0700 @@ -16,11 +16,18 @@ #include #include #include +#include +#include +#include #ifndef _I386_BITOPS_H #include #endif +#ifdef CONFIG_MMAP_TOPDOWN +#define HAVE_ARCH_UNMAPPED_AREA +#endif + extern pgd_t swapper_pg_dir[1024]; extern void paging_init(void); @@ -31,33 +38,29 @@ extern void paging_init(void); extern unsigned long empty_zero_page[1024]; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) -#endif /* !__ASSEMBLY__ */ +extern kmem_cache_t *pgd_cache; +extern struct list_head pgd_list; +extern spinlock_t pgd_lock; +void pgtable_cache_init(void); +void pgd_ctor(void *, kmem_cache_t *, unsigned long); +void pgd_dtor(void *, kmem_cache_t *, unsigned long); + +#define HAVE_ARCH_PAGETABLE_CACHE +void shrink_pagetable_cache(int gfp_mask); /* * The Linux x86 paging architecture is 'compile-time dual-mode', it * implements both the traditional 2-level x86 page tables and the * newer 3-level PAE-mode page tables. 
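With CONFIG_HIGHPMD the pmd page may reside in highmem, so every pmd lookup must be paired with an unmap, mirroring the pte_offset_map()/pte_unmap() discipline. A sketch of the rule (pmd_is_present() is an illustrative helper):

	static int pmd_is_present(struct mm_struct *mm, unsigned long addr)
	{
		pgd_t *pgd = pgd_offset(mm, addr);
		pmd_t *pmd = pmd_offset_map(pgd, addr);	/* kmap_atomic under HIGHPMD */
		int ret = pmd_present(*pmd);
		pmd_unmap(pmd);				/* matching kunmap_atomic */
		return ret;
	}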
*/ -#ifndef __ASSEMBLY__ #ifdef CONFIG_X86_PAE # include - -/* - * Need to initialise the X86 PAE caches - */ -extern void pgtable_cache_init(void); - #else # include - -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - -#endif #endif +#endif /* !__ASSEMBLY__ */ + #define PMD_SIZE (1UL << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) #define PGDIR_SIZE (1UL << PGDIR_SHIFT) diff -prauN linux-2.6.0-test1/include/asm-i386/rmap.h wli-2.6.0-test1-37/include/asm-i386/rmap.h --- linux-2.6.0-test1/include/asm-i386/rmap.h 2003-07-13 20:35:15.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,21 +0,0 @@ -#ifndef _I386_RMAP_H -#define _I386_RMAP_H - -/* nothing to see, move along */ -#include - -#ifdef CONFIG_HIGHPTE -static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) -{ - unsigned long pfn = (unsigned long)(pte_paddr >> PAGE_SHIFT); - unsigned long off = ((unsigned long)pte_paddr) & ~PAGE_MASK; - return (pte_t *)((char *)kmap_atomic(pfn_to_page(pfn), KM_PTE2) + off); -} - -static inline void rmap_ptep_unmap(pte_t *pte) -{ - kunmap_atomic(pte, KM_PTE2); -} -#endif - -#endif diff -prauN linux-2.6.0-test1/include/asm-i386/smp.h wli-2.6.0-test1-37/include/asm-i386/smp.h --- linux-2.6.0-test1/include/asm-i386/smp.h 2003-07-13 20:34:43.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/smp.h 2003-07-14 06:33:08.000000000 -0700 @@ -8,6 +8,7 @@ #include #include #include +#include #endif #ifdef CONFIG_X86_LOCAL_APIC @@ -31,9 +32,7 @@ */ extern void smp_alloc_memory(void); -extern unsigned long phys_cpu_present_map; -extern unsigned long cpu_online_map; -extern volatile unsigned long smp_invalidate_needed; +extern physid_mask_t phys_cpu_present_map; extern int pic_mode; extern int smp_num_siblings; extern int cpu_sibling_map[]; @@ -54,37 +53,19 @@ extern void zap_low_mappings (void); */ #define smp_processor_id() (current_thread_info()->cpu) -extern volatile unsigned long cpu_callout_map; +extern volatile cpumask_t cpu_callout_map; -#define cpu_possible(cpu) (cpu_callout_map & (1<<(cpu))) -#define cpu_online(cpu) (cpu_online_map & (1<<(cpu))) - -#define for_each_cpu(cpu, mask) \ - for(mask = cpu_online_map; \ - cpu = __ffs(mask), mask != 0; \ - mask &= ~(1< +#include #ifndef __ASSEMBLY__ #include #endif @@ -30,9 +32,11 @@ struct thread_info { __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: + 0 for interrupts: illegal 0-0xBFFFFFFF for user-thead 0-0xFFFFFFFF for kernel-thread */ + struct thread_info *irq_stack; /* pointer to cpu irq stack */ struct restart_block restart_block; __u8 supervisor_stack[0]; @@ -48,7 +52,8 @@ struct thread_info { #define TI_CPU 0x00000010 #define TI_PRE_COUNT 0x00000014 #define TI_ADDR_LIMIT 0x00000018 -#define TI_RESTART_BLOCK 0x000001C +#define TI_IRQ_STACK 0x0000001C +#define TI_RESTART_BLOCK 0x00000020 #endif @@ -59,46 +64,64 @@ struct thread_info { * * preempt_count needs to be 1 initially, until the scheduler is functional. 
*/ +#ifdef CONFIG_4K_STACK +#define THREAD_ORDER 0 +#define STACK_WARN 0x200 +#define STACK_PANIC 0x100 +#else +#define THREAD_ORDER 1 +#define STACK_WARN ((THREAD_SIZE)>>1) +#define STACK_PANIC 0x100 +#endif +#define INIT_THREAD_SIZE THREAD_SIZE + #ifndef __ASSEMBLY__ -#define INIT_THREAD_INFO(tsk) \ -{ \ - .task = &tsk, \ - .exec_domain = &default_exec_domain, \ - .flags = 0, \ - .cpu = 0, \ - .preempt_count = 1, \ - .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ +#define INIT_THREAD_INFO(tsk) \ +{ \ + .task = &tsk, \ + .exec_domain = &default_exec_domain, \ + .flags = 0, \ + .cpu = 0, \ + .preempt_count = 1, \ + .addr_limit = KERNEL_DS, \ + .irq_stack = &init_irq_union.thread_info, \ + .restart_block = { \ + .fn = do_no_restart_syscall, \ + } \ } #define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) +/* thread information allocation */ +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) +#define alloc_thread_info(task) ((struct thread_info *)kmalloc(THREAD_SIZE, GFP_KERNEL)) +#define free_thread_info(info) kfree(info) +#define get_thread_info(ti) get_task_struct((ti)->task) +#define put_thread_info(ti) put_task_struct((ti)->task) + /* how to get the thread information struct from C */ static inline struct thread_info *current_thread_info(void) { struct thread_info *ti; - __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~8191UL)); + __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~(THREAD_SIZE - 1))); return ti; } -/* thread information allocation */ -#define THREAD_SIZE (2*PAGE_SIZE) -#define alloc_thread_info(tsk) ((struct thread_info *) __get_free_pages(GFP_KERNEL,1)) -#define free_thread_info(ti) free_pages((unsigned long) (ti), 1) -#define get_thread_info(ti) get_task_struct((ti)->task) -#define put_thread_info(ti) put_task_struct((ti)->task) - #else /* !__ASSEMBLY__ */ +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) + /* how to get the thread information struct from ASM */ #define GET_THREAD_INFO(reg) \ - movl $-8192, reg; \ + movl $-THREAD_SIZE, reg; \ andl %esp, reg +/* use this one if reg already contains %esp */ +#define GET_THREAD_INFO_WITH_ESP(reg) \ +andl $-THREAD_SIZE, reg + #endif /* diff -prauN linux-2.6.0-test1/include/asm-i386/tlb.h wli-2.6.0-test1-37/include/asm-i386/tlb.h --- linux-2.6.0-test1/include/asm-i386/tlb.h 2003-07-13 20:32:33.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/tlb.h 2003-07-14 07:10:48.000000000 -0700 @@ -1,10 +1,58 @@ #ifndef _I386_TLB_H #define _I386_TLB_H +/* + * include/asm-i386/tlb.h + * (C) June 2003 William Irwin, IBM + * Routines for pagetable caching and release. 
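current_thread_info() relies on the stack being THREAD_SIZE-aligned: masking the low bits of the stack pointer lands on the thread_info at the stack's base. The same computation in plain C, as a sketch (ti_of_sp() is an illustrative name; THREAD_SIZE must be a power of two):

	static inline struct thread_info *ti_of_sp(unsigned long sp)
	{
		return (struct thread_info *)(sp & ~(THREAD_SIZE - 1));
	}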
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_HIGHPTE +#define GFP_PTE (GFP_KERNEL|__GFP_REPEAT|__GFP_HIGHMEM) +#else +#define GFP_PTE (GFP_KERNEL|__GFP_REPEAT) +#endif + +#ifdef CONFIG_HIGHPMD +#define GFP_PMD (GFP_KERNEL|__GFP_REPEAT|__GFP_HIGHMEM) +#else +#define GFP_PMD (GFP_KERNEL|__GFP_REPEAT) +#endif + +#define PG_PTE PG_arch_1 +#define NR_PTE 128 +#define FREE_PTE_NR NR_PTE +#define NR_NONPTE 512 +#define MAX_ZONE_ID (MAX_NUMNODES * MAX_NR_ZONES) + +#define PagePTE(page) test_bit(PG_PTE, &(page)->flags) +#define SetPagePTE(page) set_bit(PG_PTE, &(page)->flags) +#define ClearPagePTE(page) clear_bit(PG_PTE, &(page)->flags) +#define TestSetPagePTE(page) test_and_set_bit(PG_PTE, &(page)->flags) +#define TestClearPagePTE(page) test_and_clear_bit(PG_PTE, &(page)->flags) +#define PageZoneID(page) ((page)->flags >> ZONE_SHIFT) /* - * x86 doesn't need any special per-pte or - * per-vma handling.. + * vmscan.c does smp_call_function() to shoot down cached pagetables under + * memory pressure. */ +struct mmu_gather { + struct mm_struct *mm; + int nr_pte_active, nr_pte_ready, nr_nonpte, need_flush, fullmm, freed; + struct list_head active_list[MAX_ZONE_ID], ready_list[MAX_ZONE_ID]; + int active_count[MAX_ZONE_ID], ready_count[MAX_ZONE_ID]; + struct page *nonpte[NR_NONPTE]; +}; + +DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); + #define tlb_start_vma(tlb, vma) do { } while (0) #define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) @@ -15,6 +63,122 @@ */ #define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) -#include +void tlb_init(void); -#endif +static inline +struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, unsigned int flush) +{ + struct mmu_gather *tlb = &per_cpu(mmu_gathers, get_cpu()); + tlb->mm = mm; + tlb->fullmm = flush; + tlb->freed = 0; + put_cpu(); + return tlb; +} + +static inline +void tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *pte, unsigned long addr) +{ + tlb->need_flush = 1; +} + +static inline +void tlb_flush_ready(struct mmu_gather *tlb) +{ + int zone; + + for (zone = 0; tlb->nr_pte_ready >= NR_PTE && zone < MAX_ZONE_ID; ++zone) { + struct page *head; + + if (!tlb->ready_count[zone]) + continue; + + head = list_entry(tlb->ready_list[zone].next, struct page, list); + list_del_init(&head->list); + list_splice_init(&tlb->ready_list[zone], &head->list); + head->private = tlb->ready_count[zone]; + tlb->nr_pte_ready -= tlb->ready_count[zone]; + tlb->ready_count[zone] = 0; + free_pages_bulk(zone_table[zone], head, 0); + } +} + +static inline +void tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) +{ + int zone; + unsigned long flags; + + if (!tlb->need_flush && tlb->nr_nonpte < NR_NONPTE) + return; + + tlb->need_flush = 0; + tlb_flush(tlb); + + smp_local_irq_save(flags); + + if (tlb->nr_nonpte) { + free_pages_and_swap_cache(tlb->nonpte, tlb->nr_nonpte); + tlb->nr_nonpte = 0; + } + + for (zone = 0; zone < MAX_ZONE_ID; ++zone) { + if (!tlb->active_count[zone]) + continue; + + list_splice_init(&tlb->active_list[zone], &tlb->ready_list[zone]); + tlb->ready_count[zone] += tlb->active_count[zone]; + tlb->active_count[zone] = 0; + } + tlb->nr_pte_ready += tlb->nr_pte_active; + tlb->nr_pte_active = 0; + if (tlb->nr_pte_ready >= NR_PTE) + tlb_flush_ready(tlb); + + smp_local_irq_restore(flags); +} + +static inline +void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) +{ + if (tlb->mm->rss >= tlb->freed) + tlb->mm->rss 
-= tlb->freed; + else + tlb->mm->rss = 0; + tlb_flush_mmu(tlb, start, end); +} + +static inline +void tlb_remove_nonpte_page(struct mmu_gather *tlb, struct page *page) +{ + tlb->nonpte[tlb->nr_nonpte] = page; + tlb->nr_nonpte++; + if (tlb->nr_nonpte >= NR_NONPTE) + tlb_flush_mmu(tlb, 0, 0); +} + +static inline +void tlb_remove_pte_page(struct mmu_gather *tlb, struct page *page) +{ + int zone = PageZoneID(page); + ClearPagePTE(page); + tlb->nr_pte_active++; + tlb->active_count[zone]++; + list_add(&page->list, &tlb->active_list[zone]); +} + +static inline +void tlb_remove_page(struct mmu_gather *tlb, struct page *page) +{ + unsigned long flags; + + smp_local_irq_save(flags); + tlb->need_flush = 1; + if (PagePTE(page)) + tlb_remove_pte_page(tlb, page); + else + tlb_remove_nonpte_page(tlb, page); + smp_local_irq_restore(flags); +} + +#endif /* _I386_TLB_H */ diff -prauN linux-2.6.0-test1/include/asm-i386/topology.h wli-2.6.0-test1-37/include/asm-i386/topology.h --- linux-2.6.0-test1/include/asm-i386/topology.h 2003-07-13 20:31:20.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-i386/topology.h 2003-07-14 06:31:10.000000000 -0700 @@ -31,9 +31,11 @@ #include +#include + /* Mappings between logical cpu number and node number */ -extern volatile unsigned long node_2_cpu_mask[]; -extern volatile int cpu_2_node[]; +extern cpumask_t node_2_cpu_mask[]; +extern int cpu_2_node[]; /* Returns the number of the node containing CPU 'cpu' */ static inline int cpu_to_node(int cpu) @@ -49,7 +51,7 @@ static inline int cpu_to_node(int cpu) #define parent_node(node) (node) /* Returns a bitmask of CPUs on Node 'node'. */ -static inline unsigned long node_to_cpumask(int node) +static inline cpumask_t node_to_cpumask(int node) { return node_2_cpu_mask[node]; } @@ -57,14 +59,15 @@ static inline unsigned long node_to_cpum /* Returns the number of the first CPU on Node 'node'. */ static inline int node_to_first_cpu(int node) { - return __ffs(node_to_cpumask(node)); + cpumask_t mask = node_to_cpumask(node); + return first_cpu(mask); } /* Returns the number of the first MemBlk on Node 'node' */ #define node_to_memblk(node) (node) /* Returns the number of the node containing PCI bus 'bus' */ -static inline unsigned long pcibus_to_cpumask(int bus) +static inline cpumask_t pcibus_to_cpumask(int bus) { return node_to_cpumask(mp_bus_id_to_node[bus]); } diff -prauN linux-2.6.0-test1/include/asm-ia64/bitops.h wli-2.6.0-test1-37/include/asm-ia64/bitops.h --- linux-2.6.0-test1/include/asm-ia64/bitops.h 2003-07-13 20:37:58.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-ia64/bitops.h 2003-07-14 06:31:10.000000000 -0700 @@ -409,7 +409,7 @@ found_middle: * Find next bit in a bitmap reasonably efficiently.. 
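Taken together these helpers form the gather life cycle that the core unmap paths drive. A condensed sketch for a single page (example_unmap_one() is illustrative; real callers loop over a range and account tlb->freed):

	static void example_unmap_one(struct mm_struct *mm, pte_t *ptep,
				      struct page *page, unsigned long addr)
	{
		struct mmu_gather *tlb = tlb_gather_mmu(mm, 0);	/* 0: partial unmap */
		tlb_remove_tlb_entry(tlb, ptep, addr);	/* pte was live: flush needed */
		tlb_remove_page(tlb, page);	/* routes pte vs. non-pte pages */
		tlb_finish_mmu(tlb, addr, addr + PAGE_SIZE);	/* flush, bulk free */
	}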
*/ static inline int -find_next_bit (void *addr, unsigned long size, unsigned long offset) +find_next_bit(const void *addr, unsigned long size, unsigned long offset) { unsigned long *p = ((unsigned long *) addr) + (offset >> 6); unsigned long result = offset & ~63UL; diff -prauN linux-2.6.0-test1/include/asm-ia64/pgalloc.h wli-2.6.0-test1-37/include/asm-ia64/pgalloc.h --- linux-2.6.0-test1/include/asm-ia64/pgalloc.h 2003-07-13 20:29:22.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-ia64/pgalloc.h 2003-07-14 06:49:00.000000000 -0700 @@ -71,9 +71,9 @@ pgd_free (pgd_t *pgd) } static inline void -pgd_populate (struct mm_struct *mm, pgd_t *pgd_entry, pmd_t *pmd) +pgd_populate (struct mm_struct *mm, pgd_t *pgd_entry, struct page *pmd) { - pgd_val(*pgd_entry) = __pa(pmd); + pgd_val(*pgd_entry) = __pa(page_address(pmd)); } @@ -90,8 +90,8 @@ pmd_alloc_one_fast (struct mm_struct *mm return (pmd_t *)ret; } -static inline pmd_t* -pmd_alloc_one (struct mm_struct *mm, unsigned long addr) +static inline pmd_t * +pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) { pmd_t *pmd = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); @@ -100,9 +100,16 @@ pmd_alloc_one (struct mm_struct *mm, uns return pmd; } +static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + pmd_t *pmd = pmd_alloc_one_kernel(mm, addr); + return pmd ? virt_to_page(pmd) : NULL; +} + static inline void -pmd_free (pmd_t *pmd) +pmd_free(struct page *page) { + pmd_t *pmd = page_address(page); *(unsigned long *)pmd = (unsigned long) pmd_quicklist; pmd_quicklist = (unsigned long *) pmd; ++pgtable_cache_size; diff -prauN linux-2.6.0-test1/include/asm-ia64/pgtable.h wli-2.6.0-test1-37/include/asm-ia64/pgtable.h --- linux-2.6.0-test1/include/asm-ia64/pgtable.h 2003-07-13 20:32:34.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-ia64/pgtable.h 2003-07-14 06:49:00.000000000 -0700 @@ -257,7 +257,8 @@ ia64_phys_addr_valid (unsigned long addr #define pgd_bad(pgd) (!ia64_phys_addr_valid(pgd_val(pgd))) #define pgd_present(pgd) (pgd_val(pgd) != 0UL) #define pgd_clear(pgdp) (pgd_val(*(pgdp)) = 0UL) -#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & _PFN_MASK)) +#define __pgd_page(pgd) ((unsigned long)__va(pgd_val(pgd) & _PFN_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) /* * The following have defined behavior only work if pte_present() is true. @@ -326,7 +327,13 @@ pgd_offset (struct mm_struct *mm, unsign /* Find an entry in the second-level page table.. */ #define pmd_offset(dir,addr) \ - ((pmd_t *) pgd_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) + ((pmd_t *)__pgd_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) + +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) /* * Find an entry in the third-level page table. 
This looks more complicated than it diff -prauN linux-2.6.0-test1/include/asm-ia64/smp.h wli-2.6.0-test1-37/include/asm-ia64/smp.h --- linux-2.6.0-test1/include/asm-ia64/smp.h 2003-07-13 20:34:03.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-ia64/smp.h 2003-07-14 06:31:10.000000000 -0700 @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -37,8 +38,8 @@ extern struct smp_boot_data { extern char no_int_routing __initdata; -extern unsigned long phys_cpu_present_map; -extern volatile unsigned long cpu_online_map; +extern cpumask_t phys_cpu_present_map; +extern cpumask_t cpu_online_map; extern unsigned long ipi_base_addr; extern unsigned char smp_int_redirect; @@ -47,22 +48,7 @@ extern volatile int ia64_cpu_to_sapicid[ extern unsigned long ap_wakeup_vector; -#define cpu_possible(cpu) (phys_cpu_present_map & (1UL << (cpu))) -#define cpu_online(cpu) (cpu_online_map & (1UL << (cpu))) - -static inline unsigned int -num_online_cpus (void) -{ - return hweight64(cpu_online_map); -} - -static inline unsigned int -any_online_cpu (unsigned int mask) -{ - if (mask & cpu_online_map) - return __ffs(mask & cpu_online_map); - return NR_CPUS; -} +#define cpu_possible(cpu) cpu_isset(cpu, phys_cpu_present_map) /* * Function to map hard smp processor id to logical id. Slow, so don't use this in diff -prauN linux-2.6.0-test1/include/asm-m68k/motorola_pgalloc.h wli-2.6.0-test1-37/include/asm-m68k/motorola_pgalloc.h --- linux-2.6.0-test1/include/asm-m68k/motorola_pgalloc.h 2003-07-13 20:39:36.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-m68k/motorola_pgalloc.h 2003-07-14 06:49:00.000000000 -0700 @@ -63,19 +63,28 @@ static inline void __pte_free_tlb(struct } -static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) +static inline pmd_t *pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { return get_pointer_table(); } -static inline int pmd_free(pmd_t *pmd) +static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return free_pointer_table(pmd); + pmd_t *pmd = pmd_alloc_one_kernel(mm, addr); + if (pmd) + return virt_to_page(pmd); + else + return NULL; } -static inline int __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +static inline int pmd_free(struct page *pmd) { - return free_pointer_table(pmd); + return free_pointer_table(page_address(pmd)); +} + +static inline int __pmd_free_tlb(struct mmu_gather *tlb, struct page *pmd) +{ + return free_pointer_table(page_address(pmd)); } @@ -100,9 +109,9 @@ static inline void pmd_populate(struct m pmd_set(pmd, page_address(page)); } -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmd) +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, struct page *pmd) { - pgd_set(pgd, pmd); + pgd_set(pgd, page_address(pmd)); } #endif /* _MOTOROLA_PGALLOC_H */ diff -prauN linux-2.6.0-test1/include/asm-m68k/motorola_pgtable.h wli-2.6.0-test1-37/include/asm-m68k/motorola_pgtable.h --- linux-2.6.0-test1/include/asm-m68k/motorola_pgtable.h 2003-07-13 20:35:15.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-m68k/motorola_pgtable.h 2003-07-14 06:49:00.000000000 -0700 @@ -115,6 +115,7 @@ extern inline void pgd_set(pgd_t * pgdp, #define __pte_page(pte) ((unsigned long)__va(pte_val(pte) & PAGE_MASK)) #define __pmd_page(pmd) ((unsigned long)__va(pmd_val(pmd) & _TABLE_MASK)) #define __pgd_page(pgd) ((unsigned long)__va(pgd_val(pgd) & _TABLE_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) #define pte_none(pte) (!pte_val(pte)) @@ -203,6 
+204,12 @@ extern inline pmd_t * pmd_offset(pgd_t * return (pmd_t *)__pgd_page(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PMD-1)); } +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table.. */ extern inline pte_t * pte_offset_kernel(pmd_t * pmdp, unsigned long address) { diff -prauN linux-2.6.0-test1/include/asm-m68k/sun3_pgalloc.h wli-2.6.0-test1-37/include/asm-m68k/sun3_pgalloc.h --- linux-2.6.0-test1/include/asm-m68k/sun3_pgalloc.h 2003-07-13 20:33:23.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-m68k/sun3_pgalloc.h 2003-07-14 06:49:00.000000000 -0700 @@ -18,7 +18,8 @@ extern const char bad_pmd_string[]; -#define pmd_alloc_one(mm,address) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm,address) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm,address) ({ BUG(); ((pmd_t *)2); }) static inline void pte_free_kernel(pte_t * pte) diff -prauN linux-2.6.0-test1/include/asm-m68knommu/pgtable.h wli-2.6.0-test1-37/include/asm-m68knommu/pgtable.h --- linux-2.6.0-test1/include/asm-m68knommu/pgtable.h 2003-07-13 20:34:43.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-m68knommu/pgtable.h 2003-07-14 06:49:00.000000000 -0700 @@ -21,7 +21,12 @@ typedef pte_t *pte_addr_t; #define pgd_bad(pgd) (0) #define pgd_clear(pgdp) #define kern_addr_valid(addr) (1) -#define pmd_offset(a, b) ((void *)0) +#define pmd_offset(a, b) ((void *)0) +#define pmd_offset_kernel(a, b) pmd_offset(a, b) +#define pmd_offset_map(a, b) pmd_offset(a, b) +#define pmd_offset_map_nested(a, b) pmd_offset(a, b) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) #define PAGE_NONE __pgprot(0) #define PAGE_SHARED __pgprot(0) diff -prauN linux-2.6.0-test1/include/asm-mips/pgalloc.h wli-2.6.0-test1-37/include/asm-mips/pgalloc.h --- linux-2.6.0-test1/include/asm-mips/pgalloc.h 2003-07-13 20:34:33.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-mips/pgalloc.h 2003-07-14 06:49:00.000000000 -0700 @@ -96,7 +96,8 @@ static inline void pte_free(struct page * allocating and freeing a pmd is trivial: the 1-entry pmd is * inside the pgd, so has no extra memory associated with it. */ -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm, addr) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm, addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(x) do { } while (0) #define __pmd_free_tlb(tlb,x) do { } while (0) diff -prauN linux-2.6.0-test1/include/asm-mips/pgtable.h wli-2.6.0-test1-37/include/asm-mips/pgtable.h --- linux-2.6.0-test1/include/asm-mips/pgtable.h 2003-07-13 20:32:32.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-mips/pgtable.h 2003-07-14 06:49:00.000000000 -0700 @@ -374,6 +374,12 @@ static inline pmd_t *pmd_offset(pgd_t *d return (pmd_t *) dir; } +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table.. 
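Each architecture repeats the same split: pmd_alloc_one() hands generic code the pmd's struct page, while pmd_alloc_one_kernel() keeps the kernel-virtual flavor for kernel mappings. The wrapper shape, sketched after the ia64 version above:

	static inline struct page *pmd_alloc_one(struct mm_struct *mm,
						 unsigned long addr)
	{
		pmd_t *pmd = pmd_alloc_one_kernel(mm, addr);
		return pmd ? virt_to_page(pmd) : NULL;
	}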
*/ #define __pte_offset(address) \ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) diff -prauN linux-2.6.0-test1/include/asm-mips/smp.h wli-2.6.0-test1-37/include/asm-mips/smp.h --- linux-2.6.0-test1/include/asm-mips/smp.h 2003-07-13 20:30:48.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-mips/smp.h 2003-07-14 06:31:10.000000000 -0700 @@ -17,6 +17,7 @@ #include #include +#include #include #define smp_processor_id() (current_thread_info()->cpu) @@ -45,56 +46,17 @@ extern struct call_data_struct *call_dat #define SMP_RESCHEDULE_YOURSELF 0x1 /* XXX braindead */ #define SMP_CALL_FUNCTION 0x2 -#if (NR_CPUS <= _MIPS_SZLONG) - -typedef unsigned long cpumask_t; - -#define CPUMASK_CLRALL(p) (p) = 0 -#define CPUMASK_SETB(p, bit) (p) |= 1UL << (bit) -#define CPUMASK_CLRB(p, bit) (p) &= ~(1UL << (bit)) -#define CPUMASK_TSTB(p, bit) ((p) & (1UL << (bit))) - -#elif (NR_CPUS <= 128) - -/* - * The foll should work till 128 cpus. - */ -#define CPUMASK_SIZE (NR_CPUS/_MIPS_SZLONG) -#define CPUMASK_INDEX(bit) ((bit) >> 6) -#define CPUMASK_SHFT(bit) ((bit) & 0x3f) - -typedef struct { - unsigned long _bits[CPUMASK_SIZE]; -} cpumask_t; - -#define CPUMASK_CLRALL(p) (p)._bits[0] = 0, (p)._bits[1] = 0 -#define CPUMASK_SETB(p, bit) (p)._bits[CPUMASK_INDEX(bit)] |= \ - (1UL << CPUMASK_SHFT(bit)) -#define CPUMASK_CLRB(p, bit) (p)._bits[CPUMASK_INDEX(bit)] &= \ - ~(1UL << CPUMASK_SHFT(bit)) -#define CPUMASK_TSTB(p, bit) ((p)._bits[CPUMASK_INDEX(bit)] & \ - (1UL << CPUMASK_SHFT(bit))) - -#else -#error cpumask macros only defined for 128p kernels -#endif - extern cpumask_t phys_cpu_present_map; extern cpumask_t cpu_online_map; -#define cpu_possible(cpu) (phys_cpu_present_map & (1<<(cpu))) -#define cpu_online(cpu) (cpu_online_map & (1<<(cpu))) - -extern inline unsigned int num_online_cpus(void) -{ - return hweight32(cpu_online_map); -} +#define cpu_possible(cpu) cpu_isset(cpu, phys_cpu_present_map) +#define cpu_online(cpu) cpu_isset(cpu, cpu_online_map) -extern volatile unsigned long cpu_callout_map; +extern cpumask_t cpu_callout_map; /* We don't mark CPUs online until __cpu_up(), so we need another measure */ static inline int num_booting_cpus(void) { - return hweight32(cpu_callout_map); + return cpus_weight(cpu_callout_map); } #endif /* CONFIG_SMP */ diff -prauN linux-2.6.0-test1/include/asm-mips64/pgalloc.h wli-2.6.0-test1-37/include/asm-mips64/pgalloc.h --- linux-2.6.0-test1/include/asm-mips64/pgalloc.h 2003-07-13 20:37:23.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-mips64/pgalloc.h 2003-07-14 06:49:00.000000000 -0700 @@ -28,7 +28,7 @@ static inline void pmd_populate(struct m set_pmd(pmd, __pmd((PAGE_OFFSET + page_to_pfn(pte)) << PAGE_SHIFT)); } -#define pgd_populate(mm, pgd, pmd) set_pgd(pgd, __pgd(pmd)) +#define pgd_populate(mm, pgd, pmd) set_pgd(pgd, __pgd(page_address(pmd))) static inline pgd_t *pgd_alloc(struct mm_struct *mm) { @@ -88,7 +88,7 @@ static inline void pte_free(struct page #define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) #define __pmd_free_tlb(tlb,x) do { } while (0) -static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) +static inline pmd_t *pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) { pmd_t *pmd; @@ -98,9 +98,18 @@ static inline pmd_t *pmd_alloc_one(struc return pmd; } -static inline void pmd_free(pmd_t *pmd) +static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - free_pages((unsigned long)pmd, PMD_ORDER); + pmd_t *pmd = pmd_alloc_one_kernel(mm, addr); + if (pmd) + return virt_to_page(pmd); + else + 
return NULL; +} + +static inline void pmd_free(struct page *pmd) +{ + __free_pages(pmd, PMD_ORDER); } extern pte_t kptbl[(PAGE_SIZE << PGD_ORDER)/sizeof(pte_t)]; diff -prauN linux-2.6.0-test1/include/asm-mips64/pgtable.h wli-2.6.0-test1-37/include/asm-mips64/pgtable.h --- linux-2.6.0-test1/include/asm-mips64/pgtable.h 2003-07-13 20:38:51.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-mips64/pgtable.h 2003-07-14 06:49:00.000000000 -0700 @@ -155,11 +155,13 @@ extern pmd_t empty_bad_pmd_table[2*PAGE_ #define pmd_page(pmd) (pfn_to_page(pmd_phys(pmd) >> PAGE_SHIFT)) #define pmd_page_kernel(pmd) pmd_val(pmd) -static inline unsigned long pgd_page(pgd_t pgd) +static inline unsigned long __pgd_page(pgd_t pgd) { return pgd_val(pgd); } +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) + static inline int pte_none(pte_t pte) { return !(pte_val(pte) & ~_PAGE_GLOBAL); @@ -397,10 +399,16 @@ static inline pte_t pte_modify(pte_t pte /* Find an entry in the second-level page table.. */ static inline pmd_t *pmd_offset(pgd_t * dir, unsigned long address) { - return (pmd_t *) pgd_page(*dir) + + return (pmd_t *)__pgd_page(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PMD - 1)); } +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while(0) +#define pmd_unmap_nested(pmd) do { } while(0) + /* Find an entry in the third-level page table.. */ #define __pte_offset(address) \ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) diff -prauN linux-2.6.0-test1/include/asm-mips64/smp.h wli-2.6.0-test1-37/include/asm-mips64/smp.h --- linux-2.6.0-test1/include/asm-mips64/smp.h 2003-07-13 20:32:34.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-mips64/smp.h 2003-07-14 06:31:10.000000000 -0700 @@ -17,6 +17,7 @@ #include #include +#include #include #define smp_processor_id() (current_thread_info()->cpu) @@ -45,56 +46,17 @@ extern struct call_data_struct *call_dat #define SMP_RESCHEDULE_YOURSELF 0x1 /* XXX braindead */ #define SMP_CALL_FUNCTION 0x2 -#if (NR_CPUS <= _MIPS_SZLONG) - -typedef unsigned long cpumask_t; - -#define CPUMASK_CLRALL(p) (p) = 0 -#define CPUMASK_SETB(p, bit) (p) |= 1UL << (bit) -#define CPUMASK_CLRB(p, bit) (p) &= ~(1UL << (bit)) -#define CPUMASK_TSTB(p, bit) ((p) & (1UL << (bit))) - -#elif (NR_CPUS <= 128) - -/* - * The foll should work till 128 cpus. 
- */ -#define CPUMASK_SIZE (NR_CPUS/_MIPS_SZLONG) -#define CPUMASK_INDEX(bit) ((bit) >> 6) -#define CPUMASK_SHFT(bit) ((bit) & 0x3f) - -typedef struct { - unsigned long _bits[CPUMASK_SIZE]; -} cpumask_t; - -#define CPUMASK_CLRALL(p) (p)._bits[0] = 0, (p)._bits[1] = 0 -#define CPUMASK_SETB(p, bit) (p)._bits[CPUMASK_INDEX(bit)] |= \ - (1UL << CPUMASK_SHFT(bit)) -#define CPUMASK_CLRB(p, bit) (p)._bits[CPUMASK_INDEX(bit)] &= \ - ~(1UL << CPUMASK_SHFT(bit)) -#define CPUMASK_TSTB(p, bit) ((p)._bits[CPUMASK_INDEX(bit)] & \ - (1UL << CPUMASK_SHFT(bit))) - -#else -#error cpumask macros only defined for 128p kernels -#endif - extern cpumask_t phys_cpu_present_map; extern cpumask_t cpu_online_map; -#define cpu_possible(cpu) (phys_cpu_present_map & (1<<(cpu))) -#define cpu_online(cpu) (cpu_online_map & (1<<(cpu))) - -extern inline unsigned int num_online_cpus(void) -{ - return hweight32(cpu_online_map); -} +#define cpu_possible(cpu) cpu_isset(cpu, phys_cpu_present_map) +#define cpu_online(cpu) cpu_isset(cpu, cpu_online_map) -extern volatile unsigned long cpu_callout_map; +extern cpumask_t cpu_callout_map; /* We don't mark CPUs online until __cpu_up(), so we need another measure */ static inline int num_booting_cpus(void) { - return hweight32(cpu_callout_map); + return cpus_weight(cpu_callout_map); } #endif /* CONFIG_SMP */ diff -prauN linux-2.6.0-test1/include/asm-parisc/cacheflush.h wli-2.6.0-test1-37/include/asm-parisc/cacheflush.h --- linux-2.6.0-test1/include/asm-parisc/cacheflush.h 2003-07-13 20:38:43.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-parisc/cacheflush.h 2003-07-14 08:52:52.000000000 -0700 @@ -66,7 +66,7 @@ extern void __flush_dcache_page(struct p static inline void flush_dcache_page(struct page *page) { - if (page->mapping && list_empty(&page->mapping->i_mmap) && + if (page_mapping(page) && list_empty(&page_mapping(page)->i_mmap) && list_empty(&page->mapping->i_mmap_shared)) { set_bit(PG_dcache_dirty, &page->flags); } else { diff -prauN linux-2.6.0-test1/include/asm-parisc/pgalloc.h wli-2.6.0-test1-37/include/asm-parisc/pgalloc.h --- linux-2.6.0-test1/include/asm-parisc/pgalloc.h 2003-07-13 20:38:43.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-parisc/pgalloc.h 2003-07-14 06:49:00.000000000 -0700 @@ -28,12 +28,12 @@ static inline void pgd_free(pgd_t *pgd) /* Three Level Page Table Support for pmd's */ -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmd) +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, struct page *pmd) { - pgd_val(*pgd) = _PAGE_TABLE + __pa((unsigned long)pmd); + pgd_val(*pgd) = _PAGE_TABLE + __pa(page_address(pmd)); } -static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) +static inline pmd_t *pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) { pmd_t *pmd = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); if (pmd) @@ -41,9 +41,18 @@ static inline pmd_t *pmd_alloc_one(struc return pmd; } -static inline void pmd_free(pmd_t *pmd) +static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - free_page((unsigned long)pmd); + pmd_t *pmd = pmd_alloc_one_kernel(mm, addr); + if (pmd) + return virt_to_page(pmd); + else + return NULL; +} + +static inline void pmd_free(struct page *pmd) +{ + __free_page(pmd); } #else @@ -55,7 +64,8 @@ static inline void pmd_free(pmd_t *pmd) * inside the pgd, so has no extra memory associated with it. 
*/ -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm, addr) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm, addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() diff -prauN linux-2.6.0-test1/include/asm-parisc/pgtable.h wli-2.6.0-test1-37/include/asm-parisc/pgtable.h --- linux-2.6.0-test1/include/asm-parisc/pgtable.h 2003-07-13 20:37:32.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-parisc/pgtable.h 2003-07-14 06:49:00.000000000 -0700 @@ -242,7 +242,8 @@ extern unsigned long *empty_zero_page; #ifdef __LP64__ -#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define __pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) /* For 64 bit we have three level tables */ @@ -339,11 +340,17 @@ extern inline pte_t pte_modify(pte_t pte #ifdef __LP64__ #define pmd_offset(dir,address) \ -((pmd_t *) pgd_page(*(dir)) + (((address)>>PMD_SHIFT) & (PTRS_PER_PMD-1))) +((pmd_t *)__pgd_page(*(dir)) + (((address)>>PMD_SHIFT) & (PTRS_PER_PMD-1))) #else #define pmd_offset(dir,addr) ((pmd_t *) dir) #endif +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table.. */ #define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE-1)) #define pte_offset_kernel(pmd, address) \ diff -prauN linux-2.6.0-test1/include/asm-parisc/smp.h wli-2.6.0-test1-37/include/asm-parisc/smp.h --- linux-2.6.0-test1/include/asm-parisc/smp.h 2003-07-13 20:37:15.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-parisc/smp.h 2003-07-14 06:31:10.000000000 -0700 @@ -14,9 +14,10 @@ #ifndef ASSEMBLY #include #include /* for NR_CPUS */ +#include typedef unsigned long address_t; -extern volatile unsigned long cpu_online_map; +extern cpumask_t cpu_online_map; /* @@ -51,22 +52,10 @@ extern void smp_send_reschedule(int cpu) extern unsigned long cpu_present_mask; #define smp_processor_id() (current_thread_info()->cpu) -#define cpu_online(cpu) (cpu_online_map & (1<<(cpu))) +#define cpu_online(cpu) cpu_isset(cpu, cpu_online_map) -#define cpu_possible(cpu) (cpu_present_mask & (1<<(cpu))) +#define cpu_possible(cpu) cpu_isset(cpu, cpu_present_mask) -extern inline unsigned int num_online_cpus(void) -{ - return hweight32(cpu_online_map); -} - -extern inline unsigned int any_online_cpu(unsigned int mask) -{ - if (mask & cpu_online_map) - return __ffs(mask & cpu_online_map); - - return NR_CPUS; -} #endif /* CONFIG_SMP */ #define NO_PROC_ID 0xFF /* No processor magic marker */ diff -prauN linux-2.6.0-test1/include/asm-ppc/pgalloc.h wli-2.6.0-test1-37/include/asm-ppc/pgalloc.h --- linux-2.6.0-test1/include/asm-ppc/pgalloc.h 2003-07-13 20:28:54.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-ppc/pgalloc.h 2003-07-14 06:49:00.000000000 -0700 @@ -15,7 +15,8 @@ extern void pgd_free(pgd_t *pgd); * We don't have any real pmd's, and this code never triggers because * the pgd will always be present.. 
*/ -#define pmd_alloc_one(mm,address) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm,address) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm,addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(x) do { } while (0) #define __pmd_free_tlb(tlb,x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() diff -prauN linux-2.6.0-test1/include/asm-ppc/pgtable.h wli-2.6.0-test1-37/include/asm-ppc/pgtable.h --- linux-2.6.0-test1/include/asm-ppc/pgtable.h 2003-07-13 20:31:51.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-ppc/pgtable.h 2003-07-14 06:49:00.000000000 -0700 @@ -370,8 +370,9 @@ static inline int pgd_bad(pgd_t pgd) { static inline int pgd_present(pgd_t pgd) { return 1; } #define pgd_clear(xp) do { } while (0) -#define pgd_page(pgd) \ +#define __pgd_page(pgd) \ ((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) /* * The following only work if pte_present() is true. @@ -503,6 +504,12 @@ static inline pmd_t * pmd_offset(pgd_t * return (pmd_t *) dir; } +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table.. */ #define pte_index(address) \ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) diff -prauN linux-2.6.0-test1/include/asm-ppc/smp.h wli-2.6.0-test1-37/include/asm-ppc/smp.h --- linux-2.6.0-test1/include/asm-ppc/smp.h 2003-07-13 20:31:21.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-ppc/smp.h 2003-07-14 06:31:10.000000000 -0700 @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef CONFIG_SMP @@ -28,8 +29,8 @@ struct cpuinfo_PPC { }; extern struct cpuinfo_PPC cpu_data[]; -extern unsigned long cpu_online_map; -extern unsigned long cpu_possible_map; +extern cpumask_t cpu_online_map; +extern cpumask_t cpu_possible_map; extern unsigned long smp_proc_in_lock[]; extern volatile unsigned long cpu_callin_map[]; extern int smp_tb_synchronized; @@ -45,21 +46,8 @@ extern void smp_local_timer_interrupt(st #define smp_processor_id() (current_thread_info()->cpu) -#define cpu_online(cpu) (cpu_online_map & (1<<(cpu))) -#define cpu_possible(cpu) (cpu_possible_map & (1<<(cpu))) - -extern inline unsigned int num_online_cpus(void) -{ - return hweight32(cpu_online_map); -} - -extern inline unsigned int any_online_cpu(unsigned int mask) -{ - if (mask & cpu_online_map) - return __ffs(mask & cpu_online_map); - - return NR_CPUS; -} +#define cpu_online(cpu) cpu_isset(cpu, cpu_online_map) +#define cpu_possible(cpu) cpu_isset(cpu, cpu_possible_map) extern int __cpu_up(unsigned int cpu); diff -prauN linux-2.6.0-test1/include/asm-ppc64/mmu_context.h wli-2.6.0-test1-37/include/asm-ppc64/mmu_context.h --- linux-2.6.0-test1/include/asm-ppc64/mmu_context.h 2003-07-13 20:32:43.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-ppc64/mmu_context.h 2003-07-14 06:31:53.000000000 -0700 @@ -143,7 +143,7 @@ switch_mm(struct mm_struct *prev, struct struct task_struct *tsk) { flush_stab(tsk, next); - set_bit(smp_processor_id(), &next->cpu_vm_mask); + cpu_set(smp_processor_id(), next->cpu_vm_mask); } #define deactivate_mm(tsk,mm) do { } while (0) diff -prauN linux-2.6.0-test1/include/asm-ppc64/pgalloc.h wli-2.6.0-test1-37/include/asm-ppc64/pgalloc.h --- linux-2.6.0-test1/include/asm-ppc64/pgalloc.h 2003-07-13 20:29:29.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-ppc64/pgalloc.h 
2003-07-14 06:49:00.000000000 -0700 @@ -26,10 +26,10 @@ pgd_free(pgd_t *pgd) free_page((unsigned long)pgd); } -#define pgd_populate(MM, PGD, PMD) pgd_set(PGD, PMD) +#define pgd_populate(MM, PGD, PMD) pgd_set(PGD, page_address(PMD)) static inline pmd_t * -pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) { pmd_t *pmd; @@ -39,10 +39,19 @@ pmd_alloc_one(struct mm_struct *mm, unsi return pmd; } +static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + pmd_t *pmd = pmd_alloc_one_kernel(mm, addr); + if (pmd) + return virt_to_page(pmd); + else + return NULL; +} + static inline void -pmd_free(pmd_t *pmd) +pmd_free(struct page *pmd) { - free_page((unsigned long)pmd); + __free_page(pmd); } #define __pmd_free_tlb(tlb, pmd) pmd_free(pmd) diff -prauN linux-2.6.0-test1/include/asm-ppc64/pgtable.h wli-2.6.0-test1-37/include/asm-ppc64/pgtable.h --- linux-2.6.0-test1/include/asm-ppc64/pgtable.h 2003-07-13 20:38:37.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-ppc64/pgtable.h 2003-07-14 06:49:00.000000000 -0700 @@ -190,7 +190,8 @@ extern unsigned long empty_zero_page[PAG #define pgd_bad(pgd) ((pgd_val(pgd)) == 0) #define pgd_present(pgd) (pgd_val(pgd) != 0UL) #define pgd_clear(pgdp) (pgd_val(*(pgdp)) = 0UL) -#define pgd_page(pgd) (__bpn_to_ba(pgd_val(pgd))) +#define __pgd_page(pgd) (__bpn_to_ba(pgd_val(pgd))) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) /* * Find an entry in a page-table-directory. We combine the address region @@ -203,12 +204,18 @@ extern unsigned long empty_zero_page[PAG /* Find an entry in the second-level page table.. */ #define pmd_offset(dir,addr) \ - ((pmd_t *) pgd_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) + ((pmd_t *)__pgd_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) /* Find an entry in the third-level page table.. 
*/ #define pte_offset_kernel(dir,addr) \ ((pte_t *) pmd_page_kernel(*(dir)) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) #define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) #define pte_unmap(pte) do { } while(0) diff -prauN linux-2.6.0-test1/include/asm-ppc64/smp.h wli-2.6.0-test1-37/include/asm-ppc64/smp.h --- linux-2.6.0-test1/include/asm-ppc64/smp.h 2003-07-13 20:28:51.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-ppc64/smp.h 2003-07-14 06:31:10.000000000 -0700 @@ -19,6 +19,7 @@ #include #include +#include #include #ifdef CONFIG_SMP @@ -27,31 +28,14 @@ #include -extern unsigned long cpu_online_map; - extern void smp_message_pass(int target, int msg, unsigned long data, int wait); extern void smp_send_tlb_invalidate(int); extern void smp_send_xmon_break(int cpu); struct pt_regs; extern void smp_message_recv(int, struct pt_regs *); -#define cpu_online(cpu) test_bit((cpu), &cpu_online_map) - #define cpu_possible(cpu) paca[cpu].active -static inline unsigned int num_online_cpus(void) -{ - return hweight64(cpu_online_map); -} - -static inline unsigned int any_online_cpu(unsigned int mask) -{ - if (mask & cpu_online_map) - return __ffs(mask & cpu_online_map); - - return NR_CPUS; -} - #define smp_processor_id() (get_paca()->xPacaIndex) /* Since OpenPIC has only 4 IPIs, we use slightly different message numbers. diff -prauN linux-2.6.0-test1/include/asm-ppc64/tlb.h wli-2.6.0-test1-37/include/asm-ppc64/tlb.h --- linux-2.6.0-test1/include/asm-ppc64/tlb.h 2003-07-13 20:32:42.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-ppc64/tlb.h 2003-07-14 06:31:10.000000000 -0700 @@ -49,6 +49,7 @@ static inline void __tlb_remove_tlb_entr struct ppc64_tlb_batch *batch = &ppc64_tlb_batch[cpu]; unsigned long i = batch->index; pte_t pte; + cpumask_t local_cpumask = cpumask_of_cpu(cpu); if (pte_val(*ptep) & _PAGE_HASHPTE) { pte = __pte(pte_update(ptep, _PAGE_HPTEFLAGS, 0)); @@ -61,7 +62,7 @@ static inline void __tlb_remove_tlb_entr if (i == PPC64_TLB_BATCH_NR) { int local = 0; - if (tlb->mm->cpu_vm_mask == (1UL << cpu)) + if (cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) local = 1; flush_hash_range(tlb->mm->context, i, local); @@ -78,8 +79,9 @@ static inline void tlb_flush(struct mmu_ int cpu = smp_processor_id(); struct ppc64_tlb_batch *batch = &ppc64_tlb_batch[cpu]; int local = 0; + cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id()); - if (tlb->mm->cpu_vm_mask == (1UL << smp_processor_id())) + if (cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) local = 1; flush_hash_range(tlb->mm->context, batch->index, local); diff -prauN linux-2.6.0-test1/include/asm-s390/bitops.h wli-2.6.0-test1-37/include/asm-s390/bitops.h --- linux-2.6.0-test1/include/asm-s390/bitops.h 2003-07-13 20:29:27.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-s390/bitops.h 2003-07-14 06:45:57.000000000 -0700 @@ -505,7 +505,7 @@ static inline int __test_bit(unsigned lo unsigned char ch; addr = (unsigned long) ptr + ((nr ^ (__BITOPS_WORDSIZE - 8)) >> 3); - ch = *(unsigned char *) addr; + ch = *(volatile unsigned char *) addr; return (ch >> (nr & 7)) & 1; } diff -prauN linux-2.6.0-test1/include/asm-s390/mmu_context.h wli-2.6.0-test1-37/include/asm-s390/mmu_context.h 
--- linux-2.6.0-test1/include/asm-s390/mmu_context.h 2003-07-13 20:33:49.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-s390/mmu_context.h 2003-07-14 06:45:00.000000000 -0700 @@ -42,7 +42,7 @@ static inline void switch_mm(struct mm_s : : "m" (pgd) ); #endif /* __s390x__ */ } - set_bit(smp_processor_id(), &next->cpu_vm_mask); + cpu_set(smp_processor_id(), next->cpu_vm_mask); } #define deactivate_mm(tsk,mm) do { } while (0) diff -prauN linux-2.6.0-test1/include/asm-s390/pgalloc.h wli-2.6.0-test1-37/include/asm-s390/pgalloc.h --- linux-2.6.0-test1/include/asm-s390/pgalloc.h 2003-07-13 20:35:15.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-s390/pgalloc.h 2003-07-14 06:49:00.000000000 -0700 @@ -61,12 +61,13 @@ static inline void pgd_free(pgd_t *pgd) * We use pmd cache only on s390x, so these are dummy routines. This * code never triggers because the pgd will always be present. */ -#define pmd_alloc_one(mm,address) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm,address) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm,addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(x) do { } while (0) #define __pmd_free_tlb(tlb,x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() #else /* __s390x__ */ -static inline pmd_t * pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr) +static inline pmd_t * pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long vmaddr) { pmd_t *pmd; int i; @@ -79,16 +80,25 @@ static inline pmd_t * pmd_alloc_one(stru return pmd; } -static inline void pmd_free (pmd_t *pmd) +static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - free_pages((unsigned long) pmd, 2); + pmd_t *pmd = pmd_alloc_one_kernel(mm, addr); + if (pmd) + return virt_to_page(pmd); + else + return NULL; +} + +static inline void pmd_free(struct page *pmd) +{ + __free_pages(pmd, 2); } #define __pmd_free_tlb(tlb,pmd) pmd_free(pmd) -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmd) +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, struct page *pmd) { - pgd_val(*pgd) = _PGD_ENTRY | __pa(pmd); + pgd_val(*pgd) = _PGD_ENTRY | __pa(page_address(pmd)); } #endif /* __s390x__ */ diff -prauN linux-2.6.0-test1/include/asm-s390/pgtable.h wli-2.6.0-test1-37/include/asm-s390/pgtable.h --- linux-2.6.0-test1/include/asm-s390/pgtable.h 2003-07-13 20:37:13.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-s390/pgtable.h 2003-07-14 06:49:00.000000000 -0700 @@ -613,6 +613,7 @@ static inline pte_t mk_pte_phys(unsigned /* to find an entry in a page-table-directory */ #define pgd_index(address) ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) #define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address)) +#define pgd_page(pgd) virt_to_page(pgd_page_kernel(pgd)) /* to find an entry in a kernel page-table-directory */ #define pgd_offset_k(address) pgd_offset(&init_mm, address) @@ -634,6 +635,12 @@ extern inline pmd_t * pmd_offset(pgd_t * #endif /* __s390x__ */ +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table.. 
*/ #define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE-1)) #define pte_offset_kernel(pmd, address) \ diff -prauN linux-2.6.0-test1/include/asm-s390/smp.h wli-2.6.0-test1-37/include/asm-s390/smp.h --- linux-2.6.0-test1/include/asm-s390/smp.h 2003-07-13 20:34:30.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-s390/smp.h 2003-07-14 06:31:10.000000000 -0700 @@ -11,6 +11,7 @@ #include #include +#include #include #if defined(__KERNEL__) && defined(CONFIG_SMP) && !defined(__ASSEMBLY__) @@ -28,8 +29,8 @@ typedef struct __u16 cpu; } sigp_info; -extern volatile unsigned long cpu_online_map; -extern volatile unsigned long cpu_possible_map; +extern cpumask_t cpu_online_map; +extern cpumask_t cpu_possible_map; #define NO_PROC_ID 0xFF /* No processor magic marker */ @@ -47,25 +48,8 @@ extern volatile unsigned long cpu_possib #define smp_processor_id() (current_thread_info()->cpu) -#define cpu_online(cpu) (cpu_online_map & (1<<(cpu))) -#define cpu_possible(cpu) (cpu_possible_map & (1<<(cpu))) - -extern inline unsigned int num_online_cpus(void) -{ -#ifndef __s390x__ - return hweight32(cpu_online_map); -#else /* __s390x__ */ - return hweight64(cpu_online_map); -#endif /* __s390x__ */ -} - -extern inline unsigned int any_online_cpu(unsigned int mask) -{ - if (mask & cpu_online_map) - return __ffs(mask & cpu_online_map); - - return NR_CPUS; -} +#define cpu_online(cpu) cpu_isset(cpu, cpu_online_map) +#define cpu_possible(cpu) cpu_isset(cpu, cpu_possible_map) extern __inline__ __u16 hard_smp_processor_id(void) { diff -prauN linux-2.6.0-test1/include/asm-s390/tlbflush.h wli-2.6.0-test1-37/include/asm-s390/tlbflush.h --- linux-2.6.0-test1/include/asm-s390/tlbflush.h 2003-07-13 20:38:52.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-s390/tlbflush.h 2003-07-14 06:44:42.000000000 -0700 @@ -98,13 +98,15 @@ static inline void global_flush_tlb(void static inline void __flush_tlb_mm(struct mm_struct * mm) { + cpumask_t local_cpumask; preempt_disable(); - if (mm->cpu_vm_mask != (1UL << smp_processor_id())) { + local_cpumask = cpumask_of_cpu(smp_processor_id()); + if (!cpus_equal(mm->cpu_vm_mask, local_cpumask)) { /* mm was active on more than one cpu. */ if (mm == current->active_mm && atomic_read(&mm->mm_users) == 1) /* this cpu is the only one using the mm. */ - mm->cpu_vm_mask = 1UL << smp_processor_id(); + mm->cpu_vm_mask = local_cpumask; global_flush_tlb(); } else local_flush_tlb(); diff -prauN linux-2.6.0-test1/include/asm-sh/pgalloc.h wli-2.6.0-test1-37/include/asm-sh/pgalloc.h --- linux-2.6.0-test1/include/asm-sh/pgalloc.h 2003-07-13 20:34:30.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-sh/pgalloc.h 2003-07-14 08:52:52.000000000 -0700 @@ -94,7 +94,8 @@ static inline void pte_free(struct page * inside the pgd, so has no extra memory associated with it. 
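The s390 __flush_tlb_mm() heuristic above now reads in cpumask terms: anything other than equality with the local cpu's mask means another cpu may hold stale entries, so a global flush is required. A simplified sketch of the idiom (flush_mm_tlb() is an illustrative name; preemption handling and the mm_users shortcut are omitted):

	static void flush_mm_tlb(struct mm_struct *mm)
	{
		cpumask_t local = cpumask_of_cpu(smp_processor_id());

		if (cpus_equal(mm->cpu_vm_mask, local))
			local_flush_tlb();	/* mm never ran elsewhere */
		else
			global_flush_tlb();	/* other cpus may cache entries */
	}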
*/ -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one_kernel(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm, addr) ({ BUG(); ((struct page *)2); }) #define pmd_free(x) do { } while (0) #define __pmd_free_tlb(tlb,x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() @@ -115,8 +116,8 @@ static inline pte_t ptep_get_and_clear(p unsigned long pfn = pte_pfn(pte); if (pfn_valid(pfn)) { page = pfn_to_page(pfn); - if (!page->mapping - || list_empty(&page->mapping->i_mmap_shared)) + if (!page_mapping(page) + || list_empty(&page_mapping(page)->i_mmap_shared)) __clear_bit(PG_mapped, &page->flags); } } diff -prauN linux-2.6.0-test1/include/asm-sh/pgtable-2level.h wli-2.6.0-test1-37/include/asm-sh/pgtable-2level.h --- linux-2.6.0-test1/include/asm-sh/pgtable-2level.h 2003-07-13 20:38:46.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-sh/pgtable-2level.h 2003-07-14 06:49:00.000000000 -0700 @@ -48,14 +48,21 @@ static inline void pgd_clear (pgd_t * pg #define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval) #define set_pgd(pgdptr, pgdval) (*(pgdptr) = pgdval) -#define pgd_page(pgd) \ +#define __pgd_page(pgd) \ ((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) { return (pmd_t *) dir; } +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + #define pte_pfn(x) ((unsigned long)(((x).pte >> PAGE_SHIFT))) #define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define pfn_pmd(pfn, prot) __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) diff -prauN linux-2.6.0-test1/include/asm-sparc/pgalloc.h wli-2.6.0-test1-37/include/asm-sparc/pgalloc.h --- linux-2.6.0-test1/include/asm-sparc/pgalloc.h 2003-07-13 20:30:36.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-sparc/pgalloc.h 2003-07-14 06:49:00.000000000 -0700 @@ -38,15 +38,24 @@ BTFIXUPDEF_CALL(void, free_pgd_fast, pgd BTFIXUPDEF_CALL(void, pgd_set, pgd_t *, pmd_t *) #define pgd_set(pgdp,pmdp) BTFIXUP_CALL(pgd_set)(pgdp,pmdp) -#define pgd_populate(MM, PGD, PMD) pgd_set(PGD, PMD) +#define pgd_populate(MM, PGD, PMD) pgd_set(PGD, page_address(PMD)) -BTFIXUPDEF_CALL(pmd_t *, pmd_alloc_one, struct mm_struct *, unsigned long) -#define pmd_alloc_one(mm, address) BTFIXUP_CALL(pmd_alloc_one)(mm, address) +BTFIXUPDEF_CALL(pmd_t *, __pmd_alloc_one, struct mm_struct *, unsigned long) +#define pmd_alloc_one_kernel(mm, address) BTFIXUP_CALL(__pmd_alloc_one)(mm, address) + +static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + pmd_t *pmd = pmd_alloc_one_kernel(mm, addr); + if (pmd) + return virt_to_page(pmd); + else + return NULL; +} BTFIXUPDEF_CALL(void, free_pmd_fast, pmd_t *) #define free_pmd_fast(pmd) BTFIXUP_CALL(free_pmd_fast)(pmd) -#define pmd_free(pmd) free_pmd_fast(pmd) +#define pmd_free(pmd) free_pmd_fast(page_address(pmd)) #define __pmd_free_tlb(tlb, pmd) pmd_free(pmd) BTFIXUPDEF_CALL(void, pmd_populate, pmd_t *, struct page *) diff -prauN linux-2.6.0-test1/include/asm-sparc/pgtable.h wli-2.6.0-test1-37/include/asm-sparc/pgtable.h --- linux-2.6.0-test1/include/asm-sparc/pgtable.h 2003-07-13 20:34:41.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-sparc/pgtable.h 2003-07-14 06:49:00.000000000 -0700 @@ -202,10 +202,11 @@ extern 
unsigned long empty_zero_page; /* */ BTFIXUPDEF_CALL_CONST(struct page *, pmd_page, pmd_t) -BTFIXUPDEF_CALL_CONST(unsigned long, pgd_page, pgd_t) +BTFIXUPDEF_CALL_CONST(unsigned long, __pgd_page, pgd_t) #define pmd_page(pmd) BTFIXUP_CALL(pmd_page)(pmd) -#define pgd_page(pgd) BTFIXUP_CALL(pgd_page)(pgd) +#define __pgd_page(pgd) BTFIXUP_CALL(__pgd_page)(pgd) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) BTFIXUPDEF_SETHI(none_mask) BTFIXUPDEF_CALL_CONST(int, pte_present, pte_t) @@ -352,6 +353,11 @@ extern __inline__ pte_t pte_modify(pte_t /* Find an entry in the second-level page table.. */ BTFIXUPDEF_CALL(pmd_t *, pmd_offset, pgd_t *, unsigned long) #define pmd_offset(dir,addr) BTFIXUP_CALL(pmd_offset)(dir,addr) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) /* Find an entry in the third-level page table.. */ BTFIXUPDEF_CALL(pte_t *, pte_offset_kernel, pmd_t *, unsigned long) diff -prauN linux-2.6.0-test1/include/asm-sparc/smp.h wli-2.6.0-test1-37/include/asm-sparc/smp.h --- linux-2.6.0-test1/include/asm-sparc/smp.h 2003-07-13 20:31:58.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-sparc/smp.h 2003-07-14 06:31:10.000000000 -0700 @@ -8,6 +8,7 @@ #include #include +#include #include #include diff -prauN linux-2.6.0-test1/include/asm-sparc64/bitops.h wli-2.6.0-test1-37/include/asm-sparc64/bitops.h --- linux-2.6.0-test1/include/asm-sparc64/bitops.h 2003-07-13 20:30:48.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-sparc64/bitops.h 2003-07-14 06:43:52.000000000 -0700 @@ -156,6 +156,14 @@ static __inline__ int ffs(int x) #ifdef ULTRA_HAS_POPULATION_COUNT +static __inline__ unsigned int hweight64(unsigned long w) +{ + unsigned int res; + + __asm__ ("popc %1,%0" : "=r" (res) : "r" (w)); + return res; +} + static __inline__ unsigned int hweight32(unsigned int w) { unsigned int res; @@ -182,6 +190,7 @@ static __inline__ unsigned int hweight8( #else +#define hweight64(x) generic_hweight64(x) #define hweight32(x) generic_hweight32(x) #define hweight16(x) generic_hweight16(x) #define hweight8(x) generic_hweight8(x) diff -prauN linux-2.6.0-test1/include/asm-sparc64/mmu_context.h wli-2.6.0-test1-37/include/asm-sparc64/mmu_context.h --- linux-2.6.0-test1/include/asm-sparc64/mmu_context.h 2003-07-13 20:38:48.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-sparc64/mmu_context.h 2003-07-14 06:43:52.000000000 -0700 @@ -125,7 +125,7 @@ static inline void switch_mm(struct mm_s } { - unsigned long vm_mask = (1UL << smp_processor_id()); + int cpu = smp_processor_id(); /* Even if (mm == old_mm) we _must_ check * the cpu_vm_mask. If we do not we could @@ -133,8 +133,8 @@ static inline void switch_mm(struct mm_s * smp_flush_tlb_{page,range,mm} on sparc64 * and lazy tlb switches work. -DaveM */ - if (!ctx_valid || !(mm->cpu_vm_mask & vm_mask)) { - mm->cpu_vm_mask |= vm_mask; + if (!ctx_valid || !cpu_isset(cpu, mm->cpu_vm_mask)) { + cpu_set(cpu, mm->cpu_vm_mask); __flush_tlb_mm(CTX_HWBITS(mm->context), SECONDARY_CONTEXT); } } @@ -148,14 +148,14 @@ extern void __flush_tlb_mm(unsigned long /* Activate a new MM instance for the current task. 
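The new sparc64 hweight64() above exists to serve the generic mask code: once cpumask_t is an array of unsigned longs, counting online cpus becomes a bitmap_weight() over NR_CPUS bits, one hweight per word (a single popc on UltraSPARC when available). A sketch of how the chain bottoms out, assuming cpus_weight() wraps bitmap_weight() over the struct's mask[] array (the asm-generic cpumask headers are not shown in this patch excerpt; the function name here is illustrative):

	/* num_online_cpus() after the conversion, spelled out */
	static inline int online_cpus_sketch(void)
	{
		/* cpumask_t is struct { unsigned long mask[CPU_ARRAY_SIZE]; } */
		return bitmap_weight(cpu_online_map.mask, NR_CPUS);
	}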
*/ static inline void activate_mm(struct mm_struct *active_mm, struct mm_struct *mm) { - unsigned long vm_mask; + int cpu; spin_lock(&mm->page_table_lock); if (!CTX_VALID(mm->context)) get_new_mmu_context(mm); - vm_mask = (1UL << smp_processor_id()); - if (!(mm->cpu_vm_mask & vm_mask)) - mm->cpu_vm_mask |= vm_mask; + cpu = smp_processor_id(); + if (!cpu_isset(cpu, mm->cpu_vm_mask)) + cpu_set(cpu, mm->cpu_vm_mask); spin_unlock(&mm->page_table_lock); load_secondary_context(mm); diff -prauN linux-2.6.0-test1/include/asm-sparc64/pgalloc.h wli-2.6.0-test1-37/include/asm-sparc64/pgalloc.h --- linux-2.6.0-test1/include/asm-sparc64/pgalloc.h 2003-07-13 20:38:38.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-sparc64/pgalloc.h 2003-07-14 06:49:00.000000000 -0700 @@ -132,7 +132,7 @@ static __inline__ void free_pgd_slow(pgd #define DCACHE_COLOR(address) 0 #endif -#define pgd_populate(MM, PGD, PMD) pgd_set(PGD, PMD) +#define pgd_populate(MM, PGD, PMD) pgd_set(PGD, page_address(PMD)) static __inline__ pmd_t *pmd_alloc_one_fast(struct mm_struct *mm, unsigned long address) { @@ -153,7 +153,7 @@ static __inline__ pmd_t *pmd_alloc_one_f return (pmd_t *)ret; } -static __inline__ pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) +static __inline__ pmd_t *pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { pmd_t *pmd; @@ -166,6 +166,15 @@ static __inline__ pmd_t *pmd_alloc_one(s return pmd; } +static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + pmd_t *pmd = pmd_alloc_one_kernel(mm, addr); + if (pmd) + return virt_to_page(pmd); + else + return NULL; +} + static __inline__ void free_pmd_fast(pmd_t *pmd) { unsigned long color = DCACHE_COLOR((unsigned long)pmd); @@ -222,7 +231,7 @@ static __inline__ void free_pte_slow(pte #define pte_free_kernel(pte) free_pte_fast(pte) #define pte_free(pte) free_pte_fast(page_address(pte)) -#define pmd_free(pmd) free_pmd_fast(pmd) +#define pmd_free(pmd) free_pmd_fast(page_address(pmd)) #define pgd_free(pgd) free_pgd_fast(pgd) #define pgd_alloc(mm) get_pgd_fast() diff -prauN linux-2.6.0-test1/include/asm-sparc64/pgtable.h wli-2.6.0-test1-37/include/asm-sparc64/pgtable.h --- linux-2.6.0-test1/include/asm-sparc64/pgtable.h 2003-07-13 20:29:22.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-sparc64/pgtable.h 2003-07-14 06:49:00.000000000 -0700 @@ -228,7 +228,8 @@ static inline pte_t pte_modify(pte_t ori (pgd_val(*(pgdp)) = (__pa((unsigned long) (pmdp)) >> 11UL)) #define __pmd_page(pmd) ((unsigned long) __va((pmd_val(pmd)<<11UL))) #define pmd_page(pmd) virt_to_page((void *)__pmd_page(pmd)) -#define pgd_page(pgd) ((unsigned long) __va((pgd_val(pgd)<<11UL))) +#define __pgd_page(pgd) ((unsigned long) __va((pgd_val(pgd)<<11UL))) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) #define pte_none(pte) (!pte_val(pte)) #define pte_present(pte) (pte_val(pte) & _PAGE_PRESENT) #define pte_clear(pte) (pte_val(*(pte)) = 0UL) @@ -270,8 +271,13 @@ static inline pte_t pte_modify(pte_t ori #define pgd_offset_k(address) pgd_offset(&init_mm, address) /* Find an entry in the second-level page table.. 
*/
-#define pmd_offset(dir, address) ((pmd_t *) pgd_page(*(dir)) + \
-	((address >> PMD_SHIFT) & (REAL_PTRS_PER_PMD-1)))
+#define pmd_offset(dir, address) ((pmd_t *)__pgd_page(*(dir)) + \
+	((address >> PMD_SHIFT) & (REAL_PTRS_PER_PMD-1)))
+#define pmd_offset_kernel(pgd, addr)	pmd_offset(pgd, addr)
+#define pmd_offset_map(pgd, addr)	pmd_offset(pgd, addr)
+#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr)
+#define pmd_unmap(pmd)			do { } while (0)
+#define pmd_unmap_nested(pmd)		do { } while (0)
 
 /* Find an entry in the third-level page table.. */
 #define pte_index(dir, address) ((pte_t *) __pmd_page(*(dir)) + \
diff -prauN linux-2.6.0-test1/include/asm-sparc64/smp.h wli-2.6.0-test1-37/include/asm-sparc64/smp.h
--- linux-2.6.0-test1/include/asm-sparc64/smp.h	2003-07-13 20:32:41.000000000 -0700
+++ wli-2.6.0-test1-37/include/asm-sparc64/smp.h	2003-07-14 06:43:52.000000000 -0700
@@ -14,6 +14,7 @@
 
 #ifndef __ASSEMBLY__
 
+#include <linux/cpumask.h>
 #include
 
 /* PROM provided per-processor information we need
@@ -68,25 +69,14 @@ extern cpuinfo_sparc cpu_data[NR_CPUS];
 
 extern unsigned char boot_cpu_id;
 
-extern unsigned long phys_cpu_present_map;
-#define cpu_possible(cpu)	(phys_cpu_present_map & (1UL << (cpu)))
+extern cpumask_t phys_cpu_present_map;
+#define cpu_possible(cpu)	cpu_isset(cpu, phys_cpu_present_map)
 
-extern unsigned long cpu_online_map;
-#define cpu_online(cpu)		(cpu_online_map & (1UL << (cpu)))
-
-extern atomic_t sparc64_num_cpus_online;
-#define num_online_cpus()	(atomic_read(&sparc64_num_cpus_online))
+#define cpu_online(cpu)		cpu_isset(cpu, cpu_online_map)
 
 extern atomic_t sparc64_num_cpus_possible;
 #define num_possible_cpus()	(atomic_read(&sparc64_num_cpus_possible))
 
-static inline unsigned int any_online_cpu(unsigned long mask)
-{
-	if ((mask &= cpu_online_map) != 0UL)
-		return __ffs(mask);
-	return NR_CPUS;
-}
-
 /*
  * General functions that each host system must provide.
  */
diff -prauN linux-2.6.0-test1/include/asm-um/pgalloc.h wli-2.6.0-test1-37/include/asm-um/pgalloc.h
--- linux-2.6.0-test1/include/asm-um/pgalloc.h	2003-07-13 20:37:26.000000000 -0700
+++ wli-2.6.0-test1-37/include/asm-um/pgalloc.h	2003-07-14 06:49:00.000000000 -0700
@@ -42,7 +42,8 @@ static inline void pte_free(struct page
  * inside the pgd, so has no extra memory associated with it.
  */
-#define pmd_alloc_one(mm, addr)		({ BUG(); ((pmd_t *)2); })
+#define pmd_alloc_one(mm, addr)		({ BUG(); ((struct page *)2); })
+#define pmd_alloc_one_kernel(mm, addr)	({ BUG(); ((pmd_t *)2); })
 #define pmd_free(x)			do { } while (0)
 #define __pmd_free_tlb(tlb,x)		do { } while (0)
 #define pgd_populate(mm, pmd, pte)	BUG()
diff -prauN linux-2.6.0-test1/include/asm-um/pgtable.h wli-2.6.0-test1-37/include/asm-um/pgtable.h
--- linux-2.6.0-test1/include/asm-um/pgtable.h	2003-07-13 20:39:37.000000000 -0700
+++ wli-2.6.0-test1-37/include/asm-um/pgtable.h	2003-07-14 06:49:00.000000000 -0700
@@ -374,6 +374,12 @@ static inline pmd_t * pmd_offset(pgd_t *
 	return (pmd_t *) dir;
 }
 
+#define pmd_offset_kernel(pgd, addr)	pmd_offset(pgd, addr)
+#define pmd_offset_map(pgd, addr)	pmd_offset(pgd, addr)
+#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr)
+#define pmd_unmap(pmd)			do { } while (0)
+#define pmd_unmap_nested(pmd)		do { } while (0)
+
 /* Find an entry in the third-level page table..
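Architectures that cannot (or need not) kmap their pmds, like sparc64 and uml above, stub the new accessors out to plain pmd_offset(). The point of the extra names is the walk discipline they impose on generic code: a mapped pmd must be dropped with pmd_unmap(), exactly as a mapped pte is dropped with pte_unmap(). A hypothetical read-only walker (not from the patch) showing the pairing:

	static void walk_one_address(struct mm_struct *mm, unsigned long addr)
	{
		pgd_t *pgd = pgd_offset(mm, addr);
		pmd_t *pmd;
		pte_t *pte;

		if (pgd_none(*pgd))
			return;
		pmd = pmd_offset_map(pgd, addr);  /* may kmap_atomic() on highpmd arches */
		if (pmd_present(*pmd)) {
			pte = pte_offset_map(pmd, addr);
			/* ... inspect *pte here ... */
			pte_unmap(pte);
		}
		pmd_unmap(pmd);                   /* compiles away in the stubs above */
	}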
*/ #define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) #define pte_offset_kernel(dir, address) \ diff -prauN linux-2.6.0-test1/include/asm-um/smp.h wli-2.6.0-test1-37/include/asm-um/smp.h --- linux-2.6.0-test1/include/asm-um/smp.h 2003-07-13 20:28:54.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-um/smp.h 2003-07-14 06:31:10.000000000 -0700 @@ -1,13 +1,14 @@ #ifndef __UM_SMP_H #define __UM_SMP_H -extern unsigned long cpu_online_map; - #ifdef CONFIG_SMP #include "linux/config.h" #include "linux/bitops.h" #include "asm/current.h" +#include "linux/cpumask.h" + +extern cpumask_t cpu_online_map; #define smp_processor_id() (current->thread_info->cpu) #define cpu_logical_map(n) (n) @@ -16,16 +17,11 @@ extern unsigned long cpu_online_map; extern int hard_smp_processor_id(void); #define NO_PROC_ID -1 -#define cpu_online(cpu) (cpu_online_map & (1<<(cpu))) +#define cpu_online(cpu) cpu_isset(cpu, cpu_online_map) extern int ncpus; #define cpu_possible(cpu) (cpu < ncpus) -extern inline unsigned int num_online_cpus(void) -{ - return(hweight32(cpu_online_map)); -} - extern inline void smp_cpus_done(unsigned int maxcpus) { } diff -prauN linux-2.6.0-test1/include/asm-v850/pgtable.h wli-2.6.0-test1-37/include/asm-v850/pgtable.h --- linux-2.6.0-test1/include/asm-v850/pgtable.h 2003-07-13 20:36:32.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-v850/pgtable.h 2003-07-14 06:49:00.000000000 -0700 @@ -13,6 +13,11 @@ typedef pte_t *pte_addr_t; #define pgd_clear(pgdp) ((void)0) #define pmd_offset(a, b) ((void *)0) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) #define kern_addr_valid(addr) (1) diff -prauN linux-2.6.0-test1/include/asm-x86_64/mpspec.h wli-2.6.0-test1-37/include/asm-x86_64/mpspec.h --- linux-2.6.0-test1/include/asm-x86_64/mpspec.h 2003-07-13 20:33:50.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-x86_64/mpspec.h 2003-07-14 06:31:10.000000000 -0700 @@ -171,7 +171,7 @@ extern int quad_local_to_mp_bus_id [NR_C extern int mp_bus_id_to_pci_bus [MAX_MP_BUSSES]; extern unsigned int boot_cpu_physical_apicid; -extern unsigned long phys_cpu_present_map; +extern cpumask_t phys_cpu_present_map; extern int smp_found_config; extern void find_smp_config (void); extern void get_smp_config (void); diff -prauN linux-2.6.0-test1/include/asm-x86_64/pgalloc.h wli-2.6.0-test1-37/include/asm-x86_64/pgalloc.h --- linux-2.6.0-test1/include/asm-x86_64/pgalloc.h 2003-07-13 20:34:33.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-x86_64/pgalloc.h 2003-07-14 06:49:00.000000000 -0700 @@ -10,7 +10,7 @@ #define pmd_populate_kernel(mm, pmd, pte) \ set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte))) #define pgd_populate(mm, pgd, pmd) \ - set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pmd))) + set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(page_address(pmd)))) static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) { @@ -22,18 +22,25 @@ extern __inline__ pmd_t *get_pmd(void) return (pmd_t *)get_zeroed_page(GFP_KERNEL); } -extern __inline__ void pmd_free(pmd_t *pmd) +extern __inline__ void pmd_free(struct page *pmd) { - if ((unsigned long)pmd & (PAGE_SIZE-1)) - BUG(); - free_page((unsigned long)pmd); + __free_page(pmd); } -static inline pmd_t *pmd_alloc_one (struct mm_struct *mm, unsigned long addr) +static inline pmd_t *pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long 
addr) { return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); } +static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + pmd_t *pmd = pmd_alloc_one_kernel(mm, addr); + if (pmd) + return virt_to_page(pmd); + else + return NULL; +} + static inline pgd_t *pgd_alloc (struct mm_struct *mm) { return (pgd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); diff -prauN linux-2.6.0-test1/include/asm-x86_64/pgtable.h wli-2.6.0-test1-37/include/asm-x86_64/pgtable.h --- linux-2.6.0-test1/include/asm-x86_64/pgtable.h 2003-07-13 20:36:38.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-x86_64/pgtable.h 2003-07-14 06:49:00.000000000 -0700 @@ -98,8 +98,9 @@ static inline void set_pml4(pml4_t *dst, pml4_val(*dst) = pml4_val(val); } -#define pgd_page(pgd) \ +#define __pgd_page(pgd) \ ((unsigned long) __va(pgd_val(pgd) & PHYSICAL_PAGE_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) #define ptep_get_and_clear(xp) __pte(xchg(&(xp)->pte, 0)) #define pte_same(a, b) ((a).pte == (b).pte) @@ -332,8 +333,13 @@ static inline pgd_t *current_pgd_offset_ #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) -#define pmd_offset(dir, address) ((pmd_t *) pgd_page(*(dir)) + \ +#define pmd_offset(dir, address) ((pmd_t *)__pgd_page(*(dir)) + \ pmd_index(address)) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) #define pmd_none(x) (!pmd_val(x)) #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) diff -prauN linux-2.6.0-test1/include/asm-x86_64/smp.h wli-2.6.0-test1-37/include/asm-x86_64/smp.h --- linux-2.6.0-test1/include/asm-x86_64/smp.h 2003-07-13 20:28:53.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-x86_64/smp.h 2003-07-14 06:31:10.000000000 -0700 @@ -7,6 +7,7 @@ #ifndef __ASSEMBLY__ #include #include +#include #include extern int disable_apic; #endif @@ -35,8 +36,8 @@ struct pt_regs; */ extern void smp_alloc_memory(void); -extern unsigned long phys_cpu_present_map; -extern unsigned long cpu_online_map; +extern cpumask_t phys_cpu_present_map; +extern cpumask_t cpu_online_map; extern volatile unsigned long smp_invalidate_needed; extern int pic_mode; extern void smp_flush_tlb(void); @@ -56,35 +57,17 @@ void smp_stop_cpu(void); * compresses data structures. */ -extern volatile unsigned long cpu_callout_map; +extern cpumask_t cpu_callout_map; -#define cpu_possible(cpu) (cpu_callout_map & (1<<(cpu))) -#define cpu_online(cpu) (cpu_online_map & (1<<(cpu))) - -#define for_each_cpu(cpu, mask) \ - for(mask = cpu_online_map; \ - cpu = __ffs(mask), mask != 0; \ - mask &= ~(1UL< diff -prauN linux-2.6.0-test1/include/asm-x86_64/topology.h wli-2.6.0-test1-37/include/asm-x86_64/topology.h --- linux-2.6.0-test1/include/asm-x86_64/topology.h 2003-07-13 20:36:32.000000000 -0700 +++ wli-2.6.0-test1-37/include/asm-x86_64/topology.h 2003-07-14 06:31:10.000000000 -0700 @@ -8,13 +8,13 @@ /* Map the K8 CPU local memory controllers to a simple 1:1 CPU:NODE topology */ extern int fake_node; -extern unsigned long cpu_online_map; +extern cpumask_t cpu_online_map; #define cpu_to_node(cpu) (fake_node ? 0 : (cpu)) #define memblk_to_node(memblk) (fake_node ? 
0 : (memblk)) #define parent_node(node) (node) #define node_to_first_cpu(node) (fake_node ? 0 : (node)) -#define node_to_cpu_mask(node) (fake_node ? cpu_online_map : (1UL << (node))) +#define node_to_cpu_mask(node) (fake_node ? cpu_online_map : cpumask_of_cpu(node)) #define node_to_memblk(node) (node) #define NODE_BALANCE_RATE 30 /* CHECKME */ diff -prauN linux-2.6.0-test1/include/linux/bitmap.h wli-2.6.0-test1-37/include/linux/bitmap.h --- linux-2.6.0-test1/include/linux/bitmap.h 1969-12-31 16:00:00.000000000 -0800 +++ wli-2.6.0-test1-37/include/linux/bitmap.h 2003-07-14 07:05:31.000000000 -0700 @@ -0,0 +1,149 @@ +#ifndef __LINUX_BITMAP_H +#define __LINUX_BITMAP_H + +#ifndef __ASSEMBLY__ + +#include +#include +#include +#include +#include +#include + +static inline int bitmap_empty(const volatile unsigned long *bitmap, int bits) +{ + int k, nr = bits/BITS_PER_LONG; + for (k = 0; k < nr; ++k) + if (bitmap[k]) + return 0; + + if (bits % BITS_PER_LONG) + if (bitmap[k] & ((1UL << (bits % BITS_PER_LONG)) - 1)) + return 0; + + return 1; +} + +static inline int bitmap_full(const volatile unsigned long *bitmap, int bits) +{ + int k, nr = bits/BITS_PER_LONG; + for (k = 0; k < nr; ++k) + if (~bitmap[k]) + return 0; + + if (bits % BITS_PER_LONG) + if (~bitmap[k] & ((1UL << (bits % BITS_PER_LONG)) - 1)) + return 0; + + return 1; +} + +static inline int bitmap_equal(const volatile unsigned long *bitmap1, volatile unsigned long *bitmap2, int bits) +{ + int k, nr = bits/BITS_PER_LONG; + for (k = 0; k < nr; ++k) + if (bitmap1[k] != bitmap2[k]) + return 0; + + if (bits % BITS_PER_LONG) + if ((bitmap1[k] ^ bitmap2[k]) & ((1UL << (bits % BITS_PER_LONG)) - 1)) + return 0; + + return 1; +} + +static inline void bitmap_complement(volatile unsigned long *bitmap, int bits) +{ + int k, nr = BITS_TO_LONGS(bits); + + for (k = 0; k < nr; ++k) + bitmap[k] = ~bitmap[k]; +} + +static inline void bitmap_clear(volatile unsigned long *bitmap, int bits) +{ + CLEAR_BITMAP((unsigned long *)bitmap, bits); +} + +static inline void bitmap_fill(volatile unsigned long *bitmap, int bits) +{ + memset((unsigned long *)bitmap, 0xff, BITS_TO_LONGS(bits)*sizeof(unsigned long)); +} + +static inline void bitmap_copy(volatile unsigned long *dst, const volatile unsigned long *src, int bits) +{ + memcpy((unsigned long *)dst, (unsigned long *)src, BITS_TO_LONGS(bits)*sizeof(unsigned long)); +} + +static inline void bitmap_shift_left(volatile unsigned long *,const volatile unsigned long *,int,int); +static inline void bitmap_shift_right(volatile unsigned long *dst, const volatile unsigned long *src, int shift, int bits) +{ + int k, nr = bits - shift; + DECLARE_BITMAP(__shr_tmp, bits); + + bitmap_clear(__shr_tmp, bits); + for (k = 0; k < nr; ++k) + if (test_bit(k + shift, src)) + set_bit(k, __shr_tmp); + bitmap_copy(dst, __shr_tmp, bits); +} + +static inline void bitmap_shift_left(volatile unsigned long *dst, const volatile unsigned long *src, int shift, int bits) +{ + int k; + DECLARE_BITMAP(__shl_tmp, bits); + + bitmap_clear(__shl_tmp, bits); + for (k = bits; k >= shift; --k) + if (test_bit(k - shift, src)) + set_bit(k, __shl_tmp); + bitmap_copy(dst, __shl_tmp, bits); +} + +static inline void bitmap_and(volatile unsigned long *dst, const volatile unsigned long *bitmap1, const volatile unsigned long *bitmap2, int bits) +{ + int k, nr = BITS_TO_LONGS(bits); + + for (k = 0; k < nr; ++k) + dst[k] = bitmap1[k] & bitmap2[k]; +} + +static inline void bitmap_or(volatile unsigned long *dst, const volatile unsigned long *bitmap1, const volatile 
unsigned long *bitmap2, int bits) +{ + int k, nr = BITS_TO_LONGS(bits); + + for (k = 0; k < nr; ++k) + dst[k] = bitmap1[k] | bitmap2[k]; +} + +#if BITS_PER_LONG == 32 +static inline int bitmap_weight(const volatile unsigned long *bitmap, int bits) +{ + int k, w = 0, nr = bits/BITS_PER_LONG; + + for (k = 0; k < nr; ++k) + w += hweight32(bitmap[k]); + + if (bits % BITS_PER_LONG) + w+= hweight32(bitmap[k] & ((1UL << (bits % BITS_PER_LONG)) - 1)); + + return w; +} +#else +static inline int bitmap_weight(const volatile unsigned long *bitmap, int bits) +{ + int k, w = 0, nr = bits/BITS_PER_LONG; + + for (k = 0; k < nr; ++k) + w += hweight64(bitmap[k]); + + if (bits % BITS_PER_LONG) + w += hweight64(bitmap[k] & ((1UL << (bits % BITS_PER_LONG)) - 1)); + + return w; +} +#endif + +#endif /* __ASSEMBLY__ */ + +#endif /* __LINUX_BITMAP_H */ diff -prauN linux-2.6.0-test1/include/linux/cpumask.h wli-2.6.0-test1-37/include/linux/cpumask.h --- linux-2.6.0-test1/include/linux/cpumask.h 1969-12-31 16:00:00.000000000 -0800 +++ wli-2.6.0-test1-37/include/linux/cpumask.h 2003-07-14 06:31:10.000000000 -0700 @@ -0,0 +1,62 @@ +#ifndef __LINUX_CPUMASK_H +#define __LINUX_CPUMASK_H + +#include +#include +#include +#include +#include + +#define CPU_ARRAY_SIZE BITS_TO_LONGS(NR_CPUS) + +struct cpumask +{ + unsigned long mask[CPU_ARRAY_SIZE]; +}; + +typedef struct cpumask cpumask_t; + +#ifdef CONFIG_SMP +#include +#else +#include +#endif + +#if NR_CPUS <= 4*BITS_PER_LONG +#include +#else +#include +#endif + + +#ifdef CONFIG_SMP + +extern cpumask_t cpu_online_map; + +#define num_online_cpus() cpus_weight(cpu_online_map) +#define cpu_online(cpu) cpu_isset(cpu, cpu_online_map) +#else +#define cpu_online_map cpumask_of_cpu(0) +#define num_online_cpus() 1 +#define cpu_online(cpu) ({ BUG_ON((cpu) != 0); 1; }) +#endif + +static inline int next_online_cpu(int cpu, cpumask_t map) +{ + do + cpu = next_cpu_const(cpu, map); + while (cpu < NR_CPUS && !cpu_online(cpu)); + return cpu; +} + +#define for_each_cpu(cpu, map) \ + for (cpu = first_cpu_const(map); \ + cpu < NR_CPUS; \ + cpu = next_cpu_const(cpu,map)) + +#define for_each_online_cpu(cpu, map) \ + for (cpu = first_cpu_const(map); \ + cpu < NR_CPUS; \ + cpu = next_online_cpu(cpu,map)) + +#endif /* __LINUX_CPUMASK_H */ diff -prauN linux-2.6.0-test1/include/linux/file.h wli-2.6.0-test1-37/include/linux/file.h --- linux-2.6.0-test1/include/linux/file.h 2003-07-13 20:28:53.000000000 -0700 +++ wli-2.6.0-test1-37/include/linux/file.h 2003-07-14 09:45:14.000000000 -0700 @@ -9,6 +9,7 @@ #include #include #include +#include /* * The default fd array needs to be at least BITS_PER_LONG, @@ -64,8 +65,13 @@ static inline struct file * fcheck_files { struct file * file = NULL; - if (fd < files->max_fds) - file = files->fd[fd]; + if (fd < files->max_fds) { + struct file ** fd_array; + rmb(); + fd_array = files->fd; + read_barrier_depends(); + file = fd_array[fd]; + } return file; } diff -prauN linux-2.6.0-test1/include/linux/fs.h wli-2.6.0-test1-37/include/linux/fs.h --- linux-2.6.0-test1/include/linux/fs.h 2003-07-13 20:32:28.000000000 -0700 +++ wli-2.6.0-test1-37/include/linux/fs.h 2003-07-17 15:01:26.000000000 -0700 @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include struct iovec; @@ -309,11 +311,29 @@ struct address_space_operations { loff_t offset, unsigned long nr_segs); }; +#if NR_CPUS > 8 +typedef rwlock_t mapping_rwlock_t; +#define mapping_rdlock(lock) read_lock(lock) +#define mapping_rdunlock(lock) read_unlock(lock) +#define mapping_wrlock(lock) write_lock(lock) 
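One boundary case in the bitmap_shift_left() defined a little earlier deserves care if it is reused: valid bit indices run 0..bits-1, but its copy loop starts at k = bits, so the top iteration sets a bit one past the range (and one word past the on-stack temporary when bits is a multiple of BITS_PER_LONG). A bounds-safe variant, same algorithm with only the loop bound tightened (the _safe name is ours, not the patch's):

	static inline void bitmap_shift_left_safe(volatile unsigned long *dst,
			const volatile unsigned long *src, int shift, int bits)
	{
		int k;
		DECLARE_BITMAP(tmp, bits);

		bitmap_clear(tmp, bits);
		for (k = bits - 1; k >= shift; --k)	/* was k = bits: off the end */
			if (test_bit(k - shift, src))
				set_bit(k, tmp);
		bitmap_copy(dst, tmp, bits);
	}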
+#define mapping_wrunlock(lock) write_unlock(lock) +#define mapping_rwlock_init(lock) rwlock_init(lock) +#define MAPPING_RW_LOCK_UNLOCKED RW_LOCK_UNLOCKED +#else +typedef spinlock_t mapping_rwlock_t; +#define mapping_rdlock(lock) spin_lock(lock) +#define mapping_rdunlock(lock) spin_unlock(lock) +#define mapping_wrlock(lock) spin_lock(lock) +#define mapping_wrunlock(lock) spin_unlock(lock) +#define mapping_rwlock_init(lock) spin_lock_init(lock) +#define MAPPING_RW_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED +#endif + struct backing_dev_info; struct address_space { struct inode *host; /* owner: inode, block_device */ struct radix_tree_root page_tree; /* radix tree of all pages */ - spinlock_t page_lock; /* and spinlock protecting it */ + mapping_rwlock_t page_lock; /* and spinlock protecting it */ struct list_head clean_pages; /* list of clean pages */ struct list_head dirty_pages; /* list of dirty pages */ struct list_head locked_pages; /* list of locked pages */ @@ -322,7 +342,7 @@ struct address_space { struct address_space_operations *a_ops; /* methods */ struct list_head i_mmap; /* list of private mappings */ struct list_head i_mmap_shared; /* list of shared mappings */ - struct semaphore i_shared_sem; /* protect both above lists */ + spinlock_t i_shared_lock; /* protect both above lists */ unsigned long dirtied_when; /* jiffies of first page dirtying */ int gfp_mask; /* how to allocate the pages */ struct backing_dev_info *backing_dev_info; /* device readahead, etc */ @@ -487,8 +507,37 @@ struct file_ra_state { unsigned long mmap_miss; /* Cache miss stat for mmap accesses */ }; +struct file_list { + spinlock_t lock; + struct list_head list; +}; + +static inline void file_list_init(struct file_list *flist) +{ + int cpu; + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + INIT_LIST_HEAD(&flist[cpu].list); + spin_lock_init(&flist[cpu].lock); + } +} + +static inline void file_list_lock_all(struct file_list *flist) +{ + int cpu; + for (cpu = 0; cpu < NR_CPUS; ++cpu) + spin_lock(&flist[cpu].lock); +} + +static inline void file_list_unlock_all(struct file_list *flist) +{ + int cpu; + for (cpu = NR_CPUS - 1; cpu >= 0; --cpu) + spin_unlock(&flist[cpu].lock); +} + struct file { struct list_head f_list; + struct file_list *f_container; struct dentry *f_dentry; struct vfsmount *f_vfsmnt; struct file_operations *f_op; @@ -511,9 +560,6 @@ struct file { struct list_head f_ep_links; spinlock_t f_ep_lock; }; -extern spinlock_t files_lock; -#define file_list_lock() spin_lock(&files_lock); -#define file_list_unlock() spin_unlock(&files_lock); #define get_file(x) atomic_inc(&(x)->f_count) #define file_count(x) atomic_read(&(x)->f_count) @@ -670,7 +716,7 @@ struct super_block { struct list_head s_dirty; /* dirty inodes */ struct list_head s_io; /* parked for writeback */ struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ - struct list_head s_files; + struct file_list s_file_lists[NR_CPUS]; struct block_device *s_bdev; struct list_head s_instances; @@ -1256,8 +1302,6 @@ static inline void insert_inode_hash(str } extern struct file * get_empty_filp(void); -extern void file_move(struct file *f, struct list_head *list); -extern void file_kill(struct file *f); struct bio; extern int submit_bio(int, struct bio *); extern int bdev_read_only(struct block_device *); diff -prauN linux-2.6.0-test1/include/linux/fs_struct.h wli-2.6.0-test1-37/include/linux/fs_struct.h --- linux-2.6.0-test1/include/linux/fs_struct.h 2003-07-13 20:36:36.000000000 -0700 +++ wli-2.6.0-test1-37/include/linux/fs_struct.h 2003-07-14 
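The file_list helpers above replace the global files_lock with one lock and one list per cpu; f_container records which per-cpu list a file lives on so it can be removed without knowing where it was added, and whole-table walkers take every lock in ascending cpu order (releasing in reverse), so two walkers cannot deadlock against each other. The per-file fast path lives in fs/file_table.c, which is not part of this excerpt; a sketch of what it presumably pairs with these helpers (names hypothetical):

	static void file_list_add(struct file_list *flist, struct file *file)
	{
		int cpu = get_cpu();		/* pin, then use this cpu's list */

		spin_lock(&flist[cpu].lock);
		list_add(&file->f_list, &flist[cpu].list);
		file->f_container = &flist[cpu];
		spin_unlock(&flist[cpu].lock);
		put_cpu();
	}

	static void file_list_del(struct file *file)
	{
		struct file_list *fl = file->f_container;

		if (fl) {
			spin_lock(&fl->lock);
			list_del_init(&file->f_list);
			file->f_container = NULL;
			spin_unlock(&fl->lock);
		}
	}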
10:07:15.000000000 -0700 @@ -1,28 +1,41 @@ #ifndef _LINUX_FS_STRUCT_H #define _LINUX_FS_STRUCT_H +#include +#include struct dentry; struct vfsmount; +struct fs_dirs { + struct dentry * root, * pwd, * altroot; + struct vfsmount * rootmnt, * pwdmnt, * altrootmnt; + struct rcu_head rcu; + struct work_struct work; +}; + struct fs_struct { atomic_t count; - rwlock_t lock; + spinlock_t lock; + struct fs_dirs *dirs; int umask; - struct dentry * root, * pwd, * altroot; - struct vfsmount * rootmnt, * pwdmnt, * altrootmnt; }; #define INIT_FS { \ .count = ATOMIC_INIT(1), \ - .lock = RW_LOCK_UNLOCKED, \ + .lock = SPIN_LOCK_UNLOCKED, \ + .dirs = &init_dirs, \ .umask = 0022, \ } -extern void exit_fs(struct task_struct *); -extern void set_fs_altroot(void); -extern void set_fs_root(struct fs_struct *, struct vfsmount *, struct dentry *); -extern void set_fs_pwd(struct fs_struct *, struct vfsmount *, struct dentry *); -extern struct fs_struct *copy_fs_struct(struct fs_struct *); -extern void put_fs_struct(struct fs_struct *); +void exit_fs(struct task_struct *); +int set_fs_altroot(void); +int set_fs_root(struct fs_struct *, struct vfsmount *, struct dentry *); +int set_fs_pwd(struct fs_struct *, struct vfsmount *, struct dentry *); +struct fs_struct *copy_fs_struct(struct fs_struct *); +void put_fs_struct(struct fs_struct *); +void FASTCALL(free_fs_dirs(struct fs_dirs *)); +void FASTCALL(release_fs_dirs_pwd(struct fs_dirs *)); +void FASTCALL(release_fs_dirs_root(struct fs_dirs *)); +void FASTCALL(release_fs_dirs_altroot(struct fs_dirs *)); #endif /* _LINUX_FS_STRUCT_H */ diff -prauN linux-2.6.0-test1/include/linux/gfp.h wli-2.6.0-test1-37/include/linux/gfp.h --- linux-2.6.0-test1/include/linux/gfp.h 2003-07-13 20:32:28.000000000 -0700 +++ wli-2.6.0-test1-37/include/linux/gfp.h 2003-07-14 07:10:48.000000000 -0700 @@ -76,6 +76,7 @@ static inline struct page * alloc_pages_ extern unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order)); extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask)); +void free_pages_bulk(struct zone *zone, struct page *page, unsigned int order); #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask),0) diff -prauN linux-2.6.0-test1/include/linux/hugetlb.h wli-2.6.0-test1-37/include/linux/hugetlb.h --- linux-2.6.0-test1/include/linux/hugetlb.h 2003-07-13 20:34:29.000000000 -0700 +++ wli-2.6.0-test1-37/include/linux/hugetlb.h 2003-07-14 07:33:22.000000000 -0700 @@ -41,6 +41,11 @@ mark_mm_hugetlb(struct mm_struct *mm, st #define is_hugepage_only_range(addr, len) 0 #endif +#define vm_account_huge_inc(vma, pte, addr) \ + vm_account(vma, pte, addr, HPAGE_SIZE/PAGE_SIZE) +#define vm_account_huge_dec(vma, pte, addr) \ + vm_account(vma, pte, addr, -(HPAGE_SIZE/PAGE_SIZE)) + #else /* !CONFIG_HUGETLB_PAGE */ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) diff -prauN linux-2.6.0-test1/include/linux/init_task.h wli-2.6.0-test1-37/include/linux/init_task.h --- linux-2.6.0-test1/include/linux/init_task.h 2003-07-13 20:30:38.000000000 -0700 +++ wli-2.6.0-test1-37/include/linux/init_task.h 2003-07-14 06:31:10.000000000 -0700 @@ -70,7 +70,7 @@ .prio = MAX_PRIO-20, \ .static_prio = MAX_PRIO-20, \ .policy = SCHED_NORMAL, \ - .cpus_allowed = ~0UL, \ + .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ .active_mm = &init_mm, \ .run_list = LIST_HEAD_INIT(tsk.run_list), \ diff -prauN linux-2.6.0-test1/include/linux/irq.h wli-2.6.0-test1-37/include/linux/irq.h --- linux-2.6.0-test1/include/linux/irq.h 2003-07-13 20:33:13.000000000 -0700 +++ 
wli-2.6.0-test1-37/include/linux/irq.h 2003-07-14 06:31:10.000000000 -0700 @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -44,7 +45,7 @@ struct hw_interrupt_type { void (*disable)(unsigned int irq); void (*ack)(unsigned int irq); void (*end)(unsigned int irq); - void (*set_affinity)(unsigned int irq, unsigned long mask); + void (*set_affinity)(unsigned int irq, cpumask_t dest); }; typedef struct hw_interrupt_type hw_irq_controller; diff -prauN linux-2.6.0-test1/include/linux/mm.h wli-2.6.0-test1-37/include/linux/mm.h --- linux-2.6.0-test1/include/linux/mm.h 2003-07-13 20:29:29.000000000 -0700 +++ wli-2.6.0-test1-37/include/linux/mm.h 2003-07-14 10:13:03.000000000 -0700 @@ -12,6 +12,7 @@ #include #include #include +#include #ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; @@ -77,6 +78,7 @@ struct vm_area_struct { units, *not* PAGE_CACHE_SIZE */ struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ + struct rcu_head rcu; }; /* @@ -110,6 +112,7 @@ struct vm_area_struct { #define VM_RESERVED 0x00080000 /* Don't unmap it from swap_out */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ +#define VM_DEAD 0x00800000 /* vma is dead, don't touch */ #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS @@ -146,8 +149,6 @@ struct vm_operations_struct { int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock); }; -/* forward declaration; pte_chain is meant to be internal to rmap.c */ -struct pte_chain; struct mmu_gather; struct inode; @@ -171,15 +172,12 @@ struct page { updated asynchronously */ atomic_t count; /* Usage count, see below. */ struct list_head list; /* ->mapping has some page lists. */ - struct address_space *mapping; /* The inode (or ...) we belong to. */ + unsigned long __mapping; /* The inode (or ...) we belong to. */ unsigned long index; /* Our offset within mapping. */ struct list_head lru; /* Pageout list, eg. active_list; protected by zone->lru_lock !! */ - union { - struct pte_chain *chain;/* Reverse pte mapping pointer. - * protected by PG_chainlock */ - pte_addr_t direct; - } pte; + atomic_t mapcount; + struct rmap_chain *chain; unsigned long private; /* mapping-private opaque data */ /* @@ -375,13 +373,41 @@ void page_address_init(void); #endif /* + * On an anonymous page mapped into a user virutal memory area, + * page->mapping points to its anonmm, not to a struct address_space. + * + * Please note that, confusingly, page_mapping() refers to the inode + * struct address_space which maps the page from disk, where page_mapped() + * refers to whether it's mapped into a user virtual address space. + */ +static inline struct address_space *page_mapping(struct page *page) +{ + if (PageAnon(page)) + return NULL; + else + return (struct address_space *)page->__mapping; +} + +struct anon; +static inline struct anon *page_anon(struct page *page) +{ + BUG_ON(!PageAnon(page)); + return (struct anon *)page->__mapping; +} + +static inline void set_page_mapping(struct page *page, void *ptr) +{ + page->__mapping = (unsigned long)ptr; +} + +/* * Return true if this page is mapped into pagetables. Subtle: test pte.direct * rather than pte.chain. Because sometimes pte.direct is 64-bit, and .chain * is only 32-bit. 
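With __mapping overloaded, PageAnon() is the discriminator: page_mapping() yields the inode's address_space for file-backed pages and NULL for anonymous ones, whose __mapping instead points at the struct anon shared by an exec's forks. The sh and arm hunks earlier show the mechanical conversion of callers; the shape of the test a flush routine performs now is, as a sketch:

	static int page_shared_via_file(struct page *page)
	{
		struct address_space *mapping = page_mapping(page);

		/* NULL for anon pages: use page_anon() on those instead */
		return mapping && !list_empty(&mapping->i_mmap_shared);
	}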
*/ static inline int page_mapped(struct page *page) { - return page->pte.direct != 0; + return atomic_read(&page->mapcount) != 0; } /* @@ -423,23 +449,27 @@ int zeromap_page_range(struct vm_area_st extern int vmtruncate(struct inode * inode, loff_t offset); extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); +pmd_t *FASTCALL(__pmd_alloc_kernel(struct mm_struct *mm, pgd_t *pmd, unsigned long address)); extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); -extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); +pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pgd_t *pgd, pmd_t **pmd, unsigned long address)); extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot); extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); extern long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long nonblock); -void put_dirty_page(struct task_struct *tsk, struct page *page, - unsigned long address, pgprot_t prot); +void put_dirty_page(task_t *task, struct vm_area_struct *vma, + struct page *page, unsigned long address, pgprot_t prot); int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); -int __set_page_dirty_buffers(struct page *page); -int __set_page_dirty_nobuffers(struct page *page); +int set_page_dirty(struct page *page); +int set_page_dirty_buffers(struct page *page); +int set_page_dirty_nobuffers(struct page *page); int set_page_dirty_lock(struct page *page); +void free_vma(struct vm_area_struct *); + /* * Prototype to add a shrinker callback for ageable caches. * @@ -464,33 +494,15 @@ extern struct shrinker *set_shrinker(int extern void remove_shrinker(struct shrinker *shrinker); /* - * If the mapping doesn't provide a set_page_dirty a_op, then - * just fall through and assume that it wants buffer_heads. - * FIXME: make the method unconditional. - */ -static inline int set_page_dirty(struct page *page) -{ - if (page->mapping) { - int (*spd)(struct page *); - - spd = page->mapping->a_ops->set_page_dirty; - if (spd) - return (*spd)(page); - } - return __set_page_dirty_buffers(page); -} - -/* * On a two-level page table, this ends up being trivial. Thus the * inlining and the symmetry break with pte_alloc_map() that does all * of this out-of-line. */ -static inline pmd_t *pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) -{ - if (pgd_none(*pgd)) - return __pmd_alloc(mm, pgd, address); - return pmd_offset(pgd, address); -} +#define pmd_alloc_map(mm, pgd, addr) \ + (pgd_none(*(pgd))? __pmd_alloc(mm,pgd,addr): pmd_offset_map(pgd,addr)) + +#define pmd_alloc_kernel(mm, pgd, addr) \ + (pgd_none(*(pgd))? 
__pmd_alloc_kernel(mm,pgd,addr): pmd_offset_kernel(pgd,addr)) extern void free_area_init(unsigned long * zones_size); extern void free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, @@ -612,6 +624,75 @@ kernel_map_pages(struct page *page, int { } #endif + + static inline void vm_account(struct vm_area_struct *vma, pte_t pte, + unsigned long addr, long adjustment) + { + struct mm_struct *mm = vma->vm_mm; + unsigned long pfn; + struct page *page; + + if (!pte_present(pte)) + return; + + pfn = pte_pfn(pte); + if (!pfn_valid(pfn)) + goto out; + + page = pfn_to_page(pfn); + if (PageReserved(page)) + goto out; + + if (vma->vm_flags & VM_EXECUTABLE) + mm->text += adjustment; + else if (vma->vm_flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN))) { + mm->data += adjustment; + mm->stack += adjustment; + } else if (addr >= TASK_UNMAPPED_BASE) + mm->lib += adjustment; + else + mm->data += adjustment; + + if (page_mapping(page)) + mm->shared += adjustment; + + out: + if (pte_write(pte)) + mm->dirty += adjustment; + } + + #define vm_account_inc(vma, pte, addr) vm_account(vma, pte, addr, +1) + #define vm_account_dec(vma, pte, addr) vm_account(vma, pte, addr, -1) + + static inline void vm_ptep_set_wrprotect(struct mm_struct *mm, pte_t *pte) + { + if (pte_write(*pte)) + mm->dirty--; + ptep_set_wrprotect(pte); + } + + static inline void vm_set_pte(struct vm_area_struct *vma, pte_t *dst, + pte_t val, unsigned long addr) + { + vm_account_inc(vma, val, addr); + set_pte(dst, val); + } + + static inline pte_t vm_ptep_get_and_clear(struct vm_area_struct *vma, + pte_t *pte, unsigned long addr) + { + pte_t val = ptep_get_and_clear(pte); + vm_account_dec(vma, val, addr); + return val; + } + + static inline void vm_pte_clear(struct vm_area_struct *vma, pte_t *pte, + unsigned long addr) + { + pte_t val = *pte; + pte_clear(pte); + vm_account_dec(vma, val, addr); + } #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff -prauN linux-2.6.0-test1/include/linux/mmzone.h wli-2.6.0-test1-37/include/linux/mmzone.h --- linux-2.6.0-test1/include/linux/mmzone.h 2003-07-13 20:34:41.000000000 -0700 +++ wli-2.6.0-test1-37/include/linux/mmzone.h 2003-07-14 06:51:30.000000000 -0700 @@ -26,8 +26,8 @@ #endif struct free_area { - struct list_head free_list; - unsigned long *map; + struct list_head free_list, deferred_pages; + unsigned long *map, globally_free, active, locally_free; }; struct pglist_data; diff -prauN linux-2.6.0-test1/include/linux/mount.h wli-2.6.0-test1-37/include/linux/mount.h --- linux-2.6.0-test1/include/linux/mount.h 2003-07-13 20:30:35.000000000 -0700 +++ wli-2.6.0-test1-37/include/linux/mount.h 2003-07-14 08:28:38.000000000 -0700 @@ -54,6 +54,7 @@ extern void free_vfsmnt(struct vfsmount extern struct vfsmount *alloc_vfsmnt(const char *name); extern struct vfsmount *do_kern_mount(const char *fstype, int flags, const char *name, void *data); +extern seqlock_t mnt_move_lock; extern spinlock_t vfsmount_lock; #endif diff -prauN linux-2.6.0-test1/include/linux/node.h wli-2.6.0-test1-37/include/linux/node.h --- linux-2.6.0-test1/include/linux/node.h 2003-07-13 20:38:37.000000000 -0700 +++ wli-2.6.0-test1-37/include/linux/node.h 2003-07-14 06:31:10.000000000 -0700 @@ -20,9 +20,10 @@ #define _LINUX_NODE_H_ #include +#include struct node { - unsigned long cpumap; /* Bitmap of CPUs on the Node */ + cpumask_t cpumap; /* Bitmap of CPUs on the Node */ struct sys_device sysdev; }; diff -prauN linux-2.6.0-test1/include/linux/page-flags.h wli-2.6.0-test1-37/include/linux/page-flags.h --- 
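pte_alloc_map() now takes the pgd and a pmd_t ** because allocating a pte page can block: the kmapped pmd has to be unmapped across the allocation and remapped (possibly at a different address) afterwards, with the caller's pointer updated in place, which is also why pmd_alloc_map() exists alongside pmd_alloc_kernel(). A hypothetical fault-path caller, mirroring the arm get_pgd_slow() conversion earlier in this patch:

	static int install_sketch(struct mm_struct *mm, unsigned long address)
	{
		pgd_t *pgd = pgd_offset(mm, address);
		pmd_t *pmd;
		pte_t *pte;

		spin_lock(&mm->page_table_lock);
		pmd = pmd_alloc_map(mm, pgd, address);
		if (!pmd)
			goto oom;
		pte = pte_alloc_map(mm, pgd, &pmd, address);	/* may unmap/remap pmd */
		if (!pte) {
			pmd_unmap(pmd);
			goto oom;
		}
		/* ... set the pte here ... */
		pte_unmap(pte);
		pmd_unmap(pmd);
		spin_unlock(&mm->page_table_lock);
		return 0;
	oom:
		spin_unlock(&mm->page_table_lock);
		return -ENOMEM;
	}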
linux-2.6.0-test1/include/linux/page-flags.h 2003-07-13 20:36:33.000000000 -0700 +++ wli-2.6.0-test1-37/include/linux/page-flags.h 2003-07-14 10:13:03.000000000 -0700 @@ -69,12 +69,13 @@ #define PG_private 12 /* Has something at ->private */ #define PG_writeback 13 /* Page is under writeback */ #define PG_nosave 14 /* Used for system suspend/resume */ -#define PG_chainlock 15 /* lock bit for ->pte_chain */ +#define PG_rmaplock 15 /* lock bit for ->pte_chain */ -#define PG_direct 16 /* ->pte_chain points directly at pte */ -#define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ -#define PG_reclaim 18 /* To be reclaimed asap */ -#define PG_compound 19 /* Part of a compound page */ +#define PG_mappedtodisk 16 /* Has blocks allocated on-disk */ +#define PG_reclaim 17 /* To be reclaimed asap */ +#define PG_compound 18 /* Part of a compound page */ +#define PG_anon 19 /* Anonymous page */ +#define PG_swapcache 20 /* Swap page; swp_entry_t in ->private */ /* @@ -87,6 +88,7 @@ struct page_state { unsigned long nr_unstable; /* NFS unstable pages */ unsigned long nr_page_table_pages;/* Pages used for pagetables */ unsigned long nr_mapped; /* mapped into pagetables */ + unsigned long nr_swapcache; /* in swapcache */ unsigned long nr_slab; /* In slab */ #define GET_PAGE_STATE_LAST nr_slab @@ -248,12 +250,6 @@ extern void get_full_page_state(struct p #define ClearPageNosave(page) clear_bit(PG_nosave, &(page)->flags) #define TestClearPageNosave(page) test_and_clear_bit(PG_nosave, &(page)->flags) -#define PageDirect(page) test_bit(PG_direct, &(page)->flags) -#define SetPageDirect(page) set_bit(PG_direct, &(page)->flags) -#define TestSetPageDirect(page) test_and_set_bit(PG_direct, &(page)->flags) -#define ClearPageDirect(page) clear_bit(PG_direct, &(page)->flags) -#define TestClearPageDirect(page) test_and_clear_bit(PG_direct, &(page)->flags) - #define PageMappedToDisk(page) test_bit(PG_mappedtodisk, &(page)->flags) #define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags) #define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags) @@ -267,15 +263,16 @@ extern void get_full_page_state(struct p #define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) #define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) -/* - * The PageSwapCache predicate doesn't use a PG_flag at this time, - * but it may again do so one day. 
- */ +#define PageAnon(page) test_bit(PG_anon, &(page)->flags) +#define SetPageAnon(page) set_bit(PG_anon, &(page)->flags) +#define ClearPageAnon(page) clear_bit(PG_anon, &(page)->flags) + #ifdef CONFIG_SWAP -extern struct address_space swapper_space; -#define PageSwapCache(page) ((page)->mapping == &swapper_space) +#define PageSwapCache(page) test_bit(PG_swapcache, &(page)->flags) +#define SetPageSwapCache(page) set_bit(PG_swapcache, &(page)->flags) +#define ClearPageSwapCache(page) clear_bit(PG_swapcache, &(page)->flags) #else -#define PageSwapCache(page) 0 +#define PageSwapCache(page) 0 #endif struct page; /* forward declaration */ diff -prauN linux-2.6.0-test1/include/linux/pagemap.h wli-2.6.0-test1-37/include/linux/pagemap.h --- linux-2.6.0-test1/include/linux/pagemap.h 2003-07-13 20:28:56.000000000 -0700 +++ wli-2.6.0-test1-37/include/linux/pagemap.h 2003-07-14 08:52:52.000000000 -0700 @@ -116,17 +116,6 @@ static inline unsigned long get_page_cac return atomic_read(&nr_pagecache); } -static inline void ___add_to_page_cache(struct page *page, - struct address_space *mapping, unsigned long index) -{ - list_add(&page->list, &mapping->clean_pages); - page->mapping = mapping; - page->index = index; - - mapping->nrpages++; - pagecache_acct(1); -} - extern void FASTCALL(__lock_page(struct page *page)); extern void FASTCALL(unlock_page(struct page *page)); diff -prauN linux-2.6.0-test1/include/linux/pagevec.h wli-2.6.0-test1-37/include/linux/pagevec.h --- linux-2.6.0-test1/include/linux/pagevec.h 2003-07-13 20:38:53.000000000 -0700 +++ wli-2.6.0-test1-37/include/linux/pagevec.h 2003-07-17 02:21:08.000000000 -0700 @@ -4,8 +4,15 @@ * In many places it is efficient to batch an operation up against multiple * pages. A pagevec is a multipage container which is used for that. */ +#include -#define PAGEVEC_SIZE 16 +#define __MIN_PVEC_SIZE 16 +#define __MAX_PVEC_SIZE 1024 +#define __PVEC_MIN(x,y) ((x) < (y) ? (x) : (y)) +#define __PVEC_MAX(x,y) ((x) > (y) ? (x) : (y)) +#define __PVEC_SIZE (4*NR_CPUS) +#define __PAGEVEC_SIZE __PVEC_MIN(__PVEC_SIZE, __MAX_PVEC_SIZE) +#define PAGEVEC_SIZE __PVEC_MAX(__PAGEVEC_SIZE, __MIN_PVEC_SIZE) struct page; struct address_space; diff -prauN linux-2.6.0-test1/include/linux/pid.h wli-2.6.0-test1-37/include/linux/pid.h --- linux-2.6.0-test1/include/linux/pid.h 2003-07-13 20:31:58.000000000 -0700 +++ wli-2.6.0-test1-37/include/linux/pid.h 2003-07-14 07:32:19.000000000 -0700 @@ -47,6 +47,7 @@ extern void FASTCALL(detach_pid(struct t * held. */ extern struct pid *FASTCALL(find_pid(enum pid_type, int)); +int find_next_pid(int); extern int alloc_pidmap(void); extern void FASTCALL(free_pidmap(int)); diff -prauN linux-2.6.0-test1/include/linux/rcupdate.h wli-2.6.0-test1-37/include/linux/rcupdate.h --- linux-2.6.0-test1/include/linux/rcupdate.h 2003-07-13 20:32:28.000000000 -0700 +++ wli-2.6.0-test1-37/include/linux/rcupdate.h 2003-07-14 06:31:10.000000000 -0700 @@ -40,6 +40,7 @@ #include #include #include +#include /** * struct rcu_head - callback structure for use with RCU @@ -67,7 +68,7 @@ struct rcu_ctrlblk { spinlock_t mutex; /* Guard this struct */ long curbatch; /* Current batch number. */ long maxbatch; /* Max requested batch number. */ - unsigned long rcu_cpu_mask; /* CPUs that need to switch in order */ + cpumask_t rcu_cpu_mask; /* CPUs that need to switch in order */ /* for current batch to proceed. 
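PageSwapCache() becomes a real page flag rather than a pointer comparison against swapper_space, which is what frees ->mapping (now __mapping) for the anon bookkeeping above; per the comment on PG_swapcache, the swap entry itself moves into ->private. A sketch of the accessor this implies (helper name assumed, not defined in this excerpt):

	static inline swp_entry_t page_swp_entry(struct page *page)
	{
		swp_entry_t entry;

		BUG_ON(!PageSwapCache(page));
		entry.val = page->private;	/* swp_entry_t lives in ->private now */
		return entry;
	}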
*/ }; @@ -114,7 +115,7 @@ static inline int rcu_pending(int cpu) rcu_batch_before(RCU_batch(cpu), rcu_ctrlblk.curbatch)) || (list_empty(&RCU_curlist(cpu)) && !list_empty(&RCU_nxtlist(cpu))) || - test_bit(cpu, &rcu_ctrlblk.rcu_cpu_mask)) + cpu_isset(cpu, rcu_ctrlblk.rcu_cpu_mask)) return 1; else return 0; diff -prauN linux-2.6.0-test1/include/linux/rmap-locking.h wli-2.6.0-test1-37/include/linux/rmap-locking.h --- linux-2.6.0-test1/include/linux/rmap-locking.h 2003-07-13 20:30:41.000000000 -0700 +++ wli-2.6.0-test1-37/include/linux/rmap-locking.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,23 +0,0 @@ -/* - * include/linux/rmap-locking.h - * - * Locking primitives for exclusive access to a page's reverse-mapping - * pte chain. - */ - -#include - -struct pte_chain; -extern kmem_cache_t *pte_chain_cache; - -#define pte_chain_lock(page) bit_spin_lock(PG_chainlock, &page->flags) -#define pte_chain_unlock(page) bit_spin_unlock(PG_chainlock, &page->flags) - -struct pte_chain *pte_chain_alloc(int gfp_flags); -void __pte_chain_free(struct pte_chain *pte_chain); - -static inline void pte_chain_free(struct pte_chain *pte_chain) -{ - if (pte_chain) - __pte_chain_free(pte_chain); -} diff -prauN linux-2.6.0-test1/include/linux/rmap.h wli-2.6.0-test1-37/include/linux/rmap.h --- linux-2.6.0-test1/include/linux/rmap.h 1969-12-31 16:00:00.000000000 -0800 +++ wli-2.6.0-test1-37/include/linux/rmap.h 2003-07-14 10:24:21.000000000 -0700 @@ -0,0 +1,163 @@ +/* + * include/linux/rmap.h + * + * Locking primitives for exclusive access to a page's reverse-mapping + * pte chain. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct anon { + atomic_t count; + spinlock_t lock; + struct list_head list; + struct rcu_head rcu; +}; + +#ifdef CONFIG_MMU + +int FASTCALL(rmap_get_cpu(void)); +void FASTCALL(page_turn_rmap(struct page *, struct vm_area_struct *)); +void FASTCALL(page_move_rmap(struct page *, struct vm_area_struct *, unsigned long, unsigned long)); +void FASTCALL(add_rmap_address(struct page *, unsigned long)); +void FASTCALL(clear_page_chained(struct page *page)); + +/* + * Called from mm/vmscan.c to handle pageout + */ +int FASTCALL(page_referenced(struct page *)); +int FASTCALL(try_to_unmap(struct page *)); + +void init_rmap(void); +int exec_rmap(struct mm_struct *); +void dup_rmap(struct mm_struct *, struct mm_struct *); +void exit_rmap(struct mm_struct *); + +/* + * Return values of try_to_unmap(): + */ +#define SWAP_SUCCESS 0 +#define SWAP_AGAIN 1 +#define SWAP_FAIL 2 + +#else /* !CONFIG_MMU */ +#define page_referenced(page) TestClearPageReferenced(page) +#define init_rmap() do { } while (0) +#define exec_rmap(mm) ({ 0; }) +#define dup_rmap(new, old) ({ 0; }) +#define exit_rmap(mm) do { } while (0) +#define try_to_unmap(page) ({ SWAP_FAIL; }) +#endif /* CONFIG_MMU */ + +#define NOADDR (~0UL) + +static inline void rmap_lock(struct page *page) +{ + bit_spin_lock(PG_rmaplock, &page->flags); +} + +static inline void rmap_unlock(struct page *page) +{ + bit_spin_unlock(PG_rmaplock, &page->flags); +} + +#define NRSLOT ((L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(unsigned long)) + +struct rmap_chain { + unsigned long slot[NRSLOT]; /* first contains count, then */ + struct rmap_chain *next; /* user virtual addresses */ +}; + +static inline void page_dup_rmap(struct page *page) +{ + atomic_inc(&page->mapcount); +} + +static inline void clear_page_anon(struct page *page) +{ + set_page_mapping(page, NULL); + ClearPageAnon(page); +} + +/** + * page_remove_rmap - take down 
reverse mapping to a page + * @page: page to remove mapping from + * + * For general use: Remove the reverse mapping from the page. + * after that the caller can clear the page table entry and free + * the page. Caller needs to hold the mm->page_table_lock. + */ +static inline void page_remove_rmap(struct page *page) +{ + if (!atomic_dec_and_test(&page->mapcount)) + return; + + rmap_lock(page); + dec_page_state(nr_mapped); + if (PageAnon(page)) + clear_page_anon(page); + if (page->chain) + clear_page_chained(page); + rmap_unlock(page); +} + +static inline unsigned long vma_address(struct page *page, struct vm_area_struct *vma) +{ + unsigned long pgoff, address; + pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + if (address < vma->vm_start || address >= vma->vm_end) + return NOADDR; + else + return address; +} + +/** + * page_add_rmap - add reverse mapping entry to a page + * @page: the page to add the mapping to + * @vma: the vma into which this page is being mapped + * @address: the virtual address at which the page is being mapped + * @anon: is this an anonymous (not file-backed) page? + * + * Add a new pte reverse mapping to a page. + * The caller needs to hold the mm->page_table_lock. + */ +static inline void page_add_rmap(struct page *page, struct vm_area_struct *vma, + unsigned long address, int anon) +{ + if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) + return; + + address &= PAGE_MASK; + + rmap_lock(page); + + if (!page_mapped(page)) + inc_page_state(nr_mapped); + + atomic_inc(&page->mapcount); + + if (page->__mapping) { + if (anon) { + BUG_ON(!PageAnon(page)); + if (address != page->index) + add_rmap_address(page, address); + } else { + BUG_ON(PageAnon(page)); + if (address != vma_address(page, vma)) + add_rmap_address(page, address); + } + } else if (anon) { + SetPageAnon(page); + set_page_mapping(page, vma->vm_mm->anon); + page->index = address; + } + rmap_unlock(page); +} diff -prauN linux-2.6.0-test1/include/linux/sched.h wli-2.6.0-test1-37/include/linux/sched.h --- linux-2.6.0-test1/include/linux/sched.h 2003-07-13 20:30:40.000000000 -0700 +++ wli-2.6.0-test1-37/include/linux/sched.h 2003-07-18 10:52:16.000000000 -0700 @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -28,6 +29,7 @@ #include #include #include +#include struct exec_domain; @@ -196,13 +198,16 @@ struct mm_struct { * together off init_mm.mmlist, and are protected * by mmlist_lock */ + struct anon *anon; /* set of forks between execs */ + struct list_head anon_list; /* chain of mm's against anon */ unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; unsigned long rss, total_vm, locked_vm; + unsigned long shared, text, lib, data, dirty, stack; unsigned long def_flags; - unsigned long cpu_vm_mask; + cpumask_t cpu_vm_mask; unsigned long swap_address; unsigned dumpable:1; @@ -221,6 +226,7 @@ struct mm_struct { struct kioctx *ioctx_list; struct kioctx default_kioctx; + struct rcu_head rcu; }; extern int mmlist_nr; @@ -338,10 +344,11 @@ struct task_struct { prio_array_t *array; unsigned long sleep_avg; + unsigned long avg_start; unsigned long last_run; unsigned long policy; - unsigned long cpus_allowed; + cpumask_t cpus_allowed; unsigned int time_slice, first_time_slice; struct list_head tasks; @@ -488,9 +495,9 @@ do { if (atomic_dec_and_test(&(tsk)->usa #define PF_SYNCWRITE 0x00200000 /* I am doing a sync 
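page_add_rmap()/page_remove_rmap() now maintain a simple mapcount plus an overflow rmap_chain of extra user virtual addresses, instead of pte_chains. Combined with the vm_set_pte()/vm_ptep_get_and_clear() accounting wrappers from linux/mm.h above, a fault and its eventual teardown pair up roughly as follows (fragments, assuming mm->page_table_lock is held as the comments require):

	/* faulting in a fresh anonymous page: */
	vm_set_pte(vma, ptep, mk_pte(page, vma->vm_page_prot), address);
	page_add_rmap(page, vma, address, 1);	/* anon == 1 */

	/* and tearing the mapping down again: */
	pteval = vm_ptep_get_and_clear(vma, ptep, address);
	page_remove_rmap(page);

The accounting wrappers keep mm->rss-style counters (shared, text, data, dirty, stack) in step with every pte transition, which is what the new mm_struct fields in the sched.h hunk above are for.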
write */ #ifdef CONFIG_SMP -extern int set_cpus_allowed(task_t *p, unsigned long new_mask); +extern int set_cpus_allowed(task_t *p, cpumask_t new_mask); #else -static inline int set_cpus_allowed(task_t *p, unsigned long new_mask) +static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask) { return 0; } diff -prauN linux-2.6.0-test1/include/linux/smp.h wli-2.6.0-test1-37/include/linux/smp.h --- linux-2.6.0-test1/include/linux/smp.h 2003-07-13 20:34:41.000000000 -0700 +++ wli-2.6.0-test1-37/include/linux/smp.h 2003-07-14 06:31:10.000000000 -0700 @@ -102,9 +102,6 @@ void smp_prepare_boot_cpu(void); #define smp_call_function(func,info,retry,wait) ({ 0; }) #define on_each_cpu(func,info,retry,wait) ({ func(info); 0; }) static inline void smp_send_reschedule(int cpu) { } -#define cpu_online_map 1 -#define cpu_online(cpu) ({ BUG_ON((cpu) != 0); 1; }) -#define num_online_cpus() 1 #define num_booting_cpus() 1 #define cpu_possible(cpu) ({ BUG_ON((cpu) != 0); 1; }) #define smp_prepare_boot_cpu() do {} while (0) diff -prauN linux-2.6.0-test1/include/linux/swap.h wli-2.6.0-test1-37/include/linux/swap.h --- linux-2.6.0-test1/include/linux/swap.h 2003-07-13 20:29:27.000000000 -0700 +++ wli-2.6.0-test1-37/include/linux/swap.h 2003-07-14 08:52:52.000000000 -0700 @@ -77,7 +77,6 @@ struct reclaim_state { #ifdef __KERNEL__ struct address_space; -struct pte_chain; struct sysinfo; struct writeback_control; struct zone; @@ -163,6 +162,7 @@ extern unsigned int nr_free_pages(void); extern unsigned int nr_free_pages_pgdat(pg_data_t *pgdat); extern unsigned int nr_free_buffer_pages(void); extern unsigned int nr_free_pagecache_pages(void); +unsigned long nr_deferred_pages(void); /* linux/mm/swap.c */ extern void FASTCALL(lru_cache_add(struct page *)); @@ -178,25 +178,8 @@ extern int try_to_free_pages(struct zone extern int shrink_all_memory(int); extern int vm_swappiness; -/* linux/mm/rmap.c */ -#ifdef CONFIG_MMU -int FASTCALL(page_referenced(struct page *)); -struct pte_chain *FASTCALL(page_add_rmap(struct page *, pte_t *, - struct pte_chain *)); -void FASTCALL(page_remove_rmap(struct page *, pte_t *)); -int FASTCALL(try_to_unmap(struct page *)); - /* linux/mm/shmem.c */ -extern int shmem_unuse(swp_entry_t entry, struct page *page); -#else -#define page_referenced(page) TestClearPageReferenced(page) -#define try_to_unmap(page) SWAP_FAIL -#endif /* CONFIG_MMU */ - -/* return values of try_to_unmap */ -#define SWAP_SUCCESS 0 -#define SWAP_AGAIN 1 -#define SWAP_FAIL 2 +int shmem_unuse(swp_entry_t entry, struct page *page); #ifdef CONFIG_SWAP /* linux/mm/page_io.c */ @@ -206,7 +189,6 @@ extern int rw_swap_page_sync(int, swp_en /* linux/mm/swap_state.c */ extern struct address_space swapper_space; -#define total_swapcache_pages swapper_space.nrpages extern void show_swap_cache_info(void); extern int add_to_swap(struct page *); extern void __delete_from_swap_cache(struct page *); @@ -245,7 +227,6 @@ extern spinlock_t swaplock; #else /* CONFIG_SWAP */ #define total_swap_pages 0 -#define total_swapcache_pages 0UL #define si_swapinfo(val) \ do { (val)->freeswap = (val)->totalswap = 0; } while (0) diff -prauN linux-2.6.0-test1/include/linux/topology.h wli-2.6.0-test1-37/include/linux/topology.h --- linux-2.6.0-test1/include/linux/topology.h 2003-07-13 20:36:32.000000000 -0700 +++ wli-2.6.0-test1-37/include/linux/topology.h 2003-07-14 06:31:10.000000000 -0700 @@ -27,6 +27,7 @@ #ifndef _LINUX_TOPOLOGY_H #define _LINUX_TOPOLOGY_H +#include #include #include #include @@ -34,7 +35,12 @@ #include #ifndef nr_cpus_node -#define 
nr_cpus_node(node) (hweight_long(node_to_cpumask(node))) +#define nr_cpus_node(node) \ + ({ \ + cpumask_t __tmp__; \ + __tmp__ = node_to_cpumask(node); \ + cpus_weight(__tmp__); \ + }) #endif static inline int __next_node_with_cpus(int node) diff -prauN linux-2.6.0-test1/include/linux/tty.h wli-2.6.0-test1-37/include/linux/tty.h --- linux-2.6.0-test1/include/linux/tty.h 2003-07-13 20:34:30.000000000 -0700 +++ wli-2.6.0-test1-37/include/linux/tty.h 2003-07-14 10:20:40.000000000 -0700 @@ -282,7 +282,7 @@ struct tty_struct { struct work_struct hangup_work; void *disc_data; void *driver_data; - struct list_head tty_files; + struct file_list tty_file_lists[NR_CPUS]; #define N_TTY_BUF_SIZE 4096 @@ -310,6 +310,7 @@ struct tty_struct { spinlock_t read_lock; /* If the tty has a pending do_SAK, queue it here - akpm */ struct work_struct SAK_work; + struct rcu_head rcu; }; /* tty magic number */ diff -prauN linux-2.6.0-test1/init/do_mounts.c wli-2.6.0-test1-37/init/do_mounts.c --- linux-2.6.0-test1/init/do_mounts.c 2003-07-13 20:32:44.000000000 -0700 +++ wli-2.6.0-test1-37/init/do_mounts.c 2003-07-14 10:07:15.000000000 -0700 @@ -233,10 +233,10 @@ static int __init do_mount_root(char *na return err; sys_chdir("/root"); - ROOT_DEV = current->fs->pwdmnt->mnt_sb->s_dev; + ROOT_DEV = current->fs->dirs->pwdmnt->mnt_sb->s_dev; printk("VFS: Mounted root (%s filesystem)%s.\n", - current->fs->pwdmnt->mnt_sb->s_type->name, - current->fs->pwdmnt->mnt_sb->s_flags & MS_RDONLY ? + current->fs->dirs->pwdmnt->mnt_sb->s_type->name, + current->fs->dirs->pwdmnt->mnt_sb->s_flags & MS_RDONLY ? " readonly" : ""); return 0; } diff -prauN linux-2.6.0-test1/init/main.c wli-2.6.0-test1-37/init/main.c --- linux-2.6.0-test1/init/main.c 2003-07-13 20:31:20.000000000 -0700 +++ wli-2.6.0-test1-37/init/main.c 2003-07-14 08:56:31.000000000 -0700 @@ -80,7 +80,6 @@ extern void signals_init(void); extern void buffer_init(void); extern void pidhash_init(void); extern void pidmap_init(void); -extern void pte_chain_init(void); extern void radix_tree_init(void); extern void free_initmem(void); extern void populate_rootfs(void); @@ -436,7 +435,6 @@ asmlinkage void __init start_kernel(void kmem_cache_init(); pidmap_init(); pgtable_cache_init(); - pte_chain_init(); fork_init(num_physpages); proc_caches_init(); buffer_init(); diff -prauN linux-2.6.0-test1/ipc/shm.c wli-2.6.0-test1-37/ipc/shm.c --- linux-2.6.0-test1/ipc/shm.c 2003-07-13 20:34:31.000000000 -0700 +++ wli-2.6.0-test1-37/ipc/shm.c 2003-07-14 08:33:37.000000000 -0700 @@ -380,9 +380,9 @@ static void shm_get_stat(unsigned long * if (is_file_hugepages(shp->shm_file)) { struct address_space *mapping = inode->i_mapping; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); *rss += (HPAGE_SIZE/PAGE_SIZE)*mapping->nrpages; - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); } else { struct shmem_inode_info *info = SHMEM_I(inode); spin_lock(&info->lock); diff -prauN linux-2.6.0-test1/kernel/exec_domain.c wli-2.6.0-test1-37/kernel/exec_domain.c --- linux-2.6.0-test1/kernel/exec_domain.c 2003-07-13 20:35:14.000000000 -0700 +++ wli-2.6.0-test1-37/kernel/exec_domain.c 2003-07-14 10:07:15.000000000 -0700 @@ -189,7 +189,8 @@ __set_personality(u_long personality) current->personality = personality; oep = current_thread_info()->exec_domain; current_thread_info()->exec_domain = ep; - set_fs_altroot(); + if (set_fs_altroot()) + return -ENOMEM; module_put(oep->module); return 0; diff -prauN linux-2.6.0-test1/kernel/exit.c wli-2.6.0-test1-37/kernel/exit.c --- 
linux-2.6.0-test1/kernel/exit.c 2003-07-13 20:37:32.000000000 -0700 +++ wli-2.6.0-test1-37/kernel/exit.c 2003-07-14 10:07:15.000000000 -0700 @@ -382,15 +382,20 @@ static inline void __put_fs_struct(struc { /* No need to hold fs->lock if we are killing it */ if (atomic_dec_and_test(&fs->count)) { - dput(fs->root); - mntput(fs->rootmnt); - dput(fs->pwd); - mntput(fs->pwdmnt); - if (fs->altroot) { - dput(fs->altroot); - mntput(fs->altrootmnt); + struct fs_dirs *dirs; + rcu_read_lock(); + dirs = fs->dirs; + dput(dirs->root); + mntput(dirs->rootmnt); + dput(dirs->pwd); + mntput(dirs->pwdmnt); + if (dirs->altroot) { + dput(dirs->altroot); + mntput(dirs->altrootmnt); } + free_fs_dirs(dirs); kmem_cache_free(fs_cachep, fs); + rcu_read_unlock(); } } diff -prauN linux-2.6.0-test1/kernel/fork.c wli-2.6.0-test1-37/kernel/fork.c --- linux-2.6.0-test1/kernel/fork.c 2003-07-13 20:30:39.000000000 -0700 +++ wli-2.6.0-test1-37/kernel/fork.c 2003-07-14 10:07:15.000000000 -0700 @@ -28,8 +28,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -265,7 +267,7 @@ static inline int dup_mmap(struct mm_str mm->free_area_cache = TASK_UNMAPPED_BASE; mm->map_count = 0; mm->rss = 0; - mm->cpu_vm_mask = 0; + cpus_clear(mm->cpu_vm_mask); pprev = &mm->mmap; /* @@ -306,9 +308,9 @@ static inline int dup_mmap(struct mm_str atomic_dec(&inode->i_writecount); /* insert tmp into the share list, just after mpnt */ - down(&inode->i_mapping->i_shared_sem); - list_add_tail(&tmp->shared, &mpnt->shared); - up(&inode->i_mapping->i_shared_sem); + spin_lock(&inode->i_mapping->i_shared_lock); + list_add_tail_rcu(&tmp->shared, &mpnt->shared); + spin_unlock(&inode->i_mapping->i_shared_lock); } /* @@ -362,8 +364,21 @@ static inline void mm_free_pgd(struct mm spinlock_t mmlist_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; int mmlist_nr; +/* SLAB cache for mm_struct structures (tsk->mm) */ +static kmem_cache_t *mm_cachep; + #define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) -#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) + +static void __free_mm(void *mm) +{ + kmem_cache_free(mm_cachep, mm); +} + +void free_mm(struct mm_struct *mm) +{ + INIT_RCU_HEAD(&mm->rcu); + call_rcu(&mm->rcu, __free_mm, mm); +} #include @@ -378,6 +393,7 @@ static struct mm_struct * mm_init(struct mm->ioctx_list = NULL; mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); mm->free_area_cache = TASK_UNMAPPED_BASE; + mm->shared = mm->text = mm->lib = mm->data = mm->dirty = mm->stack = 0; if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -395,11 +411,15 @@ struct mm_struct * mm_alloc(void) struct mm_struct * mm; mm = allocate_mm(); - if (mm) { - memset(mm, 0, sizeof(*mm)); - return mm_init(mm); + if (!mm) + return NULL; + memset(mm, 0, sizeof(*mm)); + if (exec_rmap(mm)) { + mm_free_pgd(mm); + free_mm(mm); + return NULL; } - return NULL; + return mm_init(mm); } /* @@ -426,6 +446,7 @@ void mmput(struct mm_struct *mm) spin_unlock(&mmlist_lock); exit_aio(mm); exit_mmap(mm); + exit_rmap(mm); mmdrop(mm); } } @@ -512,6 +533,8 @@ static int copy_mm(unsigned long clone_f if (!mm_init(mm)) goto fail_nomem; + dup_rmap(mm, oldmm); + if (init_new_context(tsk,mm)) goto free_pt; @@ -535,22 +558,29 @@ static inline struct fs_struct *__copy_f struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); /* We don't need to lock fs - think why ;-) */ if (fs) { + struct fs_dirs *dirs, *olddirs = old->dirs; + dirs = kmalloc(sizeof(struct fs_dirs), GFP_KERNEL); + if (!dirs) { + 
kmem_cache_free(fs_cachep, fs); + return NULL; + } atomic_set(&fs->count, 1); - fs->lock = RW_LOCK_UNLOCKED; + fs->lock = SPIN_LOCK_UNLOCKED; fs->umask = old->umask; - read_lock(&old->lock); - fs->rootmnt = mntget(old->rootmnt); - fs->root = dget(old->root); - fs->pwdmnt = mntget(old->pwdmnt); - fs->pwd = dget(old->pwd); - if (old->altroot) { - fs->altrootmnt = mntget(old->altrootmnt); - fs->altroot = dget(old->altroot); + rcu_read_lock(); /* old->lock */ + dirs->rootmnt = mntget(olddirs->rootmnt); + dirs->root = dget(olddirs->root); + dirs->pwdmnt = mntget(olddirs->pwdmnt); + dirs->pwd = dget(olddirs->pwd); + if (olddirs->altroot) { + dirs->altrootmnt = mntget(olddirs->altrootmnt); + dirs->altroot = dget(olddirs->altroot); } else { - fs->altrootmnt = NULL; - fs->altroot = NULL; + dirs->altrootmnt = NULL; + dirs->altroot = NULL; } - read_unlock(&old->lock); + rcu_read_unlock(); /* old->lock */ + fs->dirs = dirs; } return fs; } @@ -575,10 +605,12 @@ static inline int copy_fs(unsigned long static int count_open_files(struct files_struct *files, int size) { int i; + fd_set *open_fds = files->open_fds; + read_barrier_depends(); /* Find the last open fd */ for (i = size/(8*sizeof(long)); i > 0; ) { - if (files->open_fds->fds_bits[--i]) + if (open_fds->fds_bits[--i]) break; } i = (i+1) * 8 * sizeof(long); @@ -603,6 +635,11 @@ static int copy_files(unsigned long clon goto out; } + /* We don't yet have the oldf readlock, but even if the old + fdset gets grown now, we'll only copy up to "size" fds */ + size = oldf->max_fdset; + rmb(); + tsk->files = NULL; error = -ENOMEM; newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); @@ -619,9 +656,6 @@ static int copy_files(unsigned long clon newf->open_fds = &newf->open_fds_init; newf->fd = &newf->fd_array[0]; - /* We don't yet have the oldf readlock, but even if the old - fdset gets grown now, we'll only copy up to "size" fds */ - size = oldf->max_fdset; if (size > __FD_SETSIZE) { newf->max_fdset = 0; spin_lock(&newf->file_lock); @@ -630,7 +664,7 @@ static int copy_files(unsigned long clon if (error) goto out_release; } - spin_lock(&oldf->file_lock); + rcu_read_lock(); open_files = count_open_files(oldf, size); @@ -641,7 +675,7 @@ static int copy_files(unsigned long clon */ nfds = NR_OPEN_DEFAULT; if (open_files > nfds) { - spin_unlock(&oldf->file_lock); + rcu_read_unlock(); newf->max_fds = 0; spin_lock(&newf->file_lock); error = expand_fd_array(newf, open_files-1); @@ -649,10 +683,11 @@ static int copy_files(unsigned long clon if (error) goto out_release; nfds = newf->max_fds; - spin_lock(&oldf->file_lock); + rcu_read_lock(); } old_fds = oldf->fd; + read_barrier_depends(); new_fds = newf->fd; memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8); @@ -664,7 +699,7 @@ static int copy_files(unsigned long clon get_file(f); *new_fds++ = f; } - spin_unlock(&oldf->file_lock); + rcu_read_unlock(); /* compute the remainder to be cleared */ size = (newf->max_fds - open_files) * sizeof(struct file *); @@ -1151,8 +1186,7 @@ kmem_cache_t *fs_cachep; /* SLAB cache for vm_area_struct structures */ kmem_cache_t *vm_area_cachep; -/* SLAB cache for mm_struct structures (tsk->mm) */ -kmem_cache_t *mm_cachep; +void init_rmap(void); void __init proc_caches_init(void) { @@ -1191,4 +1225,6 @@ void __init proc_caches_init(void) SLAB_HWCACHE_ALIGN, NULL, NULL); if(!mm_cachep) panic("vma_init: Cannot alloc mm_struct SLAB cache"); + + init_rmap(); } diff -prauN linux-2.6.0-test1/kernel/kmod.c wli-2.6.0-test1-37/kernel/kmod.c --- linux-2.6.0-test1/kernel/kmod.c 
2003-07-13 20:32:33.000000000 -0700 +++ wli-2.6.0-test1-37/kernel/kmod.c 2003-07-14 10:07:15.000000000 -0700 @@ -169,7 +169,7 @@ static int ____call_usermodehelper(void spin_unlock_irq(&current->sighand->siglock); retval = -EPERM; - if (current->fs->root) + if (current->fs->dirs->root) retval = execve(sub_info->path, sub_info->argv,sub_info->envp); /* Exec failed? */ diff -prauN linux-2.6.0-test1/kernel/ksyms.c wli-2.6.0-test1-37/kernel/ksyms.c --- linux-2.6.0-test1/kernel/ksyms.c 2003-07-13 20:28:56.000000000 -0700 +++ wli-2.6.0-test1-37/kernel/ksyms.c 2003-07-14 10:16:55.000000000 -0700 @@ -187,7 +187,6 @@ EXPORT_SYMBOL(close_private_file); EXPORT_SYMBOL(filp_open); EXPORT_SYMBOL(filp_close); EXPORT_SYMBOL(put_filp); -EXPORT_SYMBOL(files_lock); EXPORT_SYMBOL(check_disk_change); EXPORT_SYMBOL(invalidate_bdev); EXPORT_SYMBOL(invalidate_inodes); diff -prauN linux-2.6.0-test1/kernel/module.c wli-2.6.0-test1-37/kernel/module.c --- linux-2.6.0-test1/kernel/module.c 2003-07-13 20:36:38.000000000 -0700 +++ wli-2.6.0-test1-37/kernel/module.c 2003-07-14 06:43:52.000000000 -0700 @@ -471,7 +471,7 @@ static int stopref(void *cpu) struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; setscheduler(current->pid, SCHED_FIFO, &param); #endif - set_cpus_allowed(current, 1UL << (unsigned long)cpu); + set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu)); /* Ack: we are alive */ atomic_inc(&stopref_thread_ack); @@ -524,7 +524,7 @@ static void stopref_set_state(enum stopr static int stop_refcounts(void) { unsigned int i, cpu; - unsigned long old_allowed; + cpumask_t old_allowed; int ret = 0; /* One thread per cpu. We'll do our own. */ @@ -532,7 +532,7 @@ static int stop_refcounts(void) /* FIXME: racy with set_cpus_allowed. */ old_allowed = current->cpus_allowed; - set_cpus_allowed(current, 1UL << (unsigned long)cpu); + set_cpus_allowed(current, cpumask_of_cpu(cpu)); atomic_set(&stopref_thread_ack, 0); stopref_num_threads = 0; diff -prauN linux-2.6.0-test1/kernel/pid.c wli-2.6.0-test1-37/kernel/pid.c --- linux-2.6.0-test1/kernel/pid.c 2003-07-13 20:34:42.000000000 -0700 +++ wli-2.6.0-test1-37/kernel/pid.c 2003-07-17 03:36:11.000000000 -0700 @@ -172,13 +172,22 @@ int attach_pid(task_t *task, enum pid_ty if (pid) atomic_inc(&pid->count); else { + struct list_head *elem, *bucket; + pid = &task->pids[type].pid; pid->nr = nr; atomic_set(&pid->count, 1); INIT_LIST_HEAD(&pid->task_list); pid->task = task; get_task_struct(task); - list_add(&pid->hash_chain, &pid_hash[type][pid_hashfn(nr)]); + bucket = &pid_hash[type][pid_hashfn(nr)]; + __list_for_each(elem, bucket) { + struct pid *walk; + walk = list_entry(elem, struct pid, hash_chain); + if (walk->nr > nr) + break; + } + list_add_tail(&pid->hash_chain, elem); } list_add_tail(&task->pids[type].pid_chain, &pid->task_list); task->pids[type].pidptr = pid; @@ -221,6 +230,42 @@ void detach_pid(task_t *task, enum pid_t free_pidmap(nr); } +/** + * find_next_pid - Returns the pid of the next task. + * @pid: Starting point for the search. + * + * Returns the pid number of the task that follows + * "pid". The function works even if the input pid value + * is not valid anymore.
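For reference, a usage sketch of the iterator documented here (illustrative only, not part of the patch: it assumes find_next_pid() returns 0 once every bucket has been scanned, and process_pid() is a hypothetical per-pid callback):

	int pid = 0;

	/* visit every pid; order is ascending within each bucket, not global */
	while ((pid = find_next_pid(pid)) != 0)
		process_pid(pid);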
+ */ + int find_next_pid(int pid) +{ + struct list_head *elem, *bucket; + + if(!pid) { + bucket = &pid_hash[PIDTYPE_PID][0]; + } else { + bucket = &pid_hash[PIDTYPE_PID][pid_hashfn(pid)]; + } + read_lock(&tasklist_lock); +next_chain: + __list_for_each(elem, bucket) { + struct pid *walk; + walk = list_entry(elem, struct pid, hash_chain); + if (walk->nr > pid) { + pid = walk->nr; + read_unlock(&tasklist_lock); + return pid; + } + } + pid = 0; + bucket++; + if (bucket < &pid_hash[PIDTYPE_PID][1<nr_active++; p->array = array; } +/* + * normalise_sleep converts a task's sleep_avg to + * an appropriate proportion of MIN_SLEEP_AVG. + */ +static inline void normalise_sleep(task_t *p) +{ + unsigned long old_avg_time = jiffies - p->avg_start; + + if (unlikely(old_avg_time < MIN_SLEEP_AVG)) + return; + + if (p->sleep_avg > MAX_SLEEP_AVG) + p->sleep_avg = MAX_SLEEP_AVG; + + if (old_avg_time > MAX_SLEEP_AVG) + old_avg_time = MAX_SLEEP_AVG; + + p->sleep_avg = p->sleep_avg * MIN_SLEEP_AVG / old_avg_time; + p->avg_start = jiffies - MIN_SLEEP_AVG; +} /* * effective_prio - return the priority that is based on the static @@ -315,11 +337,28 @@ static inline void enqueue_task(struct t static int effective_prio(task_t *p) { int bonus, prio; + unsigned long sleep_period; if (rt_task(p)) return p->prio; - bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 - + sleep_period = jiffies - p->avg_start; + + if (unlikely(!sleep_period)) + return p->static_prio; + + if (sleep_period > MAX_SLEEP_AVG) + sleep_period = MAX_SLEEP_AVG; + + if (p->sleep_avg > sleep_period) + sleep_period = p->sleep_avg; + + /* + * The bonus is determined according to the accumulated + * sleep avg over the duration the task has been running + * until it reaches MAX_SLEEP_AVG. -ck + */ + bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/sleep_period/100 - MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2; prio = p->static_prio - bonus; @@ -350,31 +389,47 @@ static inline void activate_task(task_t long sleep_time = jiffies - p->last_run - 1; if (sleep_time > 0) { - int sleep_avg; - /* - * This code gives a bonus to interactive tasks. - * - * The boost works by updating the 'average sleep time' - * value here, based on ->last_run. The more time a task - * spends sleeping, the higher the average gets - and the - * higher the priority boost gets as well. + * User tasks that sleep a long time are categorised as idle and + * will get just under interactive status with a small runtime + * to allow them to become interactive or non-interactive rapidly */ - sleep_avg = p->sleep_avg + sleep_time; + if (sleep_time > MIN_SLEEP_AVG && p->mm){ + p->avg_start = jiffies - MIN_SLEEP_AVG; + p->sleep_avg = MIN_SLEEP_AVG * (MAX_BONUS - INTERACTIVE_DELTA - 2) / + MAX_BONUS; + } else { + unsigned long runtime = jiffies - p->avg_start; - /* - * 'Overflow' bonus ticks go to the waker as well, so the - * ticks are not lost. This has the effect of further - * boosting tasks that are related to maximum-interactive - * tasks. - */ - if (sleep_avg > MAX_SLEEP_AVG) - sleep_avg = MAX_SLEEP_AVG; - if (p->sleep_avg != sleep_avg) { - p->sleep_avg = sleep_avg; - p->prio = effective_prio(p); + if (runtime > MAX_SLEEP_AVG) + runtime = MAX_SLEEP_AVG; + + /* + * This code gives a bonus to interactive tasks. + * + * The boost works by updating the 'average sleep time' + * value here, based on ->last_run. The more time a task + * spends sleeping, the higher the average gets - and the + * higher the priority boost gets as well. 
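To make the reworked bonus arithmetic in effective_prio() above concrete, a worked example (a sketch assuming the stock 2.6.0-test1 constants MAX_USER_PRIO = 40 and PRIO_BONUS_RATIO = 25, so that MAX_USER_PRIO*PRIO_BONUS_RATIO/100 is 10 and the constant term is 5):

	/*
	 * bonus = 10 * sleep_avg / sleep_period - 5, and prio = static_prio - bonus:
	 *
	 *   sleep_avg == sleep_period      bonus = +5  (five levels of boost)
	 *   sleep_avg == sleep_period/2    bonus =  0  (priority unchanged)
	 *   sleep_avg == 0                 bonus = -5  (five levels of penalty)
	 */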
+ */ + p->sleep_avg += sleep_time; + + /* + * Processes that sleep get pushed to a higher priority + * each time they sleep + */ + p->sleep_avg = (p->sleep_avg * MAX_BONUS / runtime + 1) * runtime / MAX_BONUS; + + if (p->sleep_avg > MAX_SLEEP_AVG) + p->sleep_avg = MAX_SLEEP_AVG; + } + + if (unlikely(p->avg_start > jiffies)){ + p->avg_start = jiffies; + p->sleep_avg = 0; } } + p->prio = effective_prio(p); __activate_task(p, rq); } @@ -489,7 +544,7 @@ repeat_lock_task: */ if (unlikely(sync && !task_running(rq, p) && (task_cpu(p) != smp_processor_id()) && - (p->cpus_allowed & (1UL << smp_processor_id())))) { + cpu_isset(smp_processor_id(), p->cpus_allowed))) { set_task_cpu(p, smp_processor_id()); task_rq_unlock(rq, &flags); @@ -551,6 +606,7 @@ void wake_up_forked_process(task_t * p) * from forking tasks that are max-interactive. */ current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100; + normalise_sleep(p); p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100; p->prio = effective_prio(p); set_task_cpu(p, smp_processor_id()); @@ -591,6 +647,8 @@ void sched_exit(task_t * p) * If the child was a (relative-) CPU hog then decrease * the sleep_avg of the parent as well. */ + normalise_sleep(p); + normalise_sleep(p->parent); if (p->sleep_avg < p->parent->sleep_avg) p->parent->sleep_avg = (p->parent->sleep_avg * EXIT_WEIGHT + p->sleep_avg) / (EXIT_WEIGHT + 1); @@ -758,13 +816,13 @@ static inline void double_rq_unlock(runq */ static void sched_migrate_task(task_t *p, int dest_cpu) { - unsigned long old_mask; + cpumask_t old_mask; old_mask = p->cpus_allowed; - if (!(old_mask & (1UL << dest_cpu))) + if (!cpu_isset(dest_cpu, old_mask)) return; /* force the process onto the specified CPU */ - set_cpus_allowed(p, 1UL << dest_cpu); + set_cpus_allowed(p, cpumask_of_cpu(dest_cpu)); /* restore the cpus allowed mask */ set_cpus_allowed(p, old_mask); @@ -777,7 +835,7 @@ static void sched_migrate_task(task_t *p static int sched_best_cpu(struct task_struct *p) { int i, minload, load, best_cpu, node = 0; - unsigned long cpumask; + cpumask_t cpumask; best_cpu = task_cpu(p); if (cpu_rq(best_cpu)->nr_running <= 2) @@ -801,7 +859,7 @@ static int sched_best_cpu(struct task_st minload = 10000000; cpumask = node_to_cpumask(node); for (i = 0; i < NR_CPUS; ++i) { - if (!(cpumask & (1UL << i))) + if (!cpu_isset(i, cpumask)) continue; if (cpu_rq(i)->nr_running < minload) { best_cpu = i; @@ -888,7 +946,7 @@ static inline unsigned int double_lock_b /* * find_busiest_queue - find the busiest runqueue among the cpus in cpumask. */ -static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle, int *imbalance, unsigned long cpumask) +static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle, int *imbalance, cpumask_t cpumask) { int nr_running, load, max_load, i; runqueue_t *busiest, *rq_src; @@ -923,7 +981,7 @@ static inline runqueue_t *find_busiest_q busiest = NULL; max_load = 1; for (i = 0; i < NR_CPUS; i++) { - if (!((1UL << i) & cpumask)) + if (!cpu_isset(i, cpumask)) continue; rq_src = cpu_rq(i); @@ -942,10 +1000,10 @@ static inline runqueue_t *find_busiest_q if (likely(!busiest)) goto out; - *imbalance = (max_load - nr_running) / 2; + *imbalance = max_load - nr_running; /* It needs an at least ~25% imbalance to trigger balancing. 
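Concretely, with the reworked imbalance arithmetic below (illustrative numbers):

	/*
	 * max_load = 8, nr_running = 5: *imbalance = 3, and 3*4 >= 8, so
	 * balancing proceeds; *imbalance /= 2 then leaves 1 task to pull.
	 * max_load = 8, nr_running = 7: *imbalance = 1, and 1*4 < 8, so the
	 * ~25% threshold suppresses the move entirely.
	 */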
*/ - if (!idle && (*imbalance < (max_load + 3)/4)) { + if (!idle && ((*imbalance)*4 < max_load)) { busiest = NULL; goto out; } @@ -955,10 +1013,15 @@ static inline runqueue_t *find_busiest_q * Make sure nothing changed since we checked the * runqueue length. */ - if (busiest->nr_running <= nr_running + 1) { + if (busiest->nr_running <= nr_running) { spin_unlock(&busiest->lock); busiest = NULL; } + /* + * We only want to steal a number of tasks equal to 1/2 the imbalance, + * otherwise we'll just shift the imbalance to the new queue: + */ + *imbalance /= 2; out: return busiest; } @@ -995,7 +1058,7 @@ static inline void pull_task(runqueue_t * We call this with the current runqueue locked, * irqs disabled. */ -static void load_balance(runqueue_t *this_rq, int idle, unsigned long cpumask) +static void load_balance(runqueue_t *this_rq, int idle, cpumask_t cpumask) { int imbalance, idx, this_cpu = smp_processor_id(); runqueue_t *busiest; @@ -1049,7 +1112,7 @@ skip_queue: #define CAN_MIGRATE_TASK(p,rq,this_cpu) \ ((!idle || (jiffies - (p)->last_run > cache_decay_ticks)) && \ !task_running(rq, p) && \ - ((p)->cpus_allowed & (1UL << (this_cpu)))) + cpu_isset(this_cpu, (p)->cpus_allowed)) curr = curr->prev; @@ -1092,10 +1155,10 @@ out: static void balance_node(runqueue_t *this_rq, int idle, int this_cpu) { int node = find_busiest_node(cpu_to_node(this_cpu)); - unsigned long cpumask, this_cpumask = 1UL << this_cpu; if (node >= 0) { - cpumask = node_to_cpumask(node) | this_cpumask; + cpumask_t cpumask = node_to_cpumask(node); + cpu_set(this_cpu, cpumask); spin_lock(&this_rq->lock); load_balance(this_rq, idle, cpumask); spin_unlock(&this_rq->lock); @@ -1207,11 +1270,7 @@ void scheduler_tick(int user_ticks, int spin_lock(&rq->lock); /* * The task was running during this tick - update the - * time slice counter and the sleep average. Note: we - * do not update a thread's priority until it either - * goes to sleep or uses up its timeslice. This makes - * it possible for interactive tasks to use up their - * timeslices at their highest priority levels. + * time slice counter and the sleep average. */ if (p->sleep_avg) p->sleep_avg--; @@ -1244,6 +1303,17 @@ void scheduler_tick(int user_ticks, int enqueue_task(p, rq->expired); } else enqueue_task(p, rq->active); + } else if (p->mm && !((task_timeslice(p) - p->time_slice) % + (MIN_TIMESLICE * (MAX_BONUS + 1 - p->sleep_avg * MAX_BONUS / MAX_SLEEP_AVG)))){ + /* + * Running user tasks get requeued with their remaining timeslice + * after a period proportional to how cpu intensive they are to + * minimise the duration one interactive task can starve another + */ + dequeue_task(p, rq->active); + set_tsk_need_resched(p); + p->prio = effective_prio(p); + enqueue_task(p, rq->active); } out_unlock: spin_unlock(&rq->lock); @@ -1898,7 +1968,7 @@ out_unlock: asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr) { - unsigned long new_mask; + cpumask_t new_mask; int retval; task_t *p; @@ -1946,7 +2016,7 @@ asmlinkage long sys_sched_getaffinity(pi unsigned long __user *user_mask_ptr) { unsigned int real_len; - unsigned long mask; + cpumask_t mask; int retval; task_t *p; @@ -1962,7 +2032,7 @@ asmlinkage long sys_sched_getaffinity(pi goto out_unlock; retval = 0; - mask = p->cpus_allowed & cpu_online_map; + cpus_and(mask, p->cpus_allowed, cpu_online_map); out_unlock: read_unlock(&tasklist_lock); @@ -2292,7 +2362,7 @@ typedef struct { * task must not exit() & deallocate itself prematurely. 
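For orientation, a minimal caller of the function this comment describes, under the new cpumask_t signature (a sketch mirroring the conversions elsewhere in this patch, not itself a hunk):

	cpumask_t mask = cpumask_of_cpu(0);	/* single-CPU mask for CPU 0 */

	if (set_cpus_allowed(current, mask) < 0)
		printk(KERN_ERR "could not pin thread to CPU 0\n");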
The * call is not atomic; no spinlocks may be held. */ -int set_cpus_allowed(task_t *p, unsigned long new_mask) +int set_cpus_allowed(task_t *p, cpumask_t new_mask) { unsigned long flags; migration_req_t req; @@ -2307,7 +2377,7 @@ int set_cpus_allowed(task_t *p, unsigned * Can the task run on the task's current CPU? If not then * migrate the thread off to a proper CPU. */ - if (new_mask & (1UL << task_cpu(p))) { + if (cpu_isset(task_cpu(p), new_mask)) { task_rq_unlock(rq, &flags); return 0; } @@ -2377,7 +2447,7 @@ static int migration_thread(void * data) * migration thread on this CPU, guaranteed (we're started * serially). */ - set_cpus_allowed(current, 1UL << cpu); + set_cpus_allowed(current, cpumask_of_cpu(cpu)); ret = setscheduler(0, SCHED_FIFO, &param); diff -prauN linux-2.6.0-test1/kernel/softirq.c wli-2.6.0-test1-37/kernel/softirq.c --- linux-2.6.0-test1/kernel/softirq.c 2003-07-13 20:31:58.000000000 -0700 +++ wli-2.6.0-test1-37/kernel/softirq.c 2003-07-14 06:31:10.000000000 -0700 @@ -322,9 +322,8 @@ static int ksoftirqd(void * __bind_cpu) current->flags |= PF_IOTHREAD; /* Migrate to the right CPU */ - set_cpus_allowed(current, 1UL << cpu); - if (smp_processor_id() != cpu) - BUG(); + set_cpus_allowed(current, cpumask_of_cpu(cpu)); + BUG_ON(smp_processor_id() != cpu); __set_current_state(TASK_INTERRUPTIBLE); mb(); diff -prauN linux-2.6.0-test1/kernel/workqueue.c wli-2.6.0-test1-37/kernel/workqueue.c --- linux-2.6.0-test1/kernel/workqueue.c 2003-07-13 20:38:52.000000000 -0700 +++ wli-2.6.0-test1-37/kernel/workqueue.c 2003-07-14 06:31:10.000000000 -0700 @@ -176,7 +176,7 @@ static int worker_thread(void *__startup cwq->thread = current; set_user_nice(current, -10); - set_cpus_allowed(current, 1UL << cpu); + set_cpus_allowed(current, cpumask_of_cpu(cpu)); complete(&startup->done); diff -prauN linux-2.6.0-test1/mm/filemap.c wli-2.6.0-test1-37/mm/filemap.c --- linux-2.6.0-test1/mm/filemap.c 2003-07-13 20:32:41.000000000 -0700 +++ wli-2.6.0-test1-37/mm/filemap.c 2003-07-17 15:01:26.000000000 -0700 @@ -55,13 +55,16 @@ /* * Lock ordering: * - * ->i_shared_sem (vmtruncate) - * ->private_lock (__free_pte->__set_page_dirty_buffers) + * ->i_shared_lock (vmtruncate) + * ->private_lock (__free_pte->set_page_dirty_buffers) * ->swap_list_lock * ->swap_device_lock (exclusive_swap_page, others) * ->mapping->page_lock * ->mmap_sem - * ->i_shared_sem (various places) + * ->i_shared_lock (various places) + * + * ->lock_page + * ->i_shared_lock (page_convert_anon) * * ->inode_lock * ->sb_lock (fs/fs-writeback.c) @@ -79,11 +82,11 @@ */ void __remove_from_page_cache(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); radix_tree_delete(&mapping->page_tree, page->index); list_del(&page->list); - page->mapping = NULL; + set_page_mapping(page, NULL); mapping->nrpages--; pagecache_acct(-1); @@ -91,22 +94,24 @@ void remove_from_page_cache(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); - if (unlikely(!PageLocked(page))) - PAGE_BUG(page); + BUG_ON(!PageLocked(page)); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); __remove_from_page_cache(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); + page_cache_release(page); } static inline int sync_page(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); if
(mapping && mapping->a_ops && mapping->a_ops->sync_page) return mapping->a_ops->sync_page(page); + if (PageSwapCache(page)) + blk_run_queues(); return 0; } @@ -130,9 +135,9 @@ static int __filemap_fdatawrite(struct a if (mapping->backing_dev_info->memory_backed) return 0; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); list_splice_init(&mapping->dirty_pages, &mapping->io_pages); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); ret = do_writepages(mapping, &wbc); return ret; } @@ -163,7 +168,7 @@ int filemap_fdatawait(struct address_spa restart: progress = 0; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); while (!list_empty(&mapping->locked_pages)) { struct page *page; @@ -177,7 +182,7 @@ restart: if (!PageWriteback(page)) { if (++progress > 32) { if (need_resched()) { - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); __cond_resched(); goto restart; } @@ -187,16 +192,16 @@ restart: progress = 0; page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); wait_on_page_writeback(page); if (PageError(page)) ret = -EIO; page_cache_release(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); return ret; } @@ -204,16 +209,9 @@ restart: * This adds a page to the page cache, starting out as locked, unreferenced, * not uptodate and with no errors. * - * This function is used for two things: adding newly allocated pagecache - * pages and for moving existing anon pages into swapcache. - * - * In the case of pagecache pages, the page is new, so we can just run - * SetPageLocked() against it. The other page state flags were set by - * rmqueue() - * - * In the case of swapcache, try_to_swap_out() has already locked the page, so - * SetPageLocked() is ugly-but-OK there too. The required page state has been - * set up by swap_out_add_to_swap_cache(). + * This function is used to add newly allocated pagecache pages; + * the page is new, so we can just run SetPageLocked() against it. + * The other page state flags were set by rmqueue(). * * This function does not add the page to the LRU. The caller must do that. */ @@ -224,15 +222,19 @@ int add_to_page_cache(struct page *page, if (error == 0) { page_cache_get(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); if (!error) { SetPageLocked(page); - ___add_to_page_cache(page, mapping, offset); + list_add(&page->list, &mapping->clean_pages); + set_page_mapping(page, mapping); + page->index = offset; + mapping->nrpages++; + pagecache_acct(+1); } else { page_cache_release(page); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); radix_tree_preload_end(); } return error; @@ -361,11 +363,11 @@ struct page * find_get_page(struct addre * We scan the hash list read-only. Addition to and removal from * the hash-list needs a held write-lock. 
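The rwlock conversion makes the comment above literal; the discipline applied throughout the filemap.c hunks is (sketch, using only calls visible in this patch):

	/* lookups only read the radix tree: the shared lock suffices */
	mapping_rdlock(&mapping->page_lock);
	page = radix_tree_lookup(&mapping->page_tree, offset);
	if (page)
		page_cache_get(page);
	mapping_rdunlock(&mapping->page_lock);

	/* insertion and removal modify it: the exclusive lock is required */
	mapping_wrlock(&mapping->page_lock);
	__remove_from_page_cache(page);
	mapping_wrunlock(&mapping->page_lock);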
*/ - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); page = radix_tree_lookup(&mapping->page_tree, offset); if (page) page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return page; } @@ -376,11 +378,11 @@ struct page *find_trylock_page(struct ad { struct page *page; - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); page = radix_tree_lookup(&mapping->page_tree, offset); if (page && TestSetPageLocked(page)) page = NULL; - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return page; } @@ -400,25 +402,25 @@ struct page *find_lock_page(struct addre { struct page *page; - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); repeat: page = radix_tree_lookup(&mapping->page_tree, offset); if (page) { page_cache_get(page); if (TestSetPageLocked(page)) { - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); lock_page(page); - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); /* Has the page been truncated while we slept? */ - if (page->mapping != mapping || page->index != offset) { + if (page_mapping(page) != mapping || page->index != offset) { unlock_page(page); page_cache_release(page); goto repeat; } } } - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return page; } @@ -488,12 +490,12 @@ unsigned int find_get_pages(struct addre unsigned int i; unsigned int ret; - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); ret = radix_tree_gang_lookup(&mapping->page_tree, (void **)pages, start, nr_pages); for (i = 0; i < ret; i++) page_cache_get(pages[i]); - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return ret; } @@ -621,8 +623,8 @@ page_not_up_to_date: /* Get exclusive access to the page ... */ lock_page(page); - /* Did it get unhashed before we got the lock? */ - if (!page->mapping) { + /* Did it get removed from the radix tree before we got the lock? */ + if (!page_mapping(page)) { unlock_page(page); page_cache_release(page); continue; @@ -1053,8 +1055,8 @@ page_not_uptodate: inc_page_state(pgmajfault); lock_page(page); - /* Did it get unhashed while we waited for it? */ - if (!page->mapping) { + /* Did it get removed from the radix tree while we waited for it? */ + if (!page_mapping(page)) { unlock_page(page); page_cache_release(page); goto retry_all; @@ -1081,7 +1083,7 @@ page_not_uptodate: lock_page(page); /* Somebody truncated the page on us? */ - if (!page->mapping) { + if (!page_mapping(page)) { unlock_page(page); page_cache_release(page); goto retry_all; @@ -1160,8 +1162,8 @@ no_cached_page: page_not_uptodate: lock_page(page); - /* Did it get unhashed while we waited for it? */ - if (!page->mapping) { + /* Did it get removed from the radix tree while we waited for it? */ + if (!page_mapping(page)) { unlock_page(page); goto err; } @@ -1187,7 +1189,7 @@ page_not_uptodate: lock_page(page); /* Somebody truncated the page on us? 
*/ - if (!page->mapping) { + if (!page_mapping(page)) { unlock_page(page); goto err; } @@ -1355,7 +1357,7 @@ retry: goto out; lock_page(page); - if (!page->mapping) { + if (!page_mapping(page)) { unlock_page(page); page_cache_release(page); goto retry; diff -prauN linux-2.6.0-test1/mm/fremap.c wli-2.6.0-test1-37/mm/fremap.c --- linux-2.6.0-test1/mm/fremap.c 2003-07-13 20:29:22.000000000 -0700 +++ wli-2.6.0-test1-37/mm/fremap.c 2003-07-14 09:10:59.000000000 -0700 @@ -12,11 +12,14 @@ #include #include #include -#include +#include #include #include #include +/* + * This is never done to an anonymous page so page->mapping is never altered. + */ static inline int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { @@ -28,13 +31,13 @@ static inline int zap_pte(struct mm_stru unsigned long pfn = pte_pfn(pte); flush_cache_page(vma, addr); - pte = ptep_get_and_clear(ptep); + pte = vm_ptep_get_and_clear(vma, ptep, addr); if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); if (!PageReserved(page)) { if (pte_dirty(pte)) set_page_dirty(page); - page_remove_rmap(page, ptep); + page_remove_rmap(page); page_cache_release(page); mm->rss--; } @@ -43,7 +46,7 @@ static inline int zap_pte(struct mm_stru } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); - pte_clear(ptep); + vm_pte_clear(vma, ptep, addr); return 0; } } @@ -59,19 +62,18 @@ int install_page(struct mm_struct *mm, s pte_t *pte; pgd_t *pgd; pmd_t *pmd; - struct pte_chain *pte_chain; - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto err; pgd = pgd_offset(mm, addr); + if (!rmap_get_cpu()) + goto err; spin_lock(&mm->page_table_lock); + put_cpu(); - pmd = pmd_alloc(mm, pgd, addr); + pmd = pmd_alloc_map(mm, pgd, addr); if (!pmd) goto err_unlock; - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map(mm, pgd, &pmd, addr); if (!pte) goto err_unlock; @@ -79,19 +81,18 @@ int install_page(struct mm_struct *mm, s mm->rss++; flush_icache_page(vma, page); - set_pte(pte, mk_pte(page, prot)); - pte_chain = page_add_rmap(page, pte, pte_chain); + vm_set_pte(vma, pte, mk_pte(page, prot), addr); + if (!PageReserved(page)) + page_add_rmap(page, vma, addr, 0); pte_unmap(pte); + pmd_unmap(pmd); if (flush) flush_tlb_page(vma, addr); update_mmu_cache(vma, addr, *pte); - spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); - return 0; + err = 0; err_unlock: spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); err: return err; } diff -prauN linux-2.6.0-test1/mm/memory.c wli-2.6.0-test1-37/mm/memory.c --- linux-2.6.0-test1/mm/memory.c 2003-07-13 20:33:49.000000000 -0700 +++ wli-2.6.0-test1-37/mm/memory.c 2003-07-17 15:47:44.000000000 -0700 @@ -44,10 +44,9 @@ #include #include #include -#include +#include #include -#include #include #include #include @@ -96,7 +95,7 @@ static inline void free_one_pmd(struct m } page = pmd_page(*dir); pmd_clear(dir); - pgtable_remove_rmap(page); + dec_page_state(nr_page_table_pages); pte_free_tlb(tlb, page); } @@ -104,6 +103,7 @@ static inline void free_one_pgd(struct m { int j; pmd_t * pmd; + struct page *page; if (pgd_none(*dir)) return; @@ -112,11 +112,13 @@ static inline void free_one_pgd(struct m pgd_clear(dir); return; } - pmd = pmd_offset(dir, 0); + page = pgd_page(*dir); + pmd = pmd_offset_map(dir, 0); pgd_clear(dir); for (j = 0; j < PTRS_PER_PMD ; j++) free_one_pmd(tlb, pmd+j); - pmd_free_tlb(tlb, pmd); + pmd_unmap(pmd); + pmd_free_tlb(tlb, page); } /* @@ -136,30 +138,38 @@ void clear_page_tables(struct mmu_gather } while 
(--nr); } -pte_t * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +/* + * error return happens with pmd unmapped + */ +pte_t *pte_alloc_map(struct mm_struct *mm, pgd_t *pgd, pmd_t **pmd, unsigned long addr) { - if (!pmd_present(*pmd)) { + if (!pmd_present(**pmd)) { struct page *new; + pmd_unmap(*pmd); spin_unlock(&mm->page_table_lock); - new = pte_alloc_one(mm, address); + new = pte_alloc_one(mm, addr); spin_lock(&mm->page_table_lock); - if (!new) + if (!new) { + *pmd = NULL; return NULL; + } + + *pmd = pmd_offset_map(pgd, addr); /* * Because we dropped the lock, we should re-check the * entry, as somebody else could have populated it.. */ - if (pmd_present(*pmd)) { + if (pmd_present(**pmd)) { pte_free(new); goto out; } - pgtable_add_rmap(new, mm, address); - pmd_populate(mm, pmd, new); + inc_page_state(nr_page_table_pages); + pmd_populate(mm, *pmd, new); } out: - return pte_offset_map(pmd, address); + return pte_offset_map(*pmd, addr); } pte_t * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address) @@ -181,7 +191,7 @@ pte_t * pte_alloc_kernel(struct mm_struc pte_free_kernel(new); goto out; } - pgtable_add_rmap(virt_to_page(new), mm, address); + inc_page_state(nr_page_table_pages); pmd_populate_kernel(mm, pmd, new); } out: @@ -199,7 +209,7 @@ out: * variable count and make things faster. -jj * * dst->page_table_lock is held on entry and exit, - * but may be dropped within pmd_alloc() and pte_alloc_map(). + * but may be dropped within pmd_alloc_map() and pte_alloc_map(). */ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) @@ -208,20 +218,10 @@ int copy_page_range(struct mm_struct *ds unsigned long address = vma->vm_start; unsigned long end = vma->vm_end; unsigned long cow; - struct pte_chain *pte_chain = NULL; if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst, src, vma); - pte_chain = pte_chain_alloc(GFP_ATOMIC); - if (!pte_chain) { - spin_unlock(&dst->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - spin_lock(&dst->page_table_lock); - if (!pte_chain) - goto nomem; - } - cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; src_pgd = pgd_offset(src, address)-1; dst_pgd = pgd_offset(dst, address)-1; @@ -244,11 +244,10 @@ skip_copy_pmd_range: address = (address continue; } - src_pmd = pmd_offset(src_pgd, address); - dst_pmd = pmd_alloc(dst, dst_pgd, address); + dst_pmd = pmd_alloc_map(dst, dst_pgd, address); if (!dst_pmd) goto nomem; - + src_pmd = pmd_offset_map_nested(src_pgd, address); do { pte_t * src_pte, * dst_pte; @@ -261,15 +260,20 @@ skip_copy_pmd_range: address = (address pmd_clear(src_pmd); skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; - if (address >= end) + if (address >= end) { + pmd_unmap(dst_pmd); + pmd_unmap_nested(src_pmd); goto out; + } goto cont_copy_pmd_range; } - dst_pte = pte_alloc_map(dst, dst_pmd, address); + pmd_unmap_nested(src_pmd); + dst_pte = pte_alloc_map(dst, dst_pgd, &dst_pmd, address); if (!dst_pte) goto nomem; spin_lock(&src->page_table_lock); + src_pmd = pmd_offset_map_nested(src_pgd, address); src_pte = pte_offset_map_nested(src_pmd, address); do { pte_t pte = *src_pte; @@ -284,8 +288,7 @@ skip_copy_pte_range: if (!pte_present(pte)) { if (!pte_file(pte)) swap_duplicate(pte_to_swp_entry(pte)); - set_pte(dst_pte, pte); - goto cont_copy_pte_range_noset; + goto cont_copy_pte_range; } pfn = pte_pfn(pte); /* the pte points outside of valid memory, the @@ -293,13 +296,13 @@ skip_copy_pte_range: * and not mapped via rmap 
- duplicate the * mapping as is. */ - page = NULL; - if (pfn_valid(pfn)) - page = pfn_to_page(pfn); - - if (!page || PageReserved(page)) { - set_pte(dst_pte, pte); - goto cont_copy_pte_range_noset; + if (!pfn_valid(pfn)) { + page = NULL; + goto cont_copy_pte_range; + } else { + page = pfn_to_page(pfn); + if (PageReserved(page)) + goto cont_copy_pte_range; } /* @@ -307,7 +310,7 @@ skip_copy_pte_range: * in the parent and the child */ if (cow) { - ptep_set_wrprotect(src_pte); + vm_ptep_set_wrprotect(src, src_pte); pte = *src_pte; } @@ -320,35 +323,14 @@ skip_copy_pte_range: pte = pte_mkold(pte); get_page(page); dst->rss++; - - set_pte(dst_pte, pte); - pte_chain = page_add_rmap(page, dst_pte, - pte_chain); - if (pte_chain) - goto cont_copy_pte_range_noset; - pte_chain = pte_chain_alloc(GFP_ATOMIC); - if (pte_chain) - goto cont_copy_pte_range_noset; - - /* - * pte_chain allocation failed, and we need to - * run page reclaim. - */ - pte_unmap_nested(src_pte); - pte_unmap(dst_pte); - spin_unlock(&src->page_table_lock); - spin_unlock(&dst->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - spin_lock(&dst->page_table_lock); - if (!pte_chain) - goto nomem; - spin_lock(&src->page_table_lock); - dst_pte = pte_offset_map(dst_pmd, address); - src_pte = pte_offset_map_nested(src_pmd, - address); + page_dup_rmap(page); +cont_copy_pte_range: + vm_set_pte(vma, dst_pte, pte, address); cont_copy_pte_range_noset: address += PAGE_SIZE; if (address >= end) { + pmd_unmap(dst_pmd); + pmd_unmap_nested(src_pmd); pte_unmap_nested(src_pte); pte_unmap(dst_pte); goto out_unlock; @@ -364,19 +346,19 @@ cont_copy_pmd_range: src_pmd++; dst_pmd++; } while ((unsigned long)src_pmd & PMD_TABLE_MASK); + pmd_unmap_nested(src_pmd-1); + pmd_unmap(dst_pmd-1); } out_unlock: spin_unlock(&src->page_table_lock); out: - pte_chain_free(pte_chain); return 0; nomem: - pte_chain_free(pte_chain); return -ENOMEM; } static void -zap_pte_range(struct mmu_gather *tlb, pmd_t * pmd, +zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, unsigned long size) { unsigned long offset; @@ -401,32 +383,32 @@ zap_pte_range(struct mmu_gather *tlb, pm if (pte_present(pte)) { unsigned long pfn = pte_pfn(pte); - pte = ptep_get_and_clear(ptep); + pte = vm_ptep_get_and_clear(vma, ptep, address + offset); tlb_remove_tlb_entry(tlb, ptep, address+offset); if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); if (!PageReserved(page)) { if (pte_dirty(pte)) set_page_dirty(page); - if (page->mapping && pte_young(pte) && + if (page_mapping(page) && pte_young(pte) && !PageSwapCache(page)) mark_page_accessed(page); tlb->freed++; - page_remove_rmap(page, ptep); + page_remove_rmap(page); tlb_remove_page(tlb, page); } } } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); - pte_clear(ptep); + vm_pte_clear(vma, ptep, address); } } pte_unmap(ptep-1); } static void -zap_pmd_range(struct mmu_gather *tlb, pgd_t * dir, +zap_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pgd_t *dir, unsigned long address, unsigned long size) { pmd_t * pmd; @@ -439,15 +421,16 @@ zap_pmd_range(struct mmu_gather *tlb, pg pgd_clear(dir); return; } - pmd = pmd_offset(dir, address); + pmd = pmd_offset_map(dir, address); end = address + size; if (end > ((address + PGDIR_SIZE) & PGDIR_MASK)) end = ((address + PGDIR_SIZE) & PGDIR_MASK); do { - zap_pte_range(tlb, pmd, address, end - address); + zap_pte_range(tlb, vma, pmd, address, end - address); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while 
(address < end); + pmd_unmap(pmd - 1); } void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, @@ -465,7 +448,7 @@ void unmap_page_range(struct mmu_gather dir = pgd_offset(vma->vm_mm, address); tlb_start_vma(tlb, vma); do { - zap_pmd_range(tlb, dir, address, end - address); + zap_pmd_range(tlb, vma, dir, address, end - address); address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); @@ -629,20 +612,27 @@ follow_page(struct mm_struct *mm, unsign if (pgd_none(*pgd) || pgd_bad(*pgd)) goto out; - pmd = pmd_offset(pgd, address); + pmd = pmd_offset_map(pgd, address); if (pmd_none(*pmd)) - goto out; - if (pmd_huge(*pmd)) - return follow_huge_pmd(mm, address, pmd, write); - if (pmd_bad(*pmd)) - goto out; + goto out_unmap; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + goto out_unmap; + } + if (pmd_huge(*pmd)) { + struct page *page = follow_huge_pmd(mm, address, pmd, write); + pmd_unmap(pmd); + return page; + } ptep = pte_offset_map(pmd, address); if (!ptep) - goto out; + goto out_unmap; pte = *ptep; pte_unmap(ptep); + pmd_unmap(pmd); if (pte_present(pte)) { if (!write || (pte_write(pte) && pte_dirty(pte))) { pfn = pte_pfn(pte); @@ -653,6 +643,9 @@ follow_page(struct mm_struct *mm, unsign out: return NULL; +out_unmap: + pmd_unmap(pmd); + goto out; } /* @@ -711,7 +704,7 @@ int get_user_pages(struct task_struct *t pgd = pgd_offset_k(pg); if (!pgd) return i ? : -EFAULT; - pmd = pmd_offset(pgd, pg); + pmd = pmd_offset_kernel(pgd, pg); if (!pmd) return i ? : -EFAULT; pte = pte_offset_kernel(pmd, pg); @@ -785,8 +778,8 @@ out: return i; } -static void zeromap_pte_range(pte_t * pte, unsigned long address, - unsigned long size, pgprot_t prot) +static void zeromap_pte_range(struct vm_area_struct *vma, pte_t *pte, + unsigned long address, unsigned long size, pgprot_t prot) { unsigned long end; @@ -797,14 +790,14 @@ static void zeromap_pte_range(pte_t * pt do { pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot)); BUG_ON(!pte_none(*pte)); - set_pte(pte, zero_pte); + vm_set_pte(vma, pte, zero_pte, address); address += PAGE_SIZE; pte++; } while (address && (address < end)); } -static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, - unsigned long size, pgprot_t prot) +static inline int zeromap_pmd_range(struct vm_area_struct *vma, pgd_t *pgd, + pmd_t **pmd, unsigned long address, unsigned long size, pgprot_t prot) { unsigned long end; @@ -813,13 +806,13 @@ static inline int zeromap_pmd_range(stru if (end > PGDIR_SIZE) end = PGDIR_SIZE; do { - pte_t * pte = pte_alloc_map(mm, pmd, address); + pte_t *pte = pte_alloc_map(vma->vm_mm, pgd, pmd, address); if (!pte) return -ENOMEM; - zeromap_pte_range(pte, address, end - address, prot); + zeromap_pte_range(vma, pte, address, end - address, prot); pte_unmap(pte); address = (address + PMD_SIZE) & PMD_MASK; - pmd++; + (*pmd)++; } while (address && (address < end)); return 0; } @@ -839,13 +832,14 @@ int zeromap_page_range(struct vm_area_st spin_lock(&mm->page_table_lock); do { - pmd_t *pmd = pmd_alloc(mm, dir, address); + pmd_t *pmd = pmd_alloc_map(mm, dir, address); error = -ENOMEM; if (!pmd) break; - error = zeromap_pmd_range(mm, pmd, address, end - address, prot); + error = zeromap_pmd_range(vma, dir, &pmd, address, end - address, prot); if (error) break; + pmd_unmap(pmd - 1); address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); @@ -859,8 +853,9 @@ int zeromap_page_range(struct vm_area_st * mappings are removed. 
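The convention recurring through these mm/memory.c hunks, stated once (a sketch: under this patch pmd pages may be kmapped, so user-space walks pair pmd_offset_map()/pmd_alloc_map() with pmd_unmap(), while kernel walks use the pmd_offset_kernel()/pmd_alloc_kernel() variants; pte_alloc_map() now takes the pgd plus a pmd pointer so it can re-map the pmd after a sleeping allocation):

	pmd = pmd_offset_map(pgd, address);		/* kmaps the pmd page */
	pte = pte_alloc_map(mm, pgd, &pmd, address);	/* may sleep and remap *pmd */
	if (!pte)
		goto nomem;	/* error path: pte_alloc_map() returns with the pmd unmapped */
	/* ... operate on the entry ... */
	pte_unmap(pte);
	pmd_unmap(pmd);					/* drop the kmap */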
any references to nonexistent pages results * in null mappings (currently treated as "copy-on-access") */ -static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size, - unsigned long phys_addr, pgprot_t prot) +static inline void remap_pte_range(struct vm_area_struct *vma, pte_t *pte, + unsigned long address, unsigned long size, + unsigned long phys_addr, pgprot_t prot) { unsigned long end; unsigned long pfn; @@ -873,15 +868,16 @@ static inline void remap_pte_range(pte_t do { BUG_ON(!pte_none(*pte)); if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) - set_pte(pte, pfn_pte(pfn, prot)); + vm_set_pte(vma, pte, pfn_pte(pfn, prot), address); address += PAGE_SIZE; pfn++; pte++; } while (address && (address < end)); } -static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size, - unsigned long phys_addr, pgprot_t prot) +static inline int remap_pmd_range(struct vm_area_struct *vma, pgd_t *pgd, + pmd_t **pmd, unsigned long address, unsigned long size, + unsigned long phys_addr, pgprot_t prot) { unsigned long base, end; @@ -892,13 +888,13 @@ static inline int remap_pmd_range(struct end = PGDIR_SIZE; phys_addr -= address; do { - pte_t * pte = pte_alloc_map(mm, pmd, base + address); + pte_t *pte = pte_alloc_map(vma->vm_mm, pgd, pmd, base + address); if (!pte) return -ENOMEM; - remap_pte_range(pte, base + address, end - address, address + phys_addr, prot); + remap_pte_range(vma, pte, base + address, end - address, address + phys_addr, prot); pte_unmap(pte); address = (address + PMD_SIZE) & PMD_MASK; - pmd++; + (*pmd)++; } while (address && (address < end)); return 0; } @@ -920,13 +916,14 @@ int remap_page_range(struct vm_area_stru spin_lock(&mm->page_table_lock); do { - pmd_t *pmd = pmd_alloc(mm, dir, from); + pmd_t *pmd = pmd_alloc_map(mm, dir, from); error = -ENOMEM; if (!pmd) break; - error = remap_pmd_range(mm, pmd, from, end - from, phys_addr + from, prot); + error = remap_pmd_range(vma, dir, &pmd, from, end - from, phys_addr + from, prot); if (error) break; + pmd_unmap(pmd - 1); from = (from + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (from && (from < end)); @@ -943,9 +940,10 @@ int remap_page_range(struct vm_area_stru * * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock */ -static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry) +static inline void establish_pte(struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pte_t entry) { - set_pte(page_table, entry); + vm_set_pte(vma, page_table, entry, address); flush_tlb_page(vma, address); update_mmu_cache(vma, address, entry); } @@ -953,8 +951,9 @@ static inline void establish_pte(struct /* * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock */ -static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, - pte_t *page_table) +static inline void break_cow(struct vm_area_struct *vma, + struct page *new_page, unsigned long address, + pte_t *page_table) { invalidate_vcache(address, vma->vm_mm, new_page); flush_cache_page(vma, address); @@ -986,7 +985,6 @@ static int do_wp_page(struct mm_struct * { struct page *old_page, *new_page; unsigned long pfn = pte_pfn(pte); - struct pte_chain *pte_chain = NULL; int ret; if (unlikely(!pfn_valid(pfn))) { @@ -996,6 +994,7 @@ static int do_wp_page(struct mm_struct * * data, but for the moment just pretend this is OOM. 
*/ pte_unmap(page_table); + pmd_unmap(pmd); printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", address); goto oom; @@ -1004,17 +1003,22 @@ static int do_wp_page(struct mm_struct * if (!TestSetPageLocked(old_page)) { int reuse = can_share_swap_page(old_page); - unlock_page(old_page); - if (reuse) { + if (!reuse) + unlock_page(old_page); + else { flush_cache_page(vma, address); establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); + page_turn_rmap(old_page, vma); pte_unmap(page_table); + pmd_unmap(pmd); ret = VM_FAULT_MINOR; + unlock_page(old_page); goto out; } } pte_unmap(page_table); + pmd_unmap(pmd); /* * Ok, we need to copy. Oh, well.. @@ -1022,9 +1026,6 @@ static int do_wp_page(struct mm_struct * page_cache_get(old_page); spin_unlock(&mm->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto no_mem; new_page = alloc_page(GFP_HIGHUSER); if (!new_page) goto no_mem; @@ -1034,32 +1035,37 @@ static int do_wp_page(struct mm_struct * * Re-check the pte - we dropped the lock */ spin_lock(&mm->page_table_lock); + pmd = pmd_offset_map(pgd_offset(mm, address), address); page_table = pte_offset_map(pmd, address); if (pte_same(*page_table, pte)) { if (PageReserved(old_page)) ++mm->rss; - page_remove_rmap(old_page, page_table); + else + /* should be file-backed, ->__mapping not modified */ + page_remove_rmap(old_page); break_cow(vma, new_page, address, page_table); - pte_chain = page_add_rmap(new_page, page_table, pte_chain); + + /* we have a unique reference, so PG_locked need not be held */ + page_add_rmap(new_page, vma, address, 1); lru_cache_add_active(new_page); /* Free the old page.. */ new_page = old_page; } pte_unmap(page_table); + pmd_unmap(pmd); page_cache_release(new_page); page_cache_release(old_page); ret = VM_FAULT_MINOR; - goto out; +out: + spin_unlock(&mm->page_table_lock); + return ret; no_mem: page_cache_release(old_page); oom: ret = VM_FAULT_OOM; -out: - spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); - return ret; + goto out; } static void vmtruncate_list(struct list_head *head, unsigned long pgoff) @@ -1068,17 +1074,21 @@ static void vmtruncate_list(struct list_ struct vm_area_struct *vma; struct list_head *curr; - list_for_each(curr, head) { + list_for_each_rcu(curr, head) { + struct mmu_gather *tlb; + vma = list_entry(curr, struct vm_area_struct, shared); + + if (vma->vm_flags & VM_DEAD) + continue; + start = vma->vm_start; end = vma->vm_end; len = end - start; /* mapping wholly truncated? */ - if (vma->vm_pgoff >= pgoff) { - zap_page_range(vma, start, len); - continue; - } + if (vma->vm_pgoff >= pgoff) + goto nuke_it_all; /* mapping wholly unaffected? */ len = len >> PAGE_SHIFT; @@ -1089,7 +1099,13 @@ static void vmtruncate_list(struct list_ /* Ok, partially affected.. 
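Illustrative numbers for the partially-affected case handled next: a 20-page vma with vm_pgoff = 10 maps file pages 10 through 29; truncating at pgoff = 15 gives diff = 15 - 10 = 5, so start advances 5 << PAGE_SHIFT bytes into the vma and len shrinks to 15 pages, unmapping file pages 15 through 29 while pages 10 through 14 stay mapped.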
*/ start += diff << PAGE_SHIFT; len = (len - diff) << PAGE_SHIFT; - zap_page_range(vma, start, len); + end = start + len; +nuke_it_all: + spin_lock(&vma->vm_mm->page_table_lock); + tlb = tlb_gather_mmu(vma->vm_mm, 0); + unmap_page_range(tlb, vma, start, end); + tlb_finish_mmu(tlb, start, end); + spin_unlock(&vma->vm_mm->page_table_lock); } } @@ -1111,12 +1127,12 @@ int vmtruncate(struct inode * inode, lof goto do_expand; i_size_write(inode, offset); pgoff = (offset + PAGE_SIZE - 1) >> PAGE_SHIFT; - down(&mapping->i_shared_sem); + rcu_read_lock(); /* mapping->i_shared_lock */ if (unlikely(!list_empty(&mapping->i_mmap))) vmtruncate_list(&mapping->i_mmap, pgoff); if (unlikely(!list_empty(&mapping->i_mmap_shared))) vmtruncate_list(&mapping->i_mmap_shared, pgoff); - up(&mapping->i_shared_sem); + rcu_read_unlock(); /* mapping->i_shared_lock */ truncate_inode_pages(mapping, offset); goto out_truncate; @@ -1177,9 +1193,9 @@ static int do_swap_page(struct mm_struct swp_entry_t entry = pte_to_swp_entry(orig_pte); pte_t pte; int ret = VM_FAULT_MINOR; - struct pte_chain *pte_chain = NULL; pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); page = lookup_swap_cache(entry); if (!page) { @@ -1191,12 +1207,14 @@ static int do_swap_page(struct mm_struct * we released the page table lock. */ spin_lock(&mm->page_table_lock); + pmd = pmd_offset_map(pgd_offset(mm, address), address); page_table = pte_offset_map(pmd, address); if (pte_same(*page_table, orig_pte)) ret = VM_FAULT_OOM; else ret = VM_FAULT_MINOR; pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); goto out; } @@ -1207,26 +1225,27 @@ static int do_swap_page(struct mm_struct } mark_page_accessed(page); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) { - ret = -ENOMEM; - goto out; - } lock_page(page); + if (!rmap_get_cpu()) { + ret = VM_FAULT_OOM; + goto outrel; + } + spin_lock(&mm->page_table_lock); + put_cpu(); + pmd = pmd_offset_map(pgd_offset(mm, address), address); + page_table = pte_offset_map(pmd, address); + /* * Back out if somebody else faulted in this pte while we * released the page table lock. */ - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, address); if (!pte_same(*page_table, orig_pte)) { pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); - unlock_page(page); - page_cache_release(page); ret = VM_FAULT_MINOR; - goto out; + goto outrel; } /* The page isn't present yet, go ahead with the fault. 
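The truncate path above merits a condensed restatement: i_shared_sem is gone, readers walk the i_mmap lists under rcu_read_lock(), vmas already unlinked by munmap() are skipped via VM_DEAD, and the zap is open-coded as a tlb_gather_mmu()/unmap_page_range()/tlb_finish_mmu() sequence under each mm's page_table_lock:

	rcu_read_lock();	/* replaces down(&mapping->i_shared_sem) */
	list_for_each_rcu(curr, head) {
		vma = list_entry(curr, struct vm_area_struct, shared);
		if (vma->vm_flags & VM_DEAD)	/* unlinked, free is deferred */
			continue;
		/* ... compute start/end from vm_pgoff as above ... */
		spin_lock(&vma->vm_mm->page_table_lock);
		tlb = tlb_gather_mmu(vma->vm_mm, 0);
		unmap_page_range(tlb, vma, start, end);
		tlb_finish_mmu(tlb, start, end);
		spin_unlock(&vma->vm_mm->page_table_lock);
	}
	rcu_read_unlock();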
*/ @@ -1239,19 +1258,23 @@ static int do_swap_page(struct mm_struct pte = mk_pte(page, vma->vm_page_prot); if (write_access && can_share_swap_page(page)) pte = pte_mkdirty(pte_mkwrite(pte)); - unlock_page(page); flush_icache_page(vma, page); - set_pte(page_table, pte); - pte_chain = page_add_rmap(page, page_table, pte_chain); + vm_set_pte(vma, page_table, pte, address); + page_add_rmap(page, vma, address, 1); + unlock_page(page); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); + pmd_unmap(pmd); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); out: - pte_chain_free(pte_chain); return ret; +outrel: + unlock_page(page); + page_cache_release(page); + goto out; } /* @@ -1266,20 +1289,8 @@ do_anonymous_page(struct mm_struct *mm, { pte_t entry; struct page * page = ZERO_PAGE(addr); - struct pte_chain *pte_chain; int ret; - pte_chain = pte_chain_alloc(GFP_ATOMIC); - if (!pte_chain) { - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto no_mem; - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, addr); - } - /* Read-only mapping of ZERO_PAGE. */ entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); @@ -1287,6 +1298,7 @@ do_anonymous_page(struct mm_struct *mm, if (write_access) { /* Allocate our own private page. */ pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); page = alloc_page(GFP_HIGHUSER); @@ -1295,9 +1307,11 @@ do_anonymous_page(struct mm_struct *mm, clear_user_highpage(page, addr); spin_lock(&mm->page_table_lock); + pmd = pmd_offset_map(pgd_offset(mm, addr), addr); page_table = pte_offset_map(pmd, addr); if (!pte_none(*page_table)) { + pmd_unmap(pmd); pte_unmap(page_table); page_cache_release(page); spin_unlock(&mm->page_table_lock); @@ -1306,26 +1320,26 @@ do_anonymous_page(struct mm_struct *mm, } mm->rss++; entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); + } + + vm_set_pte(vma, page_table, entry, addr); + if (write_access) { + page_add_rmap(page, vma, addr, 1); lru_cache_add_active(page); mark_page_accessed(page); } - - set_pte(page_table, entry); - /* ignores ZERO_PAGE */ - pte_chain = page_add_rmap(page, page_table, pte_chain); + pmd_unmap(pmd); pte_unmap(page_table); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, entry); spin_unlock(&mm->page_table_lock); ret = VM_FAULT_MINOR; - goto out; - -no_mem: - ret = VM_FAULT_OOM; out: - pte_chain_free(pte_chain); return ret; +no_mem: + ret = VM_FAULT_OOM; + goto out; } /* @@ -1346,13 +1360,13 @@ do_no_page(struct mm_struct *mm, struct { struct page * new_page; pte_t entry; - struct pte_chain *pte_chain; - int ret; + int ret, anon = 0; if (!vma->vm_ops || !vma->vm_ops->nopage) return do_anonymous_page(mm, vma, page_table, pmd, write_access, address); pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0); @@ -1363,26 +1377,25 @@ do_no_page(struct mm_struct *mm, struct if (new_page == NOPAGE_OOM) return VM_FAULT_OOM; - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto oom; - /* * Should we do an early C-O-W break? 
*/ - if (write_access && !(vma->vm_flags & VM_SHARED)) { + if ((write_access || page_zone(new_page)->zone_pgdat->node_id != numa_node_id()) && !(vma->vm_flags & VM_SHARED)) { struct page * page = alloc_page(GFP_HIGHUSER); - if (!page) { - page_cache_release(new_page); + if (!page) goto oom; - } + /* start with refcount 1 */ copy_user_highpage(page, new_page, address); page_cache_release(new_page); - lru_cache_add_active(page); + anon = 1; new_page = page; } + if (!rmap_get_cpu()) + goto oom; spin_lock(&mm->page_table_lock); + put_cpu(); + pmd = pmd_offset_map(pgd_offset(mm, address), address); page_table = pte_offset_map(pmd, address); /* @@ -1402,12 +1415,29 @@ do_no_page(struct mm_struct *mm, struct entry = mk_pte(new_page, vma->vm_page_prot); if (write_access) entry = pte_mkwrite(pte_mkdirty(entry)); - set_pte(page_table, entry); - pte_chain = page_add_rmap(new_page, page_table, pte_chain); + vm_set_pte(vma, page_table, entry, address); + + /* + * PG_locked not held for the anon case, but we have a + * unique reference, and ->__mapping is untouched when file-backed + */ + if (!PageReserved(new_page)) + page_add_rmap(new_page, vma, address, anon); + + /* kswapd can find us now, but we're already prepped */ + if (anon) + lru_cache_add_active(new_page); pte_unmap(page_table); + pmd_unmap(pmd); } else { /* One of our sibling threads was faster, back out. */ pte_unmap(page_table); + pmd_unmap(pmd); + /* + * In the anon case, we never hit the LRU, so we free instantly, + * where in mainline the LRU retains a reference. In the file- + * backed case, we merely release a reference acquired earlier. + */ page_cache_release(new_page); spin_unlock(&mm->page_table_lock); ret = VM_FAULT_MINOR; @@ -1418,12 +1448,12 @@ do_no_page(struct mm_struct *mm, struct update_mmu_cache(vma, address, entry); spin_unlock(&mm->page_table_lock); ret = VM_FAULT_MAJOR; - goto out; -oom: - ret = VM_FAULT_OOM; out: - pte_chain_free(pte_chain); return ret; +oom: + page_cache_release(new_page); + ret = VM_FAULT_OOM; + goto out; } /* @@ -1444,13 +1474,14 @@ static int do_file_page(struct mm_struct */ if (!vma->vm_ops || !vma->vm_ops->populate || (write_access && !(vma->vm_flags & VM_SHARED))) { - pte_clear(pte); + vm_pte_clear(vma, pte, address); return do_no_page(mm, vma, address, write_access, pte, pmd); } pgoff = pte_to_pgoff(*pte); pte_unmap(pte); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0); @@ -1511,6 +1542,7 @@ static inline int handle_pte_fault(struc entry = pte_mkyoung(entry); establish_pte(vma, address, pte, entry); pte_unmap(pte); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); return VM_FAULT_MINOR; } @@ -1537,10 +1569,10 @@ int handle_mm_fault(struct mm_struct *mm * and the SMP-safe atomic PTE updates. 
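The long conditional in the do_no_page() hunk above hides the new NUMA side of early COW: a private-mapping fault now copies not only on write, but also whenever ->nopage returned a page on a remote node, replicating read-only file pages to the faulting node. Restated as a predicate:

	/* condensed restatement of the early-COW test in do_no_page() */
	static inline int early_cow(struct vm_area_struct *vma,
				    struct page *page, int write_access)
	{
		if (vma->vm_flags & VM_SHARED)
			return 0;	/* shared mappings are never COWed */
		if (write_access)
			return 1;	/* the classic early COW break */
		/* NUMA: replicate remote read-only pages locally */
		return page_zone(page)->zone_pgdat->node_id != numa_node_id();
	}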
*/ spin_lock(&mm->page_table_lock); - pmd = pmd_alloc(mm, pgd, address); + pmd = pmd_alloc_map(mm, pgd, address); if (pmd) { - pte_t * pte = pte_alloc_map(mm, pmd, address); + pte_t *pte = pte_alloc_map(mm, pgd, &pmd, address); if (pte) return handle_pte_fault(mm, vma, address, write_access, pte, pmd); } @@ -1559,10 +1591,33 @@ int handle_mm_fault(struct mm_struct *mm */ pmd_t *__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { + struct page *page; + + spin_unlock(&mm->page_table_lock); + page = pmd_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + if (!page) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (pgd_present(*pgd)) { + pmd_free(page); + goto out; + } + pgd_populate(mm, pgd, page); +out: + return pmd_offset_map(pgd, address); +} + +pmd_t *__pmd_alloc_kernel(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ pmd_t *new; spin_unlock(&mm->page_table_lock); - new = pmd_alloc_one(mm, address); + new = pmd_alloc_one_kernel(mm, address); spin_lock(&mm->page_table_lock); if (!new) return NULL; @@ -1572,12 +1627,12 @@ pmd_t *__pmd_alloc(struct mm_struct *mm, * entry, as somebody else could have populated it.. */ if (pgd_present(*pgd)) { - pmd_free(new); + pmd_free(virt_to_page(new)); goto out; } - pgd_populate(mm, pgd, new); + pgd_populate(mm, pgd, virt_to_page(new)); out: - return pmd_offset(pgd, address); + return pmd_offset_kernel(pgd, address); } int make_pages_present(unsigned long addr, unsigned long end) @@ -1609,7 +1664,7 @@ struct page * vmalloc_to_page(void * vma pte_t *ptep, pte; if (!pgd_none(*pgd)) { - pmd = pmd_offset(pgd, addr); + pmd = pmd_offset_map(pgd, addr); if (!pmd_none(*pmd)) { preempt_disable(); ptep = pte_offset_map(pmd, addr); @@ -1619,6 +1674,7 @@ struct page * vmalloc_to_page(void * vma pte_unmap(ptep); preempt_enable(); } + pmd_unmap(pmd); } return page; } diff -prauN linux-2.6.0-test1/mm/mmap.c wli-2.6.0-test1-37/mm/mmap.c --- linux-2.6.0-test1/mm/mmap.c 2003-07-13 20:35:15.000000000 -0700 +++ wli-2.6.0-test1-37/mm/mmap.c 2003-07-18 18:38:30.000000000 -0700 @@ -58,8 +58,19 @@ EXPORT_SYMBOL(sysctl_overcommit_memory); EXPORT_SYMBOL(sysctl_overcommit_ratio); EXPORT_SYMBOL(vm_committed_space); +static void __free_vma(void *vma) +{ + kmem_cache_free(vm_area_cachep, vma); +} + +void free_vma(struct vm_area_struct *vma) +{ + INIT_LIST_HEAD(&vma->rcu.list); + call_rcu(&vma->rcu, __free_vma, vma); +} + /* - * Requires inode->i_mapping->i_shared_sem + * Requires inode->i_mapping->i_shared_lock */ static inline void __remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode) @@ -67,7 +78,8 @@ __remove_shared_vm_struct(struct vm_area if (inode) { if (vma->vm_flags & VM_DENYWRITE) atomic_inc(&inode->i_writecount); - list_del_init(&vma->shared); + vma->vm_flags |= VM_DEAD; + list_del_rcu(&vma->shared); } } @@ -81,9 +93,9 @@ static void remove_shared_vm_struct(stru if (file) { struct inode *inode = file->f_dentry->d_inode; - down(&inode->i_mapping->i_shared_sem); + spin_lock(&inode->i_mapping->i_shared_lock); __remove_shared_vm_struct(vma, inode); - up(&inode->i_mapping->i_shared_sem); + spin_unlock(&inode->i_mapping->i_shared_lock); } } @@ -264,9 +276,9 @@ static inline void __vma_link_file(struc atomic_dec(&inode->i_writecount); if (vma->vm_flags & VM_SHARED) - list_add_tail(&vma->shared, &mapping->i_mmap_shared); + list_add_tail_rcu(&vma->shared, &mapping->i_mmap_shared); else - list_add_tail(&vma->shared, &mapping->i_mmap); 
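__pmd_alloc() above is the heart of the highmem-pmd scheme: pmd_alloc_one() returns a struct page, pgd_populate() installs that page, and the table is only addressable through a pmd_offset_map()/pmd_unmap() window, while kernel mappings keep permanently mapped pmds via the new _kernel variants. The caller-side discipline, as in handle_mm_fault() above (a sketch with error handling elided):

	spin_lock(&mm->page_table_lock);
	pmd = pmd_alloc_map(mm, pgd, address);		/* opens kmap window */
	pte = pte_alloc_map(mm, pgd, &pmd, address);	/* may remap the pmd */
	/* ... operate on *pte ... */
	pte_unmap(pte);
	pmd_unmap(pmd);					/* closes the window */
	spin_unlock(&mm->page_table_lock);

Note that pte_alloc_map() now takes the pgd and a pmd_t ** precisely because it can drop the lock and must be able to re-establish the pmd mapping afterwards.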
+ list_add_tail_rcu(&vma->shared, &mapping->i_mmap); } } @@ -290,12 +302,12 @@ static void vma_link(struct mm_struct *m mapping = vma->vm_file->f_dentry->d_inode->i_mapping; if (mapping) - down(&mapping->i_shared_sem); + spin_lock(&mapping->i_shared_lock); spin_lock(&mm->page_table_lock); __vma_link(mm, vma, prev, rb_link, rb_parent); spin_unlock(&mm->page_table_lock); if (mapping) - up(&mapping->i_shared_sem); + spin_unlock(&mapping->i_shared_lock); mark_mm_hugetlb(mm, vma); mm->map_count++; @@ -322,6 +334,28 @@ static inline int is_mergeable_vma(struc return 1; } +static void move_vma_start(struct vm_area_struct *vma, unsigned long addr) +{ + spinlock_t *lock = &vma->vm_mm->page_table_lock; + struct inode *inode = NULL; + + if (vma->vm_file) { + inode = vma->vm_file->f_dentry->d_inode; + spin_lock(&inode->i_mapping->i_shared_lock); + } + spin_lock(lock); + if (inode) + __remove_shared_vm_struct(vma, inode); + /* If no vm_file, perhaps we should always keep vm_pgoff at 0?? */ + vma->vm_pgoff += (long)(addr - vma->vm_start) >> PAGE_SHIFT; + vma->vm_start = addr; + if (inode) { + __vma_link_file(vma); + spin_unlock(&inode->i_mapping->i_shared_lock); + } + spin_unlock(lock); +} + /* * Return true if we can merge this (vm_flags,file,vm_pgoff,size) * in front of (at a lower virtual address and file offset than) the vma. @@ -374,8 +408,6 @@ static int vma_merge(struct mm_struct *m unsigned long end, unsigned long vm_flags, struct file *file, unsigned long pgoff) { - spinlock_t * lock = &mm->page_table_lock; - /* * We later require that vma->vm_flags == vm_flags, so this tests * vma->vm_flags & VM_SPECIAL, too. @@ -395,12 +427,13 @@ static int vma_merge(struct mm_struct *m is_mergeable_vma(prev, file, vm_flags) && can_vma_merge_after(prev, vm_flags, file, pgoff)) { struct vm_area_struct *next; + spinlock_t *lock = &mm->page_table_lock; struct inode *inode = file ? 
file->f_dentry->d_inode : NULL; int need_up = 0; if (unlikely(file && prev->vm_next && prev->vm_next->vm_file == file)) { - down(&inode->i_mapping->i_shared_sem); + spin_lock(&inode->i_mapping->i_shared_lock); need_up = 1; } spin_lock(lock); @@ -418,17 +451,17 @@ static int vma_merge(struct mm_struct *m __remove_shared_vm_struct(next, inode); spin_unlock(lock); if (need_up) - up(&inode->i_mapping->i_shared_sem); + spin_unlock(&inode->i_mapping->i_shared_lock); if (file) fput(file); mm->map_count--; - kmem_cache_free(vm_area_cachep, next); + free_vma(next); return 1; } spin_unlock(lock); if (need_up) - up(&inode->i_mapping->i_shared_sem); + spin_unlock(&inode->i_mapping->i_shared_lock); return 1; } @@ -442,10 +475,7 @@ static int vma_merge(struct mm_struct *m pgoff, (end - addr) >> PAGE_SHIFT)) return 0; if (end == prev->vm_start) { - spin_lock(lock); - prev->vm_start = addr; - prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT; - spin_unlock(lock); + move_vma_start(prev, addr); return 1; } } @@ -672,20 +702,25 @@ munmap_back: atomic_inc(&inode->i_writecount); fput(file); } - kmem_cache_free(vm_area_cachep, vma); + free_vma(vma); } out: mm->total_vm += len >> PAGE_SHIFT; - if (vm_flags & VM_LOCKED) { - mm->locked_vm += len >> PAGE_SHIFT; - make_pages_present(addr, addr + len); - } - if (flags & MAP_POPULATE) { + if (flags & (MAP_POPULATE|MAP_EXECUTABLE)) { up_write(&mm->mmap_sem); sys_remap_file_pages(addr, len, prot, pgoff, flags & MAP_NONBLOCK); down_write(&mm->mmap_sem); } + + /* prefault the stack and locked mappings */ + if ((vm_flags & VM_LOCKED) || + ((vm_flags & VM_STACK_FLAGS) == VM_STACK_FLAGS && + len <= min(4*PAGE_SIZE, current->rlim[RLIMIT_STACK].rlim_cur))) { + if (vm_flags & VM_LOCKED) + mm->locked_vm += len >> PAGE_SHIFT; + make_pages_present(addr, addr + len); + } return addr; unmap_and_free_vma: @@ -697,7 +732,7 @@ unmap_and_free_vma: /* Undo any partial mapping done by a device driver. 
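The do_mmap_pgoff() epilogue above folds the old VM_LOCKED population into a wider prefault heuristic: locked mappings are still populated eagerly, and so is a small fresh stack segment. With 4K pages that means a new stack vma of up to min(16K, RLIMIT_STACK) is faulted in at mmap() time. As a predicate:

	/* restatement of the prefault test at the end of do_mmap_pgoff() */
	static inline int prefault_wanted(unsigned long vm_flags, unsigned long len)
	{
		if (vm_flags & VM_LOCKED)
			return 1;	/* mlock semantics: populate now */
		return (vm_flags & VM_STACK_FLAGS) == VM_STACK_FLAGS &&
			len <= min(4*PAGE_SIZE, current->rlim[RLIMIT_STACK].rlim_cur);
	}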
*/ zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start); free_vma: - kmem_cache_free(vm_area_cachep, vma); + free_vma(vma); unacct_error: if (charged) vm_unacct_memory(charged); @@ -1084,7 +1119,7 @@ static void unmap_vma(struct mm_struct * area->vm_ops->close(area); if (area->vm_file) fput(area->vm_file); - kmem_cache_free(vm_area_cachep, area); + free_vma(area); } /* @@ -1174,8 +1209,7 @@ int split_vma(struct mm_struct * mm, str if (new_below) { new->vm_end = addr; - vma->vm_start = addr; - vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT); + move_vma_start(vma, addr); } else { vma->vm_end = addr; new->vm_start = addr; @@ -1423,7 +1457,7 @@ void exit_mmap(struct mm_struct *mm) } if (vma->vm_file) fput(vma->vm_file); - kmem_cache_free(vm_area_cachep, vma); + free_vma(vma); vma = next; } } diff -prauN linux-2.6.0-test1/mm/mprotect.c wli-2.6.0-test1-37/mm/mprotect.c --- linux-2.6.0-test1/mm/mprotect.c 2003-07-13 20:33:11.000000000 -0700 +++ wli-2.6.0-test1-37/mm/mprotect.c 2003-07-14 08:45:37.000000000 -0700 @@ -24,11 +24,11 @@ #include static inline void -change_pte_range(pmd_t *pmd, unsigned long address, - unsigned long size, pgprot_t newprot) +change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long address, unsigned long size, pgprot_t newprot) { pte_t * pte; - unsigned long end; + unsigned long start, end; if (pmd_none(*pmd)) return; @@ -38,6 +38,7 @@ change_pte_range(pmd_t *pmd, unsigned lo return; } pte = pte_offset_map(pmd, address); + start = address & PMD_MASK; address &= ~PMD_MASK; end = address + size; if (end > PMD_SIZE) @@ -50,8 +51,8 @@ change_pte_range(pmd_t *pmd, unsigned lo * bits by wiping the pte and then setting the new pte * into place. */ - entry = ptep_get_and_clear(pte); - set_pte(pte, pte_modify(entry, newprot)); + entry = vm_ptep_get_and_clear(vma, pte, address + start); + vm_set_pte(vma, pte, pte_modify(entry, newprot), start + address); } address += PAGE_SIZE; pte++; @@ -60,11 +61,11 @@ change_pte_range(pmd_t *pmd, unsigned lo } static inline void -change_pmd_range(pgd_t *pgd, unsigned long address, - unsigned long size, pgprot_t newprot) +change_pmd_range(struct vm_area_struct *vma, pgd_t *pgd, + unsigned long address, unsigned long size, pgprot_t newprot) { pmd_t * pmd; - unsigned long end; + unsigned long start, end; if (pgd_none(*pgd)) return; @@ -73,16 +74,18 @@ change_pmd_range(pgd_t *pgd, unsigned lo pgd_clear(pgd); return; } - pmd = pmd_offset(pgd, address); + pmd = pmd_offset_map(pgd, address); + start = address & PGDIR_MASK; address &= ~PGDIR_MASK; end = address + size; if (end > PGDIR_SIZE) end = PGDIR_SIZE; do { - change_pte_range(pmd, address, end - address, newprot); + change_pte_range(vma, pmd, start + address, end - address, newprot); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); + pmd_unmap(pmd - 1); } static void @@ -98,7 +101,7 @@ change_protection(struct vm_area_struct BUG(); spin_lock(¤t->mm->page_table_lock); do { - change_pmd_range(dir, start, end - start, newprot); + change_pmd_range(vma, dir, start, end - start, newprot); start = (start + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (start && (start < end)); @@ -135,7 +138,7 @@ mprotect_attempt_merge(struct vm_area_st __vma_unlink(mm, vma, prev); spin_unlock(&mm->page_table_lock); - kmem_cache_free(vm_area_cachep, vma); + free_vma(vma); mm->map_count--; return 1; } @@ -297,7 +300,7 @@ sys_mprotect(unsigned long start, size_t __vma_unlink(prev->vm_mm, next, prev); spin_unlock(&prev->vm_mm->page_table_lock); - 
kmem_cache_free(vm_area_cachep, next); + free_vma(next); prev->vm_mm->map_count--; } out: diff -prauN linux-2.6.0-test1/mm/mremap.c wli-2.6.0-test1-37/mm/mremap.c --- linux-2.6.0-test1/mm/mremap.c 2003-07-13 20:34:40.000000000 -0700 +++ wli-2.6.0-test1-37/mm/mremap.c 2003-07-14 09:10:59.000000000 -0700 @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include @@ -38,7 +38,7 @@ static pte_t *get_one_pte_map_nested(str goto end; } - pmd = pmd_offset(pgd, addr); + pmd = pmd_offset_map_nested(pgd, addr); if (pmd_none(*pmd)) goto end; if (pmd_bad(*pmd)) { @@ -53,6 +53,7 @@ static pte_t *get_one_pte_map_nested(str pte = NULL; } end: + pmd_unmap_nested(pmd); return pte; } @@ -61,12 +62,15 @@ static inline int page_table_present(str { pgd_t *pgd; pmd_t *pmd; + int ret; pgd = pgd_offset(mm, addr); if (pgd_none(*pgd)) return 0; - pmd = pmd_offset(pgd, addr); - return pmd_present(*pmd); + pmd = pmd_offset_map(pgd, addr); + ret = pmd_present(*pmd); + pmd_unmap(pmd); + return ret != 0; } #else #define page_table_present(mm, addr) (1) @@ -74,40 +78,38 @@ static inline int page_table_present(str static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr) { + pgd_t *pgd; pmd_t *pmd; pte_t *pte = NULL; - pmd = pmd_alloc(mm, pgd_offset(mm, addr), addr); + pgd = pgd_offset(mm, addr); + pmd = pmd_alloc_map(mm, pgd, addr); if (pmd) - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map(mm, pgd, &pmd, addr); + pmd_unmap(pmd); return pte; } static int -copy_one_pte(struct mm_struct *mm, pte_t *src, pte_t *dst, - struct pte_chain **pte_chainp) +copy_one_pte(struct vm_area_struct *vma, pte_t *src, pte_t *dst, + unsigned long old_addr, unsigned long new_addr) { - int error = 0; - pte_t pte; - struct page *page = NULL; - - if (pte_present(*src)) - page = pte_page(*src); - if (!pte_none(*src)) { - if (page) - page_remove_rmap(page, src); - pte = ptep_get_and_clear(src); - if (!dst) { - /* No dest? We must put it back. 
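For contrast with the pte_chain version being deleted here, the replacement just below reduces an mremap move to a pte transfer plus a single rmap address update; since the reverse map is now keyed by (vma, address), no pte_chain node changes hands:

	/* condensed from the new copy_one_pte() below */
	pte = vm_ptep_get_and_clear(vma, src, old_addr);
	vm_set_pte(vma, dst, pte, new_addr);
	if (pte_present(pte) && pfn_valid(pte_pfn(pte))) {
		struct page *page = pfn_to_page(pte_pfn(pte));
		if (!PageReserved(page))
			page_move_rmap(page, vma, old_addr, new_addr);
	}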
*/ - dst = src; - error++; + pte_t pte; + if (!dst) + return -1; + pte = vm_ptep_get_and_clear(vma, src, old_addr); + vm_set_pte(vma, dst, pte, new_addr); + if (pte_present(pte)) { + unsigned long pfn = pte_pfn(pte); + if (pfn_valid(pfn)) { + struct page *page = pfn_to_page(pfn); + if (!PageReserved(page)) + page_move_rmap(page, vma, old_addr, new_addr); + } } - set_pte(dst, pte); - if (page) - *pte_chainp = page_add_rmap(page, dst, *pte_chainp); } - return error; + return 0; } static int @@ -115,16 +117,16 @@ move_one_page(struct vm_area_struct *vma unsigned long new_addr) { struct mm_struct *mm = vma->vm_mm; - int error = 0; pte_t *src, *dst; - struct pte_chain *pte_chain; + int error = 0; - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) { + if (!rmap_get_cpu()) { error = -ENOMEM; goto out; } + spin_lock(&mm->page_table_lock); + put_cpu(); src = get_one_pte_map_nested(mm, old_addr); if (src) { /* @@ -139,13 +141,12 @@ move_one_page(struct vm_area_struct *vma dst = alloc_one_pte_map(mm, new_addr); if (src == NULL) src = get_one_pte_map_nested(mm, old_addr); - error = copy_one_pte(mm, src, dst, &pte_chain); + error = copy_one_pte(vma, src, dst, old_addr, new_addr); pte_unmap_nested(src); pte_unmap(dst); } flush_tlb_page(vma, old_addr); spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); out: return error; } @@ -214,7 +215,7 @@ static unsigned long move_vma(struct vm_ if (vma == next) vma = prev; mm->map_count--; - kmem_cache_free(vm_area_cachep, next); + free_vma(next); } } else if (next->vm_start == new_addr + new_len && can_vma_merge(next, vma->vm_flags) && @@ -290,7 +291,7 @@ static unsigned long move_vma(struct vm_ return new_addr; } if (allocated_vma) - kmem_cache_free(vm_area_cachep, new_vma); + free_vma(new_vma); out: return -ENOMEM; } diff -prauN linux-2.6.0-test1/mm/msync.c wli-2.6.0-test1-37/mm/msync.c --- linux-2.6.0-test1/mm/msync.c 2003-07-13 20:32:45.000000000 -0700 +++ wli-2.6.0-test1-37/mm/msync.c 2003-07-14 06:49:00.000000000 -0700 @@ -82,7 +82,7 @@ static inline int filemap_sync_pmd_range pgd_clear(pgd); return 0; } - pmd = pmd_offset(pgd, address); + pmd = pmd_offset_map(pgd, address); if ((address & PGDIR_MASK) != (end & PGDIR_MASK)) end = (address & PGDIR_MASK) + PGDIR_SIZE; error = 0; @@ -91,6 +91,7 @@ static inline int filemap_sync_pmd_range address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); + pmd_unmap(pmd - 1); return error; } diff -prauN linux-2.6.0-test1/mm/nommu.c wli-2.6.0-test1-37/mm/nommu.c --- linux-2.6.0-test1/mm/nommu.c 2003-07-13 20:37:30.000000000 -0700 +++ wli-2.6.0-test1-37/mm/nommu.c 2003-07-14 08:56:31.000000000 -0700 @@ -562,7 +562,3 @@ unsigned long get_unmapped_area(struct f { return -ENOMEM; } - -void pte_chain_init(void) -{ -} diff -prauN linux-2.6.0-test1/mm/page-writeback.c wli-2.6.0-test1-37/mm/page-writeback.c --- linux-2.6.0-test1/mm/page-writeback.c 2003-07-13 20:36:06.000000000 -0700 +++ wli-2.6.0-test1-37/mm/page-writeback.c 2003-07-14 08:52:52.000000000 -0700 @@ -451,7 +451,7 @@ int do_writepages(struct address_space * */ int write_one_page(struct page *page, int wait) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); int ret = 0; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, @@ -463,12 +463,12 @@ int write_one_page(struct page *page, in if (wait) wait_on_page_writeback(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); list_del(&page->list); if (test_clear_page_dirty(page)) { 
list_add(&page->list, &mapping->locked_pages); page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); ret = mapping->a_ops->writepage(page, &wbc); if (ret == 0 && wait) { wait_on_page_writeback(page); @@ -478,7 +478,7 @@ int write_one_page(struct page *page, in page_cache_release(page); } else { list_add(&page->list, &mapping->clean_pages); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); unlock_page(page); } return ret; @@ -490,31 +490,31 @@ EXPORT_SYMBOL(write_one_page); * and move it to the dirty_pages list. Also perform space reservation if * required. * - * __set_page_dirty_nobuffers() may return -ENOSPC. But if it does, the page + * set_page_dirty_nobuffers() may return -ENOSPC. But if it does, the page * is still safe, as long as it actually manages to find some blocks at * writeback time. * * This is also used when a single buffer is being dirtied: we want to set the * page dirty in that case, but not all the buffers. This is a "bottom-up" - * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. + * dirtying, whereas set_page_dirty_buffers() is a "top-down" dirtying. */ -int __set_page_dirty_nobuffers(struct page *page) +int set_page_dirty_nobuffers(struct page *page) { int ret = 0; if (!TestSetPageDirty(page)) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); if (mapping) { - spin_lock(&mapping->page_lock); - if (page->mapping) { /* Race with truncate? */ - BUG_ON(page->mapping != mapping); + mapping_wrlock(&mapping->page_lock); + if (page_mapping(page)) { /* Race with truncate? */ + BUG_ON(page_mapping(page) != mapping); if (!mapping->backing_dev_info->memory_backed) inc_page_state(nr_dirty); list_del(&page->list); list_add(&page->list, &mapping->dirty_pages); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); if (!PageSwapCache(page)) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -522,7 +522,28 @@ int __set_page_dirty_nobuffers(struct pa } return ret; } -EXPORT_SYMBOL(__set_page_dirty_nobuffers); +EXPORT_SYMBOL(set_page_dirty_nobuffers); + +/* + * If the mapping doesn't provide a set_page_dirty() a_op, then + * just fall through and assume that it wants bh's. 
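These page-writeback.c hunks convert every taker of mapping->page_lock to mapping_wrlock()/mapping_wrunlock() wrappers, presumably over a reader/writer lock: paths that move pages between the clean/dirty/locked lists take the write side, while pure lookups (see the readahead.c hunks below) take mapping_rdlock(). The intended split, assuming an rwlock underneath:

	/* writers: anything that moves a page between mapping lists */
	mapping_wrlock(&mapping->page_lock);
	list_del(&page->list);
	list_add(&page->list, &mapping->dirty_pages);
	mapping_wrunlock(&mapping->page_lock);

	/* readers: lookups that leave the lists untouched */
	mapping_rdlock(&mapping->page_lock);
	page = radix_tree_lookup(&mapping->page_tree, offset);
	mapping_rdunlock(&mapping->page_lock);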
+ */ +int set_page_dirty(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + int (*spd)(struct page *); + + if (!mapping) { + SetPageDirty(page); + return 0; + } + spd = mapping->a_ops->set_page_dirty; + if (spd) + return (*spd)(page); + else + return set_page_dirty_buffers(page); +} +EXPORT_SYMBOL(set_page_dirty); /* * set_page_dirty() is racy if the caller has no reference against @@ -551,7 +572,7 @@ int set_page_dirty_lock(struct page *pag int test_clear_page_dirty(struct page *page) { if (TestClearPageDirty(page)) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); if (mapping && !mapping->backing_dev_info->memory_backed) dec_page_state(nr_dirty); diff -prauN linux-2.6.0-test1/mm/page_alloc.c wli-2.6.0-test1-37/mm/page_alloc.c --- linux-2.6.0-test1/mm/page_alloc.c 2003-07-13 20:30:01.000000000 -0700 +++ wli-2.6.0-test1-37/mm/page_alloc.c 2003-07-17 03:06:09.000000000 -0700 @@ -74,7 +74,7 @@ static void bad_page(const char *functio { printk("Bad page state at %s\n", function); printk("flags:0x%08lx mapping:%p mapped:%d count:%d\n", - page->flags, page->mapping, + page->flags, (void *)page->__mapping, page_mapped(page), page_count(page)); printk("Backtrace:\n"); dump_stack(); @@ -84,9 +84,12 @@ static void bad_page(const char *functio 1 << PG_lru | 1 << PG_active | 1 << PG_dirty | + 1 << PG_rmaplock | + 1 << PG_anon | + 1 << PG_swapcache | 1 << PG_writeback); set_page_count(page, 0); - page->mapping = NULL; + set_page_mapping(page, NULL); } #ifndef CONFIG_HUGETLB_PAGE @@ -168,7 +171,7 @@ static void destroy_compound_page(struct * -- wli */ -static inline void __free_pages_bulk (struct page *page, struct page *base, +static inline void buddy_free(struct page *page, struct page *base, struct zone *zone, struct free_area *area, unsigned long mask, unsigned int order) { @@ -181,7 +184,6 @@ static inline void __free_pages_bulk (st BUG(); index = page_idx >> (1 + order); - zone->free_pages -= mask; while (mask + (1 << (MAX_ORDER-1))) { struct page *buddy1, *buddy2; @@ -202,17 +204,45 @@ static inline void __free_pages_bulk (st BUG_ON(bad_range(zone, buddy2)); list_del(&buddy1->list); mask <<= 1; + area->globally_free--; area++; index >>= 1; page_idx &= mask; } list_add(&(base + page_idx)->list, &area->free_list); + area->globally_free++; +} + +static inline void __free_pages_bulk(struct page *page, struct page *base, + struct zone *zone, struct free_area *area, unsigned long mask, + unsigned int order) +{ + switch (area->active - area->locally_free) { + case 0: + if (!list_empty(&area->deferred_pages)) { + struct page *defer = list_entry(area->deferred_pages.next, struct page, list); + list_del(&defer->list); + area->locally_free--; + buddy_free(defer, base, zone, area, mask, order); + } + /* fall through */ + case 1: + buddy_free(page, base, zone, area, mask, order); + break; + default: + list_add(&page->list, &area->deferred_pages); + area->locally_free++; + break; + } + if (area->active) + area->active--; + zone->free_pages += 1 << order; } static inline void free_pages_check(const char *function, struct page *page) { if ( page_mapped(page) || - page->mapping != NULL || + page->__mapping != 0 || page_count(page) != 0 || (page->flags & ( 1 << PG_lru | @@ -220,6 +250,9 @@ static inline void free_pages_check(cons 1 << PG_locked | 1 << PG_active | 1 << PG_reclaim | + 1 << PG_rmaplock | + 1 << PG_anon | + 1 << PG_swapcache | 1 << PG_writeback ))) bad_page(function, page); if (PageDirty(page)) @@ -237,41 +270,78 @@ 
static inline void free_pages_check(cons * And clear the zone's pages_scanned counter, to hold off the "all pages are * pinned" detection logic. */ -static int -free_pages_bulk(struct zone *zone, int count, - struct list_head *list, unsigned int order) +void free_pages_bulk(struct zone *zone, struct page *page, unsigned int order) { - unsigned long mask, flags; + unsigned long mask, flags, count; struct free_area *area; - struct page *base, *page = NULL; - int ret = 0; + struct page *base, *save; + LIST_HEAD(tmp); + count = page->private; mask = (~0UL) << order; base = zone->zone_mem_map; area = zone->free_area + order; spin_lock_irqsave(&zone->lock, flags); zone->all_unreclaimable = 0; zone->pages_scanned = 0; - while (!list_empty(list) && count--) { - page = list_entry(list->prev, struct page, list); - /* have to delete it as __free_pages_bulk list manipulates */ - list_del(&page->list); - __free_pages_bulk(page, base, zone, area, mask, order); - ret++; + + if (order || area->active - area->locally_free <= 2*count) { + list_splice(&page->list, &tmp); + list_add(&page->list, &tmp); + page->private = 0; + } + + if (order) { + list_for_each_entry_safe(page, save, &tmp, list) { + list_del(&page->list); + __free_pages_bulk(page, base, zone, area, mask, order); + } + } else if (area->active - area->locally_free <= 2*count) { + /* + * This is a somewhat ad hoc approach to dealing with + * the interaction of gang allocation and the deferred + * coalescing heuristics. + */ + if (area->active - area->locally_free < count) { + int local = 0; + + while (local < count && area->locally_free) { + struct page *follow, *head = + list_entry(area->deferred_pages.next, struct page, lru); + list_del(&head->lru); + list_for_each_entry_safe(follow, save, &head->list, list) { + list_del(&follow->list); + buddy_free(follow, base, zone, area, mask, 0); + } + local += head->private; + area->locally_free -= head->private; + head->private = 0; + buddy_free(head, base, zone, area, mask, 0); + } + } + list_for_each_entry_safe(page, save, &tmp, list) { + list_del(&page->list); + buddy_free(page, base, zone, area, mask, order); + } + } else { + area->locally_free += count; + list_add(&page->lru, &area->deferred_pages); + } + if (!order) { + zone->free_pages += count; + area->active -= min(area->active, count); } spin_unlock_irqrestore(&zone->lock, flags); - return ret; } void __free_pages_ok(struct page *page, unsigned int order) { - LIST_HEAD(list); - mod_page_state(pgfree, 1 << order); free_pages_check(__FUNCTION__, page); - list_add(&page->list, &list); kernel_map_pages(page, 1<private = 1; + INIT_LIST_HEAD(&page->list); + free_pages_bulk(page_zone(page), page, order); } #define MARK_USED(index, order, area) \ @@ -284,10 +354,10 @@ expand(struct zone *zone, struct page *p unsigned long size = 1 << high; while (high > low) { - BUG_ON(bad_range(zone, page)); area--; high--; size >>= 1; + area->globally_free++; list_add(&page->list, &area->free_list); MARK_USED(index, high, area); index += size; @@ -317,7 +387,7 @@ static inline void set_page_refs(struct */ static void prep_new_page(struct page *page, int order) { - if (page->mapping || page_mapped(page) || + if (page->__mapping || page_mapped(page) || (page->flags & ( 1 << PG_private | 1 << PG_locked | @@ -325,6 +395,9 @@ static void prep_new_page(struct page *p 1 << PG_active | 1 << PG_dirty | 1 << PG_reclaim | + 1 << PG_rmaplock | + 1 << PG_anon | + 1 << PG_swapcache | 1 << PG_writeback ))) bad_page(__FUNCTION__, page); @@ -338,7 +411,7 @@ static void 
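free_pages_bulk() changes calling convention in the hunk above: rather than (zone, count, list, order) it takes one head page that carries a whole batch, with head->private holding the batch count and the other pages chained on head->list. __free_pages_ok() shows the degenerate batch of one; assembling a larger batch from a caller-private list would look like this (a sketch; my_pages is a stand-in name):

	/* sketch: batching order-0 pages for free_pages_bulk() */
	struct page *head = NULL, *page, *save;

	list_for_each_entry_safe(page, save, &my_pages, lru) {
		list_del(&page->lru);
		if (!head) {
			head = page;			/* first page leads */
			head->private = 1;
			INIT_LIST_HEAD(&head->list);
		} else {
			list_add(&page->list, &head->list);
			head->private++;	/* count rides in the head */
		}
	}
	if (head)
		free_pages_bulk(zone, head, 0);

drain_local_pages() and free_hot_cold_page() above use exactly this head-plus-count convention on the per-cpu lists.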
prep_new_page(struct page *p * Do the hard work of removing an element from the buddy allocator. * Call me with the zone->lock already held. */ -static struct page *__rmqueue(struct zone *zone, unsigned int order) +static struct page *buddy_alloc(struct zone *zone, unsigned int order) { struct free_area * area; unsigned int current_order; @@ -352,16 +425,144 @@ static struct page *__rmqueue(struct zon page = list_entry(area->free_list.next, struct page, list); list_del(&page->list); + area->globally_free--; index = page - zone->zone_mem_map; if (current_order != MAX_ORDER-1) MARK_USED(index, current_order, area); - zone->free_pages -= 1UL << order; return expand(zone, page, index, order, current_order, area); } return NULL; } +/* + * This is bad; some way to avoid putting singleton pages on the + * deferred lists should be worked out at some point. + */ +static void split_pages(struct zone *zone, struct page *page, int page_order, int deferred_order) +{ + int split_order = deferred_order - 1; + unsigned long split_offset = 1UL << split_order; + struct page *split_page; + + while (split_order >= page_order) { + split_page = &page[split_offset]; + if (split_order) + list_add(&split_page->list, + &zone->free_area[split_order].deferred_pages); + else if (!zone->free_area[split_order].locally_free) { + INIT_LIST_HEAD(&split_page->list); + split_page->private = 1; + list_add(&split_page->lru, + &zone->free_area[split_order].deferred_pages); + } else { + struct page *head; + head = list_entry(zone->free_area[split_order].deferred_pages.next, struct page, lru); + head->private++; + list_add(&split_page->list, &head->list); + } + zone->free_area[split_order].locally_free++; + --split_order; + split_offset >>= 1; + } +} + +#define COALESCE_BATCH 256 +static inline struct page *steal_deferred_page(struct zone *zone, int order) +{ + struct page *page; + struct list_head *elem; + struct free_area *area = zone->free_area; + int found_order, k; + + if (zone->free_pages < (1 << order)) + return NULL; + + /* the range of found_order precludes order 0 */ + for (found_order = order + 1; found_order < MAX_ORDER; ++found_order) + if (!list_empty(&area[found_order].deferred_pages)) { + elem = area[found_order].deferred_pages.next; + page = list_entry(elem, struct page, list); + list_del(elem); + area[found_order].locally_free--; + split_pages(zone, page, order, found_order); + return page; + } + + for (found_order = order - 1; found_order >= 0; --found_order) { + for (k = 0; k < COALESCE_BATCH; ++k) { + unsigned long mask = (~0UL) << found_order; + if (list_empty(&area[found_order].deferred_pages)) + break; + elem = area[found_order].deferred_pages.next; + if (found_order) { + page = list_entry(elem, struct page, list); + list_del(elem); + area[found_order].locally_free--; + buddy_free(page, zone->zone_mem_map, zone, &area[found_order], mask, found_order); + } else { + LIST_HEAD(tmp); + struct page *save; + + page = list_entry(elem, struct page, lru); + list_del(elem); + area[found_order].locally_free -= page->private; + page->private = 0; + list_splice(&page->list, &tmp); + list_add(&page->list, &tmp); + list_for_each_entry_safe(page, save, &tmp, list) { + list_del(&page->list); + buddy_free(page, zone->zone_mem_map, zone, &area[found_order], mask, found_order); + } + } + } + page = buddy_alloc(zone, order); + if (page) + return page; + } + return buddy_alloc(zone, order); +} + +static inline int __rmqueue(struct zone *zone, unsigned int order, struct list_head *list) +{ + struct free_area *area = 
&zone->free_area[order]; + struct page *page; + int count; + + if (!list_empty(&area->deferred_pages)) { + if (order) { + page = list_entry(area->deferred_pages.next, struct page, list); + list_del(&page->list); + count = 1; + } else { + page = list_entry(area->deferred_pages.next, struct page, lru); + list_del(&page->lru); + count = page->private; + page->private = 0; + list_splice(&page->list, list); + } + + area->locally_free -= count; + area->active += count; + zone->free_pages -= count << order; + } else { + page = buddy_alloc(zone, order); + if (page) + count = 1; + else { + page = steal_deferred_page(zone, order); + if (page) + count = 1; + else + return 0; + } + area->active += count; + zone->free_pages -= count << order; + } + list_add(&page->list, list); + return count; +} + /* * Obtain a specified number of elements from the buddy allocator, all under * a single hold of the lock, for efficiency. Add them to the supplied list. @@ -371,17 +572,14 @@ static int rmqueue_bulk(struct zone *zon unsigned long count, struct list_head *list) { unsigned long flags; - int i; - int allocated = 0; - struct page *page; + int i, j, allocated = 0; spin_lock_irqsave(&zone->lock, flags); - for (i = 0; i < count; ++i) { - page = __rmqueue(zone, order); - if (page == NULL) + for (i = 0; i < count && allocated < count; ++i) { + j = __rmqueue(zone, order, list); + if (!j) break; - allocated++; - list_add_tail(&page->list, list); + allocated += j; } spin_unlock_irqrestore(&zone->lock, flags); return allocated; @@ -426,10 +624,14 @@ void drain_local_pages(void) pset = &zone->pageset[smp_processor_id()]; for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { struct per_cpu_pages *pcp; + struct page *page, *save; pcp = &pset->pcp[i]; - pcp->count -= free_pages_bulk(zone, pcp->count, - &pcp->list, 0); + list_for_each_entry_safe(page, save, &pcp->list, lru) { + list_del(&page->lru); + pcp->count -= page->private; + free_pages_bulk(zone, page, 0); + } } } local_irq_restore(flags); @@ -445,15 +647,28 @@ static void free_hot_cold_page(struct pa struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; + struct page *head; kernel_map_pages(page, 1, 0); inc_page_state(pgfree); free_pages_check(__FUNCTION__, page); pcp = &zone->pageset[get_cpu()].pcp[cold]; local_irq_save(flags); - if (pcp->count >= pcp->high) - pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); - list_add(&page->list, &pcp->list); + while (pcp->count >= pcp->high) { + struct page *free = list_entry(pcp->list.prev, struct page, lru); + list_del(&free->lru); + pcp->count -= free->private; + free_pages_bulk(zone, free, 0); + } + head = list_entry(pcp->list.next, struct page, lru); + if (!list_empty(&pcp->list) && head->private < pcp->batch) { + list_add(&page->list, &head->list); + head->private++; + } else { + INIT_LIST_HEAD(&page->list); + list_add(&page->lru, &pcp->list); + page->private = 1; + } pcp->count++; local_irq_restore(flags); put_cpu(); @@ -478,31 +693,75 @@ void free_cold_page(struct page *page) static struct page *buffered_rmqueue(struct zone *zone, int order, int cold) { unsigned long flags; - struct page *page = NULL; + struct page *head, *page = NULL; + struct per_cpu_pages *pcp = NULL; if (order == 0) { - struct per_cpu_pages *pcp; - pcp = &zone->pageset[get_cpu()].pcp[cold]; local_irq_save(flags); - if (pcp->count <= pcp->low) - pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list); + if (pcp->count <= pcp->low) { + LIST_HEAD(tmp); + int k; + + k = rmqueue_bulk(zone, 0, pcp->batch, &tmp); + 
if (k) { + pcp->count += k; + head = list_entry(tmp.next, struct page, list); + list_del_init(&head->list); + head->private = k; + list_splice(&tmp, &head->list); + list_add(&head->lru, &pcp->list); + } + } if (pcp->count) { - page = list_entry(pcp->list.next, struct page, list); - list_del(&page->list); + head = list_entry(pcp->list.next, struct page, lru); + if (head->private == 1) { + list_del(&head->lru); + page = head; + page->private = 0; + } else { + page = list_entry(head->list.next, struct page,list); + list_del(&page->list); + head->private--; + } pcp->count--; } local_irq_restore(flags); put_cpu(); } - if (page == NULL) { + if (unlikely(!page)) { + LIST_HEAD(tmp); + int count; + + if (!order) + pcp = &zone->pageset[get_cpu()].pcp[cold]; + spin_lock_irqsave(&zone->lock, flags); - page = __rmqueue(zone, order); - spin_unlock_irqrestore(&zone->lock, flags); + count = __rmqueue(zone, order, &tmp); + spin_unlock(&zone->lock); + + if (!list_empty(&tmp)) + page = list_entry(tmp.next, struct page, list); + + if (!order && count > 1) { + struct page *head; + + list_del(&page->list); + pcp->count += count - 1; + head = list_entry(tmp.next, struct page, list); + list_del_init(&head->list); + head->private = count - 1; + list_splice(&tmp, &head->list); + list_add(&head->lru, &pcp->list); + } + + local_irq_restore(flags); + if (order && page) prep_compound_page(page, order); + else if (!order) + put_cpu(); } if (page != NULL) { @@ -820,6 +1079,17 @@ static void show_node(struct zone *zone) #define show_node(zone) do { } while (0) #endif +unsigned long nr_deferred_pages(void) +{ + struct zone *zone; + unsigned long order, pages = 0; + + for_each_zone(zone) + for (order = 0; order < MAX_ORDER; ++order) + pages += zone->free_area[order].locally_free << order; + return pages; +} + /* * Accumulate the page_state information across all CPUs. * The result is unavoidably approximate - it can change @@ -991,8 +1261,7 @@ void show_free_areas(void) } for_each_zone(zone) { - struct list_head *elem; - unsigned long nr, flags, order, total = 0; + unsigned long order, total = 0; show_node(zone); printk("%s: ", zone->name); @@ -1001,16 +1270,20 @@ void show_free_areas(void) continue; } - spin_lock_irqsave(&zone->lock, flags); + printk("buddy: "); for (order = 0; order < MAX_ORDER; order++) { - nr = 0; - list_for_each(elem, &zone->free_area[order].free_list) - ++nr; - total += nr << order; - printk("%lu*%lukB ", nr, K(1UL) << order); + printk("%lu*%lukB ", zone->free_area[order].globally_free, K(1UL) << order); + total += zone->free_area[order].globally_free << order; } - spin_unlock_irqrestore(&zone->lock, flags); - printk("= %lukB\n", K(total)); + printk("\ndefer: "); + for (order = 0; order < MAX_ORDER; order++) { + printk("%lu*%lukB ", zone->free_area[order].locally_free, K(1UL) << order); + total += zone->free_area[order].locally_free << order; + } + printk("\nactive: "); + for (order = 0; order < MAX_ORDER; order++) + printk("%lu*%lukB ", zone->free_area[order].active, K(1UL) << order); + printk("\n= %lukB\n", K(total)); } show_swap_cache_info(); @@ -1118,7 +1391,7 @@ static inline unsigned long wait_table_s * on IO we've got bigger problems than wait queue collision. * Limit the size of the wait table to a reasonable size. 
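The three rows show_free_areas() now prints correspond to the per-order counters introduced in struct free_area; their relationship is worth making explicit:

	/*
	 * Per-order bookkeeping after this patch:
	 *   globally_free -- blocks actually merged into the buddy lists
	 *   locally_free  -- blocks parked on the deferred list, unmerged
	 *   active        -- recent allocations, drives the defer heuristic
	 *
	 * zone->free_pages covers both kinds of free memory, so
	 *   zone->free_pages == sum over order of
	 *       (globally_free + locally_free) << order
	 * and nr_deferred_pages() above is the locally_free term alone.
	 */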
*/ - size = min(size, 4096UL); + size = min(size, 1UL << (16 + fls(NR_CPUS))); return max(size, 4UL); } @@ -1247,7 +1520,7 @@ static void __init free_area_init_core(s batch = zone->present_pages / 1024; if (batch * PAGE_SIZE > 256 * 1024) batch = (256 * 1024) / PAGE_SIZE; - batch /= 4; /* We effectively *= 4 below */ + batch *= 4; /* We effectively *= 4 below */ if (batch < 1) batch = 1; @@ -1307,8 +1580,11 @@ static void __init free_area_init_core(s for (i = 0; ; i++) { unsigned long bitmap_size; - + INIT_LIST_HEAD(&zone->free_area[i].deferred_pages); INIT_LIST_HEAD(&zone->free_area[i].free_list); + zone->free_area[i].globally_free = 0; + zone->free_area[i].locally_free = 0; + zone->free_area[i].active = 0; if (i == MAX_ORDER-1) { zone->free_area[i].map = NULL; break; @@ -1414,24 +1690,22 @@ static int frag_show(struct seq_file *m, pg_data_t *pgdat = (pg_data_t *)arg; struct zone *zone; struct zone *node_zones = pgdat->node_zones; - unsigned long flags; int order; for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { if (!zone->present_pages) continue; - spin_lock_irqsave(&zone->lock, flags); - seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order < MAX_ORDER; ++order) { - unsigned long nr_bufs = 0; - struct list_head *elem; - - list_for_each(elem, &(zone->free_area[order].free_list)) - ++nr_bufs; - seq_printf(m, "%6lu ", nr_bufs); - } - spin_unlock_irqrestore(&zone->lock, flags); + seq_printf(m, "Node %d, zone %8s\n", pgdat->node_id, zone->name); + seq_puts(m, "buddy: "); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6lu ", zone->free_area[order].globally_free); + seq_puts(m, "\ndefer: "); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6lu ", zone->free_area[order].locally_free); + seq_puts(m, "\nactive: "); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6lu ", zone->free_area[order].active); seq_putc(m, '\n'); } return 0; @@ -1450,6 +1724,7 @@ static char *vmstat_text[] = { "nr_unstable", "nr_page_table_pages", "nr_mapped", + "nr_swapcache", "nr_slab", "pgpgin", diff -prauN linux-2.6.0-test1/mm/page_io.c wli-2.6.0-test1-37/mm/page_io.c --- linux-2.6.0-test1/mm/page_io.c 2003-07-13 20:30:41.000000000 -0700 +++ wli-2.6.0-test1-37/mm/page_io.c 2003-07-14 08:52:52.000000000 -0700 @@ -16,8 +16,6 @@ #include #include #include -#include /* for block_sync_page() */ -#include #include #include @@ -32,7 +30,7 @@ get_swap_bio(int gfp_flags, struct page swp_entry_t entry; BUG_ON(!PageSwapCache(page)); - entry.val = page->index; + entry.val = page->private; sis = get_swap_info_struct(swp_type(entry)); bio->bi_sector = map_swap_page(sis, swp_offset(entry)) * @@ -130,13 +128,6 @@ out: return ret; } -struct address_space_operations swap_aops = { - .writepage = swap_writepage, - .readpage = swap_readpage, - .sync_page = block_sync_page, - .set_page_dirty = __set_page_dirty_nobuffers, -}; - /* * A scruffy utility function to read or write an arbitrary swap page * and wait on the I/O. 
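The page_io.c hunks that follow complete a theme running through the whole patch: a swapcache page is no longer recognized by page->mapping == &swapper_space with the entry stored in page->index. Instead PG_swapcache marks the page and page->private carries the swp_entry_t, freeing the mapping word for anon rmap; rw_swap_page_sync() correspondingly stops faking a ->mapping. The idiom, as a hypothetical helper (the open-coded form is in get_swap_bio() above):

	static inline swp_entry_t page_swp_entry(struct page *page)
	{
		swp_entry_t entry;

		BUG_ON(!PageSwapCache(page));	/* tests PG_swapcache now */
		entry.val = page->private;	/* was page->index */
		return entry;
	}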
@@ -150,9 +141,8 @@ int rw_swap_page_sync(int rw, swp_entry_ lock_page(page); - BUG_ON(page->mapping); - page->mapping = &swapper_space; - page->index = entry.val; + SetPageSwapCache(page); + page->private = entry.val; if (rw == READ) { ret = swap_readpage(NULL, page); @@ -161,7 +151,7 @@ int rw_swap_page_sync(int rw, swp_entry_ ret = swap_writepage(page, &swap_wbc); wait_on_page_writeback(page); } - page->mapping = NULL; + ClearPageSwapCache(page); if (ret == 0 && (!PageUptodate(page) || PageError(page))) ret = -EIO; return ret; diff -prauN linux-2.6.0-test1/mm/readahead.c wli-2.6.0-test1-37/mm/readahead.c --- linux-2.6.0-test1/mm/readahead.c 2003-07-13 20:30:01.000000000 -0700 +++ wli-2.6.0-test1-37/mm/readahead.c 2003-07-14 08:33:37.000000000 -0700 @@ -218,7 +218,7 @@ __do_page_cache_readahead(struct address /* * Preallocate as many pages as we will need. */ - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); for (page_idx = 0; page_idx < nr_to_read; page_idx++) { unsigned long page_offset = offset + page_idx; @@ -229,16 +229,16 @@ __do_page_cache_readahead(struct address if (page) continue; - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); page = page_cache_alloc_cold(mapping); - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); if (!page) break; page->index = page_offset; list_add(&page->list, &page_pool); ret++; } - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); /* * Now start the IO. We ignore I/O errors - if the page is not diff -prauN linux-2.6.0-test1/mm/rmap.c wli-2.6.0-test1-37/mm/rmap.c --- linux-2.6.0-test1/mm/rmap.c 2003-07-13 20:38:02.000000000 -0700 +++ wli-2.6.0-test1-37/mm/rmap.c 2003-07-14 10:24:21.000000000 -0700 @@ -5,528 +5,634 @@ * Released under the General Public License (GPL). * * - * Simple, low overhead pte-based reverse mapping scheme. - * This is kept modular because we may want to experiment - * with object-based reverse mapping schemes. Please try - * to keep this thing as modular as possible. + * Simple, low overhead reverse mapping scheme. + * Please try to keep this thing as modular as possible. */ /* * Locking: - * - the page->pte.chain is protected by the PG_chainlock bit, + * - the page->rmap field is protected by the PG_rmaplock bit, * which nests within the the mm->page_table_lock, * which nests within the page lock. * - because swapout locking is opposite to the locking order * in the page fault path, the swapout path uses trylocks * on the mm->page_table_lock */ + #include #include #include #include #include #include -#include +#include #include #include - -#include -#include -#include +#include +#include #include /* #define DEBUG_RMAP */ /* - * Shared pages have a chain of pte_chain structures, used to locate - * all the mappings to this page. We only need a pointer to the pte - * here, the page struct for the page table page contains the process - * it belongs to and the offset within that process. - * - * We use an array of pte pointers in this structure to minimise cache misses - * while traversing reverse maps. + * struct addresser: for next_rmap_address to dole out user addresses + * one by one to page_referenced() or try_to_unmap() */ -#define NRPTE ((L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(pte_addr_t)) +struct addresser { + unsigned long address, count; + struct rmap_chain *chain; + int index; +}; -/* - * next_and_idx encodes both the address of the next pte_chain and the - * offset of the highest-index used pte in ptes[]. 
- */ -struct pte_chain { - unsigned long next_and_idx; - pte_addr_t ptes[NRPTE]; -} ____cacheline_aligned; +static kmem_cache_t *rmap_chain_cache; + +static DEFINE_PER_CPU(struct rmap_chain *, rmap_chain) = NULL; -kmem_cache_t *pte_chain_cache; +kmem_cache_t *anon_cache; -static inline struct pte_chain *pte_chain_next(struct pte_chain *pte_chain) +static void anon_ctor(void *arg, kmem_cache_t *cache, unsigned long unused) { - return (struct pte_chain *)(pte_chain->next_and_idx & ~NRPTE); + struct anon *anon = (struct anon *)arg; + atomic_set(&anon->count, 1); + anon->lock = SPIN_LOCK_UNLOCKED; + INIT_LIST_HEAD(&anon->list); + INIT_RCU_HEAD(&anon->rcu); } -static inline struct pte_chain *pte_chain_ptr(unsigned long pte_chain_addr) +static void rmap_chain_ctor(void *arg, kmem_cache_t *cache, unsigned long flags) { - return (struct pte_chain *)(pte_chain_addr & ~NRPTE); + int i; + struct rmap_chain *chain = (struct rmap_chain *)arg; + + for (i = 0; i < NRSLOT; ++i) + chain->slot[i] = NOADDR; + chain->next = NULL; } -static inline int pte_chain_idx(struct pte_chain *pte_chain) +static inline void rmap_chain_dtor(struct rmap_chain *chain) { - return pte_chain->next_and_idx & NRPTE; + int i; + for (i = 0; i < NRSLOT; ++i) + if (chain->slot[i] != NOADDR) + chain->slot[i] = NOADDR; + if (chain->next) + chain->next = NULL; } -static inline unsigned long -pte_chain_encode(struct pte_chain *pte_chain, int idx) +void __init init_rmap(void) { - return (unsigned long)pte_chain | idx; + anon_cache = kmem_cache_create("anon", sizeof(struct anon), 0, 0, anon_ctor, NULL); + if (!anon_cache) + panic("init_rmap: Cannot alloc anon slab cache\n"); + rmap_chain_cache = kmem_cache_create("rmap_chain", sizeof(struct rmap_chain), 0, 0, rmap_chain_ctor, NULL); } -/* - * pte_chain list management policy: - * - * - If a page has a pte_chain list then it is shared by at least two processes, - * because a single sharing uses PageDirect. (Well, this isn't true yet, - * coz this code doesn't collapse singletons back to PageDirect on the remove - * path). - * - A pte_chain list has free space only in the head member - all succeeding - * members are 100% full. - * - If the head element has free space, it occurs in its leading slots. - * - All free space in the pte_chain is at the start of the head member. - * - Insertion into the pte_chain puts a pte pointer in the last free slot of - * the head member. - * - Removal from a pte chain moves the head pte of the head member onto the - * victim pte and frees the head member if it became empty. - */ +int exec_rmap(struct mm_struct *mm) +{ + struct anon *anon = kmem_cache_alloc(anon_cache, GFP_KERNEL); + if (!anon) + return -ENOMEM; + mm->anon = anon; + /* unique reference; no locking required */ + list_add_rcu(&mm->anon_list, &anon->list); + return 0; +} -/** - ** VM stuff below this comment - **/ +void dup_rmap(struct mm_struct *new, struct mm_struct *old) +{ + struct anon *anon = old->anon; + atomic_inc(&anon->count); + new->anon = anon; + spin_lock(&anon->lock); + list_add_tail_rcu(&new->anon_list, &anon->list); + spin_unlock(&anon->lock); +} -/** - * page_referenced - test if the page was referenced - * @page: the page to test - * - * Quick test_and_clear_referenced for all mappings to a page, - * returns the number of processes which referenced the page. - * Caller needs to hold the pte_chain_lock. - * - * If the page has a single-entry pte_chain, collapse that back to a PageDirect - * representation. This way, it's only done under memory pressure. 
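exec_rmap() and dup_rmap() above, with exit_rmap() just below, replace per-pte pte_chains for anonymous memory with one refcounted struct anon per fork-family of mms, RCU-freed so the lock-free list walk in page_referenced_anon() stays safe. The lifecycle in caller order (a condensed sketch; the actual call sites are in fs/exec.c and kernel/fork.c, outside this section):

	if (exec_rmap(mm))		/* execve(): fresh anon, list = {mm} */
		goto fail;		/* -ENOMEM */

	dup_rmap(child, parent);	/* fork(): count++, RCU list insert */

	exit_rmap(mm);			/* exit(): list_del_rcu(); the last
					 * dropper frees via call_rcu()    */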
- */
-int page_referenced(struct page * page)
+static void free_anon(void *__anon)
 {
-	struct pte_chain *pc;
-	int referenced = 0;
+	struct anon *anon = (struct anon *)__anon;
+	INIT_LIST_HEAD(&anon->list);
+	atomic_set(&anon->count, 1);
+	kmem_cache_free(anon_cache, anon);
+}
 
-	if (TestClearPageReferenced(page))
-		referenced++;
+void exit_rmap(struct mm_struct *mm)
+{
+	struct anon *anon = mm->anon;
 
-	if (PageDirect(page)) {
-		pte_t *pte = rmap_ptep_map(page->pte.direct);
-		if (ptep_test_and_clear_young(pte))
-			referenced++;
-		rmap_ptep_unmap(pte);
-	} else {
-		int nr_chains = 0;
+	mm->anon = NULL;
+	spin_lock(&anon->lock);
+	list_del_rcu(&mm->anon_list);
+	spin_unlock(&anon->lock);
+
+	if (!atomic_dec_and_test(&anon->count))
+		return;
+
+	call_rcu(&anon->rcu, free_anon, anon);
+}
+
+/**
+ ** Functions for manipulating struct rmap_chain.
+ **/
 
-		/* Check all the page tables mapping this page. */
-		for (pc = page->pte.chain; pc; pc = pte_chain_next(pc)) {
-			int i;
-
-			for (i = NRPTE-1; i >= 0; i--) {
-				pte_addr_t pte_paddr = pc->ptes[i];
-				pte_t *p;
-
-				if (!pte_paddr)
-					break;
-				p = rmap_ptep_map(pte_paddr);
-				if (ptep_test_and_clear_young(p))
-					referenced++;
-				rmap_ptep_unmap(p);
-				nr_chains++;
+/*
+ * Boolean rmap_get_cpu() ensures the cpu has an rmap_chain cached
+ * in case it is needed later while a lock is held. It is never needed
+ * when page_add_rmap() is adding a freshly allocated anon page.
+ * The caller does put_cpu() once ->page_table_lock prevents preemption.
+ */
+int rmap_get_cpu(void)
+{
+	struct rmap_chain **cache, *chain;
+	might_sleep();
+	cache = &per_cpu(rmap_chain, get_cpu());
+	if (*cache)
+		return 1;
+	put_cpu();
+	chain = kmem_cache_alloc(rmap_chain_cache, GFP_KERNEL);
+	cache = &per_cpu(rmap_chain, get_cpu());
+	if (*cache)
+		kmem_cache_free(rmap_chain_cache, chain);
+	else if (chain)
+		*cache = chain;
+	else {
+		put_cpu();
+		return 0;
+	}
+	return 1;
+}
+
+static struct rmap_chain *get_rmap_chain(void)
+{
+	struct rmap_chain **cache, *chain;
+	int i;
+
+	/*
+	 * ->page_table_lock and rmap_lock are held, no need to get_cpu()
+	 */
+	cache = &per_cpu(rmap_chain, smp_processor_id());
+	chain = *cache;
+	*cache = NULL;
+	for (i = 0; i < NRSLOT; ++i)
+		chain->slot[i] = NOADDR;
+	chain->next = NULL;
+	return chain;
+}
+
+void add_rmap_address(struct page *page, unsigned long address)
+{
+	struct rmap_chain *chain = page->chain;
+	int i = 0;
+
+	if (!chain)
+		chain = page->chain = get_rmap_chain();
+	else {
+		/*
+		 * Check lest duplicates arise, and find a free slot at the end
+		 */
+		for (chain = page->chain; ; chain = chain->next) {
+			for (i = 0; i < NRSLOT; ++i) {
+				if (chain->slot[i] == NOADDR)
+					goto set;
+				else if (chain->slot[i] == address)
+					return;
 			}
+			if (!chain->next)
+				chain->next = get_rmap_chain();
 		}
-		if (nr_chains == 1) {
-			pc = page->pte.chain;
-			page->pte.direct = pc->ptes[NRPTE-1];
-			SetPageDirect(page);
-			pc->ptes[NRPTE-1] = 0;
-			__pte_chain_free(pc);
+	}
+set:
+	chain->slot[i] = address;
+}
+
+static int
+next_rmap_address(struct page *page, struct vm_area_struct *vma,
+			struct addresser *addresser)
+{
+	/* bootstrap it */
+	if (addresser->address == NOADDR) {
+		/* set chain and index for next call */
+		addresser->chain = page->chain;
+		addresser->index = 0;
+		if (vma) {
+			addresser->address = vma_address(page, vma);
+			if (addresser->address != NOADDR)
+				return 1;
+		} else {
+			addresser->address = page->index;
+			return 1;
 		}
 	}
-	return referenced;
+	while (addresser->chain) {
+		if (addresser->index >= NRSLOT)
+			addresser->index = 0;
+		addresser->address =
+			addresser->chain->slot[addresser->index];
+		if (addresser->address == NOADDR)
+			break;
+		addresser->index++;
+		if (addresser->index >= NRSLOT)
+			addresser->chain = addresser->chain->next;
+		if (!vma || addresser->address != vma_address(page, vma))
+			return 1;
+	}
+	return 0;
 }
 
-/**
- * page_add_rmap - add reverse mapping entry to a page
- * @page: the page to add the mapping to
- * @ptep: the page table entry mapping this page
- *
- * Add a new pte reverse mapping to a page.
- * The caller needs to hold the mm->page_table_lock.
- */
-struct pte_chain *
-page_add_rmap(struct page *page, pte_t *ptep, struct pte_chain *pte_chain)
+void clear_page_chained(struct page *page)
 {
-	pte_addr_t pte_paddr = ptep_to_paddr(ptep);
-	struct pte_chain *cur_pte_chain;
+	struct rmap_chain *chain = page->chain;
 
-	if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
-		return pte_chain;
+	/*
+	 * This is only called when mapcount goes to 0, which
+	 * means it's possible for a page to accumulate a large
+	 * chain of stale addresses. But normally try_to_unmap_one()
+	 * will bring the count to 0 and free them all here.
+	 */
+	do {
+		struct rmap_chain *next = chain->next;
+		rmap_chain_dtor(chain);
+		kmem_cache_free(rmap_chain_cache, chain);
+		chain = next;
+	} while (chain);
+	page->chain = NULL;
+}
 
-	pte_chain_lock(page);
+/**
+ ** Subfunctions of page_referenced(): page_referenced_one() called
+ ** repeatedly from page_referenced_anon() and page_referenced_obj().
+ **/
 
-	if (page->pte.direct == 0) {
-		page->pte.direct = pte_paddr;
-		SetPageDirect(page);
-		inc_page_state(nr_mapped);
-		goto out;
-	}
+static inline int page_referenced_one(struct page *page, struct mm_struct *mm,
+					struct addresser *addresser)
+{
+	pgd_t *pgd;
+	pmd_t *pmd;
+	pte_t *pte;
+	int referenced = 0;
 
-	if (PageDirect(page)) {
-		/* Convert a direct pointer into a pte_chain */
-		ClearPageDirect(page);
-		pte_chain->ptes[NRPTE-1] = page->pte.direct;
-		pte_chain->ptes[NRPTE-2] = pte_paddr;
-		pte_chain->next_and_idx = pte_chain_encode(NULL, NRPTE-2);
-		page->pte.direct = 0;
-		page->pte.chain = pte_chain;
-		pte_chain = NULL;	/* We consumed it */
+	if (!spin_trylock(&mm->page_table_lock)) {
+		referenced = 1;
 		goto out;
 	}
 
-	cur_pte_chain = page->pte.chain;
-	if (cur_pte_chain->ptes[0]) {	/* It's full */
-		pte_chain->next_and_idx = pte_chain_encode(cur_pte_chain,
-						NRPTE - 1);
-		page->pte.chain = pte_chain;
-		pte_chain->ptes[NRPTE-1] = pte_paddr;
-		pte_chain = NULL;	/* We consumed it */
-		goto out;
-	}
-	cur_pte_chain->ptes[pte_chain_idx(cur_pte_chain) - 1] = pte_paddr;
-	cur_pte_chain->next_and_idx--;
+	pgd = pgd_offset(mm, addresser->address);
+	if (!pgd_present(*pgd))
+		goto out_unlock;
+
+	pmd = pmd_offset_map(pgd, addresser->address);
+	if (!pmd)
+		goto out_unlock;
+
+	if (!pmd_present(*pmd))
+		goto out_unmap_pmd;
+
+	pte = pte_offset_map(pmd, addresser->address);
+	if (!pte_present(*pte))
+		goto out_unmap_pte;
+
+	if (page_to_pfn(page) != pte_pfn(*pte))
+		goto out_unmap_pte;
+
+	referenced = ptep_test_and_clear_young(pte);
+	addresser->count--;
+
+out_unmap_pte:
+	pte_unmap(pte);
+out_unmap_pmd:
+	pmd_unmap(pmd);
+out_unlock:
+	spin_unlock(&mm->page_table_lock);
 out:
-	pte_chain_unlock(page);
-	return pte_chain;
+	return referenced;
 }
 
-/**
- * page_remove_rmap - take down reverse mapping to a page
- * @page: page to remove mapping from
- * @ptep: page table entry to remove
- *
- * Removes the reverse mapping from the pte_chain of the page,
- * after that the caller can clear the page table entry and free
- * the page.
- * Caller needs to hold the mm->page_table_lock.
- */ -void page_remove_rmap(struct page *page, pte_t *ptep) +static inline int +page_referenced_anon(struct page *page, struct addresser *addresser) { - pte_addr_t pte_paddr = ptep_to_paddr(ptep); - struct pte_chain *pc; + struct mm_struct *mm; + struct anon *anon; + int referenced = 0; - if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) - return; + rcu_read_lock(); /* anon->lock */ - pte_chain_lock(page); + anon = page_anon(page); + if (!anon) + goto out; - if (!page_mapped(page)) - goto out_unlock; /* remap_page_range() from a driver? */ + list_for_each_entry_rcu(mm, &anon->list, anon_list) { + if (!mm->anon || !mm->rss) + continue; + addresser->address = NOADDR; + while (next_rmap_address(page, NULL, addresser)) { + referenced += page_referenced_one(page, mm, addresser); + if (!addresser->count) + goto out; + } + } +out: + rcu_read_unlock(); /* anon->lock */ + return referenced; +} - if (PageDirect(page)) { - if (page->pte.direct == pte_paddr) { - page->pte.direct = 0; - ClearPageDirect(page); - goto out; +static inline int page_referenced_obj(struct page *page, struct addresser *addresser) +{ + struct address_space *mapping = page_mapping(page); + struct vm_area_struct *vma; + int referenced = 0; + + /* bail if it's a Morton page */ + if (!mapping) + return 0; + + rcu_read_lock(); /* mapping->i_shared_lock */ + list_for_each_entry_rcu(vma, &mapping->i_mmap, shared) { + if (vma->vm_flags & VM_DEAD) + continue; + if (!vma->vm_mm->rss) + continue; + addresser->address = NOADDR; + while (next_rmap_address(page, vma, addresser)) { + referenced += page_referenced_one(page, vma->vm_mm, addresser); + if (!addresser->count) + goto out; } - } else { - struct pte_chain *start = page->pte.chain; - struct pte_chain *next; - int victim_i = -1; - - for (pc = start; pc; pc = next) { - int i; - - next = pte_chain_next(pc); - if (next) - prefetch(next); - for (i = pte_chain_idx(pc); i < NRPTE; i++) { - pte_addr_t pa = pc->ptes[i]; - - if (victim_i == -1) - victim_i = i; - if (pa != pte_paddr) - continue; - pc->ptes[i] = start->ptes[victim_i]; - start->ptes[victim_i] = 0; - if (victim_i == NRPTE-1) { - /* Emptied a pte_chain */ - page->pte.chain = pte_chain_next(start); - __pte_chain_free(start); - } else { - start->next_and_idx++; - } + } + + list_for_each_entry_rcu(vma, &mapping->i_mmap_shared, shared) { + if (vma->vm_flags & VM_DEAD) + continue; + if (!vma->vm_mm->rss) + continue; + addresser->address = NOADDR; + while (next_rmap_address(page, vma, addresser)) { + referenced += page_referenced_one(page, vma->vm_mm, addresser); + if (!addresser->count) goto out; - } } } out: - if (!page_mapped(page)) - dec_page_state(nr_mapped); -out_unlock: - pte_chain_unlock(page); - return; + rcu_read_unlock(); /* mapping->i_shared_lock */ + return referenced; } /** - * try_to_unmap_one - worker function for try_to_unmap - * @page: page to unmap - * @ptep: page table entry to unmap from page + * page_referenced - test if the page was referenced + * @page: the page to test * - * Internal helper function for try_to_unmap, called for each page - * table entry mapping a page. Because locking order here is opposite - * to the locking order used by the page fault path, we use trylocks. - * Locking: - * page lock shrink_list(), trylock - * pte_chain_lock shrink_list() - * mm->page_table_lock try_to_unmap_one(), trylock + * returns the number of ptes which referenced the page. + * Caller needs to hold the rmap_lock. 
 */
-static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t));
-static int try_to_unmap_one(struct page * page, pte_addr_t paddr)
+int page_referenced(struct page * page)
 {
-	pte_t *ptep = rmap_ptep_map(paddr);
-	unsigned long address = ptep_to_address(ptep);
-	struct mm_struct * mm = ptep_to_mm(ptep);
-	struct vm_area_struct * vma;
-	pte_t pte;
-	int ret;
+	int referenced = !!TestClearPageReferenced(page);
+	struct addresser addresser;
+
+	addresser.count = atomic_read(&page->mapcount);
+	if (!addresser.count || !page->__mapping)
+		return 0;
+	else if (PageAnon(page))
+		referenced += page_referenced_anon(page, &addresser);
+	else
+		referenced += page_referenced_obj(page, &addresser);
+	return referenced;
+}
+
+void page_turn_rmap(struct page *page, struct vm_area_struct *vma)
+{
+	struct anon *old, *new;
+	old = page_anon(page);
+	new = vma->vm_mm->anon;
+
+	BUG_ON(!PageAnon(page));
+	BUG_ON(atomic_read(&page->mapcount) != 1);
+
+	if (old == new)
+		return;
+
+	rmap_lock(page);
+	set_page_mapping(page, new);
+	rmap_unlock(page);
+}
+
+void page_move_rmap(struct page *page, struct vm_area_struct *vma,
+			unsigned long old, unsigned long new)
+{
+	if (!page_mapped(page) || !page->__mapping)
+		return;
+
+	rmap_lock(page);
+
+	if (PageAnon(page)) {
+		/*
+		 * A bare mapcount == 1 check is not enough here:
+		 * the mapcount could be 1 but the page could still
+		 * have a chain, and our new address in that chain,
+		 * so only overwrite page->index when there is no chain.
+		 */
+		if (atomic_read(&page->mapcount) == 1 && !page->chain)
+			page->index = new;
+		else if (new != page->index)
+			add_rmap_address(page, new);
+	} else {
+		/*
+		 * Just in case things are nonlinear.
+		 */
+		if (old != vma_address(page, vma))
+			add_rmap_address(page, new);
+	}
 
-	if (!mm)
-		BUG();
+	rmap_unlock(page);
+}
+
+static int try_to_unmap_one(struct page *page, struct mm_struct *mm,
+				struct addresser *addresser, struct vm_area_struct *vma)
+{
+	pgd_t *pgd;
+	pmd_t *pmd;
+	pte_t *pte;
+	pte_t pteval;
+	unsigned long address = addresser->address;
+	int ret = SWAP_AGAIN;
 
 	/*
 	 * We need the page_table_lock to protect us from page faults,
 	 * munmap, fork, etc...
 	 */
-	if (!spin_trylock(&mm->page_table_lock)) {
-		rmap_ptep_unmap(ptep);
-		return SWAP_AGAIN;
-	}
-
+	if (!spin_trylock(&mm->page_table_lock))
+		goto out;
 
-	/* During mremap, it's possible pages are not in a VMA. */
-	vma = find_vma(mm, address);
-	if (!vma) {
+	/* During mremap the page may be in no VMA; if it is mlock()'d, we can't unmap it. */
+	if (!vma)
+		vma = find_vma(mm, address);
+	if (!vma || (vma->vm_flags & VM_LOCKED)) {
 		ret = SWAP_FAIL;
 		goto out_unlock;
 	}
 
-	/* The page is mlock()d, we cannot swap it out. */
-	if (vma->vm_flags & VM_LOCKED) {
-		ret = SWAP_FAIL;
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd))
 		goto out_unlock;
-	}
 
+	pmd = pmd_offset_map(pgd, address);
+	if (!pmd_present(*pmd))
+		goto out_unmap_pmd;
+	pte = pte_offset_map(pmd, address);
+	if (!pte_present(*pte))
+		goto out_unmap_pte;
+
+	if (page_to_pfn(page) != pte_pfn(*pte))
+		goto out_unmap_pte;
+
+	addresser->count--;
 	/* Nuke the page table entry. */
 	flush_cache_page(vma, address);
-	pte = ptep_get_and_clear(ptep);
+	pteval = vm_ptep_get_and_clear(vma, pte, address);
 	flush_tlb_page(vma, address);
 
-	if (PageSwapCache(page)) {
+	if (PageAnon(page)) {
 		/*
 		 * Store the swap location in the pte.
 		 * See handle_pte_fault() ...
 		 */
-		swp_entry_t entry = { .val = page->index };
+		swp_entry_t entry = { .val = page->private };
+		BUG_ON(!PageSwapCache(page));
 		swap_duplicate(entry);
-		set_pte(ptep, swp_entry_to_pte(entry));
-		BUG_ON(pte_file(*ptep));
+		vm_set_pte(vma, pte, swp_entry_to_pte(entry), address);
+		BUG_ON(pte_file(*pte));
 	} else {
-		unsigned long pgidx;
 		/*
-		 * If a nonlinear mapping then store the file page offset
-		 * in the pte.
+		 * If a nonlinear mapping from sys_remap_file_pages(),
+		 * then store the file page offset in the pte.
 		 */
-		pgidx = (address - vma->vm_start) >> PAGE_SHIFT;
-		pgidx += vma->vm_pgoff;
-		pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
-		if (page->index != pgidx) {
-			set_pte(ptep, pgoff_to_pte(page->index));
-			BUG_ON(!pte_file(*ptep));
+		if (address != vma_address(page, vma)) {
+			vm_set_pte(vma, pte, pgoff_to_pte(page->index), address);
+			BUG_ON(!pte_file(*pte));
 		}
 	}
 
 	/* Move the dirty bit to the physical page now the pte is gone. */
-	if (pte_dirty(pte))
+	if (pte_dirty(pteval))
 		set_page_dirty(page);
 
-	mm->rss--;
+	BUG_ON(!atomic_read(&page->mapcount));
+	if (atomic_dec_and_test(&page->mapcount))
+		if (page->chain)
+			clear_page_chained(page);
 	page_cache_release(page);
-	ret = SWAP_SUCCESS;
+	mm->rss--;
+out_unmap_pte:
+	pte_unmap(pte);
+out_unmap_pmd:
+	pmd_unmap(pmd);
 out_unlock:
-	rmap_ptep_unmap(ptep);
 	spin_unlock(&mm->page_table_lock);
+out:
 	return ret;
 }
 
-/**
- * try_to_unmap - try to remove all page table mappings to a page
- * @page: the page to get unmapped
- *
- * Tries to remove all the page table entries which are mapping this
- * page, used in the pageout path. Caller must hold the page lock
- * and its pte chain lock. Return values are:
- *
- * SWAP_SUCCESS	- we succeeded in removing all mappings
- * SWAP_AGAIN	- we missed a trylock, try again later
- * SWAP_FAIL	- the page is unswappable
- */
-int try_to_unmap(struct page * page)
+static inline int try_to_unmap_anon(struct page *page, struct addresser *addresser)
 {
-	struct pte_chain *pc, *next_pc, *start;
-	int ret = SWAP_SUCCESS;
-	int victim_i = -1;
-
-	/* This page should not be on the pageout lists. */
-	if (PageReserved(page))
-		BUG();
-	if (!PageLocked(page))
-		BUG();
-	/* We need backing store to swap out a page. */
-	if (!page->mapping)
-		BUG();
-
-	if (PageDirect(page)) {
-		ret = try_to_unmap_one(page, page->pte.direct);
-		if (ret == SWAP_SUCCESS) {
-			page->pte.direct = 0;
-			ClearPageDirect(page);
-		}
+	struct mm_struct *mm;
+	struct anon *anon;
+	int ret = SWAP_AGAIN;
+
+	rcu_read_lock();	/* anon->lock */
+
+	anon = page_anon(page);
+	if (!anon)
 		goto out;
-	}
 
-	start = page->pte.chain;
-	for (pc = start; pc; pc = next_pc) {
-		int i;
-
-		next_pc = pte_chain_next(pc);
-		if (next_pc)
-			prefetch(next_pc);
-		for (i = pte_chain_idx(pc); i < NRPTE; i++) {
-			pte_addr_t pte_paddr = pc->ptes[i];
-
-			if (!pte_paddr)
-				continue;
-			if (victim_i == -1)
-				victim_i = i;
-
-			switch (try_to_unmap_one(page, pte_paddr)) {
-			case SWAP_SUCCESS:
-				/*
-				 * Release a slot. If we're releasing the
-				 * first pte in the first pte_chain then
-				 * pc->ptes[i] and start->ptes[victim_i] both
-				 * refer to the same thing.  It works out.
-				 */
-				pc->ptes[i] = start->ptes[victim_i];
-				start->ptes[victim_i] = 0;
-				victim_i++;
-				if (victim_i == NRPTE) {
-					page->pte.chain = pte_chain_next(start);
-					__pte_chain_free(start);
-					start = page->pte.chain;
-					victim_i = 0;
-				} else {
-					start->next_and_idx++;
-				}
-				break;
-			case SWAP_AGAIN:
-				/* Skip this pte, remembering status.
*/ - ret = SWAP_AGAIN; - continue; - case SWAP_FAIL: - ret = SWAP_FAIL; + list_for_each_entry_rcu(mm, &anon->list, anon_list) { + if (!mm->anon) + continue; + addresser->address = NOADDR; + while (next_rmap_address(page, NULL, addresser)) { + ret = try_to_unmap_one(page, mm, addresser, NULL); + if (ret == SWAP_FAIL || !addresser->count) goto out; - } } } out: - if (!page_mapped(page)) - dec_page_state(nr_mapped); + rcu_read_unlock(); /* anon->lock */ return ret; } -/** - ** No more VM stuff below this comment, only pte_chain helper - ** functions. - **/ - -static void pte_chain_ctor(void *p, kmem_cache_t *cachep, unsigned long flags) +static inline int try_to_unmap_obj(struct page *page, struct addresser *addresser) { - struct pte_chain *pc = p; + struct address_space *mapping; + struct vm_area_struct *vma; + int ret = SWAP_AGAIN; + + mapping = page_mapping(page); + + /* bail if it's a Morton page */ + if (!mapping) + return SWAP_FAIL; + + rcu_read_lock(); /* mapping->i_shared_lock */ + + list_for_each_entry_rcu(vma, &mapping->i_mmap, shared) { + if (vma->vm_flags & VM_DEAD) + continue; + if (!vma->vm_mm->rss) + continue; + addresser->address = NOADDR; + while (next_rmap_address(page, vma, addresser)) { + ret = try_to_unmap_one(page, vma->vm_mm, addresser, vma); + if (ret == SWAP_FAIL || !addresser->count) + goto out; + } + } - memset(pc, 0, sizeof(*pc)); + list_for_each_entry_rcu(vma, &mapping->i_mmap_shared, shared) { + if (vma->vm_flags & VM_DEAD) + continue; + if (!vma->vm_mm->rss) + continue; + addresser->address = NOADDR; + while (next_rmap_address(page, vma, addresser)) { + ret = try_to_unmap_one(page, vma->vm_mm, addresser, vma); + if (ret == SWAP_FAIL || !addresser->count) + goto out; + } + } +out: + rcu_read_unlock(); /* mapping->i_shared_lock */ + return ret; } -DEFINE_PER_CPU(struct pte_chain *, local_pte_chain) = 0; - /** - * __pte_chain_free - free pte_chain structure - * @pte_chain: pte_chain struct to free - */ -void __pte_chain_free(struct pte_chain *pte_chain) -{ - struct pte_chain **pte_chainp; - - pte_chainp = &get_cpu_var(local_pte_chain); - if (pte_chain->next_and_idx) - pte_chain->next_and_idx = 0; - if (*pte_chainp) - kmem_cache_free(pte_chain_cache, *pte_chainp); - *pte_chainp = pte_chain; - put_cpu_var(local_pte_chain); -} - -/* - * pte_chain_alloc(): allocate a pte_chain structure for use by page_add_rmap(). + * try_to_unmap - try to remove all page table mappings to a page + * @page: the page to get unmapped + * + * Tries to remove all the page table entries which are mapping this + * page, used in the pageout path. Caller must hold the page lock + * and its pte chain lock. Return values are: * - * The caller of page_add_rmap() must perform the allocation because - * page_add_rmap() is invariably called under spinlock. Often, page_add_rmap() - * will not actually use the pte_chain, because there is space available in one - * of the existing pte_chains which are attached to the page. So the case of - * allocating and then freeing a single pte_chain is specially optimised here, - * with a one-deep per-cpu cache. 
+ * SWAP_SUCCESS - we succeeded in removing all mappings + * SWAP_AGAIN - we missed a trylock, try again later + * SWAP_FAIL - the page is unswappable */ -struct pte_chain *pte_chain_alloc(int gfp_flags) +int try_to_unmap(struct page *page) { - struct pte_chain *ret; - struct pte_chain **pte_chainp; - - if (gfp_flags & __GFP_WAIT) - might_sleep(); + struct addresser addresser; + int ret; - pte_chainp = &get_cpu_var(local_pte_chain); - if (*pte_chainp) { - ret = *pte_chainp; - *pte_chainp = NULL; - put_cpu_var(local_pte_chain); - } else { - put_cpu_var(local_pte_chain); - ret = kmem_cache_alloc(pte_chain_cache, gfp_flags); + BUG_ON(PageReserved(page)); + BUG_ON(!PageLocked(page)); + BUG_ON(!page_mapped(page)); + + addresser.count = atomic_read(&page->mapcount); + if (PageAnon(page)) + ret = try_to_unmap_anon(page, &addresser); + else + ret = try_to_unmap_obj(page, &addresser); + if (!page_mapped(page)) { + dec_page_state(nr_mapped); + if (PageAnon(page)) + clear_page_anon(page); + ret = SWAP_SUCCESS; } return ret; } - -void __init pte_chain_init(void) -{ - pte_chain_cache = kmem_cache_create( "pte_chain", - sizeof(struct pte_chain), - 0, - SLAB_MUST_HWCACHE_ALIGN, - pte_chain_ctor, - NULL); - - if (!pte_chain_cache) - panic("failed to create pte_chain cache!\n"); -} diff -prauN linux-2.6.0-test1/mm/shmem.c wli-2.6.0-test1-37/mm/shmem.c --- linux-2.6.0-test1/mm/shmem.c 2003-07-13 20:33:41.000000000 -0700 +++ wli-2.6.0-test1-37/mm/shmem.c 2003-07-14 08:52:52.000000000 -0700 @@ -694,7 +694,7 @@ static int shmem_writepage(struct page * BUG_ON(!PageLocked(page)); BUG_ON(page_mapped(page)); - mapping = page->mapping; + mapping = page_mapping(page); index = page->index; inode = mapping->host; info = SHMEM_I(inode); @@ -1109,7 +1109,7 @@ static struct inode_operations shmem_sym static int shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; return shmem_getpage(inode, page->index, &page, SGP_WRITE); } @@ -1765,7 +1765,7 @@ static void destroy_inodecache(void) static struct address_space_operations shmem_aops = { .writepage = shmem_writepage, - .set_page_dirty = __set_page_dirty_nobuffers, + .set_page_dirty = set_page_dirty_nobuffers, #ifdef CONFIG_TMPFS .prepare_write = shmem_prepare_write, .commit_write = simple_commit_write, diff -prauN linux-2.6.0-test1/mm/slab.c wli-2.6.0-test1-37/mm/slab.c --- linux-2.6.0-test1/mm/slab.c 2003-07-13 20:36:48.000000000 -0700 +++ wli-2.6.0-test1-37/mm/slab.c 2003-07-14 07:07:23.000000000 -0700 @@ -2717,7 +2717,7 @@ void ptrinfo(unsigned long addr) printk("No pgd.\n"); break; } - pmd = pmd_offset(pgd, addr); + pmd = pmd_offset_kernel(pgd, addr); if (pmd_none(*pmd)) { printk("No pmd.\n"); break; diff -prauN linux-2.6.0-test1/mm/swap_state.c wli-2.6.0-test1-37/mm/swap_state.c --- linux-2.6.0-test1/mm/swap_state.c 2003-07-13 20:33:46.000000000 -0700 +++ wli-2.6.0-test1-37/mm/swap_state.c 2003-07-14 08:52:52.000000000 -0700 @@ -21,22 +21,16 @@ static struct backing_dev_info swap_back .memory_backed = 1, /* Does not contribute to dirty memory */ }; -extern struct address_space_operations swap_aops; +static struct address_space_operations swap_aops = { + .writepage = swap_writepage, + .readpage = swap_readpage, +}; struct address_space swapper_space = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC), - .page_lock = SPIN_LOCK_UNLOCKED, - .clean_pages = LIST_HEAD_INIT(swapper_space.clean_pages), - .dirty_pages = 
LIST_HEAD_INIT(swapper_space.dirty_pages), - .io_pages = LIST_HEAD_INIT(swapper_space.io_pages), - .locked_pages = LIST_HEAD_INIT(swapper_space.locked_pages), + .page_lock = MAPPING_RW_LOCK_UNLOCKED, .a_ops = &swap_aops, .backing_dev_info = &swap_backing_dev_info, - .i_mmap = LIST_HEAD_INIT(swapper_space.i_mmap), - .i_mmap_shared = LIST_HEAD_INIT(swapper_space.i_mmap_shared), - .i_shared_sem = __MUTEX_INITIALIZER(swapper_space.i_shared_sem), - .private_lock = SPIN_LOCK_UNLOCKED, - .private_list = LIST_HEAD_INIT(swapper_space.private_list), }; #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) @@ -58,30 +52,50 @@ void show_swap_cache_info(void) swap_cache_info.noent_race, swap_cache_info.exist_race); } +static int __add_to_swap_cache(struct page *page, swp_entry_t entry) +{ + int error; + + BUG_ON(PageSwapCache(page)); + BUG_ON(PagePrivate(page)); + error = radix_tree_preload(GFP_ATOMIC); + if (error) + return error; + + page_cache_get(page); + mapping_wrlock(&swapper_space.page_lock); + error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); + if (error) + page_cache_release(page); + else { + SetPageLocked(page); + SetPageSwapCache(page); + page->private = entry.val; + inc_page_state(nr_swapcache); + } + mapping_wrunlock(&swapper_space.page_lock); + radix_tree_preload_end(); + return error; +} + static int add_to_swap_cache(struct page *page, swp_entry_t entry) { int error; - if (page->mapping) - BUG(); if (!swap_duplicate(entry)) { INC_CACHE_INFO(noent_race); return -ENOENT; } - error = add_to_page_cache(page, &swapper_space, entry.val, GFP_KERNEL); + error = __add_to_swap_cache(page, entry); /* * Anon pages are already on the LRU, we don't run lru_cache_add here. */ - if (error != 0) { + if (error) { swap_free(entry); if (error == -EEXIST) INC_CACHE_INFO(exist_race); return error; } - if (!PageLocked(page)) - BUG(); - if (!PageSwapCache(page)) - BUG(); INC_CACHE_INFO(add_total); return 0; } @@ -95,7 +109,9 @@ void __delete_from_swap_cache(struct pag BUG_ON(!PageLocked(page)); BUG_ON(!PageSwapCache(page)); BUG_ON(PageWriteback(page)); - __remove_from_page_cache(page); + radix_tree_delete(&swapper_space.page_tree, page->private); + ClearPageSwapCache(page); + dec_page_state(nr_swapcache); INC_CACHE_INFO(del_total); } @@ -139,8 +155,7 @@ int add_to_swap(struct page * page) /* * Add it to the swap cache and mark it dirty */ - err = add_to_page_cache(page, &swapper_space, - entry.val, GFP_ATOMIC); + err = __add_to_swap_cache(page, entry); if (pf_flags & PF_MEMALLOC) current->flags |= PF_MEMALLOC; @@ -148,8 +163,7 @@ int add_to_swap(struct page * page) switch (err) { case 0: /* Success */ SetPageUptodate(page); - ClearPageDirty(page); - set_page_dirty(page); + SetPageDirty(page); INC_CACHE_INFO(add_total); return 1; case -EEXIST: @@ -175,15 +189,16 @@ void delete_from_swap_cache(struct page { swp_entry_t entry; + BUG_ON(!PageSwapCache(page)); BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); BUG_ON(PagePrivate(page)); - entry.val = page->index; + entry.val = page->private; - spin_lock(&swapper_space.page_lock); + mapping_wrlock(&swapper_space.page_lock); __delete_from_swap_cache(page); - spin_unlock(&swapper_space.page_lock); + mapping_wrunlock(&swapper_space.page_lock); swap_free(entry); page_cache_release(page); @@ -191,27 +206,10 @@ void delete_from_swap_cache(struct page int move_to_swap_cache(struct page *page, swp_entry_t entry) { - struct address_space *mapping = page->mapping; - int err; - - spin_lock(&swapper_space.page_lock); - 
spin_lock(&mapping->page_lock); - - err = radix_tree_insert(&swapper_space.page_tree, entry.val, page); - if (!err) { - __remove_from_page_cache(page); - ___add_to_page_cache(page, &swapper_space, entry.val); - } - - spin_unlock(&mapping->page_lock); - spin_unlock(&swapper_space.page_lock); - + int err = __add_to_swap_cache(page, entry); if (!err) { - if (!swap_duplicate(entry)) - BUG(); - /* shift page from clean_pages to dirty_pages list */ - BUG_ON(PageDirty(page)); - set_page_dirty(page); + BUG_ON(!swap_duplicate(entry)); + SetPageDirty(page); INC_CACHE_INFO(add_total); } else if (err == -EEXIST) INC_CACHE_INFO(exist_race); @@ -221,29 +219,13 @@ int move_to_swap_cache(struct page *page int move_from_swap_cache(struct page *page, unsigned long index, struct address_space *mapping) { - swp_entry_t entry; - int err; - - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); - BUG_ON(PagePrivate(page)); - - entry.val = page->index; - - spin_lock(&swapper_space.page_lock); - spin_lock(&mapping->page_lock); - - err = radix_tree_insert(&mapping->page_tree, index, page); - if (!err) { - __delete_from_swap_cache(page); - ___add_to_page_cache(page, mapping, index); + int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC); + if (err == -EEXIST) { + INC_CACHE_INFO(exist_race); + err = 0; } - - spin_unlock(&mapping->page_lock); - spin_unlock(&swapper_space.page_lock); - if (!err) { - swap_free(entry); + delete_from_swap_cache(page); /* shift page from clean_pages to dirty_pages list */ ClearPageDirty(page); set_page_dirty(page); @@ -307,11 +289,17 @@ void free_pages_and_swap_cache(struct pa * lock getting page table operations atomic even if we drop the page * lock before returning. */ -struct page * lookup_swap_cache(swp_entry_t entry) +struct page *lookup_swap_cache(swp_entry_t entry) { - struct page *found; + struct page *page; - found = find_get_page(&swapper_space, entry.val); + mapping_rdlock(&swapper_space.page_lock); + page = radix_tree_lookup(&swapper_space.page_tree, entry.val); + if (page) { + page_cache_get(page); + INC_CACHE_INFO(find_success); + } + mapping_rdunlock(&swapper_space.page_lock); /* * Unsafe to assert PageSwapCache and mapping on page found: * if SMP nothing prevents swapoff from deleting this page from @@ -319,9 +307,7 @@ struct page * lookup_swap_cache(swp_entr * that, but no need to change: we _have_ got the right page. */ INC_CACHE_INFO(find_total); - if (found) - INC_CACHE_INFO(find_success); - return found; + return page; } /* @@ -330,7 +316,7 @@ struct page * lookup_swap_cache(swp_entr * A failure return means that either the page allocation failed or that * the swap entry is no longer in use. */ -struct page * read_swap_cache_async(swp_entry_t entry) +struct page *read_swap_cache_async(swp_entry_t entry) { struct page *found_page, *new_page = NULL; int err; @@ -342,7 +328,11 @@ struct page * read_swap_cache_async(swp_ * that would confuse statistics: use find_get_page() * directly. 
*/ - found_page = find_get_page(&swapper_space, entry.val); + mapping_rdlock(&swapper_space.page_lock); + found_page = radix_tree_lookup(&swapper_space.page_tree, entry.val); + if (found_page) + page_cache_get(found_page); + mapping_rdunlock(&swapper_space.page_lock); if (found_page) break; diff -prauN linux-2.6.0-test1/mm/swapfile.c wli-2.6.0-test1-37/mm/swapfile.c --- linux-2.6.0-test1/mm/swapfile.c 2003-07-13 20:31:50.000000000 -0700 +++ wli-2.6.0-test1-37/mm/swapfile.c 2003-07-14 09:10:59.000000000 -0700 @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include @@ -246,16 +246,16 @@ static int exclusive_swap_page(struct pa struct swap_info_struct * p; swp_entry_t entry; - entry.val = page->index; + entry.val = page->private; p = swap_info_get(entry); if (p) { /* Is the only swap cache user the cache itself? */ if (p->swap_map[swp_offset(entry)] == 1) { /* Recheck the page count with the pagecache lock held.. */ - spin_lock(&swapper_space.page_lock); + mapping_rdlock(&swapper_space.page_lock); if (page_count(page) - !!PagePrivate(page) == 2) retval = 1; - spin_unlock(&swapper_space.page_lock); + mapping_rdunlock(&swapper_space.page_lock); } swap_info_put(p); } @@ -314,7 +314,7 @@ int remove_exclusive_swap_page(struct pa if (page_count(page) != 2) /* 2: us + cache */ return 0; - entry.val = page->index; + entry.val = page->private; p = swap_info_get(entry); if (!p) return 0; @@ -323,13 +323,13 @@ int remove_exclusive_swap_page(struct pa retval = 0; if (p->swap_map[swp_offset(entry)] == 1) { /* Recheck the page count with the pagecache lock held.. */ - spin_lock(&swapper_space.page_lock); + mapping_wrlock(&swapper_space.page_lock); if ((page_count(page) == 2) && !PageWriteback(page)) { __delete_from_swap_cache(page); SetPageDirty(page); retval = 1; } - spin_unlock(&swapper_space.page_lock); + mapping_wrunlock(&swapper_space.page_lock); } swap_info_put(p); @@ -352,8 +352,13 @@ void free_swap_and_cache(swp_entry_t ent p = swap_info_get(entry); if (p) { - if (swap_entry_free(p, swp_offset(entry)) == 1) - page = find_trylock_page(&swapper_space, entry.val); + if (swap_entry_free(p, swp_offset(entry)) == 1) { + mapping_rdlock(&swapper_space.page_lock); + page = radix_tree_lookup(&swapper_space.page_tree, entry.val); + if (page && TestSetPageLocked(page)) + page = NULL; + mapping_rdunlock(&swapper_space.page_lock); + } swap_info_put(p); } if (page) { @@ -382,21 +387,21 @@ void free_swap_and_cache(swp_entry_t ent * what to do if a write is requested later. */ /* vma->vm_mm->page_table_lock is held */ -static void +static inline void unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir, - swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) + swp_entry_t entry, struct page *page) { - vma->vm_mm->rss++; get_page(page); - set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); - *pte_chainp = page_add_rmap(page, dir, *pte_chainp); + vm_set_pte(vma, dir, pte_mkold(mk_pte(page, vma->vm_page_prot)), address); + vma->vm_mm->rss++; + page_add_rmap(page, vma, address, 1); swap_free(entry); } /* vma->vm_mm->page_table_lock is held */ static int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long size, unsigned long offset, - swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) + swp_entry_t entry, struct page *page) { pte_t * pte; unsigned long end; @@ -421,8 +426,7 @@ static int unuse_pmd(struct vm_area_stru * Test inline before going to call unuse_pte. 
 	 */
 	if (unlikely(pte_same(*pte, swp_pte))) {
-		unuse_pte(vma, offset + address, pte,
-				entry, page, pte_chainp);
+		unuse_pte(vma, offset + address, pte, entry, page);
 		pte_unmap(pte);
 		return 1;
 	}
@@ -436,7 +440,7 @@ static int unuse_pmd(struct vm_area_stru
 /* vma->vm_mm->page_table_lock is held */
 static int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
 	unsigned long address, unsigned long size,
-	swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
+	swp_entry_t entry, struct page *page)
 {
 	pmd_t * pmd;
 	unsigned long offset, end;
@@ -448,7 +452,7 @@ static int unuse_pgd(struct vm_area_stru
 		pgd_clear(dir);
 		return 0;
 	}
-	pmd = pmd_offset(dir, address);
+	pmd = pmd_offset_map(dir, address);
 	offset = address & PGDIR_MASK;
 	address &= ~PGDIR_MASK;
 	end = address + size;
@@ -457,26 +461,28 @@ static int unuse_pgd(struct vm_area_stru
 	if (address >= end)
 		BUG();
 	do {
-		if (unuse_pmd(vma, pmd, address, end - address,
-				offset, entry, page, pte_chainp))
+		if (unuse_pmd(vma, pmd, address, end - address, offset,
+				entry, page)) {
+			pmd_unmap(pmd);
 			return 1;
+		}
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address && (address < end));
+	pmd_unmap(pmd - 1);
 	return 0;
 }
 
 /* vma->vm_mm->page_table_lock is held */
 static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
-	swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
+	swp_entry_t entry, struct page *page)
 {
 	unsigned long start = vma->vm_start, end = vma->vm_end;
 
 	if (start >= end)
 		BUG();
 	do {
-		if (unuse_pgd(vma, pgdir, start, end - start,
-				entry, page, pte_chainp))
+		if (unuse_pgd(vma, pgdir, start, end - start, entry, page))
 			return 1;
 		start = (start + PGDIR_SIZE) & PGDIR_MASK;
 		pgdir++;
@@ -488,23 +491,20 @@ static int unuse_process(struct mm_struc
 		swp_entry_t entry, struct page* page)
 {
 	struct vm_area_struct* vma;
-	struct pte_chain *pte_chain;
-
-	pte_chain = pte_chain_alloc(GFP_KERNEL);
-	if (!pte_chain)
-		return -ENOMEM;
 
 	/*
 	 * Go through process' page directory.
*/ + if (!rmap_get_cpu()) + return -ENOMEM; spin_lock(&mm->page_table_lock); + put_cpu(); for (vma = mm->mmap; vma; vma = vma->vm_next) { pgd_t * pgd = pgd_offset(mm, vma->vm_start); - if (unuse_vma(vma, pgd, entry, page, &pte_chain)) + if (unuse_vma(vma, pgd, entry, page)) break; } spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); return 0; } @@ -652,8 +652,14 @@ static int try_to_unuse(unsigned int typ if (swcount > 1) { if (start_mm == &init_mm) shmem = shmem_unuse(entry, page); - else + else { retval = unuse_process(start_mm, entry, page); + if (retval) { + unlock_page(page); + page_cache_release(page); + break; + } + } } if (*swap_map > 1) { int set_start_mm = (*swap_map >= swcount); @@ -676,9 +682,7 @@ static int try_to_unuse(unsigned int typ cond_resched(); swcount = *swap_map; - if (swcount <= 1) - ; - else if (mm == &init_mm) { + if (mm == &init_mm) { set_start_mm = 1; shmem = shmem_unuse(entry, page); } else @@ -994,9 +998,10 @@ int page_queue_congested(struct page *pa BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ - bdi = page->mapping->backing_dev_info; - if (PageSwapCache(page)) { - swp_entry_t entry = { .val = page->index }; + if (!PageSwapCache(page)) + bdi = page_mapping(page)->backing_dev_info; + else { + swp_entry_t entry = { .val = page->private }; struct swap_info_struct *sis; sis = get_swap_info_struct(swp_type(entry)); diff -prauN linux-2.6.0-test1/mm/truncate.c wli-2.6.0-test1-37/mm/truncate.c --- linux-2.6.0-test1/mm/truncate.c 2003-07-13 20:38:38.000000000 -0700 +++ wli-2.6.0-test1-37/mm/truncate.c 2003-07-14 08:52:52.000000000 -0700 @@ -18,7 +18,7 @@ static int do_invalidatepage(struct page *page, unsigned long offset) { int (*invalidatepage)(struct page *, unsigned long); - invalidatepage = page->mapping->a_ops->invalidatepage; + invalidatepage = page_mapping(page)->a_ops->invalidatepage; if (invalidatepage == NULL) invalidatepage = block_invalidatepage; return (*invalidatepage)(page, offset); @@ -36,7 +36,7 @@ static inline void truncate_partial_page * becomes anonymous. It will be left on the LRU and may even be mapped into * user pagetables if we're racing with filemap_nopage(). * - * We need to bale out if page->mapping is no longer equal to the original + * We need to bale out if page_mapping(page) is no longer equal to the original * mapping. This happens a) when the VM reclaimed the page while we waited on * its lock, b) when a concurrent invalidate_inode_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. @@ -44,7 +44,7 @@ static inline void truncate_partial_page static void truncate_complete_page(struct address_space *mapping, struct page *page) { - if (page->mapping != mapping) + if (page_mapping(page) != mapping) return; if (PagePrivate(page)) @@ -54,32 +54,31 @@ truncate_complete_page(struct address_sp ClearPageUptodate(page); ClearPageMappedToDisk(page); remove_from_page_cache(page); - page_cache_release(page); /* pagecache ref */ } /* * This is for invalidate_inode_pages(). That function can be called at * any time, and is not supposed to throw away dirty pages. But pages can * be marked dirty at any time too. So we re-check the dirtiness inside - * ->page_lock. That provides exclusion against the __set_page_dirty + * ->page_lock. That provides exclusion against the set_page_dirty * functions. 
*/ static int invalidate_complete_page(struct address_space *mapping, struct page *page) { - if (page->mapping != mapping) + if (page_mapping(page) != mapping) return 0; if (PagePrivate(page) && !try_to_release_page(page, 0)) return 0; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); if (PageDirty(page)) { - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); return 0; } __remove_from_page_cache(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); ClearPageUptodate(page); page_cache_release(page); /* pagecache ref */ return 1; @@ -250,7 +249,7 @@ void invalidate_inode_pages2(struct addr struct page *page = pvec.pages[i]; lock_page(page); - if (page->mapping == mapping) { /* truncate race? */ + if (page_mapping(page) == mapping) { /* truncate race? */ wait_on_page_writeback(page); next = page->index + 1; if (page_mapped(page)) diff -prauN linux-2.6.0-test1/mm/vmalloc.c wli-2.6.0-test1-37/mm/vmalloc.c --- linux-2.6.0-test1/mm/vmalloc.c 2003-07-13 20:34:43.000000000 -0700 +++ wli-2.6.0-test1-37/mm/vmalloc.c 2003-07-14 06:49:00.000000000 -0700 @@ -70,7 +70,7 @@ static void unmap_area_pmd(pgd_t *dir, u return; } - pmd = pmd_offset(dir, address); + pmd = pmd_offset_kernel(dir, address); address &= ~PGDIR_MASK; end = address + size; if (end > PGDIR_SIZE) @@ -159,7 +159,7 @@ int map_vm_area(struct vm_struct *area, dir = pgd_offset_k(address); spin_lock(&init_mm.page_table_lock); do { - pmd_t *pmd = pmd_alloc(&init_mm, dir, address); + pmd_t *pmd = pmd_alloc_kernel(&init_mm, dir, address); if (!pmd) { err = -ENOMEM; break; diff -prauN linux-2.6.0-test1/mm/vmscan.c wli-2.6.0-test1-37/mm/vmscan.c --- linux-2.6.0-test1/mm/vmscan.c 2003-07-13 20:30:43.000000000 -0700 +++ wli-2.6.0-test1-37/mm/vmscan.c 2003-07-14 08:56:31.000000000 -0700 @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include @@ -172,23 +172,23 @@ static int shrink_slab(long scanned, uns return 0; } -/* Must be called with page's pte_chain_lock held. */ +/* Must be called with page's rmap_lock held. */ static inline int page_mapping_inuse(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping; /* Page is in somebody's page tables. */ if (page_mapped(page)) return 1; - /* XXX: does this happen ? */ - if (!mapping) - return 0; - /* Be more reluctant to reclaim swapcache than pagecache */ if (PageSwapCache(page)) return 1; + mapping = page_mapping(page); + if (!mapping) + return 0; + /* File is mmap'd by somebody. */ if (!list_empty(&mapping->i_mmap)) return 1; @@ -253,14 +253,14 @@ shrink_list(struct list_head *page_list, if (PageWriteback(page)) goto keep_locked; - pte_chain_lock(page); + rmap_lock(page); if (page_referenced(page) && page_mapping_inuse(page)) { /* In active use or really unfreeable. Activate it. */ - pte_chain_unlock(page); + rmap_unlock(page); goto activate_locked; } - mapping = page->mapping; + mapping = page_mapping(page); #ifdef CONFIG_SWAP /* @@ -269,12 +269,14 @@ shrink_list(struct list_head *page_list, * * XXX: implement swap clustering ? 
*/ - if (page_mapped(page) && !mapping && !PagePrivate(page)) { - pte_chain_unlock(page); + if (PageSwapCache(page)) + mapping = &swapper_space; + else if (PageAnon(page)) { + rmap_unlock(page); if (!add_to_swap(page)) goto activate_locked; - pte_chain_lock(page); - mapping = page->mapping; + rmap_lock(page); + mapping = &swapper_space; } #endif /* CONFIG_SWAP */ @@ -285,16 +287,16 @@ shrink_list(struct list_head *page_list, if (page_mapped(page) && mapping) { switch (try_to_unmap(page)) { case SWAP_FAIL: - pte_chain_unlock(page); + rmap_unlock(page); goto activate_locked; case SWAP_AGAIN: - pte_chain_unlock(page); + rmap_unlock(page); goto keep_locked; case SWAP_SUCCESS: ; /* try to free the page below */ } } - pte_chain_unlock(page); + rmap_unlock(page); /* * If the page is dirty, only perform writeback if that write @@ -324,7 +326,7 @@ shrink_list(struct list_head *page_list, goto keep_locked; if (!may_write_to_queue(mapping->backing_dev_info)) goto keep_locked; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); if (test_clear_page_dirty(page)) { int res; struct writeback_control wbc = { @@ -334,8 +336,9 @@ shrink_list(struct list_head *page_list, .for_reclaim = 1, }; - list_move(&page->list, &mapping->locked_pages); - spin_unlock(&mapping->page_lock); + if (!PageSwapCache(page)) + list_move(&page->list, &mapping->locked_pages); + mapping_wrunlock(&mapping->page_lock); SetPageReclaim(page); res = mapping->a_ops->writepage(page, &wbc); @@ -350,7 +353,7 @@ shrink_list(struct list_head *page_list, } goto keep; } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); } /* @@ -367,7 +370,7 @@ shrink_list(struct list_head *page_list, * try_to_release_page() will discover that cleanness and will * drop the buffers and mark the page clean - it can be freed. * - * Rarely, pages can have buffers and no ->mapping. These are + * Rarely, pages can have buffers and no page_mapping(). These are * the pages which were not successfully invalidated in * truncate_complete_page(). We try to drop those buffers here * and if that worked, and the page is no longer mapped into @@ -384,7 +387,7 @@ shrink_list(struct list_head *page_list, if (!mapping) goto keep_locked; /* truncate got there first */ - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); /* * The non-racy check for busy page. It is critical to check @@ -392,15 +395,15 @@ shrink_list(struct list_head *page_list, * not in use by anybody. 
(pagecache + us == 2) */ if (page_count(page) != 2 || PageDirty(page)) { - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); goto keep_locked; } #ifdef CONFIG_SWAP if (PageSwapCache(page)) { - swp_entry_t swap = { .val = page->index }; + swp_entry_t swap = { .val = page->private }; __delete_from_swap_cache(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); swap_free(swap); __put_page(page); /* The pagecache ref */ goto free_it; @@ -408,7 +411,7 @@ shrink_list(struct list_head *page_list, #endif /* CONFIG_SWAP */ __remove_from_page_cache(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); __put_page(page); free_it: @@ -628,13 +631,13 @@ refill_inactive_zone(struct zone *zone, page = list_entry(l_hold.prev, struct page, lru); list_del(&page->lru); if (page_mapped(page)) { - pte_chain_lock(page); + rmap_lock(page); if (page_mapped(page) && page_referenced(page)) { - pte_chain_unlock(page); + rmap_unlock(page); list_add(&page->lru, &l_active); continue; } - pte_chain_unlock(page); + rmap_unlock(page); if (!reclaim_mapped) { list_add(&page->lru, &l_active); continue; @@ -644,7 +647,7 @@ refill_inactive_zone(struct zone *zone, * FIXME: need to consider page_count(page) here if/when we * reap orphaned pages via the LRU (Daniel's locking stuff) */ - if (total_swap_pages == 0 && !page->mapping && + if (total_swap_pages == 0 && !page_mapping(page) && !PagePrivate(page)) { list_add(&page->lru, &l_active); continue; @@ -799,6 +802,10 @@ shrink_caches(struct zone *classzone, in } return ret; } + +#ifndef HAVE_ARCH_PAGETABLE_CACHE +#define shrink_pagetable_cache(gfp_mask) do { } while (0) +#endif /* * This is the main entry point to direct page reclaim. @@ -848,6 +855,9 @@ int try_to_free_pages(struct zone *cz, */ wakeup_bdflush(total_scanned); + /* shoot down some pagetable caches before napping */ + shrink_pagetable_cache(gfp_mask); + /* Take a nap, wait for some writeback to complete */ blk_congestion_wait(WRITE, HZ/10); if (cz - cz->zone_pgdat->node_zones < ZONE_HIGHMEM) { @@ -930,6 +940,7 @@ static int balance_pgdat(pg_data_t *pgda } if (all_zones_ok) break; + shrink_pagetable_cache(GFP_HIGHUSER); blk_congestion_wait(WRITE, HZ/10); } return nr_pages - to_free; @@ -956,11 +967,11 @@ int kswapd(void *p) struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; - unsigned long cpumask; + cpumask_t cpumask; daemonize("kswapd%d", pgdat->node_id); cpumask = node_to_cpumask(pgdat->node_id); - if (cpumask) + if (!cpus_empty(cpumask)) set_cpus_allowed(tsk, cpumask); current->reclaim_state = &reclaim_state; diff -prauN linux-2.6.0-test1/net/ipv4/netfilter/ipt_owner.c wli-2.6.0-test1-37/net/ipv4/netfilter/ipt_owner.c --- linux-2.6.0-test1/net/ipv4/netfilter/ipt_owner.c 2003-07-13 20:38:06.000000000 -0700 +++ wli-2.6.0-test1-37/net/ipv4/netfilter/ipt_owner.c 2003-07-14 09:45:14.000000000 -0700 @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -26,7 +27,7 @@ match_comm(const struct sk_buff *skb, co task_lock(p); files = p->files; if(files) { - spin_lock(&files->file_lock); + rcu_read_lock(); for (i=0; i < files->max_fds; i++) { if (fcheck_files(files, i) == skb->sk->sk_socket->file) { @@ -36,7 +37,7 @@ match_comm(const struct sk_buff *skb, co return 1; } } - spin_unlock(&files->file_lock); + rcu_read_unlock(); } task_unlock(p); } while_each_thread(g, p); @@ -92,14 +93,14 @@ match_sid(const struct sk_buff *skb, pid task_lock(p); files = p->files; if (files) { - spin_lock(&files->file_lock); + 
rcu_read_lock();
 		for (i=0; i < files->max_fds; i++) {
 			if (fcheck_files(files, i) == file) {
 				found = 1;
 				break;
 			}
 		}
-		spin_unlock(&files->file_lock);
+		rcu_read_unlock();
 	}
 	task_unlock(p);
 	if (found)
diff -prauN linux-2.6.0-test1/net/ipv6/netfilter/ip6t_owner.c wli-2.6.0-test1-37/net/ipv6/netfilter/ip6t_owner.c
--- linux-2.6.0-test1/net/ipv6/netfilter/ip6t_owner.c	2003-07-13 20:37:15.000000000 -0700
+++ wli-2.6.0-test1-37/net/ipv6/netfilter/ip6t_owner.c	2003-07-14 09:45:14.000000000 -0700
@@ -6,6 +6,7 @@
 #include
 #include
 #include
+#include <linux/rcupdate.h>
 #include
 #include
@@ -29,7 +30,7 @@ match_pid(const struct sk_buff *skb, pid
 	task_lock(p);
 	files = p->files;
 	if(files) {
-		spin_lock(&files->file_lock);
+		rcu_read_lock();
 		for (i=0; i < files->max_fds; i++) {
 			if (fcheck_files(files, i) == skb->sk->sk_socket->file) {
-				spin_unlock(&files->file_lock);
+				rcu_read_unlock();
@@ -38,7 +39,7 @@ match_pid(const struct sk_buff *skb, pid
 				return 1;
 			}
 		}
-		spin_unlock(&files->file_lock);
+		rcu_read_unlock();
 	}
 	task_unlock(p);
 out:
@@ -62,14 +63,14 @@ match_sid(const struct sk_buff *skb, pid
 	task_lock(p);
 	files = p->files;
 	if (files) {
-		spin_lock(&files->file_lock);
+		rcu_read_lock();
 		for (i=0; i < files->max_fds; i++) {
 			if (fcheck_files(files, i) == file) {
 				found = 1;
 				break;
 			}
 		}
-		spin_unlock(&files->file_lock);
+		rcu_read_unlock();
 	}
 	task_unlock(p);
 	if (found)
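
For readers following the mm/rmap.c rework above, the rmap_chain bookkeeping is easy to exercise in isolation. Below is a minimal userspace sketch of the slot-and-duplicate-suppression logic that add_rmap_address() implements; it is illustrative only, not kernel code: NRSLOT and NOADDR here are stand-in values, and malloc() stands in for the rmap_chain slab cache.

/*
 * Userspace model of the patch's rmap_chain: fixed-size blocks of
 * address slots chained together, NOADDR marking an empty slot.
 */
#include <stdio.h>
#include <stdlib.h>

#define NRSLOT 4		/* stand-in; the kernel value may differ */
#define NOADDR (~0UL)		/* stand-in sentinel for "no address" */

struct rmap_chain {
	unsigned long slot[NRSLOT];
	struct rmap_chain *next;
};

static struct rmap_chain *alloc_chain(void)
{
	struct rmap_chain *c = malloc(sizeof(*c));
	int i;

	if (!c)
		abort();
	for (i = 0; i < NRSLOT; i++)
		c->slot[i] = NOADDR;	/* all slots start empty */
	c->next = NULL;
	return c;
}

/* Mirrors add_rmap_address(): skip duplicates, fill the first free slot. */
static void add_address(struct rmap_chain **head, unsigned long address)
{
	struct rmap_chain *c;
	int i;

	if (!*head)
		*head = alloc_chain();
	for (c = *head; ; c = c->next) {
		for (i = 0; i < NRSLOT; i++) {
			if (c->slot[i] == NOADDR) {
				c->slot[i] = address;	/* free slot: claim it */
				return;
			}
			if (c->slot[i] == address)
				return;			/* duplicate: drop it */
		}
		if (!c->next)
			c->next = alloc_chain();	/* all full: extend */
	}
}

int main(void)
{
	struct rmap_chain *head = NULL, *c, *next;
	unsigned long a;
	int i;

	for (a = 0x1000; a <= 0x6000; a += 0x1000)
		add_address(&head, a);
	add_address(&head, 0x2000);	/* duplicate, suppressed */

	for (c = head; c; c = c->next)
		for (i = 0; i < NRSLOT && c->slot[i] != NOADDR; i++)
			printf("0x%lx\n", c->slot[i]);

	for (c = head; c; c = next) {	/* analogous to clear_page_chained() */
		next = c->next;
		free(c);
	}
	return 0;
}

Because free slots only ever appear at the tail of the last block, the duplicate scan and the free-slot search share a single pass, which is the same invariant next_rmap_address() depends on when it walks the chain.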