diff -purN -X /home/mbligh/.diff.exclude 000-virgin/Documentation/filesystems/proc.txt 999-mjb/Documentation/filesystems/proc.txt --- 000-virgin/Documentation/filesystems/proc.txt 2003-10-01 11:46:27.000000000 -0700 +++ 999-mjb/Documentation/filesystems/proc.txt 2003-10-02 16:39:40.000000000 -0700 @@ -38,6 +38,7 @@ Table of Contents 2.8 /proc/sys/net/ipv4 - IPV4 settings 2.9 Appletalk 2.10 IPX + 2.11 /proc/sys/sched - scheduler tunables ------------------------------------------------------------------------------ Preface @@ -1805,6 +1806,104 @@ The /proc/net/ipx_route table holds a gives the destination network, the router node (or Directly) and the network address of the router (or Connected) for internal networks. +2.11 /proc/sys/sched - scheduler tunables +----------------------------------------- + +Useful knobs for tuning the scheduler live in /proc/sys/sched. + +child_penalty +------------- + +Percentage of the parent's sleep_avg that children inherit. sleep_avg is +a running average of the time a process spends sleeping. Tasks with high +sleep_avg values are considered interactive and given a higher dynamic +priority and a larger timeslice. You typically want this to be some value just +under 100. + +exit_weight +----------- + +When a CPU hog task exits, its parent's sleep_avg is reduced by a factor of +exit_weight against the exiting task's sleep_avg. + +interactive_delta +----------------- + +If a task is "interactive" it is reinserted into the active array after it +has expired its timeslice, instead of being inserted into the expired array. +How "interactive" a task must be in order to be deemed interactive is a +function of its nice value. This interactive limit is scaled linearly by nice +value and is offset by the interactive_delta. + +max_sleep_avg +------------- + +max_sleep_avg is the largest value (in ms) stored for a task's running sleep +average. The larger this value, the longer a task needs to sleep to be +considered interactive (maximum interactive bonus is a function of +max_sleep_avg). + +max_timeslice +------------- + +Maximum timeslice, in milliseconds. This is the value given to tasks of the +highest dynamic priority. + +min_timeslice +------------- + +Minimum timeslice, in milliseconds. This is the value given to tasks of the +lowest dynamic priority. Every task gets at least this slice of the processor +per array switch. + +parent_penalty +-------------- + +Percentage of the parent's sleep_avg that it retains across a fork(). +sleep_avg is a running average of the time a process spends sleeping. Tasks +with high sleep_avg values are considered interactive and given a higher +dynamic priority and a larger timeslice. Normally, this value is 100 and thus +tasks retain their sleep_avg on fork. If you want to punish interactive +tasks for forking, set this below 100. + +prio_bonus_ratio +---------------- + +Middle percentage of the priority range that tasks can receive as a dynamic +priority. The default value of 25% ensures that nice values at the +extremes are still enforced. For example, nice +19 interactive tasks will +never be able to preempt a nice 0 CPU hog. Setting this higher will increase +the size of the priority range the tasks can receive as a bonus. Setting +this lower will decrease this range, making the interactivity bonus less +apparent and user nice values more applicable. + +starvation_limit +---------------- + +Sufficiently interactive tasks are reinserted into the active array when they +run out of timeslice. Normally, tasks are inserted into the expired array. +Reinserting interactive tasks into the active array allows them to remain +runnable, which is important to interactive performance. This could starve +expired tasks, however, since the interactive task could prevent the array +switch. To prevent starving the tasks on the expired array for too long, the +starvation_limit is the longest (in ms) we will let the expired array starve +at the expense of reinserting interactive tasks back into the active array. Higher +values here give more preference to running interactive tasks, at the expense +of expired tasks. Lower values provide fairer scheduling behavior, at the +expense of interactivity. The units are in milliseconds. + +idle_node_rebalance_ratio +------------------------- + +On NUMA machines, we normally rebalance within nodes, but we also rebalance +globally every N idle rebalance ticks, where N = idle_node_rebalance_ratio. + +busy_node_rebalance_ratio +------------------------- + +On NUMA machines, we normally rebalance within nodes, but we also rebalance +globally every N busy rebalance ticks, where N = busy_node_rebalance_ratio. + ------------------------------------------------------------------------------ Summary ------------------------------------------------------------------------------ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/Makefile 999-mjb/Makefile --- 000-virgin/Makefile 2003-10-01 11:47:28.000000000 -0700 +++ 999-mjb/Makefile 2003-10-02 16:54:34.000000000 -0700 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 0 -EXTRAVERSION = -test6 +EXTRAVERSION = -test6-mjb1 # *DOCUMENTATION* # To see a list of typical targets execute "make help" @@ -156,6 +156,8 @@ HOSTCXX = g++ HOSTCFLAGS = -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer HOSTCXXFLAGS = -O2 +GCOV_FLAGS = -fprofile-arcs -ftest-coverage + # That's our default target when none is given on the command line # Note that 'modules' will be added as a prerequisite as well, @@ -286,6 +288,8 @@ export VERSION PATCHLEVEL SUBLEVEL EXTRA export CPPFLAGS NOSTDINC_FLAGS OBJCOPYFLAGS LDFLAGS export CFLAGS CFLAGS_KERNEL CFLAGS_MODULE export AFLAGS AFLAGS_KERNEL AFLAGS_MODULE +export CFLAGS_NOGCOV + export MODVERDIR := .tmp_versions @@ -655,6 +659,11 @@ depend dep: # --------------------------------------------------------------------------- # Modules +CFLAGS_NOGCOV := $(CFLAGS) +ifdef CONFIG_GCOV_ALL +CFLAGS += $(GCOV_FLAGS) +endif + ifdef CONFIG_MODULES # By default, build modules as well @@ -777,6 +786,7 @@ clean: archclean $(clean-dirs) $(call cmd,rmclean) @find . $(RCS_FIND_IGNORE) \ \( -name '*.[oas]' -o -name '*.ko' -o -name '.*.cmd' \ + -o -name '*.bb' -o -name '*.bbg' -o -name '*.da' \ -o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \) \ -type f -print | xargs rm -f diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/Kconfig 999-mjb/arch/i386/Kconfig --- 000-virgin/arch/i386/Kconfig 2003-10-01 11:47:33.000000000 -0700 +++ 999-mjb/arch/i386/Kconfig 2003-10-02 16:43:03.000000000 -0700 @@ -453,17 +453,17 @@ config NR_CPUS This is purely to save memory - each supported CPU adds approximately eight kilobytes to the kernel image. -config PREEMPT - bool "Preemptible Kernel" - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load.
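As a quick illustration of the /proc/sys/sched interface documented above, here is a minimal userspace sketch (not part of the patch) that reads two of the tunables and adjusts one of them. It assumes a kernel built with this patch and the file names documented above; the value written is arbitrary and writes require root.

#include <stdio.h>

static long read_tunable(const char *name)
{
	char path[128];
	long val = -1;
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/sched/%s", name);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%ld", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

static int write_tunable(const char *name, long val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/sched/%s", name);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%ld\n", val);
	return fclose(f);
}

int main(void)
{
	printf("min_timeslice = %ld ms\n", read_tunable("min_timeslice"));
	printf("max_timeslice = %ld ms\n", read_tunable("max_timeslice"));

	/* Example only: raise max_timeslice; 300 is an arbitrary value. */
	if (write_tunable("max_timeslice", 300) < 0)
		perror("write max_timeslice");
	return 0;
}

The same files can of course also be read and written from a shell with cat and echo.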
- - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure. +# config PREEMPT +# bool "Preemptible Kernel" +# help +# This option reduces the latency of the kernel when reacting to +# real-time or interactive events by allowing a low priority process to +# be preempted even if it is in kernel mode executing a system call. +# This allows applications to run more reliably even when the system is +# under load. +# +# Say Y here if you are building a kernel for a desktop, embedded +# or real-time system. Say N if you are unsure. config X86_UP_APIC bool "Local APIC support on uniprocessors" if !SMP @@ -682,6 +682,44 @@ config HIGHMEM64G endchoice +choice + help + On i386, a process can only virtually address 4GB of memory. This + lets you select how much of that virtual space you would like to + devote to userspace, and how much to the kernel. + + Some userspace programs would like to address as much as possible and + have few demands of the kernel other than it get out of the way. These + users may opt to use the 3.5GB option to give their userspace program + as much room as possible. Due to alignment issues imposed by PAE, + the "3.5GB" option is unavailable if "64GB" high memory support is + enabled. + + Other users (especially those who use PAE) may be running out of + ZONE_NORMAL memory. Those users may benefit from increasing the + kernel's virtual address space size by taking it away from userspace, + which may not need all of its space. An indicator that this is + happening is when /proc/meminfo's "LowFree:" is a small percentage of + "LowTotal:" while "HighFree:" is very large. + + If unsure, say "3GB". + prompt "User address space size" + default 1GB + +config 05GB + bool "3.5 GB" + depends on !HIGHMEM64G + +config 1GB + bool "3 GB" + +config 2GB + bool "2 GB" + +config 3GB + bool "1 GB" +endchoice + config HIGHMEM bool depends on HIGHMEM64G || HIGHMEM4G @@ -699,6 +737,11 @@ config NUMA default n if X86_PC default y if (X86_NUMAQ || X86_SUMMIT) +config NUMA_SCHED + bool "Numa Scheduling Support" + depends on NUMA + default y + # Need comments to help the hapless user trying to turn on NUMA support comment "NUMA (NUMA-Q) requires SMP, 64GB highmem support" depends on X86_NUMAQ && (!HIGHMEM64G || !SMP) @@ -784,6 +827,33 @@ config MTRR See for more information. +choice + help + This is unrelated to your processor's speed. This variable alters + how often the system is asked to generate timer interrupts. A larger + value can lead to a more responsive system, but also causes extra + overhead from the increased number of context switches. + + If in doubt, leave it at the default of 1000. + + prompt "Kernel HZ" + default 1000HZ + +config 100HZ + bool "100 Hz" + +config 1000HZ + bool "1000 Hz" +endchoice + +config IRQBALANCE + bool "Enable kernel irq balancing" + depends on SMP + default y + help + The default of yes will allow the kernel to do irq load balancing. + Saying no will keep the kernel from doing irq load balancing. + config HAVE_DEC_LOCK bool depends on (SMP || PREEMPT) && X86_CMPXCHG @@ -1168,6 +1238,36 @@ source "drivers/usb/Kconfig" source "arch/i386/oprofile/Kconfig" +menu "GCOV coverage profiling" + +config GCOV_PROFILE + bool "GCOV coverage profiling" + ---help--- + Provide infrastructure for coverage support for the kernel. This + will not compile the kernel by default with the necessary flags. + To obtain coverage information for the entire kernel, one should + enable the subsequent option (Profile entire kernel).
If only + particular files or directories of the kernel are desired, then + one must provide the following compile options for such targets: + "-fprofile-arcs -ftest-coverage" in the CFLAGS. To obtain + access to the coverage data one must insmod the gcov-proc kernel + module. + +config GCOV_ALL + bool "GCOV_ALL" + depends on GCOV_PROFILE + ---help--- + If you say Y here, it will compile the entire kernel with coverage + option enabled. + +config GCOV_PROC + tristate "gcov-proc module" + depends on GCOV_PROFILE && PROC_FS + ---help--- + This is the gcov-proc module that exposes gcov data through the + /proc filesystem + +endmenu menu "Kernel hacking" @@ -1214,6 +1314,26 @@ config MAGIC_SYSRQ keys are documented in . Don't say Y unless you really know what this hack does. +config X86_EARLY_PRINTK + bool "Early console support" + default n + depends on DEBUG_KERNEL + help + Write kernel log output directly into the VGA buffer or serial port. + This is useful for kernel debugging when your machine crashes very + early before the console code is initialized. For normal operation + it is not recommended because it looks ugly and doesn't cooperate + with klogd/syslogd or the X server.You should normally N here, + unless you want to debug such a crash. + + Syntax: earlyprintk=vga + earlyprintk=serial[,ttySn[,baudrate]] + Append ,keep to not disable it when the real console takes over. + Only vga or serial at a time, not both. + Currently only ttyS0 and ttyS1 are supported. + Interaction with the standard serial driver is not very good. + The VGA output is eventually overwritten by the real console. + config DEBUG_SPINLOCK bool "Spinlock debugging" depends on DEBUG_KERNEL @@ -1231,6 +1351,15 @@ config DEBUG_PAGEALLOC This results in a large slowdown, but helps to find certain types of memory corruptions. +config SPINLINE + bool "Spinlock inlining" + depends on DEBUG_KERNEL + help + This will change spinlocks from out of line to inline, making them + account cost to the callers in readprofile, rather than the lock + itself (as ".text.lock.filename"). This can be helpful for finding + the callers of locks. + config DEBUG_HIGHMEM bool "Highmem debugging" depends on DEBUG_KERNEL && HIGHMEM @@ -1253,6 +1382,14 @@ config DEBUG_SPINLOCK_SLEEP If you say Y here, various routines which may sleep will become very noisy if they are called with a spinlock held. 
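The GCOV options above are backed by the gcov-proc module added later in this patch (drivers/gcov/gcov-proc.c): any write to /proc/gcov/vmlinux clears all coverage counters, and a read returns the combined trace data for the instrumented kernel. Here is a minimal userspace sketch (not part of the patch) of that reset/measure/dump cycle, assuming the module is loaded; the output file name is arbitrary.

#include <stdio.h>

int main(void)
{
	FILE *in, *out;
	char buf[4096];
	size_t n;

	/* Any write to /proc/gcov/vmlinux resets all coverage counters. */
	out = fopen("/proc/gcov/vmlinux", "w");
	if (!out) {
		perror("/proc/gcov/vmlinux");
		return 1;
	}
	fputs("0\n", out);
	fclose(out);

	/* ... run the workload to be measured here ... */

	/* Reading the same file dumps the aggregate trace data. */
	in = fopen("/proc/gcov/vmlinux", "r");
	out = fopen("vmlinux.trace", "w");
	if (!in || !out) {
		perror("open");
		return 1;
	}
	while ((n = fread(buf, 1, sizeof(buf), in)) > 0)
		fwrite(buf, 1, n, out);
	fclose(in);
	fclose(out);
	return 0;
}

The per-file entries that gcov-proc creates under /proc/gcov/, mirroring the kernel source tree, can be read and reset in the same way.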
+config LOCKMETER + bool "Kernel lock metering" + depends on SMP + help + Say Y to enable kernel lock metering, which adds overhead to SMP + locks, but allows you to see various statistics using the lockstat + command + config FRAME_POINTER bool "Compile the kernel with frame pointers" help diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/Makefile 999-mjb/arch/i386/Makefile --- 000-virgin/arch/i386/Makefile 2003-10-01 11:47:33.000000000 -0700 +++ 999-mjb/arch/i386/Makefile 2003-10-02 16:39:38.000000000 -0700 @@ -98,6 +98,7 @@ drivers-$(CONFIG_PM) += arch/i386/powe CFLAGS += $(mflags-y) AFLAGS += $(mflags-y) +AFLAGS_vmlinux.lds.o += -imacros $(TOPDIR)/include/asm-i386/page.h boot := arch/i386/boot diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/boot/compressed/Makefile 999-mjb/arch/i386/boot/compressed/Makefile --- 000-virgin/arch/i386/boot/compressed/Makefile 2003-03-20 11:25:38.000000000 -0800 +++ 999-mjb/arch/i386/boot/compressed/Makefile 2003-10-02 16:43:03.000000000 -0700 @@ -7,6 +7,7 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o EXTRA_AFLAGS := -traditional +CFLAGS := $(CFLAGS_NOGCOV) LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup_32 $(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/kernel/apic.c 999-mjb/arch/i386/kernel/apic.c --- 000-virgin/arch/i386/kernel/apic.c 2003-10-01 11:46:30.000000000 -0700 +++ 999-mjb/arch/i386/kernel/apic.c 2003-10-02 16:41:02.000000000 -0700 @@ -1017,7 +1017,7 @@ int setup_profiling_timer(unsigned int m * multiplier is 1 and it can be changed by writing the new multiplier * value into /proc/profile. */ - +extern void calc_load_cpu(int cpu); inline void smp_local_timer_interrupt(struct pt_regs * regs) { int cpu = smp_processor_id(); @@ -1045,6 +1045,7 @@ inline void smp_local_timer_interrupt(st #ifdef CONFIG_SMP update_process_times(user_mode(regs)); + calc_load_cpu(cpu); #endif } diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/kernel/entry.S 999-mjb/arch/i386/kernel/entry.S --- 000-virgin/arch/i386/kernel/entry.S 2003-10-01 11:40:40.000000000 -0700 +++ 999-mjb/arch/i386/kernel/entry.S 2003-10-02 16:41:14.000000000 -0700 @@ -829,7 +829,7 @@ ENTRY(sys_call_table) .long sys_getdents64 /* 220 */ .long sys_fcntl64 .long sys_ni_syscall /* reserved for TUX */ - .long sys_ni_syscall + .long sys_mbind .long sys_gettid .long sys_readahead /* 225 */ .long sys_setxattr diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/kernel/head.S 999-mjb/arch/i386/kernel/head.S --- 000-virgin/arch/i386/kernel/head.S 2003-10-01 11:40:40.000000000 -0700 +++ 999-mjb/arch/i386/kernel/head.S 2003-10-02 16:43:03.000000000 -0700 @@ -487,3 +487,24 @@ ENTRY(cpu_gdt_table) .fill (NR_CPUS-1)*GDT_ENTRIES,8,0 /* other CPU's GDT */ #endif +#ifdef CONFIG_GCOV_PROFILE +/* + * The .ctors-section contains a list of pointers to constructor + * functions which are used to initialize gcov structures. + * + * Because there is no NULL at the end of the constructor list + * in the kernel we need the addresses of both the constructor + * as well as the destructor list which are supposed to be + * adjacent. 
+ */ + +.section ".ctors","aw" +.globl __CTOR_LIST__ +.type __CTOR_LIST__,@object +__CTOR_LIST__: +.section ".dtors","aw" +.globl __DTOR_LIST__ +.type __DTOR_LIST__,@object +__DTOR_LIST__: +#endif + diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/kernel/io_apic.c 999-mjb/arch/i386/kernel/io_apic.c --- 000-virgin/arch/i386/kernel/io_apic.c 2003-10-01 11:47:33.000000000 -0700 +++ 999-mjb/arch/i386/kernel/io_apic.c 2003-10-02 16:40:46.000000000 -0700 @@ -272,7 +272,7 @@ static void set_ioapic_affinity(unsigned spin_unlock_irqrestore(&ioapic_lock, flags); } -#if defined(CONFIG_SMP) +#if defined(CONFIG_IRQBALANCE) # include /* kernel_thread() */ # include /* kstat */ # include /* kmalloc() */ @@ -670,8 +670,6 @@ static int __init irqbalance_disable(cha __setup("noirqbalance", irqbalance_disable); -static void set_ioapic_affinity(unsigned int irq, cpumask_t mask); - static inline void move_irq(int irq) { /* note - we hold the desc->lock */ @@ -683,9 +681,11 @@ static inline void move_irq(int irq) __initcall(balanced_irq_init); -#else /* !SMP */ +#else /* !CONFIG_IRQBALANCE */ static inline void move_irq(int irq) { } +#endif /* CONFIG_IRQBALANCE */ +#ifndef CONFIG_SMP void send_IPI_self(int vector) { unsigned int cfg; @@ -700,7 +700,7 @@ void send_IPI_self(int vector) */ apic_write_around(APIC_ICR, cfg); } -#endif /* defined(CONFIG_SMP) */ +#endif /* !CONFIG_SMP */ /* diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/kernel/vmlinux.lds.S 999-mjb/arch/i386/kernel/vmlinux.lds.S --- 000-virgin/arch/i386/kernel/vmlinux.lds.S 2003-10-01 11:40:41.000000000 -0700 +++ 999-mjb/arch/i386/kernel/vmlinux.lds.S 2003-10-02 16:39:38.000000000 -0700 @@ -10,7 +10,7 @@ ENTRY(startup_32) jiffies = jiffies_64; SECTIONS { - . = 0xC0000000 + 0x100000; + . 
= __PAGE_OFFSET + 0x100000; /* read-only */ _text = .; /* Text and read-only data */ .text : { diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/lib/dec_and_lock.c 999-mjb/arch/i386/lib/dec_and_lock.c --- 000-virgin/arch/i386/lib/dec_and_lock.c 2002-12-09 18:45:50.000000000 -0800 +++ 999-mjb/arch/i386/lib/dec_and_lock.c 2003-10-02 16:39:44.000000000 -0700 @@ -10,6 +10,7 @@ #include #include +#ifndef ATOMIC_DEC_AND_LOCK int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) { int counter; @@ -38,3 +39,5 @@ slow_path: spin_unlock(lock); return 0; } +#endif + diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/mm/hugetlbpage.c 999-mjb/arch/i386/mm/hugetlbpage.c --- 000-virgin/arch/i386/mm/hugetlbpage.c 2003-10-01 11:47:33.000000000 -0700 +++ 999-mjb/arch/i386/mm/hugetlbpage.c 2003-10-02 16:42:17.000000000 -0700 @@ -61,6 +61,27 @@ static struct page *alloc_fresh_huge_pag void free_huge_page(struct page *page); +#ifdef CONFIG_NUMA + +static inline void huge_inc_rss(struct mm_struct *mm, struct page *page) +{ + mm->rss += (HPAGE_SIZE / PAGE_SIZE); + mm->pernode_rss[page_to_nid(page)] += (HPAGE_SIZE / PAGE_SIZE); +} + +static inline void huge_dec_rss(struct mm_struct *mm, struct page *page) +{ + mm->rss -= (HPAGE_SIZE / PAGE_SIZE); + mm->pernode_rss[page_to_nid(page)] -= (HPAGE_SIZE / PAGE_SIZE); +} + +#else /* !CONFIG_NUMA */ + +#define huge_inc_rss(mm, page) ((mm)->rss += (HPAGE_SIZE / PAGE_SIZE)) +#define huge_dec_rss(mm, page) ((mm)->rss -= (HPAGE_SIZE / PAGE_SIZE)) + +#endif /* CONFIG_NUMA */ + static struct page *alloc_hugetlb_page(void) { int i; @@ -105,7 +126,7 @@ static void set_huge_pte(struct mm_struc { pte_t entry; - mm->rss += (HPAGE_SIZE / PAGE_SIZE); + huge_inc_rss(mm, page); if (write_access) { entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); @@ -145,7 +166,7 @@ int copy_hugetlb_page_range(struct mm_st ptepage = pte_page(entry); get_page(ptepage); set_pte(dst_pte, entry); - dst->rss += (HPAGE_SIZE / PAGE_SIZE); + huge_inc_rss(dst, ptepage); addr += HPAGE_SIZE; } return 0; @@ -314,8 +335,8 @@ void unmap_hugepage_range(struct vm_area page = pte_page(*pte); huge_page_release(page); pte_clear(pte); + huge_dec_rss(mm, page); } - mm->rss -= (end - start) >> PAGE_SHIFT; flush_tlb_range(vma, start, end); } diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/ppc/Kconfig 999-mjb/arch/ppc/Kconfig --- 000-virgin/arch/ppc/Kconfig 2003-10-01 11:47:34.000000000 -0700 +++ 999-mjb/arch/ppc/Kconfig 2003-10-02 16:43:03.000000000 -0700 @@ -1288,6 +1288,36 @@ source "drivers/usb/Kconfig" source "lib/Kconfig" +menu "GCOV coverage profiling" + +config GCOV_PROFILE + bool "GCOV coverage profiling" + ---help--- + Provide infrastructure for coverage support for the kernel. This + will not compile the kernel by default with the necessary flags. + To obtain coverage information for the entire kernel, one should + enable the subsequent option (Profile entire kernel). If only + particular files or directories of the kernel are desired, then + one must provide the following compile options for such targets: + "-fprofile-arcs -ftest-coverage" in the CFLAGS. To obtain + access to the coverage data one must insmod the gcov-prof kernel + module. + +config GCOV_ALL + bool "GCOV_ALL" + depends on GCOV_PROFILE + ---help--- + If you say Y here, it will compile the entire kernel with coverage + option enabled. 
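The head.S hunks in this patch (i386 above, ppc/ppc64/x86_64 below) add empty __CTOR_LIST__ and __DTOR_LIST__ markers, and gcov-proc.c later calls an external do_global_ctors() to run the constructors that register each object file's coverage data. That implementation is not part of this section; the sketch below is only an illustration, with a made-up function name, of the walk the comments describe: because the kernel's .ctors section has no terminating NULL, the start of .dtors serves as the end marker, which is why both symbols are defined.

typedef void (*ctor_fn)(void);

extern ctor_fn __CTOR_LIST__[];	/* start of .ctors, from head.S */
extern ctor_fn __DTOR_LIST__[];	/* start of .dtors, used as the end marker */

static void run_kernel_ctors(void)	/* illustrative name, not the patch's do_global_ctors() */
{
	ctor_fn *fn;

	for (fn = __CTOR_LIST__; fn < __DTOR_LIST__; fn++)
		(*fn)();	/* each constructor registers one object file's bb data */
}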
+ +config GCOV_PROC + tristate "gcov-proc module" + depends on GCOV_PROFILE && PROC_FS + ---help--- + This is the gcov-proc module that exposes gcov data through the + /proc filesystem + +endmenu menu "Kernel hacking" diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/ppc/boot/openfirmware/common.c 999-mjb/arch/ppc/boot/openfirmware/common.c --- 000-virgin/arch/ppc/boot/openfirmware/common.c 2002-12-09 18:46:16.000000000 -0800 +++ 999-mjb/arch/ppc/boot/openfirmware/common.c 2003-10-02 16:43:03.000000000 -0700 @@ -30,6 +30,10 @@ struct memchunk { static struct memchunk *freechunks; +#ifdef CONFIG_GCOV_PROFILE +void __bb_init_func (void *ptr /* struct bb *blocks */) { } +#endif + static void *zalloc(void *x, unsigned items, unsigned size) { void *p; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/ppc/boot/prep/misc.c 999-mjb/arch/ppc/boot/prep/misc.c --- 000-virgin/arch/ppc/boot/prep/misc.c 2003-01-13 16:04:55.000000000 -0800 +++ 999-mjb/arch/ppc/boot/prep/misc.c 2003-10-02 16:43:03.000000000 -0700 @@ -71,6 +71,10 @@ extern unsigned long serial_init(int cha extern void serial_fixups(void); extern unsigned long get_mem_size(void); +#ifdef CONFIG_GCOV_PROFILE +void __bb_init_func (void *ptr /* struct bb *blocks */) { } +#endif + void writel(unsigned int val, unsigned int address) { diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/ppc/kernel/Makefile 999-mjb/arch/ppc/kernel/Makefile --- 000-virgin/arch/ppc/kernel/Makefile 2003-10-01 11:47:36.000000000 -0700 +++ 999-mjb/arch/ppc/kernel/Makefile 2003-10-02 16:43:03.000000000 -0700 @@ -18,8 +18,8 @@ extra-$(CONFIG_6xx) += idle_6xx.o extra-$(CONFIG_POWER4) += idle_power4.o extra-y += vmlinux.lds.s -obj-y := entry.o traps.o irq.o idle.o time.o misc.o \ - process.o signal.o ptrace.o align.o \ +obj-y := entry.o ptrace.o traps.o irq.o idle.o time.o misc.o \ + process.o signal.o align.o \ semaphore.o syscalls.o setup.o \ cputable.o ppc_htab.o obj-$(CONFIG_6xx) += l2cr.o cpu_setup_6xx.o diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/ppc/kernel/entry.S 999-mjb/arch/ppc/kernel/entry.S --- 000-virgin/arch/ppc/kernel/entry.S 2003-10-01 11:47:36.000000000 -0700 +++ 999-mjb/arch/ppc/kernel/entry.S 2003-10-02 16:43:03.000000000 -0700 @@ -106,10 +106,26 @@ transfer_to_handler: mfspr r11,SPRN_HID0 mtcr r11 BEGIN_FTR_SECTION +#ifdef CONFIG_GCOV_PROFILE + bt- 8,near1_power_save_6xx_restore /* Check DOZE */ + b skip1_power_save_6xx_restore +near1_power_save_6xx_restore: + b power_save_6xx_restore +skip1_power_save_6xx_restore: +#else bt- 8,power_save_6xx_restore /* Check DOZE */ +#endif END_FTR_SECTION_IFSET(CPU_FTR_CAN_DOZE) BEGIN_FTR_SECTION +#ifdef CONFIG_GCOV_PROFILE + bt- 9,near2_power_save_6xx_restore /* Check NAP */ + b skip2_power_save_6xx_restore +near2_power_save_6xx_restore: + b power_save_6xx_restore +skip2_power_save_6xx_restore: +#else bt- 9,power_save_6xx_restore /* Check NAP */ +#endif END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) #endif /* CONFIG_6xx */ .globl transfer_to_handler_cont diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/ppc/kernel/head.S 999-mjb/arch/ppc/kernel/head.S --- 000-virgin/arch/ppc/kernel/head.S 2003-10-01 11:47:36.000000000 -0700 +++ 999-mjb/arch/ppc/kernel/head.S 2003-10-02 16:43:03.000000000 -0700 @@ -1742,3 +1742,25 @@ intercept_table: */ abatron_pteptrs: .space 8 + +#ifdef CONFIG_GCOV_PROFILE +/* + * The .ctors-section contains a list of pointers to constructor + * functions which are used to initialize gcov structures. 
+ * + * Because there is no NULL at the end of the constructor list + * in the kernel we need the addresses of both the constructor + * as well as the destructor list which are supposed to be + * adjacent. + */ + +.section ".ctors","aw" +.globl __CTOR_LIST__ +.type __CTOR_LIST__,@object +__CTOR_LIST__: +.section ".dtors","aw" +.globl __DTOR_LIST__ +.type __DTOR_LIST__,@object +__DTOR_LIST__: +#endif + diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/ppc/syslib/prom_init.c 999-mjb/arch/ppc/syslib/prom_init.c --- 000-virgin/arch/ppc/syslib/prom_init.c 2003-10-01 11:47:37.000000000 -0700 +++ 999-mjb/arch/ppc/syslib/prom_init.c 2003-10-02 16:43:03.000000000 -0700 @@ -737,7 +737,11 @@ prom_instantiate_rtas(void) * Actually OF has bugs so we just arbitrarily * use memory at the 6MB point. */ +#ifdef CONFIG_GCOV_PROFILE + rtas_data = 0x990000; +#else rtas_data = 6 << 20; +#endif prom_print(" at "); prom_print_hex(rtas_data); } diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/ppc64/Kconfig 999-mjb/arch/ppc64/Kconfig --- 000-virgin/arch/ppc64/Kconfig 2003-10-01 11:47:38.000000000 -0700 +++ 999-mjb/arch/ppc64/Kconfig 2003-10-02 16:43:03.000000000 -0700 @@ -323,6 +323,37 @@ config VIOPATH source "arch/ppc64/oprofile/Kconfig" +menu "GCOV coverage profiling" + +config GCOV_PROFILE + bool "GCOV coverage profiling" + ---help--- + Provide infrastructure for coverage support for the kernel. This + will not compile the kernel by default with the necessary flags. + To obtain coverage information for the entire kernel, one should + enable the subsequent option (Profile entire kernel). If only + particular files or directories of the kernel are desired, then + one must provide the following compile options for such targets: + "-fprofile-arcs -ftest-coverage" in the CFLAGS. To obtain + access to the coverage data one must insmod the gcov-prof kernel + module. + +config GCOV_ALL + bool "GCOV_ALL" + depends on GCOV_PROFILE + ---help--- + If you say Y here, it will compile the entire kernel with coverage + option enabled. + +config GCOV_PROC + tristate "gcov-proc module" + depends on GCOV_PROFILE && PROC_FS + ---help--- + This is the gcov-proc module that exposes gcov data through the + /proc filesystem + +endmenu + menu "Kernel hacking" config DEBUG_KERNEL diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/ppc64/kernel/head.S 999-mjb/arch/ppc64/kernel/head.S --- 000-virgin/arch/ppc64/kernel/head.S 2003-10-01 11:47:38.000000000 -0700 +++ 999-mjb/arch/ppc64/kernel/head.S 2003-10-02 16:43:03.000000000 -0700 @@ -1926,3 +1926,24 @@ stab_array: .globl cmd_line cmd_line: .space 512 + +#ifdef CONFIG_GCOV_PROFILE +/* + * The .ctors-section contains a list of pointers to constructor + * functions which are used to initialize gcov structures. + * + * Because there is no NULL at the end of the constructor list + * in the kernel we need the addresses of both the constructor + * as well as the destructor list which are supposed to be + * adjacent. 
+ */ + +.section ".ctors","aw" +.globl __CTOR_LIST__ +.type __CTOR_LIST__,@object +__CTOR_LIST__: +.section ".dtors","aw" +.globl __DTOR_LIST__ +.type __DTOR_LIST__,@object +__DTOR_LIST__: +#endif diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/sparc64/kernel/devices.c 999-mjb/arch/sparc64/kernel/devices.c --- 000-virgin/arch/sparc64/kernel/devices.c 2003-10-01 11:40:45.000000000 -0700 +++ 999-mjb/arch/sparc64/kernel/devices.c 2003-10-02 16:39:44.000000000 -0700 @@ -117,6 +117,8 @@ int cpu_find_by_mid(int mid, int *prom_n prom_node, NULL); } +unsigned long cpu_hz; + void __init device_scan(void) { /* FIX ME FAST... -DaveM */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/sparc64/lib/rwlock.S 999-mjb/arch/sparc64/lib/rwlock.S --- 000-virgin/arch/sparc64/lib/rwlock.S 2002-12-09 18:45:55.000000000 -0800 +++ 999-mjb/arch/sparc64/lib/rwlock.S 2003-10-02 16:39:44.000000000 -0700 @@ -63,5 +63,33 @@ __write_lock: /* %o0 = lock_ptr */ be,pt %icc, 99b membar #StoreLoad | #StoreStore ba,a,pt %xcc, 1b + + .globl __read_trylock +__read_trylock: /* %o0 = lock_ptr */ + ldsw [%o0], %g5 + brlz,pn %g5, 100f + add %g5, 1, %g7 + cas [%o0], %g5, %g7 + cmp %g5, %g7 + bne,pn %icc, __read_trylock + membar #StoreLoad | #StoreStore + retl + mov 1, %o0 + + .globl __write_trylock +__write_trylock: /* %o0 = lock_ptr */ + sethi %hi(0x80000000), %g2 +1: lduw [%o0], %g5 +4: brnz,pn %g5, 100f + or %g5, %g2, %g7 + cas [%o0], %g5, %g7 + cmp %g5, %g7 + bne,pn %icc, 1b + membar #StoreLoad | #StoreStore + retl + mov 1, %o0 +100: retl + mov 0, %o0 + rwlock_impl_end: diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/x86_64/Kconfig 999-mjb/arch/x86_64/Kconfig --- 000-virgin/arch/x86_64/Kconfig 2003-10-01 11:47:39.000000000 -0700 +++ 999-mjb/arch/x86_64/Kconfig 2003-10-02 16:43:03.000000000 -0700 @@ -435,6 +435,37 @@ source "drivers/usb/Kconfig" source "arch/x86_64/oprofile/Kconfig" +menu "GCOV coverage profiling" + +config GCOV_PROFILE + bool "GCOV coverage profiling" + ---help--- + Provide infrastructure for coverage support for the kernel. This + will not compile the kernel by default with the necessary flags. + To obtain coverage information for the entire kernel, one should + enable the subsequent option (Profile entire kernel). If only + particular files or directories of the kernel are desired, then + one must provide the following compile options for such targets: + "-fprofile-arcs -ftest-coverage" in the CFLAGS. To obtain + access to the coverage data one must insmod the gcov-prof kernel + module. + +config GCOV_ALL + bool "GCOV_ALL" + depends on GCOV_PROFILE + ---help--- + If you say Y here, it will compile the entire kernel with coverage + option enabled. + +config GCOV_PROC + tristate "gcov-proc module" + depends on GCOV_PROFILE && PROC_FS + ---help--- + This is the gcov-proc module that exposes gcov data through the + /proc filesystem + +endmenu + menu "Kernel hacking" config DEBUG_KERNEL diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/x86_64/kernel/head.S 999-mjb/arch/x86_64/kernel/head.S --- 000-virgin/arch/x86_64/kernel/head.S 2003-10-01 11:34:39.000000000 -0700 +++ 999-mjb/arch/x86_64/kernel/head.S 2003-10-02 16:43:03.000000000 -0700 @@ -383,3 +383,23 @@ ENTRY(idt_table) .quad 0 .endr +#ifdef CONFIG_GCOV_PROFILE +/* + * The .ctors-section contains a list of pointers to constructor + * functions which are used to initialize gcov structures. 
+ * + * Because there is no NULL at the end of the constructor list + * in the kernel we need the addresses of both the constructor + * as well as the destructor list which are supposed to be + * adjacent. + */ + +.section ".ctors","aw" +.globl __CTOR_LIST__ +.type __CTOR_LIST__,@object +__CTOR_LIST__: +.section ".dtors","aw" +.globl __DTOR_LIST__ +.type __DTOR_LIST__,@object +__DTOR_LIST__: +#endif diff -purN -X /home/mbligh/.diff.exclude 000-virgin/drivers/Makefile 999-mjb/drivers/Makefile --- 000-virgin/drivers/Makefile 2003-10-01 11:46:32.000000000 -0700 +++ 999-mjb/drivers/Makefile 2003-10-02 16:43:03.000000000 -0700 @@ -49,3 +49,4 @@ obj-$(CONFIG_ISDN_BOOL) += isdn/ obj-$(CONFIG_MCA) += mca/ obj-$(CONFIG_EISA) += eisa/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ +obj-$(CONFIG_GCOV_PROC) += gcov/ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/drivers/gcov/Makefile 999-mjb/drivers/gcov/Makefile --- 000-virgin/drivers/gcov/Makefile 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/drivers/gcov/Makefile 2003-10-02 16:43:03.000000000 -0700 @@ -0,0 +1,8 @@ +# +# Makefile for GCOV profiling kernel module +# + +obj-$(CONFIG_GCOV_PROC) += gcov-proc.o + +$(obj)/gcov-proc.o: $(obj)/gcov-proc.c + diff -purN -X /home/mbligh/.diff.exclude 000-virgin/drivers/gcov/gcov-proc.c 999-mjb/drivers/gcov/gcov-proc.c --- 000-virgin/drivers/gcov/gcov-proc.c 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/drivers/gcov/gcov-proc.c 2003-10-02 16:43:03.000000000 -0700 @@ -0,0 +1,713 @@ +/* + * This kernel module provides access to coverage data produced by + * an instrumented kernel via an entry in the proc file system + * at /proc/gcov/. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (c) International Business Machines Corp., 2002 + * + * Author: Hubertus Franke + * Rajan Ravindran + * + * Bugfixes by Peter.Oberparleiter@de.ibm.com: + * Changes by Paul Larson + * Automatically detect gcc version for gcov_type + * + */ + +#include +#include +#include + +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +#define GCOV_PROF_PROC "gcov" + +static DECLARE_MUTEX_LOCKED(gcov_lock); +#define DOWN() down(&gcov_lock); +#define UP() up(&gcov_lock); +#define PAD8(x) ((x + 7) & ~7) + +//#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,4)) +//static inline struct proc_dir_entry *PDE(const struct inode *inode) +//{ +// return ((struct proc_dir_entry *) inode->u.generic_ip); +//} +//#endif + +/* ################################################################### + # NOTICE ########################################################## + ################################################################### + + GCOV_TYPE defines the count type used by the instrumentation code. + Kernels compiled with a gcc version prior to 3.1 should use LONG, + otherwise LONG LONG. 
*/ + +#if __GNUC__ >= 3 && __GNUC_MINOR__ >= 1 +typedef long long gcov_type; +#else +typedef long gcov_type; +#endif + + +struct bb +{ + long zero_word; + const char *filename; + gcov_type *counts; + long ncounts; + struct bb *next; + const unsigned long *addresses; + + /* Older GCC's did not emit these fields. */ + long nwords; + const char **functions; + const long *line_nums; + const char **filenames; + char *flags; +}; + +extern struct bb *bb_head; +static struct file_operations proc_gcov_operations; +extern char *gcov_kernelpath; +extern void (*gcov_callback)(int cmd, struct bb *); +extern void do_global_ctors(char *, char *, struct module *, int); + +static int create_bb_links = 1; +static int kernel_path_len; + +int debug = 0; +#define PPRINTK(x) do { if (debug) { printk x ; } } while (0) + +struct gcov_ftree_node +{ + int isdir; /* directory or file */ + char *fname; /* only the name within the hierachy */ + struct gcov_ftree_node *sibling; /* sibling of tree */ + struct gcov_ftree_node *files; /* children of tree */ + struct gcov_ftree_node *parent; /* parent of current gcov_ftree_node */ + struct proc_dir_entry *proc[4]; + struct bb *bb; + /* below only valid for leaf nodes == files */ + unsigned long offset; /* offset in global file */ + struct gcov_ftree_node *next; /* next leave node */ +}; + +static struct proc_dir_entry *proc_vmlinux = NULL; +static struct gcov_ftree_node *leave_nodes = NULL; +static struct gcov_ftree_node *dumpall_cached_node = NULL; +static struct gcov_ftree_node tree_root = + { 1, GCOV_PROF_PROC, NULL, NULL, NULL, + { NULL, NULL, NULL, NULL} , NULL, 0,NULL }; +static char *endings[3] = { ".bb", ".bbg", ".c" }; + + +/* Calculate the header size of an entry in the vmlinux-tracefile which + contains the collection of trace data of all instrumented kernel objects. + + An entry header is defined as: + 0: length of filename of the respective .da file padded to 8 bytes + 8: filename padded to 8 bytes + + */ + +static inline unsigned long +hdr_ofs (struct gcov_ftree_node *tptr) +{ + return 8 + PAD8(strlen (tptr->bb->filename) + 1); +} + + +/* Calculate the total size of an entry in the vmlinux-tracefile. + An entry consists of the header, an 8 byte word for the number + of counts in this entry and the actual array of 8 byte counts. */ + +static inline unsigned long +dump_size(struct gcov_ftree_node *tptr) +{ + return (hdr_ofs(tptr) + (tptr->bb->ncounts+1)*8); +} + + +/* Store a portable representation of VALUE in DEST using BYTES*8-1 bits. + Return a non-zero value if VALUE requires more than BYTES*8-1 bits + to store (this is adapted code from gcc/gcov-io.h). */ + +static int +store_gcov_type (gcov_type value, void *buf, int offset, int len) +{ + const size_t bytes = 8; + char dest[10]; + int upper_bit = (value < 0 ? 128 : 0); + size_t i; + + if (value < 0) { + gcov_type oldvalue = value; + value = -value; + if (oldvalue != -value) + return 1; + } + + for(i = 0 ; + i < (sizeof (value) < bytes ? sizeof (value) : bytes) ; + i++) { + dest[i] = value & (i == (bytes - 1) ? 127 : 255); + value = value / 256; + } + + if (value && value != -1) + return 1; + + for(; i < bytes ; i++) + dest[i] = 0; + dest[bytes - 1] |= upper_bit; + copy_to_user(buf,&dest[offset],len); + return 0; +} + + +/* Create a directory entry in the proc file system and fill in + the respective fields in the provided tree node. Return a + non-zero value on error. 
*/ + +int +create_dir_proc (struct gcov_ftree_node *bt, char *fname) +{ + bt->proc[0] = proc_mkdir(fname, bt->parent->proc[0]); + bt->proc[1] = bt->proc[2] = bt->proc[3] = NULL; + return (bt->proc[0] == NULL); +} + + +/* Replace file ending in with . Return a new + string containing the new filename or NULL on error. */ + +static +char* replace_ending (const char *fname,char *end, char *newend) +{ + char *newfname; + char *cptr = strstr(fname,end); + int len; + if (cptr == NULL) + return NULL; + len = cptr - fname; + newfname = (char*)kmalloc(len+strlen(newend)+1,GFP_KERNEL); + if (newfname == NULL) + return NULL; + memcpy(newfname,fname,len); + strcpy(newfname+len,newend); + return newfname; +} + + +/* Create a file entry in the proc file system and update the respective + fields on the tree node. Optionally try to create links to the + source, .bb and .bbg files. Return a non-zero value on error. */ + +int +create_file_proc (struct gcov_ftree_node *bt, struct bb *bptr, char *fname, + const char *fullname) +{ + bt->proc[0] = create_proc_entry(fname, S_IWUSR | S_IRUGO, + bt->parent->proc[0]); + if (!bt->proc[0]) { + PPRINTK(("error creating file proc <%s>\n", fname)); + return 1; + } + + bt->proc[0]->proc_fops = &proc_gcov_operations; + bt->proc[0]->size = 8 + (8 * bptr->ncounts); + + if (create_bb_links) { + int i; + for (i=0;i<3;i++) { + char *newfname; + char *newfullname; + newfname = replace_ending(fname,".da",endings[i]); + newfullname = replace_ending(fullname,".da",endings[i]); + if ((newfname) && (newfullname)) { + bt->proc[i+1] = proc_symlink(newfname,bt->parent->proc[0],newfullname); + } + if (newfname) kfree(newfname); + if (newfullname) kfree(newfullname); + } + } else { + bt->proc[1] = bt->proc[2] = bt->proc[3] = NULL; + } + return 0; +} + + +/* Recursively check and if necessary create the file specified by + and all its path components, both in the proc file-system as + well as in the internal tree structure. */ + +void +check_proc_fs(const char *fullname, struct gcov_ftree_node *parent, + char *name, struct bb *bbptr) +{ + char dirname[128]; + char *localname = name; + char *tname; + int isdir; + struct gcov_ftree_node *tptr; + + tname = strstr(name, "/"); + if ((isdir = (tname != NULL))) { + memcpy(dirname,name,tname-name); + dirname[tname-name] = '\0'; + localname = dirname; + } + + /* search the list of files in gcov_ftree_node and + * see whether file already exists in this directory level */ + for ( tptr = parent->files ; tptr ; tptr = tptr->sibling) { + if (!strcmp(tptr->fname,localname)) + break; + } + if (!tptr) { + /* no entry yet */ + tptr = (struct gcov_ftree_node*) + kmalloc(sizeof(struct gcov_ftree_node),GFP_KERNEL); + tptr->parent = parent; + + if (!isdir) { + if (create_file_proc(tptr, bbptr, localname,fullname)) { + kfree(tptr); + return; + } + tptr->bb = bbptr; + tptr->proc[0]->data = tptr; + tptr->next = leave_nodes; + leave_nodes = tptr; + } else { + int len = strlen(dirname)+1; + localname = (char*)kmalloc(len,GFP_KERNEL); + strncpy(localname,dirname,len); + if (create_dir_proc(tptr,localname)) { + kfree(tptr); + kfree(localname); + return; + } + tptr->bb = NULL; + tptr->proc[0]->data = NULL; + tptr->next = NULL; + } + tptr->isdir = isdir; + tptr->fname = localname; + tptr->files = NULL; + tptr->sibling = parent->files; + parent->files = tptr; + } + if (isdir) + check_proc_fs(fullname,tptr,tname+1,bbptr); +} + + +/* Read out tracefile data to user space. Return the number of bytes + read. 
*/ + +static ssize_t +read_gcov(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + unsigned long p = *ppos; + ssize_t read; + gcov_type ncnt; + struct bb *bbptr; + gcov_type slen; + gcov_type *wptr; + struct gcov_ftree_node *treeptr; + struct proc_dir_entry * de; + int dumpall; + unsigned int hdrofs; + unsigned long poffs; + + DOWN(); + + read = 0; + hdrofs = 0; + poffs = 0; + de = PDE(file->f_dentry->d_inode); + + /* Check whether this is a request to /proc/gcov/vmlinux in + which case we should dump the complete tracefile. */ + dumpall = (de == proc_vmlinux); + + + /* Have treeptr point to the tree node to be dumped. */ + + if (!dumpall) + treeptr = (struct gcov_ftree_node*) (de ? de->data : NULL); + else { + /* dumpall_cached_node will speed up things in case + of a sequential read. */ + if (dumpall_cached_node && (p >= dumpall_cached_node->offset)) { + treeptr = dumpall_cached_node; + } + else + treeptr = leave_nodes; + + /* Search the tree node that covers the requested + tracefile offset. */ + while (treeptr) { + struct gcov_ftree_node *next = treeptr->next; + if ((next == NULL) || (p < next->offset)) { + hdrofs = hdr_ofs(treeptr); + poffs = treeptr->offset; + break; + } + treeptr = next; + } + dumpall_cached_node = treeptr; + } + + bbptr = treeptr ? treeptr->bb : NULL; + + if (bbptr == NULL) + goto out; + + ncnt = (gcov_type) bbptr->ncounts; + p -= poffs; + + do { + if (p < hdrofs) { + /* User wants to read parts of the header. */ + + slen = PAD8(strlen(treeptr->bb->filename)+1); + + if (p >= 8) { + /* Read filename */ + if (slen > (gcov_type) count) slen = count; + copy_to_user (buf, &treeptr->bb->filename[p-8], + slen); + count-=slen;buf+= slen;read+=slen;p+=slen; + continue; + } + wptr = &slen; + } + else if (p < (hdrofs + 8)) { + /* User wants to read the number of counts in this + entry. */ + + wptr = &ncnt; + } + else if (p < (hdrofs) + (unsigned long) (ncnt+1)*8) { + /* User wants to read actual counters */ + + wptr = &bbptr->counts[((p-hdrofs)/8)-1]; + } + else + break; + + /* do we have to write partial word */ + + if ((count < 8) || (p & 0x7)) { + /* partial write */ + unsigned long offset = p & 0x7; + unsigned long length = (count+offset)<8?count:(8-offset); + + store_gcov_type(*wptr,buf, offset, length); + buf+=length;p+=length;count-=length;read+=length; + break; + } else { + store_gcov_type(*wptr,buf, 0, 8); + buf+=8;p+=8;count-=8;read+=8; + } + } while (count > 0); + *ppos = p + poffs; +out: + UP(); + return read; +} + + +/* A write to any of our proc file-system entries is interpreted + as a request to reset the data from that node. 
*/ + +static ssize_t +write_gcov(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + struct bb *ptr; + struct proc_dir_entry * de; + int resetall, i; + struct gcov_ftree_node *tptr; + + DOWN(); + + de = PDE(file->f_dentry->d_inode); + + if (de == NULL) { + count = 0; + goto out; + } + + /* Check for a write to /proc/gcov/vmlinux */ + resetall = (de == proc_vmlinux); + + if (resetall) { + /* Reset all nodes */ + for (ptr = bb_head; ptr != (struct bb *) 0; ptr = ptr->next) + { + int i; + if (ptr->counts == NULL) continue; + for (i = 0; i < ptr->ncounts; i++) + ptr->counts[i]=0; + } + } else { + /* Reset a single node */ + tptr = (struct gcov_ftree_node*)(de->data); + if (tptr == NULL) + goto out; + ptr = tptr->bb; + if (ptr->ncounts != 0) { + for (i = 0; i < ptr->ncounts; i++) + ptr->counts[i]=0; + } + } +out: + UP(); + return count; +} + + +/* This struct identifies the functions to be used for proc file-system + interaction. */ + +static struct file_operations proc_gcov_operations = { + read: read_gcov, + write: write_gcov +}; + + +/* Recursively remove a node and all its children from the internal + data tree and from the proc file-system. */ + +void +cleanup_node(struct gcov_ftree_node *node, int delname, int del_in_parent) +{ + struct gcov_ftree_node *next,*tptr; + struct proc_dir_entry *par_proc; + + PPRINTK(("parent n:%p p:%p f:%p s:%p <%s>\n", node, + node->parent, node->files, node->sibling, node->fname)); + if ((tptr = node->parent)) { + if (del_in_parent) { + /* Remove node from parent's list of children */ + struct gcov_ftree_node *cptr,*prev_cptr; + for ( prev_cptr = NULL, cptr = tptr->files; cptr && (cptr != node); + prev_cptr = cptr, cptr = cptr->sibling); + if (prev_cptr == NULL) + tptr->files = cptr->sibling; + else + prev_cptr->sibling = cptr->sibling; + } + par_proc = (struct proc_dir_entry*)(tptr->proc[0]); + } else + par_proc = &proc_root; + + if (node->isdir) { + /* In case of a directory, clean up all child nodes. */ + next = node->files; + node->files = NULL; + for (tptr = next ; tptr; ) { + next = tptr->sibling; + cleanup_node(tptr,1,0); + tptr = next; + } + remove_proc_entry(node->fname, par_proc); + if (delname) kfree(node->fname); + } else { + /* Remove file entry and optional links. */ + remove_proc_entry(node->fname, par_proc); + if (create_bb_links) { + int i; + for (i=0;i<3;i++) { + char *newfname; + if (node->proc[i+1] == NULL) continue; + newfname = replace_ending(node->fname,".da",endings[i]); + if (newfname) { + PPRINTK(("remove_proc_entry <%s>\n", node->fname)); + remove_proc_entry(newfname, par_proc); + kfree(newfname); + } + } + } + } + /* free the data */ + if (node != &tree_root) + kfree(node); +} + + +/* Create a tree node for the given bb struct and initiate the + creation of a corresponding proc file-system entry. */ + +static void +create_node_tree(struct bb *bbptr) +{ + const char *tmp; + const char *filename = bbptr->filename; + char *modname; + int len; + + PPRINTK(("kernelpath <%s> <%s>\n", gcov_kernelpath, filename)); + + /* Check whether this is a file located in the kernel source + directory. */ + if (!strncmp (filename, gcov_kernelpath, kernel_path_len)) + { + /* Remove kernel path and create relative proc-file-system + entry. */ + tmp = filename + kernel_path_len+1; + if (*tmp == '0') return; + check_proc_fs(filename, &tree_root, (char*)tmp, bbptr); + } + else { + /* Insert entry to module sub-directory. 
*/ + len = strlen(filename); + modname = (char *)kmalloc (len + 7, GFP_KERNEL); + strcpy(modname, "module"); + strcat (modname, filename); + check_proc_fs(filename, &tree_root, modname, bbptr); + } +} + + +/* This function will be used as gcov_callback, i.e. it is + called from constructor and destructor code of all instrumented + object files. It updates the local tree structure and the proc + file-system entries. */ + +static void +gcov_cleanup(int cmd, struct bb *bbptr) +{ + unsigned long offset = 0; + struct gcov_ftree_node *tptr; + struct gcov_ftree_node *parent; + struct gcov_ftree_node *prev_cptr; + + DOWN(); + switch (cmd) { + case 0: + /* remove leave node */ + prev_cptr = NULL; + for (tptr = leave_nodes; tptr ; prev_cptr = tptr, tptr = tptr->next) { + if (tptr->bb == bbptr) break; + } + if (!tptr) { + PPRINTK(("Can't find module in /proc/gcov\n")); + UP(); + return; + } + if (prev_cptr) + prev_cptr->next = tptr->next; + else + leave_nodes = tptr->next; + dumpall_cached_node = NULL; + + + /* Find highest level node without further siblings */ + + parent = tptr->parent; + do { + if (parent->files->sibling != NULL) break; + tptr = parent; + parent = parent->parent; + } while (parent); + cleanup_node(tptr,0,1); + + /* Update the offsets at which a certain node can + be found in the tracefile. */ + for (tptr = leave_nodes; tptr; tptr = tptr->next) { + tptr->offset = offset; + offset += dump_size(tptr); + } + break; + + case 1: + /* insert node */ + create_node_tree(bbptr); + + /* Update the offsets at which a certain node can + be found in the tracefile. */ + for (tptr = leave_nodes; tptr; tptr = tptr->next) { + tptr->offset = offset; + offset += dump_size(tptr); + } + + break; + } + UP(); +} + + +/* Initialize the data structure by calling the constructor code + of all instrumented object files and creating the proc + file-system entries. */ + +int +init_module(void) +{ + struct bb *bbptr; + unsigned long offset = 0; + struct gcov_ftree_node *tptr; + + PPRINTK(("init module <%s>\n\n", GCOV_PROF_PROC)); + + do_global_ctors(NULL, NULL, NULL, 0); + + tree_root.proc[0] = proc_mkdir(GCOV_PROF_PROC, 0); + kernel_path_len = strlen(gcov_kernelpath); + + for (bbptr = bb_head; bbptr ; bbptr = bbptr->next) { + create_node_tree(bbptr); + } + + /* Fill in the offset at which a certain node can + be found in the tracefile. */ + for (tptr = leave_nodes; tptr; tptr = tptr->next) { + tptr->offset = offset; + offset += dump_size(tptr); + } + + proc_vmlinux = create_proc_entry("vmlinux",S_IWUSR | S_IRUGO, + tree_root.proc[0]); + if (proc_vmlinux) + proc_vmlinux->proc_fops = &proc_gcov_operations; + + gcov_callback = gcov_cleanup; + UP(); + return 0; +} + + +void +cleanup_module(void) +{ + PPRINTK(("remove module <%s>\n\n", GCOV_PROF_PROC)); + gcov_callback = NULL; + DOWN(); + cleanup_node(&tree_root,0,0); +} + +//module_init(gcov_init_module); +//module_exit(gcov_cleanup_module); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/drivers/net/loopback.c 999-mjb/drivers/net/loopback.c --- 000-virgin/drivers/net/loopback.c 2003-10-01 11:46:39.000000000 -0700 +++ 999-mjb/drivers/net/loopback.c 2003-10-02 16:39:46.000000000 -0700 @@ -184,7 +184,7 @@ struct net_device loopback_dev = { .rebuild_header = eth_rebuild_header, .flags = IFF_LOOPBACK, .features = NETIF_F_SG|NETIF_F_FRAGLIST - |NETIF_F_NO_CSUM|NETIF_F_HIGHDMA|NETIF_F_TSO, + |NETIF_F_NO_CSUM|NETIF_F_HIGHDMA, }; /* Setup and register the of the LOOPBACK device. 
*/ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/drivers/pci/probe.c 999-mjb/drivers/pci/probe.c --- 000-virgin/drivers/pci/probe.c 2003-10-01 11:35:11.000000000 -0700 +++ 999-mjb/drivers/pci/probe.c 2003-10-02 16:39:49.000000000 -0700 @@ -176,7 +176,7 @@ void __devinit pci_read_bridge_bases(str limit |= (io_limit_hi << 16); } - if (base && base <= limit) { + if (base <= limit) { res->flags = (io_base_lo & PCI_IO_RANGE_TYPE_MASK) | IORESOURCE_IO; res->start = base; res->end = limit + 0xfff; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/aio.c 999-mjb/fs/aio.c --- 000-virgin/fs/aio.c 2003-10-01 11:48:15.000000000 -0700 +++ 999-mjb/fs/aio.c 2003-10-02 16:39:54.000000000 -0700 @@ -203,6 +203,7 @@ static struct kioctx *ioctx_alloc(unsign { struct mm_struct *mm; struct kioctx *ctx; + int ret = 0; /* Prevent overflows */ if ((nr_events > (0x10000000U / sizeof(struct io_event))) || @@ -232,7 +233,8 @@ static struct kioctx *ioctx_alloc(unsign INIT_LIST_HEAD(&ctx->run_list); INIT_WORK(&ctx->wq, aio_kick_handler, ctx); - if (aio_setup_ring(ctx) < 0) + ret = aio_setup_ring(ctx); + if (unlikely(ret < 0)) goto out_freectx; /* limit the number of system wide aios */ @@ -259,7 +261,7 @@ out_cleanup: out_freectx: mmdrop(mm); kmem_cache_free(kioctx_cachep, ctx); - ctx = ERR_PTR(-ENOMEM); + ctx = ERR_PTR(ret); dprintk("aio: error allocating ioctx %p\n", ctx); return ctx; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/binfmt_aout.c 999-mjb/fs/binfmt_aout.c --- 000-virgin/fs/binfmt_aout.c 2003-10-01 11:48:15.000000000 -0700 +++ 999-mjb/fs/binfmt_aout.c 2003-10-02 16:42:17.000000000 -0700 @@ -309,7 +309,7 @@ static int load_aout_binary(struct linux (current->mm->start_brk = N_BSSADDR(ex)); current->mm->free_area_cache = TASK_UNMAPPED_BASE; - current->mm->rss = 0; + zero_rss(current->mm); current->mm->mmap = NULL; compute_creds(bprm); current->flags &= ~PF_FORKNOEXEC; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/binfmt_elf.c 999-mjb/fs/binfmt_elf.c --- 000-virgin/fs/binfmt_elf.c 2003-10-01 11:48:15.000000000 -0700 +++ 999-mjb/fs/binfmt_elf.c 2003-10-02 16:42:17.000000000 -0700 @@ -634,7 +634,7 @@ static int load_elf_binary(struct linux_ /* Do this so that we can load the interpreter, if need be. 
We will change some of these later */ - current->mm->rss = 0; + zero_rss(current->mm); current->mm->free_area_cache = TASK_UNMAPPED_BASE; retval = setup_arg_pages(bprm); if (retval < 0) { diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/binfmt_flat.c 999-mjb/fs/binfmt_flat.c --- 000-virgin/fs/binfmt_flat.c 2003-10-01 11:35:23.000000000 -0700 +++ 999-mjb/fs/binfmt_flat.c 2003-10-02 16:42:17.000000000 -0700 @@ -643,7 +643,7 @@ static int load_flat_file(struct linux_b current->mm->start_brk = datapos + data_len + bss_len; current->mm->brk = (current->mm->start_brk + 3) & ~3; current->mm->context.end_brk = memp + ksize((void *) memp) - stack_len; - current->mm->rss = 0; + zero_rss(current->mm); } if (flags & FLAT_FLAG_KTRACE) diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/binfmt_som.c 999-mjb/fs/binfmt_som.c --- 000-virgin/fs/binfmt_som.c 2003-02-13 16:36:36.000000000 -0800 +++ 999-mjb/fs/binfmt_som.c 2003-10-02 16:42:17.000000000 -0700 @@ -259,7 +259,7 @@ load_som_binary(struct linux_binprm * bp create_som_tables(bprm); current->mm->start_stack = bprm->p; - current->mm->rss = 0; + zero_rss(current->mm); #if 0 printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/buffer.c 999-mjb/fs/buffer.c --- 000-virgin/fs/buffer.c 2003-10-01 11:41:12.000000000 -0700 +++ 999-mjb/fs/buffer.c 2003-10-02 16:53:55.000000000 -0700 @@ -865,14 +865,14 @@ int __set_page_dirty_buffers(struct page spin_unlock(&mapping->private_lock); if (!TestSetPageDirty(page)) { - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); if (page->mapping) { /* Race with truncate? */ if (!mapping->backing_dev_info->memory_backed) inc_page_state(nr_dirty); list_del(&page->list); list_add(&page->list, &mapping->dirty_pages); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/exec.c 999-mjb/fs/exec.c --- 000-virgin/fs/exec.c 2003-10-01 11:48:15.000000000 -0700 +++ 999-mjb/fs/exec.c 2003-10-02 16:42:17.000000000 -0700 @@ -317,10 +317,11 @@ void put_dirty_page(struct task_struct * } lru_cache_add_active(page); flush_dcache_page(page); + SetPageAnon(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot)))); pte_chain = page_add_rmap(page, pte, pte_chain); pte_unmap(pte); - tsk->mm->rss++; + inc_rss(tsk->mm, page); spin_unlock(&tsk->mm->page_table_lock); /* no need for flush_tlb */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/fs-writeback.c 999-mjb/fs/fs-writeback.c --- 000-virgin/fs/fs-writeback.c 2003-07-28 15:31:09.000000000 -0700 +++ 999-mjb/fs/fs-writeback.c 2003-10-02 16:53:55.000000000 -0700 @@ -150,10 +150,10 @@ __sync_single_inode(struct inode *inode, * read speculatively by this cpu before &= ~I_DIRTY -- mikulas */ - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); if (wait || !wbc->for_kupdate || list_empty(&mapping->io_pages)) list_splice_init(&mapping->dirty_pages, &mapping->io_pages); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); spin_unlock(&inode_lock); do_writepages(mapping, wbc); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/inode.c 999-mjb/fs/inode.c --- 000-virgin/fs/inode.c 2003-10-01 11:47:01.000000000 -0700 +++ 999-mjb/fs/inode.c 2003-10-02 16:53:55.000000000 -0700 @@ -147,6 +147,9 @@ static struct inode *alloc_inode(struct mapping->dirtied_when = 0; mapping->assoc_mapping = NULL; mapping->backing_dev_info 
= &default_backing_dev_info; +#ifdef CONFIG_NUMA + mapping->binding = NULL; +#endif if (sb->s_bdev) mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; memset(&inode->u, 0, sizeof(inode->u)); @@ -184,7 +187,7 @@ void inode_init_once(struct inode *inode INIT_LIST_HEAD(&inode->i_devices); sema_init(&inode->i_sem, 1); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); - spin_lock_init(&inode->i_data.page_lock); + mapping_rwlock_init(&inode->i_data.page_lock); init_MUTEX(&inode->i_data.i_shared_sem); atomic_set(&inode->i_data.truncate_count, 0); INIT_LIST_HEAD(&inode->i_data.private_list); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/mpage.c 999-mjb/fs/mpage.c --- 000-virgin/fs/mpage.c 2003-10-01 11:41:13.000000000 -0700 +++ 999-mjb/fs/mpage.c 2003-10-02 16:53:55.000000000 -0700 @@ -635,7 +635,7 @@ mpage_writepages(struct address_space *m if (get_block == NULL) writepage = mapping->a_ops->writepage; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); while (!list_empty(&mapping->io_pages) && !done) { struct page *page = list_entry(mapping->io_pages.prev, struct page, list); @@ -655,7 +655,7 @@ mpage_writepages(struct address_space *m list_add(&page->list, &mapping->locked_pages); page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); /* * At this point we hold neither mapping->page_lock nor @@ -695,12 +695,12 @@ mpage_writepages(struct address_space *m unlock_page(page); } page_cache_release(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); } /* * Leave any remaining dirty pages on ->io_pages */ - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); if (bio) mpage_bio_submit(WRITE, bio); return ret; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/proc/proc_misc.c 999-mjb/fs/proc/proc_misc.c --- 000-virgin/fs/proc/proc_misc.c 2003-10-01 11:48:19.000000000 -0700 +++ 999-mjb/fs/proc/proc_misc.c 2003-10-02 16:41:02.000000000 -0700 @@ -134,6 +134,41 @@ static struct vmalloc_info get_vmalloc_i return vmi; } +static int real_loadavg_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int a, b, c, cpu; + int len; + + a = tasks_running[0] + (FIXED_1/200); + b = tasks_running[1] + (FIXED_1/200); + c = tasks_running[2] + (FIXED_1/200); + len = sprintf(page,"Domain load1 load2 load3 nr_run/nr_thrd\n"); + len += sprintf(page+len,"SYSTEM %5d.%02d %5d.%02d %5d.%02d %7ld/%7d\n", + LOAD_INT(a), LOAD_FRAC(a), + LOAD_INT(b), LOAD_FRAC(b), + LOAD_INT(c), LOAD_FRAC(c), + nr_running(), nr_threads); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + unsigned long nr_running; + if (!cpu_online(cpu)) + continue; + preempt_disable(); + a = per_cpu(cpu_tasks_running,cpu)[0] + (FIXED_1/200); + b = per_cpu(cpu_tasks_running,cpu)[1] + (FIXED_1/200); + c = per_cpu(cpu_tasks_running,cpu)[2] + (FIXED_1/200); + nr_running = nr_running_cpu(cpu); + preempt_enable(); + len += sprintf(page+len, "%5d %5d.%02d %5d.%02d %5d.%02d %7ld/%7d\n", + cpu, + LOAD_INT(a), LOAD_FRAC(a), + LOAD_INT(b), LOAD_FRAC(b), + LOAD_INT(c), LOAD_FRAC(c), + nr_running, nr_threads); + } + return proc_calc_metrics(page, start, off, count, eof, len); +} + static int uptime_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -342,6 +377,71 @@ static struct file_operations proc_modul }; #endif +#ifdef CONFIG_NUMA +#define K(x) ((x) << (PAGE_SHIFT - 10)) +static int show_meminfo_numa (struct seq_file *m, void *v) +{ + int *d = v; + int nid = 
*d; + struct sysinfo i; + si_meminfo_node(&i, nid); + seq_printf(m, "\n" + "Node %d MemTotal: %8lu kB\n" + "Node %d MemFree: %8lu kB\n" + "Node %d MemUsed: %8lu kB\n" + "Node %d HighTotal: %8lu kB\n" + "Node %d HighFree: %8lu kB\n" + "Node %d LowTotal: %8lu kB\n" + "Node %d LowFree: %8lu kB\n", + nid, K(i.totalram), + nid, K(i.freeram), + nid, K(i.totalram-i.freeram), + nid, K(i.totalhigh), + nid, K(i.freehigh), + nid, K(i.totalram-i.totalhigh), + nid, K(i.freeram-i.freehigh)); + + return 0; +} +#undef K + +extern struct seq_operations meminfo_numa_op; +static int meminfo_numa_open(struct inode *inode, struct file *file) +{ + return seq_open(file,&meminfo_numa_op); +} + +static struct file_operations proc_meminfo_numa_operations = { + open: meminfo_numa_open, + read: seq_read, + llseek: seq_lseek, + release: seq_release, +}; + +static void *meminfo_numa_start(struct seq_file *m, loff_t *pos) +{ + return *pos < numnodes ? pos : NULL; +} + +static void *meminfo_numa_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return meminfo_numa_start(m, pos); +} + +static void meminfo_numa_stop(struct seq_file *m, void *v) +{ +} + +struct seq_operations meminfo_numa_op = { + .start = meminfo_numa_start, + .next = meminfo_numa_next, + .stop = meminfo_numa_stop, + .show = show_meminfo_numa, +}; + +#endif + extern struct seq_operations slabinfo_op; extern ssize_t slabinfo_write(struct file *, const char __user *, size_t, loff_t *); static int slabinfo_open(struct inode *inode, struct file *file) @@ -638,6 +738,36 @@ static void create_seq_entry(char *name, entry->proc_fops = f; } +#ifdef CONFIG_LOCKMETER +extern ssize_t get_lockmeter_info(char *, size_t, loff_t *); +extern ssize_t put_lockmeter_info(const char *, size_t); +extern int get_lockmeter_info_size(void); + +/* + * This function accesses lock metering information. 
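
The real_loadavg handler added to proc_misc.c above prints its per-domain and
per-cpu run-queue averages in the same fixed-point format the stock
/proc/loadavg uses: each average carries FSHIFT fractional bits, and
LOAD_INT()/LOAD_FRAC() split it into an integer part and two decimal digits
(the FIXED_1/200 term rounds to the nearest hundredth). A minimal decoding
sketch, assuming the standard definitions from <linux/sched.h>:

	/* Stock fixed-point helpers, repeated here only for illustration. */
	#define FSHIFT		11
	#define FIXED_1		(1 << FSHIFT)		/* 1.0 == 2048 */
	#define LOAD_INT(x)	((x) >> FSHIFT)
	#define LOAD_FRAC(x)	LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

	/* A raw average of 3072 (1.5 * FIXED_1) plus the rounding term of
	 * FIXED_1/200 = 10 decodes as LOAD_INT() = 1, LOAD_FRAC() = 50,
	 * i.e. it is printed as "1.50".
	 */
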
+ */ +static ssize_t read_lockmeter(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + return get_lockmeter_info(buf, count, ppos); +} + +/* + * Writing to /proc/lockmeter resets the counters + */ +static ssize_t write_lockmeter(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + return put_lockmeter_info(buf, count); +} + +static struct file_operations proc_lockmeter_operations = { + NULL, /* lseek */ + read: read_lockmeter, + write: write_lockmeter, +}; +#endif /* CONFIG_LOCKMETER */ + void __init proc_misc_init(void) { struct proc_dir_entry *entry; @@ -646,6 +776,7 @@ void __init proc_misc_init(void) int (*read_proc)(char*,char**,off_t,int,int*,void*); } *p, simple_ones[] = { {"loadavg", loadavg_read_proc}, + {"real_loadavg",real_loadavg_read_proc}, {"uptime", uptime_read_proc}, {"meminfo", meminfo_read_proc}, {"version", version_read_proc}, @@ -685,6 +816,9 @@ void __init proc_misc_init(void) #ifdef CONFIG_MODULES create_seq_entry("modules", 0, &proc_modules_operations); #endif +#ifdef CONFIG_NUMA + create_seq_entry("meminfo.numa",0,&proc_meminfo_numa_operations); +#endif #ifdef CONFIG_PROC_KCORE proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL); if (proc_root_kcore) { @@ -705,6 +839,13 @@ void __init proc_misc_init(void) if (entry) entry->proc_fops = &proc_sysrq_trigger_operations; #endif +#ifdef CONFIG_LOCKMETER + entry = create_proc_entry("lockmeter", S_IWUSR | S_IRUGO, NULL); + if (entry) { + entry->proc_fops = &proc_lockmeter_operations; + entry->size = get_lockmeter_info_size(); + } +#endif #ifdef CONFIG_PPC32 { extern struct file_operations ppc_htab_operations; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/proc/task_mmu.c 999-mjb/fs/proc/task_mmu.c --- 000-virgin/fs/proc/task_mmu.c 2003-10-01 11:47:04.000000000 -0700 +++ 999-mjb/fs/proc/task_mmu.c 2003-10-02 16:42:17.000000000 -0700 @@ -3,6 +3,22 @@ #include #include +#ifdef CONFIG_NUMA +char *task_mem_pernode(struct mm_struct *mm, char *buffer) +{ + int nid; + + for (nid = 0; nid < MAX_NUMNODES; nid++){ + buffer += sprintf(buffer, "VmRSS-node_%d:\t%8lu kb\n", + nid, mm->pernode_rss[nid] << (PAGE_SHIFT-10)); + } + + return buffer; +} +#else /* !CONFIG_NUMA */ +#define task_mem_pernode(mm, buffer) (buffer) +#endif /* CONFIG_NUMA */ + char *task_mem(struct mm_struct *mm, char *buffer) { unsigned long data = 0, stack = 0, exec = 0, lib = 0; @@ -39,6 +55,7 @@ char *task_mem(struct mm_struct *mm, cha mm->rss << (PAGE_SHIFT-10), data - stack, stack, exec - lib, lib); + buffer = task_mem_pernode(mm, buffer); up_read(&mm->mmap_sem); return buffer; } diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-alpha/lockmeter.h 999-mjb/include/asm-alpha/lockmeter.h --- 000-virgin/include/asm-alpha/lockmeter.h 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/include/asm-alpha/lockmeter.h 2003-10-02 16:39:44.000000000 -0700 @@ -0,0 +1,90 @@ +/* + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + * + * Modified by Peter Rival (frival@zk3.dec.com) + */ + +#ifndef _ALPHA_LOCKMETER_H +#define _ALPHA_LOCKMETER_H + +#include +#define CPU_CYCLE_FREQUENCY hwrpb->cycle_freq + +#define get_cycles64() get_cycles() + +#define THIS_CPU_NUMBER smp_processor_id() + +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0) +#define local_irq_save(x) \ + __save_and_cli(x) +#define local_irq_restore(x) \ + __restore_flags(x) +#endif /* Linux version 2.2.x */ + +#define SPINLOCK_MAGIC_INIT /**/ + +/* + * Macros to cache and retrieve an index value inside 
of a lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. + * We also assume that the hash table has less than 32767 entries. + * the high order bit is used for write locking a rw_lock + * Note: although these defines and macros are the same as what is being used + * in include/asm-i386/lockmeter.h, they are present here to easily + * allow an alternate Alpha implementation. + */ +/* + * instrumented spinlock structure -- never used to allocate storage + * only used in macros below to overlay a spinlock_t + */ +typedef struct inst_spinlock_s { + /* remember, Alpha is little endian */ + unsigned short lock; + unsigned short index; +} inst_spinlock_t; +#define PUT_INDEX(lock_ptr,indexv) ((inst_spinlock_t *)(lock_ptr))->index = indexv +#define GET_INDEX(lock_ptr) ((inst_spinlock_t *)(lock_ptr))->index + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int lock; + unsigned short index; + unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return true if rwlock is write locked + * (note that other lock attempts can cause the lock value to be negative) + */ +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) (((inst_rwlock_t *)rwlock_ptr)->lock & 1) +#define IABS(x) ((x) > 0 ? (x) : -(x)) + +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + int tmp = (int) ((inst_rwlock_t *)rwlock_ptr)->lock; + /* readers subtract 2, so we have to: */ + /* - andnot off a possible writer (bit 0) */ + /* - get the absolute value */ + /* - divide by 2 (right shift by one) */ + /* to find the number of readers */ + if (tmp == 0) return(0); + else return(IABS(tmp & ~1)>>1); +} + +#endif /* _ALPHA_LOCKMETER_H */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-alpha/spinlock.h 999-mjb/include/asm-alpha/spinlock.h --- 000-virgin/include/asm-alpha/spinlock.h 2003-06-05 14:55:52.000000000 -0700 +++ 999-mjb/include/asm-alpha/spinlock.h 2003-10-02 16:39:44.000000000 -0700 @@ -6,6 +6,10 @@ #include #include +#ifdef CONFIG_LOCKMETER +#undef DEBUG_SPINLOCK +#undef DEBUG_RWLOCK +#endif /* * Simple spin lock operations. There are two variants, one clears IRQ's @@ -95,9 +99,18 @@ static inline int _raw_spin_trylock(spin typedef struct { volatile int write_lock:1, read_counter:31; +#ifdef CONFIG_LOCKMETER + /* required for LOCKMETER since all bits in lock are used */ + /* need this storage for CPU and lock INDEX ............. 
*/ + unsigned magic; +#endif } /*__attribute__((aligned(32)))*/ rwlock_t; +#ifdef CONFIG_LOCKMETER +#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0, 0 } +#else #define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0 } +#endif #define rwlock_init(x) do { *(x) = RW_LOCK_UNLOCKED; } while(0) #define rwlock_is_locked(x) (*(volatile int *)(x) != 0) @@ -169,4 +182,41 @@ static inline void _raw_read_unlock(rwlo : "m" (*lock) : "memory"); } +#ifdef CONFIG_LOCKMETER +static inline int _raw_write_trylock(rwlock_t *lock) +{ + long temp,result; + + __asm__ __volatile__( + " ldl_l %1,%0\n" + " mov $31,%2\n" + " bne %1,1f\n" + " or $31,1,%2\n" + " stl_c %2,%0\n" + "1: mb\n" + : "=m" (*(volatile int *)lock), "=&r" (temp), "=&r" (result) + : "m" (*(volatile int *)lock) + ); + + return (result); +} + +static inline int _raw_read_trylock(rwlock_t *lock) +{ + unsigned long temp,result; + + __asm__ __volatile__( + " ldl_l %1,%0\n" + " mov $31,%2\n" + " blbs %1,1f\n" + " subl %1,2,%2\n" + " stl_c %2,%0\n" + "1: mb\n" + : "=m" (*(volatile int *)lock), "=&r" (temp), "=&r" (result) + : "m" (*(volatile int *)lock) + ); + return (result); +} +#endif /* CONFIG_LOCKMETER */ + #endif /* _ALPHA_SPINLOCK_H */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-generic/tlb.h 999-mjb/include/asm-generic/tlb.h --- 000-virgin/include/asm-generic/tlb.h 2003-10-01 11:41:15.000000000 -0700 +++ 999-mjb/include/asm-generic/tlb.h 2003-10-02 16:42:17.000000000 -0700 @@ -39,7 +39,6 @@ struct mmu_gather { unsigned int nr; /* set to ~0U means fast mode */ unsigned int need_flush;/* Really unmapped some ptes? */ unsigned int fullmm; /* non-zero means full mm flush */ - unsigned long freed; struct page * pages[FREE_PTE_NR]; }; @@ -60,7 +59,6 @@ tlb_gather_mmu(struct mm_struct *mm, uns tlb->nr = num_online_cpus() > 1 ? 0U : ~0U; tlb->fullmm = full_mm_flush; - tlb->freed = 0; return tlb; } @@ -85,13 +83,6 @@ tlb_flush_mmu(struct mmu_gather *tlb, un static inline void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) { - int freed = tlb->freed; - struct mm_struct *mm = tlb->mm; - int rss = mm->rss; - - if (rss < freed) - freed = rss; - mm->rss = rss - freed; tlb_flush_mmu(tlb, start, end); /* keep the page table cache within bounds */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/early_printk.h 999-mjb/include/asm-i386/early_printk.h --- 000-virgin/include/asm-i386/early_printk.h 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/include/asm-i386/early_printk.h 2003-10-02 16:39:35.000000000 -0700 @@ -0,0 +1,8 @@ +#ifndef __X86_EARLY_PRINTK_H_I386_ +#define __X86_EARLY_PRINTK_H_I386_ + +#define VGABASE 0xB8000 +#define SERIAL_BASES { 0x3f8, 0x2f8 } +#define SERIAL_BASES_LEN 2 + +#endif diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/lockmeter.h 999-mjb/include/asm-i386/lockmeter.h --- 000-virgin/include/asm-i386/lockmeter.h 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/include/asm-i386/lockmeter.h 2003-10-02 16:39:44.000000000 -0700 @@ -0,0 +1,127 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + * + * Modified by Ray Bryant (raybry@us.ibm.com) + * Changes Copyright (C) 2000 IBM, Inc. + * Added save of index in spinlock_t to improve efficiency + * of "hold" time reporting for spinlocks. + * Added support for hold time statistics for read and write + * locks. + * Moved machine dependent code here from include/lockmeter.h. 
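
The binfmt_*.c and exec.c hunks above replace open-coded updates of mm->rss
with zero_rss() and inc_rss(), and task_mmu.c grows a per-node "VmRSS-node_N"
display backed by mm->pernode_rss[]. The helpers themselves are defined in a
hunk outside this section; the sketch below only illustrates the shape implied
by those call sites, and everything in it other than the pernode_rss[] field
and pfn_to_nid() should be read as an assumption.

	/* Hypothetical sketch -- the real zero_rss()/inc_rss() may differ.
	 * Intent implied by the call sites: keep mm->rss and the per-node
	 * counters in step for every page mapped into the mm.
	 */
	static inline void inc_rss(struct mm_struct *mm, struct page *page)
	{
		mm->rss++;
		mm->pernode_rss[pfn_to_nid(page_to_pfn(page))]++;
	}

	static inline void zero_rss(struct mm_struct *mm)
	{
		int nid;

		mm->rss = 0;
		for (nid = 0; nid < MAX_NUMNODES; nid++)
			mm->pernode_rss[nid] = 0;
	}
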
+ * + */ + +#ifndef _I386_LOCKMETER_H +#define _I386_LOCKMETER_H + +#include +#include + +#include + +#ifdef __KERNEL__ +extern unsigned long cpu_khz; +#define CPU_CYCLE_FREQUENCY (cpu_khz * 1000) +#else +#define CPU_CYCLE_FREQUENCY 450000000 +#endif + +#define THIS_CPU_NUMBER smp_processor_id() + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0) +#define local_irq_save(x) \ + __asm__ __volatile__("pushfl ; popl %0 ; cli":"=g" (x): /* no input */ :"memory") + +#define local_irq_restore(x) \ + __asm__ __volatile__("pushl %0 ; popfl": /* no output */ :"g" (x):"memory") +#endif /* Linux version 2.2.x */ + +/* + * macros to cache and retrieve an index value inside of a spin lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. Not normally a problem!! + * we also assume that the hash table has less than 65535 entries. + */ +/* + * instrumented spinlock structure -- never used to allocate storage + * only used in macros below to overlay a spinlock_t + */ +typedef struct inst_spinlock_s { + /* remember, Intel is little endian */ + unsigned short lock; + unsigned short index; +} inst_spinlock_t; +#define PUT_INDEX(lock_ptr,indexv) ((inst_spinlock_t *)(lock_ptr))->index = indexv +#define GET_INDEX(lock_ptr) ((inst_spinlock_t *)(lock_ptr))->index + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int lock; + unsigned short index; + unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return the number of readers for a rwlock_t + */ +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) + +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + int tmp = (int) rwlock_ptr->lock; + /* read and write lock attempts may cause the lock value to temporarily */ + /* be negative. Until it is >= 0 we know nothing (i. e. can't tell if */ + /* is -1 because it was write locked and somebody tried to read lock it */ + /* or if it is -1 because it was read locked and somebody tried to write*/ + /* lock it. ........................................................... */ + do { + tmp = (int) rwlock_ptr->lock; + } while (tmp < 0); + if (tmp == 0) return(0); + else return(RW_LOCK_BIAS-tmp); +} + +/* + * return true if rwlock is write locked + * (note that other lock attempts can cause the lock value to be negative) + */ +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock <= 0) +#define IABS(x) ((x) > 0 ? 
(x) : -(x)) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((IABS((rwlock_ptr)->lock) % RW_LOCK_BIAS) != 0) + +/* this is a lot of typing just to get gcc to emit "rdtsc" */ +static inline long long get_cycles64 (void) +{ +#ifndef CONFIG_X86_TSC + #error this code requires CONFIG_X86_TSC +#else + union longlong_u { + long long intlong; + struct intint_s { + uint32_t eax; + uint32_t edx; + } intint; + } longlong; + + rdtsc(longlong.intint.eax,longlong.intint.edx); + return longlong.intlong; +#endif +} + +#endif /* _I386_LOCKMETER_H */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/mmzone.h 999-mjb/include/asm-i386/mmzone.h --- 000-virgin/include/asm-i386/mmzone.h 2003-10-01 11:48:22.000000000 -0700 +++ 999-mjb/include/asm-i386/mmzone.h 2003-10-02 16:42:48.000000000 -0700 @@ -10,7 +10,49 @@ #ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA + #ifdef CONFIG_X86_NUMAQ + #include + #else /* summit or generic arch */ + #include + #endif +#else /* !CONFIG_NUMA */ + #define get_memcfg_numa get_memcfg_numa_flat + #define get_zholes_size(n) (0) +#endif /* CONFIG_NUMA */ + extern struct pglist_data *node_data[]; +#define NODE_DATA(nid) (node_data[nid]) + +/* + * generic node memory support, the following assumptions apply: + * + * 1) memory comes in 256Mb contigious chunks which are either present or not + * 2) we will not have more than 64Gb in total + * + * for now assume that 64Gb is max amount of RAM for whole system + * 64Gb / 4096bytes/page = 16777216 pages + */ +#define MAX_NR_PAGES 16777216 +#define MAX_ELEMENTS 256 +#define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS) + +extern u8 physnode_map[]; + +static inline int pfn_to_nid(unsigned long pfn) +{ +#ifdef CONFIG_NUMA + return(physnode_map[(pfn) / PAGES_PER_ELEMENT]); +#else + return 0; +#endif +} + +static inline struct pglist_data *pfn_to_pgdat(unsigned long pfn) +{ + return(NODE_DATA(pfn_to_nid(pfn))); +} + /* * Following are macros that are specific to this numa platform. @@ -43,11 +85,6 @@ extern struct pglist_data *node_data[]; */ #define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) -/* - * Return a pointer to the node data for node n. 
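
The mmzone.h rework above makes the physnode_map[] lookup available whenever
CONFIG_NUMA is set rather than only for NUMA-Q: physical memory is treated as
256 MB elements (MAX_NR_PAGES / MAX_ELEMENTS = 65536 pages of 4 KB each), and
physnode_map[] records which node owns each element. A worked example using
only the constants from the hunk:

	/* 64 GB max / 256 elements => 65536 pages (256 MB) per element.
	 *
	 * A page at physical address 0x60000000 (1.5 GB):
	 *	pfn	= 0x60000000 >> PAGE_SHIFT = 0x60000 (393216)
	 *	element	= 393216 / PAGES_PER_ELEMENT = 393216 / 65536 = 6
	 * so pfn_to_nid() simply returns physnode_map[6].
	 */
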
- */ -#define NODE_DATA(nid) (node_data[nid]) - #define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) \ @@ -93,40 +130,6 @@ extern struct pglist_data *node_data[]; */ #define pfn_valid(pfn) ((pfn) < num_physpages) -/* - * generic node memory support, the following assumptions apply: - * - * 1) memory comes in 256Mb contigious chunks which are either present or not - * 2) we will not have more than 64Gb in total - * - * for now assume that 64Gb is max amount of RAM for whole system - * 64Gb / 4096bytes/page = 16777216 pages - */ -#define MAX_NR_PAGES 16777216 -#define MAX_ELEMENTS 256 -#define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS) - -extern u8 physnode_map[]; - -static inline int pfn_to_nid(unsigned long pfn) -{ - return(physnode_map[(pfn) / PAGES_PER_ELEMENT]); -} -static inline struct pglist_data *pfn_to_pgdat(unsigned long pfn) -{ - return(NODE_DATA(pfn_to_nid(pfn))); -} - -#ifdef CONFIG_X86_NUMAQ -#include -#elif CONFIG_ACPI_SRAT -#include -#elif CONFIG_X86_PC -#define get_zholes_size(n) (0) -#else -#define pfn_to_nid(pfn) (0) -#endif /* CONFIG_X86_NUMAQ */ - extern int get_memcfg_numa_flat(void ); /* * This allows any one NUMA architecture to be compiled diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/page.h 999-mjb/include/asm-i386/page.h --- 000-virgin/include/asm-i386/page.h 2003-04-09 11:48:05.000000000 -0700 +++ 999-mjb/include/asm-i386/page.h 2003-10-02 16:39:38.000000000 -0700 @@ -115,9 +115,26 @@ static __inline__ int get_order(unsigned #endif /* __ASSEMBLY__ */ #ifdef __ASSEMBLY__ -#define __PAGE_OFFSET (0xC0000000) +#include +#ifdef CONFIG_05GB +#define __PAGE_OFFSET (0xE0000000) +#elif defined(CONFIG_1GB) +#define __PAGE_OFFSET (0xC0000000) +#elif defined(CONFIG_2GB) +#define __PAGE_OFFSET (0x80000000) +#elif defined(CONFIG_3GB) +#define __PAGE_OFFSET (0x40000000) +#endif #else -#define __PAGE_OFFSET (0xC0000000UL) +#ifdef CONFIG_05GB +#define __PAGE_OFFSET (0xE0000000UL) +#elif defined(CONFIG_1GB) +#define __PAGE_OFFSET (0xC0000000UL) +#elif defined(CONFIG_2GB) +#define __PAGE_OFFSET (0x80000000UL) +#elif defined(CONFIG_3GB) +#define __PAGE_OFFSET (0x40000000UL) +#endif #endif diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/param.h 999-mjb/include/asm-i386/param.h --- 000-virgin/include/asm-i386/param.h 2002-12-09 18:45:45.000000000 -0800 +++ 999-mjb/include/asm-i386/param.h 2003-10-02 16:39:36.000000000 -0700 @@ -2,11 +2,19 @@ #define _ASMi386_PARAM_H #ifdef __KERNEL__ -# define HZ 1000 /* Internal kernel timer frequency */ -# define USER_HZ 100 /* .. some user interfaces are in "ticks" */ -# define CLOCKS_PER_SEC (USER_HZ) /* like times() */ +#include + +#ifdef CONFIG_1000HZ +# define HZ 1000 /* Internal kernel timer frequency */ +#else +# define HZ 100 #endif +#define USER_HZ 100 /* .. some user interfaces are in "ticks" */ +#define CLOCKS_PER_SEC (USER_HZ) /* like times() */ + +#endif /* __KERNEL__ */ + #ifndef HZ #define HZ 100 #endif diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/processor.h 999-mjb/include/asm-i386/processor.h --- 000-virgin/include/asm-i386/processor.h 2003-10-01 11:48:22.000000000 -0700 +++ 999-mjb/include/asm-i386/processor.h 2003-10-02 16:39:38.000000000 -0700 @@ -299,7 +299,11 @@ extern unsigned int mca_pentium_flag; /* This decides where the kernel will search for a free chunk of vm * space during mmap's. 
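
The page.h and processor.h hunks above make the user/kernel split of the 4 GB
virtual address space configurable. The CONFIG_*GB names describe the
kernel's share; since TASK_SIZE equals PAGE_OFFSET on i386, the user portion
and the mmap() search start work out as follows:

	/* Worked values (TASK_SIZE == PAGE_OFFSET on stock i386):
	 *
	 *	CONFIG_3GB	__PAGE_OFFSET 0x40000000   user space 1 GB
	 *	CONFIG_2GB	__PAGE_OFFSET 0x80000000   user space 2 GB
	 *	CONFIG_1GB	__PAGE_OFFSET 0xC0000000   user space 3 GB (default)
	 *	CONFIG_05GB	__PAGE_OFFSET 0xE0000000   user space 3.5 GB
	 *
	 * TASK_UNMAPPED_BASE = PAGE_ALIGN(TASK_SIZE / 3), i.e. 0x40000000 for
	 * the default 3 GB split.  With CONFIG_05GB the divisor becomes 16
	 * (0xE0000000 / 16 = 0x0E000000), keeping the mmap search start low,
	 * presumably so most of the enlarged user space stays usable for
	 * mappings.
	 */
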
*/ +#ifdef CONFIG_05GB +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 16)) +#else #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) +#endif /* * Size of io_bitmap, covering ports 0 to 0x3ff. diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/rwlock.h 999-mjb/include/asm-i386/rwlock.h --- 000-virgin/include/asm-i386/rwlock.h 2002-12-09 18:46:25.000000000 -0800 +++ 999-mjb/include/asm-i386/rwlock.h 2003-10-02 16:39:42.000000000 -0700 @@ -20,28 +20,52 @@ #define RW_LOCK_BIAS 0x01000000 #define RW_LOCK_BIAS_STR "0x01000000" -#define __build_read_lock_ptr(rw, helper) \ - asm volatile(LOCK "subl $1,(%0)\n\t" \ - "js 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tcall " helper "\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - ::"a" (rw) : "memory") - -#define __build_read_lock_const(rw, helper) \ - asm volatile(LOCK "subl $1,%0\n\t" \ - "js 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tpushl %%eax\n\t" \ - "leal %0,%%eax\n\t" \ - "call " helper "\n\t" \ - "popl %%eax\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - :"=m" (*(volatile int *)rw) : : "memory") +#ifdef CONFIG_SPINLINE + + #define __build_read_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $1,(%0)\n\t" \ + "jns 1f\n\t" \ + "call " helper "\n\t" \ + "1:\t" \ + ::"a" (rw) : "memory") + + #define __build_read_lock_const(rw, helper) \ + asm volatile(LOCK "subl $1,%0\n\t" \ + "jns 1f\n\t" \ + "pushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "1:\t" \ + :"=m" (*(volatile int *)rw) : : "memory") + +#else /* !CONFIG_SPINLINE */ + + #define __build_read_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $1,(%0)\n\t" \ + "js 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tcall " helper "\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + ::"a" (rw) : "memory") + + #define __build_read_lock_const(rw, helper) \ + asm volatile(LOCK "subl $1,%0\n\t" \ + "js 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tpushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + :"=m" (*(volatile int *)rw) : : "memory") + +#endif /* CONFIG_SPINLINE */ + #define __build_read_lock(rw, helper) do { \ if (__builtin_constant_p(rw)) \ @@ -50,28 +74,51 @@ __build_read_lock_ptr(rw, helper); \ } while (0) -#define __build_write_lock_ptr(rw, helper) \ - asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ - "jnz 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tcall " helper "\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - ::"a" (rw) : "memory") - -#define __build_write_lock_const(rw, helper) \ - asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ - "jnz 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tpushl %%eax\n\t" \ - "leal %0,%%eax\n\t" \ - "call " helper "\n\t" \ - "popl %%eax\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - :"=m" (*(volatile int *)rw) : : "memory") +#ifdef CONFIG_SPINLINE + + #define __build_write_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ + "jz 1f\n\t" \ + "call " helper "\n\t" \ + "1:\n" \ + ::"a" (rw) : "memory") + + #define __build_write_lock_const(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ + "jz 1f\n\t" \ + "pushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "1:\n" \ + :"=m" (*(volatile int *)rw) : : "memory") + +#else /* !CONFIG_SPINLINE */ + + #define __build_write_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ + "jnz 2f\n" \ + "1:\n" \ + 
LOCK_SECTION_START("") \ + "2:\tcall " helper "\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + ::"a" (rw) : "memory") + + #define __build_write_lock_const(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ + "jnz 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tpushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + :"=m" (*(volatile int *)rw) : : "memory") + +#endif /* CONFIG_SPINLINE */ #define __build_write_lock(rw, helper) do { \ if (__builtin_constant_p(rw)) \ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/spinlock.h 999-mjb/include/asm-i386/spinlock.h --- 000-virgin/include/asm-i386/spinlock.h 2003-06-05 14:56:10.000000000 -0700 +++ 999-mjb/include/asm-i386/spinlock.h 2003-10-02 16:39:44.000000000 -0700 @@ -43,18 +43,35 @@ typedef struct { #define spin_is_locked(x) (*(volatile signed char *)(&(x)->lock) <= 0) #define spin_unlock_wait(x) do { barrier(); } while(spin_is_locked(x)) -#define spin_lock_string \ - "\n1:\t" \ - "lock ; decb %0\n\t" \ - "js 2f\n" \ - LOCK_SECTION_START("") \ - "2:\t" \ - "rep;nop\n\t" \ - "cmpb $0,%0\n\t" \ - "jle 2b\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END +#ifdef CONFIG_SPINLINE + #define spin_lock_string \ + "\n1:\t" \ + "lock ; decb %0\n\t" \ + "js 2f\n" \ + "jmp 3f\n" \ + "2:\t" \ + "rep;nop\n\t" \ + "cmpb $0,%0\n\t" \ + "jle 2b\n\t" \ + "jmp 1b\n" \ + "3:\t" + +#else /* !CONFIG_SPINLINE */ + + #define spin_lock_string \ + "\n1:\t" \ + "lock ; decb %0\n\t" \ + "js 2f\n" \ + LOCK_SECTION_START("") \ + "2:\t" \ + "rep;nop\n\t" \ + "cmpb $0,%0\n\t" \ + "jle 2b\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END + +#endif /* CONFIG_SPINLINE */ /* * This works. Despite all the confusion. * (except on PPro SMP or if we are using OOSTORE) @@ -138,6 +155,11 @@ here: */ typedef struct { volatile unsigned int lock; +#if CONFIG_LOCKMETER + /* required for LOCKMETER since all bits in lock are used */ + /* and we need this storage for CPU and lock INDEX */ + unsigned lockmeter_magic; +#endif #ifdef CONFIG_DEBUG_SPINLOCK unsigned magic; #endif @@ -145,11 +167,19 @@ typedef struct { #define RWLOCK_MAGIC 0xdeaf1eed +#ifdef CONFIG_LOCKMETER +#if CONFIG_DEBUG_SPINLOCK +#define RWLOCK_MAGIC_INIT , 0, RWLOCK_MAGIC +#else +#define RWLOCK_MAGIC_INIT , 0 +#endif +#else /* !CONFIG_LOCKMETER */ #ifdef CONFIG_DEBUG_SPINLOCK #define RWLOCK_MAGIC_INIT , RWLOCK_MAGIC #else #define RWLOCK_MAGIC_INIT /* */ #endif +#endif /* !CONFIG_LOCKMETER */ #define RW_LOCK_UNLOCKED (rwlock_t) { RW_LOCK_BIAS RWLOCK_MAGIC_INIT } @@ -196,4 +226,58 @@ static inline int _raw_write_trylock(rwl return 0; } +#ifdef CONFIG_LOCKMETER +static inline int _raw_read_trylock(rwlock_t *lock) +{ +/* FIXME -- replace with assembler */ + atomic_t *count = (atomic_t *)lock; + atomic_dec(count); + if (count->counter > 0) + return 1; + atomic_inc(count); + return 0; +} +#endif + +#if defined(CONFIG_LOCKMETER) && defined(CONFIG_HAVE_DEC_LOCK) +extern void _metered_spin_lock (spinlock_t *lock); +extern void _metered_spin_unlock(spinlock_t *lock); + +/* + * Matches what is in arch/i386/lib/dec_and_lock.c, except this one is + * "static inline" so that the spin_lock(), if actually invoked, is charged + * against the real caller, not against the catch-all atomic_dec_and_lock + */ +static inline int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +{ + int counter; + int newcount; + +repeat: + counter = atomic_read(atomic); + newcount = counter-1; + + if (!newcount) + goto slow_path; + + asm volatile("lock; 
cmpxchgl %1,%2" + :"=a" (newcount) + :"r" (newcount), "m" (atomic->counter), "0" (counter)); + + /* If the above failed, "eax" will have changed */ + if (newcount != counter) + goto repeat; + return 0; + +slow_path: + _metered_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + _metered_spin_unlock(lock); + return 0; +} + +#define ATOMIC_DEC_AND_LOCK +#endif + #endif /* __ASM_SPINLOCK_H */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/unistd.h 999-mjb/include/asm-i386/unistd.h --- 000-virgin/include/asm-i386/unistd.h 2003-10-01 11:41:15.000000000 -0700 +++ 999-mjb/include/asm-i386/unistd.h 2003-10-02 16:41:14.000000000 -0700 @@ -228,7 +228,7 @@ #define __NR_madvise1 219 /* delete when C lib stub is removed */ #define __NR_getdents64 220 #define __NR_fcntl64 221 -/* 223 is unused */ +#define __NR_mbind 223 #define __NR_gettid 224 #define __NR_readahead 225 #define __NR_setxattr 226 diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-ia64/lockmeter.h 999-mjb/include/asm-ia64/lockmeter.h --- 000-virgin/include/asm-ia64/lockmeter.h 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/include/asm-ia64/lockmeter.h 2003-10-02 16:39:44.000000000 -0700 @@ -0,0 +1,72 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + */ + +#ifndef _IA64_LOCKMETER_H +#define _IA64_LOCKMETER_H + +#ifdef local_cpu_data +#define CPU_CYCLE_FREQUENCY local_cpu_data->itc_freq +#else +#define CPU_CYCLE_FREQUENCY my_cpu_data.itc_freq +#endif +#define get_cycles64() get_cycles() + +#define THIS_CPU_NUMBER smp_processor_id() + +/* + * macros to cache and retrieve an index value inside of a lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. + * we also assume that the hash table has less than 32767 entries. 
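
The spinlock.h hunk above supplies a lockmetered atomic_dec_and_lock() as a
static inline so that any spin_lock() taken on the slow path is charged to the
real caller instead of to the shared library routine. Its contract is the
usual one: it returns 1 with the lock held exactly when the decrement brings
the counter to zero, and 0 without touching the lock otherwise. A generic
caller sketch (the object and lock names are made up for illustration, not
taken from this patch):

	/* Classic refcount-drop idiom built on atomic_dec_and_lock(). */
	static void put_object(struct my_object *obj)
	{
		if (atomic_dec_and_lock(&obj->refcount, &object_list_lock)) {
			/* count reached zero: the list lock is now held */
			list_del(&obj->list);
			spin_unlock(&object_list_lock);
			kfree(obj);
		}
		/* otherwise the count is still positive and no lock was taken */
	}
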
+ */ +/* + * instrumented spinlock structure -- never used to allocate storage + * only used in macros below to overlay a spinlock_t + */ +typedef struct inst_spinlock_s { + /* remember, Intel is little endian */ + volatile unsigned short lock; + volatile unsigned short index; +} inst_spinlock_t; +#define PUT_INDEX(lock_ptr,indexv) ((inst_spinlock_t *)(lock_ptr))->index = indexv +#define GET_INDEX(lock_ptr) ((inst_spinlock_t *)(lock_ptr))->index + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int read_counter:31; + volatile int write_lock:1; + volatile unsigned short index; + volatile unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return the number of readers for a rwlock_t + */ +#define RWLOCK_READERS(rwlock_ptr) ((rwlock_ptr)->read_counter) + +/* + * return true if rwlock is write locked + * (note that other lock attempts can cause the lock value to be negative) + */ +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->write_lock) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((rwlock_ptr)->read_counter) + +#endif /* _IA64_LOCKMETER_H */ + diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-ia64/spinlock.h 999-mjb/include/asm-ia64/spinlock.h --- 000-virgin/include/asm-ia64/spinlock.h 2003-10-01 11:48:23.000000000 -0700 +++ 999-mjb/include/asm-ia64/spinlock.h 2003-10-02 16:39:44.000000000 -0700 @@ -190,4 +190,25 @@ do { \ clear_bit(31, (x)); \ }) +#ifdef CONFIG_LOCKMETER +extern void _metered_spin_lock (spinlock_t *lock); +extern void _metered_spin_unlock(spinlock_t *lock); + +/* + * Use a less efficient, and inline, atomic_dec_and_lock() if lockmetering + * so we can see the callerPC of who is actually doing the spin_lock(). + * Otherwise, all we see is the generic rollup of all locks done by + * atomic_dec_and_lock(). + */ +static inline int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +{ + _metered_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + _metered_spin_unlock(lock); + return 0; +} +#define ATOMIC_DEC_AND_LOCK +#endif + #endif /* _ASM_IA64_SPINLOCK_H */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-mips/lockmeter.h 999-mjb/include/asm-mips/lockmeter.h --- 000-virgin/include/asm-mips/lockmeter.h 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/include/asm-mips/lockmeter.h 2003-10-02 16:39:44.000000000 -0700 @@ -0,0 +1,126 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + * Ported to mips32 for Asita Technologies + * by D.J. 
Barrow ( dj.barrow@asitatechnologies.com ) + */ +#ifndef _ASM_LOCKMETER_H +#define _ASM_LOCKMETER_H + +/* do_gettimeoffset is a function pointer on mips */ +/* & it is not included by */ +#include +#include +#include + +#define SPINLOCK_MAGIC_INIT /* */ + +#define CPU_CYCLE_FREQUENCY get_cpu_cycle_frequency() + +#define THIS_CPU_NUMBER smp_processor_id() + +static uint32_t cpu_cycle_frequency = 0; + +static uint32_t get_cpu_cycle_frequency(void) +{ + /* a total hack, slow and invasive, but ... it works */ + int sec; + uint32_t start_cycles; + struct timeval tv; + + if (cpu_cycle_frequency == 0) { /* uninitialized */ + do_gettimeofday(&tv); + sec = tv.tv_sec; /* set up to catch the tv_sec rollover */ + while (sec == tv.tv_sec) { do_gettimeofday(&tv); } + sec = tv.tv_sec; /* rolled over to a new sec value */ + start_cycles = get_cycles(); + while (sec == tv.tv_sec) { do_gettimeofday(&tv); } + cpu_cycle_frequency = get_cycles() - start_cycles; + } + + return cpu_cycle_frequency; +} + +extern struct timeval xtime; + +static uint64_t get_cycles64(void) +{ + static uint64_t last_get_cycles64 = 0; + uint64_t ret; + unsigned long sec; + unsigned long usec, usec_offset; + +again: + sec = xtime.tv_sec; + usec = xtime.tv_usec; + usec_offset = do_gettimeoffset(); + if ((xtime.tv_sec != sec) || + (xtime.tv_usec != usec)|| + (usec_offset >= 20000)) + goto again; + + ret = ((uint64_t)(usec + usec_offset) * cpu_cycle_frequency); + /* We can't do a normal 64 bit division on mips without libgcc.a */ + do_div(ret,1000000); + ret += ((uint64_t)sec * cpu_cycle_frequency); + + /* XXX why does time go backwards? do_gettimeoffset? general time adj? */ + if (ret <= last_get_cycles64) + ret = last_get_cycles64+1; + last_get_cycles64 = ret; + + return ret; +} + +/* + * macros to cache and retrieve an index value inside of a lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. + * we also assume that the hash table has less than 32767 entries. + * the high order bit is used for write locking a rw_lock + */ +#define INDEX_MASK 0x7FFF0000 +#define READERS_MASK 0x0000FFFF +#define INDEX_SHIFT 16 +#define PUT_INDEX(lockp,index) \ + lockp->lock = (((lockp->lock) & ~INDEX_MASK) | (index) << INDEX_SHIFT) +#define GET_INDEX(lockp) \ + (((lockp->lock) & INDEX_MASK) >> INDEX_SHIFT) + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int lock; + unsigned short index; + unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return the number of readers for a rwlock_t + */ +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) + +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + int tmp = (int) rwlock_ptr->lock; + return (tmp >= 0) ? 
tmp : 0; +} + +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock < 0) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock > 0) + +#endif /* _ASM_LOCKMETER_H */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-mips/spinlock.h 999-mjb/include/asm-mips/spinlock.h --- 000-virgin/include/asm-mips/spinlock.h 2003-07-02 14:44:56.000000000 -0700 +++ 999-mjb/include/asm-mips/spinlock.h 2003-10-02 16:39:44.000000000 -0700 @@ -91,9 +91,18 @@ static inline unsigned int _raw_spin_try typedef struct { volatile unsigned int lock; +#if CONFIG_LOCKMETER + /* required for LOCKMETER since all bits in lock are used */ + /* and we need this storage for CPU and lock INDEX */ + unsigned lockmeter_magic; +#endif } rwlock_t; +#ifdef CONFIG_LOCKMETER +#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0 } +#else #define RW_LOCK_UNLOCKED (rwlock_t) { 0 } +#endif #define rwlock_init(x) do { *(x) = RW_LOCK_UNLOCKED; } while(0) diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-mips64/lockmeter.h 999-mjb/include/asm-mips64/lockmeter.h --- 000-virgin/include/asm-mips64/lockmeter.h 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/include/asm-mips64/lockmeter.h 2003-10-02 16:39:44.000000000 -0700 @@ -0,0 +1,120 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + */ + +#ifndef _ASM_LOCKMETER_H +#define _ASM_LOCKMETER_H + +#include + +#define SPINLOCK_MAGIC_INIT /* */ + +#define CPU_CYCLE_FREQUENCY get_cpu_cycle_frequency() + +#define THIS_CPU_NUMBER smp_processor_id() + +static uint32_t cpu_cycle_frequency = 0; + +static uint32_t get_cpu_cycle_frequency(void) +{ + /* a total hack, slow and invasive, but ... it works */ + int sec; + uint32_t start_cycles; + struct timeval tv; + + if (cpu_cycle_frequency == 0) { /* uninitialized */ + do_gettimeofday(&tv); + sec = tv.tv_sec; /* set up to catch the tv_sec rollover */ + while (sec == tv.tv_sec) { do_gettimeofday(&tv); } + sec = tv.tv_sec; /* rolled over to a new sec value */ + start_cycles = get_cycles(); + while (sec == tv.tv_sec) { do_gettimeofday(&tv); } + cpu_cycle_frequency = get_cycles() - start_cycles; + } + + return cpu_cycle_frequency; +} + +extern struct timeval xtime; +extern long do_gettimeoffset(void); + +static uint64_t get_cycles64(void) +{ + static uint64_t last_get_cycles64 = 0; + uint64_t ret; + unsigned long sec; + unsigned long usec, usec_offset; + +again: + sec = xtime.tv_sec; + usec = xtime.tv_usec; + usec_offset = do_gettimeoffset(); + if ((xtime.tv_sec != sec) || + (xtime.tv_usec != usec)|| + (usec_offset >= 20000)) + goto again; + + ret = ((uint64_t)sec * cpu_cycle_frequency) + + ( ((uint64_t)(usec + usec_offset) * cpu_cycle_frequency) / 1000000 ); + + /* XXX why does time go backwards? do_gettimeoffset? general time adj? */ + if (ret <= last_get_cycles64) + ret = last_get_cycles64+1; + last_get_cycles64 = ret; + + return ret; +} + +/* + * macros to cache and retrieve an index value inside of a lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. + * we also assume that the hash table has less than 32767 entries. 
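
The mips and mips64 lockmeter headers above calibrate cpu_cycle_frequency once
(by counting cycles across a tv_sec rollover) and then build a monotonic
64-bit cycle counter out of xtime and do_gettimeoffset(). Hold and wait
statistics are accumulated in these cycle units and can be converted back to
wall-clock time with the cycles-per-second value the interface exports; the
conversion is plain arithmetic, sketched here (not code from the patch):

	/* Convert a measured cycle delta into microseconds. */
	uint64_t start = get_cycles64();
	/* ... critical section being measured ... */
	uint64_t delta = get_cycles64() - start;
	uint64_t usecs = (delta * 1000000ULL) / CPU_CYCLE_FREQUENCY;
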
+ * the high order bit is used for write locking a rw_lock + */ +#define INDEX_MASK 0x7FFF0000 +#define READERS_MASK 0x0000FFFF +#define INDEX_SHIFT 16 +#define PUT_INDEX(lockp,index) \ + lockp->lock = (((lockp->lock) & ~INDEX_MASK) | (index) << INDEX_SHIFT) +#define GET_INDEX(lockp) \ + (((lockp->lock) & INDEX_MASK) >> INDEX_SHIFT) + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int lock; + unsigned short index; + unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return the number of readers for a rwlock_t + */ +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) + +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + int tmp = (int) rwlock_ptr->lock; + return (tmp >= 0) ? tmp : 0; +} + +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock < 0) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock > 0) + +#endif /* _ASM_LOCKMETER_H */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-sparc64/lockmeter.h 999-mjb/include/asm-sparc64/lockmeter.h --- 000-virgin/include/asm-sparc64/lockmeter.h 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/include/asm-sparc64/lockmeter.h 2003-10-02 16:39:44.000000000 -0700 @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2000 Anton Blanchard (anton@linuxcare.com) + */ + +#ifndef _SPARC64_LOCKMETER_H +#define _SPARC64_LOCKMETER_H + +#include + +#include + +extern unsigned long cpu_hz; +#define CPU_CYCLE_FREQUENCY cpu_hz + +#define THIS_CPU_NUMBER __cpu_number_map[smp_processor_id()] + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0) +#define local_irq_save(x) __save_and_cli(x) +#define local_irq_restore(x) __restore_flags(x) +#endif /* Linux version 2.2.x */ + +#define PUT_INDEX(lock_ptr,indexv) (lock_ptr)->index = (indexv) +#define GET_INDEX(lock_ptr) (lock_ptr)->index + +#define PUT_RWINDEX(rwlock_ptr,indexv) (rwlock_ptr)->index = (indexv) +#define GET_RWINDEX(rwlock_ptr) (rwlock_ptr)->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) (rwlock_ptr)->cpu = (cpuv) +#define GET_RW_CPU(rwlock_ptr) (rwlock_ptr)->cpu + +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) + +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + signed int tmp = rwlock_ptr->lock; + + if (tmp > 0) + return tmp; + else + return 0; +} + +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((signed int)((rwlock_ptr)->lock) < 0) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((signed int)((rwlock_ptr)->lock) > 0) + +#define get_cycles64() get_cycles() + +#endif /* _SPARC64_LOCKMETER_H */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-sparc64/spinlock.h 999-mjb/include/asm-sparc64/spinlock.h --- 000-virgin/include/asm-sparc64/spinlock.h 2002-12-09 18:45:48.000000000 -0800 +++ 999-mjb/include/asm-sparc64/spinlock.h 2003-10-02 16:39:44.000000000 -0700 @@ -30,15 +30,23 @@ #ifndef CONFIG_DEBUG_SPINLOCK -typedef unsigned char spinlock_t; -#define SPIN_LOCK_UNLOCKED 0 +typedef struct { + 
unsigned char lock; + unsigned int index; +} spinlock_t; -#define spin_lock_init(lock) (*((unsigned char *)(lock)) = 0) -#define spin_is_locked(lock) (*((volatile unsigned char *)(lock)) != 0) +#ifdef CONFIG_LOCKMETER +#define SPIN_LOCK_UNLOCKED (spinlock_t) {0, 0} +#else +#define SPIN_LOCK_UNLOCKED (spinlock_t) { 0 } +#endif -#define spin_unlock_wait(lock) \ +#define spin_lock_init(__lock) do { *(__lock) = SPIN_LOCK_UNLOCKED; } while(0) +#define spin_is_locked(__lock) (*((volatile unsigned char *)(&((__lock)->lock))) != 0) + +#define spin_unlock_wait(__lock) \ do { membar("#LoadLoad"); \ -} while(*((volatile unsigned char *)lock)) +} while(*((volatile unsigned char *)(&(((spinlock_t *)__lock)->lock)))) static __inline__ void _raw_spin_lock(spinlock_t *lock) { @@ -109,8 +117,20 @@ extern int _spin_trylock (spinlock_t *lo #ifndef CONFIG_DEBUG_SPINLOCK -typedef unsigned int rwlock_t; -#define RW_LOCK_UNLOCKED 0 +#ifdef CONFIG_LOCKMETER +typedef struct { + unsigned int lock; + unsigned int index; + unsigned int cpu; +} rwlock_t; +#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0, 0xff } +#else +typedef struct { + unsigned int lock; +} rwlock_t; +#define RW_LOCK_UNLOCKED (rwlock_t) { 0 } +#endif + #define rwlock_init(lp) do { *(lp) = RW_LOCK_UNLOCKED; } while(0) #define rwlock_is_locked(x) (*(x) != RW_LOCK_UNLOCKED) diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-x86_64/early_printk.h 999-mjb/include/asm-x86_64/early_printk.h --- 000-virgin/include/asm-x86_64/early_printk.h 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/include/asm-x86_64/early_printk.h 2003-10-02 16:39:35.000000000 -0700 @@ -0,0 +1,8 @@ +#ifndef __X86_EARLY_PRINTK_H_X86_64_ +#define __X86_EARLY_PRINTK_H_X86_64_ + +#define VGABASE 0xffffffff800b8000UL +#define SERIAL_BASES { 0x3f8, 0x2f8 } +#define SERIAL_BASES_LEN 2 + +#endif diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/early_printk.h 999-mjb/include/linux/early_printk.h --- 000-virgin/include/linux/early_printk.h 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/include/linux/early_printk.h 2003-10-02 16:39:35.000000000 -0700 @@ -0,0 +1,47 @@ +#ifndef __X86_EARLY_PRINTK_H_ +#define __X86_EARLY_PRINTK_H_ + +#ifdef CONFIG_X86_EARLY_PRINTK +#include +#include +#include +#include +#include +#include + +/* Simple VGA output */ + +#define MAX_YPOS 25 +#define MAX_XPOS 80 + +/* Simple serial port output */ + +#define DEFAULT_BAUD 57600 +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ +#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define DLL 0 /* Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + + +void early_printk(const char *fmt, ...); +int __init setup_early_printk(); + +#else + +#define early_printk(...) 
do {} while(0) +#define setup_early_printk() do {} while(0) + +#endif + +#endif diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/fs.h 999-mjb/include/linux/fs.h --- 000-virgin/include/linux/fs.h 2003-10-01 11:48:25.000000000 -0700 +++ 999-mjb/include/linux/fs.h 2003-10-02 16:53:55.000000000 -0700 @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include struct iovec; @@ -315,11 +317,29 @@ struct address_space_operations { loff_t offset, unsigned long nr_segs); }; +#if NR_CPUS > 8 +typedef rwlock_t mapping_rwlock_t; +#define mapping_rdlock(lock) read_lock(lock) +#define mapping_rdunlock(lock) read_unlock(lock) +#define mapping_wrlock(lock) write_lock(lock) +#define mapping_wrunlock(lock) write_unlock(lock) +#define mapping_rwlock_init(lock) rwlock_init(lock) +#define MAPPING_RW_LOCK_UNLOCKED RW_LOCK_UNLOCKED +#else +typedef spinlock_t mapping_rwlock_t; +#define mapping_rdlock(lock) spin_lock(lock) +#define mapping_rdunlock(lock) spin_unlock(lock) +#define mapping_wrlock(lock) spin_lock(lock) +#define mapping_wrunlock(lock) spin_unlock(lock) +#define mapping_rwlock_init(lock) spin_lock_init(lock) +#define MAPPING_RW_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED +#endif + struct backing_dev_info; struct address_space { struct inode *host; /* owner: inode, block_device */ struct radix_tree_root page_tree; /* radix tree of all pages */ - spinlock_t page_lock; /* and spinlock protecting it */ + mapping_rwlock_t page_lock; /* and spinlock protecting it */ struct list_head clean_pages; /* list of clean pages */ struct list_head dirty_pages; /* list of dirty pages */ struct list_head locked_pages; /* list of locked pages */ @@ -336,6 +356,9 @@ struct address_space { spinlock_t private_lock; /* for use by the address_space */ struct list_head private_list; /* ditto */ struct address_space *assoc_mapping; /* ditto */ +#ifdef CONFIG_NUMA + struct binding *binding; /* for memory bindings */ +#endif }; struct block_device { diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/gfp.h 999-mjb/include/linux/gfp.h --- 000-virgin/include/linux/gfp.h 2003-10-01 11:41:17.000000000 -0700 +++ 999-mjb/include/linux/gfp.h 2003-10-02 16:44:09.000000000 -0700 @@ -32,6 +32,7 @@ #define __GFP_NOFAIL 0x800 /* Retry for ever. Cannot fail */ #define __GFP_NORETRY 0x1000 /* Do not retry. Might fail */ #define __GFP_NO_GROW 0x2000 /* Slab internal usage */ +#define __GFP_NODE_STRICT 0x4000 /* Do not fall back to other nodes */ #define __GFP_BITS_SHIFT 16 /* Room for 16 __GFP_FOO bits */ #define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1) @@ -69,7 +70,7 @@ static inline struct page * alloc_pages_ if (unlikely(order >= MAX_ORDER)) return NULL; - return __alloc_pages(gfp_mask, order, NODE_DATA(nid)->node_zonelists + (gfp_mask & GFP_ZONEMASK)); + return __alloc_pages(gfp_mask, order, get_node_zonelist(nid, gfp_mask)); } #define alloc_pages(gfp_mask, order) \ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/lockmeter.h 999-mjb/include/linux/lockmeter.h --- 000-virgin/include/linux/lockmeter.h 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/include/linux/lockmeter.h 2003-10-02 16:39:44.000000000 -0700 @@ -0,0 +1,320 @@ +/* + * Copyright (C) 1999-2002 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + * + * Modified by Ray Bryant (raybry@us.ibm.com) Feb-Apr 2000 + * Changes Copyright (C) 2000 IBM, Inc. 
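
The fs.h hunk above hides address_space.page_lock behind mapping_rwlock_t: on
configurations with more than 8 CPUs it becomes a rwlock_t so page-cache
lookups can run in parallel, while smaller systems keep the plain spinlock and
pay no extra cost. The write-side conversions appear in the buffer.c,
fs-writeback.c and mpage.c hunks earlier in the patch; a read-side user would
follow the usual find_get_page() pattern, sketched below with mapping and
index assumed to be in scope:

	/* Reader-side page-cache lookup under the new wrappers. */
	struct page *page;

	mapping_rdlock(&mapping->page_lock);
	page = radix_tree_lookup(&mapping->page_tree, index);
	if (page)
		page_cache_get(page);	/* pin it before dropping the lock */
	mapping_rdunlock(&mapping->page_lock);
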
+ * Added save of index in spinlock_t to improve efficiency + * of "hold" time reporting for spinlocks + * Added support for hold time statistics for read and write + * locks. + * Moved machine dependent code to include/asm/lockmeter.h. + * + */ + +#ifndef _LINUX_LOCKMETER_H +#define _LINUX_LOCKMETER_H + + +/*--------------------------------------------------- + * architecture-independent lockmeter.h + *-------------------------------------------------*/ + +/* + * raybry -- version 2: added efficient hold time statistics + * requires lstat recompile, so flagged as new version + * raybry -- version 3: added global reader lock data + * hawkes -- version 4: removed some unnecessary fields to simplify mips64 port + */ +#define LSTAT_VERSION 5 + +int lstat_update(void*, void*, int); +int lstat_update_time(void*, void*, int, uint32_t); + +/* + * Currently, the mips64 and sparc64 kernels talk to a 32-bit lockstat, so we + * need to force compatibility in the inter-communication data structure. + */ + +#if defined(CONFIG_MIPS32_COMPAT) +#define TIME_T uint32_t +#elif defined(CONFIG_SPARC32_COMPAT) +#define TIME_T uint64_t +#else +#define TIME_T time_t +#endif + +#if defined(__KERNEL__) || (!defined(CONFIG_MIPS32_COMPAT) && !defined(CONFIG_SPARC32_COMPAT)) || (_MIPS_SZLONG==32) +#define POINTER void * +#else +#define POINTER int64_t +#endif + +/* + * Values for the "action" parameter passed to lstat_update. + * ZZZ - do we want a try-success status here??? + */ +#define LSTAT_ACT_NO_WAIT 0 +#define LSTAT_ACT_SPIN 1 +#define LSTAT_ACT_REJECT 2 +#define LSTAT_ACT_WW_SPIN 3 +#define LSTAT_ACT_SLEPT 4 /* UNUSED */ + +#define LSTAT_ACT_MAX_VALUES 4 /* NOTE: Increase to 5 if use ACT_SLEPT */ + +/* + * Special values for the low 2 bits of an RA passed to + * lstat_update. + */ +/* we use these values to figure out what kind of lock data */ +/* is stored in the statistics table entry at index ....... */ +#define LSTAT_RA_SPIN 0 /* spin lock data */ +#define LSTAT_RA_READ 1 /* read lock statistics */ +#define LSTAT_RA_SEMA 2 /* RESERVED */ +#define LSTAT_RA_WRITE 3 /* write lock statistics*/ + +#define LSTAT_RA(n) \ + ((void*)( ((unsigned long)__builtin_return_address(0) & ~3) | n) ) + +/* + * Constants used for lock addresses in the lstat_directory + * to indicate special values of the lock address. + */ +#define LSTAT_MULTI_LOCK_ADDRESS NULL + +/* + * Maximum size of the lockstats tables. Increase this value + * if its not big enough. (Nothing bad happens if its not + * big enough although some locks will not be monitored.) + * We record overflows of this quantity in lstat_control.dir_overflows + * + * Note: The max value here must fit into the field set + * and obtained by the macro's PUT_INDEX() and GET_INDEX(). + * This value depends on how many bits are available in the + * lock word in the particular machine implementation we are on. + */ +#define LSTAT_MAX_STAT_INDEX 2000 + +/* + * Size and mask for the hash table into the directory. + */ +#define LSTAT_HASH_TABLE_SIZE 4096 /* must be 2**N */ +#define LSTAT_HASH_TABLE_MASK (LSTAT_HASH_TABLE_SIZE-1) + +#define DIRHASH(ra) ((unsigned long)(ra)>>2 & LSTAT_HASH_TABLE_MASK) + +/* + * This defines an entry in the lockstat directory. It contains + * information about a lock being monitored. + * A directory entry only contains the lock identification - + * counts on usage of the lock are kept elsewhere in a per-cpu + * data structure to minimize cache line pinging. 
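
lockmeter identifies a lock acquisition by the return address of the caller:
LSTAT_RA() overwrites the low two bits of that address with a tag saying
whether the event was a spin, read or write lock, and DIRHASH() folds the
address into the 4096-entry hash table that fronts the lock directory (the
right shift by 2 discards the tag bits again). A worked example with the
macros as defined above:

	/* Caller return address 0xc0123458 taking a write lock:
	 *
	 *	LSTAT_RA(LSTAT_RA_WRITE) = (0xc0123458 & ~3) | 3 = 0xc012345b
	 *	DIRHASH(0xc0123458)	 = (0xc0123458 >> 2) & 0xfff = 0xd16
	 *
	 * so hashtab[] slot 0xd16 leads to this call site's
	 * lstat_directory_entry_t.
	 */
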
+ */ +typedef struct { + POINTER caller_ra; /* RA of code that set lock */ + POINTER lock_ptr; /* lock address */ + ushort next_stat_index; /* Used to link multiple locks that have the same hash table value */ +} lstat_directory_entry_t; + +/* + * A multi-dimensioned array used to contain counts for lock accesses. + * The array is 3-dimensional: + * - CPU number. Keep from thrashing cache lines between CPUs + * - Directory entry index. Identifies the lock + * - Action. Indicates what kind of contention occurred on an + * access to the lock. + * + * The index of an entry in the directory is the same as the 2nd index + * of the entry in the counts array. + */ +/* + * This table contains data for spin_locks, write locks, and read locks + * Not all data is used for all cases. In particular, the hold time + * information is not stored here for read locks since that is a global + * (e. g. cannot be separated out by return address) quantity. + * See the lstat_read_lock_counts_t structure for the global read lock + * hold time. + */ +typedef struct { + uint64_t cum_wait_ticks; /* sum of wait times */ + /* for write locks, sum of time a */ + /* writer is waiting for a reader */ + int64_t cum_hold_ticks; /* cumulative sum of holds */ + /* not used for read mode locks */ + /* must be signed. ............... */ + uint32_t max_wait_ticks; /* max waiting time */ + uint32_t max_hold_ticks; /* max holding time */ + uint64_t cum_wait_ww_ticks; /* sum times writer waits on writer*/ + uint32_t max_wait_ww_ticks; /* max wait time writer vs writer */ + /* prev 2 only used for write locks*/ + uint32_t acquire_time; /* time lock acquired this CPU */ + uint32_t count[LSTAT_ACT_MAX_VALUES]; +} lstat_lock_counts_t; + +typedef lstat_lock_counts_t lstat_cpu_counts_t[LSTAT_MAX_STAT_INDEX]; + +/* + * User request to: + * - turn statistic collection on/off, or to reset + */ +#define LSTAT_OFF 0 +#define LSTAT_ON 1 +#define LSTAT_RESET 2 +#define LSTAT_RELEASE 3 + +#define LSTAT_MAX_READ_LOCK_INDEX 1000 +typedef struct { + POINTER lock_ptr; /* address of lock for output stats */ + uint32_t read_lock_count; + int64_t cum_hold_ticks; /* sum of read lock hold times over */ + /* all callers. ....................*/ + uint32_t write_index; /* last write lock hash table index */ + uint32_t busy_periods; /* count of busy periods ended this */ + uint64_t start_busy; /* time this busy period started. ..*/ + uint64_t busy_ticks; /* sum of busy periods this lock. ..*/ + uint64_t max_busy; /* longest busy period for this lock*/ + uint32_t max_readers; /* maximum number of readers ...... */ +#ifdef USER_MODE_TESTING + rwlock_t entry_lock; /* lock for this read lock entry... */ + /* avoid having more than one rdr at*/ + /* needed for user space testing... */ + /* not needed for kernel 'cause it */ + /* is non-preemptive. ............. */ +#endif +} lstat_read_lock_counts_t; +typedef lstat_read_lock_counts_t lstat_read_lock_cpu_counts_t[LSTAT_MAX_READ_LOCK_INDEX]; + +#if defined(__KERNEL__) || defined(USER_MODE_TESTING) + +#ifndef USER_MODE_TESTING +#include +#else +#include "asm_newlockmeter.h" +#endif + +/* + * Size and mask for the hash table into the directory. + */ +#define LSTAT_HASH_TABLE_SIZE 4096 /* must be 2**N */ +#define LSTAT_HASH_TABLE_MASK (LSTAT_HASH_TABLE_SIZE-1) + +#define DIRHASH(ra) ((unsigned long)(ra)>>2 & LSTAT_HASH_TABLE_MASK) + +/* + * This version eliminates the per processor lock stack. What we do is to + * store the index of the lock hash structure in unused bits in the lock + * itself. 
Then on unlock we can find the statistics record without doing + * any additional hash or lock stack lookup. This works for spin_locks. + * Hold time reporting is now basically as cheap as wait time reporting + * so we ignore the difference between LSTAT_ON_HOLD and LSTAT_ON_WAIT + * as in version 1.1.* of lockmeter. + * + * For rw_locks, we store the index of a global reader stats structure in + * the lock and the writer index is stored in the latter structure. + * For read mode locks we hash at the time of the lock to find an entry + * in the directory for reader wait time and the like. + * At unlock time for read mode locks, we update just the global structure + * so we don't need to know the reader directory index value at unlock time. + * + */ + +/* + * Protocol to change lstat_control.state + * This is complicated because we don't want the cum_hold_time for + * a rw_lock to be decremented in _read_lock_ without making sure it + * is incremented in _read_lock_ and vice versa. So here is the + * way we change the state of lstat_control.state: + * I. To Turn Statistics On + * After allocating storage, set lstat_control.state non-zero. + * This works because we don't start updating statistics for in use + * locks until the reader lock count goes to zero. + * II. To Turn Statistics Off: + * (0) Disable interrupts on this CPU + * (1) Seize the lstat_control.directory_lock + * (2) Obtain the current value of lstat_control.next_free_read_lock_index + * (3) Store a zero in lstat_control.state. + * (4) Release the lstat_control.directory_lock + * (5) For each lock in the read lock list up to the saved value + * (well, -1) of the next_free_read_lock_index, do the following: + * (a) Check validity of the stored lock address + * by making sure that the word at the saved addr + * has an index that matches this entry. If not + * valid, then skip this entry. + * (b) If there is a write lock already set on this lock, + * skip to (d) below. + * (c) Set a non-metered write lock on the lock + * (d) set the cached INDEX in the lock to zero + * (e) Release the non-metered write lock. + * (6) Re-enable interrupts + * + * These rules ensure that a read lock will not have its statistics + * partially updated even though the global lock recording state has + * changed. See put_lockmeter_info() for implementation. + * + * The reason for (b) is that there may be write locks set on the + * syscall path to put_lockmeter_info() from user space. If we do + * not do this check, then we can deadlock. A similar problem would + * occur if the lock was read locked by the current CPU. At the + * moment this does not appear to happen. + */ + +/* + * Main control structure for lockstat. Used to turn statistics on/off + * and to maintain directory info. + */ +typedef struct { + int state; + spinlock_t control_lock; /* used to serialize turning statistics on/off */ + spinlock_t directory_lock; /* for serialize adding entries to directory */ + volatile int next_free_dir_index;/* next free entry in the directory */ + /* FIXME not all of these fields are used / needed .............. 
*/ + /* the following fields represent data since */ + /* first "lstat on" or most recent "lstat reset" */ + TIME_T first_started_time; /* time when measurement first enabled */ + TIME_T started_time; /* time when measurement last started */ + TIME_T ending_time; /* time when measurement last disabled */ + uint64_t started_cycles64; /* cycles when measurement last started */ + uint64_t ending_cycles64; /* cycles when measurement last disabled */ + uint64_t enabled_cycles64; /* total cycles with measurement enabled */ + int intervals; /* number of measurement intervals recorded */ + /* i. e. number of times did lstat on;lstat off */ + lstat_directory_entry_t *dir; /* directory */ + int dir_overflow; /* count of times ran out of space in directory */ + int rwlock_overflow; /* count of times we couldn't allocate a rw block*/ + ushort *hashtab; /* hash table for quick dir scans */ + lstat_cpu_counts_t *counts[NR_CPUS]; /* Array of pointers to per-cpu stats */ + int next_free_read_lock_index; /* next rwlock reader (global) stats block */ + lstat_read_lock_cpu_counts_t *read_lock_counts[NR_CPUS]; /* per cpu read lock stats */ +} lstat_control_t; + +#endif /* defined(__KERNEL__) || defined(USER_MODE_TESTING) */ + +typedef struct { + short lstat_version; /* version of the data */ + short state; /* the current state is returned */ + int maxcpus; /* Number of cpus present */ + int next_free_dir_index; /* index of the next free directory entry */ + TIME_T first_started_time; /* when measurement enabled for first time */ + TIME_T started_time; /* time in secs since 1969 when stats last turned on */ + TIME_T ending_time; /* time in secs since 1969 when stats last turned off */ + uint32_t cycleval; /* cycles per second */ +#ifdef notyet + void *kernel_magic_addr; /* address of kernel_magic */ + void *kernel_end_addr; /* contents of kernel magic (points to "end") */ +#endif + int next_free_read_lock_index; /* index of next (global) read lock stats struct */ + uint64_t started_cycles64; /* cycles when measurement last started */ + uint64_t ending_cycles64; /* cycles when stats last turned off */ + uint64_t enabled_cycles64; /* total cycles with measurement enabled */ + int intervals; /* number of measurement intervals recorded */ + /* i.e. number of times we did lstat on;lstat off*/ + int dir_overflow; /* number of times we wanted more space in directory */ + int rwlock_overflow; /* # of times we wanted more space in read_locks_count */ + struct new_utsname uts; /* info about machine where stats are measured */ + /* -T option of lockstat allows data to be */ + /* moved to another machine. ................. */ +} lstat_user_request_t; + +#endif /* _LINUX_LOCKMETER_H */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/mm.h 999-mjb/include/linux/mm.h --- 000-virgin/include/linux/mm.h 2003-10-01 11:48:26.000000000 -0700 +++ 999-mjb/include/linux/mm.h 2003-10-02 16:42:18.000000000 -0700 @@ -180,6 +180,7 @@ struct page { struct pte_chain *chain;/* Reverse pte mapping pointer. * protected by PG_chainlock */ pte_addr_t direct; + int mapcount; } pte; unsigned long private; /* mapping-private opaque data */ @@ -616,6 +617,39 @@ extern struct page * follow_page(struct extern int remap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long to, unsigned long size, pgprot_t prot); +/* + * Given a struct page, determine which node's memory it is from. + * TODO: There's probably a more efficient way to do this... 
+ */ +static inline int page_to_nid(struct page *page) +{ + return pfn_to_nid(page_to_pfn(page)); +} + +#ifdef CONFIG_NUMA +static inline void zero_rss(struct mm_struct *mm) +{ + mm->rss = 0; + memset(mm->pernode_rss, 0, MAX_NUMNODES * sizeof(*mm->pernode_rss)); +} + +static inline void inc_rss(struct mm_struct *mm, struct page *page) +{ + mm->rss++; + mm->pernode_rss[page_to_nid(page)]++; +} + +static inline void dec_rss(struct mm_struct *mm, struct page *page) +{ + mm->rss--; + mm->pernode_rss[page_to_nid(page)]--; +} +#else /* !CONFIG_NUMA */ +#define zero_rss(mm) ((mm)->rss = 0) +#define inc_rss(mm, page) ((mm)->rss++) +#define dec_rss(mm, page) ((mm)->rss--) +#endif /* CONFIG_NUMA */ + #ifndef CONFIG_DEBUG_PAGEALLOC static inline void kernel_map_pages(struct page *page, int numpages, int enable) diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/mmzone.h 999-mjb/include/linux/mmzone.h --- 000-virgin/include/linux/mmzone.h 2003-10-01 11:47:13.000000000 -0700 +++ 999-mjb/include/linux/mmzone.h 2003-10-02 16:42:48.000000000 -0700 @@ -307,6 +307,7 @@ extern struct pglist_data contig_page_da #define NODE_DATA(nid) (&contig_page_data) #define NODE_MEM_MAP(nid) mem_map #define MAX_NR_NODES 1 +#define pfn_to_nid(pfn) (0) #else /* CONFIG_DISCONTIGMEM */ #include @@ -369,6 +370,19 @@ static inline unsigned int num_online_me #define num_online_memblks() 1 #endif /* CONFIG_DISCONTIGMEM || CONFIG_NUMA */ + +static inline struct zonelist *get_node_zonelist(int nid, int gfp_mask) +{ + return NODE_DATA(nid)->node_zonelists + (gfp_mask & GFP_ZONEMASK); +} + +#define get_zonelist(gfp_mask) get_node_zonelist(numa_node_id(), gfp_mask) + +/* Structure to keep track of memory segment (VMA) bindings */ +struct binding { + struct zonelist zonelist; +}; + #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _LINUX_MMZONE_H */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/module.h 999-mjb/include/linux/module.h --- 000-virgin/include/linux/module.h 2003-07-28 15:33:25.000000000 -0700 +++ 999-mjb/include/linux/module.h 2003-10-02 16:43:03.000000000 -0700 @@ -257,6 +257,11 @@ struct module /* The command line arguments (may be mangled). People like keeping pointers to this stuff */ char *args; + +#ifdef CONFIG_GCOV_PROFILE + const char *ctors_start; /* Pointer to start of .ctors-section */ + const char *ctors_end; /* Pointer to end of .ctors-section */ +#endif }; /* FIXME: It'd be nice to isolate modules during init, too, so they diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/page-flags.h 999-mjb/include/linux/page-flags.h --- 000-virgin/include/linux/page-flags.h 2003-10-01 11:47:13.000000000 -0700 +++ 999-mjb/include/linux/page-flags.h 2003-10-02 16:39:41.000000000 -0700 @@ -75,6 +75,7 @@ #define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ #define PG_reclaim 18 /* To be reclaimed asap */ #define PG_compound 19 /* Part of a compound page */ +#define PG_anon 20 /* Anonymous page */ /* @@ -269,6 +270,10 @@ extern void get_full_page_state(struct p #define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) #define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) +#define PageAnon(page) test_bit(PG_anon, &(page)->flags) +#define SetPageAnon(page) set_bit(PG_anon, &(page)->flags) +#define ClearPageAnon(page) clear_bit(PG_anon, &(page)->flags) + /* * The PageSwapCache predicate doesn't use a PG_flag at this time, * but it may again do so one day. 
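The zero_rss()/inc_rss()/dec_rss() helpers in the mm.h hunk above route all RSS accounting through one place so that CONFIG_NUMA kernels can keep a per-node breakdown in mm->pernode_rss[] next to the global mm->rss, while !CONFIG_NUMA builds collapse to the old plain arithmetic. The patch converts existing callers to them (for example zero_rss() in dup_mmap() further down); the fragment below is only an illustrative sketch of the calling convention, not code from the patch:

/*
 * Illustrative only: how a converted page-table update path charges
 * and uncharges resident pages with the helpers above.
 */
static inline void example_account_mapped(struct mm_struct *mm, struct page *page)
{
	inc_rss(mm, page);	/* mm->rss++ and, on NUMA, pernode_rss[page_to_nid(page)]++ */
}

static inline void example_account_unmapped(struct mm_struct *mm, struct page *page)
{
	dec_rss(mm, page);	/* undo both counters */
}

Because the !CONFIG_NUMA variants are simple macros on mm->rss, callers need no #ifdef CONFIG_NUMA of their own.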
diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/pagemap.h 999-mjb/include/linux/pagemap.h --- 000-virgin/include/linux/pagemap.h 2003-10-01 11:41:17.000000000 -0700 +++ 999-mjb/include/linux/pagemap.h 2003-10-02 16:41:14.000000000 -0700 @@ -50,14 +50,37 @@ static inline void mapping_set_gfp_mask( #define page_cache_release(page) put_page(page) void release_pages(struct page **pages, int nr, int cold); +#ifndef CONFIG_NUMA + +static inline struct page *__page_cache_alloc(struct address_space *x, int gfp_mask) +{ + return alloc_pages(gfp_mask, 0); +} + +#else /* CONFIG_NUMA */ + +static inline struct page *__page_cache_alloc(struct address_space *x, int gfp_mask) +{ + struct zonelist *zonelist; + + if (!x->binding) + zonelist = get_zonelist(gfp_mask); + else + zonelist = &x->binding->zonelist; + + return __alloc_pages(gfp_mask, 0, zonelist); +} + +#endif /* !CONFIG_NUMA */ + static inline struct page *page_cache_alloc(struct address_space *x) { - return alloc_pages(mapping_gfp_mask(x), 0); + return __page_cache_alloc(x, mapping_gfp_mask(x)); } static inline struct page *page_cache_alloc_cold(struct address_space *x) { - return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0); + return __page_cache_alloc(x, mapping_gfp_mask(x)|__GFP_COLD); } typedef int filler_t(void *, struct page *); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/pci.h 999-mjb/include/linux/pci.h --- 000-virgin/include/linux/pci.h 2003-10-01 11:41:17.000000000 -0700 +++ 999-mjb/include/linux/pci.h 2003-10-02 16:39:49.000000000 -0700 @@ -461,10 +461,10 @@ struct pci_bus { void *sysdata; /* hook for sys-specific extension */ struct proc_dir_entry *procdir; /* directory entry in /proc/bus/pci */ - unsigned char number; /* bus number */ - unsigned char primary; /* number of primary bridge */ - unsigned char secondary; /* number of secondary bridge */ - unsigned char subordinate; /* max number of subordinate buses */ + unsigned int number; /* bus number */ + unsigned int primary; /* number of primary bridge */ + unsigned int secondary; /* number of secondary bridge */ + unsigned int subordinate; /* max number of subordinate buses */ char name[48]; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/sched.h 999-mjb/include/linux/sched.h --- 000-virgin/include/linux/sched.h 2003-10-01 11:48:26.000000000 -0700 +++ 999-mjb/include/linux/sched.h 2003-10-02 16:42:18.000000000 -0700 @@ -71,7 +71,11 @@ struct exec_domain; * the EXP_n values would be 1981, 2034 and 2043 if still using only * 11 bit fractions. */ -extern unsigned long avenrun[]; /* Load averages */ +extern unsigned long avenrun[]; /* Load averages */ +extern unsigned long tasks_running[3]; /* Real load averages */ +DECLARE_PER_CPU(unsigned long[3],cpu_tasks_running); /* Real load averages per cpu */ + +extern unsigned long tasks_running[]; /* Real load averages */ #define FSHIFT 11 /* nr of bits of precision */ #define FIXED_1 (1<rss */ + spinlock_t page_table_lock; /* Protects task page tables and RSS data */ struct list_head mmlist; /* List of all active mm's. 
These are globally strung * together off init_mm.mmlist, and are protected @@ -202,7 +207,11 @@ struct mm_struct { unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; - unsigned long rss, total_vm, locked_vm; + unsigned long total_vm, locked_vm; + unsigned long rss; +#ifdef CONFIG_NUMA + unsigned long pernode_rss[MAX_NUMNODES]; +#endif unsigned long def_flags; cpumask_t cpu_vm_mask; unsigned long swap_address; @@ -510,7 +519,7 @@ static inline int set_cpus_allowed(task_ extern unsigned long long sched_clock(void); -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED extern void sched_balance_exec(void); extern void node_nr_running_init(void); #else diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/spinlock.h 999-mjb/include/linux/spinlock.h --- 000-virgin/include/linux/spinlock.h 2003-07-02 14:45:00.000000000 -0700 +++ 999-mjb/include/linux/spinlock.h 2003-10-02 16:39:44.000000000 -0700 @@ -184,6 +184,17 @@ typedef struct { #endif /* !SMP */ +#ifdef CONFIG_LOCKMETER +extern void _metered_spin_lock (spinlock_t *lock); +extern void _metered_spin_unlock (spinlock_t *lock); +extern int _metered_spin_trylock(spinlock_t *lock); +extern void _metered_read_lock (rwlock_t *lock); +extern void _metered_read_unlock (rwlock_t *lock); +extern void _metered_write_lock (rwlock_t *lock); +extern void _metered_write_unlock (rwlock_t *lock); +extern int _metered_write_trylock(rwlock_t *lock); +#endif + /* * Define the various spin_lock and rw_lock methods. Note we define these * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various @@ -389,6 +400,141 @@ do { \ _raw_spin_trylock(lock) ? 1 : \ ({preempt_enable(); local_bh_enable(); 0;});}) +#ifdef CONFIG_LOCKMETER +#undef spin_lock +#undef spin_trylock +#undef spin_unlock +#undef spin_lock_irqsave +#undef spin_lock_irq +#undef spin_lock_bh +#undef read_lock +#undef read_unlock +#undef write_lock +#undef write_unlock +#undef write_trylock +#undef spin_unlock_bh +#undef read_lock_irqsave +#undef read_lock_irq +#undef read_lock_bh +#undef read_unlock_bh +#undef write_lock_irqsave +#undef write_lock_irq +#undef write_lock_bh +#undef write_unlock_bh + +#define spin_lock(lock) \ +do { \ + preempt_disable(); \ + _metered_spin_lock(lock); \ +} while(0) + +#define spin_trylock(lock) ({preempt_disable(); _metered_spin_trylock(lock) ? \ + 1 : ({preempt_enable(); 0;});}) +#define spin_unlock(lock) \ +do { \ + _metered_spin_unlock(lock); \ + preempt_enable(); \ +} while (0) + +#define spin_lock_irqsave(lock, flags) \ +do { \ + local_irq_save(flags); \ + preempt_disable(); \ + _metered_spin_lock(lock); \ +} while (0) + +#define spin_lock_irq(lock) \ +do { \ + local_irq_disable(); \ + preempt_disable(); \ + _metered_spin_lock(lock); \ +} while (0) + +#define spin_lock_bh(lock) \ +do { \ + local_bh_disable(); \ + preempt_disable(); \ + _metered_spin_lock(lock); \ +} while (0) + +#define spin_unlock_bh(lock) \ +do { \ + _metered_spin_unlock(lock); \ + preempt_enable(); \ + local_bh_enable(); \ +} while (0) + + +#define read_lock(lock) ({preempt_disable(); _metered_read_lock(lock);}) +#define read_unlock(lock) ({_metered_read_unlock(lock); preempt_enable();}) +#define write_lock(lock) ({preempt_disable(); _metered_write_lock(lock);}) +#define write_unlock(lock) ({_metered_write_unlock(lock); preempt_enable();}) +#define write_trylock(lock) ({preempt_disable();_metered_write_trylock(lock) ? 
\ + 1 : ({preempt_enable(); 0;});}) +#define spin_unlock_no_resched(lock) \ +do { \ + _metered_spin_unlock(lock); \ + preempt_enable_no_resched(); \ +} while (0) + +#define read_lock_irqsave(lock, flags) \ +do { \ + local_irq_save(flags); \ + preempt_disable(); \ + _metered_read_lock(lock); \ +} while (0) + +#define read_lock_irq(lock) \ +do { \ + local_irq_disable(); \ + preempt_disable(); \ + _metered_read_lock(lock); \ +} while (0) + +#define read_lock_bh(lock) \ +do { \ + local_bh_disable(); \ + preempt_disable(); \ + _metered_read_lock(lock); \ +} while (0) + +#define read_unlock_bh(lock) \ +do { \ + _metered_read_unlock(lock); \ + preempt_enable(); \ + local_bh_enable(); \ +} while (0) + +#define write_lock_irqsave(lock, flags) \ +do { \ + local_irq_save(flags); \ + preempt_disable(); \ + _metered_write_lock(lock); \ +} while (0) + +#define write_lock_irq(lock) \ +do { \ + local_irq_disable(); \ + preempt_disable(); \ + _metered_write_lock(lock); \ +} while (0) + +#define write_lock_bh(lock) \ +do { \ + local_bh_disable(); \ + preempt_disable(); \ + _metered_write_lock(lock); \ +} while (0) + +#define write_unlock_bh(lock) \ +do { \ + _metered_write_unlock(lock); \ + preempt_enable(); \ + local_bh_enable(); \ +} while (0) + +#endif /* !CONFIG_LOCKMETER */ + /* "lock on reference count zero" */ #ifndef ATOMIC_DEC_AND_LOCK #include diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/swap.h 999-mjb/include/linux/swap.h --- 000-virgin/include/linux/swap.h 2003-10-01 11:48:26.000000000 -0700 +++ 999-mjb/include/linux/swap.h 2003-10-02 16:39:41.000000000 -0700 @@ -185,6 +185,8 @@ struct pte_chain *FASTCALL(page_add_rmap void FASTCALL(page_remove_rmap(struct page *, pte_t *)); int FASTCALL(try_to_unmap(struct page *)); +int page_convert_anon(struct page *); + /* linux/mm/shmem.c */ extern int shmem_unuse(swp_entry_t entry, struct page *page); #else diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/sysctl.h 999-mjb/include/linux/sysctl.h --- 000-virgin/include/linux/sysctl.h 2003-10-01 11:47:14.000000000 -0700 +++ 999-mjb/include/linux/sysctl.h 2003-10-02 16:39:40.000000000 -0700 @@ -61,7 +61,8 @@ enum CTL_DEV=7, /* Devices */ CTL_BUS=8, /* Busses */ CTL_ABI=9, /* Binary emulation */ - CTL_CPU=10 /* CPU stuff (speed scaling, etc) */ + CTL_CPU=10, /* CPU stuff (speed scaling, etc) */ + CTL_SCHED=11, /* scheduler tunables */ }; /* CTL_BUS names: */ @@ -156,6 +157,21 @@ enum VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */ }; +/* Tunable scheduler parameters in /proc/sys/sched/ */ +enum { + SCHED_MIN_TIMESLICE=1, /* minimum process timeslice */ + SCHED_MAX_TIMESLICE=2, /* maximum process timeslice */ + SCHED_CHILD_PENALTY=3, /* penalty on fork to child */ + SCHED_PARENT_PENALTY=4, /* penalty on fork to parent */ + SCHED_EXIT_WEIGHT=5, /* penalty to parent of CPU hog child */ + SCHED_PRIO_BONUS_RATIO=6, /* percent of max prio given as bonus */ + SCHED_INTERACTIVE_DELTA=7, /* delta used to scale interactivity */ + SCHED_MAX_SLEEP_AVG=8, /* maximum sleep avg attainable */ + SCHED_STARVATION_LIMIT=9, /* no re-active if expired is starved */ + SCHED_NODE_THRESHOLD=10, /* NUMA node rebalance threshold */ + SCHED_IDLE_NODE_REBALANCE_RATIO=11, /* how often to global balance */ + SCHED_BUSY_NODE_REBALANCE_RATIO=12, /* how often to global balance */ +}; /* CTL_NET names: */ enum diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/timex.h 999-mjb/include/linux/timex.h --- 000-virgin/include/linux/timex.h 2003-06-24 16:43:14.000000000 -0700 +++ 
999-mjb/include/linux/timex.h 2003-10-02 16:39:36.000000000 -0700 @@ -78,7 +78,7 @@ #elif HZ >= 768 && HZ < 1536 # define SHIFT_HZ 10 #else -# error You lose. +# error Please use a HZ value which is between 12 and 1536 #endif /* diff -purN -X /home/mbligh/.diff.exclude 000-virgin/init/main.c 999-mjb/init/main.c --- 000-virgin/init/main.c 2003-10-01 11:48:27.000000000 -0700 +++ 999-mjb/init/main.c 2003-10-02 16:43:03.000000000 -0700 @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -113,6 +114,10 @@ char *execute_command; /* Setup configured maximum number of CPUs to activate */ static unsigned int max_cpus = NR_CPUS; +#if defined(CONFIG_GCOV_PROFILE) && (defined(CONFIG_PPC32) || defined(CONFIG_PPC64)) +void __bb_fork_func (void) { } +#endif + /* * Setup routine for controlling SMP activation * @@ -387,6 +392,8 @@ asmlinkage void __init start_kernel(void */ lock_kernel(); printk(linux_banner); + setup_early_printk(); + setup_arch(&command_line); setup_per_zone_pages_min(); setup_per_cpu_areas(); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/ipc/shm.c 999-mjb/ipc/shm.c --- 000-virgin/ipc/shm.c 2003-10-01 11:47:15.000000000 -0700 +++ 999-mjb/ipc/shm.c 2003-10-02 16:53:55.000000000 -0700 @@ -380,9 +380,9 @@ static void shm_get_stat(unsigned long * if (is_file_hugepages(shp->shm_file)) { struct address_space *mapping = inode->i_mapping; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); *rss += (HPAGE_SIZE/PAGE_SIZE)*mapping->nrpages; - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); } else { struct shmem_inode_info *info = SHMEM_I(inode); spin_lock(&info->lock); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/Makefile 999-mjb/kernel/Makefile --- 000-virgin/kernel/Makefile 2003-10-01 11:48:27.000000000 -0700 +++ 999-mjb/kernel/Makefile 2003-10-02 16:43:03.000000000 -0700 @@ -8,9 +8,16 @@ obj-y = sched.o fork.o exec_domain.o signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o intermodule.o extable.o params.o posix-timers.o +ifdef CONFIG_GCOV_PROFILE +obj-y += gcov.o +export-objs += gcov.o +CFLAGS_gcov.o := -DGCOV_PATH='"$(TOPDIR)"' +endif + obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o +obj-$(CONFIG_LOCKMETER) += lockmeter.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += ksyms.o module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o @@ -19,6 +26,7 @@ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_IKCONFIG_PROC) += configs.o +obj-$(CONFIG_X86_EARLY_PRINTK) += early_printk.o ifneq ($(CONFIG_IA64),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/early_printk.c 999-mjb/kernel/early_printk.c --- 000-virgin/kernel/early_printk.c 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/kernel/early_printk.c 2003-10-02 16:39:35.000000000 -0700 @@ -0,0 +1,218 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +/* Simple VGA output */ + +#define MAX_YPOS 25 +#define MAX_XPOS 80 + +static int current_ypos = 1, current_xpos = 0; + +static void early_vga_write(struct console *con, const char *str, unsigned n) +{ + char c; + int i, k, j; + + while ((c = *str++) != '\0' && n-- > 0) { + if (current_ypos >= MAX_YPOS) { + /* scroll 1 line up */ + for(k = 1, j = 0; k < MAX_YPOS; k++, j++) { + for(i = 0; i < MAX_XPOS; i++) { + writew(readw(VGABASE + 2*(MAX_XPOS*k + i)), + VGABASE + 2*(MAX_XPOS*j + i)); + 
} + } + for(i = 0; i < MAX_XPOS; i++) { + writew(0x720, VGABASE + 2*(MAX_XPOS*j + i)); + } + current_ypos = MAX_YPOS-1; + } + if (c == '\n') { + current_xpos = 0; + current_ypos++; + } else if (c != '\r') { + writew(((0x7 << 8) | (unsigned short) c), + VGABASE + 2*(MAX_XPOS*current_ypos + current_xpos++)); + if (current_xpos >= MAX_XPOS) { + current_xpos = 0; + current_ypos++; + } + } + } +} + +static struct console early_vga_console = { + .name = "earlyvga", + .write = early_vga_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Serial functions losely based on a similar package from Klaus P. Gerlicher */ + +int early_serial_base; /* ttyS0 */ + +static int early_serial_putc(unsigned char ch) +{ + unsigned timeout = 0xffff; + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + rep_nop(); + outb(ch, early_serial_base + TXR); + return timeout ? 0 : -1; +} + +static void early_serial_write(struct console *con, const char *s, unsigned n) +{ + while (*s && n-- > 0) { + early_serial_putc(*s); + if (*s == '\n') + early_serial_putc('\r'); + s++; + } +} + +static __init void early_serial_init(char *opt) +{ + unsigned char c; + unsigned divisor, baud = DEFAULT_BAUD; + static int bases[] = SERIAL_BASES; + char *s, *e; + + early_serial_base = bases[0]; + + if (*opt == ',') + ++opt; + + s = strsep(&opt, ","); + if (s != NULL) { + unsigned port; + if (!strncmp(s,"0x",2)) + early_serial_base = simple_strtoul(s, &e, 16); + else { + if (!strncmp(s,"ttyS",4)) + s+=4; + port = simple_strtoul(s, &e, 10); + if (port > (SERIAL_BASES_LEN-1) || s == e) + port = 0; + early_serial_base = bases[port]; + } + } + + outb(0x3, early_serial_base + LCR); /* 8n1 */ + outb(0, early_serial_base + IER); /* no interrupt */ + outb(0, early_serial_base + FCR); /* no fifo */ + outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + + s = strsep(&opt, ","); + if (s != NULL) { + baud = simple_strtoul(s, &e, 0); + if (baud == 0 || s == e) + baud = DEFAULT_BAUD; + } + + divisor = 115200 / baud; + c = inb(early_serial_base + LCR); + outb(c | DLAB, early_serial_base + LCR); + outb(divisor & 0xff, early_serial_base + DLL); + outb((divisor >> 8) & 0xff, early_serial_base + DLH); + outb(c & ~DLAB, early_serial_base + LCR); +} + +static struct console early_serial_console = { + .name = "earlyser", + .write = early_serial_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Direct interface for emergencies */ +struct console *early_console = &early_vga_console; +static int early_console_initialized = 0; + +void early_printk(const char *fmt, ...) 
+{ + char buf[512]; + int n; + va_list ap; + va_start(ap,fmt); + n = vsnprintf(buf,512,fmt,ap); + early_console->write(early_console,buf,n); + va_end(ap); +} + +static int keep_early; + +int __init setup_early_printk(void) +{ + char *space, *s; + char buf[256]; + char cmd[COMMAND_LINE_SIZE]; + char *opt; + + /* Get our own copy of the cmd line */ + memcpy(cmd, COMMAND_LINE, COMMAND_LINE_SIZE); + cmd[COMMAND_LINE_SIZE-1] = '\0'; + opt = cmd; + + s = strstr(opt, "earlyprintk="); + if (s == NULL) + return -1; + opt = s+12; + + if (early_console_initialized) + return -1; + + strncpy(buf,opt,256); + buf[255] = 0; + space = strchr(buf, ' '); + if (space) + *space = 0; + + if (strstr(buf,"keep")) + keep_early = 1; + + if (!strncmp(buf, "serial", 6)) { + early_serial_init(buf + 6); + early_console = &early_serial_console; + } else if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf); + early_console = &early_serial_console; + } else if (!strncmp(buf, "vga", 3)) { + early_console = &early_vga_console; + } else { + early_console = NULL; + return -1; + } + early_console_initialized = 1; + register_console(early_console); + printk("early printk console registered\n"); + return 0; +} + +void __init disable_early_printk(void) +{ + if (!early_console_initialized || !early_console) + return; + if (!keep_early) { + printk("disabling early console...\n"); + unregister_console(early_console); + early_console_initialized = 0; + } else { + printk("keeping early console.\n"); + } +} + +/* syntax: earlyprintk=vga + earlyprintk=serial[,ttySn[,baudrate]] + Append ,keep to not disable it when the real console takes over. + Only vga or serial at a time, not both. + Currently only ttyS0 and ttyS1 are supported. + Interaction with the standard serial driver is not very good. + The VGA output is eventually overwritten by the real console. */ +__setup("earlyprintk=", setup_early_printk); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/fork.c 999-mjb/kernel/fork.c --- 000-virgin/kernel/fork.c 2003-10-01 11:48:27.000000000 -0700 +++ 999-mjb/kernel/fork.c 2003-10-02 16:42:18.000000000 -0700 @@ -232,7 +232,7 @@ static inline int dup_mmap(struct mm_str mm->mmap_cache = NULL; mm->free_area_cache = TASK_UNMAPPED_BASE; mm->map_count = 0; - mm->rss = 0; + zero_rss(mm); cpus_clear(mm->cpu_vm_mask); pprev = &mm->mmap; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/gcov.c 999-mjb/kernel/gcov.c --- 000-virgin/kernel/gcov.c 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/kernel/gcov.c 2003-10-02 16:43:03.000000000 -0700 @@ -0,0 +1,158 @@ +/* + * Coverage support under Linux + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (c) International Business Machines Corp., 2002 + * + * Author: Hubertus Franke + * Rajan Ravindran + * + * Modified by + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct bb +{ + long zero_word; + const char *filename; + long *counts; + long ncounts; + struct bb *next; + const unsigned long *addresses; + + /* Older GCC's did not emit these fields. */ + long nwords; + const char **functions; + const long *line_nums; + const char **filenames; + char *flags; +}; + +struct bb *bb_head; +struct module *bb_context_address; +void (*gcov_callback)(int cmd, struct bb *bbptr) = NULL; + +#ifdef GCOV_PATH +char *gcov_kernelpath = GCOV_PATH; +#else +char *gcov_kernelpath = __FILE__; +#endif + + +void +__bb_init_func (struct bb *blocks) +{ + if (blocks->zero_word) + return; + + /* Set up linked list. */ + blocks->zero_word = 1; + + /* Store the address of the module of which this object-file is a part + of (set in do_global_ctors). */ + blocks->addresses = (unsigned long *) bb_context_address; + + blocks->next = bb_head; + bb_head = blocks; + + if (gcov_callback && bb_context_address) + (*gcov_callback)(1,blocks); +} + +/* Call constructors for all kernel objects and dynamic modules. This function + * is called both during module initialization and when the gcov kernel + * module is insmod'ed. The list of constructors is compiled into the + * kernel at &__CTOR_LIST__ to &__DTOR_LIST__ (labels are defined in + * head.S). In the case of a dynamic module the list is located at + * ctors_start to ctors_end. + * + * The constructors in turn call __bb_init_func, reporting the respective + * struct bb for each object file. + */ + +void +do_global_ctors (char *ctors_start, char *ctors_end, struct module *addr, int mod_flag) +{ + extern char __CTOR_LIST__; + extern char __DTOR_LIST__; + typedef void (*func_ptr)(void) ; + func_ptr *constructor_ptr=NULL; + + if (!mod_flag) { + /* Set start and end ptr from global kernel constructor list. */ + ctors_start = &__CTOR_LIST__; + ctors_end = &__DTOR_LIST__; + bb_context_address = NULL; + } else { + /* Set context to current module address. */ + bb_context_address = addr; + } + + if (!ctors_start) + return; + + /* Call all constructor functions until either the end of the + list is reached or until a NULL is encountered. */ + for (constructor_ptr = (func_ptr *) ctors_start; + (constructor_ptr != (func_ptr *) ctors_end) && + (*constructor_ptr != NULL); + constructor_ptr++) { + (*constructor_ptr) (); + } +} + + +/* When a module is unloaded, this function is called to remove + * the respective bb entries from our list. context specifies + * the address of the module that is unloaded. 
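+ * Matching is done on bb->addresses, which __bb_init_func() stamped
+ * with the owning module's address (bb_context_address, set up by
+ * do_global_ctors) when the module's constructors ran; one pass over
+ * the singly linked bb_head list therefore unlinks every object file
+ * the module registered, notifying gcov_callback for each entry.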
*/ + +void +remove_bb_link (struct module *context) +{ + struct bb *bbptr; + struct bb *prev = NULL; + + /* search for all the module's bbptrs */ + for (bbptr = bb_head; bbptr ; bbptr = bbptr->next) { + if (bbptr->addresses == (unsigned long *) context) { + if (gcov_callback) + (*gcov_callback)(0,bbptr); + if (prev == NULL) + bb_head = bbptr->next; + else + prev->next = bbptr->next; + } + else + prev = bbptr; + } +} + +EXPORT_SYMBOL(bb_head); +EXPORT_SYMBOL(__bb_init_func); +EXPORT_SYMBOL(do_global_ctors); +EXPORT_SYMBOL(gcov_kernelpath); +EXPORT_SYMBOL(gcov_callback); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/ksyms.c 999-mjb/kernel/ksyms.c --- 000-virgin/kernel/ksyms.c 2003-10-01 11:48:27.000000000 -0700 +++ 999-mjb/kernel/ksyms.c 2003-10-02 16:39:44.000000000 -0700 @@ -607,6 +607,16 @@ EXPORT_SYMBOL(__per_cpu_offset); EXPORT_SYMBOL(set_fs_pwd); EXPORT_SYMBOL(set_fs_root); +#if defined(CONFIG_LOCKMETER) +EXPORT_SYMBOL(_metered_spin_lock); +EXPORT_SYMBOL(_metered_spin_unlock); +EXPORT_SYMBOL(_metered_spin_trylock); +EXPORT_SYMBOL(_metered_read_lock); +EXPORT_SYMBOL(_metered_read_unlock); +EXPORT_SYMBOL(_metered_write_lock); +EXPORT_SYMBOL(_metered_write_unlock); +#endif + /* debug */ EXPORT_SYMBOL(dump_stack); EXPORT_SYMBOL(ptrace_notify); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/lockmeter.c 999-mjb/kernel/lockmeter.c --- 000-virgin/kernel/lockmeter.c 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/kernel/lockmeter.c 2003-10-02 16:39:44.000000000 -0700 @@ -0,0 +1,1088 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.c by Jack Steiner (steiner@sgi.com) + * + * Modified by Ray Bryant (raybry@us.ibm.com) + * Changes Copyright (C) 2000 IBM, Inc. + * Added save of index in spinlock_t to improve efficiency + * of "hold" time reporting for spinlocks + * Added support for hold time statistics for read and write + * locks. + */ + +#ifdef __KERNEL__ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#else +#define __SMP__ +#include +#include +#include +#include "bitops.h" +#include "user_scaffold.h" +#include +#include +#include "newlockmeter.h" +#endif + +#ifdef __KERNEL__ +#define ASSERT(cond) +#define bzero(loc,size) memset(loc,0,size) +#endif + +/*<---------------------------------------------------*/ +/* lockmeter.c */ +/*>---------------------------------------------------*/ + +#ifdef __KERNEL__ +static lstat_control_t lstat_control __cacheline_aligned = {LSTAT_OFF, SPIN_LOCK_UNLOCKED, SPIN_LOCK_UNLOCKED, 19*0, NR_CPUS*0, 0, NR_CPUS*0}; +#else +lstat_control_t lstat_control = {LSTAT_OFF, SPIN_LOCK_UNLOCKED, SPIN_LOCK_UNLOCKED, 19*0, NR_CPUS*0, 0, NR_CPUS*0}; +#endif + +int smp_num_cpus=NR_CPUS; + +#undef BUG +#define BUG() + +static ushort lstat_make_dir_entry(void *, void *); + +/* + * lstat_lookup + * + * Given a RA, locate the directory entry for the lock. 
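+ *
+ * The caller's return address is hashed with DIRHASH() and the
+ * next_stat_index chain is walked until caller_ra matches; index 0
+ * terminates the chain, in which case lstat_make_dir_entry() allocates
+ * a new slot (or bumps dir_overflow when the directory is full).  If
+ * the same return address is later seen with a different lock address,
+ * lock_ptr is reset to LSTAT_MULTI_LOCK_ADDRESS (NULL), marking the
+ * entry as covering more than one lock.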
+ */ +static ushort +lstat_lookup( + void *lock_ptr, + void *caller_ra) +{ + ushort index; + lstat_directory_entry_t *dirp; + + dirp = lstat_control.dir; + + index = lstat_control.hashtab[DIRHASH(caller_ra)]; + while (dirp[index].caller_ra != caller_ra) { + if (index == 0) { + return(lstat_make_dir_entry(lock_ptr, caller_ra)); + } + index = dirp[index].next_stat_index; + } + + if (dirp[index].lock_ptr != NULL && + dirp[index].lock_ptr != lock_ptr) { + dirp[index].lock_ptr = NULL; + } + + return(index); +} + + +/* + * lstat_make_dir_entry + * Called to add a new lock to the lock directory. + */ +static ushort +lstat_make_dir_entry( + void *lock_ptr, + void *caller_ra) +{ + lstat_directory_entry_t *dirp; + ushort index, hindex; + unsigned long flags; + + /* lock the table without recursively reentering this metering code */ + do { local_irq_save(flags); + _raw_spin_lock(&lstat_control.directory_lock); } while(0); + + hindex = DIRHASH(caller_ra); + index = lstat_control.hashtab[hindex]; + dirp = lstat_control.dir; + while (index && dirp[index].caller_ra != caller_ra) + index = dirp[index].next_stat_index; + + if (index == 0) { + if(lstat_control.next_free_dir_index < LSTAT_MAX_STAT_INDEX) { + index = lstat_control.next_free_dir_index++; + lstat_control.dir[index].caller_ra = caller_ra; + lstat_control.dir[index].lock_ptr = lock_ptr; + lstat_control.dir[index].next_stat_index = lstat_control.hashtab[hindex]; + lstat_control.hashtab[hindex] = index; + } else { + lstat_control.dir_overflow++; + } + } + + do { _raw_spin_unlock(&lstat_control.directory_lock); + local_irq_restore(flags);} while(0); + return(index); +} + +int +lstat_update ( + void *lock_ptr, + void *caller_ra, + int action) +{ + int index; + int cpu; + + ASSERT(action < LSTAT_ACT_MAX_VALUES); + + if (lstat_control.state == LSTAT_OFF) { + return(0); + } + + index = lstat_lookup(lock_ptr, caller_ra); + cpu = THIS_CPU_NUMBER; + (*lstat_control.counts[cpu])[index].count[action]++; + (*lstat_control.counts[cpu])[index].acquire_time = get_cycles(); + + return(index); +} + +int +lstat_update_time ( + void *lock_ptr, + void *caller_ra, + int action, + uint32_t ticks) +{ + ushort index; + int cpu; + + ASSERT(action < LSTAT_ACT_MAX_VALUES); + + if (lstat_control.state == LSTAT_OFF) { + return(0); + } + + index = lstat_lookup(lock_ptr, caller_ra); + cpu = THIS_CPU_NUMBER; + (*lstat_control.counts[cpu])[index].count[action]++; + (*lstat_control.counts[cpu])[index].cum_wait_ticks += (uint64_t)ticks; + if ((*lstat_control.counts[cpu])[index].max_wait_ticks < ticks) + (*lstat_control.counts[cpu])[index].max_wait_ticks = ticks; + + (*lstat_control.counts[cpu])[index].acquire_time = get_cycles(); + + return(index); +} + +void _metered_spin_lock(spinlock_t *lock_ptr) +{ + if (lstat_control.state == LSTAT_OFF) { + _raw_spin_lock(lock_ptr); /* do the real lock */ + PUT_INDEX(lock_ptr,0); /* clean index in case lockmetering */ + /* gets turned on before unlock */ + } else { + void *this_pc = LSTAT_RA(LSTAT_RA_SPIN); + int index; + + if (_raw_spin_trylock(lock_ptr)) { + index = lstat_update(lock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + } else { + uint32_t start_cycles = get_cycles(); + _raw_spin_lock(lock_ptr); /* do the real lock */ + index = lstat_update_time(lock_ptr, this_pc, LSTAT_ACT_SPIN, + get_cycles() - start_cycles); + } + /* save the index in the lock itself for use in spin unlock */ + PUT_INDEX(lock_ptr,index); + } +} + +int _metered_spin_trylock(spinlock_t *lock_ptr) +{ + if (lstat_control.state == LSTAT_OFF) { + return 
_raw_spin_trylock(lock_ptr); + } else { + int retval; + void *this_pc = LSTAT_RA(LSTAT_RA_SPIN); + + if ((retval = _raw_spin_trylock(lock_ptr))) { + int index = lstat_update(lock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + /* save the index in the lock itself for use in spin unlock */ + PUT_INDEX(lock_ptr,index); + } else { + lstat_update(lock_ptr, this_pc, LSTAT_ACT_REJECT); + } + + return retval; + } +} + +void _metered_spin_unlock(spinlock_t *lock_ptr) +{ + int index=-1; + + if (lstat_control.state != LSTAT_OFF) { + index = GET_INDEX(lock_ptr); + /* + * If statistics were turned off when we set the lock, + * then the index can be zero. If that is the case, + * then collect no stats on this call. + */ + if (index > 0) { + uint32_t hold_time; + int cpu = THIS_CPU_NUMBER; + hold_time = get_cycles() - (*lstat_control.counts[cpu])[index].acquire_time; + (*lstat_control.counts[cpu])[index].cum_hold_ticks += (uint64_t)hold_time; + if ((*lstat_control.counts[cpu])[index].max_hold_ticks < hold_time) + (*lstat_control.counts[cpu])[index].max_hold_ticks = hold_time; + } + } + + /* make sure we don't have a stale index value saved */ + PUT_INDEX(lock_ptr,0); + _raw_spin_unlock(lock_ptr); /* do the real unlock */ +} + +/* + * allocate the next global read lock structure and store its index + * in the rwlock at "lock_ptr". + */ +uint32_t alloc_rwlock_struct(rwlock_t *rwlock_ptr) +{ + int index; + int flags; + int cpu=THIS_CPU_NUMBER; + + /* If we've already overflowed, then do a quick exit */ + if (lstat_control.next_free_read_lock_index > LSTAT_MAX_READ_LOCK_INDEX) { + lstat_control.rwlock_overflow++; + return(0); + } + + do { local_irq_save(flags); + _raw_spin_lock(&lstat_control.directory_lock); } while(0); + + /* It is possible this changed while we were waiting for the directory_lock */ + if (lstat_control.state == LSTAT_OFF) { + index=0; + goto unlock; + } + + /* It is possible someone else got here first and set the index */ + if ((index=GET_RWINDEX(rwlock_ptr)) == 0) { + + /* we can't turn on read stats for this lock while there are readers */ + /* (this would mess up the running hold time sum at unlock time) */ + if (RWLOCK_READERS(rwlock_ptr) != 0) { + index=0; + goto unlock; + } + + /* if stats are turned on after being off, we may need to return an old */ + /* index from when the statistics were on last time. ................... 
*/ + for(index=1;index= LSTAT_MAX_READ_LOCK_INDEX) { + lstat_control.rwlock_overflow++; + index = 0; + goto unlock; + } + index = lstat_control.next_free_read_lock_index++; + + /* initialize the global read stats data structure for each cpu */ + for(cpu=0; cpu < smp_num_cpus; cpu++) { + (*lstat_control.read_lock_counts[cpu])[index].lock_ptr = rwlock_ptr; + } +put_index_and_unlock: + /* store the index for the read lock structure into the lock */ + PUT_RWINDEX(rwlock_ptr,index); + } + +unlock: + do { _raw_spin_unlock(&lstat_control.directory_lock); + local_irq_restore(flags);} while(0); + + return(index); +} + +void +_metered_read_lock(rwlock_t *rwlock_ptr) +{ + void *this_pc; + uint32_t start_cycles; + int index; + int cpu; + int flags; + int readers_before, readers_after; + uint64_t cycles64; + + if (lstat_control.state == LSTAT_OFF) { + _raw_read_lock(rwlock_ptr); + /* clean index in case lockmetering turns on before an unlock */ + PUT_RWINDEX(rwlock_ptr, 0); + return; + } + + this_pc = LSTAT_RA(LSTAT_RA_READ); + cpu = THIS_CPU_NUMBER; + index = GET_RWINDEX(rwlock_ptr); + + /* allocate the global stats entry for this lock, if needed */ + if (index==0) { + index = alloc_rwlock_struct(rwlock_ptr); + } + + readers_before = RWLOCK_READERS(rwlock_ptr); + if (_raw_read_trylock(rwlock_ptr)) { + /* + * We have decremented the lock to count a new reader, + * and have confirmed that no writer has it locked. + */ + /* update statistics if enabled */ + if (index>0) { +#ifndef __KERNEL__ + _raw_spin_lock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do { local_irq_save(flags); } while(0); +#endif + lstat_update((void *)rwlock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + /* preserve value of TSC so cum_hold_ticks and start_busy use same value */ + cycles64 = get_cycles64(); + (*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks -= cycles64; + + /* record time and cpu of start of busy period */ + /* this is not perfect (some race conditions are possible) */ + if (readers_before==0) { + (*lstat_control.read_lock_counts[cpu])[index].start_busy = cycles64; + PUT_RW_CPU(rwlock_ptr, cpu); + } + readers_after=RWLOCK_READERS(rwlock_ptr); + if (readers_after > (*lstat_control.read_lock_counts[cpu])[index].max_readers) + (*lstat_control.read_lock_counts[cpu])[index].max_readers = readers_after; +#ifndef __KERNEL__ + _raw_spin_unlock((spinlock_t*)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do {local_irq_restore(flags);} while(0); +#endif + } + + return; + } + /* If we get here, then we could not quickly grab the read lock */ + + start_cycles = get_cycles(); /* start counting the wait time */ + + /* Now spin until read_lock is successful */ + _raw_read_lock(rwlock_ptr); + + lstat_update_time((void *)rwlock_ptr, this_pc, LSTAT_ACT_SPIN, + get_cycles() - start_cycles); + + /* update statistics if they are enabled for this lock */ + if (index>0) { +#ifndef __KERNEL__ + _raw_spin_lock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do { local_irq_save(flags); } while(0); +#endif + cycles64 = get_cycles64(); + (*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks -= cycles64; + + /* this is not perfect (some race conditions are possible) */ + if (readers_before==0) { + (*lstat_control.read_lock_counts[cpu])[index].start_busy = cycles64; + PUT_RW_CPU(rwlock_ptr, cpu); + } + readers_after=RWLOCK_READERS(rwlock_ptr); + if (readers_after > (*lstat_control.read_lock_counts[cpu])[index].max_readers) + 
(*lstat_control.read_lock_counts[cpu])[index].max_readers = readers_after; + +#ifndef __KERNEL__ + _raw_spin_unlock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do {local_irq_restore(flags);} while(0); +#endif + } +} + +void _metered_read_unlock(rwlock_t *rwlock_ptr) +{ + int index; + int cpu; + int flags; + uint64_t busy_length; + uint64_t cycles64; + + if (lstat_control.state == LSTAT_OFF) { + _raw_read_unlock(rwlock_ptr); + return; + } + + index = GET_RWINDEX(rwlock_ptr); + cpu = THIS_CPU_NUMBER; + + if (index>0) { +#ifndef __KERNEL__ + _raw_spin_lock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + /* updates below are non-atomic */ + do { local_irq_save(flags); } while(0); +#endif + /* preserve value of TSC so cum_hold_ticks and busy_ticks are consistent.. */ + cycles64 = get_cycles64(); + (*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks += cycles64; + (*lstat_control.read_lock_counts[cpu])[index].read_lock_count++; + + /* once again, this is not perfect (some race conditions are possible) */ + if (RWLOCK_READERS(rwlock_ptr) == 1) { + int cpu1 = GET_RW_CPU(rwlock_ptr); + uint64_t last_start_busy = (*lstat_control.read_lock_counts[cpu1])[index].start_busy; + (*lstat_control.read_lock_counts[cpu])[index].busy_periods++; + if (cycles64 > last_start_busy) { + busy_length = cycles64 - last_start_busy; + (*lstat_control.read_lock_counts[cpu])[index].busy_ticks += busy_length; + if (busy_length > (*lstat_control.read_lock_counts[cpu])[index].max_busy) + (*lstat_control.read_lock_counts[cpu])[index].max_busy = busy_length; + } + } +#ifndef __KERNEL__ + _raw_spin_unlock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do {local_irq_restore(flags);} while(0); +#endif + } + + /* unlock the lock */ + _raw_read_unlock(rwlock_ptr); +} + +void _metered_write_lock(rwlock_t *rwlock_ptr) +{ + uint32_t start_cycles; + void *this_pc; + uint32_t spin_ticks = 0; /* in anticipation of a potential wait */ + int index; + int write_index = 0; + int cpu; + enum {writer_writer_conflict, writer_reader_conflict} why_wait = writer_writer_conflict; + + if (lstat_control.state == LSTAT_OFF) { + _raw_write_lock(rwlock_ptr); + /* clean index in case lockmetering turns on before an unlock */ + PUT_RWINDEX(rwlock_ptr, 0); + return; + } + + this_pc = LSTAT_RA(LSTAT_RA_WRITE); + cpu = THIS_CPU_NUMBER; + index = GET_RWINDEX(rwlock_ptr); + + /* allocate the global stats entry for this lock, if needed */ + if (index == 0) { + index = alloc_rwlock_struct(rwlock_ptr); + } + + if (_raw_write_trylock(rwlock_ptr)) { + /* We acquired the lock on the first try */ + write_index = lstat_update((void *)rwlock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + /* save the write_index for use in unlock if stats enabled */ + if (index > 0) + (*lstat_control.read_lock_counts[cpu])[index].write_index = write_index; + return; + } + + /* If we get here, then we could not quickly grab the write lock */ + start_cycles = get_cycles(); /* start counting the wait time */ + + why_wait = RWLOCK_READERS(rwlock_ptr) ? 
writer_reader_conflict : writer_writer_conflict; + + /* Now set the lock and wait for conflicts to disappear */ + _raw_write_lock(rwlock_ptr); + + spin_ticks = get_cycles() - start_cycles; + + /* update stats -- if enabled */ + if (index > 0) + if (spin_ticks) { + if (why_wait == writer_reader_conflict) { + /* waited due to a reader holding the lock */ + write_index = lstat_update_time((void *)rwlock_ptr, this_pc, + LSTAT_ACT_SPIN, spin_ticks); + } else { + /* waited due to another writer holding the lock */ + write_index = lstat_update_time((void *)rwlock_ptr, this_pc, + LSTAT_ACT_WW_SPIN, spin_ticks); + (*lstat_control.counts[cpu])[write_index].cum_wait_ww_ticks += spin_ticks; + if (spin_ticks > + (*lstat_control.counts[cpu])[write_index].max_wait_ww_ticks) { + (*lstat_control.counts[cpu])[write_index].max_wait_ww_ticks = spin_ticks; + } + } + + /* save the directory index for use on write_unlock */ + (*lstat_control.read_lock_counts[cpu])[index].write_index = write_index; + } + +} + +void +_metered_write_unlock(rwlock_t *rwlock_ptr) +{ + int index; + int cpu; + int write_index; + uint32_t hold_time; + + if (lstat_control.state == LSTAT_OFF) { + _raw_write_unlock(rwlock_ptr); + return; + } + + cpu = THIS_CPU_NUMBER; + index = GET_RWINDEX(rwlock_ptr); + + /* update statistics if stats enabled for this lock */ + if (index>0) { + write_index = (*lstat_control.read_lock_counts[cpu])[index].write_index; + + hold_time = get_cycles() - (*lstat_control.counts[cpu])[write_index].acquire_time; + (*lstat_control.counts[cpu])[write_index].cum_hold_ticks += (uint64_t)hold_time; + if ((*lstat_control.counts[cpu])[write_index].max_hold_ticks < hold_time) + (*lstat_control.counts[cpu])[write_index].max_hold_ticks = hold_time; + } + _raw_write_unlock(rwlock_ptr); +} + +int _metered_write_trylock(rwlock_t *rwlock_ptr) +{ + int retval; + void *this_pc = LSTAT_RA(LSTAT_RA_WRITE); + + if ((retval = _raw_write_trylock(rwlock_ptr))) { + lstat_update(rwlock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + } else { + lstat_update(rwlock_ptr, this_pc, LSTAT_ACT_REJECT); + } + + return retval; +} + +#ifdef __KERNEL__ +static void +init_control_space(void) +{ + /* Set all control space pointers to null and indices to "empty" */ + int cpu; + + /* + * Access CPU_CYCLE_FREQUENCY at the outset, which in some + * architectures may trigger a runtime calculation that uses a + * spinlock. Let's do this before lockmetering is turned on. + */ + if (CPU_CYCLE_FREQUENCY == 0) + BUG(); + + lstat_control.hashtab = NULL; + lstat_control.dir = NULL; + for (cpu=0; cpu max_len) + return actual_ret_bcount; + + copy_to_user(buffer, (void *)&req, next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + return actual_ret_bcount; + } else { + /* measurement is off but valid data present */ + /* fetch time info from lstat_control */ + req.ending_time = lstat_control.ending_time; + req.ending_cycles64 = lstat_control.ending_cycles64; + req.enabled_cycles64 = lstat_control.enabled_cycles64; + } + } else { + /* this must be a read while data active--use current time, etc */ + do_gettimeofday(&tv); + req.ending_time = tv.tv_sec; + req.ending_cycles64 = get_cycles64(); + req.enabled_cycles64 = req.ending_cycles64-req.started_cycles64 + + lstat_control.enabled_cycles64; + } + + next_ret_bcount = sizeof(lstat_user_request_t); + if ((actual_ret_bcount + next_ret_bcount) > max_len) + return actual_ret_bcount; + + copy_to_user(buffer, (void *)&req, next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + + if (!lstat_control.counts[0]) /* not initialized? 
*/ + return actual_ret_bcount; + + next_ret_bcount = sizeof(lstat_cpu_counts_t); + for (cpu = 0; cpu < smp_num_cpus; cpu++) { + if ((actual_ret_bcount + next_ret_bcount) > max_len) + return actual_ret_bcount; /* leave early */ + copy_to_user(buffer + actual_ret_bcount, lstat_control.counts[cpu], + next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + } + + next_ret_bcount = LSTAT_MAX_STAT_INDEX * sizeof(lstat_directory_entry_t); + if ( ((actual_ret_bcount + next_ret_bcount) > max_len) + || !lstat_control.dir ) + return actual_ret_bcount; /* leave early */ + + copy_to_user(buffer + actual_ret_bcount, lstat_control.dir, + next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + + next_ret_bcount = sizeof(lstat_read_lock_cpu_counts_t); + for (cpu = 0; cpu < smp_num_cpus; cpu++) { + if (actual_ret_bcount + next_ret_bcount > max_len) + return actual_ret_bcount; + copy_to_user(buffer + actual_ret_bcount, lstat_control.read_lock_counts[cpu], + next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + } + + return actual_ret_bcount; +} + +/* + * Writing to the /proc lockmeter node enables or disables metering. + * based upon the first byte of the "written" data. + * The following values are defined: + * LSTAT_ON: 1st call: allocates storage, intializes and turns on measurement + * subsequent calls just turn on measurement + * LSTAT_OFF: turns off measurement + * LSTAT_RESET: resets statistics + * LSTAT_RELEASE: releases statistics storage + * + * This allows one to accumulate statistics over several lockstat runs: + * + * lockstat on + * lockstat off + * ...repeat above as desired... + * lockstat get + * ...now start a new set of measurements... + * lockstat reset + * lockstat on + * ... + * + */ +ssize_t put_lockmeter_info(const char *buffer, size_t len) +{ + int error = 0; + int dirsize, countsize, read_lock_countsize, hashsize; + int cpu; + char put_char; + int i, read_lock_blocks, flags; + rwlock_t *lock_ptr; + struct timeval tv; + + if (len <= 0) + return -EINVAL; + + _raw_spin_lock(&lstat_control.control_lock); + + get_user(put_char, buffer); + switch (put_char) { + + case LSTAT_OFF: + if (lstat_control.state != LSTAT_OFF) { + /* + * To avoid seeing read lock hold times in an inconsisent state, + * we have to follow this protocol to turn off statistics + */ + do { local_irq_save(flags); } while(0); + /* getting this lock will stop any read lock block allocations */ + _raw_spin_lock(&lstat_control.directory_lock); + /* keep any more read lock blocks from being allocated */ + lstat_control.state = LSTAT_OFF; + /* record how may read lock blocks there are */ + read_lock_blocks = lstat_control.next_free_read_lock_index; + _raw_spin_unlock(&lstat_control.directory_lock); + /* now go through the list of read locks */ + cpu = THIS_CPU_NUMBER; + for(i=1;ictors_start && mod->ctors_end) + remove_bb_link(mod); +#endif + /* Module unload stuff */ module_unload_free(mod); @@ -1575,6 +1585,13 @@ static struct module *load_module(void _ /* Module has been moved. */ mod = (void *)sechdrs[modindex].sh_addr; +#ifdef CONFIG_GCOV_PROFILE + modindex = find_sec(hdr, sechdrs, secstrings, ".ctors"); + mod->ctors_start = (char *)sechdrs[modindex].sh_addr; + mod->ctors_end = (char *)(mod->ctors_start + + sechdrs[modindex].sh_size); +#endif + /* Now we've moved module, initialize linked lists, etc. 
*/ module_unload_init(mod); @@ -1724,6 +1741,12 @@ sys_init_module(void __user *umod, /* Start the module */ ret = mod->init(); + +#ifdef CONFIG_GCOV_PROFILE + if (mod->ctors_start && mod->ctors_end) { + do_global_ctors(mod->ctors_start, mod->ctors_end, mod, 1); + } +#endif if (ret < 0) { /* Init routine failed: abort. Try to protect us from buggy refcounters. */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/sched.c 999-mjb/kernel/sched.c --- 000-virgin/kernel/sched.c 2003-10-01 11:48:28.000000000 -0700 +++ 999-mjb/kernel/sched.c 2003-10-02 16:41:02.000000000 -0700 @@ -37,7 +37,7 @@ #include #include -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) #else #define cpu_to_node_mask(cpu) (cpu_online_map) @@ -76,19 +76,28 @@ * maximum timeslice is 200 msecs. Timeslices get refilled after * they expire. */ -#define MIN_TIMESLICE ( 10 * HZ / 1000) -#define MAX_TIMESLICE (200 * HZ / 1000) + +int min_timeslice = (10 * HZ) / 1000; +#define MIN_TIMESLICE (min_timeslice) +int max_timeslice = (200 * HZ) / 1000; +#define MAX_TIMESLICE (max_timeslice) #define ON_RUNQUEUE_WEIGHT 30 -#define CHILD_PENALTY 95 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 +int child_penalty = 95; +#define CHILD_PENALTY (child_penalty) +int parent_penalty = 100; +#define PARENT_PENALTY (parent_penalty) +int exit_weight = 3; +#define EXIT_WEIGHT (exit_weight) +int prio_bonus_ratio = 25; +#define PRIO_BONUS_RATIO (prio_bonus_ratio) #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) -#define INTERACTIVE_DELTA 2 +int interactive_delta = 2; +#define INTERACTIVE_DELTA (interactive_delta) #define MAX_SLEEP_AVG (AVG_TIMESLICE * MAX_BONUS) #define STARVATION_LIMIT (MAX_SLEEP_AVG) #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) -#define NODE_THRESHOLD 125 +int node_threshold = 125; +#define NODE_THRESHOLD (node_threshold) #define CREDIT_LIMIT 100 /* @@ -203,7 +212,7 @@ struct runqueue { struct mm_struct *prev_mm; prio_array_t *active, *expired, arrays[2]; int prev_cpu_load[NR_CPUS]; -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED atomic_t *node_nr_running; int prev_node_load[MAX_NUMNODES]; #endif @@ -229,7 +238,7 @@ static DEFINE_PER_CPU(struct runqueue, r # define task_running(rq, p) ((rq)->curr == (p)) #endif -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED /* * Keep track of running tasks. @@ -266,13 +275,13 @@ __init void node_nr_running_init(void) } } -#else /* !CONFIG_NUMA */ +#else /* !CONFIG_NUMA_SCHED */ # define nr_running_init(rq) do { } while (0) # define nr_running_inc(rq) do { (rq)->nr_running++; } while (0) # define nr_running_dec(rq) do { (rq)->nr_running--; } while (0) -#endif /* CONFIG_NUMA */ +#endif /* CONFIG_NUMA_SCHED */ /* * task_rq_lock - lock the runqueue a given task resides on and disable @@ -822,6 +831,11 @@ unsigned long nr_running(void) return sum; } +unsigned long nr_running_cpu(int cpu) +{ + return cpu_rq(cpu)->nr_running; +} + unsigned long nr_uninterruptible(void) { unsigned long i, sum = 0; @@ -892,7 +906,7 @@ static inline void double_rq_unlock(runq spin_unlock(&rq2->lock); } -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED /* * If dest_cpu is allowed for this process, migrate the task to it. 
* This is accomplished by forcing the cpu_allowed mask to only @@ -919,36 +933,72 @@ static void sched_migrate_task(task_t *p */ static int sched_best_cpu(struct task_struct *p) { - int i, minload, load, best_cpu, node = 0; + int cpu, node, minload, load, best_cpu, best_node; + int this_cpu, this_node, this_node_load; cpumask_t cpumask; - best_cpu = task_cpu(p); - if (cpu_rq(best_cpu)->nr_running <= 2) - return best_cpu; + this_cpu = best_cpu = task_cpu(p); + if (cpu_rq(this_cpu)->nr_running <= 2) + return this_cpu; + this_node = best_node = cpu_to_node(this_cpu); + + /* + * First look for any node-local idle queue and use that. + * This improves performance under light loads (mbligh). + * In case this node turns out to be the lightest node, store the best + * cpu that we find, so we don't go sniffing the same runqueues again. + */ + minload = 10000000; + cpumask = node_to_cpumask(this_node); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (!cpu_isset(cpu, cpumask)) + continue; + load = cpu_rq(cpu)->nr_running; + if (load == 0) + return cpu; + if (load < minload) { + minload = load; + best_cpu = cpu; + } + } + /* + * Now find the lightest loaded node, and put it in best_node + * + * Node load is always divided by nr_cpus_node to normalise load + * values in case cpu count differs from node to node. We first + * multiply node_nr_running by 16 to get a little better resolution. + */ minload = 10000000; - for_each_node_with_cpus(i) { - /* - * Node load is always divided by nr_cpus_node to normalise - * load values in case cpu count differs from node to node. - * We first multiply node_nr_running by 10 to get a little - * better resolution. - */ - load = 10 * atomic_read(&node_nr_running[i]) / nr_cpus_node(i); + this_node_load = 16 * atomic_read(&node_nr_running[this_node]) + / nr_cpus_node(this_node); + for_each_node_with_cpus(node) { + if (node == this_node) + load = this_node_load; + else + load = 16 * atomic_read(&node_nr_running[node]) + / nr_cpus_node(node); if (load < minload) { minload = load; - node = i; + best_node = node; } } + /* If we chose this node, we already did the legwork earlier */ + if (best_node == this_node) + return best_cpu; + + /* Now find the lightest loaded cpu on best_node, and use that */ minload = 10000000; - cpumask = node_to_cpumask(node); - for (i = 0; i < NR_CPUS; ++i) { - if (!cpu_isset(i, cpumask)) + best_cpu = this_cpu; + cpumask = node_to_cpumask(best_node); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (!cpu_isset(cpu, cpumask)) continue; - if (cpu_rq(i)->nr_running < minload) { - best_cpu = i; - minload = cpu_rq(i)->nr_running; + load = cpu_rq(cpu)->nr_running; + if (load < minload) { + minload = load; + best_cpu = cpu; } } return best_cpu; @@ -999,7 +1049,10 @@ static int find_busiest_node(int this_no return node; } -#endif /* CONFIG_NUMA */ +#endif /* CONFIG_NUMA_SCHED */ + +int idle_node_rebalance_ratio = 10; +int busy_node_rebalance_ratio = 2; #ifdef CONFIG_SMP @@ -1247,10 +1300,10 @@ out: */ #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) #define BUSY_REBALANCE_TICK (HZ/5 ?: 1) -#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5) -#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2) +#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * idle_node_rebalance_ratio) +#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * busy_node_rebalance_ratio) -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED static void balance_node(runqueue_t *this_rq, int idle, int this_cpu) { int node = find_busiest_node(cpu_to_node(this_cpu)); @@ -1281,7 +1334,7 @@ static 
void rebalance_tick(runqueue_t *t * are not balanced.) */ if (idle) { -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED if (!(j % IDLE_NODE_REBALANCE_TICK)) balance_node(this_rq, idle, this_cpu); #endif @@ -1292,7 +1345,7 @@ static void rebalance_tick(runqueue_t *t } return; } -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED if (!(j % BUSY_NODE_REBALANCE_TICK)) balance_node(this_rq, idle, this_cpu); #endif diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/sys.c 999-mjb/kernel/sys.c --- 000-virgin/kernel/sys.c 2003-10-01 11:48:28.000000000 -0700 +++ 999-mjb/kernel/sys.c 2003-10-02 16:41:14.000000000 -0700 @@ -235,6 +235,7 @@ cond_syscall(sys_epoll_ctl) cond_syscall(sys_epoll_wait) cond_syscall(sys_pciconfig_read) cond_syscall(sys_pciconfig_write) +cond_syscall(sys_mbind) static int set_one_prio(struct task_struct *p, int niceval, int error) { diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/sysctl.c 999-mjb/kernel/sysctl.c --- 000-virgin/kernel/sysctl.c 2003-10-01 11:48:28.000000000 -0700 +++ 999-mjb/kernel/sysctl.c 2003-10-02 16:39:40.000000000 -0700 @@ -59,6 +59,16 @@ extern int cad_pid; extern int pid_max; extern int sysctl_lower_zone_protection; extern int min_free_kbytes; +extern int min_timeslice; +extern int max_timeslice; +extern int child_penalty; +extern int parent_penalty; +extern int exit_weight; +extern int prio_bonus_ratio; +extern int interactive_delta; +extern int node_threshold; +extern int idle_node_rebalance_ratio; +extern int busy_node_rebalance_ratio; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -121,6 +131,7 @@ static struct ctl_table_header root_tabl static ctl_table kern_table[]; static ctl_table vm_table[]; +static ctl_table sched_table[]; #ifdef CONFIG_NET extern ctl_table net_table[]; #endif @@ -200,6 +211,12 @@ static ctl_table root_table[] = { .mode = 0555, .child = dev_table, }, + { + .ctl_name = CTL_SCHED, + .procname = "sched", + .mode = 0555, + .child = sched_table, + }, { .ctl_name = 0 } }; @@ -587,6 +604,7 @@ static ctl_table kern_table[] = { /* Constants for minimum and maximum testing in vm_table. We use these as one-element integer vectors. 
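Stepping back to the sched_best_cpu() rework just above: node load is compared as 16 * node_nr_running / nr_cpus_node, so nodes with different CPU counts are judged on load per CPU rather than raw task count. A stand-alone toy of that comparison follows; the node figures are invented purely for illustration.

/* Toy model of the node-load normalisation used by sched_best_cpu():
 * load = 16 * tasks_running_on_node / cpus_on_node.
 * The node table below is illustrative, not from the patch. */
#include <stdio.h>

struct node_info {
	int nr_running;  /* tasks currently running on the node */
	int nr_cpus;     /* CPUs in the node */
};

static int node_load(const struct node_info *n)
{
	/* multiply first for a little extra resolution, as the patch does */
	return 16 * n->nr_running / n->nr_cpus;
}

int main(void)
{
	struct node_info nodes[] = {
		{ .nr_running = 3, .nr_cpus = 2 },  /* small node, busier per CPU */
		{ .nr_running = 8, .nr_cpus = 8 },  /* big node, one task per CPU */
	};
	int i, best = 0, minload = 10000000;

	for (i = 0; i < 2; i++) {
		int load = node_load(&nodes[i]);

		printf("node %d: load %d\n", i, load);
		if (load < minload) {
			minload = load;
			best = i;
		}
	}
	printf("lightest node: %d\n", best);
	return 0;
}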
*/ static int zero; +static int one = 1; static int one_hundred = 100; @@ -807,6 +825,42 @@ static ctl_table dev_table[] = { { .ctl_name = 0 } }; +static ctl_table sched_table[] = { + {SCHED_MAX_TIMESLICE, "max_timeslice", &max_timeslice, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_MIN_TIMESLICE, "min_timeslice", &min_timeslice, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_CHILD_PENALTY, "child_penalty", &child_penalty, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_PARENT_PENALTY, "parent_penalty", &parent_penalty, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_EXIT_WEIGHT, "exit_weight", &exit_weight, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_PRIO_BONUS_RATIO, "prio_bonus_ratio", &prio_bonus_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_INTERACTIVE_DELTA, "interactive_delta", &interactive_delta, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_NODE_THRESHOLD, "node_threshold", &node_threshold, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + sysctl_intvec, NULL, &one, NULL}, + {SCHED_IDLE_NODE_REBALANCE_RATIO, "idle_node_rebalance_ratio", + &idle_node_rebalance_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_BUSY_NODE_REBALANCE_RATIO, "busy_node_rebalance_ratio", + &busy_node_rebalance_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {0} +}; + extern void init_irq_proc (void); void __init sysctl_init(void) diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/timer.c 999-mjb/kernel/timer.c --- 000-virgin/kernel/timer.c 2003-10-01 11:47:15.000000000 -0700 +++ 999-mjb/kernel/timer.c 2003-10-02 16:41:02.000000000 -0700 @@ -750,6 +750,8 @@ static unsigned long count_active_tasks( * Requires xtime_lock to access. */ unsigned long avenrun[3]; +unsigned long tasks_running[3]; +DEFINE_PER_CPU(unsigned long[3],cpu_tasks_running); /* * calc_load - given tick count, update the avenrun load estimates. @@ -757,7 +759,7 @@ unsigned long avenrun[3]; */ static inline void calc_load(unsigned long ticks) { - unsigned long active_tasks; /* fixed-point */ + unsigned long active_tasks, running_tasks; /* fixed-point */ static int count = LOAD_FREQ; count -= ticks; @@ -767,9 +769,39 @@ static inline void calc_load(unsigned lo CALC_LOAD(avenrun[0], EXP_1, active_tasks); CALC_LOAD(avenrun[1], EXP_5, active_tasks); CALC_LOAD(avenrun[2], EXP_15, active_tasks); + running_tasks = nr_running() * FIXED_1; + CALC_LOAD(tasks_running[0], EXP_1, running_tasks); + CALC_LOAD(tasks_running[1], EXP_5, running_tasks); + CALC_LOAD(tasks_running[2], EXP_15, running_tasks); } } +/* + * This does the frequency calculation a little bit different from the + * global version above. It doesn't ever look at the kernel's concept + * of time, it just updates that stats every LOAD_FREQ times into the + * function. + * + * Using jiffies is more accurate, but there _are_ just statistics, so + * they're not worth messing with xtime_lock and company. If we miss + * an interrupt or two, big deal. 
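For reference, CALC_LOAD() used above is the kernel's fixed-point exponential moving average from include/linux/sched.h (not part of this patch); the new tasks_running[] and per-CPU cpu_tasks_running[] arrays are simply extra instances of it fed with nr_running() instead of count_active_tasks(). A stand-alone rendition of the arithmetic follows; the FSHIFT/EXP_1 constants are quoted from memory, so verify them against your tree.

/* Stand-alone rendition of the kernel's fixed-point load average.
 * FSHIFT/FIXED_1/EXP_1 follow the usual include/linux/sched.h values;
 * double-check against your tree before relying on them. */
#include <stdio.h>

#define FSHIFT   11                 /* bits of fractional precision */
#define FIXED_1  (1 << FSHIFT)      /* 1.0 in fixed point */
#define EXP_1    1884               /* 1/exp(5sec/1min) in fixed point */

/* avg = avg*e + n*(1-e), all in fixed point */
static unsigned long calc_load(unsigned long avg, unsigned long exp,
			       unsigned long n)
{
	avg *= exp;
	avg += n * (FIXED_1 - exp);
	return avg >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun = 0;
	int sample;

	/* pretend 4 tasks are runnable at every 5-second sample */
	for (sample = 0; sample < 24; sample++) {
		avenrun = calc_load(avenrun, EXP_1, 4 * FIXED_1);
		printf("sample %2d: load %lu.%02lu\n", sample,
		       avenrun >> FSHIFT,
		       (avenrun & (FIXED_1 - 1)) * 100 / FIXED_1);
	}
	return 0;
}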
+ */ +void calc_load_cpu(int cpu) +{ + unsigned long running_tasks; + static DEFINE_PER_CPU(int, count) = { LOAD_FREQ }; + + per_cpu(count, cpu)--; + if (per_cpu(count, cpu) != 0) + return; + + per_cpu(count, cpu) += LOAD_FREQ; + running_tasks = nr_running_cpu(cpu) * FIXED_1; + CALC_LOAD(per_cpu(cpu_tasks_running, cpu)[0], EXP_1, running_tasks); + CALC_LOAD(per_cpu(cpu_tasks_running, cpu)[1], EXP_5, running_tasks); + CALC_LOAD(per_cpu(cpu_tasks_running, cpu)[2], EXP_15, running_tasks); +} + /* jiffies at the most recent update of wall time */ unsigned long wall_jiffies = INITIAL_JIFFIES; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/Makefile 999-mjb/mm/Makefile --- 000-virgin/mm/Makefile 2003-10-01 11:47:15.000000000 -0700 +++ 999-mjb/mm/Makefile 2003-10-02 16:41:14.000000000 -0700 @@ -7,8 +7,10 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ shmem.o vmalloc.o -obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ +obj-y := bootmem.o fadvise.o filemap.o mempool.o oom_kill.o \ page_alloc.o page-writeback.o pdflush.o readahead.o \ slab.o swap.o truncate.o vmscan.o $(mmu-y) obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o + +obj-$(CONFIG_NUMA) += mbind.o diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/filemap.c 999-mjb/mm/filemap.c --- 000-virgin/mm/filemap.c 2003-10-01 11:48:28.000000000 -0700 +++ 999-mjb/mm/filemap.c 2003-10-02 16:53:55.000000000 -0700 @@ -70,6 +70,9 @@ * ->mmap_sem * ->i_sem (msync) * + * ->lock_page + * ->i_shared_sem (page_convert_anon) + * * ->inode_lock * ->sb_lock (fs/fs-writeback.c) * ->mapping->page_lock (__sync_single_inode) @@ -105,9 +108,9 @@ void remove_from_page_cache(struct page if (unlikely(!PageLocked(page))) PAGE_BUG(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); __remove_from_page_cache(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); } static inline int sync_page(struct page *page) @@ -139,9 +142,9 @@ static int __filemap_fdatawrite(struct a if (mapping->backing_dev_info->memory_backed) return 0; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); list_splice_init(&mapping->dirty_pages, &mapping->io_pages); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); ret = do_writepages(mapping, &wbc); return ret; } @@ -172,7 +175,7 @@ int filemap_fdatawait(struct address_spa restart: progress = 0; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); while (!list_empty(&mapping->locked_pages)) { struct page *page; @@ -186,7 +189,7 @@ restart: if (!PageWriteback(page)) { if (++progress > 32) { if (need_resched()) { - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); __cond_resched(); goto restart; } @@ -196,16 +199,16 @@ restart: progress = 0; page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); wait_on_page_writeback(page); if (PageError(page)) ret = -EIO; page_cache_release(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); /* Check for outstanding write errors */ if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) @@ -240,7 +243,7 @@ int add_to_page_cache(struct page *page, if (error == 0) { page_cache_get(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); if (!error) { 
SetPageLocked(page); @@ -248,7 +251,7 @@ int add_to_page_cache(struct page *page, } else { page_cache_release(page); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); radix_tree_preload_end(); } return error; @@ -377,11 +380,11 @@ struct page * find_get_page(struct addre * We scan the hash list read-only. Addition to and removal from * the hash-list needs a held write-lock. */ - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); page = radix_tree_lookup(&mapping->page_tree, offset); if (page) page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return page; } @@ -392,11 +395,11 @@ struct page *find_trylock_page(struct ad { struct page *page; - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); page = radix_tree_lookup(&mapping->page_tree, offset); if (page && TestSetPageLocked(page)) page = NULL; - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return page; } @@ -416,15 +419,15 @@ struct page *find_lock_page(struct addre { struct page *page; - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); repeat: page = radix_tree_lookup(&mapping->page_tree, offset); if (page) { page_cache_get(page); if (TestSetPageLocked(page)) { - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); lock_page(page); - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); /* Has the page been truncated while we slept? */ if (page->mapping != mapping || page->index != offset) { @@ -434,7 +437,7 @@ repeat: } } } - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return page; } @@ -504,12 +507,12 @@ unsigned int find_get_pages(struct addre unsigned int i; unsigned int ret; - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); ret = radix_tree_gang_lookup(&mapping->page_tree, (void **)pages, start, nr_pages); for (i = 0; i < ret; i++) page_cache_get(pages[i]); - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return ret; } diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/fremap.c 999-mjb/mm/fremap.c --- 000-virgin/mm/fremap.c 2003-10-01 11:48:28.000000000 -0700 +++ 999-mjb/mm/fremap.c 2003-10-02 16:42:18.000000000 -0700 @@ -38,7 +38,7 @@ static inline int zap_pte(struct mm_stru set_page_dirty(page); page_remove_rmap(page, ptep); page_cache_release(page); - mm->rss--; + dec_rss(mm, page); } } return 1; @@ -63,10 +63,26 @@ int install_page(struct mm_struct *mm, s pmd_t *pmd; pte_t pte_val; struct pte_chain *pte_chain; + unsigned long pgidx; pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) goto err; + + /* + * Convert this page to anon for objrmap if it's nonlinear + */ + pgidx = (addr - vma->vm_start) >> PAGE_SHIFT; + pgidx += vma->vm_pgoff; + pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; + if (!PageAnon(page) && (page->index != pgidx)) { + lock_page(page); + err = page_convert_anon(page); + unlock_page(page); + if (err < 0) + goto err_free; + } + pgd = pgd_offset(mm, addr); spin_lock(&mm->page_table_lock); @@ -80,7 +96,7 @@ int install_page(struct mm_struct *mm, s flush = zap_pte(mm, vma, addr, pte); - mm->rss++; + inc_rss(mm, page); flush_icache_page(vma, page); set_pte(pte, mk_pte(page, prot)); pte_chain = page_add_rmap(page, pte, pte_chain); @@ -89,12 +105,11 @@ int install_page(struct mm_struct *mm, s if (flush) flush_tlb_page(vma, addr); update_mmu_cache(vma, addr, pte_val); - spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); 
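The mm/filemap.c hunks above change mapping->page_lock from a plain spinlock into a reader/writer lock: pure lookups such as find_get_page() and find_get_pages() now take it for reading, while paths that add to or remove from the radix tree take it for writing. The mapping_rdlock()/mapping_wrlock() wrappers themselves are defined elsewhere in this patch. Purely as an analogy, the same read-mostly pattern in user-space terms:

/* User-space analogue of the page_lock read/write split: lookups share
 * the lock, insert/remove exclude.  An illustration of the pattern only,
 * not kernel code. */
#include <pthread.h>
#include <stdio.h>

#define TABLE_SIZE 64

static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;
static const char *table[TABLE_SIZE];

static const char *cache_lookup(unsigned int idx)
{
	const char *val;

	pthread_rwlock_rdlock(&table_lock);   /* like mapping_rdlock() */
	val = table[idx % TABLE_SIZE];
	pthread_rwlock_unlock(&table_lock);   /* like mapping_rdunlock() */
	return val;
}

static void cache_insert(unsigned int idx, const char *val)
{
	pthread_rwlock_wrlock(&table_lock);   /* like mapping_wrlock() */
	table[idx % TABLE_SIZE] = val;
	pthread_rwlock_unlock(&table_lock);   /* like mapping_wrunlock() */
}

int main(void)
{
	cache_insert(7, "page seven");
	printf("lookup(7) -> %s\n", cache_lookup(7));
	return 0;
}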
- return 0; + err = 0; err_unlock: spin_unlock(&mm->page_table_lock); +err_free: pte_chain_free(pte_chain); err: return err; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/mbind.c 999-mjb/mm/mbind.c --- 000-virgin/mm/mbind.c 1969-12-31 16:00:00.000000000 -0800 +++ 999-mjb/mm/mbind.c 2003-10-02 16:41:14.000000000 -0700 @@ -0,0 +1,147 @@ +/* + * mm/mbind.c + * + * Written by: Matthew Dobson, IBM Corporation + * + * Copyright (C) 2003, IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to + */ +#include +#include +#include +#include +#include + +/* Translate a cpumask to a nodemask */ +static inline void cpumask_to_nodemask(unsigned long * cpumask, unsigned long * nodemask) +{ + int i; + + for (i = 0; i < NR_CPUS; i++) + if (test_bit(i, cpumask)) + set_bit(cpu_to_node(i), nodemask); +} + +/* + * Adds the zones belonging to @pgdat to @zonelist. Returns the next + * index in @zonelist. + */ +static inline int add_node(pg_data_t *pgdat, struct zonelist *zonelist, int zone_num) +{ + int i; + struct zone *zone; + + for (i = MAX_NR_ZONES-1; i >=0 ; i--) { + zone = pgdat->node_zones + i; + if (zone->present_pages) + zonelist->zones[zone_num++] = zone; + } + return zone_num; +} + +/* Builds a binding for a region of memory, based on a bitmask of nodes. */ +static inline int build_binding(unsigned long * nodemask, struct binding *binding) +{ + int node, zone_num; + + memset(binding, 0, sizeof(struct binding)); + + /* Build binding zonelist */ + for (node = 0, zone_num = 0; node < MAX_NUMNODES; node++) + if (test_bit(node, nodemask) && node_online(node)) + zone_num = add_node(NODE_DATA(node), + &binding->zonelist, zone_num); + binding->zonelist.zones[zone_num] = NULL; + + if (zone_num == 0) + /* No zones were added to the zonelist. Let the caller know. */ + return -EINVAL; + + return 0; +} + + +/* + * mbind - Bind a range of a process' VM space to a set of memory blocks according to + * a predefined policy. 
+ * @start: beginning address of memory region to bind + * @len: length of memory region to bind + * @mask_ptr: pointer to bitmask of cpus + * @mask_len: length of the bitmask + * @policy: flag specifying the policy to use for the segment + */ +asmlinkage unsigned long sys_mbind(unsigned long start, unsigned long len, + unsigned long *mask_ptr, unsigned int mask_len, unsigned long policy) +{ + DECLARE_BITMAP(cpu_mask, NR_CPUS); + DECLARE_BITMAP(node_mask, MAX_NUMNODES); + struct vm_area_struct *vma = NULL; + struct address_space *mapping; + int copy_len, error = 0; + + /* Deal with getting cpu_mask from userspace & translating to node_mask */ + CLEAR_BITMAP(cpu_mask, NR_CPUS); + CLEAR_BITMAP(node_mask, MAX_NUMNODES); + copy_len = min(mask_len, (unsigned int)NR_CPUS); + if (copy_from_user(cpu_mask, mask_ptr, (copy_len+7)/8)) { + error = -EFAULT; + goto out; + } + cpumask_to_nodemask(cpu_mask, node_mask); + + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, start); + up_read(¤t->mm->mmap_sem); + /* This is an ugly, gross hack. This is purely because I've hurt my + * brain trying to come up with a brilliant way of implementing this + * for VMA's in general. Shared Memory VMA's lend themselves to binding + * both because of how they're implemented, and their actual uses. + * If anyone has a great place to squirrel-away some data about the + * requested binding, and a way to easily force the allocator to respect + * these bindings, then send a patch, or let me know. Otherwise, this + * will have to wait for a stroke of insight. + */ + if (!(vma && vma->vm_file && vma->vm_ops && + vma->vm_ops->nopage == shmem_nopage)) { + /* This isn't a shm segment. For now, we bail. */ + error = -EINVAL; + goto out; + } + + mapping = vma->vm_file->f_dentry->d_inode->i_mapping; + if (mapping->binding) { + kfree(mapping->binding); + mapping->binding = NULL; + } + mapping->binding = kmalloc(sizeof(struct binding), GFP_KERNEL); + if (!mapping->binding) { + error = -ENOMEM; + goto out; + } + error = build_binding(node_mask, mapping->binding); + if (error) { + kfree(mapping->binding); + mapping->binding = NULL; + } + +out: + return error; +} diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/memory.c 999-mjb/mm/memory.c --- 000-virgin/mm/memory.c 2003-10-01 11:48:28.000000000 -0700 +++ 999-mjb/mm/memory.c 2003-10-02 16:42:18.000000000 -0700 @@ -102,8 +102,7 @@ static inline void free_one_pmd(struct m static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir) { - int j; - pmd_t * pmd; + pmd_t * pmd, * md, * emd; if (pgd_none(*dir)) return; @@ -114,8 +113,21 @@ static inline void free_one_pgd(struct m } pmd = pmd_offset(dir, 0); pgd_clear(dir); - for (j = 0; j < PTRS_PER_PMD ; j++) - free_one_pmd(tlb, pmd+j); + /* + * Beware if changing the loop below. It once used int j, + * for (j = 0; j < PTRS_PER_PMD; j++) + * free_one_pmd(pmd+j); + * but some older i386 compilers (e.g. egcs-2.91.66, gcc-2.95.3) + * terminated the loop with a _signed_ address comparison + * using "jle", when configured for HIGHMEM64GB (X86_PAE). + * If also configured for 3GB of kernel virtual address space, + * if page at physical 0x3ffff000 virtual 0x7ffff000 is used as + * a pmd, when that mm exits the loop goes on to free "entries" + * found at 0x80000000 onwards. The loop below compiles instead + * to be terminated by unsigned address comparison using "jb". 
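To make the intended use of the new call concrete: the target region must be a SysV shared-memory mapping (anything else gets -EINVAL, as the "ugly, gross hack" comment admits), and the mask handed in is a CPU mask that the kernel converts to a node mask. A hedged user-space sketch follows; the syscall number is not part of the hunks shown and must be supplied for your architecture, and the policy argument is passed as 0 only as a placeholder.

/* Sketch of calling the new sys_mbind() on a SysV shared-memory segment,
 * the only mapping type the code above accepts.  __NR_mbind is not part
 * of the hunks shown, so it has to be filled in; policy 0 is a placeholder. */
#include <stdio.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_mbind
#error "define __NR_mbind to match your kernel's syscall table"
#endif

#define MASK_BITS 32   /* size of the cpu mask we pass, illustrative */

int main(void)
{
	size_t len = 4UL << 20;                            /* 4MB segment */
	unsigned long cpu_mask = (1UL << 0) | (1UL << 1);  /* CPUs 0 and 1 */
	int shmid;
	void *addr;
	long err;

	shmid = shmget(IPC_PRIVATE, len, IPC_CREAT | 0600);
	if (shmid < 0) { perror("shmget"); return 1; }
	addr = shmat(shmid, NULL, 0);
	if (addr == (void *)-1) { perror("shmat"); return 1; }

	/* the kernel translates the CPU mask to a node mask and builds a
	 * zonelist restricted to those nodes for this mapping */
	err = syscall(__NR_mbind, (unsigned long)addr, len,
		      &cpu_mask, MASK_BITS, 0UL);
	if (err)
		perror("mbind");
	else
		memset(addr, 0, len);   /* touch pages so they get allocated */

	shmdt(addr);
	shmctl(shmid, IPC_RMID, NULL);
	return err ? 1 : 0;
}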
+ */ + for (md = pmd, emd = pmd + PTRS_PER_PMD; md < emd; md++) + free_one_pmd(tlb,md); pmd_free_tlb(tlb, pmd); } @@ -319,7 +331,7 @@ skip_copy_pte_range: pte = pte_mkclean(pte); pte = pte_mkold(pte); get_page(page); - dst->rss++; + inc_rss(dst, page); set_pte(dst_pte, pte); pte_chain = page_add_rmap(page, dst_pte, @@ -411,7 +423,14 @@ zap_pte_range(struct mmu_gather *tlb, pm if (page->mapping && pte_young(pte) && !PageSwapCache(page)) mark_page_accessed(page); - tlb->freed++; + /* + * While we have the page that is being + * freed handy, make sure we decrement + * the mm's RSS accordingly. This is + * only important for NUMA per-node + * RSS accounting. + */ + dec_rss(tlb->mm, page); page_remove_rmap(page, ptep); tlb_remove_page(tlb, page); } @@ -1041,9 +1060,10 @@ static int do_wp_page(struct mm_struct * page_table = pte_offset_map(pmd, address); if (pte_same(*page_table, pte)) { if (PageReserved(old_page)) - ++mm->rss; + inc_rss(mm, new_page); page_remove_rmap(old_page, page_table); break_cow(vma, new_page, address, page_table); + SetPageAnon(new_page); pte_chain = page_add_rmap(new_page, page_table, pte_chain); lru_cache_add_active(new_page); @@ -1275,7 +1295,7 @@ static int do_swap_page(struct mm_struct if (vm_swap_full()) remove_exclusive_swap_page(page); - mm->rss++; + inc_rss(mm, page); pte = mk_pte(page, vma->vm_page_prot); if (write_access && can_share_swap_page(page)) pte = pte_mkdirty(pte_mkwrite(pte)); @@ -1283,6 +1303,7 @@ static int do_swap_page(struct mm_struct flush_icache_page(vma, page); set_pte(page_table, pte); + SetPageAnon(page); pte_chain = page_add_rmap(page, page_table, pte_chain); /* No need to invalidate - it was non-present before */ @@ -1344,10 +1365,11 @@ do_anonymous_page(struct mm_struct *mm, ret = VM_FAULT_MINOR; goto out; } - mm->rss++; + inc_rss(mm, page); entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); lru_cache_add_active(page); mark_page_accessed(page); + SetPageAnon(page); } set_pte(page_table, entry); @@ -1415,6 +1437,10 @@ retry: if (!pte_chain) goto oom; + /* See if nopage returned an anon page */ + if (!new_page->mapping || PageSwapCache(new_page)) + SetPageAnon(new_page); + /* * Should we do an early C-O-W break? */ @@ -1427,6 +1453,7 @@ retry: copy_user_highpage(page, new_page, address); page_cache_release(new_page); lru_cache_add_active(page); + SetPageAnon(page); new_page = page; } @@ -1458,7 +1485,7 @@ retry: /* Only go through if we didn't race with anybody else... */ if (pte_none(*page_table)) { if (!PageReserved(new_page)) - ++mm->rss; + inc_rss(mm, new_page); flush_icache_page(vma, new_page); entry = mk_pte(new_page, vma->vm_page_prot); if (write_access) diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/mmap.c 999-mjb/mm/mmap.c --- 000-virgin/mm/mmap.c 2003-10-01 11:48:28.000000000 -0700 +++ 999-mjb/mm/mmap.c 2003-10-02 16:42:18.000000000 -0700 @@ -268,9 +268,7 @@ static void vma_link(struct mm_struct *m if (mapping) down(&mapping->i_shared_sem); - spin_lock(&mm->page_table_lock); __vma_link(mm, vma, prev, rb_link, rb_parent); - spin_unlock(&mm->page_table_lock); if (mapping) up(&mapping->i_shared_sem); @@ -299,6 +297,25 @@ static inline int is_mergeable_vma(struc return 1; } +static void move_vma_start(struct vm_area_struct *vma, unsigned long addr) +{ + struct inode *inode = NULL; + + if (vma->vm_file) { + inode = vma->vm_file->f_dentry->d_inode; + down(&inode->i_mapping->i_shared_sem); + } + if (inode) + __remove_shared_vm_struct(vma, inode); + /* If no vm_file, perhaps we should always keep vm_pgoff at 0?? 
*/ + vma->vm_pgoff += (long)(addr - vma->vm_start) >> PAGE_SHIFT; + vma->vm_start = addr; + if (inode) { + __vma_link_file(vma); + up(&inode->i_mapping->i_shared_sem); + } +} + /* * Return true if we can merge this (vm_flags,file,vm_pgoff,size) * in front of (at a lower virtual address and file offset than) the vma. @@ -351,8 +368,6 @@ static int vma_merge(struct mm_struct *m unsigned long end, unsigned long vm_flags, struct file *file, unsigned long pgoff) { - spinlock_t * lock = &mm->page_table_lock; - /* * We later require that vma->vm_flags == vm_flags, so this tests * vma->vm_flags & VM_SPECIAL, too. @@ -380,7 +395,6 @@ static int vma_merge(struct mm_struct *m down(&inode->i_mapping->i_shared_sem); need_up = 1; } - spin_lock(lock); prev->vm_end = end; /* @@ -393,7 +407,6 @@ static int vma_merge(struct mm_struct *m prev->vm_end = next->vm_end; __vma_unlink(mm, next, prev); __remove_shared_vm_struct(next, inode); - spin_unlock(lock); if (need_up) up(&inode->i_mapping->i_shared_sem); if (file) @@ -403,7 +416,6 @@ static int vma_merge(struct mm_struct *m kmem_cache_free(vm_area_cachep, next); return 1; } - spin_unlock(lock); if (need_up) up(&inode->i_mapping->i_shared_sem); return 1; @@ -419,10 +431,7 @@ static int vma_merge(struct mm_struct *m pgoff, (end - addr) >> PAGE_SHIFT)) return 0; if (end == prev->vm_start) { - spin_lock(lock); - prev->vm_start = addr; - prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT; - spin_unlock(lock); + move_vma_start(prev, addr); return 1; } } @@ -868,19 +877,16 @@ int expand_stack(struct vm_area_struct * */ address += 4 + PAGE_SIZE - 1; address &= PAGE_MASK; - spin_lock(&vma->vm_mm->page_table_lock); grow = (address - vma->vm_end) >> PAGE_SHIFT; /* Overcommit.. */ if (security_vm_enough_memory(grow)) { - spin_unlock(&vma->vm_mm->page_table_lock); return -ENOMEM; } if (address - vma->vm_start > current->rlim[RLIMIT_STACK].rlim_cur || ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur) { - spin_unlock(&vma->vm_mm->page_table_lock); vm_unacct_memory(grow); return -ENOMEM; } @@ -888,7 +894,6 @@ int expand_stack(struct vm_area_struct * vma->vm_mm->total_vm += grow; if (vma->vm_flags & VM_LOCKED) vma->vm_mm->locked_vm += grow; - spin_unlock(&vma->vm_mm->page_table_lock); return 0; } @@ -922,19 +927,16 @@ int expand_stack(struct vm_area_struct * * the spinlock only before relocating the vma range ourself. */ address &= PAGE_MASK; - spin_lock(&vma->vm_mm->page_table_lock); grow = (vma->vm_start - address) >> PAGE_SHIFT; /* Overcommit.. */ if (security_vm_enough_memory(grow)) { - spin_unlock(&vma->vm_mm->page_table_lock); return -ENOMEM; } if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur || ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur) { - spin_unlock(&vma->vm_mm->page_table_lock); vm_unacct_memory(grow); return -ENOMEM; } @@ -943,7 +945,6 @@ int expand_stack(struct vm_area_struct * vma->vm_mm->total_vm += grow; if (vma->vm_flags & VM_LOCKED) vma->vm_mm->locked_vm += grow; - spin_unlock(&vma->vm_mm->page_table_lock); return 0; } @@ -1106,8 +1107,6 @@ static void unmap_region(struct mm_struc /* * Create a list of vma's touched by the unmap, removing them from the mm's * vma list as we go.. - * - * Called with the page_table_lock held. 
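The new move_vma_start() helper above encodes one invariant worth spelling out: when the start address of a file-backed vma moves, vm_pgoff has to move by the same number of pages so that every remaining virtual address keeps mapping the same file page. A toy version of that arithmetic, with the structure reduced to the fields involved:

/* Toy illustration of the vm_pgoff adjustment done by move_vma_start(). */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

struct toy_vma {
	unsigned long vm_start;   /* first mapped virtual address */
	unsigned long vm_end;     /* one past the last mapped address */
	unsigned long vm_pgoff;   /* file offset of vm_start, in pages */
};

static void move_vma_start(struct toy_vma *vma, unsigned long addr)
{
	/* keep (virtual address -> file page) constant while moving the start */
	vma->vm_pgoff += (long)(addr - vma->vm_start) >> PAGE_SHIFT;
	vma->vm_start = addr;
}

static unsigned long file_page_of(const struct toy_vma *vma, unsigned long addr)
{
	return vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
}

int main(void)
{
	struct toy_vma vma = {
		.vm_start = 0x40000000UL,
		.vm_end   = 0x40010000UL,   /* 16 pages */
		.vm_pgoff = 100,            /* maps file pages 100..115 */
	};
	unsigned long probe = 0x40008000UL; /* 8 pages into the mapping */

	printf("before: addr %#lx -> file page %lu\n",
	       probe, file_page_of(&vma, probe));

	/* trim the first 4 pages off the front, as split_vma might */
	move_vma_start(&vma, vma.vm_start + 4 * PAGE_SIZE);

	printf("after:  addr %#lx -> file page %lu\n",
	       probe, file_page_of(&vma, probe));
	return 0;
}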
*/ static void detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, @@ -1151,8 +1150,7 @@ int split_vma(struct mm_struct * mm, str if (new_below) { new->vm_end = addr; - vma->vm_start = addr; - vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT); + move_vma_start(vma, addr); } else { vma->vm_end = addr; new->vm_start = addr; @@ -1231,8 +1229,8 @@ int do_munmap(struct mm_struct *mm, unsi /* * Remove the vma's, and unmap the actual pages */ - spin_lock(&mm->page_table_lock); detach_vmas_to_be_unmapped(mm, mpnt, prev, end); + spin_lock(&mm->page_table_lock); unmap_region(mm, mpnt, prev, start, end); spin_unlock(&mm->page_table_lock); @@ -1384,7 +1382,7 @@ void exit_mmap(struct mm_struct *mm) vma = mm->mmap; mm->mmap = mm->mmap_cache = NULL; mm->mm_rb = RB_ROOT; - mm->rss = 0; + zero_rss(mm); mm->total_vm = 0; mm->locked_vm = 0; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/page-writeback.c 999-mjb/mm/page-writeback.c --- 000-virgin/mm/page-writeback.c 2003-10-01 11:48:28.000000000 -0700 +++ 999-mjb/mm/page-writeback.c 2003-10-02 16:53:55.000000000 -0700 @@ -469,12 +469,12 @@ int write_one_page(struct page *page, in if (wait) wait_on_page_writeback(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); list_del(&page->list); if (test_clear_page_dirty(page)) { list_add(&page->list, &mapping->locked_pages); page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); ret = mapping->a_ops->writepage(page, &wbc); if (ret == 0 && wait) { wait_on_page_writeback(page); @@ -484,7 +484,7 @@ int write_one_page(struct page *page, in page_cache_release(page); } else { list_add(&page->list, &mapping->clean_pages); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); unlock_page(page); } return ret; @@ -512,7 +512,7 @@ int __set_page_dirty_nobuffers(struct pa struct address_space *mapping = page->mapping; if (mapping) { - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); if (page->mapping) { /* Race with truncate? */ BUG_ON(page->mapping != mapping); if (!mapping->backing_dev_info->memory_backed) @@ -520,7 +520,7 @@ int __set_page_dirty_nobuffers(struct pa list_del(&page->list); list_add(&page->list, &mapping->dirty_pages); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); if (!PageSwapCache(page)) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/page_alloc.c 999-mjb/mm/page_alloc.c --- 000-virgin/mm/page_alloc.c 2003-10-01 11:48:28.000000000 -0700 +++ 999-mjb/mm/page_alloc.c 2003-10-02 16:44:09.000000000 -0700 @@ -225,6 +225,8 @@ static inline void free_pages_check(cons bad_page(function, page); if (PageDirty(page)) ClearPageDirty(page); + if (PageAnon(page)) + ClearPageAnon(page); } /* @@ -562,6 +564,10 @@ __alloc_pages(unsigned int gfp_mask, uns struct zone *z = zones[i]; unsigned long local_low; + if ((__GFP_NODE_STRICT & gfp_mask) && + (pfn_to_nid(z->zone_start_pfn) != numa_node_id())) + continue; + /* * This is the fabled 'incremental min'. We let real-time tasks * dip their real-time paws a little deeper into reserves. diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/readahead.c 999-mjb/mm/readahead.c --- 000-virgin/mm/readahead.c 2003-10-01 11:35:37.000000000 -0700 +++ 999-mjb/mm/readahead.c 2003-10-02 16:53:55.000000000 -0700 @@ -222,7 +222,7 @@ __do_page_cache_readahead(struct address /* * Preallocate as many pages as we will need. 
*/ - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); for (page_idx = 0; page_idx < nr_to_read; page_idx++) { unsigned long page_offset = offset + page_idx; @@ -233,16 +233,16 @@ __do_page_cache_readahead(struct address if (page) continue; - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); page = page_cache_alloc_cold(mapping); - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); if (!page) break; page->index = page_offset; list_add(&page->list, &page_pool); ret++; } - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); /* * Now start the IO. We ignore I/O errors - if the page is not diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/rmap.c 999-mjb/mm/rmap.c --- 000-virgin/mm/rmap.c 2003-10-01 11:47:15.000000000 -0700 +++ 999-mjb/mm/rmap.c 2003-10-02 16:42:18.000000000 -0700 @@ -102,6 +102,136 @@ pte_chain_encode(struct pte_chain *pte_c **/ /** + * find_pte - Find a pte pointer given a vma and a struct page. + * @vma: the vma to search + * @page: the page to find + * + * Determine if this page is mapped in this vma. If it is, map and rethrn + * the pte pointer associated with it. Return null if the page is not + * mapped in this vma for any reason. + * + * This is strictly an internal helper function for the object-based rmap + * functions. + * + * It is the caller's responsibility to unmap the pte if it is returned. + */ +static inline pte_t * +find_pte(struct vm_area_struct *vma, struct page *page, unsigned long *addr) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + unsigned long loffset; + unsigned long address; + + loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)); + address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT); + if (address < vma->vm_start || address >= vma->vm_end) + goto out; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pmd = pmd_offset(pgd, address); + if (!pmd_present(*pmd)) + goto out; + + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) + goto out_unmap; + + if (page_to_pfn(page) != pte_pfn(*pte)) + goto out_unmap; + + if (addr) + *addr = address; + + return pte; + +out_unmap: + pte_unmap(pte); +out: + return NULL; +} + +/** + * page_referenced_obj_one - referenced check for object-based rmap + * @vma: the vma to look in. + * @page: the page we're working on. + * + * Find a pte entry for a page/vma pair, then check and clear the referenced + * bit. + * + * This is strictly a helper function for page_referenced_obj. + */ +static int +page_referenced_obj_one(struct vm_area_struct *vma, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + pte_t *pte; + int referenced = 0; + + if (!spin_trylock(&mm->page_table_lock)) + return 1; + + pte = find_pte(vma, page, NULL); + if (pte) { + if (ptep_test_and_clear_young(pte)) + referenced++; + pte_unmap(pte); + } + + spin_unlock(&mm->page_table_lock); + return referenced; +} + +/** + * page_referenced_obj_one - referenced check for object-based rmap + * @page: the page we're checking references on. + * + * For an object-based mapped page, find all the places it is mapped and + * check/clear the referenced flag. This is done by following the page->mapping + * pointer, then walking the chain of vmas it holds. It returns the number + * of references it found. + * + * This function is only called from page_referenced for object-based pages. + * + * The semaphore address_space->i_shared_sem is tried. 
If it can't be gotten, + * assume a reference count of 1. + */ +static int +page_referenced_obj(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct vm_area_struct *vma; + int referenced = 0; + + if (!page->pte.mapcount) + return 0; + + if (!mapping) + BUG(); + + if (PageSwapCache(page)) + BUG(); + + if (down_trylock(&mapping->i_shared_sem)) + return 1; + + list_for_each_entry(vma, &mapping->i_mmap, shared) + referenced += page_referenced_obj_one(vma, page); + + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) + referenced += page_referenced_obj_one(vma, page); + + up(&mapping->i_shared_sem); + + return referenced; +} + +/** * page_referenced - test if the page was referenced * @page: the page to test * @@ -120,6 +250,10 @@ int page_referenced(struct page * page) if (TestClearPageReferenced(page)) referenced++; + if (!PageAnon(page)) { + referenced += page_referenced_obj(page); + goto out; + } if (PageDirect(page)) { pte_t *pte = rmap_ptep_map(page->pte.direct); if (ptep_test_and_clear_young(pte)) @@ -153,6 +287,7 @@ int page_referenced(struct page * page) __pte_chain_free(pc); } } +out: return referenced; } @@ -175,6 +310,21 @@ page_add_rmap(struct page *page, pte_t * pte_chain_lock(page); + /* + * If this is an object-based page, just count it. We can + * find the mappings by walking the object vma chain for that object. + */ + if (!PageAnon(page)) { + if (!page->mapping) + BUG(); + if (PageSwapCache(page)) + BUG(); + if (!page->pte.mapcount) + inc_page_state(nr_mapped); + page->pte.mapcount++; + goto out; + } + if (page->pte.direct == 0) { page->pte.direct = pte_paddr; SetPageDirect(page); @@ -231,8 +381,25 @@ void page_remove_rmap(struct page *page, pte_chain_lock(page); if (!page_mapped(page)) - goto out_unlock; /* remap_page_range() from a driver? */ + goto out_unlock; + /* + * If this is an object-based page, just uncount it. We can + * find the mappings by walking the object vma chain for that object. + */ + if (!PageAnon(page)) { + if (!page->mapping) + BUG(); + if (PageSwapCache(page)) + BUG(); + if (!page->pte.mapcount) + BUG(); + page->pte.mapcount--; + if (!page->pte.mapcount) + dec_page_state(nr_mapped); + goto out_unlock; + } + if (PageDirect(page)) { if (page->pte.direct == pte_paddr) { page->pte.direct = 0; @@ -279,6 +446,102 @@ out_unlock: } /** + * try_to_unmap_obj - unmap a page using the object-based rmap method + * @page: the page to unmap + * + * Determine whether a page is mapped in a given vma and unmap it if it's found. + * + * This function is strictly a helper function for try_to_unmap_obj. 
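On the address computation inside find_pte() further up: for a file-backed vma, the page's index in the file together with the vma's vm_pgoff pins down the single virtual address (per vma) where that page can appear, which is what lets the object-based code walk i_mmap and i_mmap_shared instead of pte_chains. The same arithmetic as a toy, assuming PAGE_CACHE_SHIFT == PAGE_SHIFT so the loffset shift drops out:

/* Toy version of the virtual-address lookup used by find_pte(): given a
 * file page index and a vma, compute where that page would be mapped.
 * Assumes one page per page-cache page. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define BAD_ADDR   (~0UL)

struct toy_vma {
	unsigned long vm_start;
	unsigned long vm_end;
	unsigned long vm_pgoff;   /* file page index of vm_start */
};

static unsigned long page_address_in_vma(const struct toy_vma *vma,
					  unsigned long page_index)
{
	unsigned long address;

	address = vma->vm_start +
		  ((page_index - vma->vm_pgoff) << PAGE_SHIFT);
	/* the page may simply not be covered by this vma */
	if (address < vma->vm_start || address >= vma->vm_end)
		return BAD_ADDR;
	return address;
}

int main(void)
{
	struct toy_vma vma = {
		.vm_start = 0x60000000UL,
		.vm_end   = 0x60020000UL,  /* 32 pages */
		.vm_pgoff = 10,            /* maps file pages 10..41 */
	};
	unsigned long addr;

	printf("file page 14 -> %#lx\n", page_address_in_vma(&vma, 14));

	addr = page_address_in_vma(&vma, 3);
	if (addr == BAD_ADDR)
		printf("file page  3 -> not mapped in this vma\n");
	return 0;
}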
+ */ +static inline int +try_to_unmap_obj_one(struct vm_area_struct *vma, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pte_t *pte; + pte_t pteval; + int ret = SWAP_AGAIN; + + if (!spin_trylock(&mm->page_table_lock)) + return ret; + + pte = find_pte(vma, page, &address); + if (!pte) + goto out; + + if (vma->vm_flags & VM_LOCKED) { + ret = SWAP_FAIL; + goto out_unmap; + } + + flush_cache_page(vma, address); + pteval = ptep_get_and_clear(pte); + flush_tlb_page(vma, address); + + if (pte_dirty(pteval)) + set_page_dirty(page); + + if (!page->pte.mapcount) + BUG(); + + mm->rss--; + page->pte.mapcount--; + page_cache_release(page); + +out_unmap: + pte_unmap(pte); + +out: + spin_unlock(&mm->page_table_lock); + return ret; +} + +/** + * try_to_unmap_obj - unmap a page using the object-based rmap method + * @page: the page to unmap + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the address_space struct it points to. + * + * This function is only called from try_to_unmap for object-based pages. + * + * The semaphore address_space->i_shared_sem is tried. If it can't be gotten, + * return a temporary error. + */ +static int +try_to_unmap_obj(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct vm_area_struct *vma; + int ret = SWAP_AGAIN; + + if (!mapping) + BUG(); + + if (PageSwapCache(page)) + BUG(); + + if (down_trylock(&mapping->i_shared_sem)) + return ret; + + list_for_each_entry(vma, &mapping->i_mmap, shared) { + ret = try_to_unmap_obj_one(vma, page); + if (ret == SWAP_FAIL || !page->pte.mapcount) + goto out; + } + + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + ret = try_to_unmap_obj_one(vma, page); + if (ret == SWAP_FAIL || !page->pte.mapcount) + goto out; + } + +out: + up(&mapping->i_shared_sem); + return ret; +} + +/** * try_to_unmap_one - worker function for try_to_unmap * @page: page to unmap * @ptep: page table entry to unmap from page @@ -360,7 +623,7 @@ static int try_to_unmap_one(struct page if (pte_dirty(pte)) set_page_dirty(page); - mm->rss--; + dec_rss(mm, page); page_cache_release(page); ret = SWAP_SUCCESS; @@ -397,6 +660,15 @@ int try_to_unmap(struct page * page) if (!page->mapping) BUG(); + /* + * If it's an object-based page, use the object vma chain to find all + * the mappings. + */ + if (!PageAnon(page)) { + ret = try_to_unmap_obj(page); + goto out; + } + if (PageDirect(page)) { ret = try_to_unmap_one(page, page->pte.direct); if (ret == SWAP_SUCCESS) { @@ -452,12 +724,115 @@ int try_to_unmap(struct page * page) } } out: - if (!page_mapped(page)) + if (!page_mapped(page)) { dec_page_state(nr_mapped); + ret = SWAP_SUCCESS; + } return ret; } /** + * page_convert_anon - Convert an object-based mapped page to pte_chain-based. + * @page: the page to convert + * + * Find all the mappings for an object-based page and convert them + * to 'anonymous', ie create a pte_chain and store all the pte pointers there. + * + * This function takes the address_space->i_shared_sem, sets the PageAnon flag, + * then sets the mm->page_table_lock for each vma and calls page_add_rmap. This + * means there is a period when PageAnon is set, but still has some mappings + * with no pte_chain entry. This is in fact safe, since page_remove_rmap will + * simply not find it. try_to_unmap might erroneously return success, but it + * will never be called because the page_convert_anon() caller has locked the + * page. 
+ * + * page_referenced() may fail to scan all the appropriate pte's and may return + * an inaccurate result. This is so rare that it does not matter. + */ +int page_convert_anon(struct page *page) +{ + struct address_space *mapping; + struct vm_area_struct *vma; + struct pte_chain *pte_chain = NULL; + pte_t *pte; + int err = 0; + + mapping = page->mapping; + if (mapping == NULL) + goto out; /* truncate won the lock_page() race */ + + down(&mapping->i_shared_sem); + pte_chain_lock(page); + + /* + * Has someone else done it for us before we got the lock? + * If so, pte.direct or pte.chain has replaced pte.mapcount. + */ + if (PageAnon(page)) { + pte_chain_unlock(page); + goto out_unlock; + } + + SetPageAnon(page); + if (page->pte.mapcount == 0) { + pte_chain_unlock(page); + goto out_unlock; + } + /* This is gonna get incremented by page_add_rmap */ + dec_page_state(nr_mapped); + page->pte.mapcount = 0; + + /* + * Now that the page is marked as anon, unlock it. page_add_rmap will + * lock it as necessary. + */ + pte_chain_unlock(page); + + list_for_each_entry(vma, &mapping->i_mmap, shared) { + if (!pte_chain) { + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + err = -ENOMEM; + goto out_unlock; + } + } + spin_lock(&vma->vm_mm->page_table_lock); + pte = find_pte(vma, page, NULL); + if (pte) { + /* Make sure this isn't a duplicate */ + page_remove_rmap(page, pte); + pte_chain = page_add_rmap(page, pte, pte_chain); + pte_unmap(pte); + } + spin_unlock(&vma->vm_mm->page_table_lock); + } + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + if (!pte_chain) { + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + err = -ENOMEM; + goto out_unlock; + } + } + spin_lock(&vma->vm_mm->page_table_lock); + pte = find_pte(vma, page, NULL); + if (pte) { + /* Make sure this isn't a duplicate */ + page_remove_rmap(page, pte); + pte_chain = page_add_rmap(page, pte, pte_chain); + pte_unmap(pte); + } + spin_unlock(&vma->vm_mm->page_table_lock); + } + +out_unlock: + pte_chain_free(pte_chain); + up(&mapping->i_shared_sem); +out: + return err; +} + +/** ** No more VM stuff below this comment, only pte_chain helper ** functions. 
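A pattern the object-based paths above use throughout: reclaim never sleeps on i_shared_sem or a page_table_lock, it trylocks and, on contention, returns a deliberately conservative answer (a reference count of 1, or SWAP_AGAIN) so the caller simply tries again later. The same idea reduced to a user-space illustration:

/* Illustration of the "trylock or give a conservative answer" pattern
 * used by page_referenced_obj() and try_to_unmap_obj().  Not kernel code. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t stats_lock = PTHREAD_MUTEX_INITIALIZER;
static int reference_count;        /* protected by stats_lock */

/* Returns the reference count, or a safe over-estimate of 1 if the lock
 * is busy - never blocks the (reclaim-like) caller. */
static int references_or_guess(void)
{
	int refs;

	if (pthread_mutex_trylock(&stats_lock) != 0)
		return 1;          /* busy: assume "recently referenced" */

	refs = reference_count;
	pthread_mutex_unlock(&stats_lock);
	return refs;
}

int main(void)
{
	printf("uncontended: %d\n", references_or_guess());

	pthread_mutex_lock(&stats_lock);           /* simulate contention */
	printf("contended:   %d (conservative)\n", references_or_guess());
	pthread_mutex_unlock(&stats_lock);
	return 0;
}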
**/ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/swap_state.c 999-mjb/mm/swap_state.c --- 000-virgin/mm/swap_state.c 2003-10-01 11:35:37.000000000 -0700 +++ 999-mjb/mm/swap_state.c 2003-10-02 16:53:55.000000000 -0700 @@ -25,7 +25,7 @@ extern struct address_space_operations s struct address_space swapper_space = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC), - .page_lock = SPIN_LOCK_UNLOCKED, + .page_lock = MAPPING_RW_LOCK_UNLOCKED, .clean_pages = LIST_HEAD_INIT(swapper_space.clean_pages), .dirty_pages = LIST_HEAD_INIT(swapper_space.dirty_pages), .io_pages = LIST_HEAD_INIT(swapper_space.io_pages), @@ -182,9 +182,9 @@ void delete_from_swap_cache(struct page entry.val = page->index; - spin_lock(&swapper_space.page_lock); + mapping_wrlock(&swapper_space.page_lock); __delete_from_swap_cache(page); - spin_unlock(&swapper_space.page_lock); + mapping_wrunlock(&swapper_space.page_lock); swap_free(entry); page_cache_release(page); @@ -195,8 +195,8 @@ int move_to_swap_cache(struct page *page struct address_space *mapping = page->mapping; int err; - spin_lock(&swapper_space.page_lock); - spin_lock(&mapping->page_lock); + mapping_wrlock(&swapper_space.page_lock); + mapping_wrlock(&mapping->page_lock); err = radix_tree_insert(&swapper_space.page_tree, entry.val, page); if (!err) { @@ -204,8 +204,8 @@ int move_to_swap_cache(struct page *page ___add_to_page_cache(page, &swapper_space, entry.val); } - spin_unlock(&mapping->page_lock); - spin_unlock(&swapper_space.page_lock); + mapping_wrunlock(&mapping->page_lock); + mapping_wrunlock(&swapper_space.page_lock); if (!err) { if (!swap_duplicate(entry)) @@ -231,8 +231,8 @@ int move_from_swap_cache(struct page *pa entry.val = page->index; - spin_lock(&swapper_space.page_lock); - spin_lock(&mapping->page_lock); + mapping_wrlock(&swapper_space.page_lock); + mapping_wrlock(&mapping->page_lock); err = radix_tree_insert(&mapping->page_tree, index, page); if (!err) { @@ -240,8 +240,8 @@ int move_from_swap_cache(struct page *pa ___add_to_page_cache(page, mapping, index); } - spin_unlock(&mapping->page_lock); - spin_unlock(&swapper_space.page_lock); + mapping_wrunlock(&mapping->page_lock); + mapping_wrunlock(&swapper_space.page_lock); if (!err) { swap_free(entry); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/swapfile.c 999-mjb/mm/swapfile.c --- 000-virgin/mm/swapfile.c 2003-10-01 11:47:15.000000000 -0700 +++ 999-mjb/mm/swapfile.c 2003-10-02 16:53:56.000000000 -0700 @@ -253,10 +253,10 @@ static int exclusive_swap_page(struct pa /* Is the only swap cache user the cache itself? */ if (p->swap_map[swp_offset(entry)] == 1) { /* Recheck the page count with the pagecache lock held.. */ - spin_lock(&swapper_space.page_lock); + mapping_rdlock(&swapper_space.page_lock); if (page_count(page) - !!PagePrivate(page) == 2) retval = 1; - spin_unlock(&swapper_space.page_lock); + mapping_rdunlock(&swapper_space.page_lock); } swap_info_put(p); } @@ -324,13 +324,13 @@ int remove_exclusive_swap_page(struct pa retval = 0; if (p->swap_map[swp_offset(entry)] == 1) { /* Recheck the page count with the pagecache lock held.. 
*/ - spin_lock(&swapper_space.page_lock); + mapping_wrlock(&swapper_space.page_lock); if ((page_count(page) == 2) && !PageWriteback(page)) { __delete_from_swap_cache(page); SetPageDirty(page); retval = 1; } - spin_unlock(&swapper_space.page_lock); + mapping_wrunlock(&swapper_space.page_lock); } swap_info_put(p); @@ -387,9 +387,10 @@ static void unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir, swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) { - vma->vm_mm->rss++; + inc_rss(vma->vm_mm, page); get_page(page); set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); + SetPageAnon(page); *pte_chainp = page_add_rmap(page, dir, *pte_chainp); swap_free(entry); } @@ -498,6 +499,7 @@ static int unuse_process(struct mm_struc /* * Go through process' page directory. */ + down_read(&mm->mmap_sem); spin_lock(&mm->page_table_lock); for (vma = mm->mmap; vma; vma = vma->vm_next) { pgd_t * pgd = pgd_offset(mm, vma->vm_start); @@ -505,6 +507,7 @@ static int unuse_process(struct mm_struc break; } spin_unlock(&mm->page_table_lock); + up_read(&mm->mmap_sem); pte_chain_free(pte_chain); return 0; } diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/truncate.c 999-mjb/mm/truncate.c --- 000-virgin/mm/truncate.c 2003-06-05 14:56:45.000000000 -0700 +++ 999-mjb/mm/truncate.c 2003-10-02 16:53:56.000000000 -0700 @@ -73,13 +73,13 @@ invalidate_complete_page(struct address_ if (PagePrivate(page) && !try_to_release_page(page, 0)) return 0; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); if (PageDirty(page)) { - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); return 0; } __remove_from_page_cache(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); ClearPageUptodate(page); page_cache_release(page); /* pagecache ref */ return 1; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/vmscan.c 999-mjb/mm/vmscan.c --- 000-virgin/mm/vmscan.c 2003-10-01 11:47:15.000000000 -0700 +++ 999-mjb/mm/vmscan.c 2003-10-02 16:53:56.000000000 -0700 @@ -353,7 +353,7 @@ shrink_list(struct list_head *page_list, goto keep_locked; if (!may_write_to_queue(mapping->backing_dev_info)) goto keep_locked; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); if (test_clear_page_dirty(page)) { int res; struct writeback_control wbc = { @@ -364,7 +364,7 @@ shrink_list(struct list_head *page_list, }; list_move(&page->list, &mapping->locked_pages); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); SetPageReclaim(page); res = mapping->a_ops->writepage(page, &wbc); @@ -380,7 +380,7 @@ shrink_list(struct list_head *page_list, } goto keep; } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); } /* @@ -414,7 +414,7 @@ shrink_list(struct list_head *page_list, if (!mapping) goto keep_locked; /* truncate got there first */ - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); /* * The non-racy check for busy page. It is critical to check @@ -422,7 +422,7 @@ shrink_list(struct list_head *page_list, * not in use by anybody. 
(pagecache + us == 2) */ if (page_count(page) != 2 || PageDirty(page)) { - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); goto keep_locked; } @@ -430,7 +430,7 @@ shrink_list(struct list_head *page_list, if (PageSwapCache(page)) { swp_entry_t swap = { .val = page->index }; __delete_from_swap_cache(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); swap_free(swap); __put_page(page); /* The pagecache ref */ goto free_it; @@ -438,7 +438,7 @@ shrink_list(struct list_head *page_list, #endif /* CONFIG_SWAP */ __remove_from_page_cache(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); __put_page(page); free_it: diff -purN -X /home/mbligh/.diff.exclude 000-virgin/scripts/Makefile.build 999-mjb/scripts/Makefile.build --- 000-virgin/scripts/Makefile.build 2003-10-01 11:48:31.000000000 -0700 +++ 999-mjb/scripts/Makefile.build 2003-10-02 16:43:03.000000000 -0700 @@ -128,7 +128,16 @@ cmd_cc_i_c = $(CPP) $(c_flags) - quiet_cmd_cc_o_c = CC $(quiet_modtag) $@ ifndef CONFIG_MODVERSIONS -cmd_cc_o_c = $(CC) $(c_flags) -c -o $@ $< +new1_c_flags = $(c_flags:-I%=-I$(TOPDIR)/%) +new2_c_flags = $(new1_c_flags:-Wp%=) +PWD = $(TOPDIR) + +quiet_cmd_cc_o_c = CC $(quiet_modtag) $@ +cmd_cc_o_c = $(CC) $(c_flags) -E -o $@ $< \ + && cd $(dir $<) \ + && $(CC) $(new2_c_flags) -c -o $(notdir $@) $(notdir $<) \ + && cd $(TOPDIR) +#cmd_cc_o_c = $(CC) $(c_flags) -c -o $@ $< else # When module versioning is enabled the following steps are executed: @@ -143,12 +152,21 @@ else # replace the unresolved symbols __crc_exported_symbol with # the actual value of the checksum generated by genksyms -cmd_cc_o_c = $(CC) $(c_flags) -c -o $(@D)/.tmp_$(@F) $< +new1_c_flags = $(c_flags:-I%=-I$(TOPDIR)/%) +new2_c_flags = $(new1_c_flags:-Wp%=) +PWD = $(TOPDIR) + +quiet_cmd_cc_o_c = CC $(quiet_modtag) $@ +cmd_cc_o_c = $(CC) $(c_flags) -E -o $@ $< \ + && cd $(dir $<) \ + && $(CC) $(new2_c_flags) -c -o .tmp_$(@F) $(notdir $<) \ + && cd $(TOPDIR) +#cmd_cc_o_c = $(CC) $(c_flags) -c -o $(@D)/.tmp_$(@F) $< cmd_modversions = \ if ! $(OBJDUMP) -h $(@D)/.tmp_$(@F) | grep -q __ksymtab; then \ mv $(@D)/.tmp_$(@F) $@; \ else \ - $(CPP) -D__GENKSYMS__ $(c_flags) $< \ + $(CPP) -D__GENKSYMS__ $(new2_c_flags) $< \ | $(GENKSYMS) \ > $(@D)/.tmp_$(@F:.o=.ver); \ \