diff -purN -X /home/mbligh/.diff.exclude 000-virgin/Documentation/filesystems/proc.txt 141-no_vma_sort/Documentation/filesystems/proc.txt --- 000-virgin/Documentation/filesystems/proc.txt 2003-04-21 14:13:56.000000000 -0700 +++ 141-no_vma_sort/Documentation/filesystems/proc.txt 2003-06-09 10:16:47.000000000 -0700 @@ -37,6 +37,7 @@ Table of Contents 2.8 /proc/sys/net/ipv4 - IPV4 settings 2.9 Appletalk 2.10 IPX + 2.11 /proc/sys/sched - scheduler tunables ------------------------------------------------------------------------------ Preface @@ -1750,6 +1751,104 @@ The /proc/net/ipx_route table holds a gives the destination network, the router node (or Directly) and the network address of the router (or Connected) for internal networks. +2.11 /proc/sys/sched - scheduler tunables +----------------------------------------- + +Useful knobs for tuning the scheduler live in /proc/sys/sched. + +child_penalty +------------- + +Percentage of the parent's sleep_avg that children inherit. sleep_avg is +a running average of the time a process spends sleeping. Tasks with high +sleep_avg values are considered interactive and given a higher dynamic +priority and a larger timeslice. You typically want this some value just +under 100. + +exit_weight +----------- + +When a CPU hog task exits, its parent's sleep_avg is reduced by a factor of +exit_weight against the exiting task's sleep_avg. + +interactive_delta +----------------- + +If a task is "interactive" it is reinserted into the active array after it +has expired its timeslice, instead of being inserted into the expired array. +How "interactive" a task must be in order to be deemed interactive is a +function of its nice value. This interactive limit is scaled linearly by nice +value and is offset by the interactive_delta. + +max_sleep_avg +------------- + +max_sleep_avg is the largest value (in ms) stored for a task's running sleep +average. The larger this value, the longer a task needs to sleep to be +considered interactive (maximum interactive bonus is a function of +max_sleep_avg). + +max_timeslice +------------- + +Maximum timeslice, in milliseconds. This is the value given to tasks of the +highest dynamic priority. + +min_timeslice +------------- + +Minimum timeslice, in milliseconds. This is the value given to tasks of the +lowest dynamic priority. Every task gets at least this slice of the processor +per array switch. + +parent_penalty +-------------- + +Percentage of the parent's sleep_avg that it retains across a fork(). +sleep_avg is a running average of the time a process spends sleeping. Tasks +with high sleep_avg values are considered interactive and given a higher +dynamic priority and a larger timeslice. Normally, this value is 100 and thus +task's retain their sleep_avg on fork. If you want to punish interactive +tasks for forking, set this below 100. + +prio_bonus_ratio +---------------- + +Middle percentage of the priority range that tasks can receive as a dynamic +priority. The default value of 25% ensures that nice values at the +extremes are still enforced. For example, nice +19 interactive tasks will +never be able to preempt a nice 0 CPU hog. Setting this higher will increase +the size of the priority range the tasks can receive as a bonus. Setting +this lower will decrease this range, making the interactivity bonus less +apparent and user nice values more applicable. + +starvation_limit +---------------- + +Sufficiently interactive tasks are reinserted into the active array when they +run out of timeslice. Normally, tasks are inserted into the expired array. +Reinserting interactive tasks into the active array allows them to remain +runnable, which is important to interactive performance. This could starve +expired tasks, however, since the interactive task could prevent the array +switch. To prevent starving the tasks on the expired array for too long. the +starvation_limit is the longest (in ms) we will let the expired array starve +at the expense of reinserting interactive tasks back into active. Higher +values here give more preferance to running interactive tasks, at the expense +of expired tasks. Lower values provide more fair scheduling behavior, at the +expense of interactivity. The units are in milliseconds. + +idle_node_rebalance_ratio +------------------------- + +On NUMA machines, we normally rebalance within nodes, but we also rebalance +globally every N idle rebalance ticks, where N = idle_node_rebalance_ratio. + +busy_node_rebalance_ratio +------------------------- + +On NUMA machines, we normally rebalance within nodes, but we also rebalance +globally every N busy rebalance ticks, where N = busy_node_rebalance_ratio. + ------------------------------------------------------------------------------ Summary ------------------------------------------------------------------------------ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/Kconfig 141-no_vma_sort/arch/i386/Kconfig --- 000-virgin/arch/i386/Kconfig 2003-06-05 14:51:24.000000000 -0700 +++ 141-no_vma_sort/arch/i386/Kconfig 2003-06-09 10:15:30.000000000 -0700 @@ -666,6 +666,44 @@ config HIGHMEM64G endchoice +choice + help + On i386, a process can only virtually address 4GB of memory. This + lets you select how much of that virtual space you would like to + devoted to userspace, and how much to the kernel. + + Some userspace programs would like to address as much as possible and + have few demands of the kernel other than it get out of the way. These + users may opt to use the 3.5GB option to give their userspace program + as much room as possible. Due to alignment issues imposed by PAE, + the "3.5GB" option is unavailable if "64GB" high memory support is + enabled. + + Other users (especially those who use PAE) may be running out of + ZONE_NORMAL memory. Those users may benefit from increasing the + kernel's virtual address space size by taking it away from userspace, + which may not need all of its space. An indicator that this is + happening is when /proc/Meminfo's "LowFree:" is a small percentage of + "LowTotal:" while "HighFree:" is very large. + + If unsure, say "3GB" + prompt "User address space size" + default 1GB + +config 05GB + bool "3.5 GB" + depends on !HIGHMEM64G + +config 1GB + bool "3 GB" + +config 2GB + bool "2 GB" + +config 3GB + bool "1 GB" +endchoice + config HIGHMEM bool depends on HIGHMEM64G || HIGHMEM4G @@ -768,6 +806,25 @@ config MTRR See for more information. +choice + help + This is unrelated to your processor's speed. This variable alters + how often the system is asked to generate timer interrupts. A larger + value can lead to a more responsive system, but also causes extra + overhead from the increased number of context switches. + + If in doubt, leave it at the default of 1000. + + prompt "Kernel HZ" + default 1000HZ + +config 100HZ + bool "100 Hz" + +config 1000HZ + bool "1000 Hz" +endchoice + config HAVE_DEC_LOCK bool depends on (SMP || PREEMPT) && X86_CMPXCHG @@ -1499,6 +1556,26 @@ config MAGIC_SYSRQ keys are documented in . Don't say Y unless you really know what this hack does. +config EARLY_PRINTK + bool "Early console support" + default n + depends on DEBUG_KERNEL + help + Write kernel log output directly into the VGA buffer or serial port. + This is useful for kernel debugging when your machine crashes very + early before the console code is initialized. For normal operation + it is not recommended because it looks ugly and doesn't cooperate + with klogd/syslogd or the X server.You should normally N here, + unless you want to debug such a crash. + + Syntax: earlyprintk=vga + earlyprintk=serial[,ttySn[,baudrate]] + Append ,keep to not disable it when the real console takes over. + Only vga or serial at a time, not both. + Currently only ttyS0 and ttyS1 are supported. + Interaction with the standard serial driver is not very good. + The VGA output is eventually overwritten by the real console. + config DEBUG_SPINLOCK bool "Spinlock debugging" depends on DEBUG_KERNEL diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/Makefile 141-no_vma_sort/arch/i386/Makefile --- 000-virgin/arch/i386/Makefile 2003-06-05 14:51:24.000000000 -0700 +++ 141-no_vma_sort/arch/i386/Makefile 2003-06-09 10:15:30.000000000 -0700 @@ -94,6 +94,7 @@ drivers-$(CONFIG_OPROFILE) += arch/i386 CFLAGS += $(mflags-y) AFLAGS += $(mflags-y) +AFLAGS_vmlinux.lds.o += -imacros $(TOPDIR)/include/asm-i386/page.h boot := arch/i386/boot diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/kernel/irq.c 141-no_vma_sort/arch/i386/kernel/irq.c --- 000-virgin/arch/i386/kernel/irq.c 2003-06-05 14:51:26.000000000 -0700 +++ 141-no_vma_sort/arch/i386/kernel/irq.c 2003-06-09 10:16:51.000000000 -0700 @@ -898,8 +898,9 @@ static int irq_affinity_write_proc (stru return -EINVAL; irq_affinity[irq] = new_value; +#ifndef CONFIG_X86_SUMMIT irq_desc[irq].handler->set_affinity(irq, new_value); - +#endif return full_count; } diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/kernel/smpboot.c 141-no_vma_sort/arch/i386/kernel/smpboot.c --- 000-virgin/arch/i386/kernel/smpboot.c 2003-06-05 14:51:27.000000000 -0700 +++ 141-no_vma_sort/arch/i386/kernel/smpboot.c 2003-06-09 10:15:18.000000000 -0700 @@ -62,7 +62,7 @@ int smp_num_siblings = 1; int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */ /* Bitmask of currently online CPUs */ -unsigned long cpu_online_map; +unsigned long cpu_online_map = 1; static volatile unsigned long cpu_callin_map; volatile unsigned long cpu_callout_map; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/arch/i386/vmlinux.lds.S 141-no_vma_sort/arch/i386/vmlinux.lds.S --- 000-virgin/arch/i386/vmlinux.lds.S 2003-06-05 14:51:28.000000000 -0700 +++ 141-no_vma_sort/arch/i386/vmlinux.lds.S 2003-06-09 10:15:30.000000000 -0700 @@ -10,7 +10,7 @@ ENTRY(startup_32) jiffies = jiffies_64; SECTIONS { - . = 0xC0000000 + 0x100000; + . = __PAGE_OFFSET + 0x100000; /* read-only */ _text = .; /* Text and read-only data */ .text : { diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/exec.c 141-no_vma_sort/fs/exec.c --- 000-virgin/fs/exec.c 2003-06-05 14:55:26.000000000 -0700 +++ 141-no_vma_sort/fs/exec.c 2003-06-09 10:16:53.000000000 -0700 @@ -316,6 +316,7 @@ void put_dirty_page(struct task_struct * } lru_cache_add_active(page); flush_dcache_page(page); + SetPageAnon(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot)))); pte_chain = page_add_rmap(page, pte, pte_chain); pte_unmap(pte); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/fs/proc/proc_misc.c 141-no_vma_sort/fs/proc/proc_misc.c --- 000-virgin/fs/proc/proc_misc.c 2003-06-05 14:55:42.000000000 -0700 +++ 141-no_vma_sort/fs/proc/proc_misc.c 2003-06-09 10:15:46.000000000 -0700 @@ -303,6 +303,9 @@ static struct file_operations proc_vmsta .release = seq_release, }; +extern int schedstats_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data); + #ifdef CONFIG_PROC_HARDWARE static int hardware_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) @@ -359,6 +362,71 @@ static struct file_operations proc_modul }; #endif +#ifdef CONFIG_NUMA +#define K(x) ((x) << (PAGE_SHIFT - 10)) +static int show_meminfo_numa (struct seq_file *m, void *v) +{ + int *d = v; + int nid = *d; + struct sysinfo i; + si_meminfo_node(&i, nid); + seq_printf(m, "\n" + "Node %d MemTotal: %8lu kB\n" + "Node %d MemFree: %8lu kB\n" + "Node %d MemUsed: %8lu kB\n" + "Node %d HighTotal: %8lu kB\n" + "Node %d HighFree: %8lu kB\n" + "Node %d LowTotal: %8lu kB\n" + "Node %d LowFree: %8lu kB\n", + nid, K(i.totalram), + nid, K(i.freeram), + nid, K(i.totalram-i.freeram), + nid, K(i.totalhigh), + nid, K(i.freehigh), + nid, K(i.totalram-i.totalhigh), + nid, K(i.freeram-i.freehigh)); + + return 0; +} +#undef K + +extern struct seq_operations meminfo_numa_op; +static int meminfo_numa_open(struct inode *inode, struct file *file) +{ + return seq_open(file,&meminfo_numa_op); +} + +static struct file_operations proc_meminfo_numa_operations = { + open: meminfo_numa_open, + read: seq_read, + llseek: seq_lseek, + release: seq_release, +}; + +static void *meminfo_numa_start(struct seq_file *m, loff_t *pos) +{ + return *pos < numnodes ? pos : NULL; +} + +static void *meminfo_numa_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return meminfo_numa_start(m, pos); +} + +static void meminfo_numa_stop(struct seq_file *m, void *v) +{ +} + +struct seq_operations meminfo_numa_op = { + .start = meminfo_numa_start, + .next = meminfo_numa_next, + .stop = meminfo_numa_stop, + .show = show_meminfo_numa, +}; + +#endif + extern struct seq_operations slabinfo_op; extern ssize_t slabinfo_write(struct file *, const char *, size_t, loff_t *); static int slabinfo_open(struct inode *inode, struct file *file) @@ -636,6 +704,7 @@ void __init proc_misc_init(void) #endif {"locks", locks_read_proc}, {"execdomains", execdomains_read_proc}, + {"schedstat", schedstats_read_proc}, {NULL,} }; for (p = simple_ones; p->name; p++) @@ -659,6 +728,9 @@ void __init proc_misc_init(void) #ifdef CONFIG_MODULES create_seq_entry("modules", 0, &proc_modules_operations); #endif +#ifdef CONFIG_NUMA + create_seq_entry("meminfo.numa",0,&proc_meminfo_numa_operations); +#endif proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL); if (proc_root_kcore) { proc_root_kcore->proc_fops = &proc_kcore_operations; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/early_printk.h 141-no_vma_sort/include/asm-i386/early_printk.h --- 000-virgin/include/asm-i386/early_printk.h 1969-12-31 16:00:00.000000000 -0800 +++ 141-no_vma_sort/include/asm-i386/early_printk.h 2003-06-09 10:15:18.000000000 -0700 @@ -0,0 +1,8 @@ +#ifndef __EARLY_PRINTK_H_I386_ +#define __EARLY_PRINTK_H_i386_ + +#define VGABASE 0xB8000 +#define SERIAL_BASES { 0x3f8, 0x2f8 } +#define SERIAL_BASES_LEN 2 + +#endif diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/page.h 141-no_vma_sort/include/asm-i386/page.h --- 000-virgin/include/asm-i386/page.h 2003-04-09 11:48:05.000000000 -0700 +++ 141-no_vma_sort/include/asm-i386/page.h 2003-06-09 10:15:30.000000000 -0700 @@ -115,9 +115,26 @@ static __inline__ int get_order(unsigned #endif /* __ASSEMBLY__ */ #ifdef __ASSEMBLY__ -#define __PAGE_OFFSET (0xC0000000) +#include +#ifdef CONFIG_05GB +#define __PAGE_OFFSET (0xE0000000) +#elif defined(CONFIG_1GB) +#define __PAGE_OFFSET (0xC0000000) +#elif defined(CONFIG_2GB) +#define __PAGE_OFFSET (0x80000000) +#elif defined(CONFIG_3GB) +#define __PAGE_OFFSET (0x40000000) +#endif #else -#define __PAGE_OFFSET (0xC0000000UL) +#ifdef CONFIG_05GB +#define __PAGE_OFFSET (0xE0000000UL) +#elif defined(CONFIG_1GB) +#define __PAGE_OFFSET (0xC0000000UL) +#elif defined(CONFIG_2GB) +#define __PAGE_OFFSET (0x80000000UL) +#elif defined(CONFIG_3GB) +#define __PAGE_OFFSET (0x40000000UL) +#endif #endif diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/param.h 141-no_vma_sort/include/asm-i386/param.h --- 000-virgin/include/asm-i386/param.h 2002-12-09 18:45:45.000000000 -0800 +++ 141-no_vma_sort/include/asm-i386/param.h 2003-06-09 10:15:23.000000000 -0700 @@ -2,11 +2,19 @@ #define _ASMi386_PARAM_H #ifdef __KERNEL__ -# define HZ 1000 /* Internal kernel timer frequency */ -# define USER_HZ 100 /* .. some user interfaces are in "ticks" */ -# define CLOCKS_PER_SEC (USER_HZ) /* like times() */ +#include + +#ifdef CONFIG_1000HZ +# define HZ 1000 /* Internal kernel timer frequency */ +#else +# define HZ 100 #endif +#define USER_HZ 100 /* .. some user interfaces are in "ticks" */ +#define CLOCKS_PER_SEC (USER_HZ) /* like times() */ + +#endif /* __KERNEL__ */ + #ifndef HZ #define HZ 100 #endif diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-i386/processor.h 141-no_vma_sort/include/asm-i386/processor.h --- 000-virgin/include/asm-i386/processor.h 2003-06-05 14:56:10.000000000 -0700 +++ 141-no_vma_sort/include/asm-i386/processor.h 2003-06-09 10:15:30.000000000 -0700 @@ -288,7 +288,11 @@ extern unsigned int mca_pentium_flag; /* This decides where the kernel will search for a free chunk of vm * space during mmap's. */ +#ifdef CONFIG_05GB +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 16)) +#else #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) +#endif /* * Size of io_bitmap in longwords: 32 is ports 0-0x3ff. diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/asm-x86_64/early_printk.h 141-no_vma_sort/include/asm-x86_64/early_printk.h --- 000-virgin/include/asm-x86_64/early_printk.h 1969-12-31 16:00:00.000000000 -0800 +++ 141-no_vma_sort/include/asm-x86_64/early_printk.h 2003-06-09 10:15:18.000000000 -0700 @@ -0,0 +1,8 @@ +#ifdef __EARLY_PRINTK_H_X86_64_ +#define __EARLY_PRINTK_H_X86_64_ + +#define VGABASE 0xffffffff800b8000UL +#define SERIAL_BASES { 0x3f8, 0x2f8 } +#define SERIAL_BASES_LEN 2 + +#endif diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/early_printk.h 141-no_vma_sort/include/linux/early_printk.h --- 000-virgin/include/linux/early_printk.h 1969-12-31 16:00:00.000000000 -0800 +++ 141-no_vma_sort/include/linux/early_printk.h 2003-06-09 10:15:18.000000000 -0700 @@ -0,0 +1,47 @@ +#ifndef __EARLY_PRINTK_H_ +#define __EARLY_PRINTK_H_ + +#ifdef CONFIG_EARLY_PRINTK +#include +#include +#include +#include +#include +#include + +/* Simple VGA output */ + +#define MAX_YPOS 25 +#define MAX_XPOS 80 + +/* Simple serial port output */ + +#define DEFAULT_BAUD 57600 +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ +#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define DLL 0 /* Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + + +void early_printk(const char *fmt, ...); +int __init setup_early_printk(char *opt); + +#else + +#define early_printk(...) do {} while(0) +#define setup_early_printk(X) do {} while(0) + +#endif + +#endif diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/mm.h 141-no_vma_sort/include/linux/mm.h --- 000-virgin/include/linux/mm.h 2003-06-05 14:56:33.000000000 -0700 +++ 141-no_vma_sort/include/linux/mm.h 2003-06-09 10:16:54.000000000 -0700 @@ -179,6 +179,7 @@ struct page { struct pte_chain *chain;/* Reverse pte mapping pointer. * protected by PG_chainlock */ pte_addr_t direct; + int mapcount; } pte; unsigned long private; /* mapping-private opaque data */ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/page-flags.h 141-no_vma_sort/include/linux/page-flags.h --- 000-virgin/include/linux/page-flags.h 2003-04-21 14:14:50.000000000 -0700 +++ 141-no_vma_sort/include/linux/page-flags.h 2003-06-09 10:16:54.000000000 -0700 @@ -74,6 +74,7 @@ #define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ #define PG_reclaim 18 /* To be reclaimed asap */ #define PG_compound 19 /* Part of a compound page */ +#define PG_anon 20 /* Anonymous page */ /* @@ -257,6 +258,10 @@ extern void get_full_page_state(struct p #define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) #define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) +#define PageAnon(page) test_bit(PG_anon, &(page)->flags) +#define SetPageAnon(page) set_bit(PG_anon, &(page)->flags) +#define ClearPageAnon(page) clear_bit(PG_anon, &(page)->flags) + /* * The PageSwapCache predicate doesn't use a PG_flag at this time, * but it may again do so one day. diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/swap.h 141-no_vma_sort/include/linux/swap.h --- 000-virgin/include/linux/swap.h 2003-06-05 14:56:37.000000000 -0700 +++ 141-no_vma_sort/include/linux/swap.h 2003-06-09 10:16:54.000000000 -0700 @@ -186,6 +186,8 @@ struct pte_chain *FASTCALL(page_add_rmap void FASTCALL(page_remove_rmap(struct page *, pte_t *)); int FASTCALL(try_to_unmap(struct page *)); +int page_convert_anon(struct page *); + /* linux/mm/shmem.c */ extern int shmem_unuse(swp_entry_t entry, struct page *page); #else diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/sysctl.h 141-no_vma_sort/include/linux/sysctl.h --- 000-virgin/include/linux/sysctl.h 2003-06-05 14:56:37.000000000 -0700 +++ 141-no_vma_sort/include/linux/sysctl.h 2003-06-09 10:16:47.000000000 -0700 @@ -66,7 +66,8 @@ enum CTL_DEV=7, /* Devices */ CTL_BUS=8, /* Busses */ CTL_ABI=9, /* Binary emulation */ - CTL_CPU=10 /* CPU stuff (speed scaling, etc) */ + CTL_CPU=10, /* CPU stuff (speed scaling, etc) */ + CTL_SCHED=11, /* scheduler tunables */ }; /* CTL_BUS names: */ @@ -158,6 +159,21 @@ enum VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */ }; +/* Tunable scheduler parameters in /proc/sys/sched/ */ +enum { + SCHED_MIN_TIMESLICE=1, /* minimum process timeslice */ + SCHED_MAX_TIMESLICE=2, /* maximum process timeslice */ + SCHED_CHILD_PENALTY=3, /* penalty on fork to child */ + SCHED_PARENT_PENALTY=4, /* penalty on fork to parent */ + SCHED_EXIT_WEIGHT=5, /* penalty to parent of CPU hog child */ + SCHED_PRIO_BONUS_RATIO=6, /* percent of max prio given as bonus */ + SCHED_INTERACTIVE_DELTA=7, /* delta used to scale interactivity */ + SCHED_MAX_SLEEP_AVG=8, /* maximum sleep avg attainable */ + SCHED_STARVATION_LIMIT=9, /* no re-active if expired is starved */ + SCHED_NODE_THRESHOLD=10, /* NUMA node rebalance threshold */ + SCHED_IDLE_NODE_REBALANCE_RATIO=11, /* how often to global balance */ + SCHED_BUSY_NODE_REBALANCE_RATIO=12, /* how often to global balance */ +}; /* CTL_NET names: */ enum diff -purN -X /home/mbligh/.diff.exclude 000-virgin/include/linux/timex.h 141-no_vma_sort/include/linux/timex.h --- 000-virgin/include/linux/timex.h 2003-06-05 14:39:26.000000000 -0700 +++ 141-no_vma_sort/include/linux/timex.h 2003-06-09 10:15:23.000000000 -0700 @@ -75,7 +75,7 @@ #elif HZ >= 768 && HZ < 1536 # define SHIFT_HZ 10 #else -# error You lose. +# error Please use a HZ value which is between 12 and 1536 #endif /* diff -purN -X /home/mbligh/.diff.exclude 000-virgin/init/main.c 141-no_vma_sort/init/main.c --- 000-virgin/init/main.c 2003-06-05 14:56:42.000000000 -0700 +++ 141-no_vma_sort/init/main.c 2003-06-09 10:15:18.000000000 -0700 @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -387,6 +388,7 @@ asmlinkage void __init start_kernel(void */ lock_kernel(); printk(linux_banner); + setup_early_printk(&command_line); setup_arch(&command_line); setup_per_cpu_areas(); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/Makefile 141-no_vma_sort/kernel/Makefile --- 000-virgin/kernel/Makefile 2003-06-05 14:56:42.000000000 -0700 +++ 141-no_vma_sort/kernel/Makefile 2003-06-09 10:15:18.000000000 -0700 @@ -19,6 +19,7 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o obj-$(CONFIG_COMPAT) += compat.o +obj-$(CONFIG_EARLY_PRINTK) += early_printk.o ifneq ($(CONFIG_IA64),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/early_printk.c 141-no_vma_sort/kernel/early_printk.c --- 000-virgin/kernel/early_printk.c 1969-12-31 16:00:00.000000000 -0800 +++ 141-no_vma_sort/kernel/early_printk.c 2003-06-09 10:15:18.000000000 -0700 @@ -0,0 +1,209 @@ +#include +#include +#include +#include +#include +#include + +/* Simple VGA output */ + +#define MAX_YPOS 25 +#define MAX_XPOS 80 + +static int current_ypos = 1, current_xpos = 0; + +static void early_vga_write(struct console *con, const char *str, unsigned n) +{ + char c; + int i, k, j; + + while ((c = *str++) != '\0' && n-- > 0) { + if (current_ypos >= MAX_YPOS) { + /* scroll 1 line up */ + for(k = 1, j = 0; k < MAX_YPOS; k++, j++) { + for(i = 0; i < MAX_XPOS; i++) { + writew(readw(VGABASE + 2*(MAX_XPOS*k + i)), + VGABASE + 2*(MAX_XPOS*j + i)); + } + } + for(i = 0; i < MAX_XPOS; i++) { + writew(0x720, VGABASE + 2*(MAX_XPOS*j + i)); + } + current_ypos = MAX_YPOS-1; + } + if (c == '\n') { + current_xpos = 0; + current_ypos++; + } else if (c != '\r') { + writew(((0x7 << 8) | (unsigned short) c), + VGABASE + 2*(MAX_XPOS*current_ypos + current_xpos++)); + if (current_xpos >= MAX_XPOS) { + current_xpos = 0; + current_ypos++; + } + } + } +} + +static struct console early_vga_console = { + .name = "earlyvga", + .write = early_vga_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Serial functions losely based on a similar package from Klaus P. Gerlicher */ + +int early_serial_base; /* ttyS0 */ + +static int early_serial_putc(unsigned char ch) +{ + unsigned timeout = 0xffff; + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + rep_nop(); + outb(ch, early_serial_base + TXR); + return timeout ? 0 : -1; +} + +static void early_serial_write(struct console *con, const char *s, unsigned n) +{ + while (*s && n-- > 0) { + early_serial_putc(*s); + if (*s == '\n') + early_serial_putc('\r'); + s++; + } +} + +static __init void early_serial_init(char *opt) +{ + unsigned char c; + unsigned divisor, baud = DEFAULT_BAUD; + static int bases[] = SERIAL_BASES; + char *s, *e; + + early_serial_base = bases[0]; + + if (*opt == ',') + ++opt; + + s = strsep(&opt, ","); + if (s != NULL) { + unsigned port; + if (!strncmp(s,"0x",2)) + early_serial_base = simple_strtoul(s, &e, 16); + else { + if (!strncmp(s,"ttyS",4)) + s+=4; + port = simple_strtoul(s, &e, 10); + if (port > (SERIAL_BASES_LEN-1) || s == e) + port = 0; + early_serial_base = bases[port]; + } + } + + outb(0x3, early_serial_base + LCR); /* 8n1 */ + outb(0, early_serial_base + IER); /* no interrupt */ + outb(0, early_serial_base + FCR); /* no fifo */ + outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + + s = strsep(&opt, ","); + if (s != NULL) { + baud = simple_strtoul(s, &e, 0); + if (baud == 0 || s == e) + baud = DEFAULT_BAUD; + } + + divisor = 115200 / baud; + c = inb(early_serial_base + LCR); + outb(c | DLAB, early_serial_base + LCR); + outb(divisor & 0xff, early_serial_base + DLL); + outb((divisor >> 8) & 0xff, early_serial_base + DLH); + outb(c & ~DLAB, early_serial_base + LCR); +} + +static struct console early_serial_console = { + .name = "earlyser", + .write = early_serial_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Direct interface for emergencies */ +struct console *early_console = &early_vga_console; +static int early_console_initialized = 0; + +void early_printk(const char *fmt, ...) +{ + char buf[512]; + int n; + va_list ap; + va_start(ap,fmt); + n = vsnprintf(buf,512,fmt,ap); + early_console->write(early_console,buf,n); + va_end(ap); +} + +static int keep_early; + +int __init setup_early_printk(char *opt) +{ + char *space, *s; + char buf[256]; + + s = strstr(opt, "earlyprintk="); + if (s == NULL) + return -1; + opt = s+12; + + if (early_console_initialized) + return -1; + + strncpy(buf,opt,256); + buf[255] = 0; + space = strchr(buf, ' '); + if (space) + *space = 0; + + if (strstr(buf,"keep")) + keep_early = 1; + + if (!strncmp(buf, "serial", 6)) { + early_serial_init(buf + 6); + early_console = &early_serial_console; + } else if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf); + early_console = &early_serial_console; + } else if (!strncmp(buf, "vga", 3)) { + early_console = &early_vga_console; + } else { + early_console = NULL; + return -1; + } + early_console_initialized = 1; + register_console(early_console); + early_printk( "early printk console registered\n" ); + return 0; +} + +void __init disable_early_printk(void) +{ + if (!early_console_initialized || !early_console) + return; + if (!keep_early) { + printk("disabling early console...\n"); + unregister_console(early_console); + early_console_initialized = 0; + } else { + printk("keeping early console.\n"); + } +} + +/* syntax: earlyprintk=vga + earlyprintk=serial[,ttySn[,baudrate]] + Append ,keep to not disable it when the real console takes over. + Only vga or serial at a time, not both. + Currently only ttyS0 and ttyS1 are supported. + Interaction with the standard serial driver is not very good. + The VGA output is eventually overwritten by the real console. */ +__setup("earlyprintk=", setup_early_printk); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/sched.c 141-no_vma_sort/kernel/sched.c --- 000-virgin/kernel/sched.c 2003-06-05 14:56:43.000000000 -0700 +++ 141-no_vma_sort/kernel/sched.c 2003-06-09 10:16:47.000000000 -0700 @@ -64,16 +64,27 @@ * maximum timeslice is 200 msecs. Timeslices get refilled after * they expire. */ -#define MIN_TIMESLICE ( 10 * HZ / 1000) -#define MAX_TIMESLICE (200 * HZ / 1000) -#define CHILD_PENALTY 50 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (10*HZ) -#define STARVATION_LIMIT (10*HZ) -#define NODE_THRESHOLD 125 +int min_timeslice = (10 * HZ) / 1000; +int max_timeslice = (200 * HZ) / 1000; +int child_penalty = 50; +int parent_penalty = 100; +int exit_weight = 3; +int prio_bonus_ratio = 25; +int interactive_delta = 2; +int max_sleep_avg = 10 * HZ; +int starvation_limit = 10 * HZ; +int node_threshold = 125; + +#define MIN_TIMESLICE (min_timeslice) +#define MAX_TIMESLICE (max_timeslice) +#define CHILD_PENALTY (child_penalty) +#define PARENT_PENALTY (parent_penalty) +#define EXIT_WEIGHT (exit_weight) +#define PRIO_BONUS_RATIO (prio_bonus_ratio) +#define INTERACTIVE_DELTA (interactive_delta) +#define MAX_SLEEP_AVG (max_sleep_avg) +#define STARVATION_LIMIT (starvation_limit) +#define NODE_THRESHOLD (node_threshold) /* * If a task is 'interactive' then we reinsert it in the active @@ -230,6 +241,111 @@ __init void node_nr_running_init(void) #endif /* CONFIG_NUMA */ + +struct schedstat { + /* sys_sched_yield stats */ + unsigned long yld_exp_empty; + unsigned long yld_act_empty; + unsigned long yld_both_empty; + unsigned long yld_cnt; + + /* schedule stats */ + unsigned long sched_noswitch; + unsigned long sched_switch; + unsigned long sched_cnt; + + /* load_balance stats */ + unsigned long lb_imbalance; + unsigned long lb_idle; + unsigned long lb_busy; + unsigned long lb_resched; + unsigned long lb_cnt; + unsigned long lb_nobusy; + unsigned long lb_bnode; + + /* pull_task stats */ + unsigned long pt_gained; + unsigned long pt_lost; + unsigned long pt_node_gained; + unsigned long pt_node_lost; + + /* balance_node stats */ + unsigned long bn_cnt; + unsigned long bn_idle; +} ____cacheline_aligned; + +/* + * bump this up when changing the output format or the meaning of an existing + * format, so that tools can adapt (or abort) + */ +#define SCHEDSTAT_VERSION 2 + +struct schedstat schedstats[NR_CPUS]; + +/* + * This could conceivably exceed a page's worth of output on machines with + * large number of cpus, where large == about 4096/100 or 40ish. Start + * worrying when we pass 32, probably. Then this has to stop being a + * "simple" entry in proc/proc_misc.c and needs to be an actual seq_file. + */ +int schedstats_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct schedstat sums; + int i, len; + + memset(&sums, 0, sizeof(sums)); + len = sprintf(page, "version %d\n", SCHEDSTAT_VERSION); + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) continue; + sums.yld_exp_empty += schedstats[i].yld_exp_empty; + sums.yld_act_empty += schedstats[i].yld_act_empty; + sums.yld_both_empty += schedstats[i].yld_both_empty; + sums.yld_cnt += schedstats[i].yld_cnt; + sums.sched_noswitch += schedstats[i].sched_noswitch; + sums.sched_switch += schedstats[i].sched_switch; + sums.sched_cnt += schedstats[i].sched_cnt; + sums.lb_idle += schedstats[i].lb_idle; + sums.lb_busy += schedstats[i].lb_busy; + sums.lb_resched += schedstats[i].lb_resched; + sums.lb_cnt += schedstats[i].lb_cnt; + sums.lb_imbalance += schedstats[i].lb_imbalance; + sums.lb_nobusy += schedstats[i].lb_nobusy; + sums.lb_bnode += schedstats[i].lb_bnode; + sums.pt_node_gained += schedstats[i].pt_node_gained; + sums.pt_node_lost += schedstats[i].pt_node_lost; + sums.pt_gained += schedstats[i].pt_gained; + sums.pt_lost += schedstats[i].pt_lost; + sums.bn_cnt += schedstats[i].bn_cnt; + sums.bn_idle += schedstats[i].bn_idle; + len += sprintf(page + len, + "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu " + "%lu %lu %lu %lu %lu %lu %lu %lu\n", + i, schedstats[i].yld_both_empty, + schedstats[i].yld_act_empty, schedstats[i].yld_exp_empty, + schedstats[i].yld_cnt, schedstats[i].sched_noswitch, + schedstats[i].sched_switch, schedstats[i].sched_cnt, + schedstats[i].lb_idle, schedstats[i].lb_busy, + schedstats[i].lb_resched, + schedstats[i].lb_cnt, schedstats[i].lb_imbalance, + schedstats[i].lb_nobusy, schedstats[i].lb_bnode, + schedstats[i].pt_gained, schedstats[i].pt_lost, + schedstats[i].pt_node_gained, schedstats[i].pt_node_lost, + schedstats[i].bn_cnt, schedstats[i].bn_idle); + } + len += sprintf(page + len, + "totals %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu " + "%lu %lu %lu %lu %lu %lu %lu\n", + sums.yld_both_empty, sums.yld_act_empty, sums.yld_exp_empty, + sums.yld_cnt, sums.sched_noswitch, sums.sched_switch, + sums.sched_cnt, sums.lb_idle, sums.lb_busy, sums.lb_resched, + sums.lb_cnt, sums.lb_imbalance, sums.lb_nobusy, sums.lb_bnode, + sums.pt_gained, sums.pt_lost, sums.pt_node_gained, + sums.pt_node_lost, sums.bn_cnt, sums.bn_idle); + + return len; +} + /* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. Note the ordering: we can safely lookup the task_rq without @@ -656,7 +772,6 @@ static inline task_t * context_switch(ru return prev; } - /* * nr_running, nr_uninterruptible and nr_context_switches: * @@ -840,6 +955,9 @@ static int find_busiest_node(int this_no #endif /* CONFIG_NUMA */ +int idle_node_rebalance_ratio = 10; +int busy_node_rebalance_ratio = 2; + #ifdef CONFIG_SMP /* @@ -951,6 +1069,12 @@ out: */ static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu) { + if (cpu_to_node(this_cpu) != cpu_to_node(src_rq - runqueues)) { + schedstats[this_cpu].pt_node_gained++; + schedstats[src_rq - runqueues].pt_node_lost++; + } + schedstats[this_cpu].pt_gained++; + schedstats[src_rq - runqueues].pt_lost++; dequeue_task(p, src_array); nr_running_dec(src_rq); set_task_cpu(p, this_cpu); @@ -985,10 +1109,14 @@ static void load_balance(runqueue_t *thi struct list_head *head, *curr; task_t *tmp; + schedstats[this_cpu].lb_cnt++; busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask); - if (!busiest) + if (!busiest) { + schedstats[this_cpu].lb_nobusy++; goto out; + } + schedstats[this_cpu].lb_imbalance += imbalance; /* * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to @@ -1067,8 +1195,8 @@ out: */ #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) #define BUSY_REBALANCE_TICK (HZ/5 ?: 1) -#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5) -#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2) +#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * idle_node_rebalance_ratio) +#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * busy_node_rebalance_ratio) #ifdef CONFIG_NUMA static void balance_node(runqueue_t *this_rq, int idle, int this_cpu) @@ -1076,9 +1204,13 @@ static void balance_node(runqueue_t *thi int node = find_busiest_node(cpu_to_node(this_cpu)); unsigned long cpumask, this_cpumask = 1UL << this_cpu; + schedstats[this_cpu].bn_cnt++; + if (idle) + schedstats[this_cpu].bn_idle++; if (node >= 0) { cpumask = node_to_cpumask(node) | this_cpumask; spin_lock(&this_rq->lock); + schedstats[this_cpu].lb_bnode++; load_balance(this_rq, idle, cpumask); spin_unlock(&this_rq->lock); } @@ -1087,9 +1219,7 @@ static void balance_node(runqueue_t *thi static void rebalance_tick(runqueue_t *this_rq, int idle) { -#ifdef CONFIG_NUMA int this_cpu = smp_processor_id(); -#endif unsigned long j = jiffies; /* @@ -1107,6 +1237,7 @@ static void rebalance_tick(runqueue_t *t #endif if (!(j % IDLE_REBALANCE_TICK)) { spin_lock(&this_rq->lock); + schedstats[this_cpu].lb_idle++; load_balance(this_rq, 0, cpu_to_node_mask(this_cpu)); spin_unlock(&this_rq->lock); } @@ -1118,6 +1249,7 @@ static void rebalance_tick(runqueue_t *t #endif if (!(j % BUSY_REBALANCE_TICK)) { spin_lock(&this_rq->lock); + schedstats[this_cpu].lb_busy++; load_balance(this_rq, idle, cpu_to_node_mask(this_cpu)); spin_unlock(&this_rq->lock); } @@ -1243,13 +1375,14 @@ asmlinkage void schedule(void) runqueue_t *rq; prio_array_t *array; struct list_head *queue; - int idx; + int idx, mycpu = smp_processor_id(); /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ + schedstats[mycpu].sched_cnt++; if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) { if (unlikely(in_atomic())) { printk(KERN_ERR "bad: scheduling while atomic!\n"); @@ -1288,6 +1421,7 @@ need_resched: pick_next_task: if (unlikely(!rq->nr_running)) { #ifdef CONFIG_SMP + schedstats[mycpu].lb_resched++; load_balance(rq, 1, cpu_to_node_mask(smp_processor_id())); if (rq->nr_running) goto pick_next_task; @@ -1302,11 +1436,13 @@ pick_next_task: /* * Switch the active and expired arrays. */ + schedstats[mycpu].sched_switch++; rq->active = rq->expired; rq->expired = array; array = rq->active; rq->expired_timestamp = 0; } + schedstats[mycpu].sched_noswitch++; idx = sched_find_first_bit(array->bitmap); queue = array->queue + idx; @@ -1958,6 +2094,7 @@ asmlinkage long sys_sched_yield(void) { runqueue_t *rq = this_rq_lock(); prio_array_t *array = current->array; + int mycpu = smp_processor_id(); /* * We implement yielding by moving the task into the expired @@ -1966,7 +2103,15 @@ asmlinkage long sys_sched_yield(void) * (special rule: RT tasks will just roundrobin in the active * array.) */ + schedstats[mycpu].yld_cnt++; if (likely(!rt_task(current))) { + if (current->array->nr_active == 1) { + schedstats[mycpu].yld_act_empty++; + if (!rq->expired->nr_active) + schedstats[mycpu].yld_both_empty++; + } else if (!rq->expired->nr_active) { + schedstats[mycpu].yld_exp_empty++; + } dequeue_task(current, array); enqueue_task(current, rq->expired); } else { diff -purN -X /home/mbligh/.diff.exclude 000-virgin/kernel/sysctl.c 141-no_vma_sort/kernel/sysctl.c --- 000-virgin/kernel/sysctl.c 2003-06-05 14:56:44.000000000 -0700 +++ 141-no_vma_sort/kernel/sysctl.c 2003-06-09 10:16:48.000000000 -0700 @@ -57,6 +57,18 @@ extern char core_pattern[]; extern int cad_pid; extern int pid_max; extern int sysctl_lower_zone_protection; +extern int min_timeslice; +extern int max_timeslice; +extern int child_penalty; +extern int parent_penalty; +extern int exit_weight; +extern int prio_bonus_ratio; +extern int interactive_delta; +extern int max_sleep_avg; +extern int starvation_limit; +extern int node_threshold; +extern int idle_node_rebalance_ratio; +extern int busy_node_rebalance_ratio; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -114,6 +126,7 @@ static struct ctl_table_header root_tabl static ctl_table kern_table[]; static ctl_table vm_table[]; +static ctl_table sched_table[]; #ifdef CONFIG_NET extern ctl_table net_table[]; #endif @@ -158,6 +171,7 @@ static ctl_table root_table[] = { {CTL_FS, "fs", NULL, 0, 0555, fs_table}, {CTL_DEBUG, "debug", NULL, 0, 0555, debug_table}, {CTL_DEV, "dev", NULL, 0, 0555, dev_table}, + {CTL_SCHED, "sched", NULL, 0, 0555, sched_table}, {0} }; @@ -362,7 +376,49 @@ static ctl_table debug_table[] = { static ctl_table dev_table[] = { {0} -}; +}; + +static ctl_table sched_table[] = { + {SCHED_MAX_TIMESLICE, "max_timeslice", &max_timeslice, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_MIN_TIMESLICE, "min_timeslice", &min_timeslice, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_CHILD_PENALTY, "child_penalty", &child_penalty, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_PARENT_PENALTY, "parent_penalty", &parent_penalty, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_EXIT_WEIGHT, "exit_weight", &exit_weight, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_PRIO_BONUS_RATIO, "prio_bonus_ratio", &prio_bonus_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_INTERACTIVE_DELTA, "interactive_delta", &interactive_delta, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_MAX_SLEEP_AVG, "max_sleep_avg", &max_sleep_avg, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_STARVATION_LIMIT, "starvation_limit", &starvation_limit, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_NODE_THRESHOLD, "node_threshold", &node_threshold, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + sysctl_intvec, NULL, &one, NULL}, + {SCHED_IDLE_NODE_REBALANCE_RATIO, "idle_node_rebalance_ratio", + &idle_node_rebalance_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_BUSY_NODE_REBALANCE_RATIO, "busy_node_rebalance_ratio", + &busy_node_rebalance_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {0} +}; extern void init_irq_proc (void); diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/filemap.c 141-no_vma_sort/mm/filemap.c --- 000-virgin/mm/filemap.c 2003-06-05 14:56:44.000000000 -0700 +++ 141-no_vma_sort/mm/filemap.c 2003-06-09 10:16:54.000000000 -0700 @@ -63,6 +63,9 @@ * ->mmap_sem * ->i_shared_sem (various places) * + * ->lock_page + * ->i_shared_sem (page_convert_anon) + * * ->inode_lock * ->sb_lock (fs/fs-writeback.c) * ->mapping->page_lock (__sync_single_inode) diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/fremap.c 141-no_vma_sort/mm/fremap.c --- 000-virgin/mm/fremap.c 2003-06-05 14:56:44.000000000 -0700 +++ 141-no_vma_sort/mm/fremap.c 2003-06-09 10:16:54.000000000 -0700 @@ -60,10 +60,26 @@ int install_page(struct mm_struct *mm, s pgd_t *pgd; pmd_t *pmd; struct pte_chain *pte_chain; + unsigned long pgidx; pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) goto err; + + /* + * Convert this page to anon for objrmap if it's nonlinear + */ + pgidx = (addr - vma->vm_start) >> PAGE_SHIFT; + pgidx += vma->vm_pgoff; + pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; + if (!PageAnon(page) && (page->index != pgidx)) { + lock_page(page); + err = page_convert_anon(page); + unlock_page(page); + if (err < 0) + goto err_free; + } + pgd = pgd_offset(mm, addr); spin_lock(&mm->page_table_lock); @@ -85,12 +101,11 @@ int install_page(struct mm_struct *mm, s if (flush) flush_tlb_page(vma, addr); update_mmu_cache(vma, addr, *pte); - spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); - return 0; + err = 0; err_unlock: spin_unlock(&mm->page_table_lock); +err_free: pte_chain_free(pte_chain); err: return err; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/memory.c 141-no_vma_sort/mm/memory.c --- 000-virgin/mm/memory.c 2003-06-05 14:56:44.000000000 -0700 +++ 141-no_vma_sort/mm/memory.c 2003-06-09 10:16:54.000000000 -0700 @@ -102,8 +102,7 @@ static inline void free_one_pmd(struct m static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir) { - int j; - pmd_t * pmd; + pmd_t * pmd, * md, * emd; if (pgd_none(*dir)) return; @@ -114,8 +113,21 @@ static inline void free_one_pgd(struct m } pmd = pmd_offset(dir, 0); pgd_clear(dir); - for (j = 0; j < PTRS_PER_PMD ; j++) - free_one_pmd(tlb, pmd+j); + /* + * Beware if changing the loop below. It once used int j, + * for (j = 0; j < PTRS_PER_PMD; j++) + * free_one_pmd(pmd+j); + * but some older i386 compilers (e.g. egcs-2.91.66, gcc-2.95.3) + * terminated the loop with a _signed_ address comparison + * using "jle", when configured for HIGHMEM64GB (X86_PAE). + * If also configured for 3GB of kernel virtual address space, + * if page at physical 0x3ffff000 virtual 0x7ffff000 is used as + * a pmd, when that mm exits the loop goes on to free "entries" + * found at 0x80000000 onwards. The loop below compiles instead + * to be terminated by unsigned address comparison using "jb". + */ + for (md = pmd, emd = pmd + PTRS_PER_PMD; md < emd; md++) + free_one_pmd(tlb,md); pmd_free_tlb(tlb, pmd); } @@ -1038,6 +1050,7 @@ static int do_wp_page(struct mm_struct * ++mm->rss; page_remove_rmap(old_page, page_table); break_cow(vma, new_page, address, page_table); + SetPageAnon(new_page); pte_chain = page_add_rmap(new_page, page_table, pte_chain); lru_cache_add_active(new_page); @@ -1241,6 +1254,7 @@ static int do_swap_page(struct mm_struct flush_icache_page(vma, page); set_pte(page_table, pte); + SetPageAnon(page); pte_chain = page_add_rmap(page, page_table, pte_chain); /* No need to invalidate - it was non-present before */ @@ -1306,6 +1320,7 @@ do_anonymous_page(struct mm_struct *mm, entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); lru_cache_add_active(page); mark_page_accessed(page); + SetPageAnon(page); } set_pte(page_table, entry); @@ -1365,6 +1380,10 @@ do_no_page(struct mm_struct *mm, struct if (!pte_chain) goto oom; + /* See if nopage returned an anon page */ + if (!new_page->mapping || PageSwapCache(new_page)) + SetPageAnon(new_page); + /* * Should we do an early C-O-W break? */ @@ -1377,6 +1396,7 @@ do_no_page(struct mm_struct *mm, struct copy_user_highpage(page, new_page, address); page_cache_release(new_page); lru_cache_add_active(page); + SetPageAnon(page); new_page = page; } diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/mmap.c 141-no_vma_sort/mm/mmap.c --- 000-virgin/mm/mmap.c 2003-06-05 14:56:44.000000000 -0700 +++ 141-no_vma_sort/mm/mmap.c 2003-06-09 10:16:58.000000000 -0700 @@ -377,6 +377,28 @@ static inline int is_mergeable_vma(struc return 1; } +static void move_vma_start(struct vm_area_struct *vma, unsigned long addr) +{ + spinlock_t *lock = &vma->vm_mm->page_table_lock; + struct inode *inode = NULL; + + if (vma->vm_file) { + inode = vma->vm_file->f_dentry->d_inode; + down(&inode->i_mapping->i_shared_sem); + } + spin_lock(lock); + if (inode) + __remove_shared_vm_struct(vma, inode); + /* If no vm_file, perhaps we should always keep vm_pgoff at 0?? */ + vma->vm_pgoff += (long)(addr - vma->vm_start) >> PAGE_SHIFT; + vma->vm_start = addr; + if (inode) { + __vma_link_file(vma); + up(&inode->i_mapping->i_shared_sem); + } + spin_unlock(lock); +} + /* * Return true if we can merge this (vm_flags,file,vm_pgoff,size) * in front of (at a lower virtual address and file offset than) the vma. @@ -429,8 +451,6 @@ static int vma_merge(struct mm_struct *m unsigned long end, unsigned long vm_flags, struct file *file, unsigned long pgoff) { - spinlock_t * lock = &mm->page_table_lock; - /* * We later require that vma->vm_flags == vm_flags, so this tests * vma->vm_flags & VM_SPECIAL, too. @@ -450,6 +470,7 @@ static int vma_merge(struct mm_struct *m is_mergeable_vma(prev, file, vm_flags) && can_vma_merge_after(prev, vm_flags, file, pgoff)) { struct vm_area_struct *next; + spinlock_t *lock = &mm->page_table_lock; struct inode *inode = file ? file->f_dentry->d_inode : NULL; int need_up = 0; @@ -497,10 +518,7 @@ static int vma_merge(struct mm_struct *m pgoff, (end - addr) >> PAGE_SHIFT)) return 0; if (end == prev->vm_start) { - spin_lock(lock); - prev->vm_start = addr; - prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT; - spin_unlock(lock); + move_vma_start(prev, addr); return 1; } } @@ -1220,8 +1238,7 @@ int split_vma(struct mm_struct * mm, str if (new_below) { new->vm_end = addr; - vma->vm_start = addr; - vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT); + move_vma_start(vma, addr); } else { vma->vm_end = addr; new->vm_start = addr; diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/page_alloc.c 141-no_vma_sort/mm/page_alloc.c --- 000-virgin/mm/page_alloc.c 2003-06-05 14:56:45.000000000 -0700 +++ 141-no_vma_sort/mm/page_alloc.c 2003-06-09 10:16:54.000000000 -0700 @@ -220,6 +220,8 @@ static inline void free_pages_check(cons bad_page(function, page); if (PageDirty(page)) ClearPageDirty(page); + if (PageAnon(page)) + ClearPageAnon(page); } /* diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/rmap.c 141-no_vma_sort/mm/rmap.c --- 000-virgin/mm/rmap.c 2003-04-21 14:14:53.000000000 -0700 +++ 141-no_vma_sort/mm/rmap.c 2003-06-09 10:16:58.000000000 -0700 @@ -102,6 +102,136 @@ pte_chain_encode(struct pte_chain *pte_c **/ /** + * find_pte - Find a pte pointer given a vma and a struct page. + * @vma: the vma to search + * @page: the page to find + * + * Determine if this page is mapped in this vma. If it is, map and rethrn + * the pte pointer associated with it. Return null if the page is not + * mapped in this vma for any reason. + * + * This is strictly an internal helper function for the object-based rmap + * functions. + * + * It is the caller's responsibility to unmap the pte if it is returned. + */ +static inline pte_t * +find_pte(struct vm_area_struct *vma, struct page *page, unsigned long *addr) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + unsigned long loffset; + unsigned long address; + + loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)); + address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT); + if (address < vma->vm_start || address >= vma->vm_end) + goto out; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pmd = pmd_offset(pgd, address); + if (!pmd_present(*pmd)) + goto out; + + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) + goto out_unmap; + + if (page_to_pfn(page) != pte_pfn(*pte)) + goto out_unmap; + + if (addr) + *addr = address; + + return pte; + +out_unmap: + pte_unmap(pte); +out: + return NULL; +} + +/** + * page_referenced_obj_one - referenced check for object-based rmap + * @vma: the vma to look in. + * @page: the page we're working on. + * + * Find a pte entry for a page/vma pair, then check and clear the referenced + * bit. + * + * This is strictly a helper function for page_referenced_obj. + */ +static int +page_referenced_obj_one(struct vm_area_struct *vma, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + pte_t *pte; + int referenced = 0; + + if (!spin_trylock(&mm->page_table_lock)) + return 1; + + pte = find_pte(vma, page, NULL); + if (pte) { + if (ptep_test_and_clear_young(pte)) + referenced++; + pte_unmap(pte); + } + + spin_unlock(&mm->page_table_lock); + return referenced; +} + +/** + * page_referenced_obj_one - referenced check for object-based rmap + * @page: the page we're checking references on. + * + * For an object-based mapped page, find all the places it is mapped and + * check/clear the referenced flag. This is done by following the page->mapping + * pointer, then walking the chain of vmas it holds. It returns the number + * of references it found. + * + * This function is only called from page_referenced for object-based pages. + * + * The semaphore address_space->i_shared_sem is tried. If it can't be gotten, + * assume a reference count of 1. + */ +static int +page_referenced_obj(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct vm_area_struct *vma; + int referenced = 0; + + if (!page->pte.mapcount) + return 0; + + if (!mapping) + BUG(); + + if (PageSwapCache(page)) + BUG(); + + if (down_trylock(&mapping->i_shared_sem)) + return 1; + + list_for_each_entry(vma, &mapping->i_mmap, shared) + referenced += page_referenced_obj_one(vma, page); + + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) + referenced += page_referenced_obj_one(vma, page); + + up(&mapping->i_shared_sem); + + return referenced; +} + +/** * page_referenced - test if the page was referenced * @page: the page to test * @@ -120,6 +250,10 @@ int page_referenced(struct page * page) if (TestClearPageReferenced(page)) referenced++; + if (!PageAnon(page)) { + referenced += page_referenced_obj(page); + goto out; + } if (PageDirect(page)) { pte_t *pte = rmap_ptep_map(page->pte.direct); if (ptep_test_and_clear_young(pte)) @@ -153,6 +287,7 @@ int page_referenced(struct page * page) __pte_chain_free(pc); } } +out: return referenced; } @@ -175,6 +310,21 @@ page_add_rmap(struct page *page, pte_t * pte_chain_lock(page); + /* + * If this is an object-based page, just count it. We can + * find the mappings by walking the object vma chain for that object. + */ + if (!PageAnon(page)) { + if (!page->mapping) + BUG(); + if (PageSwapCache(page)) + BUG(); + if (!page->pte.mapcount) + inc_page_state(nr_mapped); + page->pte.mapcount++; + goto out; + } + if (page->pte.direct == 0) { page->pte.direct = pte_paddr; SetPageDirect(page); @@ -231,8 +381,25 @@ void page_remove_rmap(struct page *page, pte_chain_lock(page); if (!page_mapped(page)) - goto out_unlock; /* remap_page_range() from a driver? */ + goto out_unlock; + /* + * If this is an object-based page, just uncount it. We can + * find the mappings by walking the object vma chain for that object. + */ + if (!PageAnon(page)) { + if (!page->mapping) + BUG(); + if (PageSwapCache(page)) + BUG(); + if (!page->pte.mapcount) + BUG(); + page->pte.mapcount--; + if (!page->pte.mapcount) + dec_page_state(nr_mapped); + goto out_unlock; + } + if (PageDirect(page)) { if (page->pte.direct == pte_paddr) { page->pte.direct = 0; @@ -279,6 +446,102 @@ out_unlock: } /** + * try_to_unmap_obj - unmap a page using the object-based rmap method + * @page: the page to unmap + * + * Determine whether a page is mapped in a given vma and unmap it if it's found. + * + * This function is strictly a helper function for try_to_unmap_obj. + */ +static inline int +try_to_unmap_obj_one(struct vm_area_struct *vma, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pte_t *pte; + pte_t pteval; + int ret = SWAP_AGAIN; + + if (!spin_trylock(&mm->page_table_lock)) + return ret; + + pte = find_pte(vma, page, &address); + if (!pte) + goto out; + + if (vma->vm_flags & VM_LOCKED) { + ret = SWAP_FAIL; + goto out_unmap; + } + + flush_cache_page(vma, address); + pteval = ptep_get_and_clear(pte); + flush_tlb_page(vma, address); + + if (pte_dirty(pteval)) + set_page_dirty(page); + + if (!page->pte.mapcount) + BUG(); + + mm->rss--; + page->pte.mapcount--; + page_cache_release(page); + +out_unmap: + pte_unmap(pte); + +out: + spin_unlock(&mm->page_table_lock); + return ret; +} + +/** + * try_to_unmap_obj - unmap a page using the object-based rmap method + * @page: the page to unmap + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the address_space struct it points to. + * + * This function is only called from try_to_unmap for object-based pages. + * + * The semaphore address_space->i_shared_sem is tried. If it can't be gotten, + * return a temporary error. + */ +static int +try_to_unmap_obj(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct vm_area_struct *vma; + int ret = SWAP_AGAIN; + + if (!mapping) + BUG(); + + if (PageSwapCache(page)) + BUG(); + + if (down_trylock(&mapping->i_shared_sem)) + return ret; + + list_for_each_entry(vma, &mapping->i_mmap, shared) { + ret = try_to_unmap_obj_one(vma, page); + if (ret == SWAP_FAIL || !page->pte.mapcount) + goto out; + } + + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + ret = try_to_unmap_obj_one(vma, page); + if (ret == SWAP_FAIL || !page->pte.mapcount) + goto out; + } + +out: + up(&mapping->i_shared_sem); + return ret; +} + +/** * try_to_unmap_one - worker function for try_to_unmap * @page: page to unmap * @ptep: page table entry to unmap from page @@ -397,6 +660,15 @@ int try_to_unmap(struct page * page) if (!page->mapping) BUG(); + /* + * If it's an object-based page, use the object vma chain to find all + * the mappings. + */ + if (!PageAnon(page)) { + ret = try_to_unmap_obj(page); + goto out; + } + if (PageDirect(page)) { ret = try_to_unmap_one(page, page->pte.direct); if (ret == SWAP_SUCCESS) { @@ -452,12 +724,115 @@ int try_to_unmap(struct page * page) } } out: - if (!page_mapped(page)) + if (!page_mapped(page)) { dec_page_state(nr_mapped); + ret = SWAP_SUCCESS; + } return ret; } /** + * page_convert_anon - Convert an object-based mapped page to pte_chain-based. + * @page: the page to convert + * + * Find all the mappings for an object-based page and convert them + * to 'anonymous', ie create a pte_chain and store all the pte pointers there. + * + * This function takes the address_space->i_shared_sem, sets the PageAnon flag, + * then sets the mm->page_table_lock for each vma and calls page_add_rmap. This + * means there is a period when PageAnon is set, but still has some mappings + * with no pte_chain entry. This is in fact safe, since page_remove_rmap will + * simply not find it. try_to_unmap might erroneously return success, but it + * will never be called because the page_convert_anon() caller has locked the + * page. + * + * page_referenced() may fail to scan all the appropriate pte's and may return + * an inaccurate result. This is so rare that it does not matter. + */ +int page_convert_anon(struct page *page) +{ + struct address_space *mapping; + struct vm_area_struct *vma; + struct pte_chain *pte_chain = NULL; + pte_t *pte; + int err = 0; + + mapping = page->mapping; + if (mapping == NULL) + goto out; /* truncate won the lock_page() race */ + + down(&mapping->i_shared_sem); + pte_chain_lock(page); + + /* + * Has someone else done it for us before we got the lock? + * If so, pte.direct or pte.chain has replaced pte.mapcount. + */ + if (PageAnon(page)) { + pte_chain_unlock(page); + goto out_unlock; + } + + SetPageAnon(page); + if (page->pte.mapcount == 0) { + pte_chain_unlock(page); + goto out_unlock; + } + /* This is gonna get incremented by page_add_rmap */ + dec_page_state(nr_mapped); + page->pte.mapcount = 0; + + /* + * Now that the page is marked as anon, unlock it. page_add_rmap will + * lock it as necessary. + */ + pte_chain_unlock(page); + + list_for_each_entry(vma, &mapping->i_mmap, shared) { + if (!pte_chain) { + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + err = -ENOMEM; + goto out_unlock; + } + } + spin_lock(&vma->vm_mm->page_table_lock); + pte = find_pte(vma, page, NULL); + if (pte) { + /* Make sure this isn't a duplicate */ + page_remove_rmap(page, pte); + pte_chain = page_add_rmap(page, pte, pte_chain); + pte_unmap(pte); + } + spin_unlock(&vma->vm_mm->page_table_lock); + } + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + if (!pte_chain) { + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + err = -ENOMEM; + goto out_unlock; + } + } + spin_lock(&vma->vm_mm->page_table_lock); + pte = find_pte(vma, page, NULL); + if (pte) { + /* Make sure this isn't a duplicate */ + page_remove_rmap(page, pte); + pte_chain = page_add_rmap(page, pte, pte_chain); + pte_unmap(pte); + } + spin_unlock(&vma->vm_mm->page_table_lock); + } + +out_unlock: + pte_chain_free(pte_chain); + up(&mapping->i_shared_sem); +out: + return err; +} + +/** ** No more VM stuff below this comment, only pte_chain helper ** functions. **/ diff -purN -X /home/mbligh/.diff.exclude 000-virgin/mm/swapfile.c 141-no_vma_sort/mm/swapfile.c --- 000-virgin/mm/swapfile.c 2003-06-05 14:56:45.000000000 -0700 +++ 141-no_vma_sort/mm/swapfile.c 2003-06-09 10:16:54.000000000 -0700 @@ -385,6 +385,7 @@ unuse_pte(struct vm_area_struct *vma, un vma->vm_mm->rss++; get_page(page); set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); + SetPageAnon(page); *pte_chainp = page_add_rmap(page, dir, *pte_chainp); swap_free(entry); } diff -purN -X /home/mbligh/.diff.exclude 000-virgin/scripts/schedcapture 141-no_vma_sort/scripts/schedcapture --- 000-virgin/scripts/schedcapture 1969-12-31 16:00:00.000000000 -0800 +++ 141-no_vma_sort/scripts/schedcapture 2003-06-09 10:16:45.000000000 -0700 @@ -0,0 +1,6 @@ +while true +do + cat /proc/schedstat + echo + sleep 20 +done diff -purN -X /home/mbligh/.diff.exclude 000-virgin/scripts/schedstat 141-no_vma_sort/scripts/schedstat --- 000-virgin/scripts/schedstat 1969-12-31 16:00:00.000000000 -0800 +++ 141-no_vma_sort/scripts/schedstat 2003-06-09 10:16:45.000000000 -0700 @@ -0,0 +1,168 @@ +#!/usr/bin/perl + +$slice = 20; # seconds +while (<>) { + @curr = split; + if ($curr[0] =~ /cpu(\d)/) { + $per_cpu_curr[$1] = [ @curr ]; + $max_cpu = $1 if ($1 > $max_cpu); + next; + } + next if (/^$/); + if ($curr[0] eq "version") { + if ($curr[1] != 2) { + die "Version mismatch. Update this tool.\n"; + } + next; + } + # + # format of line in /proc/schedstat + # + # tag 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 + # + # tag is "cpuN" or "cpu". Right now, we ignore "cpuN" lines (this tool + # doesn't collate per-cpu statistics, although it would be trivial to + # do so.) + # + # version == 1 + # NOTE: the active queue is considered empty if it has only one process + # in it, since obviously the process calling sched_yield is that process. + # + # First four are sched_yield statistics: + # 1) # of times both the active and the expired queue were empty + # 2) # of times just the active queue was empty + # 3) # of times just the expired queue was empty + # 4) # of times sched_yield() was called + # + # Next two are schedule() statistics: + # 5) # of times the active queue had at least one other process on it. + # 6) # of times we switched to the expired queue and reused it + # 7) # of times schedule() was called + # + # Next seven are statistics dealing with load balancing: + # 8) # of times load_balance was called at an idle tick + # 9) # of times load_balance was called at an busy tick + # 10) # of times load_balance was called from schedule() + # 11) # of times load_balance was called + # 12) sum of imbalances discovered (if any) with each call to + # load_balance + # 13) # of times load_balance was called when we did not find a + # "busiest" queue + # 14) # of times load_balance was called from balance_node() + # + # Next four are statistics dealing with pull_task(): + # 15) # of times pull_task was called at an idle tick + # 16) # of times pull_task was called at an busy tick + # 17) # of times pull_task was called from schedule() + # 18) # of times pull_task was called + # + # Next two are statistics dealing with balance_node(): + # 19) # of times balance_node was called + # 20) # of times balance_node was called at an idle tick + # + #$curr[7] = $sched_cnt; + foreach $i (1..20) { + $diff[$i] = $curr[$i] - $prev[$i]; + } + + for ($cpu = 0; $cpu <= $max_cpu; $cpu++) { + @arr_curr = @{$per_cpu_curr[$cpu]}; + @arr_prev = @{$per_cpu_prev[$cpu]}; + foreach $i (1..20) { + $arr_diff[$i] = $arr_curr[$i] - $arr_prev[$i]; + } + $per_cpu_diff[$cpu] = [ @arr_diff ]; + } + + #for ($cpu = 0; $cpu <= $max_cpu; $cpu++) { +# print "@{$per_cpu_curr[$cpu]}\n"; +# } +# print "@curr\n"; + printf "%02d:%02d:%02d--------------------------------------------------------------\n", + $tick*$slice/3600, ($tick*$slice/60)%60, ($tick*$slice)%60; + + # + # sched_yield() stats + # + printf " %7d sys_sched_yield()\n", $diff[4]; + printf " %7d(%6.2f%%) found (only) active queue empty on current cpu\n", + $diff[2]-$diff[1], $diff[4] ? (100*($diff[2]-$diff[1])/$diff[4]) : 0; + printf " %7d(%6.2f%%) found (only) expired queue empty on current cpu\n", + $diff[3], $diff[4] ? (100*$diff[3]/$diff[4]) : 0; + printf " %7d(%6.2f%%) found both queues empty on current cpu\n", + $diff[1], $diff[4] ? (100*$diff[1]/$diff[4]) : 0; + printf " %7d(%6.2f%%) found neither queue empty on current cpu\n\n", + $diff[4]-($diff[3]+$diff[2]), + $diff[4] ? 100*($diff[4]-($diff[3]+$diff[2]))/$diff[4] : 0; + + # + # schedule() stats + # + printf " %7d schedule()\n", $diff[7]; + printf " %7d(%6.2f%%) switched active and expired queues\n", + $diff[6], $diff[7] ? (100*$diff[6]/$diff[7]) : 0; + printf " %7d(%6.2f%%) used existing active queue\n\n", + $diff[5]-$diff[6], $diff[7] ? (100*($diff[5]-$diff[6])/$diff[7]) : 0; + + # + # load_balance() stats + # + printf " %7d load_balance()\n", $diff[11]; + printf " %7d(%6.2f%%) called while idle\n", $diff[8], + 100*$diff[8]/$diff[11]; + printf " %7d(%6.2f%%) called while busy\n", $diff[9], + 100*($diff[9])/$diff[11]; + printf " %7d(%6.2f%%) called from schedule()\n", $diff[10], + 100*$diff[10]/$diff[11]; + printf " %7d(%6.2f%%) called from balance_node()\n", $diff[14], + 100*$diff[14]/$diff[11]; + printf " %7d no \"busiest\" queue found\n",$diff[13]; + if ($diff[11]-$diff[13]) { + $imbalance = $diff[12] / ($diff[11]-$diff[13]); + if ($imbalance < 10) { + printf " %7.3f average imbalance (over %d)\n", + $imbalance, $diff[11]-$diff[13]; + } elsif ($imbalance < 100) { + printf " %8.2f average imbalance (over %d)\n", + $imbalance, $diff[11]-$diff[13]; + } else { + printf " %9.1f average imbalance (over %d)\n", + $imbalance, $diff[11]-$diff[13]; + } + } + else { + printf " no imbalances\n"; + } + + # + # pull_task() stats + # + print "\n"; + printf " %7d pull_task()\n", $diff[15]; + for ($cpu = 0; $cpu <= $max_cpu; $cpu++) { + @arr = @{$per_cpu_diff[$cpu]}; + if ($arr[15] || $arr[16]) { + printf " %7d/%-7d cpu %d lost/gained task to/from another cpu\n", + $arr[15], $arr[16], $cpu; + } + if ($arr[17] || $arr[18]) { + printf " %7d/%-7d cpu %d lost/gained task to/from another node\n", + $arr[17], $arr[18], $cpu; + } + } + print "\n"; + + # + # balance_node() stats + # + printf " %7d balance_node()\n", $diff[19]; + printf " %7d(%6.2f%%) called while idle\n", $diff[20], + $diff[19] ? 100*$diff[20]/$diff[19] : 0; + printf " %7d(%6.2f%%) called while busy\n", $diff[19] - $diff[20], + $diff[19] ? 100*(($diff[19]-$diff[20]))/$diff[19] : 0; + + printf("\n"); + @prev = @curr; + @per_cpu_prev = @per_cpu_curr; + $tick++; +}