diff -urpN -X /home/fletch/.diff.exclude 000-virgin/Documentation/filesystems/proc.txt 900-mjb2/Documentation/filesystems/proc.txt --- 000-virgin/Documentation/filesystems/proc.txt Sun Apr 20 19:34:55 2003 +++ 900-mjb2/Documentation/filesystems/proc.txt Wed Jun 11 22:42:39 2003 @@ -37,6 +37,7 @@ Table of Contents 2.8 /proc/sys/net/ipv4 - IPV4 settings 2.9 Appletalk 2.10 IPX + 2.11 /proc/sys/sched - scheduler tunables ------------------------------------------------------------------------------ Preface @@ -1749,6 +1750,104 @@ IPX. The /proc/net/ipx_route table holds a list of IPX routes. For each route it gives the destination network, the router node (or Directly) and the network address of the router (or Connected) for internal networks. + +2.11 /proc/sys/sched - scheduler tunables +----------------------------------------- + +Useful knobs for tuning the scheduler live in /proc/sys/sched. + +child_penalty +------------- + +Percentage of the parent's sleep_avg that children inherit. sleep_avg is +a running average of the time a process spends sleeping. Tasks with high +sleep_avg values are considered interactive and given a higher dynamic +priority and a larger timeslice. You typically want this to be some value +just under 100. + +exit_weight +----------- + +When a CPU hog task exits, its parent's sleep_avg is reduced by a factor of +exit_weight against the exiting task's sleep_avg. + +interactive_delta +----------------- + +If a task is "interactive" it is reinserted into the active array after it +has expired its timeslice, instead of being inserted into the expired array. +How "interactive" a task must be in order to be deemed interactive is a +function of its nice value. This interactive limit is scaled linearly by nice +value and is offset by the interactive_delta. + +max_sleep_avg +------------- + +max_sleep_avg is the largest value (in ms) stored for a task's running sleep +average. The larger this value, the longer a task needs to sleep to be +considered interactive (the maximum interactive bonus is a function of +max_sleep_avg). + +max_timeslice +------------- + +Maximum timeslice, in milliseconds. This is the value given to tasks of the +highest dynamic priority. + +min_timeslice +------------- + +Minimum timeslice, in milliseconds. This is the value given to tasks of the +lowest dynamic priority. Every task gets at least this slice of the processor +per array switch. + +parent_penalty +-------------- + +Percentage of the parent's sleep_avg that it retains across a fork(). +sleep_avg is a running average of the time a process spends sleeping. Tasks +with high sleep_avg values are considered interactive and given a higher +dynamic priority and a larger timeslice. Normally, this value is 100 and thus +tasks retain their sleep_avg on fork. If you want to punish interactive +tasks for forking, set this below 100. + +prio_bonus_ratio +---------------- + +Middle percentage of the priority range that tasks can receive as a dynamic +priority. The default value of 25% ensures that nice values at the +extremes are still enforced. For example, nice +19 interactive tasks will +never be able to preempt a nice 0 CPU hog. Setting this higher will increase +the size of the priority range the tasks can receive as a bonus. Setting +this lower will decrease this range, making the interactivity bonus less +apparent and user nice values more applicable. + +starvation_limit +---------------- + +Sufficiently interactive tasks are reinserted into the active array when they +run out of timeslice.
Normally, tasks are inserted into the expired array. +Reinserting interactive tasks into the active array allows them to remain +runnable, which is important to interactive performance. This could starve +expired tasks, however, since the interactive task could prevent the array +switch. To prevent starving the tasks on the expired array for too long, the +starvation_limit is the longest time (in ms) we will let the expired array +starve at the expense of reinserting interactive tasks back into the active +array. Higher values here give more preference to running interactive tasks, +at the expense of expired tasks. Lower values provide fairer scheduling +behavior, at the expense of interactivity. + +idle_node_rebalance_ratio +------------------------- + +On NUMA machines, we normally rebalance within nodes, but we also rebalance +globally every N idle rebalance ticks, where N = idle_node_rebalance_ratio. + +busy_node_rebalance_ratio +------------------------- + +On NUMA machines, we normally rebalance within nodes, but we also rebalance +globally every N busy rebalance ticks, where N = busy_node_rebalance_ratio. ------------------------------------------------------------------------------ Summary diff -urpN -X /home/fletch/.diff.exclude 000-virgin/Documentation/i386/gdb-serial.txt 900-mjb2/Documentation/i386/gdb-serial.txt --- 000-virgin/Documentation/i386/gdb-serial.txt Wed Dec 31 16:00:00 1969 +++ 900-mjb2/Documentation/i386/gdb-serial.txt Wed Jun 11 22:42:58 2003 @@ -0,0 +1,386 @@ +Version +======= + +This version of the gdbstub package was developed and tested on +kernel version 2.3.48. It will not install on a 2.2 kernel. It may +not work on earlier versions of 2.3 kernels. It is possible that +it will continue to work on later versions of 2.3 and then +versions of 2.4 (I hope). + + +Debugging Setup +=============== + +Designate one machine as the "development" machine. This is the +machine on which you run your compiles and which has your source +code for the kernel. Designate a second machine as the "target" +machine. This is the machine that will run your experimental +kernel. + +The two machines will be connected together via a serial line out +one or the other of the COM ports of the PC. You will need a modem +eliminator (null modem) and the appropriate cables. + +On the DEVELOPMENT machine you need to apply the patch for the gdb +hooks. You have probably already done that if you are reading this +file. + +On your DEVELOPMENT machine, go to your kernel source directory and +do "make menuconfig". Go down to the kernel hacking menu item and +open it up. Enable the kernel gdb stub code by selecting that item. + +Save and exit the menuconfig program. Then do "make clean" and +"make bzImage" (or whatever target you want to make). This gets +the kernel compiled with the "-g" option set -- necessary for +debugging. + +You have just built the kernel on your DEVELOPMENT machine that you +intend to run on your TARGET machine. + +To install this new kernel, use the following installation procedure. +Remember, you are on the DEVELOPMENT machine patching the kernel source +for the kernel that you intend to run on the TARGET machine. + +Copy this kernel to your target machine using your usual procedures. +I usually arrange to copy development:/usr/src/linux/arch/i386/boot/zImage +to /vmlinuz on the TARGET machine via a LAN based NFS access. That is, +I run the cp command on the target and copy from the development machine +via the LAN.
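As a concrete sketch of that copy step (the NFS mount point /mnt/devel and the use of bzImage are assumptions for illustration only -- substitute whatever matches your own setup):

    target# mount -t nfs development:/usr/src/linux /mnt/devel
    target# cp /mnt/devel/arch/i386/boot/bzImage /vmlinuz

Any other copy mechanism (floppy, scp, and so on) works just as well; what matters is that the image the target boots is the one you just built with the gdb stub enabled.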
Run Lilo on the new kernel on the target machine so that it +will boot! Then boot the kernel on the target machine. + +There is an utility program named "gdbstart" in the +development:/usr/src/linux/arch/i386/kernel directory. +You should copy this program over to your target machine, probably into +/sbin. This utility program is run on the target machine to +activate the kernel hooks for the debugger. It is invoked as follows: + + gdbstart [-s speed] [-t tty-dev] + defaults: /dev/ttyS0 with speed unmodified by gdbstart + +Don't run the program just yet. We'll get to that in a bit. + +Decide on which tty port you want the machines to communicate, then +cable them up back-to-back using the null modem. COM1 is /dev/ttyS0 +and COM2 is /dev/ttyS1. + +On the DEVELOPMENT machine, create a file called .gdbinit in the +directory /usr/src/linux. An example .gdbinit file looks like this: + +define rmt +set remotebaud 38400 +target remote /dev/ttyS0 +end + +Assuming that you added my gdbinit stuff to your .gdbinit, edit .gdbinit +and find the section that looks like this: + + define rmt + set remotebaud 38400 + target remote /dev/ttyS0 + end + +Change the "target" definition so that it specifies the tty port that +you intend to use. Change the "remotebaud" definition to match the +data rate that you are going to use for the com line. + +On the TARGET machine I find it helpful to create shell script file +named "debug" in the root home directory with the following contents: + + gdbstart -s 38400 -t /dev/ttyS0 < + EOF + +This runs the gdbstart program and gives it the carriage return that +it prompts for. This sets the data rate from the target machine's side. + +You are now ready to try it out. + +On your TARGET machine, freshly rebooted with your gdbstub-equipped +kernel, type "debug" in the root home directory. The system will appear +to hang with some messages on the screen from the debug stub. What +it is doing is waiting for contact from the development machine. + +On your DEVELOPMENT machine, cd /usr/src/linux and enter "gdb vmlinux". +When gdb gets the symbols loaded and prompts you, enter "rmt" (that's +the macro from the .gdbinit file that you just edited). If everything +is working correctly you should see gdb print out a few lines indicating +that a breakpoint has been taken. It will actually show a line of +code in the target kernel inside the gdbstub activation code. + +The gdb interaction should look something like this: + + linux-dev:/usr/src/linux# gdb vmlinux + GDB is free software and you are welcome to distribute copies of it + under certain conditions; type "show copying" to see the conditions. + There is absolutely no warranty for GDB; type "show warranty" for details. + GDB 4.15.1 (i486-slackware-linux), + Copyright 1995 Free Software Foundation, Inc... + (gdb) rmt + breakpoint () at i386-stub.c:750 + 750 } + (gdb) + + +You can now use whatever gdb commands you like to set breakpoints. +Enter "continue" to start your target machine executing again. At this +point the target system will run at full speed until it encounters +your breakpoint or gets a segment violation in the kernel, or whatever. + + +Triggering gdbstub at Kernel Boot Time +====================================== + +The gdbstub patch now has the ability for gdb to connect to the kernel during +bootup (as opposed to waiting for the system to come all the way up and then +running the gdbstart program on the target machine). This new functionality was +added by Scott Foehner at SGI. 
+ +To force a kernel that has been compiled with gdbstub to pause during the boot +process and wait for a connection from gdb, the paramter "gdb" should be passed +to the kernel. This can be done by typing "gdb" after the name of the kernel +on the LILO command line. The patch defaults to use ttyS1 at a baud rate of +38400. These parameters can be changed by using "gdbttyS=" and +"gdbbaud=" on the command line. + +Example: + +LILO boot: linux gdb gdbttyS=1 gdbbaud=38400 + +Note that this command is entered on the TARGET machine as it is booting +the kernel that was compiled on the DEVELOPMENT machine. + +An alternate approach is to place a line in the /etc/lilo.conf file on +your TARGET machine. Under the heading for the kernel that you intend +to boot, place a line that looks like this: + + append = "gdb gdbttyS=1 gdbbaud=38400" + +This will cause the kernel to enter the gdbstub automatically at boot +time. + +BE SURE to run "lilo" after changing the /etc/lilo.conf file. + + +The "gdbstart" Program +===================== + +This utility program is used to set up the com port and data rate +for the connection from the target system to the development system. +Its usage has been described above. + +This version of the patch uses the same tty ioctl for kernel versions +2.0.30 onwards. Thus, the gdbstart utility does not need to be re-compiled +to install the patch in a later version of the kernel. The ioctl added +to the kernel for this purpose is far enough "off the end" of existing +ioctls (as of 2.1.120) that it should not interfere with any new kernel +tty ioctls for quite some time (famous last words). + +The source for the gdbstart program resides in the arch/i386/kernel directory. + + +Debugging hints +=============== + +You can break into the target machine at any time from the development +machine by typing ^C. If the target machine has interrupts enabled +this will stop it in the kernel and enter the debugger. + +There is unfortunately no way of breaking into the kernel if it is +in a loop with interrupts disabled, so if this happens to you then +you need to place exploratory breakpoints or printk's into the kernel +to find out where it is looping. + +There is a copy of an e-mail in the kgdb distribution directory which +describes how to create an NMI on an ISA bus machine using a paper +clip. I have a sophisticated version of this made by wiring a push +button switch into a PC104/ISA bus adapter card. The adapter card +nicely furnishes wire wrap pins for all the ISA bus signals. + +When you are done debugging the kernel on the target machine it is +a good idea to leave it in a running state. This makes reboots +faster, bypassing the fsck. So do a gdb "continue" as the last gdb +command if this is possible. To terminate gdb itself on the development +machine and leave the target machine running, type ^Z to suspend gdb +and then kill it with "kill %1" or something similar. + +If gdbstub Does Not Work +======================== + +If it doesn't work, you will have to troubleshoot it. Do the easy things +first like double checking your cabling and data rates. You might +try some non-kernel based programs to see if the back-to-back connection +works properly. Just something simple like cat /etc/hosts >/dev/ttyS0 +on one machine and cat /dev/ttyS0 on the other will tell you if you +can send data from one machine to the other. There is no point in tearing +out your hair in the kernel if the line doesn't work. 
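A minimal sanity check of the serial link, assuming ttyS0 on both ends and the 38400 rate used in the examples above (substitute your own port and speed), might look like this:

    both machines# stty raw ispeed 38400 ospeed 38400 < /dev/ttyS0
    target#        cat /dev/ttyS0
    development#   cat /etc/hosts > /dev/ttyS0

If the contents of /etc/hosts show up on the target, the cable and data rate are good and the problem lies elsewhere.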
+ +All of the real action takes place in the file +/usr/src/linux/arch/i386/kernel/gdbstub.c. That is the code on the target +machine that interacts with gdb on the development machine. In gdb you can +turn on a debug switch with the following command: + + set remotedebug + +This will print out the protocol messages that gdb is exchanging with +the target machine. + +Another place to look is /usr/src/linux/drivers/char/gdbserial.c. +That is the code that talks to the serial port on the target side. +There might be a problem there. + +If you are really desperate you can use printk debugging in the +gdbstub code in the target kernel until you get it working. In particular, +there is a global variable in /usr/src/linux/arch/i386/kernel/gdbstub.c +named "remote_debug". Compile your kernel with this set to 1, rather +than 0, and the debug stub will print out lots of information as it does +its work. + + +Debugging Loadable Modules +========================== + +This technique comes courtesy of Edouard Parmelan. + +When you run gdb, enter the command + +source gdbinit-modules + +This will read in a file of gdb macros that was installed in your +kernel source directory when kgdb was installed. This file implements +the following commands: + +mod-list + Lists the loaded modules along with their load addresses. + +mod-print-symbols + Prints all the symbols in the indicated module. + +mod-add-symbols + Loads the symbols from the object file and associates them + with the indicated module. + +After you have loaded the module that you want to debug, use the command +mod-list to find the address of your module. Then use that +address in the mod-add-symbols command to load your module's symbols. +From that point onward you can debug your module as if it were a part +of the kernel. + +The file gdbinit-modules also contains a command named mod-add-lis as +an example of how to construct a command of your own to load your +favorite module. The idea is to "can" the pathname of the module +in the command so you don't have to type so much. + +Threads +======= + +Each process on the target machine is seen as a gdb thread. gdb thread-related +commands (info threads, thread n) can be used. + +ia-32 hardware breakpoints +========================== + +The gdb stub contains support for hardware breakpoints using the debugging +features of ia-32 (x86) processors. These breakpoints do not need code +modification; they use the debug registers. Four hardware breakpoints are +available in ia-32 processors. + +Each hardware breakpoint can be of one of the following three types. +1. Execution breakpoint - An execution breakpoint is triggered when code at the + breakpoint address is executed. + + As only a limited number of hardware breakpoints are available, it is + advisable to use software breakpoints (break command) instead of execution + hardware breakpoints, unless modification of code is to be avoided. + +2. Write breakpoint - A write breakpoint is triggered when the memory location + at the breakpoint address is written. + + A write breakpoint can be placed for data of variable length. The length of + a write breakpoint indicates the length of the data type to be watched. + Length is 1 for 1-byte data, 2 for 2-byte data, and 3 for 4-byte data. + +3. Access breakpoint - An access breakpoint is triggered when the memory + location at the breakpoint address is either read or written. + + Access breakpoints also have lengths similar to write breakpoints. + +IO breakpoints in ia-32 are not supported.
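As a worked example of that length encoding (hwwbrk is one of the gdb macros described in the next section, and c015e9bc is the sample address used there), watching writes to a 4-byte kernel variable with hardware breakpoint 0 would look like:

    (gdb) hwwbrk 0 3 c015e9bc

Length 3 selects 4-byte data; lengths 1 and 2 select 1-byte and 2-byte data respectively.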
+ +Since the gdb stub at present does not use the protocol used by gdb for +hardware breakpoints, hardware breakpoints are accessed through gdb macros. +The gdb macros for hardware breakpoints are described below. + +hwebrk - Places an execution breakpoint + hwebrk breakpointno address +hwwbrk - Places a write breakpoint + hwwbrk breakpointno length address +hwabrk - Places an access breakpoint + hwabrk breakpointno length address +hwrmbrk - Removes a breakpoint + hwrmbrk breakpointno +exinfo - Tells whether a software or hardware breakpoint has occurred. + Prints the number of the hardware breakpoint if a hardware breakpoint has + occurred. + +The arguments required by these commands are as follows: +breakpointno - 0 to 3 +length - 1 to 3 +address - Memory location in hex digits (without 0x), e.g. c015e9bc + +MP support +========== + +When a breakpoint occurs or the user issues a break (Ctrl+C) to the gdb client, +all processors are forced to enter the debugger. The current thread +corresponds to the thread running on the processor where the breakpoint +occurred. Threads running on the other processor(s) appear similar to other +non-running threads in the 'info threads' output. + +The ia-32 hardware debugging registers on all processors are set to the same +values, so a hardware breakpoint may occur on any processor. + +gdb troubleshooting +=================== + +1. gdb hangs +Kill it, restart gdb, and connect to the target machine again. + +2. gdb cannot connect to the target machine (after killing a gdb and +restarting another) +If the target machine was not inside the debugger when you killed gdb, gdb +cannot connect because the target machine won't respond. +In this case echo a Ctrl+C (ASCII 3) down the serial line, +e.g. echo -e "\003" > /dev/ttyS1 +This forces the target machine into the debugger, after which you can connect. + +3. gdb cannot connect even after echoing Ctrl+C into the serial line +Try changing the serial line settings min to 1 and time to 0, +e.g. stty min 1 time 0 < /dev/ttyS1 +Then try echoing again. + +Check the serial line speed and set it to the correct value if required, +e.g. stty ispeed 115200 ospeed 115200 < /dev/ttyS1 + +Final Items +=========== + +I picked up this code from Dave Grothe and enhanced it. + +If you make some really cool modification to this stuff, or if you +fix a bug, please let me know. + +Amit S. Kale + + +(First kgdb by David Grothe ) + +(modified by Tigran Aivazian ) + Putting gdbstub into the kernel config menu. + +(modified by Scott Foehner ) + Hooks for entering gdbstub at boot time. + +(modified by Amit S. Kale ) + Threads, ia-32 hw debugging, mp support, console support, + nmi watchdog handling. diff -urpN -X /home/fletch/.diff.exclude 000-virgin/Documentation/sysrq.txt 900-mjb2/Documentation/sysrq.txt --- 000-virgin/Documentation/sysrq.txt Wed Mar 26 22:54:28 2003 +++ 900-mjb2/Documentation/sysrq.txt Wed Jun 11 22:42:58 2003 @@ -77,6 +77,8 @@ On all - write a character to /proc/sys 'l' - Send a SIGKILL to all processes, INCLUDING init. (Your system will be non-functional after this.) +'g' - Enter the kernel debugger (if configured and supported). + 'h' - Will display help ( actually any other key than those listed above will display help.
but 'h' is easy to remember :-) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/Makefile 900-mjb2/Makefile --- 000-virgin/Makefile Fri May 30 19:01:58 2003 +++ 900-mjb2/Makefile Wed Jun 11 23:14:24 2003 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 5 SUBLEVEL = 70 -EXTRAVERSION = +EXTRAVERSION = -mjb2 # *DOCUMENTATION* # To see a list of typical targets execute "make help" @@ -50,8 +50,13 @@ TOPDIR := $(CURDIR) HOSTCC = gcc HOSTCXX = g++ +ifdef CONFIG_DEBUG_SYMBOLS HOSTCFLAGS = -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer HOSTCXXFLAGS = -O2 +else +HOSTCFLAGS = -Wall -Wstrict-prototypes -O2 -g -fomit-frame-pointer +HOSTCXXFLAGS = -O2 -g +endif CROSS_COMPILE = @@ -193,8 +198,13 @@ AFLAGS_KERNEL = NOSTDINC_FLAGS = -nostdinc -iwithprefix include CPPFLAGS := -D__KERNEL__ -Iinclude +ifdef CONFIG_DEBUG_SYMBOLS +CFLAGS := $(CPPFLAGS) -Wall -Wstrict-prototypes -Wno-trigraphs -O2 -g \ + -fno-strict-aliasing -fno-common +else CFLAGS := $(CPPFLAGS) -Wall -Wstrict-prototypes -Wno-trigraphs -O2 \ -fno-strict-aliasing -fno-common +endif AFLAGS := -D__ASSEMBLY__ $(CPPFLAGS) export VERSION PATCHLEVEL SUBLEVEL EXTRAVERSION KERNELRELEASE ARCH \ @@ -296,6 +306,10 @@ ifneq ($(KBUILD_BUILTIN),1) KBUILD_BUILTIN := 1 endif endif +endif + +ifdef CONFIG_X86_REMOTE_DEBUG +CFLAGS += -g endif # diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/Kconfig 900-mjb2/arch/i386/Kconfig --- 000-virgin/arch/i386/Kconfig Fri May 30 19:01:58 2003 +++ 900-mjb2/arch/i386/Kconfig Wed Jun 11 22:55:20 2003 @@ -383,6 +383,11 @@ config X86_OOSTORE depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 default y +config X86_CMOV + bool + depends on M686 || MPENTIUMII || MPENTIUMIII || MPENTIUM4 || MK8 || MCRUSOE + default y + config HUGETLB_PAGE bool "Huge TLB Page Support" help @@ -435,17 +440,17 @@ config NR_CPUS This is purely to save memory - each supported CPU adds approximately eight kilobytes to the kernel image. -config PREEMPT - bool "Preemptible Kernel" - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. - - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure. +# config PREEMPT +# bool "Preemptible Kernel" +# help +# This option reduces the latency of the kernel when reacting to +# real-time or interactive events by allowing a low priority process to +# be preempted even if it is in kernel mode executing a system call. +# This allows applications to run more reliably even when the system is +# under load. +# +# Say Y here if you are building a kernel for a desktop, embedded +# or real-time system. Say N if you are unsure. config X86_UP_APIC bool "Local APIC support on uniprocessors" if !SMP @@ -666,6 +671,44 @@ config HIGHMEM64G endchoice +choice + help + On i386, a process can only virtually address 4GB of memory. This + lets you select how much of that virtual space you would like to + devoted to userspace, and how much to the kernel. + + Some userspace programs would like to address as much as possible and + have few demands of the kernel other than it get out of the way. These + users may opt to use the 3.5GB option to give their userspace program + as much room as possible. Due to alignment issues imposed by PAE, + the "3.5GB" option is unavailable if "64GB" high memory support is + enabled. 
+ + Other users (especially those who use PAE) may be running out of + ZONE_NORMAL memory. Those users may benefit from increasing the + kernel's virtual address space size by taking it away from userspace, + which may not need all of its space. An indicator that this is + happening is when /proc/Meminfo's "LowFree:" is a small percentage of + "LowTotal:" while "HighFree:" is very large. + + If unsure, say "3GB" + prompt "User address space size" + default 1GB + +config 05GB + bool "3.5 GB" + depends on !HIGHMEM64G + +config 1GB + bool "3 GB" + +config 2GB + bool "2 GB" + +config 3GB + bool "1 GB" +endchoice + config HIGHMEM bool depends on HIGHMEM64G || HIGHMEM4G @@ -683,6 +726,11 @@ config NUMA default n if X86_PC default y if (X86_NUMAQ || X86_SUMMIT) +config NUMA_SCHED + bool "Numa Scheduling Support" + depends on NUMA + default y + # Need comments to help the hapless user trying to turn on NUMA support comment "NUMA (NUMA-Q) requires SMP, 64GB highmem support" depends on X86_NUMAQ && (!HIGHMEM64G || !SMP) @@ -709,6 +757,16 @@ config HIGHPTE low memory. Setting this option will put user-space page table entries in high memory. +config 4K_STACK + bool "Use smaller 4k per-task stacks" + help + This option will shrink the kernel's per-task stack from 8k to + 4k. This will greatly increase your chance of overflowing it. + But, if you use the per-cpu interrupt stacks as well, your chances + go way down. Also try the CONFIG_X86_STACK_CHECK overflow + detection. It is much more reliable than the currently in-kernel + version. + config MATH_EMULATION bool "Math emulation" ---help--- @@ -768,6 +826,33 @@ config MTRR See for more information. +choice + help + This is unrelated to your processor's speed. This variable alters + how often the system is asked to generate timer interrupts. A larger + value can lead to a more responsive system, but also causes extra + overhead from the increased number of context switches. + + If in doubt, leave it at the default of 1000. + + prompt "Kernel HZ" + default 1000HZ + +config 100HZ + bool "100 Hz" + +config 1000HZ + bool "1000 Hz" +endchoice + +config IRQBALANCE + bool "Enable kernel irq balancing" + depends on SMP + default y + help + The defalut yes will allow the kernel to do irq load balancing. + Saying no will keep the kernel from doing irq load balancing. + config HAVE_DEC_LOCK bool depends on (SMP || PREEMPT) && X86_CMPXCHG @@ -1462,6 +1547,10 @@ config DEBUG_KERNEL Say Y here if you are developing drivers or trying to debug and identify kernel problems. +config DEBUG_SYMBOLS + bool "Get debug symbols (turns on -g)" + depends on DEBUG_KERNEL + config DEBUG_STACKOVERFLOW bool "Check for stack overflows" depends on DEBUG_KERNEL @@ -1474,6 +1563,17 @@ config DEBUG_SLAB allocation as well as poisoning memory on free to catch use of freed memory. +config X86_REMOTE_DEBUG + bool "KGDB: Remote (serial) kernel debugging with gdb" + +config KGDB_THREAD + bool "KGDB: Thread analysis" + depends on X86_REMOTE_DEBUG + +config GDB_CONSOLE + bool "KGDB: Console messages through gdb" + depends on X86_REMOTE_DEBUG + config DEBUG_IOVIRT bool "Memory mapped I/O debugging" depends on DEBUG_KERNEL @@ -1499,6 +1599,26 @@ config MAGIC_SYSRQ keys are documented in . Don't say Y unless you really know what this hack does. +config EARLY_PRINTK + bool "Early console support" + default n + depends on DEBUG_KERNEL + help + Write kernel log output directly into the VGA buffer or serial port. 
+ This is useful for kernel debugging when your machine crashes very + early before the console code is initialized. For normal operation + it is not recommended because it looks ugly and doesn't cooperate + with klogd/syslogd or the X server.You should normally N here, + unless you want to debug such a crash. + + Syntax: earlyprintk=vga + earlyprintk=serial[,ttySn[,baudrate]] + Append ,keep to not disable it when the real console takes over. + Only vga or serial at a time, not both. + Currently only ttyS0 and ttyS1 are supported. + Interaction with the standard serial driver is not very good. + The VGA output is eventually overwritten by the real console. + config DEBUG_SPINLOCK bool "Spinlock debugging" depends on DEBUG_KERNEL @@ -1508,6 +1628,15 @@ config DEBUG_SPINLOCK best used in conjunction with the NMI watchdog so that spinlock deadlocks are also debuggable. +config SPINLINE + bool "Spinlock inlining" + depends on DEBUG_KERNEL + help + This will change spinlocks from out of line to inline, making them + account cost to the callers in readprofile, rather than the lock + itself (as ".text.lock.filename"). This can be helpful for finding + the callers of locks. + config DEBUG_HIGHMEM bool "Highmem debugging" depends on DEBUG_KERNEL && HIGHMEM @@ -1528,13 +1657,42 @@ config DEBUG_SPINLOCK_SLEEP If you say Y here, various routines which may sleep will become very noisy if they are called with a spinlock held. +config LOCKMETER + bool "Kernel lock metering" + depends on SMP + help + Say Y to enable kernel lock metering, which adds overhead to SMP + locks, but allows you to see various statistics using the lockstat + command + config FRAME_POINTER - bool "Compile the kernel with frame pointers" + bool + default y if X86_REMOTE_DEBUG + default n if !X86_REMOTE_DEBUG help If you say Y here the resulting kernel image will be slightly larger and slower, but it will give very useful debugging information. If you don't debug the kernel, you can say N, but we may not be able to solve problems without frame pointers. + +config X86_STACK_CHECK + bool "Detect stack overflows" + depends on FRAME_POINTER + help + Say Y here to have the kernel attempt to detect when the per-task + kernel stack overflows. This is much more robust checking than + the above overflow check, which will only occasionally detect + an overflow. The level of guarantee here is much greater. + + Some older versions of gcc don't handle the -p option correctly. + Kernprof is affected by the same problem, which is described here: + http://oss.sgi.com/projects/kernprof/faq.html#Q9 + + Basically, if you get oopses in __free_pages_ok during boot when + you have this turned on, you need to fix gcc. The Redhat 2.96 + version and gcc-3.x seem to work. 
+ + If not debugging a stack overflow problem, say N config X86_EXTRA_IRQS bool diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/Makefile 900-mjb2/arch/i386/Makefile --- 000-virgin/arch/i386/Makefile Fri May 30 19:01:58 2003 +++ 900-mjb2/arch/i386/Makefile Wed Jun 11 22:46:30 2003 @@ -81,6 +81,10 @@ core-$(CONFIG_X86_GENERICARCH) += arch/i # default subarch .h files mflags-y += -Iinclude/asm-i386/mach-default +ifdef CONFIG_X86_STACK_CHECK +CFLAGS += -p +endif + head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o libs-y += arch/i386/lib/ @@ -94,6 +98,7 @@ drivers-$(CONFIG_OPROFILE) += arch/i386 CFLAGS += $(mflags-y) AFLAGS += $(mflags-y) +AFLAGS_vmlinux.lds.o += -imacros $(TOPDIR)/include/asm-i386/page.h boot := arch/i386/boot diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/boot/compressed/misc.c 900-mjb2/arch/i386/boot/compressed/misc.c --- 000-virgin/arch/i386/boot/compressed/misc.c Sun Apr 20 19:34:56 2003 +++ 900-mjb2/arch/i386/boot/compressed/misc.c Wed Jun 11 22:46:30 2003 @@ -379,3 +379,7 @@ asmlinkage int decompress_kernel(struct if (high_loaded) close_output_buffer_if_we_run_high(mv); return high_loaded; } + +/* We don't actually check for stack overflows this early. */ +__asm__(".globl mcount ; mcount: ret\n"); + diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/Makefile 900-mjb2/arch/i386/kernel/Makefile --- 000-virgin/arch/i386/kernel/Makefile Fri May 30 19:01:58 2003 +++ 900-mjb2/arch/i386/kernel/Makefile Wed Jun 11 22:47:01 2003 @@ -17,6 +17,7 @@ obj-$(CONFIG_MCA) += mca.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_MICROCODE) += microcode.o +obj-$(CONFIG_X86_REMOTE_DEBUG) += gdbstub.o obj-$(CONFIG_PM) += suspend.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_X86_SMP) += smp.o smpboot.o @@ -26,10 +27,19 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o n obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o suspend_asm.o obj-$(CONFIG_X86_NUMAQ) += numaq.o +obj-$(CONFIG_X86_SUMMIT) += summit.o obj-$(CONFIG_EDD) += edd.o obj-$(CONFIG_MODULES) += module.o obj-y += sysenter.o vsyscall.o obj-$(CONFIG_ACPI_SRAT) += srat.o + +ifdef CONFIG_X86_REMOTE_DEBUG +GDBSTART=gdbstart +GDBCLEAN= -rm -f gdbstart /sbin/gdbstart +else +GDBSTART= +GDBCLEAN= +endif EXTRA_AFLAGS := -traditional diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/apic.c 900-mjb2/arch/i386/kernel/apic.c --- 000-virgin/arch/i386/kernel/apic.c Fri May 30 19:01:58 2003 +++ 900-mjb2/arch/i386/kernel/apic.c Wed Jun 11 22:46:30 2003 @@ -1043,7 +1043,8 @@ inline void smp_local_timer_interrupt(st * interrupt as well. Thus we cannot inline the local irq ... ] */ -void smp_apic_timer_interrupt(struct pt_regs regs) +struct pt_regs * IRQHANDLER(smp_apic_timer_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_apic_timer_interrupt(struct pt_regs* regs) { int cpu = smp_processor_id(); @@ -1063,14 +1064,16 @@ void smp_apic_timer_interrupt(struct pt_ * interrupt lock, which is the WrongThing (tm) to do. 
*/ irq_enter(); - smp_local_timer_interrupt(®s); + smp_local_timer_interrupt(regs); irq_exit(); + return regs; } /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -asmlinkage void smp_spurious_interrupt(void) +struct pt_regs * IRQHANDLER(smp_spurious_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_spurious_interrupt(struct pt_regs* regs) { unsigned long v; @@ -1088,13 +1091,15 @@ asmlinkage void smp_spurious_interrupt(v printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never happen.\n", smp_processor_id()); irq_exit(); + return regs; } /* * This interrupt should never happen with our APIC/SMP architecture */ -asmlinkage void smp_error_interrupt(void) +struct pt_regs * IRQHANDLER(smp_error_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_error_interrupt(struct pt_regs* regs) { unsigned long v, v1; @@ -1119,6 +1124,7 @@ asmlinkage void smp_error_interrupt(void printk (KERN_INFO "APIC error on CPU%d: %02lx(%02lx)\n", smp_processor_id(), v , v1); irq_exit(); + return regs; } /* diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/cpu/common.c 900-mjb2/arch/i386/kernel/cpu/common.c --- 000-virgin/arch/i386/kernel/cpu/common.c Sun Apr 20 19:34:56 2003 +++ 900-mjb2/arch/i386/kernel/cpu/common.c Wed Jun 11 22:51:54 2003 @@ -433,9 +433,9 @@ void __init early_cpu_init(void) } /* * cpu_init() initializes state that is per-CPU. Some data is already - * initialized (naturally) in the bootstrap process, such as the GDT - * and IDT. We reload them nevertheless, this function acts as a - * 'CPU state barrier', nothing should get across. + * initialized (naturally) in the bootstrap process, such as the GDT. + * We reload them nevertheless, this function acts as a 'CPU state barrier', + * nothing should get across. */ void __init cpu_init (void) { @@ -459,8 +459,8 @@ void __init cpu_init (void) } /* - * Initialize the per-CPU GDT with the boot GDT, - * and set up the GDT descriptor: + * Initialize the per-CPU GDTs with the boot equivalents, + * and set up the descriptors: */ if (cpu) { memcpy(cpu_gdt_table[cpu], cpu_gdt_table[0], GDT_SIZE); @@ -473,7 +473,6 @@ void __init cpu_init (void) memcpy(thread->tls_array, cpu_gdt_table[cpu], GDT_ENTRY_TLS_ENTRIES * 8); __asm__ __volatile__("lgdt %0": "=m" (cpu_gdt_descr[cpu])); - __asm__ __volatile__("lidt %0": "=m" (idt_descr)); /* * Delete NT @@ -517,3 +516,31 @@ void __init cpu_init (void) current->used_math = 0; stts(); } + +/* + * copy over the boot node idt across all nodes, we currently only have + * non-unique idt entries for device io interrupts. 
+ */ +void __init setup_node_idts(void) +{ + int node = MAX_NUMNODES; + + /* we can skip setting up node0 since it's done in head.S */ + while (--node) { + memcpy(node_idt_table[node], node_idt_table[0], IDT_SIZE); + node_idt_descr[node].size = IDT_SIZE - 1; + node_idt_descr[node].address = (unsigned long)node_idt_table[node]; + } +} + +void __init setup_cpu_idt(void) +{ + int cpu = smp_processor_id(), node = cpu_to_node(cpu); + + printk(KERN_DEBUG "CPU%d IDT at 0x%08lx\n", + cpu, node_idt_descr[node].address); + + /* reload the idt on all processors as they come up */ + __asm__ __volatile__("lidt %0": "=m" (node_idt_descr[node])); +} + diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/cpu/mcheck/p4.c 900-mjb2/arch/i386/kernel/cpu/mcheck/p4.c --- 000-virgin/arch/i386/kernel/cpu/mcheck/p4.c Sun Apr 20 19:34:56 2003 +++ 900-mjb2/arch/i386/kernel/cpu/mcheck/p4.c Wed Jun 11 22:46:30 2003 @@ -61,11 +61,13 @@ static void intel_thermal_interrupt(stru /* Thermal interrupt handler for this CPU setup */ static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; -asmlinkage void smp_thermal_interrupt(struct pt_regs regs) +struct pt_regs * IRQHANDLER(smp_thermal_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_thermal_interrupt(struct pt_regs* regs) { irq_enter(); vendor_thermal_interrupt(®s); irq_exit(); + return regs; } /* P4/Xeon Thermal regulation detect and init */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/doublefault.c 900-mjb2/arch/i386/kernel/doublefault.c --- 000-virgin/arch/i386/kernel/doublefault.c Tue Feb 25 23:03:43 2003 +++ 900-mjb2/arch/i386/kernel/doublefault.c Wed Jun 11 22:51:54 2003 @@ -16,7 +16,7 @@ static unsigned long doublefault_stack[D static void doublefault_fn(void) { - struct Xgt_desc_struct gdt_desc = {0, 0}; + struct Xdt_desc_struct gdt_desc = {0, 0}; unsigned long gdt, tss; __asm__ __volatile__("sgdt %0": "=m" (gdt_desc): :"memory"); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/entry.S 900-mjb2/arch/i386/kernel/entry.S --- 000-virgin/arch/i386/kernel/entry.S Fri May 30 19:01:58 2003 +++ 900-mjb2/arch/i386/kernel/entry.S Wed Jun 11 22:47:03 2003 @@ -49,6 +49,10 @@ #include #include "irq_vectors.h" +#ifndef CONFIG_KGDB_THREAD +#define user_schedule schedule +#endif + EBX = 0x00 ECX = 0x04 EDX = 0x08 @@ -160,7 +164,7 @@ do_lcall: movl %eax,EFLAGS(%ebp) # movl %edx,EIP(%ebp) # Now we move them to their "normal" places movl %ecx,CS(%ebp) # - andl $-8192, %ebp # GET_THREAD_INFO + GET_THREAD_INFO_WITH_ESP(%ebp) # GET_THREAD_INFO movl TI_EXEC_DOMAIN(%ebp), %edx # Get the execution domain call *4(%edx) # Call the lcall7 handler for the domain addl $4, %esp @@ -224,7 +228,7 @@ need_resched: jz restore_all movl $PREEMPT_ACTIVE,TI_PRE_COUNT(%ebp) sti - call schedule + call user_schedule movl $0,TI_PRE_COUNT(%ebp) cli jmp need_resched @@ -306,7 +310,7 @@ work_pending: testb $_TIF_NEED_RESCHED, %cl jz work_notifysig work_resched: - call schedule + call user_schedule cli # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret @@ -394,17 +398,78 @@ ENTRY(irq_entries_start) vector=vector+1 .endr + +# lets play optimizing compiler... +#ifdef CONFIG_X86_CMOV +#define COND_MOVE cmovnz %esi,%esp; +#else +#define COND_MOVE \ + jz 1f; \ + mov %esi,%esp; \ +1: +#endif + +# These macros will switch you to, and from a per-cpu interrupt stack +# They take the pt_regs arg and move it from the normal place on the +# stack to %eax. 
Any handler function can retrieve it using regparm(1). +# The handlers are expected to return the stack to switch back to in +# the same register. +# +# This means that the irq handlers need to return their arg +# +# SWITCH_TO_IRQSTACK clobbers %ebx, %ecx, %edx, %esi +# old stack gets put in %eax + +.macro SWITCH_TO_IRQSTACK + GET_THREAD_INFO(%ebx); + movl TI_IRQ_STACK(%ebx),%ecx; + movl TI_TASK(%ebx),%edx; + movl %esp,%eax; + + # %ecx+THREAD_SIZE is next stack -4 keeps us in the right one + leal (THREAD_SIZE-4)(%ecx),%esi; + + # is there a valid irq_stack? + testl %ecx,%ecx; + COND_MOVE; + + # update the task pointer in the irq stack + GET_THREAD_INFO(%esi); + movl %edx,TI_TASK(%esi); + + # update the preempt count in the irq stack + movl TI_PRE_COUNT(%ebx),%ecx; + movl %ecx,TI_PRE_COUNT(%esi); +.endm + +# copy flags from the irq stack back into the task's thread_info +# %esi is saved over the irq handler call and contains the irq stack's +# thread_info pointer +# %eax was returned from the handler, as described above +# %ebx contains the original thread_info pointer + +.macro RESTORE_FROM_IRQSTACK + movl %eax,%esp; + movl TI_FLAGS(%esi),%eax; + movl $0,TI_FLAGS(%esi); + LOCK orl %eax,TI_FLAGS(%ebx); +.endm + ALIGN common_interrupt: SAVE_ALL + SWITCH_TO_IRQSTACK call do_IRQ + RESTORE_FROM_IRQSTACK jmp ret_from_intr #define BUILD_INTERRUPT(name, nr) \ ENTRY(name) \ pushl $nr-256; \ SAVE_ALL \ - call smp_/**/name; \ + SWITCH_TO_IRQSTACK; \ + call smp_/**/name; \ + RESTORE_FROM_IRQSTACK; \ jmp ret_from_intr; /* The include is where all of the SMP etc. interrupts come from */ @@ -563,6 +628,31 @@ ENTRY(invalid_TSS) pushl $do_invalid_TSS jmp error_code +#ifdef CONFIG_KGDB_THREAD +ENTRY(kern_schedule) + pushl %ebp + movl %esp, %ebp + pushl %ss + pushl %ebp + pushfl + pushl %cs + pushl 4(%ebp) + pushl %eax + pushl %es + pushl %ds + pushl %eax + pushl (%ebp) + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + call kern_do_schedule + movl %ebp, %esp + pop %ebp + ret +#endif + ENTRY(segment_not_present) pushl $do_segment_not_present jmp error_code @@ -594,6 +684,61 @@ ENTRY(spurious_interrupt_bug) pushl $0 pushl $do_spurious_interrupt_bug jmp error_code + + +#ifdef CONFIG_X86_STACK_CHECK +.data + .globl stack_overflowed +stack_overflowed: + .long 0 +.text + +ENTRY(mcount) + push %eax + movl $(THREAD_SIZE - 1),%eax + andl %esp,%eax + cmpl $STACK_WARN,%eax /* more than half the stack is used*/ + jle 1f +2: + popl %eax + ret +1: + lock; btsl $0,stack_overflowed + jc 2b + + # switch to overflow stack + movl %esp,%eax + movl $(stack_overflow_stack + THREAD_SIZE - 4),%esp + + pushf + cli + pushl %eax + + # push eip then esp of error for stack_overflow_panic + pushl 4(%eax) + pushl %eax + + # update the task pointer and cpu in the overflow stack's thread_info. + GET_THREAD_INFO_WITH_ESP(%eax) + movl TI_TASK(%eax),%ebx + movl %ebx,stack_overflow_stack+TI_TASK + movl TI_CPU(%eax),%ebx + movl %ebx,stack_overflow_stack+TI_CPU + + call stack_overflow + + # pop off call arguments + addl $8,%esp + + popl %eax + popf + movl %eax,%esp + popl %eax + movl $0,stack_overflowed + ret + +#warning stack check enabled +#endif .data ENTRY(sys_call_table) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/gdbstart.c 900-mjb2/arch/i386/kernel/gdbstart.c --- 000-virgin/arch/i386/kernel/gdbstart.c Wed Dec 31 16:00:00 1969 +++ 900-mjb2/arch/i386/kernel/gdbstart.c Wed Jun 11 22:42:58 2003 @@ -0,0 +1,147 @@ +/* + * This program opens a tty file and issues the GDB stub activating + * ioctl on it. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +char *tty_name = "/dev/ttyS0" ; /* COM1 port */ +int speed = 9600 ; /* default speed */ +struct termios save_ts ; /* original term struct */ + +void print_usage(void) +{ + printf("gdbstub [-s speed] [-t tty-dev]\n") ; + printf(" defaults: /dev/ttyS0 with speed unmodified by this program\n"); + +} /* print_usage */ + +void tty_err(char *msg) +{ + char buf[100] ; + + strcpy(buf, msg) ; + strcat(buf, ": ") ; + strcat(buf, tty_name) ; + perror(buf) ; + exit(1) ; + +} /* tty_err */ + + +void setup_term(int fd) +{ + struct termios ts ; + int speed_code ; + + if (tcgetattr(fd, &ts) < 0) tty_err("tcgetattr") ; + + save_ts = ts ; + switch (speed) + { + case 4800: + speed_code = B4800 ; + break ; + case 9600: + speed_code = B9600 ; + break ; + case 19200: + speed_code = B19200 ; + break ; + case 38400: + speed_code = B38400 ; + break ; + case 57600: + speed_code = B57600 ; + break ; + case 115200: + speed_code = B115200 ; + break ; + case 230400: + speed_code = B230400 ; + break ; + default: + printf("Invalid speed: %d\n", speed) ; + exit(1) ; + } + + ts.c_cflag = CS8 | CREAD | CLOCAL ; + if (cfsetospeed(&ts, speed_code) < 0) tty_err("cfsetospeed") ; + if (cfsetispeed(&ts, speed_code) < 0) tty_err("cfsetispeed") ; + + if (tcsetattr(fd, TCSANOW, &ts) < 0) tty_err("tcsetattr") ; + +} /* setup_term */ + +int main(int argc, char **argv) +{ + int opt ; + int fil ; + int rslt ; + + while ((opt = getopt(argc, argv, "hs:t:")) > 0) + { + switch (opt) + { + case 's': + speed = atol(optarg) ; + break ; + case 't': + tty_name = optarg ; + break ; + case ':': + printf("Invalid option\n") ; + break ; + case '?': + case 'h': + default: + print_usage() ; + return 1; + } + } + + fil = open(tty_name, O_RDWR) ; + if (fil < 0) + { + perror(tty_name) ; + return 1; + } + + + setup_term(fil) ; + + /* + * When we issue this ioctl, control will not return until + * the debugger running on the remote host machine says "go". + */ + printf("\nAbout to activate GDB stub in the kernel on %s\n", tty_name) ; + printf("Hit CR to continue, kill program to abort -- ") ; + getchar() ; + sync() ; + rslt = ioctl(fil, TIOCGDB, 0) ; + if (rslt < 0) + { + perror("TIOCGDB ioctl") ; + return 1; + } + + printf("\nGDB stub successfully activated\n") ; + + for (;;) + { + pause() ; + } + + if (tcsetattr(fil, TCSANOW, &save_ts) < 0) tty_err("tcsetattr") ; + + exit(0); +} /* main */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/gdbstub.c 900-mjb2/arch/i386/kernel/gdbstub.c --- 000-virgin/arch/i386/kernel/gdbstub.c Wed Dec 31 16:00:00 1969 +++ 900-mjb2/arch/i386/kernel/gdbstub.c Wed Jun 11 22:42:58 2003 @@ -0,0 +1,1208 @@ +/* + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +/* + * Copyright (C) 2000-2001 VERITAS Software Corporation. 
+ */ +/**************************************************************************** + * Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $ + * + * Module name: remcom.c $ + * Revision: 1.34 $ + * Date: 91/03/09 12:29:49 $ + * Contributor: Lake Stevens Instrument Division$ + * + * Description: low level support for gdb debugger. $ + * + * Considerations: only works on target hardware $ + * + * Written by: Glenn Engel $ + * Updated by: Amit Kale + * ModuleState: Experimental $ + * + * NOTES: See Below $ + * + * Modified for 386 by Jim Kingdon, Cygnus Support. + * Origianl kgdb, compatibility with 2.1.xx kernel by David Grothe + * Integrated into 2.2.5 kernel by Tigran Aivazian + * thread support, + * support for multiple processors, + * support for ia-32(x86) hardware debugging, + * Console support, + * handling nmi watchdog + * Amit S. Kale ( akale@veritas.com ) + * + * + * To enable debugger support, two things need to happen. One, a + * call to set_debug_traps() is necessary in order to allow any breakpoints + * or error conditions to be properly intercepted and reported to gdb. + * Two, a breakpoint needs to be generated to begin communication. This + * is most easily accomplished by a call to breakpoint(). Breakpoint() + * simulates a breakpoint by executing an int 3. + * + ************* + * + * The following gdb commands are supported: + * + * command function Return value + * + * g return the value of the CPU registers hex data or ENN + * G set the value of the CPU registers OK or ENN + * + * mAA..AA,LLLL Read LLLL bytes at address AA..AA hex data or ENN + * MAA..AA,LLLL: Write LLLL bytes at address AA.AA OK or ENN + * + * c Resume at current address SNN ( signal NN) + * cAA..AA Continue at address AA..AA SNN + * + * s Step one instruction SNN + * sAA..AA Step one instruction from AA..AA SNN + * + * k kill + * + * ? What was the last sigval ? SNN (signal NN) + * + * All commands and responses are sent with a packet which includes a + * checksum. A packet consists of + * + * $#. + * + * where + * :: + * :: < two hex digits computed as modulo 256 sum of > + * + * When a packet is received, it is first acknowledged with either '+' or '-'. + * '+' indicates a successful transfer. '-' indicates a failed transfer. + * + * Example: + * + * Host: Reply: + * $m0,10#2a +$00010203040506070809101112131415#42 + * + ****************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for linux pt_regs struct */ +#include +#ifdef CONFIG_GDB_CONSOLE +#include +#endif +#include + +/************************************************************************ + * + * external low-level support routines + */ +typedef void (*Function) (void); /* pointer to a function */ + +/* Thread reference */ +typedef unsigned char threadref[8]; + +extern int putDebugChar(int); /* write a single character */ +extern int getDebugChar(void); /* read and return a single char */ + +extern int pid_max; + +/************************************************************************/ +/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/ +/* at least NUMREGBYTES*2 are needed for register packets */ +/* Longer buffer is needed to list all threads */ +#define BUFMAX 1024 + +static char initialized; /* boolean flag. != 0 means we've been initialized */ + +static const char hexchars[] = "0123456789abcdef"; + +/* Number of bytes of registers. 
*/ +#define NUMREGBYTES 64 +/* + * Note that this register image is in a different order than + * the register image that Linux produces at interrupt time. + * + * Linux's register image is defined by struct pt_regs in ptrace.h. + * Just why GDB uses a different order is a historical mystery. + */ +enum regnames { _EAX, /* 0 */ + _ECX, /* 1 */ + _EDX, /* 2 */ + _EBX, /* 3 */ + _ESP, /* 4 */ + _EBP, /* 5 */ + _ESI, /* 6 */ + _EDI, /* 7 */ + _PC /* 8 also known as eip */ , + _PS /* 9 also known as eflags */ , + _CS, /* 10 */ + _SS, /* 11 */ + _DS, /* 12 */ + _ES, /* 13 */ + _FS, /* 14 */ + _GS +}; /* 15 */ + +/*************************** ASSEMBLY CODE MACROS *************************/ +/* */ + +#define BREAKPOINT() asm(" int $3"); + +/* Put the error code here just in case the user cares. */ +int gdb_i386errcode; +/* Likewise, the vector number here (since GDB only gets the signal + number through the usual means, and that's not very specific). */ +int gdb_i386vector = -1; + +static spinlock_t slavecpulocks[KGDB_MAX_NO_CPUS]; +volatile int procindebug[KGDB_MAX_NO_CPUS]; + +#ifdef CONFIG_SMP +spinlock_t kgdb_spinlock = SPIN_LOCK_UNLOCKED; +spinlock_t kgdb_nmispinlock = SPIN_LOCK_UNLOCKED; +#else +unsigned kgdb_spinlock = 0; +unsigned kgdb_nmispinlock = 0; +#endif + +static void +kgdb_usercode(void) +{ +} + +int +hex(char ch) +{ + if ((ch >= 'a') && (ch <= 'f')) + return (ch - 'a' + 10); + if ((ch >= '0') && (ch <= '9')) + return (ch - '0'); + if ((ch >= 'A') && (ch <= 'F')) + return (ch - 'A' + 10); + return (-1); +} + +/* scan for the sequence $# */ +void +getpacket(char *buffer) +{ + unsigned char checksum; + unsigned char xmitcsum; + int i; + int count; + char ch; + + do { + /* wait around for the start character, ignore all other characters */ + while ((ch = (getDebugChar() & 0x7f)) != '$') ; + checksum = 0; + xmitcsum = -1; + + count = 0; + + /* now, read until a # or end of buffer is found */ + while (count < BUFMAX) { + ch = getDebugChar() & 0x7f; + if (ch == '#') + break; + checksum = checksum + ch; + buffer[count] = ch; + count = count + 1; + } + buffer[count] = 0; + + if (ch == '#') { + xmitcsum = hex(getDebugChar() & 0x7f) << 4; + xmitcsum += hex(getDebugChar() & 0x7f); + + if (checksum != xmitcsum) + putDebugChar('-'); /* failed checksum */ + else { + putDebugChar('+'); /* successful transfer */ + /* if a sequence char is present, reply the sequence ID */ + if (buffer[2] == ':') { + putDebugChar(buffer[0]); + putDebugChar(buffer[1]); + /* remove sequence chars from buffer */ + count = strlen(buffer); + for (i = 3; i <= count; i++) + buffer[i - 3] = buffer[i]; + } + } + } + } while (checksum != xmitcsum); + +} + +/* send the packet in buffer. */ + +void +putpacket(char *buffer) +{ + unsigned char checksum; + int count; + char ch; + + /* $#. 
*/ + do { + putDebugChar('$'); + checksum = 0; + count = 0; + + while ((ch = buffer[count])) { + if (!putDebugChar(ch)) + return; + checksum += ch; + count += 1; + } + + putDebugChar('#'); + putDebugChar(hexchars[checksum >> 4]); + putDebugChar(hexchars[checksum % 16]); + + } while ((getDebugChar() & 0x7f) != '+'); + +} + +static char remcomInBuffer[BUFMAX]; +static char remcomOutBuffer[BUFMAX]; +static short error; + +static void +regs_to_gdb_regs(int *gdb_regs, struct pt_regs *regs) +{ + gdb_regs[_EAX] = regs->eax; + gdb_regs[_EBX] = regs->ebx; + gdb_regs[_ECX] = regs->ecx; + gdb_regs[_EDX] = regs->edx; + gdb_regs[_ESI] = regs->esi; + gdb_regs[_EDI] = regs->edi; + gdb_regs[_EBP] = regs->ebp; + gdb_regs[_DS] = regs->xds; + gdb_regs[_ES] = regs->xes; + gdb_regs[_PS] = regs->eflags; + gdb_regs[_CS] = regs->xcs; + gdb_regs[_PC] = regs->eip; + gdb_regs[_ESP] = (int) (®s->esp); + gdb_regs[_SS] = __KERNEL_DS; + gdb_regs[_FS] = 0xFFFF; + gdb_regs[_GS] = 0xFFFF; +} /* regs_to_gdb_regs */ + +static void +gdb_regs_to_regs(int *gdb_regs, struct pt_regs *regs) +{ + regs->eax = gdb_regs[_EAX]; + regs->ebx = gdb_regs[_EBX]; + regs->ecx = gdb_regs[_ECX]; + regs->edx = gdb_regs[_EDX]; + regs->esi = gdb_regs[_ESI]; + regs->edi = gdb_regs[_EDI]; + regs->ebp = gdb_regs[_EBP]; + regs->xds = gdb_regs[_DS]; + regs->xes = gdb_regs[_ES]; + regs->eflags = gdb_regs[_PS]; + regs->xcs = gdb_regs[_CS]; + regs->eip = gdb_regs[_PC]; +#if 0 /* can't change these */ + regs->esp = gdb_regs[_ESP]; + regs->xss = gdb_regs[_SS]; + regs->fs = gdb_regs[_FS]; + regs->gs = gdb_regs[_GS]; +#endif + +} /* gdb_regs_to_regs */ + +/* Indicate to caller of mem2hex or hex2mem that there has been an + error. */ +static volatile int kgdb_memerr = 0; +volatile int kgdb_memerr_expected = 0; +static volatile int kgdb_memerr_cnt = 0; +static int garbage_loc = -1; + +int +get_char(char *addr) +{ + return *addr; +} + +void +set_char(char *addr, int val) +{ + *addr = val; +} + +/* convert the memory pointed to by mem into hex, placing result in buf */ +/* return a pointer to the last char put in buf (null) */ +/* If MAY_FAULT is non-zero, then we should set kgdb_memerr in response to + a fault; if zero treat a fault like any other fault in the stub. 
*/ +char * +mem2hex(char *mem, char *buf, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + kgdb_memerr_expected = 1; + kgdb_memerr = 0; + } + for (i = 0; i < count; i++) { + + ch = get_char(mem++); + + if (may_fault && kgdb_memerr) { + *buf = 0; /* truncate buffer */ + return (buf); + } + *buf++ = hexchars[ch >> 4]; + *buf++ = hexchars[ch % 16]; + } + *buf = 0; + if (may_fault) + kgdb_memerr_expected = 0; + return (buf); +} + +/* convert the hex array pointed to by buf into binary to be placed in mem */ +/* return a pointer to the character AFTER the last byte written */ +char * +hex2mem(char *buf, char *mem, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + kgdb_memerr_expected = 1; + kgdb_memerr = 0; + } + for (i = 0; i < count; i++) { + ch = hex(*buf++) << 4; + ch = ch + hex(*buf++); + set_char(mem++, ch); + + if (may_fault && kgdb_memerr) { + return (mem); + } + } + if (may_fault) + kgdb_memerr_expected = 0; + return (mem); +} + +/**********************************************/ +/* WHILE WE FIND NICE HEX CHARS, BUILD AN INT */ +/* RETURN NUMBER OF CHARS PROCESSED */ +/**********************************************/ +int +hexToInt(char **ptr, int *intValue) +{ + int numChars = 0; + int hexValue; + + *intValue = 0; + + while (**ptr) { + hexValue = hex(**ptr); + if (hexValue >= 0) { + *intValue = (*intValue << 4) | hexValue; + numChars++; + } else + break; + + (*ptr)++; + } + + return (numChars); +} + +#ifdef CONFIG_KGDB_THREAD +static int +stubhex(int ch) +{ + if (ch >= 'a' && ch <= 'f') + return ch - 'a' + 10; + if (ch >= '0' && ch <= '9') + return ch - '0'; + if (ch >= 'A' && ch <= 'F') + return ch - 'A' + 10; + return -1; +} + +static int +stub_unpack_int(char *buff, int fieldlength) +{ + int nibble; + int retval = 0; + + while (fieldlength) { + nibble = stubhex(*buff++); + retval |= nibble; + fieldlength--; + if (fieldlength) + retval = retval << 4; + } + return retval; +} +#endif + +static char * +pack_hex_byte(char *pkt, int byte) +{ + *pkt++ = hexchars[(byte >> 4) & 0xf]; + *pkt++ = hexchars[(byte & 0xf)]; + return pkt; +} + +#define BUF_THREAD_ID_SIZE 16 + +#ifdef CONFIG_KGDB_THREAD +static char * +pack_threadid(char *pkt, threadref * id) +{ + char *limit; + unsigned char *altid; + + altid = (unsigned char *) id; + limit = pkt + BUF_THREAD_ID_SIZE; + while (pkt < limit) + pkt = pack_hex_byte(pkt, *altid++); + return pkt; +} + +static char * +unpack_byte(char *buf, int *value) +{ + *value = stub_unpack_int(buf, 2); + return buf + 2; +} + +static char * +unpack_threadid(char *inbuf, threadref * id) +{ + char *altref; + char *limit = inbuf + BUF_THREAD_ID_SIZE; + int x, y; + + altref = (char *) id; + + while (inbuf < limit) { + x = stubhex(*inbuf++); + y = stubhex(*inbuf++); + *altref++ = (x << 4) | y; + } + return inbuf; +} +#endif + +void +int_to_threadref(threadref * id, int value) +{ + unsigned char *scan; + + scan = (unsigned char *) id; + { + int i = 4; + while (i--) + *scan++ = 0; + } + *scan++ = (value >> 24) & 0xff; + *scan++ = (value >> 16) & 0xff; + *scan++ = (value >> 8) & 0xff; + *scan++ = (value & 0xff); +} + +#ifdef CONFIG_KGDB_THREAD +static int +threadref_to_int(threadref * ref) +{ + int i, value = 0; + unsigned char *scan; + + scan = (char *) ref; + scan += 4; + i = 4; + while (i-- > 0) + value = (value << 8) | ((*scan++) & 0xff); + return value; +} + +struct task_struct * +getthread(int pid) +{ + struct task_struct *thread; + thread = find_task_by_pid(pid); + if (thread) { + return thread; + } +#if 0 + 
thread = init_tasks[0]; + do { + if (thread->pid == pid) { + return thread; + } + thread = thread->next_task; + } while (thread != init_tasks[0]); +#endif + return NULL; +} +#endif + +struct hw_breakpoint { + unsigned enabled; + unsigned type; + unsigned len; + unsigned addr; +} breakinfo[4] = { { +enabled:0}, { +enabled:0}, { +enabled:0}, { +enabled:0}}; + +void +correct_hw_break(void) +{ + int breakno; + int correctit; + int breakbit; + unsigned dr7; + + asm volatile ("movl %%db7, %0\n":"=r" (dr7) + :); + do { + unsigned addr0, addr1, addr2, addr3; + asm volatile ("movl %%db0, %0\n" + "movl %%db1, %1\n" + "movl %%db2, %2\n" + "movl %%db3, %3\n":"=r" (addr0), "=r"(addr1), + "=r"(addr2), "=r"(addr3):); + } while (0); + correctit = 0; + for (breakno = 0; breakno < 3; breakno++) { + breakbit = 2 << (breakno << 1); + if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { + correctit = 1; + dr7 |= breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + dr7 |= (((breakinfo[breakno].len << 2) | + breakinfo[breakno].type) << 16) << + (breakno << 2); + switch (breakno) { + case 0: + asm volatile ("movl %0, %%dr0\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 1: + asm volatile ("movl %0, %%dr1\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 2: + asm volatile ("movl %0, %%dr2\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 3: + asm volatile ("movl %0, %%dr3\n"::"r" + (breakinfo[breakno].addr)); + break; + } + } else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { + correctit = 1; + dr7 &= ~breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + } + } + if (correctit) { + asm volatile ("movl %0, %%db7\n"::"r" (dr7)); + } +} + +int +remove_hw_break(unsigned breakno) +{ + if (!breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 0; + return 0; +} + +int +set_hw_break(unsigned breakno, unsigned type, unsigned len, unsigned addr) +{ + if (breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 1; + breakinfo[breakno].type = type; + breakinfo[breakno].len = len; + breakinfo[breakno].addr = addr; + return 0; +} + +void +gdb_wait(void *arg) +{ + unsigned flags; + int processor; + + local_irq_save(flags); + processor = smp_processor_id(); + procindebug[processor] = 1; + current->thread.kgdbregs = arg; + spin_lock(slavecpulocks + processor); + correct_hw_break(); + procindebug[processor] = 0; + local_irq_restore(flags); +} + +void +printexceptioninfo(int exceptionNo, int errorcode, char *buffer) +{ + unsigned dr6; + int i; + switch (exceptionNo) { + case 1: /* debug exception */ + break; + case 3: /* breakpoint */ + sprintf(buffer, "Software breakpoint"); + return; + default: + sprintf(buffer, "Details not available"); + return; + } + asm volatile ("movl %%db6, %0\n":"=r" (dr6) + :); + if (dr6 & 0x4000) { + sprintf(buffer, "Single step"); + return; + } + for (i = 0; i < 4; ++i) { + if (dr6 & (1 << i)) { + sprintf(buffer, "Hardware breakpoint %d", i); + return; + } + } + sprintf(buffer, "Unknown trap"); + return; +} + +/* + * This function does all command procesing for interfacing to gdb. + * + * NOTE: The INT nn instruction leaves the state of the interrupt + * enable flag UNCHANGED. That means that when this routine + * is entered via a breakpoint (INT 3) instruction from code + * that has interrupts enabled, then interrupts will STILL BE + * enabled when this routine is entered. 
The first thing that + * we do here is disable interrupts so as to prevent recursive + * entries and bothersome serial interrupts while we are + * trying to run the serial port in polled mode. + * + * For kernel version 2.1.xx the cli() actually gets a spin lock so + * it is always necessary to do a restore_flags before returning + * so as to let go of that lock. + */ +int +handle_exception(int exceptionVector, + int signo, int err_code, struct pt_regs *linux_regs) +{ + struct task_struct *usethread = NULL; + int addr, length; + int breakno, breaktype; + char *ptr; + int newPC; + unsigned long flags = ~0UL; + int gdb_regs[NUMREGBYTES / 4]; + int i; + int dr6; + int reboot = 0; +#ifdef CONFIG_KGDB_THREAD + int nothreads; + int maxthreads; + int threadid; + threadref thref; + struct task_struct *thread = NULL; +#endif +#define regs (*linux_regs) + + /* + * If the entry is not from the kernel then return to the Linux + * trap handler and let it process the interrupt normally. + */ + if ((linux_regs->eflags & VM_MASK) || (3 & linux_regs->xcs)) { + return (0); + } + + if (kgdb_memerr_expected) { + /* + * This fault occured because of the get_char or set_char + * routines. These two routines use either eax of edx to + * indirectly reference the location in memory that they + * are working with. For a page fault, when we return + * the instruction will be retried, so we have to make + * sure that these registers point to valid memory. + */ + kgdb_memerr = 1; /* set mem error flag */ + kgdb_memerr_expected = 0; + kgdb_memerr_cnt++; /* helps in debugging */ + regs.eax = (long) &garbage_loc; /* make valid address */ + regs.edx = (long) &garbage_loc; /* make valid address */ + return (0); + } +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_nmispinlock)) +#else + if (!kgdb_nmispinlock) +#endif + { + + /* Get kgdb spinlock */ +#ifdef CONFIG_SMP + _raw_spin_lock(&kgdb_spinlock); +#else + kgdb_spinlock = 1; +#endif + + local_irq_save(flags); + + /* Disable hardware debugging while we are in kgdb */ + __asm__("movl %0,%%db7": /* no output */ + :"r"(0)); + + for (i = 0; i < NR_CPUS; i++) { + spin_lock_init(&slavecpulocks[i]); + _raw_spin_lock(&slavecpulocks[i]); + } + + if (num_online_cpus() > 1) { + /* Force other cpus in debugger */ + if (smp_call_function(gdb_wait, NULL, 0, 99) != 0) { + return (1); + } + } + + procindebug[smp_processor_id()] = 1; + } + + gdb_i386vector = exceptionVector; + gdb_i386errcode = err_code; + + /* reply to host that an exception has occurred */ + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + + putpacket(remcomOutBuffer); + + while (1 == 1) { + error = 0; + remcomOutBuffer[0] = 0; + getpacket(remcomInBuffer); + switch (remcomInBuffer[0]) { + case '?': + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + break; + case 'g': /* return the value of the CPU registers */ + if (!usethread || usethread == current) { + regs_to_gdb_regs(gdb_regs, ®s); + } else { + memset(gdb_regs, 0, NUMREGBYTES); + if (usethread->thread.kgdbregs) { + kgdb_memerr_expected = 1; + kgdb_memerr = 0; + get_char((char *) usethread->thread. + kgdbregs); + kgdb_memerr_expected = 0; + if (kgdb_memerr) { + gdb_regs[_PC] = + (int) kgdb_usercode; + } else { + regs_to_gdb_regs(gdb_regs, + usethread-> + thread. 
+ kgdbregs); + } + } else { + gdb_regs[_PC] = (int) kgdb_usercode; + } + } + mem2hex((char *) gdb_regs, remcomOutBuffer, NUMREGBYTES, + 0); + break; + case 'G': /* set the value of the CPU registers - return OK */ + hex2mem(&remcomInBuffer[1], (char *) gdb_regs, + NUMREGBYTES, 0); + if (!usethread || usethread == current) { + gdb_regs_to_regs(gdb_regs, ®s); + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "E00"); + } + break; + + /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ + case 'm': + /* TRY TO READ %x,%x. IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr)) + if (*(ptr++) == ',') + if (hexToInt(&ptr, &length)) { + ptr = 0; + mem2hex((char *) addr, + remcomOutBuffer, length, + 1); + if (kgdb_memerr) { + strcpy(remcomOutBuffer, + "E03"); + } + } + + if (ptr) { + strcpy(remcomOutBuffer, "E01"); + } + break; + + /* MAA..AA,LLLL: Write LLLL bytes at address AA.AA return OK */ + case 'M': + /* TRY TO READ '%x,%x:'. IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr)) + if (*(ptr++) == ',') + if (hexToInt(&ptr, &length)) + if (*(ptr++) == ':') { + hex2mem(ptr, + (char *) addr, + length, 1); + + if (kgdb_memerr) { + strcpy + (remcomOutBuffer, + "E03"); + } else { + strcpy + (remcomOutBuffer, + "OK"); + } + + ptr = 0; + } + if (ptr) { + strcpy(remcomOutBuffer, "E02"); + } + break; + + /* cAA..AA Continue at address AA..AA(optional) */ + /* sAA..AA Step one instruction from AA..AA(optional) */ + case 'c': + case 's': +#ifdef CONFIG_SMP + if (spin_is_locked(&kgdb_nmispinlock)) +#else + if (kgdb_nmispinlock) +#endif + { + strcpy(remcomOutBuffer, "E01"); + break; + } + + /* try to read optional parameter, pc unchanged if no parm */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr)) { + regs.eip = addr; + } + + newPC = regs.eip; + + /* clear the trace bit */ + regs.eflags &= 0xfffffeff; + + /* set the trace bit if we're stepping */ + if (remcomInBuffer[0] == 's') + regs.eflags |= 0x100; + + asm volatile ("movl %%db6, %0\n":"=r" (dr6) + :); + if (!(dr6 & 0x4000)) { + for (breakno = 0; breakno < 4; ++breakno) { + if (dr6 & (1 << breakno)) { + if (breakinfo[breakno].type == + 0) { + /* Set restore flag */ + regs.eflags |= 0x10000; + break; + } + } + } + } + correct_hw_break(); + asm volatile ("movl %0, %%db6\n"::"r" (0)); + for (i = 0; i < NR_CPUS; i++) { + _raw_spin_unlock(&slavecpulocks[i]); + } + + procindebug[smp_processor_id()] = 0; + /* Release kgdb spinlock */ +#ifdef CONFIG_SMP + _raw_spin_unlock(&kgdb_spinlock); +#else + kgdb_spinlock = 0; +#endif + if (flags != ~0UL) + local_irq_restore(flags); + return (0); + + /* kill the program */ + case 'k': + break; + + /* query */ + case 'q': + switch (remcomInBuffer[1]) { +#ifdef CONFIG_KGDB_THREAD + case 'L': + /* List threads */ + unpack_byte(remcomInBuffer + 3, &maxthreads); + unpack_threadid(remcomInBuffer + 5, &thref); + + remcomOutBuffer[0] = 'q'; + remcomOutBuffer[1] = 'M'; + remcomOutBuffer[4] = '0'; + pack_threadid(remcomOutBuffer + 5, &thref); + + threadid = threadref_to_int(&thref); + for (nothreads = 0; + nothreads < maxthreads + && threadid < pid_max; threadid++) { + thread = getthread(threadid); + if (thread) { + int_to_threadref(&thref, + threadid); + pack_threadid(remcomOutBuffer + + 21 + + nothreads * 16, + &thref); + nothreads++; + } + } + if (threadid == pid_max) { + remcomOutBuffer[4] = '1'; + } + pack_hex_byte(remcomOutBuffer + 2, nothreads); + remcomOutBuffer[21 + nothreads * 16] = '\0'; + break; + + case 'C': + /* Current thread id 
*/ + remcomOutBuffer[0] = 'Q'; + remcomOutBuffer[1] = 'C'; + threadid = current->pid; + int_to_threadref(&thref, threadid); + pack_threadid(remcomOutBuffer + 2, &thref); + remcomOutBuffer[18] = '\0'; + break; +#endif + + case 'E': + /* Print exception info */ + printexceptioninfo(exceptionVector, err_code, + remcomOutBuffer); + break; + } + break; + +#ifdef CONFIG_KGDB_THREAD + /* task related */ + case 'H': + switch (remcomInBuffer[1]) { + case 'g': + ptr = &remcomInBuffer[2]; + hexToInt(&ptr, &threadid); + thread = getthread(threadid); + if (!thread) { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + break; + } + usethread = thread; + /* follow through */ + case 'c': + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + break; + } + break; + + /* Query thread status */ + case 'T': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &threadid); + thread = getthread(threadid); + if (thread) { + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + } else { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + } + break; +#endif + + case 'r': + reboot = 1; + strcpy(remcomOutBuffer, "OK"); + break; + case 'Y': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &breakno); + ptr++; + hexToInt(&ptr, &breaktype); + ptr++; + hexToInt(&ptr, &length); + ptr++; + hexToInt(&ptr, &addr); + if (set_hw_break + (breakno & 0x3, breaktype & 0x3, length & 0x3, addr) + == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + /* Remove hardware breakpoint */ + case 'y': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &breakno); + if (remove_hw_break(breakno & 0x3) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + } /* switch */ + + /* reply to the request */ + putpacket(remcomOutBuffer); + if (reboot == 1) { + static long no_idt[2]; + __asm__ __volatile__("lidt %0"::"m"(no_idt)); + __asm__ __volatile__("int3"); + } + } +} + +/* this function is used to set up exception handlers for tracing and + breakpoints */ +void +set_debug_traps(void) +{ + /* + * linux_debug_hook is defined in traps.c. We store a pointer + * to our own exception handler into it. + */ + linux_debug_hook = handle_exception; + + /* + * In case GDB is started before us, ack any packets (presumably + * "$?#xx") sitting there. */ + putDebugChar('+'); + + initialized = 1; +} + +/* This function will generate a breakpoint exception. It is used at the + beginning of a program to sync up with a debugger and can be used + otherwise as a quick means to stop program execution and "break" into + the debugger. 
*/ + +void +breakpoint(void) +{ + if (initialized) + BREAKPOINT(); +} + +#ifdef CONFIG_GDB_CONSOLE +char gdbconbuf[BUFMAX]; + +void +gdb_console_write(struct console *co, const char *s, unsigned count) +{ + int i; + int wcount; + char *bufptr; + + if (!gdb_initialized) { + return; + } + gdbconbuf[0] = 'O'; + bufptr = gdbconbuf + 1; + while (count > 0) { + if ((count << 1) > (BUFMAX - 2)) { + wcount = (BUFMAX - 2) >> 1; + } else { + wcount = count; + } + count -= wcount; + for (i = 0; i < wcount; i++) { + bufptr = pack_hex_byte(bufptr, s[i]); + } + *bufptr = '\0'; + s += wcount; + + putpacket(gdbconbuf); + + } +} +#endif +static int __init +kgdb_opt_gdb(char *dummy) +{ + gdb_enter = 1; + return 1; +} +static int __init +kgdb_opt_gdbttyS(char *str) +{ + gdb_ttyS = simple_strtoul(str, NULL, 10); + return 1; +} +static int __init +kgdb_opt_gdbbaud(char *str) +{ + gdb_baud = simple_strtoul(str, NULL, 10); + return 1; +} + +/* + * Sequence of these lines has to be maintained because gdb option is a prefix + * of the other two options + */ + +__setup("gdbttyS=", kgdb_opt_gdbttyS); +__setup("gdbbaud=", kgdb_opt_gdbbaud); +__setup("gdb", kgdb_opt_gdb); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/head.S 900-mjb2/arch/i386/kernel/head.S --- 000-virgin/arch/i386/kernel/head.S Fri May 30 19:01:58 2003 +++ 900-mjb2/arch/i386/kernel/head.S Wed Jun 11 22:51:54 2003 @@ -16,6 +16,7 @@ #include #include #include +#include #define OLD_CL_MAGIC_ADDR 0x90020 #define OLD_CL_MAGIC 0xA33F @@ -249,7 +250,7 @@ is386: movl $2,%ecx # set MP call check_x87 incb ready lgdt cpu_gdt_descr - lidt idt_descr + lidt node_idt_descr # we switch to the per-node IDTs later ljmp $(__KERNEL_CS),$1f 1: movl $(__KERNEL_DS),%eax # reload all the segment registers movl %eax,%ss # after changing gdt. 
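/*
 * For reference, a minimal C sketch of what the per-node IDT switch above
 * amounts to: each node gets its own IDT descriptor (node_idt_descr), and a
 * CPU loads the descriptor belonging to its node.  The struct layout and the
 * load_idt_sketch()/cpu_to_node() call shown here are illustrative
 * assumptions, not code taken from this hunk.
 */
struct idt_desc {
	unsigned short size;	/* IDT_ENTRIES * 8 - 1 */
	unsigned long  base;	/* linear address of this node's node_idt_table[] */
} __attribute__((packed));

static inline void load_idt_sketch(const struct idt_desc *d)
{
	/* same instruction the boot path issues above: lidt node_idt_descr */
	asm volatile("lidt %0" : : "m" (*d));
}

/* e.g. a secondary CPU would do: load_idt_sketch(&node_idt_descr[cpu_to_node(cpu)]); */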
@@ -314,7 +315,7 @@ setup_idt: movw %dx,%ax /* selector = 0x0010 = cs */ movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ - lea idt_table,%edi + lea node_idt_table,%edi mov $256,%ecx rp_sidt: movl %eax,(%edi) @@ -325,7 +326,7 @@ rp_sidt: ret ENTRY(stack_start) - .long init_thread_union+8192 + .long init_thread_union+THREAD_SIZE .long __BOOT_DS /* This is the default interrupt "handler" :-) */ @@ -359,14 +360,16 @@ ignore_int: * segment size, and 32-bit linear address value: */ -.globl idt_descr +.globl node_idt_descr .globl cpu_gdt_descr ALIGN .word 0 # 32-bit align idt_desc.address -idt_descr: +node_idt_descr: .word IDT_ENTRIES*8-1 # idt contains 256 entries - .long idt_table + .long node_idt_table + + .fill MAX_NUMNODES-1,8,0 # boot GDT descriptor (later on used by CPU#0): .word 0 # 32 bit align gdt_desc.address diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/i386_ksyms.c 900-mjb2/arch/i386/kernel/i386_ksyms.c --- 000-virgin/arch/i386/kernel/i386_ksyms.c Fri May 30 19:01:58 2003 +++ 900-mjb2/arch/i386/kernel/i386_ksyms.c Wed Jun 11 22:55:23 2003 @@ -95,7 +95,7 @@ EXPORT_SYMBOL(apm_info); EXPORT_SYMBOL(__io_virt_debug); #endif -EXPORT_SYMBOL_NOVERS(__down_failed); +EXPORT_SYMBOL_NOVERS(__down_failed_wq); EXPORT_SYMBOL_NOVERS(__down_failed_interruptible); EXPORT_SYMBOL_NOVERS(__down_failed_trylock); EXPORT_SYMBOL_NOVERS(__up_wakeup); @@ -146,6 +146,20 @@ EXPORT_SYMBOL(smp_num_siblings); EXPORT_SYMBOL(cpu_sibling_map); #endif +#ifdef CONFIG_X86_REMOTE_DEBUG +void __this_fixmap_does_not_exist(void) +{ + BUG(); +} +EXPORT_SYMBOL(__this_fixmap_does_not_exist); + +void __br_lock_usage_bug(void) +{ + BUG(); +} +EXPORT_SYMBOL(__br_lock_usage_bug); +#endif + #ifdef CONFIG_SMP EXPORT_SYMBOL(cpu_data); EXPORT_SYMBOL(cpu_online_map); @@ -207,4 +221,9 @@ EXPORT_SYMBOL(kmap_atomic_to_page); #ifdef CONFIG_EDD_MODULE EXPORT_SYMBOL(edd); EXPORT_SYMBOL(eddnr); +#endif + +#ifdef CONFIG_X86_STACK_CHECK +extern void mcount(void); +EXPORT_SYMBOL(mcount); #endif diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/init_task.c 900-mjb2/arch/i386/kernel/init_task.c --- 000-virgin/arch/i386/kernel/init_task.c Thu Feb 13 11:08:02 2003 +++ 900-mjb2/arch/i386/kernel/init_task.c Wed Jun 11 22:46:30 2003 @@ -14,6 +14,14 @@ static struct signal_struct init_signals static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +union thread_union init_irq_union + __attribute__((__section__(".data.init_task"))); + +#ifdef CONFIG_X86_STACK_CHECK +union thread_union stack_overflow_stack + __attribute__((__section__(".data.init_task"))); +#endif + /* * Initial thread structure. 
* diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/io_apic.c 900-mjb2/arch/i386/kernel/io_apic.c --- 000-virgin/arch/i386/kernel/io_apic.c Fri May 30 19:01:59 2003 +++ 900-mjb2/arch/i386/kernel/io_apic.c Wed Jun 11 22:55:20 2003 @@ -271,7 +271,7 @@ static void set_ioapic_affinity (unsigne spin_unlock_irqrestore(&ioapic_lock, flags); } -#if defined(CONFIG_SMP) +#if defined(CONFIG_IRQBALANCE) # include /* kernel_thread() */ # include /* kstat */ # include /* kmalloc() */ @@ -665,8 +665,6 @@ static int __init irqbalance_disable(cha __setup("noirqbalance", irqbalance_disable); -static void set_ioapic_affinity (unsigned int irq, unsigned long mask); - static inline void move_irq(int irq) { /* note - we hold the desc->lock */ @@ -678,9 +676,9 @@ static inline void move_irq(int irq) __initcall(balanced_irq_init); -#else /* !SMP */ +#else /* !IRQBALANCE */ static inline void move_irq(int irq) { } -#endif /* defined(CONFIG_SMP) */ +#endif /* defined(IRQBALANCE) */ /* @@ -1117,24 +1115,59 @@ static inline int IO_APIC_irq_trigger(in } int irq_vector[NR_IRQS] = { FIRST_DEVICE_VECTOR , 0 }; +int __initdata vector_allocated[MAX_NUMNODES][FIRST_SYSTEM_VECTOR]; -static int __init assign_irq_vector(int irq) -{ - static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; - if (IO_APIC_VECTOR(irq) > 0) - return IO_APIC_VECTOR(irq); +/* + * This is the per node vector allocator, it will only work for systems which + * have ioapics which can only deliver vectors to cpus on the same node and + * thus have hardware enforced ioapic/irq node affinity. + * + * However currently the only i386 systems which have this interrupt + * dispatching/servicing architecture are NUMAQ and x440. We try and 'share' + * vectors where possible to simplify cases where an irq can be serviced on + * multiple nodes due to it being present on multiple busses/nodes. + * The first pass on node0 will ensure we catch these node 'shared' irqs. + */ +static int __init assign_irq_vector(int irq, int node) +{ + static int offset[MAX_NUMNODES]; + static int nr_assigned[MAX_NUMNODES] = {[0 ... MAX_NUMNODES-1] = 1}; + static int current_vector[MAX_NUMNODES] = + {[0 ... 
MAX_NUMNODES-1] = FIRST_DEVICE_VECTOR}; + + int vector; + + Dprintk("requesting vector for node%d/irq%d\n", node, irq); + vector = IO_APIC_VECTOR(irq); + if (vector > 0) { + Dprintk("returning previous allocation vector0x%x\n", vector); + vector_allocated[node][vector]++; + return vector; + } + + if (++nr_assigned[node] > NR_IRQ_VECTORS) + return -ENOSPC; + next: - current_vector += 8; - if (current_vector == SYSCALL_VECTOR) + current_vector[node] += 8; + if (current_vector[node] == SYSCALL_VECTOR) goto next; - if (current_vector >= FIRST_SYSTEM_VECTOR) { - offset = (offset + 1) & 7; - current_vector = FIRST_DEVICE_VECTOR + offset; + if (current_vector[node] > FIRST_SYSTEM_VECTOR) { + offset[node] = (offset[node]+1) & 7; + current_vector[node] = FIRST_DEVICE_VECTOR + offset[node]; } - IO_APIC_VECTOR(irq) = current_vector; - return current_vector; + vector = current_vector[node]; + if (vector_allocated[node][vector]) + goto next; + + vector_allocated[node][vector]++; + IO_APIC_VECTOR(irq) = vector; + Dprintk("returning new allocation node%d/irq%d -> vector0x%x\n", + node, irq, vector); + + return vector; } static struct hw_interrupt_type ioapic_level_irq_type; @@ -1143,7 +1176,7 @@ static struct hw_interrupt_type ioapic_e void __init setup_IO_APIC_irqs(void) { struct IO_APIC_route_entry entry; - int apic, pin, idx, irq, first_notcon = 1, vector; + int apic, pin, idx, irq, first_notcon = 1, vector, bus, node; unsigned long flags; printk(KERN_DEBUG "init IO_APIC IRQs\n"); @@ -1159,7 +1192,8 @@ void __init setup_IO_APIC_irqs(void) entry.delivery_mode = INT_DELIVERY_MODE; entry.dest_mode = INT_DEST_MODE; entry.mask = 0; /* enable IRQ */ - entry.dest.logical.logical_dest = TARGET_CPUS; + entry.dest.logical.logical_dest = + cpu_mask_to_apicid(TARGET_CPUS); idx = find_irq_entry(apic,pin,mp_INT); if (idx == -1) { @@ -1174,12 +1208,21 @@ void __init setup_IO_APIC_irqs(void) entry.trigger = irq_trigger(idx); entry.polarity = irq_polarity(idx); + bus = mp_irqs[idx].mpc_srcbus; + node = mp_bus_id_to_node[bus]; + if (irq_trigger(idx)) { entry.trigger = 1; entry.mask = 1; } irq = pin_2_irq(idx, apic, pin); + if (irq >= NR_IRQS) { + printk("skipping irq%d on node%d/bus%d/ioapic%d out of IRQs!\n", + irq, node, bus, apic); + continue; + } + /* * skip adding the timer int on secondary nodes, which causes * a small but painful rift in the time-space continuum @@ -1193,7 +1236,10 @@ void __init setup_IO_APIC_irqs(void) continue; if (IO_APIC_IRQ(irq)) { - vector = assign_irq_vector(irq); + vector = assign_irq_vector(irq, node); + if (vector < 0) + continue; + entry.vector = vector; if (IO_APIC_irq_trigger(irq)) @@ -1201,11 +1247,15 @@ void __init setup_IO_APIC_irqs(void) else irq_desc[irq].handler = &ioapic_edge_irq_type; - set_intr_gate(vector, interrupt[irq]); - + Dprintk("irq_setup: node%d/bus%d/ioapic%d/vector0x%x - irq%d %p\n", + node, bus, apic, vector, irq, interrupt[irq]); + + node_set_intr_gate(node, vector, interrupt[irq]); + if (!apic && (irq < 16)) disable_8259A_irq(irq); } + spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); @@ -1238,7 +1288,7 @@ void __init setup_ExtINT_IRQ0_pin(unsign */ entry.dest_mode = INT_DEST_MODE; entry.mask = 0; /* unmask IRQ now */ - entry.dest.logical.logical_dest = TARGET_CPUS; + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); entry.delivery_mode = INT_DELIVERY_MODE; entry.polarity = 0; entry.trigger = 0; @@ -1896,6 +1946,7 @@ static inline void 
init_IO_APIC_traps(vo * so default to an old-fashioned 8259 * interrupt if we can.. */ + printk(KERN_DEBUG "irq%d not serviced by IOAPIC\n", irq); if (irq < 16) make_8259A_irq(irq); else @@ -2034,9 +2085,10 @@ static inline void check_timer(void) * get/set the timer IRQ vector: */ disable_8259A_irq(0); - vector = assign_irq_vector(0); + vector = assign_irq_vector(0, cpu_to_node(smp_processor_id())); + /* This gets reserved on all nodes as FIRST_DEVICE_VECTOR */ set_intr_gate(vector, interrupt[0]); - + /* * Subtle, code in do_timer_interrupt() expects an AEOI * mode for the 8259A whenever interrupts are routed @@ -2291,10 +2343,13 @@ int io_apic_set_pci_routing (int ioapic, { struct IO_APIC_route_entry entry; unsigned long flags; + int node, bus, vector; + + if (irq >= NR_IRQS) + return -ENOSPC; if (!IO_APIC_IRQ(irq)) { - printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0/n", - ioapic); + printk(KERN_ERR "ioapic%d invalid reference to IRQ0/n", ioapic); return -EINVAL; } @@ -2308,23 +2363,32 @@ int io_apic_set_pci_routing (int ioapic, entry.delivery_mode = INT_DELIVERY_MODE; entry.dest_mode = INT_DEST_MODE; - entry.dest.logical.logical_dest = TARGET_CPUS; + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); entry.mask = 1; /* Disabled (masked) */ entry.trigger = 1; /* Level sensitive */ entry.polarity = 1; /* Low active */ add_pin_to_irq(irq, ioapic, pin); + + /* XXX verify this with an x440 and plain ACPI/SMP -zwane */ + bus = mp_irqs[pin].mpc_srcbus; + node = mp_bus_id_to_node[bus]; + + vector = assign_irq_vector(irq, node); + if (vector < 0) + return -ENOSPC; - entry.vector = assign_irq_vector(irq); - - printk(KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> " - "IRQ %d)\n", ioapic, + entry.vector = vector; + printk(KERN_DEBUG "NODE[%d] IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> " + "IRQ %d)\n", node, ioapic, mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq); irq_desc[irq].handler = &ioapic_level_irq_type; - set_intr_gate(entry.vector, interrupt[irq]); - + printk(KERN_DEBUG "irq_route: node%d/bus%d/ioapic%d/vector0x%x - irq%d %p\n", + node, bus, ioapic, entry.vector, irq, interrupt[irq]); + node_set_intr_gate(node, entry.vector, interrupt[irq]); + if (!ioapic && (irq < 16)) disable_8259A_irq(irq); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/irq.c 900-mjb2/arch/i386/kernel/irq.c --- 000-virgin/arch/i386/kernel/irq.c Fri May 30 19:01:59 2003 +++ 900-mjb2/arch/i386/kernel/irq.c Wed Jun 11 22:46:30 2003 @@ -342,7 +342,8 @@ void enable_irq(unsigned int irq) * SMP cross-CPU interrupts have their own specific * handlers). */ -asmlinkage unsigned int do_IRQ(struct pt_regs regs) +struct pt_regs * IRQHANDLER(do_IRQ(struct pt_regs *regs)); +struct pt_regs * do_IRQ(struct pt_regs *regs) { /* * We ack quickly, we don't want the irq controller @@ -354,7 +355,7 @@ asmlinkage unsigned int do_IRQ(struct pt * 0 return value means that this irq is already being * handled by some other CPU. 
(or is disabled) */ - int irq = regs.orig_eax & 0xff; /* high bits used in ret_from_ code */ + int irq = regs->orig_eax & 0xff; /* high bits used in ret_from_ code */ int cpu = smp_processor_id(); irq_desc_t *desc = irq_desc + irq; struct irqaction * action; @@ -419,7 +420,7 @@ asmlinkage unsigned int do_IRQ(struct pt */ for (;;) { spin_unlock(&desc->lock); - handle_IRQ_event(irq, ®s, action); + handle_IRQ_event(irq, regs, action); spin_lock(&desc->lock); if (likely(!(desc->status & IRQ_PENDING))) @@ -438,7 +439,7 @@ out: irq_exit(); - return 1; + return regs; } /** @@ -898,8 +899,9 @@ static int irq_affinity_write_proc (stru return -EINVAL; irq_affinity[irq] = new_value; +#ifndef CONFIG_X86_SUMMIT irq_desc[irq].handler->set_affinity(irq, new_value); - +#endif return full_count; } diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/mpparse.c 900-mjb2/arch/i386/kernel/mpparse.c --- 000-virgin/arch/i386/kernel/mpparse.c Fri May 30 19:01:59 2003 +++ 900-mjb2/arch/i386/kernel/mpparse.c Wed Jun 11 22:42:41 2003 @@ -73,9 +73,6 @@ static unsigned int __initdata num_proce /* Bitmask of physically existing CPUs */ unsigned long phys_cpu_present_map; -#ifndef CONFIG_X86_GENERICARCH -int x86_summit = 0; -#endif u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; /* diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/nmi.c 900-mjb2/arch/i386/kernel/nmi.c --- 000-virgin/arch/i386/kernel/nmi.c Sun Apr 20 19:34:57 2003 +++ 900-mjb2/arch/i386/kernel/nmi.c Wed Jun 11 22:42:58 2003 @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -33,6 +34,20 @@ static unsigned int nmi_hz = HZ; unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ extern void show_registers(struct pt_regs *regs); +#ifdef CONFIG_X86_REMOTE_DEBUG +extern gdb_debug_hook * linux_debug_hook; +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) \ +{ \ + if (linux_debug_hook != (gdb_debug_hook *) NULL && !user_mode(regs)) \ + { \ + (*linux_debug_hook)(trapnr, signr, error_code, regs); \ + after; \ + } \ +} +#else +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) +#endif + #define K7_EVNTSEL_ENABLE (1 << 22) #define K7_EVNTSEL_INT (1 << 20) #define K7_EVNTSEL_OS (1 << 17) @@ -390,12 +405,59 @@ void nmi_watchdog_tick (struct pt_regs * sum = irq_stat[cpu].apic_timer_irqs; if (last_irq_sums[cpu] == sum) { +#ifdef CONFIG_X86_REMOTE_DEBUG +#ifdef CONFIG_SMP + if (spin_is_locked(&kgdb_spinlock)) +#else + if (kgdb_spinlock) +#endif + { + /* We are inside kgdb, this isn't a stuck cpu */ + alert_counter[cpu] = 0; + } else { +#ifdef CONFIG_SMP + if (spin_is_locked(&kgdb_nmispinlock)) +#else + if (kgdb_nmispinlock) +#endif + { + if (!procindebug[cpu]) { + procindebug[cpu] = 1; + current->thread.kgdbregs = regs; + while (1) { + /* nothing */ + } + } + return; + } + } +#endif /* * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... 
*/ alert_counter[cpu]++; if (alert_counter[cpu] == 5*nmi_hz) { +#ifdef CONFIG_X86_REMOTE_DEBUG +#ifdef CONFIG_SMP + if (spin_trylock(&kgdb_nmispinlock)) +#else + kgdb_nmispinlock = 1; +#endif + { + procindebug[cpu] = 1; + CHK_REMOTE_DEBUG(2,SIGBUS,0,regs,) + } +#ifdef CONFIG_SMP + else { + procindebug[cpu] = 1; + current->thread.kgdbregs = regs; + while (1) { + /* nothing */ + } + } +#endif +#endif spin_lock(&nmi_print_lock); /* * We are in trouble anyway, lets at least try diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/process.c 900-mjb2/arch/i386/kernel/process.c --- 000-virgin/arch/i386/kernel/process.c Fri May 30 19:01:59 2003 +++ 900-mjb2/arch/i386/kernel/process.c Wed Jun 11 22:46:30 2003 @@ -160,7 +160,25 @@ static int __init idle_setup (char *str) __setup("idle=", idle_setup); -void show_regs(struct pt_regs * regs) +void stack_overflow(unsigned long esp, unsigned long eip) +{ + int panicing = ((esp&(THREAD_SIZE-1)) <= STACK_PANIC); + + printk( "esp: 0x%lx masked: 0x%lx STACK_PANIC:0x%x %d %d\n", + esp, (esp&(THREAD_SIZE-1)), STACK_PANIC, (((esp&(THREAD_SIZE-1)) <= STACK_PANIC)), panicing ); + + if (panicing) + print_symbol("stack overflow from %s\n", eip); + else + print_symbol("excessive stack use from %s\n", eip); + printk("esp: %p\n", (void*)esp); + show_trace((void*)esp); + + if (panicing) + panic("stack overflow\n"); +} + +asmlinkage void show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; @@ -449,6 +467,7 @@ struct task_struct * __switch_to(struct /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ + next_p->thread_info->irq_stack = prev_p->thread_info->irq_stack; unlazy_fpu(prev_p); /* diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/semaphore.c 900-mjb2/arch/i386/kernel/semaphore.c --- 000-virgin/arch/i386/kernel/semaphore.c Sun Dec 1 10:00:12 2002 +++ 900-mjb2/arch/i386/kernel/semaphore.c Wed Jun 11 22:55:23 2003 @@ -15,6 +15,7 @@ #include #include #include +#include #include /* @@ -53,15 +54,20 @@ void __up(struct semaphore *sem) wake_up(&sem->wait); } -void __down(struct semaphore * sem) +int __down_wq(struct semaphore * sem, wait_queue_t *wait) { struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); + DECLARE_WAITQUEUE(local_wait, tsk); unsigned long flags; - tsk->state = TASK_UNINTERRUPTIBLE; + if (is_sync_wait(wait)) + tsk->state = TASK_UNINTERRUPTIBLE; + if (!wait) { + wait = &local_wait; + } + spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); + add_wait_queue_exclusive_locked(&sem->wait, wait); sem->sleepers++; for (;;) { @@ -79,17 +85,23 @@ void __down(struct semaphore * sem) sem->sleepers = 1; /* us - see -1 above */ spin_unlock_irqrestore(&sem->wait.lock, flags); + if (!is_sync_wait(wait)) + return -EIOCBRETRY; + schedule(); spin_lock_irqsave(&sem->wait.lock, flags); tsk->state = TASK_UNINTERRUPTIBLE; } - remove_wait_queue_locked(&sem->wait, &wait); + if (is_sync_wait(wait) || !list_empty(&wait->task_list)) + remove_wait_queue_locked(&sem->wait, wait); wake_up_locked(&sem->wait); spin_unlock_irqrestore(&sem->wait.lock, flags); tsk->state = TASK_RUNNING; + return 0; } + int __down_interruptible(struct semaphore * sem) { int retval = 0; @@ -189,19 +201,17 @@ int __down_trylock(struct semaphore * se asm( ".text\n" ".align 4\n" -".globl __down_failed\n" -"__down_failed:\n\t" +".globl __down_failed_wq\n" +"__down_failed_wq:\n\t" #if defined(CONFIG_FRAME_POINTER) "pushl %ebp\n\t" "movl %esp,%ebp\n\t" 
#endif - "pushl %eax\n\t" "pushl %edx\n\t" "pushl %ecx\n\t" - "call __down\n\t" + "call __down_wq\n\t" "popl %ecx\n\t" "popl %edx\n\t" - "popl %eax\n\t" #if defined(CONFIG_FRAME_POINTER) "movl %ebp,%esp\n\t" "popl %ebp\n\t" diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/setup.c 900-mjb2/arch/i386/kernel/setup.c --- 000-virgin/arch/i386/kernel/setup.c Fri May 30 19:01:59 2003 +++ 900-mjb2/arch/i386/kernel/setup.c Wed Jun 11 22:47:01 2003 @@ -978,6 +978,9 @@ void __init setup_arch(char **cmdline_p) if (smp_found_config) get_smp_config(); #endif +#ifdef CONFIG_X86_SUMMIT + setup_summit(); +#endif register_memory(max_low_pfn); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/smp.c 900-mjb2/arch/i386/kernel/smp.c --- 000-virgin/arch/i386/kernel/smp.c Fri May 30 19:01:59 2003 +++ 900-mjb2/arch/i386/kernel/smp.c Wed Jun 11 22:46:32 2003 @@ -305,7 +305,8 @@ static inline void leave_mm (unsigned lo * 2) Leave the mm if we are in the lazy tlb mode. */ -asmlinkage void smp_invalidate_interrupt (void) +struct pt_regs * IRQHANDLER(smp_invalidate_interrupt(struct pt_regs *regs)); +struct pt_regs * smp_invalidate_interrupt(struct pt_regs *regs) { unsigned long cpu; @@ -336,6 +337,7 @@ asmlinkage void smp_invalidate_interrupt out: put_cpu_no_resched(); + return regs; } static void flush_tlb_others (unsigned long cpumask, struct mm_struct *mm, @@ -509,10 +511,17 @@ int smp_call_function (void (*func) (voi { struct call_data_struct data; int cpus = num_online_cpus()-1; + int count = 0; + int gdb; - if (!cpus) + if (cpus <= 0) return 0; + gdb = 0; + if (wait == 99) { + wait = 0; + gdb = 1; + } data.func = func; data.info = info; atomic_set(&data.started, 0); @@ -528,12 +537,27 @@ int smp_call_function (void (*func) (voi send_IPI_allbutself(CALL_FUNCTION_VECTOR); /* Wait for response */ - while (atomic_read(&data.started) != cpus) + while (atomic_read(&data.started) != cpus) { + if (gdb) { + if (count++ == 2000000) { + printk("%s: timeout\n", __FUNCTION__); + break; + } + if (count == 1000000) { + printk("looks bad\n"); + printk("cpus=%d, started=%d\n", cpus, + atomic_read(&data.started)); + } + if (count > 1000000) + udelay(1); + } barrier(); + } if (wait) while (atomic_read(&data.finished) != cpus) barrier(); + spin_unlock(&call_lock); return 0; @@ -570,14 +594,17 @@ void smp_send_stop(void) * all the work is done automatically when * we return from the interrupt. 
*/ -asmlinkage void smp_reschedule_interrupt(void) +struct pt_regs * IRQHANDLER(smp_reschedule_interrupt(struct pt_regs *regs)); +struct pt_regs * smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); + return regs; } -asmlinkage void smp_call_function_interrupt(void) +struct pt_regs * IRQHANDLER(smp_call_function_interrupt(struct pt_regs *regs)); +struct pt_regs * smp_call_function_interrupt(struct pt_regs *regs) { - void (*func) (void *info) = call_data->func; + void (*func) (void *info, struct pt_regs *) = (void (*)(void *, struct pt_regs*))call_data->func; void *info = call_data->info; int wait = call_data->wait; @@ -592,12 +619,13 @@ asmlinkage void smp_call_function_interr * At this point the info structure may be out of scope unless wait==1 */ irq_enter(); - (*func)(info); + (*func)(info, regs); irq_exit(); if (wait) { mb(); atomic_inc(&call_data->finished); } + return regs; } diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/smpboot.c 900-mjb2/arch/i386/kernel/smpboot.c --- 000-virgin/arch/i386/kernel/smpboot.c Fri May 30 19:01:59 2003 +++ 900-mjb2/arch/i386/kernel/smpboot.c Wed Jun 11 22:51:54 2003 @@ -45,6 +45,7 @@ #include #include +#include #include #include #include @@ -62,7 +63,7 @@ int smp_num_siblings = 1; int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */ /* Bitmask of currently online CPUs */ -unsigned long cpu_online_map; +unsigned long cpu_online_map = 1; static volatile unsigned long cpu_callin_map; volatile unsigned long cpu_callout_map; @@ -71,6 +72,11 @@ static unsigned long smp_commenced_mask; /* Per CPU bogomips and other parameters */ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; +/* Per CPU interrupt stacks */ +extern union thread_union init_irq_union; +union thread_union *irq_stacks[NR_CPUS] __cacheline_aligned = + { &init_irq_union, }; + /* Set when the idlers are all forked */ int smp_threads_ready; @@ -442,6 +448,7 @@ int __init start_secondary(void *unused) */ cpu_init(); smp_callin(); + setup_cpu_idt(); while (!test_bit(smp_processor_id(), &smp_commenced_mask)) rep_nop(); setup_secondary_APIC_clock(); @@ -770,6 +777,28 @@ wakeup_secondary_cpu(int phys_apicid, un } #endif /* WAKE_SECONDARY_VIA_INIT */ +static void __init setup_irq_stack(struct task_struct *p, int cpu) +{ + unsigned long stk; + + stk = __get_free_pages(GFP_KERNEL, THREAD_ORDER); + if (!stk) + panic("I can't seem to allocate my irq stack. Oh well, giving up."); + + irq_stacks[cpu] = (void *)stk; + memset(irq_stacks[cpu], 0, THREAD_SIZE); + irq_stacks[cpu]->thread_info.cpu = cpu; + irq_stacks[cpu]->thread_info.preempt_count = 1; + /* interrupts are not preemptable */ + p->thread_info->irq_stack = &irq_stacks[cpu]->thread_info; + + /* If we want to make the irq stack more than one unit + * deep, we can chain then off of the irq_stack pointer + * here. + */ +} + + extern unsigned long cpu_initialized; static int __init do_boot_cpu(int apicid) @@ -793,6 +822,7 @@ static int __init do_boot_cpu(int apicid idle = fork_by_hand(); if (IS_ERR(idle)) panic("failed fork for CPU %d", cpu); + setup_irq_stack(idle, cpu); wake_up_forked_process(idle); /* @@ -949,7 +979,7 @@ static void __init smp_boot_cpus(unsigne current_thread_info()->cpu = 0; smp_tune_scheduling(); - + /* * If we couldn't find an SMP configuration at boot time, * get out of here now! 
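/*
 * For reference, a condensed sketch of how the pieces fit together:
 * setup_summit() below fills in the mp_bus_id_to_node[] map, and the
 * io_apic.c hunks earlier in this patch consume it by picking the node that
 * owns an interrupt's source bus, allocating a vector from that node's pool,
 * and installing the gate only in that node's IDT.  The extern declarations
 * and the helper name route_irq_sketch() are illustrative; in the patch
 * assign_irq_vector() is static to io_apic.c.
 */
extern int mp_bus_id_to_node[];				/* built by setup_summit() */
extern int assign_irq_vector(int irq, int node);	/* per-node vector allocator */
extern void node_set_intr_gate(unsigned int node, unsigned int n, void *addr);

static int route_irq_sketch(int irq, int srcbus, void *handler)
{
	int node = mp_bus_id_to_node[srcbus];		/* node that owns the source bus */
	int vector = assign_irq_vector(irq, node);

	if (vector < 0)
		return vector;				/* that node's vector space is exhausted */

	node_set_intr_gate(node, vector, handler);	/* gate lives only in that node's IDT */
	return vector;
}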
diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/summit.c 900-mjb2/arch/i386/kernel/summit.c --- 000-virgin/arch/i386/kernel/summit.c Wed Dec 31 16:00:00 1969 +++ 900-mjb2/arch/i386/kernel/summit.c Wed Jun 11 22:47:01 2003 @@ -0,0 +1,162 @@ +/* + * arch/i386/kernel/summit.c - IBM Summit-Specific Code + * + * Written By: Matthew Dobson, IBM Corporation + * + * Copyright (c) 2003 IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to + * + */ + +#include +#include +#include +#include + +static void __init setup_pci_node_map_for_wpeg(int wpeg_num, struct rio_table_hdr *rth, + struct scal_detail **scal_nodes, struct rio_detail **rio_nodes){ + int twst_num = 0, node = 0, first_bus = 0; + int i, bus, num_busses; + + for(i = 0; i < rth->num_rio_dev; i++){ + if (rio_nodes[i]->node_id == rio_nodes[wpeg_num]->owner_id){ + twst_num = rio_nodes[i]->owner_id; + break; + } + } + if (i == rth->num_rio_dev){ + printk("%s: Couldn't find owner Cyclone for Winnipeg!\n", __FUNCTION__); + return; + } + + for(i = 0; i < rth->num_scal_dev; i++){ + if (scal_nodes[i]->node_id == twst_num){ + node = scal_nodes[i]->node_id; + break; + } + } + if (i == rth->num_scal_dev){ + printk("%s: Couldn't find owner Twister for Cyclone!\n", __FUNCTION__); + return; + } + + switch (rio_nodes[wpeg_num]->type){ + case CompatWPEG: + /* The Compatability Winnipeg controls the legacy busses + (busses 0 & 1), the 66MHz PCI bus [2 slots] (bus 2), + and the "extra" busses in case a PCI-PCI bridge card is + used in either slot (busses 3 & 4): total 5 busses. */ + num_busses = 5; + /* The BIOS numbers the busses starting at 1, and in a + slightly wierd manner. You'll have to trust that + the math used below to determine the number of the + first bus works. */ + first_bus = (rio_nodes[wpeg_num]->first_slot - 1) * 2; + break; + case AltWPEG: + /* The Alternate/Secondary Winnipeg controls the 1st 133MHz + bus [1 slot] & its "extra" bus (busses 0 & 1), the 2nd + 133MHz bus [1 slot] & its "extra" bus (busses 2 & 3), the + 100MHz bus [2 slots] (bus 4), and the "extra" busses for + the 2 100MHz slots (busses 5 & 6): total 7 busses. 
*/ + num_busses = 7; + first_bus = (rio_nodes[wpeg_num]->first_slot * 2) - 1; + break; + case LookOutAWPEG: + case LookOutBWPEG: + printk("%s: LookOut Winnipegs not supported yet!\n", __FUNCTION__); + return; + default: + printk("%s: Unsupported Winnipeg type!\n", __FUNCTION__); + return; + } + + for(bus = first_bus; bus < first_bus + num_busses; bus++) + mp_bus_id_to_node[bus] = node; +} + +static void __init build_detail_arrays(struct rio_table_hdr *rth, + struct scal_detail **sd, struct rio_detail **rd){ + unsigned long ptr; + int i, scal_detail_size, rio_detail_size; + + switch (rth->version){ + default: + printk("%s: Bad Rio Grande Table Version: %d\n", __FUNCTION__, rth->version); + /* Fall through to default to version 2 spec */ + case 2: + scal_detail_size = 11; + rio_detail_size = 13; + break; + case 3: + scal_detail_size = 12; + rio_detail_size = 15; + break; + } + + ptr = (unsigned long)rth + 3; + for(i = 0; i < rth->num_scal_dev; i++) + sd[i] = (struct scal_detail *)(ptr + (scal_detail_size * i)); + + ptr += scal_detail_size * rth->num_scal_dev; + for(i = 0; i < rth->num_rio_dev; i++) + rd[i] = (struct rio_detail *)(ptr + (rio_detail_size * i)); +} + +void __init setup_summit(void) +{ + struct rio_table_hdr *rio_table_hdr = NULL; + struct scal_detail *scal_devs[MAX_NUMNODES]; + struct rio_detail *rio_devs[MAX_NUMNODES*2]; + unsigned long ptr; + unsigned short offset; + int i; + + memset(mp_bus_id_to_node, -1, sizeof(mp_bus_id_to_node)); + + /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */ + ptr = *(unsigned short *)phys_to_virt(0x40Eul); + ptr = (unsigned long)phys_to_virt(ptr << 4); + + offset = 0x180; + while (offset){ + /* The block id is stored in the 2nd word */ + if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){ + /* set the pointer past the offset & block id */ + rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4); + break; + } + /* The next offset is stored in the 1st word. 0 means no more */ + offset = *((unsigned short *)(ptr + offset)); + } + if (!rio_table_hdr){ + printk("%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __FUNCTION__); + return; + } + + /* Deal with the ugly version 2/3 pointer arithmetic */ + build_detail_arrays(rio_table_hdr, scal_devs, rio_devs); + + for(i = 0; i < rio_table_hdr->num_rio_dev; i++) + if (is_WPEG(rio_devs[i]->type)) + /* It's a Winnipeg, it's got PCI Busses */ + setup_pci_node_map_for_wpeg(i, rio_table_hdr, scal_devs, rio_devs); +} diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/traps.c 900-mjb2/arch/i386/kernel/traps.c --- 000-virgin/arch/i386/kernel/traps.c Fri May 30 19:01:59 2003 +++ 900-mjb2/arch/i386/kernel/traps.c Wed Jun 11 22:51:54 2003 @@ -30,6 +30,7 @@ #include #endif +#include #ifdef CONFIG_MCA #include #include @@ -53,6 +54,24 @@ #include "mach_traps.h" +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif + +#ifdef CONFIG_X86_REMOTE_DEBUG +gdb_debug_hook * linux_debug_hook; +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) \ + { \ + if (linux_debug_hook != (gdb_debug_hook *) NULL && !user_mode(regs)) \ + { \ + (*linux_debug_hook)(trapnr, signr, error_code, regs) ; \ + after; \ + } \ + } +#else +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) +#endif + asmlinkage int system_call(void); asmlinkage void lcall7(void); asmlinkage void lcall27(void); @@ -68,7 +87,9 @@ char ignore_fpu_irq = 0; * F0 0F bug workaround.. We have a special link segment * for this. 
*/ -struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, }; + +struct desc_struct node_idt_table[MAX_NUMNODES][IDT_ENTRIES] __attribute__((__section__(".data.idt"))) = + {[0 ... MAX_NUMNODES-1] = { {0, 0}, }}; asmlinkage void divide_error(void); asmlinkage void debug(void); @@ -259,6 +280,7 @@ void die(const char * str, struct pt_reg bust_spinlocks(1); handle_BUG(regs); printk("%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); + CHK_REMOTE_DEBUG(1,SIGTRAP,err,regs,); show_registers(regs); bust_spinlocks(0); spin_unlock_irq(&die_lock); @@ -328,6 +350,7 @@ static inline void do_trap(int trapnr, i #define DO_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ + CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,)\ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ } @@ -345,7 +368,9 @@ asmlinkage void do_##name(struct pt_regs #define DO_VM86_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ + CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,return)\ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ + return; \ } #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ @@ -388,8 +413,10 @@ gp_in_vm86: return; gp_in_kernel: - if (!fixup_exception(regs)) + if (!fixup_exception(regs)) { + CHK_REMOTE_DEBUG(13,SIGSEGV,error_code,regs,) die("general protection fault", regs, error_code); + } } static void mem_parity_error(unsigned char reason, struct pt_regs * regs) @@ -551,8 +578,10 @@ asmlinkage void do_debug(struct pt_regs * allowing programs to debug themselves without the ptrace() * interface. */ +#ifndef CONFIG_X86_REMOTE_DEBUG if ((regs->xcs & 3) == 0) goto clear_TF_reenable; +#endif if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE) goto clear_TF; } @@ -564,11 +593,13 @@ asmlinkage void do_debug(struct pt_regs info.si_errno = 0; info.si_code = TRAP_BRKPT; - /* If this is a kernel mode trap, save the user PC on entry to - * the kernel, that's what the debugger can make sense of. - */ - info.si_addr = ((regs->xcs & 3) == 0) ? (void *)tsk->thread.eip : - (void *)regs->eip; + + /* If this is a kernel mode trap, we need to reset db7 to allow us + * to continue sanely */ + if ((regs->xcs & 3) == 0) + goto clear_dr7; + + info.si_addr = (void *)regs->eip; force_sig_info(SIGTRAP, &info, tsk); /* Disable additional traps. They'll be re-enabled when @@ -578,13 +609,16 @@ clear_dr7: __asm__("movl %0,%%db7" : /* no output */ : "r" (0)); + CHK_REMOTE_DEBUG(1,SIGTRAP,error_code,regs,) return; debug_vm86: handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); return; +#ifndef CONFIG_X86_REMOTE_DEBUG clear_TF_reenable: +#endif set_tsk_thread_flag(tsk, TIF_SINGLESTEP); clear_TF: regs->eflags &= ~TF_MASK; @@ -774,14 +808,16 @@ asmlinkage void math_emulate(long arg) #ifdef CONFIG_X86_F00F_BUG void __init trap_init_f00f_bug(void) { - __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); + int node = cpu_to_node(smp_processor_id()); + + __set_fixmap(FIX_F00F_IDT, __pa(&node_idt_table[node]), PAGE_KERNEL_RO); /* * Update the IDT descriptor and reload the IDT so that * it uses the read-only mapped virtual address. 
*/ - idt_descr.address = fix_to_virt(FIX_F00F_IDT); - __asm__ __volatile__("lidt %0": "=m" (idt_descr)); + node_idt_descr[node].address = fix_to_virt(FIX_F00F_IDT); + __asm__ __volatile__("lidt %0": "=m" (node_idt_descr[node])); } #endif @@ -800,24 +836,36 @@ do { \ /* - * This needs to use 'idt_table' rather than 'idt', and + * This needs to use 'node_idt_table' rather than 'idt', and * thus use the _nonmapped_ version of the IDT, as the * Pentium F0 0F bugfix can have resulted in the mapped * IDT being write-protected. */ + +void node_set_intr_gate(unsigned int node, unsigned int n, void *addr) +{ + _set_gate(&node_idt_table[node][n],14,0,addr,__KERNEL_CS); +} + void set_intr_gate(unsigned int n, void *addr) { - _set_gate(idt_table+n,14,0,addr,__KERNEL_CS); + int node; + for (node = 0; node < MAX_NUMNODES; node++) + node_set_intr_gate(node, n, addr); } static void __init set_trap_gate(unsigned int n, void *addr) { - _set_gate(idt_table+n,15,0,addr,__KERNEL_CS); + int node; + for (node = 0; node < MAX_NUMNODES; node++) + _set_gate(&node_idt_table[node][n],15,0,addr,__KERNEL_CS); } static void __init set_system_gate(unsigned int n, void *addr) { - _set_gate(idt_table+n,15,3,addr,__KERNEL_CS); + int node; + for (node = 0; node < MAX_NUMNODES; node++) + _set_gate(&node_idt_table[node][n],15,3,addr,__KERNEL_CS); } static void __init set_call_gate(void *a, void *addr) @@ -827,7 +875,9 @@ static void __init set_call_gate(void *a static void __init set_task_gate(unsigned int n, unsigned int gdt_entry) { - _set_gate(idt_table+n,5,0,0,(gdt_entry<<3)); + int node; + for (node = 0; node < MAX_NUMNODES; node++) + _set_gate(&node_idt_table[node][n],5,0,0,(gdt_entry<<3)); } @@ -878,6 +928,9 @@ void __init trap_init(void) */ set_call_gate(&default_ldt[0],lcall7); set_call_gate(&default_ldt[4],lcall27); + + /* setup the pernode idt tables */ + setup_node_idts(); /* * Should be a barrier for any external CPU state. diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/lib/dec_and_lock.c 900-mjb2/arch/i386/lib/dec_and_lock.c --- 000-virgin/arch/i386/lib/dec_and_lock.c Sun Nov 17 20:29:28 2002 +++ 900-mjb2/arch/i386/lib/dec_and_lock.c Wed Jun 11 22:47:01 2003 @@ -10,6 +10,7 @@ #include #include +#ifndef ATOMIC_DEC_AND_LOCK int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) { int counter; @@ -38,3 +39,5 @@ slow_path: spin_unlock(lock); return 0; } +#endif + diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/mm/fault.c 900-mjb2/arch/i386/mm/fault.c --- 000-virgin/arch/i386/mm/fault.c Fri May 30 19:01:59 2003 +++ 900-mjb2/arch/i386/mm/fault.c Wed Jun 11 22:51:54 2003 @@ -2,6 +2,11 @@ * linux/arch/i386/mm/fault.c * * Copyright (C) 1995 Linus Torvalds + * + * Change History + * + * Tigran Aivazian Remote debugging support. 
+ * */ #include @@ -20,6 +25,9 @@ #include #include /* For unblank_screen() */ #include +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif #include #include @@ -112,6 +120,15 @@ asmlinkage void do_page_fault(struct pt_ if (in_atomic() || !mm) goto no_context; +#ifdef CONFIG_X86_REMOTE_DEBUG + if (kgdb_memerr_expected) { + if (linux_debug_hook != (gdb_debug_hook *) NULL) { + (*linux_debug_hook)(14, SIGSEGV, error_code, regs) ; + return; /* return w/modified regs */ + } + } +#endif + down_read(&mm->mmap_sem); vma = find_vma(mm, address); @@ -211,14 +228,27 @@ bad_area: return; } +#ifdef CONFIG_X86_REMOTE_DEBUG + if (kgdb_memerr_expected) { + if (linux_debug_hook != (gdb_debug_hook *) NULL) { + (*linux_debug_hook)(14, SIGSEGV, error_code, regs); + return; /* Return with modified registers */ + } + } else { + if (linux_debug_hook != (gdb_debug_hook *) NULL) { + (*linux_debug_hook)(14, SIGSEGV, error_code, regs); + } + } +#endif + #ifdef CONFIG_X86_F00F_BUG /* * Pentium F0 0F C7 C8 bug workaround. */ if (boot_cpu_data.f00f_bug) { - unsigned long nr; - - nr = (address - idt_descr.address) >> 3; + unsigned long nr, node; + node = cpu_to_node(smp_processor_id()); + nr = (address - node_idt_descr[node].address) >> 3; if (nr == 6) { do_invalid_op(regs, 0); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/vmlinux.lds.S 900-mjb2/arch/i386/vmlinux.lds.S --- 000-virgin/arch/i386/vmlinux.lds.S Fri May 30 19:01:59 2003 +++ 900-mjb2/arch/i386/vmlinux.lds.S Wed Jun 11 22:42:37 2003 @@ -10,7 +10,7 @@ ENTRY(startup_32) jiffies = jiffies_64; SECTIONS { - . = 0xC0000000 + 0x100000; + . = __PAGE_OFFSET + 0x100000; /* read-only */ _text = .; /* Text and read-only data */ .text : { diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/ppc64/kernel/ppc_ksyms.c 900-mjb2/arch/ppc64/kernel/ppc_ksyms.c --- 000-virgin/arch/ppc64/kernel/ppc_ksyms.c Tue Feb 25 23:03:44 2003 +++ 900-mjb2/arch/ppc64/kernel/ppc_ksyms.c Wed Jun 11 22:55:25 2003 @@ -91,7 +91,7 @@ EXPORT_SYMBOL(strncmp); EXPORT_SYMBOL(__down_interruptible); EXPORT_SYMBOL(__up); EXPORT_SYMBOL(naca); -EXPORT_SYMBOL(__down); +EXPORT_SYMBOL(__down_wq); /* EXPORT_SYMBOL(csum_partial); already in net/netsyms.c */ EXPORT_SYMBOL(csum_partial_copy_generic); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/ppc64/kernel/semaphore.c 900-mjb2/arch/ppc64/kernel/semaphore.c --- 000-virgin/arch/ppc64/kernel/semaphore.c Sun Nov 17 20:29:22 2002 +++ 900-mjb2/arch/ppc64/kernel/semaphore.c Wed Jun 11 22:55:25 2003 @@ -70,13 +70,18 @@ void __up(struct semaphore *sem) * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. */ -void __down(struct semaphore *sem) +int __down_wq(struct semaphore *sem, wait_queue_t *wait) { struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); + DECLARE_WAITQUEUE(local_wait, tsk); + unsigned long flags; - tsk->state = TASK_UNINTERRUPTIBLE; - add_wait_queue_exclusive(&sem->wait, &wait); + if (!is_sync_wait(wait)) + tsk->state = TASK_UNINTERRUPTIBLE; + if (!wait) + wait = &local_wait; + + add_wait_queue_exclusive(&sem->wait, wait); smp_wmb(); /* @@ -86,10 +91,15 @@ void __down(struct semaphore *sem) * that we are asleep, and then sleep. 
*/ while (__sem_update_count(sem, -1) <= 0) { + if (!is_sync_wait(wait)) + return -EIOCBRETRY; schedule(); tsk->state = TASK_UNINTERRUPTIBLE; } - remove_wait_queue(&sem->wait, &wait); + spin_lock_irqsave(&sem->wait.lock, flags) + if (is_sync_wait(wait) || !list_empty(&wait->task_list)) + remove_wait_queue_locked(&sem->wait, wait); + spin_unlock_irqrestore(&sem->wait.lock, flags); tsk->state = TASK_RUNNING; /* @@ -98,6 +108,8 @@ void __down(struct semaphore *sem) * indicating that there are still processes sleeping. */ wake_up(&sem->wait); + + return 0; } int __down_interruptible(struct semaphore * sem) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/sparc64/kernel/devices.c 900-mjb2/arch/sparc64/kernel/devices.c --- 000-virgin/arch/sparc64/kernel/devices.c Sat May 10 18:34:35 2003 +++ 900-mjb2/arch/sparc64/kernel/devices.c Wed Jun 11 22:47:01 2003 @@ -31,6 +31,8 @@ int linux_num_cpus = 0; extern void cpu_probe(void); extern void central_probe(void); +unsigned long cpu_hz; + void __init device_scan(void) { char node_str[128]; @@ -68,6 +70,8 @@ void __init device_scan(void) prom_getproperty(scan, "portid", (char *) &thismid, sizeof(thismid)); } + if (!cpu_hz) + cpu_hz = prom_getint(scan, "clock-frequency"); linux_cpus[cpu_ctr].mid = thismid; printk("Found CPU %d (node=%08x,mid=%d)\n", cpu_ctr, (unsigned) scan, thismid); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/sparc64/kernel/rtrap.S 900-mjb2/arch/sparc64/kernel/rtrap.S --- 000-virgin/arch/sparc64/kernel/rtrap.S Sat May 10 18:34:35 2003 +++ 900-mjb2/arch/sparc64/kernel/rtrap.S Wed Jun 11 22:47:03 2003 @@ -15,6 +15,10 @@ #include #include +#ifndef CONFIG_KGDB_THREAD +#define user_schedule schedule +#endif + #define RTRAP_PSTATE (PSTATE_RMO|PSTATE_PEF|PSTATE_PRIV|PSTATE_IE) #define RTRAP_PSTATE_IRQOFF (PSTATE_RMO|PSTATE_PEF|PSTATE_PRIV) #define RTRAP_PSTATE_AG_IRQOFF (PSTATE_RMO|PSTATE_PEF|PSTATE_PRIV|PSTATE_AG) @@ -33,7 +37,7 @@ __handle_softirq: ba,a,pt %xcc, __handle_softirq_continue nop __handle_preemption: - call schedule + call user_schedule wrpr %g0, RTRAP_PSTATE, %pstate ba,pt %xcc, __handle_preemption_continue wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate @@ -48,7 +52,7 @@ __handle_user_windows: be,pt %xcc, 1f nop - call schedule + call user_schedule wrpr %g0, RTRAP_PSTATE, %pstate wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate ldx [%g6 + TI_FLAGS], %l0 @@ -92,7 +96,7 @@ __handle_perfctrs: be,pt %xcc, 1f nop - call schedule + call user_schedule wrpr %g0, RTRAP_PSTATE, %pstate wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate ldx [%g6 + TI_FLAGS], %l0 @@ -273,7 +277,7 @@ to_kernel: sethi %hi(PREEMPT_ACTIVE), %l6 stw %l6, [%g6 + TI_PRE_COUNT] wrpr 0, %pil - call schedule + call user_schedule nop ba,pt %xcc, rtrap stw %g0, [%g6 + TI_PRE_COUNT] diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/sparc64/lib/rwlock.S 900-mjb2/arch/sparc64/lib/rwlock.S --- 000-virgin/arch/sparc64/lib/rwlock.S Sun Nov 17 20:29:44 2002 +++ 900-mjb2/arch/sparc64/lib/rwlock.S Wed Jun 11 22:47:01 2003 @@ -63,5 +63,33 @@ __write_lock: /* %o0 = lock_ptr */ be,pt %icc, 99b membar #StoreLoad | #StoreStore ba,a,pt %xcc, 1b + + .globl __read_trylock +__read_trylock: /* %o0 = lock_ptr */ + ldsw [%o0], %g5 + brlz,pn %g5, 100f + add %g5, 1, %g7 + cas [%o0], %g5, %g7 + cmp %g5, %g7 + bne,pn %icc, __read_trylock + membar #StoreLoad | #StoreStore + retl + mov 1, %o0 + + .globl __write_trylock +__write_trylock: /* %o0 = lock_ptr */ + sethi %hi(0x80000000), %g2 +1: lduw [%o0], %g5 +4: brnz,pn %g5, 100f + or %g5, %g2, %g7 + cas [%o0], %g5, %g7 + cmp %g5, %g7 + bne,pn %icc, 
1b + membar #StoreLoad | #StoreStore + retl + mov 1, %o0 +100: retl + mov 0, %o0 + rwlock_impl_end: diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/x86_64/kernel/entry.S 900-mjb2/arch/x86_64/kernel/entry.S --- 000-virgin/arch/x86_64/kernel/entry.S Fri May 30 19:02:02 2003 +++ 900-mjb2/arch/x86_64/kernel/entry.S Wed Jun 11 22:47:03 2003 @@ -46,6 +46,10 @@ #define PDAREF(field) %gs:field +#ifndef CONFIG_KGDB_THREAD +#define user_schedule schedule +#endif + #ifdef CONFIG_PREEMPT #define preempt_stop cli #else @@ -187,7 +191,7 @@ sysret_careful: jnc sysret_signal sti pushq %rdi - call schedule + call user_schedule popq %rdi jmp sysret_check @@ -256,7 +260,7 @@ int_careful: jnc int_very_careful sti pushq %rdi - call schedule + call user_schedule popq %rdi jmp int_with_check @@ -426,7 +430,7 @@ retint_careful: jnc retint_signal sti pushq %rdi - call schedule + call user_schedule popq %rdi GET_THREAD_INFO(%rcx) cli @@ -460,7 +464,7 @@ retint_kernel: jc retint_restore_args movl $PREEMPT_ACTIVE,threadinfo_preempt_count(%rcx) sti - call schedule + call user_schedule cli GET_THREAD_INFO(%rcx) movl $0,threadinfo_preempt_count(%rcx) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/x86_64/kernel/semaphore.c 900-mjb2/arch/x86_64/kernel/semaphore.c --- 000-virgin/arch/x86_64/kernel/semaphore.c Sun Nov 17 20:29:59 2002 +++ 900-mjb2/arch/x86_64/kernel/semaphore.c Wed Jun 11 22:55:26 2003 @@ -54,15 +54,20 @@ void __up(struct semaphore *sem) wake_up(&sem->wait); } -void __down(struct semaphore * sem) +int __down_wq(struct semaphore * sem, wait_queue_t *wait) { struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); + DECLARE_WAITQUEUE(local_wait, tsk); unsigned long flags; - tsk->state = TASK_UNINTERRUPTIBLE; + if (is_sync_wait(wait)) + tsk->state = TASK_UNINTERRUPTIBLE; + if (!wait) { + wait = &local_wait; + } + spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); + add_wait_queue_exclusive_locked(&sem->wait, wait); sem->sleepers++; for (;;) { @@ -80,16 +85,22 @@ void __down(struct semaphore * sem) sem->sleepers = 1; /* us - see -1 above */ spin_unlock_irqrestore(&sem->wait.lock, flags); + if (!is_sync_wait(wait)) + return -EIOCBRETRY; + schedule(); spin_lock_irqsave(&sem->wait.lock, flags); tsk->state = TASK_UNINTERRUPTIBLE; } - remove_wait_queue_locked(&sem->wait, &wait); + if (is_sync_wait(wait) || !list_empty(&wait->task_list)) + remove_wait_queue_locked(&sem->wait, wait); wake_up_locked(&sem->wait); spin_unlock_irqrestore(&sem->wait.lock, flags); tsk->state = TASK_RUNNING; + return 0; } + int __down_interruptible(struct semaphore * sem) { diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/x86_64/kernel/x8664_ksyms.c 900-mjb2/arch/x86_64/kernel/x8664_ksyms.c --- 000-virgin/arch/x86_64/kernel/x8664_ksyms.c Fri May 30 19:02:02 2003 +++ 900-mjb2/arch/x86_64/kernel/x8664_ksyms.c Wed Jun 11 22:55:26 2003 @@ -64,7 +64,7 @@ EXPORT_SYMBOL(get_cmos_time); EXPORT_SYMBOL(__io_virt_debug); #endif -EXPORT_SYMBOL_NOVERS(__down_failed); +EXPORT_SYMBOL_NOVERS(__down_failed_wq); EXPORT_SYMBOL_NOVERS(__down_failed_interruptible); EXPORT_SYMBOL_NOVERS(__down_failed_trylock); EXPORT_SYMBOL_NOVERS(__up_wakeup); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/x86_64/lib/thunk.S 900-mjb2/arch/x86_64/lib/thunk.S --- 000-virgin/arch/x86_64/lib/thunk.S Sun Nov 17 20:29:44 2002 +++ 900-mjb2/arch/x86_64/lib/thunk.S Wed Jun 11 22:55:26 2003 @@ -38,7 +38,7 @@ #endif thunk do_softirq_thunk,do_softirq - thunk __down_failed,__down + thunk 
__down_failed_wq,__down_wq thunk_retrax __down_failed_interruptible,__down_interruptible thunk_retrax __down_failed_trylock,__down_trylock thunk __up_wakeup,__up diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/block/ll_rw_blk.c 900-mjb2/drivers/block/ll_rw_blk.c --- 000-virgin/drivers/block/ll_rw_blk.c Fri May 30 19:02:03 2003 +++ 900-mjb2/drivers/block/ll_rw_blk.c Wed Jun 11 22:55:22 2003 @@ -1501,16 +1501,29 @@ void blk_put_request(struct request *req * If no queues are congested then just wait for the next request to be * returned. */ -void blk_congestion_wait(int rw, long timeout) +int blk_congestion_wait_wq(int rw, long timeout, wait_queue_t *wait) { - DEFINE_WAIT(wait); wait_queue_head_t *wqh = &congestion_wqh[rw]; + DEFINE_WAIT(local_wait); + + if (!wait) + wait = &local_wait; blk_run_queues(); - prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wqh, wait, TASK_UNINTERRUPTIBLE); + if (!is_sync_wait(wait)) + return -EIOCBRETRY; + io_schedule_timeout(timeout); - finish_wait(wqh, &wait); + finish_wait(wqh, wait); + return 0; +} + +void blk_congestion_wait(int rw, long timeout) +{ + blk_congestion_wait_wq(rw, timeout, NULL); } + /* * Has to be called with the request spinlock acquired diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/char/Makefile 900-mjb2/drivers/char/Makefile --- 000-virgin/drivers/char/Makefile Wed Mar 26 22:54:30 2003 +++ 900-mjb2/drivers/char/Makefile Wed Jun 11 22:42:59 2003 @@ -25,6 +25,7 @@ obj-$(CONFIG_COMPUTONE) += ip2.o ip2main obj-$(CONFIG_RISCOM8) += riscom8.o obj-$(CONFIG_ISI) += isicom.o obj-$(CONFIG_ESPSERIAL) += esp.o +obj-$(CONFIG_X86_REMOTE_DEBUG) += gdbserial.o obj-$(CONFIG_SYNCLINK) += synclink.o obj-$(CONFIG_SYNCLINKMP) += synclinkmp.o obj-$(CONFIG_N_HDLC) += n_hdlc.o diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/char/gdbserial.c 900-mjb2/drivers/char/gdbserial.c --- 000-virgin/drivers/char/gdbserial.c Wed Dec 31 16:00:00 1969 +++ 900-mjb2/drivers/char/gdbserial.c Wed Jun 11 22:42:59 2003 @@ -0,0 +1,274 @@ +/* + * Serial interface GDB stub + * + * Written (hacked together) by David Grothe (dave@gcom.com) + * + * Modified by Scott Foehner (sfoehner@engr.sgi.com) to allow connect + * on boot-up + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#undef PRNT /* define for debug printing */ + +#define GDB_BUF_SIZE 512 /* power of 2, please */ + +static char gdb_buf[GDB_BUF_SIZE]; +static int gdb_buf_in_inx; +static atomic_t gdb_buf_in_cnt; +static int gdb_buf_out_inx; + +extern void set_debug_traps(void); /* GDB routine */ +extern int gdb_serial_setup(int ttyS, int baud, int *port, int *irq); +extern void shutdown_for_gdb(struct async_struct *info); + /* in serial.c */ + +int gdb_irq; +int gdb_port; +int gdb_ttyS = 1; /* Default: ttyS1 */ +int gdb_baud = 38400; +int gdb_enter = 0; /* Default: do not do gdb_hook on boot */ +int gdb_initialized = 0; + +static int initialized = -1; + +/* + * Get a byte from the hardware data buffer and return it + */ +static int +read_data_bfr(void) +{ + if (inb(gdb_port + UART_LSR) & UART_LSR_DR) + return (inb(gdb_port + UART_RX)); + + return (-1); + +} /* read_data_bfr */ + +/* + * Get a char if available, return -1 if nothing available. + * Empty the receive buffer first, then look at the interface hardware. 
+ */ +static int +read_char(void) +{ + if (atomic_read(&gdb_buf_in_cnt) != 0) { /* intr routine has q'd chars */ + int chr; + + chr = gdb_buf[gdb_buf_out_inx++]; + gdb_buf_out_inx &= (GDB_BUF_SIZE - 1); + atomic_dec(&gdb_buf_in_cnt); + return (chr); + } + + return (read_data_bfr()); /* read from hardware */ + +} /* read_char */ + +/* + * Wait until the interface can accept a char, then write it. + */ +static void +write_char(int chr) +{ + while (!(inb(gdb_port + UART_LSR) & UART_LSR_THRE)) ; + + outb(chr, gdb_port + UART_TX); + +} /* write_char */ + +/* + * This is the receiver interrupt routine for the GDB stub. + * It will receive a limited number of characters of input + * from the gdb host machine and save them up in a buffer. + * + * When the gdb stub routine getDebugChar() is called it + * draws characters out of the buffer until it is empty and + * then reads directly from the serial port. + * + * We do not attempt to write chars from the interrupt routine + * since the stubs do all of that via putDebugChar() which + * writes one byte after waiting for the interface to become + * ready. + * + * The debug stubs like to run with interrupts disabled since, + * after all, they run as a consequence of a breakpoint in + * the kernel. + * + * Perhaps someone who knows more about the tty driver than I + * care to learn can make this work for any low level serial + * driver. + */ +static void +gdb_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + int chr; + int iir; + + do { + chr = read_data_bfr(); + iir = inb(gdb_port + UART_IIR); +#ifdef PRNT + printk("gdb_interrupt: chr=%02x '%c' after read iir=%02x\n", + chr, chr > ' ' && chr < 0x7F ? chr : ' ', iir); +#endif + if (chr < 0) + continue; + + if (chr == 3) { /* Ctrl-C means remote interrupt */ + breakpoint(); + continue; + } + + if (atomic_read(&gdb_buf_in_cnt) >= GDB_BUF_SIZE) { /* buffer overflow, clear it */ + gdb_buf_in_inx = 0; + atomic_set(&gdb_buf_in_cnt, 0); + gdb_buf_out_inx = 0; + break; + } + + gdb_buf[gdb_buf_in_inx++] = chr; + gdb_buf_in_inx &= (GDB_BUF_SIZE - 1); + atomic_inc(&gdb_buf_in_cnt); + } + while (iir & UART_IIR_RDI); + +} /* gdb_interrupt */ + +/* + * Just a NULL routine for testing. + */ +void +gdb_null(void) +{ +} /* gdb_null */ + +extern int serial8250_init(void); + +int +gdb_hook(void) +{ + int retval; + +#ifdef CONFIG_SMP + if (NR_CPUS > KGDB_MAX_NO_CPUS) { + printk + ("kgdb: too manu cpus. Cannot enable debugger with more than 8 cpus\n"); + return (-1); + } +#endif + + /* + * Call first time just to get the ser ptr + */ + + serial8250_init(); + + if (gdb_serial_setup(gdb_ttyS, gdb_baud, &gdb_port, &gdb_irq)) { + printk("gdb_serial_setup() error"); + return (-1); + } + + retval = request_irq(gdb_irq, + gdb_interrupt, SA_INTERRUPT, "GDB-stub", NULL); + if (retval == 0) + initialized = 1; + else { + initialized = 0; + printk("gdb_hook: request_irq(irq=%d) failed: %d\n", gdb_irq, + retval); + } + + /* + * Call GDB routine to setup the exception vectors for the debugger + */ + set_debug_traps(); + + /* + * Call the breakpoint() routine in GDB to start the debugging + * session. + */ + printk("Waiting for connection from remote gdb... "); + breakpoint(); + gdb_null(); + + printk("Connected.\n"); + + gdb_initialized = 1; + return (0); + +} /* gdb_hook_interrupt2 */ + +/* + * getDebugChar + * + * This is a GDB stub routine. It waits for a character from the + * serial interface and then returns it. 
If there is no serial + * interface connection then it returns a bogus value which will + * almost certainly cause the system to hang. + */ +int +getDebugChar(void) +{ + volatile int chr; + +#ifdef PRNT + printk("getDebugChar: "); +#endif + + while ((chr = read_char()) < 0) + touch_nmi_watchdog(); + +#ifdef PRNT + printk("%c\n", chr > ' ' && chr < 0x7F ? chr : ' '); +#endif + return (chr); + +} /* getDebugChar */ + +/* + * putDebugChar + * + * This is a GDB stub routine. It waits until the interface is ready + * to transmit a char and then sends it. If there is no serial + * interface connection then it simply returns to its caller, having + * pretended to send the char. + */ +void +putDebugChar(int chr) +{ +#ifdef PRNT + printk("putDebugChar: chr=%02x '%c'\n", chr, + chr > ' ' && chr < 0x7F ? chr : ' '); +#endif + + write_char(chr); /* this routine will wait */ + +} /* putDebugChar */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/char/sysrq.c 900-mjb2/drivers/char/sysrq.c --- 000-virgin/drivers/char/sysrq.c Fri May 30 19:02:05 2003 +++ 900-mjb2/drivers/char/sysrq.c Wed Jun 11 22:42:59 2003 @@ -134,6 +134,18 @@ static struct sysrq_key_op sysrq_mountro /* END SYNC SYSRQ HANDLERS BLOCK */ +#ifdef CONFIG_X86_REMOTE_DEBUG +static void sysrq_handle_gdb(int key, struct pt_regs *pt_regs, + struct tty_struct *tty) { + int gdb_hook(void); + gdb_hook(); +} +static struct sysrq_key_op sysrq_gdb_op = { + handler: sysrq_handle_gdb, + help_msg: "Gdb", + action_msg: "Entering debugger", +}; +#endif /* SHOW SYSRQ HANDLERS BLOCK */ @@ -240,7 +252,11 @@ static struct sysrq_key_op *sysrq_key_ta /* d */ NULL, /* e */ &sysrq_term_op, /* f */ NULL, +#ifdef CONFIG_X86_REMOTE_DEBUG +/* g */ &sysrq_gdb_op, +#else /* CONFIG_X86_REMOTE_DEBUG */ /* g */ NULL, +#endif /* CONFIG_X86_REMOTE_DEBUG */ /* h */ NULL, /* i */ &sysrq_kill_op, /* j */ NULL, diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/char/tty_io.c 900-mjb2/drivers/char/tty_io.c --- 000-virgin/drivers/char/tty_io.c Fri May 30 19:02:05 2003 +++ 900-mjb2/drivers/char/tty_io.c Wed Jun 11 22:42:59 2003 @@ -91,6 +91,9 @@ #include #include #include +#ifdef CONFIG_GDB_CONSOLE +#include +#endif #include #include @@ -2367,6 +2370,9 @@ void __init console_init(void) (*call)(); call++; } +#ifdef CONFIG_GDB_CONSOLE + gdb_console_init(); +#endif } #ifdef CONFIG_VT diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/net/acenic.c 900-mjb2/drivers/net/acenic.c --- 000-virgin/drivers/net/acenic.c Fri May 30 19:02:11 2003 +++ 900-mjb2/drivers/net/acenic.c Wed Jun 11 22:47:04 2003 @@ -132,7 +132,8 @@ #endif #if LINUX_VERSION_CODE >= 0x20400 -static struct pci_device_id acenic_pci_tbl[] __initdata = { +static struct pci_device_id acenic_pci_tbl[] + __initdata __attribute__ ((__unused__)) = { { PCI_VENDOR_ID_ALTEON, PCI_DEVICE_ID_ALTEON_ACENIC_FIBRE, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_NETWORK_ETHERNET << 8, 0xffff00, }, { PCI_VENDOR_ID_ALTEON, PCI_DEVICE_ID_ALTEON_ACENIC_COPPER, diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/net/loopback.c 900-mjb2/drivers/net/loopback.c --- 000-virgin/drivers/net/loopback.c Sun Nov 17 20:29:25 2002 +++ 900-mjb2/drivers/net/loopback.c Wed Jun 11 22:47:05 2003 @@ -194,7 +194,7 @@ int __init loopback_init(struct net_devi /* Current netfilter will die with oom linearizing large skbs, * however this will be cured before 2.5.x is done. 
*/ - dev->features |= NETIF_F_TSO; +/* dev->features |= NETIF_F_TSO; */ dev->priv = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL); if (dev->priv == NULL) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/pci/probe.c 900-mjb2/drivers/pci/probe.c --- 000-virgin/drivers/pci/probe.c Fri May 30 19:02:13 2003 +++ 900-mjb2/drivers/pci/probe.c Wed Jun 11 22:47:07 2003 @@ -173,7 +173,7 @@ void __devinit pci_read_bridge_bases(str limit |= (io_limit_hi << 16); } - if (base && base <= limit) { + if (base <= limit) { res->flags = (io_base_lo & PCI_IO_RANGE_TYPE_MASK) | IORESOURCE_IO; res->start = base; res->end = limit + 0xfff; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/scsi/sd.c 900-mjb2/drivers/scsi/sd.c --- 000-virgin/drivers/scsi/sd.c Fri May 30 19:02:15 2003 +++ 900-mjb2/drivers/scsi/sd.c Wed Jun 11 22:50:26 2003 @@ -59,7 +59,9 @@ * Remaining dev_t-handling stuff */ #define SD_MAJORS 16 -#define SD_DISKS (SD_MAJORS << 4) +#define SD_DISKS ((SD_MAJORS - 1) << 4) +#define LAST_MAJOR_DISKS (1 << (KDEV_MINOR_BITS - 4)) +#define TOTAL_SD_DISKS (SD_DISKS + LAST_MAJOR_DISKS) /* * Time out in seconds for disks and Magneto-opticals (which are slower). @@ -88,7 +90,7 @@ struct scsi_disk { static LIST_HEAD(sd_devlist); static spinlock_t sd_devlist_lock = SPIN_LOCK_UNLOCKED; -static unsigned long sd_index_bits[SD_DISKS / BITS_PER_LONG]; +static unsigned long sd_index_bits[TOTAL_SD_DISKS / BITS_PER_LONG]; static spinlock_t sd_index_lock = SPIN_LOCK_UNLOCKED; static void sd_init_onedisk(struct scsi_disk * sdkp, struct gendisk *disk); @@ -127,6 +129,9 @@ static int sd_major(int major_idx) return SCSI_DISK1_MAJOR + major_idx - 1; case 8 ... 15: return SCSI_DISK8_MAJOR + major_idx - 8; +#define MAX_IDX (TOTAL_SD_DISKS >> 4) + case 16 ... MAX_IDX: + return SCSI_DISK15_MAJOR; default: BUG(); return 0; /* shut up gcc */ @@ -1328,8 +1333,8 @@ static int sd_attach(struct scsi_device goto out_free; spin_lock(&sd_index_lock); - index = find_first_zero_bit(sd_index_bits, SD_DISKS); - if (index == SD_DISKS) { + index = find_first_zero_bit(sd_index_bits, TOTAL_SD_DISKS); + if (index == TOTAL_SD_DISKS) { spin_unlock(&sd_index_lock); error = -EBUSY; goto out_put; @@ -1343,15 +1348,25 @@ static int sd_attach(struct scsi_device sdkp->index = index; gd->major = sd_major(index >> 4); - gd->first_minor = (index & 15) << 4; +#define DISKS_PER_MINOR_MASK ((1 << (KDEV_MINOR_BITS - 4)) - 1) + if (index > SD_DISKS) + gd->first_minor = ((index - SD_DISKS) & DISKS_PER_MINOR_MASK) << 4; + else + gd->first_minor = (index & 15) << 4; gd->minors = 16; gd->fops = &sd_fops; - if (index >= 26) { + if (index < 26) { + sprintf(gd->disk_name, "sd%c", 'a' + index % 26); + } else if (index < (26*27)) { sprintf(gd->disk_name, "sd%c%c", 'a' + index/26-1,'a' + index % 26); } else { - sprintf(gd->disk_name, "sd%c", 'a' + index % 26); + const unsigned int m1 = (index/ 26 - 1) / 26 - 1; + const unsigned int m2 = (index / 26 - 1) % 26; + const unsigned int m3 = index % 26; + sprintf(gd->disk_name, "sd%c%c%c", + 'a' + m1, 'a' + m2, 'a' + m3); } strcpy(gd->devfs_name, sdp->devfs_name); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/serial/8250.c 900-mjb2/drivers/serial/8250.c --- 000-virgin/drivers/serial/8250.c Sat May 10 18:34:58 2003 +++ 900-mjb2/drivers/serial/8250.c Wed Jun 11 22:42:59 2003 @@ -2118,9 +2118,116 @@ void serial8250_resume_port(int line, u3 uart_resume_port(&serial8250_reg, &serial8250_ports[line].port, level); } -static int __init serial8250_init(void) +#ifdef CONFIG_X86_REMOTE_DEBUG +/* + * 
Takes: + * ttyS - integer specifying which serial port to use for debugging + * baud - baud rate of specified serial port + * Returns: + * port for use by the gdb serial driver + */ +int gdb_serial_setup(int ttyS, int baud, int *port, int *irq) +{ + struct uart_8250_port *up; + unsigned cval; + int bits = 8; + int parity = 'n'; + int cflag = CREAD | HUPCL | CLOCAL; + int quot = 0; + + /* + * Now construct a cflag setting. + */ + switch(baud) { + case 1200: + cflag |= B1200; + break; + case 2400: + cflag |= B2400; + break; + case 4800: + cflag |= B4800; + break; + case 19200: + cflag |= B19200; + break; + case 38400: + cflag |= B38400; + break; + case 57600: + cflag |= B57600; + break; + case 115200: + cflag |= B115200; + break; + case 9600: + default: + cflag |= B9600; + break; + } + switch(bits) { + case 7: + cflag |= CS7; + break; + default: + case 8: + cflag |= CS8; + break; + } + switch(parity) { + case 'o': case 'O': + cflag |= PARODD; + break; + case 'e': case 'E': + cflag |= PARENB; + break; + } + + /* + * Divisor, bytesize and parity + */ + + up = &serial8250_ports[ttyS]; +// ser->flags &= ~ASYNC_BOOT_AUTOCONF; + quot = ( 1843200 / 16 ) / baud; + cval = cflag & (CSIZE | CSTOPB); + cval >>= 4; + if (cflag & PARENB) + cval |= UART_LCR_PARITY; + if (!(cflag & PARODD)) + cval |= UART_LCR_EPAR; + + /* + * Disable UART interrupts, set DTR and RTS high + * and set speed. + */ + cval = 0x3; + serial_outp(up, UART_LCR, cval | UART_LCR_DLAB); /* set DLAB */ + serial_outp(up, UART_DLL, quot & 0xff); /* LS of divisor */ + serial_outp(up, UART_DLM, quot >> 8); /* MS of divisor */ + serial_outp(up, UART_LCR, cval); /* reset DLAB */ + serial_outp(up, UART_IER, UART_IER_RDI); /* turn on interrupts*/ + serial_outp(up, UART_MCR, UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS); + + /* + * If we read 0xff from the LSR, there is no UART here. + */ + if (serial_inp(up, UART_LSR) == 0xff) + return 1; + *port = up->port.iobase; + *irq = up->port.irq; +// serial8250_shutdown(&up->port); + return 0; +} +#endif + +int serial8250_init(void) { int ret, i; + static int didit = 0; + + if (didit++) + return 0; printk(KERN_INFO "Serial: 8250/16550 driver $Revision: 1.90 $ " "IRQ sharing %sabled\n", share_irqs ? "en" : "dis"); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/serial/core.c 900-mjb2/drivers/serial/core.c --- 000-virgin/drivers/serial/core.c Fri May 30 19:02:15 2003 +++ 900-mjb2/drivers/serial/core.c Wed Jun 11 22:42:59 2003 @@ -33,6 +33,10 @@ #include #include /* for serial_state and serial_icounter_struct */ +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif + #include #include @@ -1130,6 +1134,16 @@ uart_ioctl(struct tty_struct *tty, struc * protected against the tty being hung up. 
*/ switch (cmd) { +#ifdef CONFIG_X86_REMOTE_DEBUG + case TIOCGDB: + ret = -ENOTTY; + if (capable(CAP_SYS_ADMIN)) { + gdb_ttyS = minor(tty->device) & 0x03F; + gdb_baud = tty_get_baud_rate(tty); + ret = gdb_hook(); + } + break; +#endif case TIOCSERGETLSR: /* Get line status register */ ret = uart_get_lsr_info(state, (unsigned int *)arg); break; @@ -1146,6 +1160,30 @@ uart_ioctl(struct tty_struct *tty, struc out: return ret; } + + /* + * ------------------------------------------------------------ + * Serial GDB driver (most in gdbserial.c) + * ------------------------------------------------------------ + */ + +#ifdef CONFIG_X86_REMOTE_DEBUG +#ifdef CONFIG_GDB_CONSOLE +static struct console gdbcons = { + name: "gdb", + write: gdb_console_write, + flags: CON_PRINTBUFFER | CON_ENABLED, + index: -1, +}; +#endif + +#ifdef CONFIG_GDB_CONSOLE +void __init gdb_console_init(void) +{ + register_console(&gdbcons); +} +#endif +#endif /* CONFIG_X86_REMOTE_DEBUG */ static void uart_set_termios(struct tty_struct *tty, struct termios *old_termios) { diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/aio.c 900-mjb2/fs/aio.c --- 000-virgin/fs/aio.c Sat May 10 18:34:59 2003 +++ 900-mjb2/fs/aio.c Wed Jun 11 22:55:27 2003 @@ -39,6 +39,9 @@ #define dprintk(x...) do { ; } while (0) #endif +long aio_run = 0; /* for testing only */ +long aio_wakeups = 0; /* for testing only */ + /*------ sysctl variables----*/ atomic_t aio_nr = ATOMIC_INIT(0); /* current system wide number of aio requests */ unsigned aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ @@ -204,6 +207,7 @@ static struct kioctx *ioctx_alloc(unsign { struct mm_struct *mm; struct kioctx *ctx; + int ret = 0; /* Prevent overflows */ if ((nr_events > (0x10000000U / sizeof(struct io_event))) || @@ -233,7 +237,8 @@ static struct kioctx *ioctx_alloc(unsign INIT_LIST_HEAD(&ctx->run_list); INIT_WORK(&ctx->wq, aio_kick_handler, ctx); - if (aio_setup_ring(ctx) < 0) + ret = aio_setup_ring(ctx); + if (unlikely(ret < 0)) goto out_freectx; /* limit the number of system wide aios */ @@ -259,7 +264,7 @@ out_cleanup: out_freectx: kmem_cache_free(kioctx_cachep, ctx); - ctx = ERR_PTR(-ENOMEM); + ctx = ERR_PTR(ret); dprintk("aio: error allocating ioctx %p\n", ctx); return ctx; @@ -281,6 +286,7 @@ static void aio_cancel_all(struct kioctx struct kiocb *iocb = list_kiocb(pos); list_del_init(&iocb->ki_list); cancel = iocb->ki_cancel; + kiocbSetCancelled(iocb); if (cancel) { iocb->ki_users++; spin_unlock_irq(&ctx->ctx_lock); @@ -395,6 +401,7 @@ static struct kiocb *__aio_get_req(struc req->ki_cancel = NULL; req->ki_retry = NULL; req->ki_user_obj = NULL; + INIT_LIST_HEAD(&req->ki_run_list); /* Check if the completion queue has enough free space to * accept an event from this io. @@ -544,60 +551,147 @@ static void use_mm(struct mm_struct *mm) struct mm_struct *active_mm = current->active_mm; atomic_inc(&mm->mm_count); current->mm = mm; - if (mm != active_mm) { - current->active_mm = mm; - activate_mm(active_mm, mm); - } + + current->active_mm = mm; + activate_mm(active_mm, mm); + mmdrop(active_mm); } -static void unuse_mm(struct mm_struct *mm) +void unuse_mm(struct mm_struct *mm) { current->mm = NULL; /* active_mm is still 'mm' */ enter_lazy_tlb(mm, current, smp_processor_id()); } -/* Run on kevent's context. FIXME: needs to be per-cpu and warn if an - * operation blocks. 
- */ -static void aio_kick_handler(void *data) +static inline int __queue_kicked_iocb(struct kiocb *iocb) { - struct kioctx *ctx = data; + struct kioctx *ctx = iocb->ki_ctx; - use_mm(ctx->mm); + if (list_empty(&iocb->ki_run_list)) { + list_add_tail(&iocb->ki_run_list, + &ctx->run_list); + iocb->ki_queued++; + return 1; + } + return 0; +} - spin_lock_irq(&ctx->ctx_lock); - while (!list_empty(&ctx->run_list)) { - struct kiocb *iocb; - long ret; +/* Expects to be called with iocb->ki_ctx->lock held */ +static ssize_t aio_run_iocb(struct kiocb *iocb) +{ + struct kioctx *ctx = iocb->ki_ctx; + ssize_t (*retry)(struct kiocb *); + ssize_t ret; - iocb = list_entry(ctx->run_list.next, struct kiocb, - ki_run_list); - list_del(&iocb->ki_run_list); - iocb->ki_users ++; - spin_unlock_irq(&ctx->ctx_lock); + if (iocb->ki_retried++ > 1024*1024) { + printk("Maximal retry count. Bytes done %d\n", + iocb->ki_nbytes - iocb->ki_left); + return -EAGAIN; + } + + if (!(iocb->ki_retried & 0xff)) { + dprintk("%ld retry: %d of %d (kick %ld, Q %ld run %ld, wake %ld)\n", + iocb->ki_retried, + iocb->ki_nbytes - iocb->ki_left, iocb->ki_nbytes, + iocb->ki_kicked, iocb->ki_queued, aio_run, aio_wakeups); + } + + if (!(retry = iocb->ki_retry)) { + printk("aio_run_iocb: iocb->ki_retry = NULL\n"); + return 0; + } + + iocb->ki_users ++; + kiocbClearKicked(iocb); + iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL; + iocb->ki_retry = NULL; + spin_unlock_irq(&ctx->ctx_lock); + + if (kiocbIsCancelled(iocb)) { + aio_complete(iocb, -EINTR, 0); + spin_lock_irq(&ctx->ctx_lock); + __aio_put_req(ctx, iocb); + return -EINTR; + } - kiocbClearKicked(iocb); - ret = iocb->ki_retry(iocb); + BUG_ON(current->io_wait != NULL); + current->io_wait = &iocb->ki_wait; + ret = retry(iocb); + current->io_wait = NULL; + + if (-EIOCBRETRY != ret) { if (-EIOCBQUEUED != ret) { + BUG_ON(!list_empty(&iocb->ki_wait.task_list)); aio_complete(iocb, ret, 0); - iocb = NULL; } + } else { + if (list_empty(&iocb->ki_wait.task_list)) + kiocbSetKicked(iocb); + } + spin_lock_irq(&ctx->ctx_lock); - spin_lock_irq(&ctx->ctx_lock); - if (NULL != iocb) - __aio_put_req(ctx, iocb); + iocb->ki_retry = retry; + INIT_LIST_HEAD(&iocb->ki_run_list); + if (kiocbIsKicked(iocb)) { + BUG_ON(ret != -EIOCBRETRY); + __queue_kicked_iocb(iocb); + } + __aio_put_req(ctx, iocb); + return ret; +} + +static void aio_run_iocbs(struct kioctx *ctx) +{ + struct kiocb *iocb; + ssize_t ret; + int count = 0; + + spin_lock_irq(&ctx->ctx_lock); + while (!list_empty(&ctx->run_list)) { + iocb = list_entry(ctx->run_list.next, struct kiocb, + ki_run_list); + list_del(&iocb->ki_run_list); + ret = aio_run_iocb(iocb); + count++; } spin_unlock_irq(&ctx->ctx_lock); + aio_run++; +} +/* Run on aiod/kevent's context. FIXME: needs to be per-cpu and warn if an + * operation blocks. + */ +static void aio_kick_handler(void *data) +{ + struct kioctx *ctx = data; + + use_mm(ctx->mm); + aio_run_iocbs(ctx); unuse_mm(ctx->mm); } -void kick_iocb(struct kiocb *iocb) + +void queue_kicked_iocb(struct kiocb *iocb) { struct kioctx *ctx = iocb->ki_ctx; + unsigned long flags; + int run = 0; + + WARN_ON((!list_empty(&iocb->ki_wait.task_list))); + spin_lock_irqsave(&ctx->ctx_lock, flags); + run = __queue_kicked_iocb(iocb); + spin_unlock_irqrestore(&ctx->ctx_lock, flags); + if (run) { + queue_work(aio_wq, &ctx->wq); + aio_wakeups++; + } +} + +void kick_iocb(struct kiocb *iocb) +{ /* sync iocbs are easy: they can only ever be executing from a * single context. 
*/ if (is_sync_kiocb(iocb)) { @@ -606,12 +700,9 @@ void kick_iocb(struct kiocb *iocb) return; } + iocb->ki_kicked++; if (!kiocbTryKick(iocb)) { - unsigned long flags; - spin_lock_irqsave(&ctx->ctx_lock, flags); - list_add_tail(&iocb->ki_run_list, &ctx->run_list); - spin_unlock_irqrestore(&ctx->ctx_lock, flags); - schedule_work(&ctx->wq); + queue_kicked_iocb(iocb); } } @@ -664,6 +755,9 @@ int aio_complete(struct kiocb *iocb, lon */ spin_lock_irqsave(&ctx->ctx_lock, flags); + if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list)) + list_del_init(&iocb->ki_run_list); + ring = kmap_atomic(info->ring_pages[0], KM_IRQ1); tail = info->tail; @@ -693,6 +787,11 @@ int aio_complete(struct kiocb *iocb, lon pr_debug("added to ring %p at [%lu]\n", iocb, tail); + pr_debug("%ld retries: %d of %d (kicked %ld, Q %ld run %ld wake %ld)\n", + iocb->ki_retried, + iocb->ki_nbytes - iocb->ki_left, iocb->ki_nbytes, + iocb->ki_kicked, iocb->ki_queued, aio_run, aio_wakeups); + /* everything turned out well, dispose of the aiocb. */ ret = __aio_put_req(ctx, iocb); @@ -807,6 +906,7 @@ static int read_events(struct kioctx *ct int i = 0; struct io_event ent; struct timeout to; + int event_loop = 0; /* testing only */ /* needed to zero any padding within an entry (there shouldn't be * any, but C is fun! @@ -856,7 +956,6 @@ static int read_events(struct kioctx *ct add_wait_queue_exclusive(&ctx->wait, &wait); do { set_task_state(tsk, TASK_INTERRUPTIBLE); - ret = aio_read_evt(ctx, &ent); if (ret) break; @@ -866,6 +965,7 @@ static int read_events(struct kioctx *ct if (to.timed_out) /* Only check after read evt */ break; schedule(); + event_loop++; if (signal_pending(tsk)) { ret = -EINTR; break; @@ -893,6 +993,9 @@ static int read_events(struct kioctx *ct if (timeout) clear_timeout(&to); out: + pr_debug("event loop executed %d times\n", event_loop); + pr_debug("aio_run %ld\n", aio_run); + pr_debug("aio_wakeups %ld\n", aio_wakeups); return i ? i : ret; } @@ -984,6 +1087,143 @@ asmlinkage long sys_io_destroy(aio_conte return -EINVAL; } +ssize_t aio_pread(struct kiocb *iocb) +{ + struct file *file = iocb->ki_filp; + ssize_t ret = 0; + + ret = file->f_op->aio_read(iocb, iocb->ki_buf, + iocb->ki_left, iocb->ki_pos); + + /* + * Can't just depend on iocb->ki_left to determine + * whether we are done. This may have been a short read. + */ + if (ret > 0) { + iocb->ki_buf += ret; + iocb->ki_left -= ret; + + ret = -EIOCBRETRY; + } + + /* This means we must have transferred all that we could */ + /* No need to retry anymore */ + if ((ret == 0) || (iocb->ki_left == 0)) + ret = iocb->ki_nbytes - iocb->ki_left; + + return ret; +} + +ssize_t aio_pwrite(struct kiocb *iocb) +{ + struct file *file = iocb->ki_filp; + ssize_t ret = 0; + + ret = file->f_op->aio_write(iocb, iocb->ki_buf, + iocb->ki_left, iocb->ki_pos); + + /* + * TBD: Even if iocb->ki_left = 0, could we need to + * wait for data to be sync'd ? Or can we assume + * that aio_fdsync/aio_fsync would be called explicitly + * as required. 
+ */ + if (ret > 0) { + iocb->ki_buf += ret; + iocb->ki_left -= ret; + + ret = -EIOCBRETRY; + } + + /* This means we must have transferred all that we could */ + /* No need to retry anymore */ + if (ret == 0) + ret = iocb->ki_nbytes - iocb->ki_left; + + return ret; +} + +ssize_t aio_fdsync(struct kiocb *iocb) +{ + struct file *file = iocb->ki_filp; + ssize_t ret = -EINVAL; + + if (file->f_op->aio_fsync) + ret = file->f_op->aio_fsync(iocb, 1); + return ret; +} + +ssize_t aio_fsync(struct kiocb *iocb) +{ + struct file *file = iocb->ki_filp; + ssize_t ret = -EINVAL; + + if (file->f_op->aio_fsync) + ret = file->f_op->aio_fsync(iocb, 0); + return ret; +} + +/* Called during initial submission and subsequent retry operations */ +ssize_t aio_setup_iocb(struct kiocb *iocb) +{ + struct file *file = iocb->ki_filp; + ssize_t ret = 0; + + switch (iocb->ki_opcode) { + case IOCB_CMD_PREAD: + ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_READ))) + break; + ret = -EFAULT; + if (unlikely(!access_ok(VERIFY_WRITE, iocb->ki_buf, + iocb->ki_left))) + break; + ret = -EINVAL; + if (file->f_op->aio_read) + iocb->ki_retry = aio_pread; + break; + case IOCB_CMD_PWRITE: + ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_WRITE))) + break; + ret = -EFAULT; + if (unlikely(!access_ok(VERIFY_READ, iocb->ki_buf, + iocb->ki_left))) + break; + ret = -EINVAL; + if (file->f_op->aio_write) + iocb->ki_retry = aio_pwrite; + break; + case IOCB_CMD_FDSYNC: + ret = -EINVAL; + if (file->f_op->aio_fsync) + iocb->ki_retry = aio_fdsync; + break; + case IOCB_CMD_FSYNC: + ret = -EINVAL; + if (file->f_op->aio_fsync) + iocb->ki_retry = aio_fsync; + break; + default: + dprintk("EINVAL: io_submit: no operation provided\n"); + ret = -EINVAL; + } + + if (!iocb->ki_retry) + return ret; + + return 0; +} + +int aio_wake_function(wait_queue_t *wait, unsigned mode, int sync) +{ + struct kiocb *iocb = container_of(wait, struct kiocb, ki_wait); + + list_del_init(&wait->task_list); + kick_iocb(iocb); + return 1; +} + int FASTCALL(io_submit_one(struct kioctx *ctx, struct iocb *user_iocb, struct iocb *iocb)); int io_submit_one(struct kioctx *ctx, struct iocb *user_iocb, @@ -992,7 +1232,6 @@ int io_submit_one(struct kioctx *ctx, st struct kiocb *req; struct file *file; ssize_t ret; - char *buf; /* enforce forwards compatibility on users */ if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2 || @@ -1033,51 +1272,31 @@ int io_submit_one(struct kioctx *ctx, st req->ki_user_data = iocb->aio_data; req->ki_pos = iocb->aio_offset; - buf = (char *)(unsigned long)iocb->aio_buf; + req->ki_buf = (char *)(unsigned long)iocb->aio_buf; + req->ki_left = req->ki_nbytes = iocb->aio_nbytes; + req->ki_opcode = iocb->aio_lio_opcode; + init_waitqueue_func_entry(&req->ki_wait, aio_wake_function); + INIT_LIST_HEAD(&req->ki_wait.task_list); + req->ki_run_list.next = req->ki_run_list.prev = NULL; + req->ki_retry = NULL; + req->ki_retried = 0; + req->ki_kicked = 0; + req->ki_queued = 0; + aio_run = 0; + aio_wakeups = 0; - switch (iocb->aio_lio_opcode) { - case IOCB_CMD_PREAD: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_READ))) - goto out_put_req; - ret = -EFAULT; - if (unlikely(!access_ok(VERIFY_WRITE, buf, iocb->aio_nbytes))) - goto out_put_req; - ret = -EINVAL; - if (file->f_op->aio_read) - ret = file->f_op->aio_read(req, buf, - iocb->aio_nbytes, req->ki_pos); - break; - case IOCB_CMD_PWRITE: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_WRITE))) - goto out_put_req; - ret = -EFAULT; - if (unlikely(!access_ok(VERIFY_READ, buf, iocb->aio_nbytes))) - 
goto out_put_req; - ret = -EINVAL; - if (file->f_op->aio_write) - ret = file->f_op->aio_write(req, buf, - iocb->aio_nbytes, req->ki_pos); - break; - case IOCB_CMD_FDSYNC: - ret = -EINVAL; - if (file->f_op->aio_fsync) - ret = file->f_op->aio_fsync(req, 1); - break; - case IOCB_CMD_FSYNC: - ret = -EINVAL; - if (file->f_op->aio_fsync) - ret = file->f_op->aio_fsync(req, 0); - break; - default: - dprintk("EINVAL: io_submit: no operation provided\n"); - ret = -EINVAL; - } + ret = aio_setup_iocb(req); + + if ((-EBADF == ret) || (-EFAULT == ret)) + goto out_put_req; + + spin_lock_irq(&ctx->ctx_lock); + ret = aio_run_iocb(req); + spin_unlock_irq(&ctx->ctx_lock); + + if (-EIOCBRETRY == ret) + queue_work(aio_wq, &ctx->wq); - if (likely(-EIOCBQUEUED == ret)) - return 0; - aio_complete(req, ret, 0); return 0; out_put_req: diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/buffer.c 900-mjb2/fs/buffer.c --- 000-virgin/fs/buffer.c Fri May 30 19:02:18 2003 +++ 900-mjb2/fs/buffer.c Wed Jun 11 22:55:24 2003 @@ -118,23 +118,35 @@ void unlock_buffer(struct buffer_head *b * from becoming locked again - you have to lock it yourself * if you want to preserve its state. */ -void __wait_on_buffer(struct buffer_head * bh) +int __wait_on_buffer_wq(struct buffer_head * bh, wait_queue_t *wait) { wait_queue_head_t *wqh = bh_waitq_head(bh); - DEFINE_WAIT(wait); + DEFINE_WAIT(local_wait); + + if (!wait) + wait = &local_wait; if (atomic_read(&bh->b_count) == 0 && (!bh->b_page || !PageLocked(bh->b_page))) buffer_error(); do { - prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wqh, wait, TASK_UNINTERRUPTIBLE); if (buffer_locked(bh)) { blk_run_queues(); + if (!is_sync_wait(wait)) { + return -EIOCBRETRY; + } io_schedule(); } } while (buffer_locked(bh)); - finish_wait(wqh, &wait); + finish_wait(wqh, wait); + return 0; +} + +void __wait_on_buffer(struct buffer_head * bh) +{ + __wait_on_buffer_wq(bh, NULL); } static void @@ -409,6 +421,9 @@ __find_get_block_slow(struct block_devic bh = bh->b_this_page; } while (bh != head); buffer_error(); + printk("block=%llu, b_blocknr=%llu\n", + (unsigned long long)block, (unsigned long long)bh->b_blocknr); + printk("b_state=0x%08lx, b_size=%u\n", bh->b_state, bh->b_size); out_unlock: spin_unlock(&bd_mapping->private_lock); page_cache_release(page); @@ -1272,9 +1287,12 @@ void __bforget(struct buffer_head *bh) __brelse(bh); } -static struct buffer_head *__bread_slow(struct buffer_head *bh) +static struct buffer_head *__bread_slow_wq(struct buffer_head *bh, + wait_queue_t *wait) { - lock_buffer(bh); + if (-EIOCBRETRY == lock_buffer_wq(bh, wait)) + return ERR_PTR(-EIOCBRETRY); + if (buffer_uptodate(bh)) { unlock_buffer(bh); return bh; @@ -1284,7 +1302,8 @@ static struct buffer_head *__bread_slow( get_bh(bh); bh->b_end_io = end_buffer_io_sync; submit_bh(READ, bh); - wait_on_buffer(bh); + if (-EIOCBRETRY == wait_on_buffer_wq(bh, wait)) + return ERR_PTR(-EIOCBRETRY); if (buffer_uptodate(bh)) return bh; } @@ -1292,6 +1311,11 @@ static struct buffer_head *__bread_slow( return NULL; } +static inline struct buffer_head *__bread_slow(struct buffer_head *bh) +{ + return __bread_slow_wq(bh, NULL); +} + /* * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block(). * The bhs[] array is sorted - newest buffer is at bhs[0]. 
Buffers have their @@ -1466,6 +1490,18 @@ __bread(struct block_device *bdev, secto bh = __bread_slow(bh); return bh; } + + +struct buffer_head * +__bread_wq(struct block_device *bdev, sector_t block, int size, + wait_queue_t *wait) +{ + struct buffer_head *bh = __getblk(bdev, block, size); + + if (!buffer_uptodate(bh)) + bh = __bread_slow_wq(bh, wait); + return bh; +} EXPORT_SYMBOL(__bread); /* @@ -1904,8 +1940,11 @@ static int __block_prepare_write(struct clear_buffer_new(bh); if (!buffer_mapped(bh)) { err = get_block(inode, block, bh, 1); - if (err) + if (err) { + if (-EIOCBRETRY == err) + pr_debug("get_block queued\n"); goto out; + } if (buffer_new(bh)) { clear_buffer_new(bh); unmap_underlying_metadata(bh->b_bdev, @@ -1947,6 +1986,8 @@ static int __block_prepare_write(struct * If we issued read requests - let them complete. */ while(wait_bh > wait) { + if (!is_sync_wait(current->io_wait)) + printk("block_prepare_write: wait on buffer\n"); wait_on_buffer(*--wait_bh); if (!buffer_uptodate(*wait_bh)) return -EIO; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/exec.c 900-mjb2/fs/exec.c --- 000-virgin/fs/exec.c Fri May 30 19:02:18 2003 +++ 900-mjb2/fs/exec.c Wed Jun 11 22:56:53 2003 @@ -316,6 +316,7 @@ void put_dirty_page(struct task_struct * } lru_cache_add_active(page); flush_dcache_page(page); + SetPageAnon(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot)))); pte_chain = page_add_rmap(page, pte, pte_chain); pte_unmap(pte); @@ -529,6 +530,30 @@ static int exec_mmap(struct mm_struct *m return 0; } +static struct dentry *clean_proc_dentry(struct task_struct *p) +{ + struct dentry *proc_dentry = p->proc_dentry; + + if (proc_dentry) { + spin_lock(&dcache_lock); + if (!d_unhashed(proc_dentry)) { + dget_locked(proc_dentry); + __d_drop(proc_dentry); + } else + proc_dentry = NULL; + spin_unlock(&dcache_lock); + } + return proc_dentry; +} + +static inline void put_proc_dentry(struct dentry *dentry) +{ + if (dentry) { + shrink_dcache_parent(dentry); + dput(dentry); + } +} + /* * This function makes sure the current process has its own signal table, * so that flush_signal_handlers can later reset the handlers without @@ -637,11 +662,9 @@ static inline int de_thread(struct task_ while (leader->state != TASK_ZOMBIE) yield(); - spin_lock(&leader->proc_lock); - spin_lock(¤t->proc_lock); - proc_dentry1 = proc_pid_unhash(current); - proc_dentry2 = proc_pid_unhash(leader); write_lock_irq(&tasklist_lock); + proc_dentry1 = clean_proc_dentry(current); + proc_dentry2 = clean_proc_dentry(leader); if (leader->tgid != current->tgid) BUG(); @@ -681,10 +704,9 @@ static inline int de_thread(struct task_ state = leader->state; write_unlock_irq(&tasklist_lock); - spin_unlock(&leader->proc_lock); - spin_unlock(¤t->proc_lock); - proc_pid_flush(proc_dentry1); - proc_pid_flush(proc_dentry2); + + put_proc_dentry(proc_dentry1); + put_proc_dentry(proc_dentry2); if (state != TASK_ZOMBIE) BUG(); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/ext2/inode.c 900-mjb2/fs/ext2/inode.c --- 000-virgin/fs/ext2/inode.c Tue Apr 8 14:38:19 2003 +++ 900-mjb2/fs/ext2/inode.c Wed Jun 11 22:55:25 2003 @@ -257,11 +257,12 @@ static int ext2_block_to_path(struct ino * or when it reads all @depth-1 indirect blocks successfully and finds * the whole chain, all way to the data (returns %NULL, *err == 0). 
*/ -static Indirect *ext2_get_branch(struct inode *inode, +static Indirect *ext2_get_branch_wq(struct inode *inode, int depth, int *offsets, Indirect chain[4], - int *err) + int *err, + wait_queue_t *wait) { struct super_block *sb = inode->i_sb; Indirect *p = chain; @@ -273,8 +274,8 @@ static Indirect *ext2_get_branch(struct if (!p->key) goto no_block; while (--depth) { - bh = sb_bread(sb, le32_to_cpu(p->key)); - if (!bh) + bh = sb_bread_wq(sb, le32_to_cpu(p->key), wait); + if (!bh || IS_ERR(bh)) goto failure; read_lock(&EXT2_I(inode)->i_meta_lock); if (!verify_chain(chain, p)) @@ -292,11 +293,21 @@ changed: *err = -EAGAIN; goto no_block; failure: - *err = -EIO; + *err = IS_ERR(bh) ? PTR_ERR(bh) : -EIO; no_block: return p; } +static Indirect *ext2_get_branch(struct inode *inode, + int depth, + int *offsets, + Indirect chain[4], + int *err) +{ + return ext2_get_branch_wq(inode, depth, offsets, chain, + err, NULL); +} + /** * ext2_find_near - find a place for allocation with sufficient locality * @inode: owner @@ -536,7 +547,8 @@ changed: * reachable from inode. */ -static int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) +static int ext2_get_block_wq(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create, wait_queue_t *wait) { int err = -EIO; int offsets[4]; @@ -551,7 +563,8 @@ static int ext2_get_block(struct inode * goto out; reread: - partial = ext2_get_branch(inode, depth, offsets, chain, &err); + partial = ext2_get_branch_wq(inode, depth, offsets, chain, &err, + wait); /* Simplest case - block found, no allocation needed */ if (!partial) { @@ -565,7 +578,7 @@ got_it: } /* Next simple case - plain lookup or failed read of indirect block */ - if (!create || err == -EIO) { + if (!create || err == -EIO || err == -EIOCBRETRY) { cleanup: while (partial > chain) { brelse(partial->bh); @@ -606,6 +619,19 @@ changed: goto reread; } +static int ext2_get_block_async(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return ext2_get_block_wq(inode, iblock, bh_result, create, + current->io_wait); +} + +static int ext2_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return ext2_get_block_wq(inode, iblock, bh_result, create, NULL); +} + static int ext2_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, ext2_get_block, wbc); @@ -627,7 +653,7 @@ static int ext2_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { - return block_prepare_write(page,from,to,ext2_get_block); + return block_prepare_write(page,from,to,ext2_get_block_async); } static int diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/proc/array.c 900-mjb2/fs/proc/array.c --- 000-virgin/fs/proc/array.c Sat May 10 18:35:00 2003 +++ 900-mjb2/fs/proc/array.c Wed Jun 11 22:47:08 2003 @@ -336,7 +336,7 @@ int proc_pid_stat(struct task_struct *ta read_unlock(&tasklist_lock); res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu\n", +%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu %lu %lu %lu\n", task->pid, task->comm, state, @@ -382,7 +382,10 @@ int proc_pid_stat(struct task_struct *ta task->exit_signal, task_cpu(task), task->rt_priority, - task->policy); + task->policy, + jiffies_to_clock_t(task->sched_info.inter_arrival_time), + jiffies_to_clock_t(task->sched_info.service_time), + 
jiffies_to_clock_t(task->sched_info.response_time)); if(mm) mmput(mm); return res; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/proc/base.c 900-mjb2/fs/proc/base.c --- 000-virgin/fs/proc/base.c Fri May 30 19:02:19 2003 +++ 900-mjb2/fs/proc/base.c Wed Jun 11 23:10:38 2003 @@ -645,12 +645,6 @@ static struct inode_operations proc_pid_ .follow_link = proc_pid_follow_link }; -static int pid_alive(struct task_struct *p) -{ - BUG_ON(p->pids[PIDTYPE_PID].pidptr != &p->pids[PIDTYPE_PID].pid); - return atomic_read(&p->pids[PIDTYPE_PID].pid.count); -} - #define NUMBUF 10 static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) @@ -662,9 +656,6 @@ static int proc_readfd(struct file * fil char buf[NUMBUF]; struct files_struct * files; - retval = -ENOENT; - if (!pid_alive(p)) - goto out; retval = 0; pid = p->pid; @@ -725,52 +716,50 @@ static int proc_pident_readdir(struct fi { int i; int pid; - struct dentry *dentry = filp->f_dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = filp->f_dentry->d_inode; struct pid_entry *p; - ino_t ino; - int ret; + int ret = 0; - ret = -ENOENT; - if (!pid_alive(proc_task(inode))) - goto out; + lock_kernel(); - ret = 0; pid = proc_task(inode)->pid; + if (!pid) { + ret = -ENOENT; + goto out; + } i = filp->f_pos; switch (i) { - case 0: - ino = inode->i_ino; - if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - default: - i -= 2; - if (i >= nents) { - ret = 1; - goto out; - } - p = ents + i; - while (p->name) { - if (filldir(dirent, p->name, p->len, filp->f_pos, - fake_ino(pid, p->type), p->mode >> 12) < 0) + case 0: + if (filldir(dirent, ".", 1, i, inode->i_ino, DT_DIR) < 0) goto out; + i++; filp->f_pos++; - p++; - } + /* fall through */ + case 1: + if (filldir(dirent, "..", 2, i, PROC_ROOT_INO, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + default: + i -= 2; + if (i>=sizeof(base_stuff)/sizeof(base_stuff[0])) { + ret = 1; + goto out; + } + p = base_stuff + i; + while (p->name) { + if (filldir(dirent, p->name, p->len, filp->f_pos, + fake_ino(pid, p->type), p->mode >> 12) < 0) + goto out; + filp->f_pos++; + p++; + } } ret = 1; out: + unlock_kernel(); return ret; } @@ -814,7 +803,7 @@ static struct inode *proc_pid_make_inode inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_ino = fake_ino(task->pid, ino); - if (!pid_alive(task)) + if (!task->pid) goto out_unlock; /* @@ -849,7 +838,7 @@ out_unlock: */ static int pid_revalidate(struct dentry * dentry, int flags) { - if (pid_alive(proc_task(dentry->d_inode))) + if (proc_task(dentry->d_inode)->pid) return 1; d_drop(dentry); return 0; @@ -883,23 +872,18 @@ static int pid_fd_revalidate(struct dent static void pid_base_iput(struct dentry *dentry, struct inode *inode) { struct task_struct *task = proc_task(inode); - spin_lock(&task->proc_lock); + write_lock_irq(&tasklist_lock); if (task->proc_dentry == dentry) task->proc_dentry = NULL; - spin_unlock(&task->proc_lock); + write_unlock_irq(&tasklist_lock); iput(inode); } static int pid_delete_dentry(struct dentry * dentry) { - /* Is the task we represent dead? - * If so, then don't put the dentry on the lru list, - * kill it immediately. 
- */ - return !pid_alive(proc_task(dentry->d_inode)); + return proc_task(dentry->d_inode)->pid == 0; } - static struct dentry_operations pid_fd_dentry_operations = { .d_revalidate = pid_fd_revalidate, @@ -955,8 +939,6 @@ static struct dentry *proc_lookupfd(stru if (fd == ~0U) goto out; - if (!pid_alive(task)) - goto out; inode = proc_pid_make_inode(dir->i_sb, task, PROC_PID_FD_DIR+fd); if (!inode) @@ -985,6 +967,8 @@ static struct dentry *proc_lookupfd(stru ei->op.proc_get_link = proc_fd_link; dentry->d_op = &pid_fd_dentry_operations; d_add(dentry, inode); + if (!proc_task(dentry->d_inode)->pid) + d_drop(dentry); return NULL; out_unlock2: @@ -1099,9 +1083,6 @@ static struct dentry *proc_pident_lookup error = -ENOENT; inode = NULL; - if (!pid_alive(task)) - goto out; - for (p = ents; p->name; p++) { if (p->len != dentry->d_name.len) continue; @@ -1196,6 +1177,8 @@ static struct dentry *proc_pident_lookup } dentry->d_op = &pid_dentry_operations; d_add(dentry, inode); + if (!proc_task(dentry->d_inode)->pid) + d_drop(dentry); return NULL; out: @@ -1259,55 +1242,6 @@ static struct inode_operations proc_self .follow_link = proc_self_follow_link, }; -/** - * proc_pid_unhash - Unhash /proc/ entry from the dcache. - * @p: task that should be flushed. - * - * Drops the /proc/ dcache entry from the hash chains. - * - * Dropping /proc/ entries and detach_pid must be synchroneous, - * otherwise e.g. /proc//exe might point to the wrong executable, - * if the pid value is immediately reused. This is enforced by - * - caller must acquire spin_lock(p->proc_lock) - * - must be called before detach_pid() - * - proc_pid_lookup acquires proc_lock, and checks that - * the target is not dead by looking at the attach count - * of PIDTYPE_PID. - */ - -struct dentry *proc_pid_unhash(struct task_struct *p) -{ - struct dentry *proc_dentry; - - proc_dentry = p->proc_dentry; - if (proc_dentry != NULL) { - - spin_lock(&dcache_lock); - if (!d_unhashed(proc_dentry)) { - dget_locked(proc_dentry); - __d_drop(proc_dentry); - } else - proc_dentry = NULL; - spin_unlock(&dcache_lock); - } - return proc_dentry; -} - -/** - * proc_pid_flush - recover memory used by stale /proc//x entries - * @proc_entry: directoy to prune. - * - * Shrink the /proc directory that was used by the just killed thread. 
- */ - -void proc_pid_flush(struct dentry *proc_dentry) -{ - if(proc_dentry != NULL) { - shrink_dcache_parent(proc_dentry); - dput(proc_dentry); - } -} - /* SMP-safe */ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry) { @@ -1356,12 +1290,12 @@ struct dentry *proc_pid_lookup(struct in inode->i_flags|=S_IMMUTABLE; dentry->d_op = &pid_base_dentry_operations; - - spin_lock(&task->proc_lock); - task->proc_dentry = dentry; d_add(dentry, inode); - spin_unlock(&task->proc_lock); - + read_lock(&tasklist_lock); + proc_task(dentry->d_inode)->proc_dentry = dentry; + read_unlock(&tasklist_lock); + if (!proc_task(dentry->d_inode)->pid) + d_drop(dentry); return NULL; out: return ERR_PTR(-ENOENT); @@ -1384,7 +1318,7 @@ static int get_pid_list(int index, unsig read_lock(&tasklist_lock); for_each_process(p) { int pid = p->pid; - if (!pid_alive(p)) + if (!pid) continue; if (--index >= 0) continue; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/proc/proc_misc.c 900-mjb2/fs/proc/proc_misc.c --- 000-virgin/fs/proc/proc_misc.c Fri May 30 19:02:19 2003 +++ 900-mjb2/fs/proc/proc_misc.c Wed Jun 11 22:47:08 2003 @@ -134,6 +134,37 @@ static struct vmalloc_info get_vmalloc_i return vmi; } +static int real_loadavg_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int a, b, c, cpu; + int len; + + a = tasks_running[0] + (FIXED_1/200); + b = tasks_running[1] + (FIXED_1/200); + c = tasks_running[2] + (FIXED_1/200); + len = sprintf(page,"Domain load1 load2 load3 nr_run/nr_thrd\n"); + len += sprintf(page+len,"SYSTEM %5d.%02d %5d.%02d %5d.%02d %7ld/%7d\n", + LOAD_INT(a), LOAD_FRAC(a), + LOAD_INT(b), LOAD_FRAC(b), + LOAD_INT(c), LOAD_FRAC(c), + nr_running(), nr_threads); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (!cpu_online(cpu)) + continue; + a = cpu_tasks_running[0][cpu] + (FIXED_1/200); + b = cpu_tasks_running[1][cpu] + (FIXED_1/200); + c = cpu_tasks_running[2][cpu] + (FIXED_1/200); + len += sprintf(page+len, "%5d %5d.%02d %5d.%02d %5d.%02d %7ld/7%d\n", + cpu, + LOAD_INT(a), LOAD_FRAC(a), + LOAD_INT(b), LOAD_FRAC(b), + LOAD_INT(c), LOAD_FRAC(c), + nr_running_cpu(cpu), nr_threads); + } + return proc_calc_metrics(page, start, off, count, eof, len); +} + static int uptime_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -303,6 +334,9 @@ static struct file_operations proc_vmsta .release = seq_release, }; +extern int schedstats_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data); + #ifdef CONFIG_PROC_HARDWARE static int hardware_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) @@ -359,6 +393,71 @@ static struct file_operations proc_modul }; #endif +#ifdef CONFIG_NUMA +#define K(x) ((x) << (PAGE_SHIFT - 10)) +static int show_meminfo_numa (struct seq_file *m, void *v) +{ + int *d = v; + int nid = *d; + struct sysinfo i; + si_meminfo_node(&i, nid); + seq_printf(m, "\n" + "Node %d MemTotal: %8lu kB\n" + "Node %d MemFree: %8lu kB\n" + "Node %d MemUsed: %8lu kB\n" + "Node %d HighTotal: %8lu kB\n" + "Node %d HighFree: %8lu kB\n" + "Node %d LowTotal: %8lu kB\n" + "Node %d LowFree: %8lu kB\n", + nid, K(i.totalram), + nid, K(i.freeram), + nid, K(i.totalram-i.freeram), + nid, K(i.totalhigh), + nid, K(i.freehigh), + nid, K(i.totalram-i.totalhigh), + nid, K(i.freeram-i.freehigh)); + + return 0; +} +#undef K + +extern struct seq_operations meminfo_numa_op; +static int meminfo_numa_open(struct inode *inode, struct file *file) +{ + return seq_open(file,&meminfo_numa_op); +} + 
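(Aside, not part of the patch: the read handlers added in this hunk become plain read-only text files once proc_misc_init() further down registers them as /proc/real_loadavg, /proc/schedstat and, on CONFIG_NUMA kernels, /proc/meminfo.numa. A minimal user-space reader is enough to sample them; the sketch below assumes a kernel with this patch applied and uses nothing beyond standard stdio calls.)

#include <stdio.h>

/* Dump one of the /proc files added by this patch; the names come from
 * proc_misc_init() further down.  Illustrative only, not kernel code. */
static void dump_proc(const char *path)
{
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
}

int main(void)
{
	dump_proc("/proc/real_loadavg");
	dump_proc("/proc/schedstat");
	dump_proc("/proc/meminfo.numa");	/* only with CONFIG_NUMA */
	return 0;
}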
+static struct file_operations proc_meminfo_numa_operations = { + open: meminfo_numa_open, + read: seq_read, + llseek: seq_lseek, + release: seq_release, +}; + +static void *meminfo_numa_start(struct seq_file *m, loff_t *pos) +{ + return *pos < numnodes ? pos : NULL; +} + +static void *meminfo_numa_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return meminfo_numa_start(m, pos); +} + +static void meminfo_numa_stop(struct seq_file *m, void *v) +{ +} + +struct seq_operations meminfo_numa_op = { + .start = meminfo_numa_start, + .next = meminfo_numa_next, + .stop = meminfo_numa_stop, + .show = show_meminfo_numa, +}; + +#endif + extern struct seq_operations slabinfo_op; extern ssize_t slabinfo_write(struct file *, const char *, size_t, loff_t *); static int slabinfo_open(struct inode *inode, struct file *file) @@ -403,14 +502,20 @@ static int kstat_read_proc(char *page, c jiffies_to_clock_t(idle), jiffies_to_clock_t(iowait)); for (i = 0 ; i < NR_CPUS; i++){ - if (!cpu_online(i)) continue; - len += sprintf(page + len, "cpu%d %u %u %u %u %u\n", + struct sched_info info; + if (!cpu_online(i)) + continue; + cpu_sched_info(&info, i); + len += sprintf(page + len, "cpu%d %u %u %u %u %u %u %u %u\n", i, jiffies_to_clock_t(kstat_cpu(i).cpustat.user), jiffies_to_clock_t(kstat_cpu(i).cpustat.nice), jiffies_to_clock_t(kstat_cpu(i).cpustat.system), jiffies_to_clock_t(kstat_cpu(i).cpustat.idle), - jiffies_to_clock_t(kstat_cpu(i).cpustat.iowait)); + jiffies_to_clock_t(kstat_cpu(i).cpustat.iowait), + (uint) jiffies_to_clock_t(info.inter_arrival_time), + (uint) jiffies_to_clock_t(info.service_time), + (uint) jiffies_to_clock_t(info.response_time)); } len += sprintf(page + len, "intr %u", sum); @@ -610,6 +715,36 @@ static void create_seq_entry(char *name, entry->proc_fops = f; } +#ifdef CONFIG_LOCKMETER +extern ssize_t get_lockmeter_info(char *, size_t, loff_t *); +extern ssize_t put_lockmeter_info(const char *, size_t); +extern int get_lockmeter_info_size(void); + +/* + * This function accesses lock metering information. 
+ */ +static ssize_t read_lockmeter(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + return get_lockmeter_info(buf, count, ppos); +} + +/* + * Writing to /proc/lockmeter resets the counters + */ +static ssize_t write_lockmeter(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + return put_lockmeter_info(buf, count); +} + +static struct file_operations proc_lockmeter_operations = { + NULL, /* lseek */ + read: read_lockmeter, + write: write_lockmeter, +}; +#endif /* CONFIG_LOCKMETER */ + void __init proc_misc_init(void) { struct proc_dir_entry *entry; @@ -618,6 +753,7 @@ void __init proc_misc_init(void) int (*read_proc)(char*,char**,off_t,int,int*,void*); } *p, simple_ones[] = { {"loadavg", loadavg_read_proc}, + {"real_loadavg",real_loadavg_read_proc}, {"uptime", uptime_read_proc}, {"meminfo", meminfo_read_proc}, {"version", version_read_proc}, @@ -636,6 +772,7 @@ void __init proc_misc_init(void) #endif {"locks", locks_read_proc}, {"execdomains", execdomains_read_proc}, + {"schedstat", schedstats_read_proc}, {NULL,} }; for (p = simple_ones; p->name; p++) @@ -659,6 +796,9 @@ void __init proc_misc_init(void) #ifdef CONFIG_MODULES create_seq_entry("modules", 0, &proc_modules_operations); #endif +#ifdef CONFIG_NUMA + create_seq_entry("meminfo.numa",0,&proc_meminfo_numa_operations); +#endif proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL); if (proc_root_kcore) { proc_root_kcore->proc_fops = &proc_kcore_operations; @@ -676,6 +816,13 @@ void __init proc_misc_init(void) entry = create_proc_entry("sysrq-trigger", S_IWUSR, NULL); if (entry) entry->proc_fops = &proc_sysrq_trigger_operations; +#endif +#ifdef CONFIG_LOCKMETER + entry = create_proc_entry("lockmeter", S_IWUSR | S_IRUGO, NULL); + if (entry) { + entry->proc_fops = &proc_lockmeter_operations; + entry->size = get_lockmeter_info_size(); + } #endif #ifdef CONFIG_PPC32 { diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/proc/root.c 900-mjb2/fs/proc/root.c --- 000-virgin/fs/proc/root.c Fri May 30 19:02:19 2003 +++ 900-mjb2/fs/proc/root.c Wed Jun 11 22:56:53 2003 @@ -110,9 +110,9 @@ static int proc_root_readdir(struct file } filp->f_pos = FIRST_PROCESS_ENTRY; } - unlock_kernel(); ret = proc_pid_readdir(filp, dirent, filldir); + unlock_kernel(); return ret; } diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/reiserfs/inode.c 900-mjb2/fs/reiserfs/inode.c --- 000-virgin/fs/reiserfs/inode.c Fri May 30 19:02:19 2003 +++ 900-mjb2/fs/reiserfs/inode.c Wed Jun 11 22:47:02 2003 @@ -306,7 +306,7 @@ research: ** read old data off disk. 
Set the up to date bit on the buffer instead ** and jump to the end */ - if (PageUptodate(bh_result->b_page)) { + if (!bh_result->b_page || PageUptodate(bh_result->b_page)) { set_buffer_uptodate(bh_result); goto finished ; } @@ -420,6 +420,40 @@ static int reiserfs_get_block_create_0 ( return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE) ; } +static int reiserfs_get_block_direct_io (struct inode * inode, + sector_t iblock, unsigned long max_blocks, + struct buffer_head * bh_result, int create) { + int ret ; + + bh_result->b_size = (1 << inode->i_blkbits); + bh_result->b_page = NULL; + + ret = reiserfs_get_block(inode, iblock, bh_result, create) ; + + if (ret != 0) + return ret; + + /* don't allow direct io onto tail pages */ + if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { + /* make sure future calls to the direct io funcs for this offset + ** in the file fail by unmapping the buffer + */ + reiserfs_unmap_buffer(bh_result); + ret = -EINVAL ; + } + + /* Possible unpacked tail. Flush the data before pages have + disappeared */ + if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) { + lock_kernel(); + reiserfs_commit_for_inode(inode); + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; + unlock_kernel(); + } + + return ret ; +} + /* ** helper function for when reiserfs_get_block is called for a hole ** but the file tail is still in a direct item @@ -448,7 +482,7 @@ static int convert_tail_for_hole(struct tail_end = (tail_start | (bh_result->b_size - 1)) + 1 ; index = tail_offset >> PAGE_CACHE_SHIFT ; - if (index != hole_page->index) { + if (!hole_page || index != hole_page->index) { tail_page = grab_cache_page(inode->i_mapping, index) ; retval = -ENOMEM; if (!tail_page) { @@ -554,7 +588,15 @@ int reiserfs_get_block (struct inode * i return ret; } - REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ; + /* If file is of such a size, that it might have a tail and tails are enabled + ** we should mark it as possibly needing tail packing on close + */ + if ( (have_large_tails (inode->i_sb) && + inode->i_size < i_block_size (inode)*4) || + (have_small_tails (inode->i_sb) && + inode->i_size < i_block_size(inode)) ) + + REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ; windex = push_journal_writer("reiserfs_get_block") ; @@ -744,22 +786,27 @@ int reiserfs_get_block (struct inode * i ** the disk */ set_buffer_uptodate (unbh); - - /* we've converted the tail, so we must - ** flush unbh before the transaction commits - */ - add_to_flushlist(inode, unbh) ; - - /* mark it dirty now to prevent commit_write from adding - ** this buffer to the inode's dirty buffer list - */ + /* unbh->b_page == NULL in case of DIRECT_IO request, this means + buffer will disappear shortly, so it should not be added to + any of our lists. + */ + if ( unbh->b_page ) { + /* we've converted the tail, so we must + ** flush unbh before the transaction commits + */ + add_to_flushlist(inode, unbh) ; + + /* mark it dirty now to prevent commit_write from adding + ** this buffer to the inode's dirty buffer list + */ /* * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty(). 
* It's still atomic, but it sets the page dirty too, * which makes it eligible for writeback at any time by the * VM (which was also the case with __mark_buffer_dirty()) */ - mark_buffer_dirty(unbh) ; + mark_buffer_dirty(unbh) ; + } //inode->i_blocks += inode->i_sb->s_blocksize / 512; //mark_tail_converted (inode); @@ -2202,6 +2249,15 @@ static int reiserfs_commit_write(struct if (pos > inode->i_size) { struct reiserfs_transaction_handle th ; reiserfs_write_lock(inode->i_sb); + /* If the file have grown beyond the border where it + can have a tail, unmark it as needing a tail + packing */ + if ( (have_large_tails (inode->i_sb) && + inode->i_size < i_block_size(inode)*4) || + (have_small_tails (inode->i_sb) && + inode->i_size < i_block_size(inode)) ) + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ; + journal_begin(&th, inode->i_sb, 1) ; reiserfs_update_inode_transaction(inode) ; inode->i_size = pos ; @@ -2304,6 +2360,17 @@ static int reiserfs_releasepage(struct p return ret ; } +static int reiserfs_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_dentry->d_inode->i_mapping->host; + + return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, reiserfs_get_block_direct_io); +} + struct address_space_operations reiserfs_address_space_operations = { .writepage = reiserfs_writepage, .readpage = reiserfs_readpage, @@ -2312,5 +2379,6 @@ struct address_space_operations reiserfs .sync_page = block_sync_page, .prepare_write = reiserfs_prepare_write, .commit_write = reiserfs_commit_write, - .bmap = reiserfs_aop_bmap + .bmap = reiserfs_aop_bmap, + .direct_IO = reiserfs_direct_IO } ; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/reiserfs/tail_conversion.c 900-mjb2/fs/reiserfs/tail_conversion.c --- 000-virgin/fs/reiserfs/tail_conversion.c Fri May 30 19:02:19 2003 +++ 900-mjb2/fs/reiserfs/tail_conversion.c Wed Jun 11 22:47:02 2003 @@ -104,8 +104,10 @@ int direct2indirect (struct reiserfs_tra /* we only send the unbh pointer if the buffer is not up to date. ** this avoids overwriting good data from writepage() with old data ** from the disk or buffer cache + ** Special case: unbh->b_page will be NULL if we are coming through + ** DIRECT_IO handler here. 
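reiserfs_get_block_direct_io() and reiserfs_direct_IO() above wire O_DIRECT requests into blockdev_direct_IO(), unmapping buffers that land on packed tails so those requests fall back to buffered I/O. From user space the feature is exercised with an ordinary O_DIRECT open; a minimal sketch follows, assuming the usual alignment constraints (4096 is used as a typical block size and is not taken from the patch):

/*
 * Read the first block of a file with O_DIRECT.
 * Build: cc -o dread dread.c
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define BLK 4096                        /* assumed logical block size */

int main(int argc, char **argv)
{
        void *buf;
        ssize_t n;
        int fd;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <file>\n", argv[0]);
                return 1;
        }
        fd = open(argv[1], O_RDONLY | O_DIRECT);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* O_DIRECT needs an aligned user buffer as well as aligned offsets */
        if (posix_memalign(&buf, BLK, BLK)) {
                fprintf(stderr, "posix_memalign failed\n");
                close(fd);
                return 1;
        }
        n = pread(fd, buf, BLK, 0);
        if (n < 0)
                perror("pread");
        else
                printf("read %zd bytes directly\n", n);
        free(buf);
        close(fd);
        return 0;
}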
*/ - if (buffer_uptodate(unbh) || PageUptodate(unbh->b_page)) { + if (!unbh->b_page || buffer_uptodate(unbh) || PageUptodate(unbh->b_page)) { up_to_date_bh = NULL ; } else { up_to_date_bh = unbh ; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-alpha/lockmeter.h 900-mjb2/include/asm-alpha/lockmeter.h --- 000-virgin/include/asm-alpha/lockmeter.h Wed Dec 31 16:00:00 1969 +++ 900-mjb2/include/asm-alpha/lockmeter.h Wed Jun 11 22:47:01 2003 @@ -0,0 +1,90 @@ +/* + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + * + * Modified by Peter Rival (frival@zk3.dec.com) + */ + +#ifndef _ALPHA_LOCKMETER_H +#define _ALPHA_LOCKMETER_H + +#include +#define CPU_CYCLE_FREQUENCY hwrpb->cycle_freq + +#define get_cycles64() get_cycles() + +#define THIS_CPU_NUMBER smp_processor_id() + +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0) +#define local_irq_save(x) \ + __save_and_cli(x) +#define local_irq_restore(x) \ + __restore_flags(x) +#endif /* Linux version 2.2.x */ + +#define SPINLOCK_MAGIC_INIT /**/ + +/* + * Macros to cache and retrieve an index value inside of a lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. + * We also assume that the hash table has less than 32767 entries. + * the high order bit is used for write locking a rw_lock + * Note: although these defines and macros are the same as what is being used + * in include/asm-i386/lockmeter.h, they are present here to easily + * allow an alternate Alpha implementation. + */ +/* + * instrumented spinlock structure -- never used to allocate storage + * only used in macros below to overlay a spinlock_t + */ +typedef struct inst_spinlock_s { + /* remember, Alpha is little endian */ + unsigned short lock; + unsigned short index; +} inst_spinlock_t; +#define PUT_INDEX(lock_ptr,indexv) ((inst_spinlock_t *)(lock_ptr))->index = indexv +#define GET_INDEX(lock_ptr) ((inst_spinlock_t *)(lock_ptr))->index + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int lock; + unsigned short index; + unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return true if rwlock is write locked + * (note that other lock attempts can cause the lock value to be negative) + */ +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) (((inst_rwlock_t *)rwlock_ptr)->lock & 1) +#define IABS(x) ((x) > 0 ? 
(x) : -(x)) + +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + int tmp = (int) ((inst_rwlock_t *)rwlock_ptr)->lock; + /* readers subtract 2, so we have to: */ + /* - andnot off a possible writer (bit 0) */ + /* - get the absolute value */ + /* - divide by 2 (right shift by one) */ + /* to find the number of readers */ + if (tmp == 0) return(0); + else return(IABS(tmp & ~1)>>1); +} + +#endif /* _ALPHA_LOCKMETER_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-alpha/spinlock.h 900-mjb2/include/asm-alpha/spinlock.h --- 000-virgin/include/asm-alpha/spinlock.h Fri May 30 19:02:20 2003 +++ 900-mjb2/include/asm-alpha/spinlock.h Wed Jun 11 22:47:01 2003 @@ -6,6 +6,10 @@ #include #include +#ifdef CONFIG_LOCKMETER +#undef DEBUG_SPINLOCK +#undef DEBUG_RWLOCK +#endif /* * Simple spin lock operations. There are two variants, one clears IRQ's @@ -95,9 +99,18 @@ static inline int _raw_spin_trylock(spin typedef struct { volatile int write_lock:1, read_counter:31; +#ifdef CONFIG_LOCKMETER + /* required for LOCKMETER since all bits in lock are used */ + /* need this storage for CPU and lock INDEX ............. */ + unsigned magic; +#endif } /*__attribute__((aligned(32)))*/ rwlock_t; +#ifdef CONFIG_LOCKMETER +#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0, 0 } +#else #define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0 } +#endif #define rwlock_init(x) do { *(x) = RW_LOCK_UNLOCKED; } while(0) #define rwlock_is_locked(x) (*(volatile int *)(x) != 0) @@ -168,5 +181,42 @@ static inline void _raw_read_unlock(rwlo : "=m" (*lock), "=&r" (regx) : "m" (*lock) : "memory"); } + +#ifdef CONFIG_LOCKMETER +static inline int _raw_write_trylock(rwlock_t *lock) +{ + long temp,result; + + __asm__ __volatile__( + " ldl_l %1,%0\n" + " mov $31,%2\n" + " bne %1,1f\n" + " or $31,1,%2\n" + " stl_c %2,%0\n" + "1: mb\n" + : "=m" (*(volatile int *)lock), "=&r" (temp), "=&r" (result) + : "m" (*(volatile int *)lock) + ); + + return (result); +} + +static inline int _raw_read_trylock(rwlock_t *lock) +{ + unsigned long temp,result; + + __asm__ __volatile__( + " ldl_l %1,%0\n" + " mov $31,%2\n" + " blbs %1,1f\n" + " subl %1,2,%2\n" + " stl_c %2,%0\n" + "1: mb\n" + : "=m" (*(volatile int *)lock), "=&r" (temp), "=&r" (result) + : "m" (*(volatile int *)lock) + ); + return (result); +} +#endif /* CONFIG_LOCKMETER */ #endif /* _ALPHA_SPINLOCK_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/bug.h 900-mjb2/include/asm-i386/bug.h --- 000-virgin/include/asm-i386/bug.h Mon Mar 17 21:43:48 2003 +++ 900-mjb2/include/asm-i386/bug.h Wed Jun 11 22:42:59 2003 @@ -9,6 +9,11 @@ * undefined" opcode for parsing in the trap handler. 
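The Alpha lockmeter helpers above encode the rwlock state as "bit 0 is the writer, each reader subtracts 2", so rwlock_readers() recovers the count as |lock & ~1| >> 1. A user-space illustration of that decoding, using plain integers rather than a real rwlock_t:

/*
 * Decoding used by the Alpha lockmeter macros above: bit 0 of the lock
 * word is the writer bit, each reader subtracts 2, so the reader count
 * is |lock & ~1| >> 1.  Plain integers only, for demonstration.
 */
#include <stdio.h>
#include <stdlib.h>

static int write_locked(int lock)
{
        return lock & 1;
}

static int readers(int lock)
{
        return abs(lock & ~1) >> 1;     /* mask writer bit, two per reader */
}

int main(void)
{
        int samples[] = { 0, 1, -4, -6 };       /* free, writer, 2 readers, 3 readers */
        size_t i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("lock=%3d  writer=%d  readers=%d\n",
                       samples[i], write_locked(samples[i]), readers(samples[i]));
        return 0;
}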
*/ +#ifdef CONFIG_X86_REMOTE_DEBUG +#define BUG() do { \ + asm ("int $0x3"); \ +} while (0) +#else #if 1 /* Set to zero for a slightly smaller kernel */ #define BUG() \ __asm__ __volatile__( "ud2\n" \ @@ -17,6 +22,7 @@ : : "i" (__LINE__), "i" (__FILE__)) #else #define BUG() __asm__ __volatile__("ud2\n") +#endif #endif #define PAGE_BUG(page) do { \ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/cpu.h 900-mjb2/include/asm-i386/cpu.h --- 000-virgin/include/asm-i386/cpu.h Thu Feb 13 11:08:13 2003 +++ 900-mjb2/include/asm-i386/cpu.h Wed Jun 11 22:51:54 2003 @@ -23,4 +23,6 @@ static inline int arch_register_cpu(int return register_cpu(&cpu_devices[num].cpu, num, parent); } +extern void setup_cpu_idt(void); +extern void setup_node_idts(void); #endif /* _ASM_I386_CPU_H_ */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/desc.h 900-mjb2/include/asm-i386/desc.h --- 000-virgin/include/asm-i386/desc.h Tue Feb 25 23:03:50 2003 +++ 900-mjb2/include/asm-i386/desc.h Wed Jun 11 22:51:54 2003 @@ -2,6 +2,7 @@ #define __ARCH_DESC_H #include +#include #include #ifndef __ASSEMBLY__ @@ -12,14 +13,15 @@ #include extern struct desc_struct cpu_gdt_table[NR_CPUS][GDT_ENTRIES]; +extern struct desc_struct node_idt_table[MAX_NUMNODES][IDT_ENTRIES]; -struct Xgt_desc_struct { +struct Xdt_desc_struct { unsigned short size; unsigned long address __attribute__((packed)); unsigned short pad; } __attribute__ ((packed)); -extern struct Xgt_desc_struct idt_descr, cpu_gdt_descr[NR_CPUS]; +extern struct Xdt_desc_struct node_idt_descr[MAX_NUMNODES], cpu_gdt_descr[NR_CPUS]; #define load_TR_desc() __asm__ __volatile__("ltr %%ax"::"a" (GDT_ENTRY_TSS*8)) #define load_LDT_desc() __asm__ __volatile__("lldt %%ax"::"a" (GDT_ENTRY_LDT*8)) @@ -29,7 +31,8 @@ extern struct Xgt_desc_struct idt_descr, * something other than this. 
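Redefining BUG() as int $0x3 under CONFIG_X86_REMOTE_DEBUG (above) makes a BUG land in the breakpoint handler, where the gdb stub added by this patch takes over, instead of raising an invalid-opcode fault. In user space the same instruction simply raises SIGTRAP; a throwaway demonstration of the mechanism, not the kernel path:

/*
 * int3 raises a breakpoint trap.  Run standalone, it arrives as SIGTRAP
 * and is caught here; run under gdb, the debugger stops on it instead,
 * which is the behaviour the CONFIG_X86_REMOTE_DEBUG BUG() above relies
 * on inside the kernel.  x86 only.
 */
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

static void on_trap(int sig)
{
        printf("caught SIGTRAP (%d)\n", sig);   /* fine for a demo */
        exit(0);
}

int main(void)
{
        signal(SIGTRAP, on_trap);
        __asm__ __volatile__("int $0x3");
        printf("not reached\n");
        return 0;
}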
*/ extern struct desc_struct default_ldt[]; -extern void set_intr_gate(unsigned int irq, void * addr); +extern void node_set_intr_gate(unsigned int node, unsigned int vector, void * addr); +extern void set_intr_gate(unsigned int n, void *addr); #define _set_tssldt_desc(n,addr,limit,type) \ __asm__ __volatile__ ("movw %w3,0(%2)\n\t" \ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/early_printk.h 900-mjb2/include/asm-i386/early_printk.h --- 000-virgin/include/asm-i386/early_printk.h Wed Dec 31 16:00:00 1969 +++ 900-mjb2/include/asm-i386/early_printk.h Wed Jun 11 22:42:36 2003 @@ -0,0 +1,8 @@ +#ifndef __EARLY_PRINTK_H_I386_ +#define __EARLY_PRINTK_H_i386_ + +#define VGABASE 0xB8000 +#define SERIAL_BASES { 0x3f8, 0x2f8 } +#define SERIAL_BASES_LEN 2 + +#endif diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/ioctls.h 900-mjb2/include/asm-i386/ioctls.h --- 000-virgin/include/asm-i386/ioctls.h Tue Apr 8 14:38:20 2003 +++ 900-mjb2/include/asm-i386/ioctls.h Wed Jun 11 22:42:59 2003 @@ -68,6 +68,7 @@ #define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */ #define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */ #define FIOQSIZE 0x5460 +#define TIOCGDB 0x547F /* enable GDB stub mode on this tty */ /* Used for packet mode */ #define TIOCPKT_DATA 0 diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/linkage.h 900-mjb2/include/asm-i386/linkage.h --- 000-virgin/include/asm-i386/linkage.h Sun Nov 17 20:29:46 2002 +++ 900-mjb2/include/asm-i386/linkage.h Wed Jun 11 22:46:30 2003 @@ -3,6 +3,7 @@ #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) #define FASTCALL(x) x __attribute__((regparm(3))) +#define IRQHANDLER(x) x __attribute__((regparm(1))) #ifdef CONFIG_X86_ALIGNMENT_16 #define __ALIGN .align 16,0x90 diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/lockmeter.h 900-mjb2/include/asm-i386/lockmeter.h --- 000-virgin/include/asm-i386/lockmeter.h Wed Dec 31 16:00:00 1969 +++ 900-mjb2/include/asm-i386/lockmeter.h Wed Jun 11 22:47:01 2003 @@ -0,0 +1,127 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + * + * Modified by Ray Bryant (raybry@us.ibm.com) + * Changes Copyright (C) 2000 IBM, Inc. + * Added save of index in spinlock_t to improve efficiency + * of "hold" time reporting for spinlocks. + * Added support for hold time statistics for read and write + * locks. + * Moved machine dependent code here from include/lockmeter.h. + * + */ + +#ifndef _I386_LOCKMETER_H +#define _I386_LOCKMETER_H + +#include +#include + +#include + +#ifdef __KERNEL__ +extern unsigned long cpu_khz; +#define CPU_CYCLE_FREQUENCY (cpu_khz * 1000) +#else +#define CPU_CYCLE_FREQUENCY 450000000 +#endif + +#define THIS_CPU_NUMBER smp_processor_id() + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0) +#define local_irq_save(x) \ + __asm__ __volatile__("pushfl ; popl %0 ; cli":"=g" (x): /* no input */ :"memory") + +#define local_irq_restore(x) \ + __asm__ __volatile__("pushl %0 ; popfl": /* no output */ :"g" (x):"memory") +#endif /* Linux version 2.2.x */ + +/* + * macros to cache and retrieve an index value inside of a spin lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. Not normally a problem!! + * we also assume that the hash table has less than 65535 entries. 
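The scheme this comment describes, together with the inst_spinlock_t overlay that follows, caches a 16-bit hash-table index in the upper half of the 32-bit lock word while the lower half remains the live lock value. A little-endian user-space illustration of that packing (union punning is used here purely to make the layout visible):

/*
 * Little-endian packing used by lockmeter: the low 16 bits stay the
 * live lock value, the high 16 bits cache a hash-table index.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        union {
                uint32_t word;                  /* the whole lock word */
                struct {
                        uint16_t lock;          /* low half: lock value */
                        uint16_t index;         /* high half: cached index */
                } ov;
        } u = { .word = 1 };                    /* 1 == unlocked spinlock on i386 */

        u.ov.index = 42;                        /* what PUT_INDEX() does */
        printf("word = 0x%08x  lock = %u  index = %u\n",
               (unsigned)u.word, u.ov.lock, u.ov.index);  /* GET_INDEX() */
        return 0;
}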
+ */ +/* + * instrumented spinlock structure -- never used to allocate storage + * only used in macros below to overlay a spinlock_t + */ +typedef struct inst_spinlock_s { + /* remember, Intel is little endian */ + unsigned short lock; + unsigned short index; +} inst_spinlock_t; +#define PUT_INDEX(lock_ptr,indexv) ((inst_spinlock_t *)(lock_ptr))->index = indexv +#define GET_INDEX(lock_ptr) ((inst_spinlock_t *)(lock_ptr))->index + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int lock; + unsigned short index; + unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return the number of readers for a rwlock_t + */ +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) + +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + int tmp = (int) rwlock_ptr->lock; + /* read and write lock attempts may cause the lock value to temporarily */ + /* be negative. Until it is >= 0 we know nothing (i. e. can't tell if */ + /* is -1 because it was write locked and somebody tried to read lock it */ + /* or if it is -1 because it was read locked and somebody tried to write*/ + /* lock it. ........................................................... */ + do { + tmp = (int) rwlock_ptr->lock; + } while (tmp < 0); + if (tmp == 0) return(0); + else return(RW_LOCK_BIAS-tmp); +} + +/* + * return true if rwlock is write locked + * (note that other lock attempts can cause the lock value to be negative) + */ +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock <= 0) +#define IABS(x) ((x) > 0 ? 
(x) : -(x)) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((IABS((rwlock_ptr)->lock) % RW_LOCK_BIAS) != 0) + +/* this is a lot of typing just to get gcc to emit "rdtsc" */ +static inline long long get_cycles64 (void) +{ +#ifndef CONFIG_X86_TSC + #error this code requires CONFIG_X86_TSC +#else + union longlong_u { + long long intlong; + struct intint_s { + uint32_t eax; + uint32_t edx; + } intint; + } longlong; + + rdtsc(longlong.intint.eax,longlong.intint.edx); + return longlong.intlong; +#endif +} + +#endif /* _I386_LOCKMETER_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/mach-bigsmp/mach_apic.h 900-mjb2/include/asm-i386/mach-bigsmp/mach_apic.h --- 000-virgin/include/asm-i386/mach-bigsmp/mach_apic.h Fri May 30 19:02:20 2003 +++ 900-mjb2/include/asm-i386/mach-bigsmp/mach_apic.h Wed Jun 11 22:42:41 2003 @@ -22,7 +22,7 @@ static inline int apic_id_registered(voi #define APIC_DFR_VALUE (APIC_DFR_CLUSTER) static inline unsigned long target_cpus(void) { - return ((cpu_online_map < 0xf)?cpu_online_map:0xf); + return cpu_online_map; } #define TARGET_CPUS (target_cpus()) @@ -151,7 +151,7 @@ static inline unsigned int cpu_mask_to_a if (apicid_cluster(apicid) != apicid_cluster(new_apicid)){ printk ("%s: Not a valid mask!\n",__FUNCTION__); - return TARGET_CPUS; + return 0xFF; } apicid = apicid | new_apicid; cpus_found++; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/mach-default/irq_vectors.h 900-mjb2/include/asm-i386/mach-default/irq_vectors.h --- 000-virgin/include/asm-i386/mach-default/irq_vectors.h Sun Apr 20 19:35:05 2003 +++ 900-mjb2/include/asm-i386/mach-default/irq_vectors.h Wed Jun 11 22:51:54 2003 @@ -68,15 +68,22 @@ #define TIMER_IRQ 0 /* - * 16 8259A IRQ's, 208 potential APIC interrupt sources. - * Right now the APIC is mostly only used for SMP. - * 256 vectors is an architectural limit. (we can have - * more than 256 devices theoretically, but they will - * have to use shared interrupts) + * 16 8259A IRQ's, MAX_IRQ_SOURCES-16 potential APIC + * interrupt sources. Right now the APIC is mostly only + * used for SMP. 256 vectors is an architectural limit. 
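get_cycles64() above issues rdtsc and stitches EAX/EDX into a 64-bit cycle count, which lockmeter later scales by CPU_CYCLE_FREQUENCY. A user-space equivalent, x86 only and assuming a constant-rate TSC:

/*
 * User-space version of the rdtsc-based get_cycles64() above.
 */
#include <stdio.h>
#include <stdint.h>

static inline uint64_t cycles64(void)
{
        uint32_t lo, hi;

        __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));
        return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
        uint64_t t0 = cycles64();
        volatile unsigned long spin;

        for (spin = 0; spin < 10000000UL; spin++)
                ;                               /* burn some cycles */
        printf("delta = %llu cycles\n", (unsigned long long)(cycles64() - t0));
        return 0;
}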
+ * (we can have more than 256 devices theoretically, but + * they will have to use shared interrupts) * Since vectors 0x00-0x1f are used/reserved for the CPU, * the usable vector space is 0x20-0xff (224 vectors) + * Linux currently makes 190 vectors available for io interrupts + * starting at FIRST_DEVICE_VECTOR till FIRST_SYSTEM_VECTOR + * + * 0________0x31__________________________0xef_________0xff + * system io interrupts resvd/smp + * */ #ifdef CONFIG_X86_IO_APIC +#define NR_IRQ_VECTORS 190 #define NR_IRQS 224 #else #define NR_IRQS 16 diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/mach-numaq/mach_apic.h 900-mjb2/include/asm-i386/mach-numaq/mach_apic.h --- 000-virgin/include/asm-i386/mach-numaq/mach_apic.h Fri May 30 19:02:20 2003 +++ 900-mjb2/include/asm-i386/mach-numaq/mach_apic.h Wed Jun 11 22:42:41 2003 @@ -6,7 +6,7 @@ #define APIC_DFR_VALUE (APIC_DFR_CLUSTER) -#define TARGET_CPUS (0xf) +#define TARGET_CPUS (~0UL) #define NO_BALANCE_IRQ (1) #define esr_disable (1) @@ -107,37 +107,13 @@ static inline int check_phys_apicid_pres return (1); } +/* + * We use physical apicids here, not logical, so just return the default + * physical broadcast to stop people from breaking us + */ static inline unsigned int cpu_mask_to_apicid (unsigned long cpumask) { - int num_bits_set; - int cpus_found = 0; - int cpu; - int apicid; - - num_bits_set = hweight32(cpumask); - /* Return id to all */ - if (num_bits_set == 32) - return (int) 0xFF; - /* - * The cpus in the mask must all be on the apic cluster. If are not - * on the same apicid cluster return default value of TARGET_CPUS. - */ - cpu = ffs(cpumask)-1; - apicid = cpu_to_logical_apicid(cpu); - while (cpus_found < num_bits_set) { - if (cpumask & (1 << cpu)) { - int new_apicid = cpu_to_logical_apicid(cpu); - if (apicid_cluster(apicid) != - apicid_cluster(new_apicid)){ - printk ("%s: Not a valid mask!\n",__FUNCTION__); - return TARGET_CPUS; - } - apicid = apicid | new_apicid; - cpus_found++; - } - cpu++; - } - return apicid; + return (int) 0xF; } #endif /* __ASM_MACH_APIC_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/mach-summit/mach_apic.h 900-mjb2/include/asm-i386/mach-summit/mach_apic.h --- 000-virgin/include/asm-i386/mach-summit/mach_apic.h Fri May 30 19:02:20 2003 +++ 900-mjb2/include/asm-i386/mach-summit/mach_apic.h Wed Jun 11 22:42:41 2003 @@ -4,13 +4,7 @@ #include #include -#ifdef CONFIG_X86_GENERICARCH -#define x86_summit 1 /* must be an constant expressiona for generic arch */ -#else -extern int x86_summit; -#endif - -#define esr_disable (x86_summit ? 1 : 0) +#define esr_disable (1) #define NO_BALANCE_IRQ (0) #define XAPIC_DEST_CPUS_MASK 0x0Fu @@ -22,27 +16,27 @@ static inline unsigned long xapic_phys_t ((phys_apic) & XAPIC_DEST_CLUSTER_MASK) ); } -#define APIC_DFR_VALUE (x86_summit ? APIC_DFR_CLUSTER : APIC_DFR_FLAT) +#define APIC_DFR_VALUE (APIC_DFR_CLUSTER) static inline unsigned long target_cpus(void) { - return (x86_summit ? XAPIC_DEST_CPUS_MASK : cpu_online_map); + return (~0UL); } #define TARGET_CPUS (target_cpus()) -#define INT_DELIVERY_MODE (x86_summit ? dest_Fixed : dest_LowestPrio) +#define INT_DELIVERY_MODE (dest_Fixed) #define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */ #define APIC_BROADCAST_ID (0x0F) static inline unsigned long check_apicid_used(unsigned long bitmap, int apicid) { - return (x86_summit ? 
0 : (bitmap & (1 << apicid))); + return 0; } /* we don't use the phys_cpu_present_map to indicate apicid presence */ static inline unsigned long check_apicid_present(int bit) { - return (x86_summit ? 1 : (phys_cpu_present_map & (1 << bit))); + return 1; } #define apicid_cluster(apicid) (apicid & 0xF0) @@ -53,10 +47,7 @@ static inline void init_apic_ldr(void) { unsigned long val, id; - if (x86_summit) - id = xapic_phys_to_log_apicid(hard_smp_processor_id()); - else - id = 1UL << smp_processor_id(); + id = xapic_phys_to_log_apicid(hard_smp_processor_id()); apic_write_around(APIC_DFR, APIC_DFR_VALUE); val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; val |= SET_APIC_LOGICAL_ID(id); @@ -75,8 +66,8 @@ static inline int apic_id_registered(voi static inline void clustered_apic_check(void) { - printk("Enabling APIC mode: %s. Using %d I/O APICs\n", - (x86_summit ? "Summit" : "Flat"), nr_ioapics); + printk("Enabling APIC mode: Summit. Using %d I/O APICs\n", + nr_ioapics); } static inline int apicid_to_node(int logical_apicid) @@ -93,24 +84,18 @@ static inline int cpu_to_logical_apicid( static inline int cpu_present_to_apicid(int mps_cpu) { - if (x86_summit) - return (int) bios_cpu_apicid[mps_cpu]; - else - return mps_cpu; + return (int) bios_cpu_apicid[mps_cpu]; } static inline ulong ioapic_phys_id_map(ulong phys_map) { /* For clustered we don't have a good way to do this yet - hack */ - return (x86_summit ? 0x0F : phys_map); + return 0x0F; } static inline unsigned long apicid_to_cpu_present(int apicid) { - if (x86_summit) - return 1; - else - return (1ul << apicid); + return 1; } static inline int mpc_apic_id(struct mpc_config_processor *m, @@ -130,10 +115,7 @@ static inline void setup_portio_remap(vo static inline int check_phys_apicid_present(int boot_cpu_physical_apicid) { - if (x86_summit) - return (1); - else - return test_bit(boot_cpu_physical_apicid, &phys_cpu_present_map); + return 1; } static inline unsigned int cpu_mask_to_apicid (unsigned long cpumask) @@ -159,7 +141,7 @@ static inline unsigned int cpu_mask_to_a if (apicid_cluster(apicid) != apicid_cluster(new_apicid)){ printk ("%s: Not a valid mask!\n",__FUNCTION__); - return TARGET_CPUS; + return 0xFF; } apicid = apicid | new_apicid; cpus_found++; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/mach-summit/mach_mpparse.h 900-mjb2/include/asm-i386/mach-summit/mach_mpparse.h --- 000-virgin/include/asm-i386/mach-summit/mach_mpparse.h Fri May 30 19:02:20 2003 +++ 900-mjb2/include/asm-i386/mach-summit/mach_mpparse.h Wed Jun 11 22:47:01 2003 @@ -1,6 +1,8 @@ #ifndef __ASM_MACH_MPPARSE_H #define __ASM_MACH_MPPARSE_H +#include + extern int use_cyclone; static inline void mpc_oem_bus_info(struct mpc_config_bus *m, char *name, @@ -21,9 +23,6 @@ static inline int mps_oem_check(struct m (!strncmp(productid, "VIGIL SMP", 9) || !strncmp(productid, "EXA", 3) || !strncmp(productid, "RUTHLESS SMP", 12))){ -#ifndef CONFIG_X86_GENERICARCH - x86_summit = 1; -#endif use_cyclone = 1; /*enable cyclone-timer*/ return 1; } @@ -36,12 +35,76 @@ static inline int acpi_madt_oem_check(ch if (!strncmp(oem_id, "IBM", 3) && (!strncmp(oem_table_id, "SERVIGIL", 8) || !strncmp(oem_table_id, "EXA", 3))){ -#ifndef CONFIG_X86_GENERICARCH - x86_summit = 1; -#endif use_cyclone = 1; /*enable cyclone-timer*/ return 1; } return 0; } + +struct rio_table_hdr { + unsigned char version; /* Version number of this data structure */ + /* Version 3 adds chassis_num & WP_index */ + unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil) */ + unsigned 
char num_rio_dev; /* # of RIO I/O devices (Cyclones and Winnipegs) */ +} __attribute__((packed)); + +struct scal_detail { + unsigned char node_id; /* Scalability Node ID */ + unsigned long CBAR; /* Address of 1MB register space */ + unsigned char port0node; /* Node ID port connected to: 0xFF=None */ + unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */ + unsigned char port1node; /* Node ID port connected to: 0xFF = None */ + unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */ + unsigned char port2node; /* Node ID port connected to: 0xFF = None */ + unsigned char port2port; /* Port num port connected to: 0,1,2, or 0xFF=None */ + unsigned char chassis_num; /* 1 based Chassis number (1 = boot node) */ +} __attribute__((packed)); + +struct rio_detail { + unsigned char node_id; /* RIO Node ID */ + unsigned long BBAR; /* Address of 1MB register space */ + unsigned char type; /* Type of device */ + unsigned char owner_id; /* For WPEG: Node ID of Cyclone that owns this WPEG*/ + /* For CYC: Node ID of Twister that owns this CYC */ + unsigned char port0node; /* Node ID port connected to: 0xFF=None */ + unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */ + unsigned char port1node; /* Node ID port connected to: 0xFF=None */ + unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */ + unsigned char first_slot; /* For WPEG: Lowest slot number below this WPEG */ + /* For CYC: 0 */ + unsigned char status; /* For WPEG: Bit 0 = 1 : the XAPIC is used */ + /* = 0 : the XAPIC is not used, ie:*/ + /* ints fwded to another XAPIC */ + /* Bits1:7 Reserved */ + /* For CYC: Bits0:7 Reserved */ + unsigned char WP_index; /* For WPEG: WPEG instance index - lower ones have */ + /* lower slot numbers/PCI bus numbers */ + /* For CYC: No meaning */ + unsigned char chassis_num; /* 1 based Chassis number */ + /* For LookOut WPEGs this field indicates the */ + /* Expansion Chassis #, enumerated from Boot */ + /* Node WPEG external port, then Boot Node CYC */ + /* external port, then Next Vigil chassis WPEG */ + /* external port, etc. 
*/ + /* Shared Lookouts have only 1 chassis number (the */ + /* first one assigned) */ +} __attribute__((packed)); + + +typedef enum { + CompatTwister = 0, /* Compatibility Twister */ + AltTwister = 1, /* Alternate Twister of internal 8-way */ + CompatCyclone = 2, /* Compatibility Cyclone */ + AltCyclone = 3, /* Alternate Cyclone of internal 8-way */ + CompatWPEG = 4, /* Compatibility WPEG */ + AltWPEG = 5, /* Second Planar WPEG */ + LookOutAWPEG = 6, /* LookOut WPEG */ + LookOutBWPEG = 7, /* LookOut WPEG */ +} node_type; + +static inline int is_WPEG(node_type type){ + return (type == CompatWPEG || type == AltWPEG || + type == LookOutAWPEG || type == LookOutBWPEG); +} + #endif /* __ASM_MACH_MPPARSE_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/mpspec.h 900-mjb2/include/asm-i386/mpspec.h --- 000-virgin/include/asm-i386/mpspec.h Fri May 30 19:02:20 2003 +++ 900-mjb2/include/asm-i386/mpspec.h Wed Jun 11 22:47:01 2003 @@ -222,6 +222,10 @@ extern unsigned long mp_lapic_addr; extern int pic_mode; extern int using_apic_timer; +#ifdef CONFIG_X86_SUMMIT +extern void setup_summit (void); +#endif + #ifdef CONFIG_ACPI_BOOT extern void mp_register_lapic (u8 id, u8 enabled); extern void mp_register_lapic_address (u64 address); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/numaq.h 900-mjb2/include/asm-i386/numaq.h --- 000-virgin/include/asm-i386/numaq.h Mon Mar 17 21:43:48 2003 +++ 900-mjb2/include/asm-i386/numaq.h Wed Jun 11 22:51:54 2003 @@ -29,6 +29,8 @@ #ifdef CONFIG_X86_NUMAQ #define MAX_NUMNODES 8 + +#ifndef __ASSEMBLY__ extern void get_memcfg_numaq(void); #define get_memcfg_numa() get_memcfg_numaq() @@ -161,6 +163,7 @@ static inline unsigned long *get_zholes_ { return 0; } +#endif /* __ASSEMBLY__ */ #endif /* CONFIG_X86_NUMAQ */ #endif /* NUMAQ_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/page.h 900-mjb2/include/asm-i386/page.h --- 000-virgin/include/asm-i386/page.h Tue Apr 8 14:38:20 2003 +++ 900-mjb2/include/asm-i386/page.h Wed Jun 11 22:46:29 2003 @@ -3,7 +3,11 @@ /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 -#define PAGE_SIZE (1UL << PAGE_SHIFT) +#ifndef __ASSEMBLY__ +#define PAGE_SIZE (1UL << PAGE_SHIFT) +#else +#define PAGE_SIZE (1 << PAGE_SHIFT) +#endif #define PAGE_MASK (~(PAGE_SIZE-1)) #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) @@ -115,9 +119,26 @@ static __inline__ int get_order(unsigned #endif /* __ASSEMBLY__ */ #ifdef __ASSEMBLY__ -#define __PAGE_OFFSET (0xC0000000) +#include +#ifdef CONFIG_05GB +#define __PAGE_OFFSET (0xE0000000) +#elif defined(CONFIG_1GB) +#define __PAGE_OFFSET (0xC0000000) +#elif defined(CONFIG_2GB) +#define __PAGE_OFFSET (0x80000000) +#elif defined(CONFIG_3GB) +#define __PAGE_OFFSET (0x40000000) +#endif #else -#define __PAGE_OFFSET (0xC0000000UL) +#ifdef CONFIG_05GB +#define __PAGE_OFFSET (0xE0000000UL) +#elif defined(CONFIG_1GB) +#define __PAGE_OFFSET (0xC0000000UL) +#elif defined(CONFIG_2GB) +#define __PAGE_OFFSET (0x80000000UL) +#elif defined(CONFIG_3GB) +#define __PAGE_OFFSET (0x40000000UL) +#endif #endif diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/param.h 900-mjb2/include/asm-i386/param.h --- 000-virgin/include/asm-i386/param.h Sun Nov 17 20:29:26 2002 +++ 900-mjb2/include/asm-i386/param.h Wed Jun 11 22:42:36 2003 @@ -2,10 +2,18 @@ #define _ASMi386_PARAM_H #ifdef __KERNEL__ -# define HZ 1000 /* Internal kernel timer frequency */ -# define USER_HZ 100 /* .. 
some user interfaces are in "ticks" */ -# define CLOCKS_PER_SEC (USER_HZ) /* like times() */ +#include + +#ifdef CONFIG_1000HZ +# define HZ 1000 /* Internal kernel timer frequency */ +#else +# define HZ 100 #endif + +#define USER_HZ 100 /* .. some user interfaces are in "ticks" */ +#define CLOCKS_PER_SEC (USER_HZ) /* like times() */ + +#endif /* __KERNEL__ */ #ifndef HZ #define HZ 100 diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/posix_types.h 900-mjb2/include/asm-i386/posix_types.h --- 000-virgin/include/asm-i386/posix_types.h Sun Apr 20 19:35:05 2003 +++ 900-mjb2/include/asm-i386/posix_types.h Wed Jun 11 22:48:35 2003 @@ -7,7 +7,7 @@ * assume GCC is being used. */ -typedef unsigned short __kernel_dev_t; +typedef unsigned long __kernel_dev_t; typedef unsigned long __kernel_ino_t; typedef unsigned short __kernel_mode_t; typedef unsigned short __kernel_nlink_t; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/processor.h 900-mjb2/include/asm-i386/processor.h --- 000-virgin/include/asm-i386/processor.h Fri May 30 19:02:20 2003 +++ 900-mjb2/include/asm-i386/processor.h Wed Jun 11 22:42:59 2003 @@ -288,7 +288,11 @@ extern unsigned int mca_pentium_flag; /* This decides where the kernel will search for a free chunk of vm * space during mmap's. */ +#ifdef CONFIG_05GB +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 16)) +#else #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) +#endif /* * Size of io_bitmap in longwords: 32 is ports 0-0x3ff. @@ -406,6 +410,9 @@ struct thread_struct { unsigned int saved_fs, saved_gs; /* IO permissions */ unsigned long *ts_io_bitmap; +#ifdef CONFIG_X86_REMOTE_DEBUG + struct pt_regs *kgdbregs; +#endif }; #define INIT_THREAD { \ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/rwlock.h 900-mjb2/include/asm-i386/rwlock.h --- 000-virgin/include/asm-i386/rwlock.h Sun Nov 17 20:29:57 2002 +++ 900-mjb2/include/asm-i386/rwlock.h Wed Jun 11 22:47:00 2003 @@ -20,28 +20,52 @@ #define RW_LOCK_BIAS 0x01000000 #define RW_LOCK_BIAS_STR "0x01000000" -#define __build_read_lock_ptr(rw, helper) \ - asm volatile(LOCK "subl $1,(%0)\n\t" \ - "js 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tcall " helper "\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - ::"a" (rw) : "memory") - -#define __build_read_lock_const(rw, helper) \ - asm volatile(LOCK "subl $1,%0\n\t" \ - "js 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tpushl %%eax\n\t" \ - "leal %0,%%eax\n\t" \ - "call " helper "\n\t" \ - "popl %%eax\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - :"=m" (*(volatile int *)rw) : : "memory") +#ifdef CONFIG_SPINLINE + + #define __build_read_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $1,(%0)\n\t" \ + "jns 1f\n\t" \ + "call " helper "\n\t" \ + "1:\t" \ + ::"a" (rw) : "memory") + + #define __build_read_lock_const(rw, helper) \ + asm volatile(LOCK "subl $1,%0\n\t" \ + "jns 1f\n\t" \ + "pushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "1:\t" \ + :"=m" (*(volatile int *)rw) : : "memory") + +#else /* !CONFIG_SPINLINE */ + + #define __build_read_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $1,(%0)\n\t" \ + "js 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tcall " helper "\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + ::"a" (rw) : "memory") + + #define __build_read_lock_const(rw, helper) \ + asm volatile(LOCK "subl $1,%0\n\t" \ + "js 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tpushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" 
\ + "jmp 1b\n" \ + LOCK_SECTION_END \ + :"=m" (*(volatile int *)rw) : : "memory") + +#endif /* CONFIG_SPINLINE */ + #define __build_read_lock(rw, helper) do { \ if (__builtin_constant_p(rw)) \ @@ -50,28 +74,51 @@ __build_read_lock_ptr(rw, helper); \ } while (0) -#define __build_write_lock_ptr(rw, helper) \ - asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ - "jnz 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tcall " helper "\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - ::"a" (rw) : "memory") - -#define __build_write_lock_const(rw, helper) \ - asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ - "jnz 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tpushl %%eax\n\t" \ - "leal %0,%%eax\n\t" \ - "call " helper "\n\t" \ - "popl %%eax\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - :"=m" (*(volatile int *)rw) : : "memory") +#ifdef CONFIG_SPINLINE + + #define __build_write_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ + "jz 1f\n\t" \ + "call " helper "\n\t" \ + "1:\n" \ + ::"a" (rw) : "memory") + + #define __build_write_lock_const(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ + "jz 1f\n\t" \ + "pushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "1:\n" \ + :"=m" (*(volatile int *)rw) : : "memory") + +#else /* !CONFIG_SPINLINE */ + + #define __build_write_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ + "jnz 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tcall " helper "\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + ::"a" (rw) : "memory") + + #define __build_write_lock_const(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ + "jnz 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tpushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + :"=m" (*(volatile int *)rw) : : "memory") + +#endif /* CONFIG_SPINLINE */ #define __build_write_lock(rw, helper) do { \ if (__builtin_constant_p(rw)) \ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/segment.h 900-mjb2/include/asm-i386/segment.h --- 000-virgin/include/asm-i386/segment.h Tue Feb 25 23:03:50 2003 +++ 900-mjb2/include/asm-i386/segment.h Wed Jun 11 22:51:54 2003 @@ -94,5 +94,5 @@ * of tasks we can have.. */ #define IDT_ENTRIES 256 - +#define IDT_SIZE (IDT_ENTRIES * 8) #endif diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/semaphore.h 900-mjb2/include/asm-i386/semaphore.h --- 000-virgin/include/asm-i386/semaphore.h Wed Mar 5 07:37:06 2003 +++ 900-mjb2/include/asm-i386/semaphore.h Wed Jun 11 22:55:23 2003 @@ -96,39 +96,48 @@ static inline void init_MUTEX_LOCKED (st sema_init(sem, 0); } -asmlinkage void __down_failed(void /* special register calling convention */); +asmlinkage int __down_failed_wq(void /* special register calling convention */); asmlinkage int __down_failed_interruptible(void /* params in registers */); asmlinkage int __down_failed_trylock(void /* params in registers */); asmlinkage void __up_wakeup(void /* special register calling convention */); -asmlinkage void __down(struct semaphore * sem); +asmlinkage int __down_wq(struct semaphore * sem, wait_queue_t *wait); asmlinkage int __down_interruptible(struct semaphore * sem); asmlinkage int __down_trylock(struct semaphore * sem); asmlinkage void __up(struct semaphore * sem); /* * This is ugly, but we want the default case to fall through. 
- * "__down_failed" is a special asm handler that calls the C + * "__down_failed_wq" is a special asm handler that calls the C * routine that actually waits. See arch/i386/kernel/semaphore.c */ -static inline void down(struct semaphore * sem) +static inline int down_wq(struct semaphore * sem, wait_queue_t *wait) { + int result; + #ifdef WAITQUEUE_DEBUG CHECK_MAGIC(sem->__magic); #endif might_sleep(); __asm__ __volatile__( "# atomic down operation\n\t" - LOCK "decl %0\n\t" /* --sem->count */ - "js 2f\n" + LOCK "decl %1\n\t" /* --sem->count */ + "js 2f\n\t" + "xorl %0,%0\n" "1:\n" LOCK_SECTION_START("") - "2:\tcall __down_failed\n\t" + "2:\tcall __down_failed_wq\n\t" "jmp 1b\n" LOCK_SECTION_END - :"=m" (sem->count) - :"c" (sem) + :"=a" (result), "=m" (sem->count) + :"c" (sem), "d" (wait) :"memory"); + return result; +} + +static inline void down(struct semaphore * sem) +{ + down_wq(sem, NULL); } /* diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/spinlock.h 900-mjb2/include/asm-i386/spinlock.h --- 000-virgin/include/asm-i386/spinlock.h Fri May 30 19:02:20 2003 +++ 900-mjb2/include/asm-i386/spinlock.h Wed Jun 11 22:47:01 2003 @@ -43,18 +43,35 @@ typedef struct { #define spin_is_locked(x) (*(volatile signed char *)(&(x)->lock) <= 0) #define spin_unlock_wait(x) do { barrier(); } while(spin_is_locked(x)) -#define spin_lock_string \ - "\n1:\t" \ - "lock ; decb %0\n\t" \ - "js 2f\n" \ - LOCK_SECTION_START("") \ - "2:\t" \ - "rep;nop\n\t" \ - "cmpb $0,%0\n\t" \ - "jle 2b\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END +#ifdef CONFIG_SPINLINE + #define spin_lock_string \ + "\n1:\t" \ + "lock ; decb %0\n\t" \ + "js 2f\n" \ + "jmp 3f\n" \ + "2:\t" \ + "rep;nop\n\t" \ + "cmpb $0,%0\n\t" \ + "jle 2b\n\t" \ + "jmp 1b\n" \ + "3:\t" + +#else /* !CONFIG_SPINLINE */ + + #define spin_lock_string \ + "\n1:\t" \ + "lock ; decb %0\n\t" \ + "js 2f\n" \ + LOCK_SECTION_START("") \ + "2:\t" \ + "rep;nop\n\t" \ + "cmpb $0,%0\n\t" \ + "jle 2b\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END + +#endif /* CONFIG_SPINLINE */ /* * This works. Despite all the confusion. 
* (except on PPro SMP or if we are using OOSTORE) @@ -138,6 +155,11 @@ here: */ typedef struct { volatile unsigned int lock; +#if CONFIG_LOCKMETER + /* required for LOCKMETER since all bits in lock are used */ + /* and we need this storage for CPU and lock INDEX */ + unsigned lockmeter_magic; +#endif #ifdef CONFIG_DEBUG_SPINLOCK unsigned magic; #endif @@ -145,11 +167,19 @@ typedef struct { #define RWLOCK_MAGIC 0xdeaf1eed +#ifdef CONFIG_LOCKMETER +#if CONFIG_DEBUG_SPINLOCK +#define RWLOCK_MAGIC_INIT , 0, RWLOCK_MAGIC +#else +#define RWLOCK_MAGIC_INIT , 0 +#endif +#else /* !CONFIG_LOCKMETER */ #ifdef CONFIG_DEBUG_SPINLOCK #define RWLOCK_MAGIC_INIT , RWLOCK_MAGIC #else #define RWLOCK_MAGIC_INIT /* */ #endif +#endif /* !CONFIG_LOCKMETER */ #define RW_LOCK_UNLOCKED (rwlock_t) { RW_LOCK_BIAS RWLOCK_MAGIC_INIT } @@ -195,5 +225,59 @@ static inline int _raw_write_trylock(rwl atomic_add(RW_LOCK_BIAS, count); return 0; } + +#ifdef CONFIG_LOCKMETER +static inline int _raw_read_trylock(rwlock_t *lock) +{ +/* FIXME -- replace with assembler */ + atomic_t *count = (atomic_t *)lock; + atomic_dec(count); + if (count->counter > 0) + return 1; + atomic_inc(count); + return 0; +} +#endif + +#if defined(CONFIG_LOCKMETER) && defined(CONFIG_HAVE_DEC_LOCK) +extern void _metered_spin_lock (spinlock_t *lock); +extern void _metered_spin_unlock(spinlock_t *lock); + +/* + * Matches what is in arch/i386/lib/dec_and_lock.c, except this one is + * "static inline" so that the spin_lock(), if actually invoked, is charged + * against the real caller, not against the catch-all atomic_dec_and_lock + */ +static inline int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +{ + int counter; + int newcount; + +repeat: + counter = atomic_read(atomic); + newcount = counter-1; + + if (!newcount) + goto slow_path; + + asm volatile("lock; cmpxchgl %1,%2" + :"=a" (newcount) + :"r" (newcount), "m" (atomic->counter), "0" (counter)); + + /* If the above failed, "eax" will have changed */ + if (newcount != counter) + goto repeat; + return 0; + +slow_path: + _metered_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + _metered_spin_unlock(lock); + return 0; +} + +#define ATOMIC_DEC_AND_LOCK +#endif #endif /* __ASM_SPINLOCK_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/srat.h 900-mjb2/include/asm-i386/srat.h --- 000-virgin/include/asm-i386/srat.h Mon Mar 17 21:43:48 2003 +++ 900-mjb2/include/asm-i386/srat.h Wed Jun 11 22:51:54 2003 @@ -28,8 +28,9 @@ #define _ASM_SRAT_H_ #define MAX_NUMNODES 8 +#ifndef __ASSEMBLY__ extern void get_memcfg_from_srat(void); extern unsigned long *get_zholes_size(int); #define get_memcfg_numa() get_memcfg_from_srat() - +#endif #endif /* _ASM_SRAT_H_ */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/thread_info.h 900-mjb2/include/asm-i386/thread_info.h --- 000-virgin/include/asm-i386/thread_info.h Mon Mar 17 21:43:48 2003 +++ 900-mjb2/include/asm-i386/thread_info.h Wed Jun 11 22:46:32 2003 @@ -9,6 +9,7 @@ #ifdef __KERNEL__ +#include #ifndef __ASSEMBLY__ #include #endif @@ -30,9 +31,11 @@ struct thread_info { __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: + 0 for interrupts: illegal 0-0xBFFFFFFF for user-thead 0-0xFFFFFFFF for kernel-thread */ + struct thread_info *irq_stack; /* pointer to cpu irq stack */ struct restart_block restart_block; __u8 supervisor_stack[0]; @@ -48,7 +51,8 @@ struct thread_info { #define TI_CPU 0x00000010 #define TI_PRE_COUNT 0x00000014 #define TI_ADDR_LIMIT 
0x00000018 -#define TI_RESTART_BLOCK 0x000001C +#define TI_IRQ_STACK 0x0000001C +#define TI_RESTART_BLOCK 0x0000026 #endif @@ -59,48 +63,66 @@ struct thread_info { * * preempt_count needs to be 1 initially, until the scheduler is functional. */ +#ifdef CONFIG_4K_STACK +#define THREAD_ORDER 0 +#define STACK_WARN 0x200 +#define STACK_PANIC 0x100 +#else +#define THREAD_ORDER 1 +#define STACK_WARN ((THREAD_SIZE)>>1) +#define STACK_PANIC 0x100 +#endif +#define INIT_THREAD_SIZE THREAD_SIZE + #ifndef __ASSEMBLY__ -#define INIT_THREAD_INFO(tsk) \ -{ \ - .task = &tsk, \ - .exec_domain = &default_exec_domain, \ - .flags = 0, \ - .cpu = 0, \ - .preempt_count = 1, \ - .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ +#define INIT_THREAD_INFO(tsk) \ +{ \ + .task = &tsk, \ + .exec_domain = &default_exec_domain, \ + .flags = 0, \ + .cpu = 0, \ + .preempt_count = 1, \ + .addr_limit = KERNEL_DS, \ + .irq_stack = &init_irq_union.thread_info, \ + .restart_block = { \ + .fn = do_no_restart_syscall, \ + } \ } #define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) +/* thread information allocation */ +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) +#define alloc_thread_info() kmalloc(THREAD_SIZE, GFP_KERNEL) +#define free_thread_info(ti) kfree(ti) +#define get_thread_info(ti) get_task_struct((ti)->task) +#define put_thread_info(ti) put_task_struct((ti)->task) + /* how to get the thread information struct from C */ static inline struct thread_info *current_thread_info(void) { struct thread_info *ti; - __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~8191UL)); + __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~(THREAD_SIZE - 1))); return ti; } -/* thread information allocation */ -#define THREAD_SIZE (2*PAGE_SIZE) -#define alloc_thread_info() ((struct thread_info *) __get_free_pages(GFP_KERNEL,1)) -#define free_thread_info(ti) free_pages((unsigned long) (ti), 1) -#define get_thread_info(ti) get_task_struct((ti)->task) -#define put_thread_info(ti) put_task_struct((ti)->task) - #else /* !__ASSEMBLY__ */ +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) + /* how to get the thread information struct from ASM */ #define GET_THREAD_INFO(reg) \ - movl $-8192, reg; \ + movl $-THREAD_SIZE, reg; \ andl %esp, reg -#endif +/* use this one if reg already contains %esp */ +#define GET_THREAD_INFO_WITH_ESP(reg) \ + andl $-THREAD_SIZE, reg +#endif + /* * thread information flags * - these are process state flags that various assembly files may need to access diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-ia64/lockmeter.h 900-mjb2/include/asm-ia64/lockmeter.h --- 000-virgin/include/asm-ia64/lockmeter.h Wed Dec 31 16:00:00 1969 +++ 900-mjb2/include/asm-ia64/lockmeter.h Wed Jun 11 22:47:01 2003 @@ -0,0 +1,72 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + */ + +#ifndef _IA64_LOCKMETER_H +#define _IA64_LOCKMETER_H + +#ifdef local_cpu_data +#define CPU_CYCLE_FREQUENCY local_cpu_data->itc_freq +#else +#define CPU_CYCLE_FREQUENCY my_cpu_data.itc_freq +#endif +#define get_cycles64() get_cycles() + +#define THIS_CPU_NUMBER smp_processor_id() + +/* + * macros to cache and retrieve an index value inside of a lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. + * we also assume that the hash table has less than 32767 entries. 
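current_thread_info() above finds the thread_info at the bottom of the current kernel stack by masking the stack pointer with ~(THREAD_SIZE - 1), which works only because stacks are allocated at THREAD_SIZE-aligned addresses (one page with CONFIG_4K_STACK, two otherwise). A user-space illustration of the masking trick, with an aligned allocation standing in for the kernel stack:

/*
 * Because each stack sits at a THREAD_SIZE-aligned address, any address
 * inside it can be rounded down to the base with a single AND.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define THREAD_SIZE 8192                /* THREAD_ORDER 1 with 4K pages */

int main(void)
{
        char *stack = aligned_alloc(THREAD_SIZE, THREAD_SIZE);
        char *sp;
        uintptr_t base;

        if (!stack)
                return 1;
        sp = stack + THREAD_SIZE - 64;  /* some address inside the "stack" */
        base = (uintptr_t)sp & ~((uintptr_t)THREAD_SIZE - 1);

        printf("stack base  = %p\n", (void *)stack);
        printf("sp          = %p\n", (void *)sp);
        printf("masked base = %p\n", (void *)base);    /* equals stack base */
        free(stack);
        return 0;
}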
+ */ +/* + * instrumented spinlock structure -- never used to allocate storage + * only used in macros below to overlay a spinlock_t + */ +typedef struct inst_spinlock_s { + /* remember, Intel is little endian */ + volatile unsigned short lock; + volatile unsigned short index; +} inst_spinlock_t; +#define PUT_INDEX(lock_ptr,indexv) ((inst_spinlock_t *)(lock_ptr))->index = indexv +#define GET_INDEX(lock_ptr) ((inst_spinlock_t *)(lock_ptr))->index + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int read_counter:31; + volatile int write_lock:1; + volatile unsigned short index; + volatile unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return the number of readers for a rwlock_t + */ +#define RWLOCK_READERS(rwlock_ptr) ((rwlock_ptr)->read_counter) + +/* + * return true if rwlock is write locked + * (note that other lock attempts can cause the lock value to be negative) + */ +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->write_lock) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((rwlock_ptr)->read_counter) + +#endif /* _IA64_LOCKMETER_H */ + diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-ia64/spinlock.h 900-mjb2/include/asm-ia64/spinlock.h --- 000-virgin/include/asm-ia64/spinlock.h Fri May 30 19:02:21 2003 +++ 900-mjb2/include/asm-ia64/spinlock.h Wed Jun 11 22:47:01 2003 @@ -182,4 +182,25 @@ do { \ clear_bit(31, (x)); \ }) +#ifdef CONFIG_LOCKMETER +extern void _metered_spin_lock (spinlock_t *lock); +extern void _metered_spin_unlock(spinlock_t *lock); + +/* + * Use a less efficient, and inline, atomic_dec_and_lock() if lockmetering + * so we can see the callerPC of who is actually doing the spin_lock(). + * Otherwise, all we see is the generic rollup of all locks done by + * atomic_dec_and_lock(). + */ +static inline int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +{ + _metered_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + _metered_spin_unlock(lock); + return 0; +} +#define ATOMIC_DEC_AND_LOCK +#endif + #endif /* _ASM_IA64_SPINLOCK_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-mips/lockmeter.h 900-mjb2/include/asm-mips/lockmeter.h --- 000-virgin/include/asm-mips/lockmeter.h Wed Dec 31 16:00:00 1969 +++ 900-mjb2/include/asm-mips/lockmeter.h Wed Jun 11 22:47:01 2003 @@ -0,0 +1,126 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + * Ported to mips32 for Asita Technologies + * by D.J. 
Barrow ( dj.barrow@asitatechnologies.com ) + */ +#ifndef _ASM_LOCKMETER_H +#define _ASM_LOCKMETER_H + +/* do_gettimeoffset is a function pointer on mips */ +/* & it is not included by */ +#include +#include +#include + +#define SPINLOCK_MAGIC_INIT /* */ + +#define CPU_CYCLE_FREQUENCY get_cpu_cycle_frequency() + +#define THIS_CPU_NUMBER smp_processor_id() + +static uint32_t cpu_cycle_frequency = 0; + +static uint32_t get_cpu_cycle_frequency(void) +{ + /* a total hack, slow and invasive, but ... it works */ + int sec; + uint32_t start_cycles; + struct timeval tv; + + if (cpu_cycle_frequency == 0) { /* uninitialized */ + do_gettimeofday(&tv); + sec = tv.tv_sec; /* set up to catch the tv_sec rollover */ + while (sec == tv.tv_sec) { do_gettimeofday(&tv); } + sec = tv.tv_sec; /* rolled over to a new sec value */ + start_cycles = get_cycles(); + while (sec == tv.tv_sec) { do_gettimeofday(&tv); } + cpu_cycle_frequency = get_cycles() - start_cycles; + } + + return cpu_cycle_frequency; +} + +extern struct timeval xtime; + +static uint64_t get_cycles64(void) +{ + static uint64_t last_get_cycles64 = 0; + uint64_t ret; + unsigned long sec; + unsigned long usec, usec_offset; + +again: + sec = xtime.tv_sec; + usec = xtime.tv_usec; + usec_offset = do_gettimeoffset(); + if ((xtime.tv_sec != sec) || + (xtime.tv_usec != usec)|| + (usec_offset >= 20000)) + goto again; + + ret = ((uint64_t)(usec + usec_offset) * cpu_cycle_frequency); + /* We can't do a normal 64 bit division on mips without libgcc.a */ + do_div(ret,1000000); + ret += ((uint64_t)sec * cpu_cycle_frequency); + + /* XXX why does time go backwards? do_gettimeoffset? general time adj? */ + if (ret <= last_get_cycles64) + ret = last_get_cycles64+1; + last_get_cycles64 = ret; + + return ret; +} + +/* + * macros to cache and retrieve an index value inside of a lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. + * we also assume that the hash table has less than 32767 entries. + * the high order bit is used for write locking a rw_lock + */ +#define INDEX_MASK 0x7FFF0000 +#define READERS_MASK 0x0000FFFF +#define INDEX_SHIFT 16 +#define PUT_INDEX(lockp,index) \ + lockp->lock = (((lockp->lock) & ~INDEX_MASK) | (index) << INDEX_SHIFT) +#define GET_INDEX(lockp) \ + (((lockp->lock) & INDEX_MASK) >> INDEX_SHIFT) + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int lock; + unsigned short index; + unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return the number of readers for a rwlock_t + */ +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) + +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + int tmp = (int) rwlock_ptr->lock; + return (tmp >= 0) ? 
tmp : 0; +} + +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock < 0) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock > 0) + +#endif /* _ASM_LOCKMETER_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-mips/spinlock.h 900-mjb2/include/asm-mips/spinlock.h --- 000-virgin/include/asm-mips/spinlock.h Sun Nov 17 20:29:56 2002 +++ 900-mjb2/include/asm-mips/spinlock.h Wed Jun 11 22:47:01 2003 @@ -74,9 +74,18 @@ static inline void spin_unlock(spinlock_ typedef struct { volatile unsigned int lock; +#if CONFIG_LOCKMETER + /* required for LOCKMETER since all bits in lock are used */ + /* and we need this storage for CPU and lock INDEX */ + unsigned lockmeter_magic; +#endif } rwlock_t; +#ifdef CONFIG_LOCKMETER +#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0 } +#else #define RW_LOCK_UNLOCKED (rwlock_t) { 0 } +#endif static inline void read_lock(rwlock_t *rw) { diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-mips64/lockmeter.h 900-mjb2/include/asm-mips64/lockmeter.h --- 000-virgin/include/asm-mips64/lockmeter.h Wed Dec 31 16:00:00 1969 +++ 900-mjb2/include/asm-mips64/lockmeter.h Wed Jun 11 22:47:01 2003 @@ -0,0 +1,120 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + */ + +#ifndef _ASM_LOCKMETER_H +#define _ASM_LOCKMETER_H + +#include + +#define SPINLOCK_MAGIC_INIT /* */ + +#define CPU_CYCLE_FREQUENCY get_cpu_cycle_frequency() + +#define THIS_CPU_NUMBER smp_processor_id() + +static uint32_t cpu_cycle_frequency = 0; + +static uint32_t get_cpu_cycle_frequency(void) +{ + /* a total hack, slow and invasive, but ... it works */ + int sec; + uint32_t start_cycles; + struct timeval tv; + + if (cpu_cycle_frequency == 0) { /* uninitialized */ + do_gettimeofday(&tv); + sec = tv.tv_sec; /* set up to catch the tv_sec rollover */ + while (sec == tv.tv_sec) { do_gettimeofday(&tv); } + sec = tv.tv_sec; /* rolled over to a new sec value */ + start_cycles = get_cycles(); + while (sec == tv.tv_sec) { do_gettimeofday(&tv); } + cpu_cycle_frequency = get_cycles() - start_cycles; + } + + return cpu_cycle_frequency; +} + +extern struct timeval xtime; +extern long do_gettimeoffset(void); + +static uint64_t get_cycles64(void) +{ + static uint64_t last_get_cycles64 = 0; + uint64_t ret; + unsigned long sec; + unsigned long usec, usec_offset; + +again: + sec = xtime.tv_sec; + usec = xtime.tv_usec; + usec_offset = do_gettimeoffset(); + if ((xtime.tv_sec != sec) || + (xtime.tv_usec != usec)|| + (usec_offset >= 20000)) + goto again; + + ret = ((uint64_t)sec * cpu_cycle_frequency) + + ( ((uint64_t)(usec + usec_offset) * cpu_cycle_frequency) / 1000000 ); + + /* XXX why does time go backwards? do_gettimeoffset? general time adj? */ + if (ret <= last_get_cycles64) + ret = last_get_cycles64+1; + last_get_cycles64 = ret; + + return ret; +} + +/* + * macros to cache and retrieve an index value inside of a lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. + * we also assume that the hash table has less than 32767 entries. 
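get_cpu_cycle_frequency() above calibrates the cycle counter by spinning until tv_sec rolls over, reading the counter, spinning through one full second and taking the difference. The same idea in user space, with the x86 TSC standing in for the MIPS cycle source (illustration only):

/*
 * Calibrate a cycle counter against the wall clock: wait for a second
 * boundary, then count cycles across one full second.
 */
#include <stdio.h>
#include <stdint.h>
#include <sys/time.h>

static inline uint64_t cycles(void)
{
        uint32_t lo, hi;

        __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));
        return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
        struct timeval tv;
        uint64_t start;
        long sec;

        gettimeofday(&tv, NULL);
        sec = tv.tv_sec;
        while (sec == tv.tv_sec)                /* wait for a second boundary */
                gettimeofday(&tv, NULL);

        sec = tv.tv_sec;
        start = cycles();
        while (sec == tv.tv_sec)                /* count across one full second */
                gettimeofday(&tv, NULL);

        printf("~%llu cycles/second\n",
               (unsigned long long)(cycles() - start));
        return 0;
}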
+ * the high order bit is used for write locking a rw_lock + */ +#define INDEX_MASK 0x7FFF0000 +#define READERS_MASK 0x0000FFFF +#define INDEX_SHIFT 16 +#define PUT_INDEX(lockp,index) \ + lockp->lock = (((lockp->lock) & ~INDEX_MASK) | (index) << INDEX_SHIFT) +#define GET_INDEX(lockp) \ + (((lockp->lock) & INDEX_MASK) >> INDEX_SHIFT) + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int lock; + unsigned short index; + unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return the number of readers for a rwlock_t + */ +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) + +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + int tmp = (int) rwlock_ptr->lock; + return (tmp >= 0) ? tmp : 0; +} + +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock < 0) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock > 0) + +#endif /* _ASM_LOCKMETER_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-mips64/pgtable.h 900-mjb2/include/asm-mips64/pgtable.h --- 000-virgin/include/asm-mips64/pgtable.h Sun Apr 20 19:35:05 2003 +++ 900-mjb2/include/asm-mips64/pgtable.h Wed Jun 11 22:47:01 2003 @@ -126,7 +126,7 @@ extern void (*_flush_cache_l1)(void); #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) #define FIRST_USER_PGD_NR 0 -#define KPTBL_PAGE_ORDER 1 +#define KPTBL_PAGE_ORDER 2 #define VMALLOC_START XKSEG #define VMALLOC_VMADDR(x) ((unsigned long)(x)) #define VMALLOC_END \ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-ppc64/semaphore.h 900-mjb2/include/asm-ppc64/semaphore.h --- 000-virgin/include/asm-ppc64/semaphore.h Tue Feb 25 23:03:50 2003 +++ 900-mjb2/include/asm-ppc64/semaphore.h Wed Jun 11 22:55:25 2003 @@ -68,12 +68,14 @@ static inline void init_MUTEX_LOCKED (st sema_init(sem, 0); } -extern void __down(struct semaphore * sem); +extern void __down_wq(struct semaphore * sem, wait_queue_t *wait); extern int __down_interruptible(struct semaphore * sem); extern void __up(struct semaphore * sem); -static inline void down(struct semaphore * sem) +static inline int down_wq(struct semaphore * sem, wait_queue_t *wait) { + int ret = 0; + #ifdef WAITQUEUE_DEBUG CHECK_MAGIC(sem->__magic); #endif @@ -83,8 +85,14 @@ static inline void down(struct semaphore * Try to get the semaphore, take the slow path if we fail. 
*/ if (atomic_dec_return(&sem->count) < 0) - __down(sem); + ret =__down_wq(sem, wait); smp_wmb(); + return ret; +} + +static inline void down(struct semaphore * sem) +{ + down_wq(sem, NULL); } static inline int down_interruptible(struct semaphore * sem) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-sparc64/lockmeter.h 900-mjb2/include/asm-sparc64/lockmeter.h --- 000-virgin/include/asm-sparc64/lockmeter.h Wed Dec 31 16:00:00 1969 +++ 900-mjb2/include/asm-sparc64/lockmeter.h Wed Jun 11 22:47:01 2003 @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2000 Anton Blanchard (anton@linuxcare.com) + */ + +#ifndef _SPARC64_LOCKMETER_H +#define _SPARC64_LOCKMETER_H + +#include + +#include + +extern unsigned long cpu_hz; +#define CPU_CYCLE_FREQUENCY cpu_hz + +#define THIS_CPU_NUMBER __cpu_number_map[smp_processor_id()] + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0) +#define local_irq_save(x) __save_and_cli(x) +#define local_irq_restore(x) __restore_flags(x) +#endif /* Linux version 2.2.x */ + +#define PUT_INDEX(lock_ptr,indexv) (lock_ptr)->index = (indexv) +#define GET_INDEX(lock_ptr) (lock_ptr)->index + +#define PUT_RWINDEX(rwlock_ptr,indexv) (rwlock_ptr)->index = (indexv) +#define GET_RWINDEX(rwlock_ptr) (rwlock_ptr)->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) (rwlock_ptr)->cpu = (cpuv) +#define GET_RW_CPU(rwlock_ptr) (rwlock_ptr)->cpu + +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) + +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + signed int tmp = rwlock_ptr->lock; + + if (tmp > 0) + return tmp; + else + return 0; +} + +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((signed int)((rwlock_ptr)->lock) < 0) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((signed int)((rwlock_ptr)->lock) > 0) + +#define get_cycles64() get_cycles() + +#endif /* _SPARC64_LOCKMETER_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-sparc64/spinlock.h 900-mjb2/include/asm-sparc64/spinlock.h --- 000-virgin/include/asm-sparc64/spinlock.h Sun Nov 17 20:29:27 2002 +++ 900-mjb2/include/asm-sparc64/spinlock.h Wed Jun 11 22:47:01 2003 @@ -30,15 +30,23 @@ #ifndef CONFIG_DEBUG_SPINLOCK -typedef unsigned char spinlock_t; -#define SPIN_LOCK_UNLOCKED 0 +typedef struct { + unsigned char lock; + unsigned int index; +} spinlock_t; -#define spin_lock_init(lock) (*((unsigned char *)(lock)) = 0) -#define spin_is_locked(lock) (*((volatile unsigned char *)(lock)) != 0) +#ifdef CONFIG_LOCKMETER +#define SPIN_LOCK_UNLOCKED (spinlock_t) {0, 0} +#else +#define SPIN_LOCK_UNLOCKED (spinlock_t) { 0 } +#endif + +#define spin_lock_init(__lock) do { *(__lock) = SPIN_LOCK_UNLOCKED; } while(0) +#define spin_is_locked(__lock) (*((volatile unsigned char *)(&((__lock)->lock))) != 0) -#define spin_unlock_wait(lock) \ +#define spin_unlock_wait(__lock) \ do { membar("#LoadLoad"); \ -} while(*((volatile unsigned char *)lock)) +} while(*((volatile unsigned char *)(&(((spinlock_t *)__lock)->lock)))) static __inline__ void _raw_spin_lock(spinlock_t *lock) { @@ -109,8 +117,20 @@ extern int _spin_trylock (spinlock_t *lo #ifndef CONFIG_DEBUG_SPINLOCK -typedef unsigned int rwlock_t; -#define RW_LOCK_UNLOCKED 0 +#ifdef CONFIG_LOCKMETER +typedef struct { + unsigned int lock; + unsigned int index; + unsigned int cpu; +} rwlock_t; +#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0, 0xff } +#else +typedef struct { + unsigned int lock; +} rwlock_t; +#define RW_LOCK_UNLOCKED (rwlock_t) { 0 } +#endif + #define rwlock_init(lp) do { *(lp) = RW_LOCK_UNLOCKED; } while(0) #define rwlock_is_locked(x) (*(x) != RW_LOCK_UNLOCKED) diff -urpN 
-X /home/fletch/.diff.exclude 000-virgin/include/asm-x86_64/early_printk.h 900-mjb2/include/asm-x86_64/early_printk.h --- 000-virgin/include/asm-x86_64/early_printk.h Wed Dec 31 16:00:00 1969 +++ 900-mjb2/include/asm-x86_64/early_printk.h Wed Jun 11 22:42:36 2003 @@ -0,0 +1,8 @@ +#ifdef __EARLY_PRINTK_H_X86_64_ +#define __EARLY_PRINTK_H_X86_64_ + +#define VGABASE 0xffffffff800b8000UL +#define SERIAL_BASES { 0x3f8, 0x2f8 } +#define SERIAL_BASES_LEN 2 + +#endif diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-x86_64/semaphore.h 900-mjb2/include/asm-x86_64/semaphore.h --- 000-virgin/include/asm-x86_64/semaphore.h Wed Mar 5 07:37:07 2003 +++ 900-mjb2/include/asm-x86_64/semaphore.h Wed Jun 11 22:55:26 2003 @@ -98,39 +98,48 @@ static inline void init_MUTEX_LOCKED (st sema_init(sem, 0); } -asmlinkage void __down_failed(void /* special register calling convention */); +asmlinkage int __down_failed_wq(void /* special register calling convention */); asmlinkage int __down_failed_interruptible(void /* params in registers */); asmlinkage int __down_failed_trylock(void /* params in registers */); asmlinkage void __up_wakeup(void /* special register calling convention */); -asmlinkage void __down(struct semaphore * sem); +asmlinkage int __down_wq(struct semaphore * sem, wait_queue_t *wait); asmlinkage int __down_interruptible(struct semaphore * sem); asmlinkage int __down_trylock(struct semaphore * sem); asmlinkage void __up(struct semaphore * sem); /* * This is ugly, but we want the default case to fall through. - * "__down_failed" is a special asm handler that calls the C + * "__down_failed_wq" is a special asm handler that calls the C * routine that actually waits. See arch/x86_64/kernel/semaphore.c */ -static inline void down(struct semaphore * sem) +static inline int down_wq(struct semaphore * sem, wait_queue_t *wait) { + int result; + #if WAITQUEUE_DEBUG CHECK_MAGIC(sem->__magic); #endif __asm__ __volatile__( "# atomic down operation\n\t" - LOCK "decl %0\n\t" /* --sem->count */ - "js 2f\n" + LOCK "decl %1\n\t" /* --sem->count */ + "js 2f\n\t" + "xorl %0,%0\n" "1:\n" LOCK_SECTION_START("") - "2:\tcall __down_failed\n\t" + "2:\tcall __down_failed_wq\n\t" "jmp 1b\n" LOCK_SECTION_END - :"=m" (sem->count) - :"D" (sem) + :"=a" (result), "=m" (sem->count) + :"D" (sem), "S" (wait) :"memory"); + return result; +} + +static inline void down(struct semaphore * sem) +{ + down_wq(sem, NULL); } /* diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/aio.h 900-mjb2/include/linux/aio.h --- 000-virgin/include/linux/aio.h Fri May 30 19:02:22 2003 +++ 900-mjb2/include/linux/aio.h Wed Jun 11 22:55:21 2003 @@ -54,7 +54,7 @@ struct kiocb { struct file *ki_filp; struct kioctx *ki_ctx; /* may be NULL for sync ops */ int (*ki_cancel)(struct kiocb *, struct io_event *); - long (*ki_retry)(struct kiocb *); + ssize_t (*ki_retry)(struct kiocb *); struct list_head ki_list; /* the aio core uses this * for cancellation */ @@ -62,6 +62,16 @@ struct kiocb { void *ki_user_obj; /* pointer to userland's iocb */ __u64 ki_user_data; /* user's data for completion */ loff_t ki_pos; + + /* State that we remember to be able to restart/retry */ + unsigned short ki_opcode; + size_t ki_nbytes; /* copy of iocb->aio_nbytes */ + char *ki_buf; /* remaining iocb->aio_buf */ + size_t ki_left; /* remaining bytes */ + wait_queue_t ki_wait; + long ki_retried; /* just for testing */ + long ki_kicked; /* just for testing */ + long ki_queued; /* just for testing */ char private[KIOCB_PRIVATE_SIZE]; }; @@ -77,6 +87,8 @@ struct 
kiocb { (x)->ki_ctx = &tsk->active_mm->default_kioctx; \ (x)->ki_cancel = NULL; \ (x)->ki_user_obj = tsk; \ + (x)->ki_user_data = 0; \ + init_wait((&(x)->ki_wait)); \ } while (0) #define AIO_RING_MAGIC 0xa10a10a1 @@ -158,6 +170,24 @@ int FASTCALL(io_submit_one(struct kioctx #define get_ioctx(kioctx) do { if (unlikely(atomic_read(&(kioctx)->users) <= 0)) BUG(); atomic_inc(&(kioctx)->users); } while (0) #define put_ioctx(kioctx) do { if (unlikely(atomic_dec_and_test(&(kioctx)->users))) __put_ioctx(kioctx); else if (unlikely(atomic_read(&(kioctx)->users) < 0)) BUG(); } while (0) + +#define in_aio() !is_sync_wait(current->io_wait) + +/* when sync behaviour is desired even if running in async context */ +#define do_sync_op(op) if (in_aio()) { \ + wait_queue_t *wait = current->io_wait; \ + current->io_wait = NULL; \ + op; \ + current->io_wait = wait; \ +} else { \ + op; \ +} + +#define warn_if_async() if (in_aio()) {\ + printk(KERN_ERR "%s(%s:%d) called in async context!\n", \ + __FUNCTION__, __FILE__, __LINE__); \ + dump_stack(); \ + } #include diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/blkdev.h 900-mjb2/include/linux/blkdev.h --- 000-virgin/include/linux/blkdev.h Fri May 30 19:02:22 2003 +++ 900-mjb2/include/linux/blkdev.h Wed Jun 11 22:55:22 2003 @@ -454,6 +454,7 @@ extern int blk_queue_init_tags(request_q extern void blk_queue_free_tags(request_queue_t *); extern void blk_queue_invalidate_tags(request_queue_t *); extern void blk_congestion_wait(int rw, long timeout); +extern int blk_congestion_wait_wq(int rw, long timeout, wait_queue_t *wait); extern void blk_rq_bio_prep(request_queue_t *, struct request *, struct bio *); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/buffer_head.h 900-mjb2/include/linux/buffer_head.h --- 000-virgin/include/linux/buffer_head.h Sat May 10 18:35:03 2003 +++ 900-mjb2/include/linux/buffer_head.h Wed Jun 11 22:55:24 2003 @@ -158,6 +158,7 @@ void mark_buffer_async_write(struct buff void invalidate_bdev(struct block_device *, int); int sync_blockdev(struct block_device *bdev); void __wait_on_buffer(struct buffer_head *); +int __wait_on_buffer_wq(struct buffer_head *, wait_queue_t *wait); wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); void wake_up_buffer(struct buffer_head *bh); int fsync_bdev(struct block_device *); @@ -168,6 +169,8 @@ struct buffer_head * __getblk(struct blo void __brelse(struct buffer_head *); void __bforget(struct buffer_head *); struct buffer_head *__bread(struct block_device *, sector_t block, int size); +struct buffer_head *__bread_wq(struct block_device *, sector_t block, + int size, wait_queue_t *wait); struct buffer_head *alloc_buffer_head(int gfp_flags); void free_buffer_head(struct buffer_head * bh); void FASTCALL(unlock_buffer(struct buffer_head *bh)); @@ -225,13 +228,13 @@ static inline void put_bh(struct buffer_ static inline void brelse(struct buffer_head *bh) { - if (bh) + if (bh && !IS_ERR(bh)) __brelse(bh); } static inline void bforget(struct buffer_head *bh) { - if (bh) + if (bh && !IS_ERR(bh)) __bforget(bh); } @@ -242,7 +245,12 @@ sb_bread(struct super_block *sb, sector_ } static inline struct buffer_head * -sb_getblk(struct super_block *sb, sector_t block) +sb_bread_wq(struct super_block *sb, sector_t block, wait_queue_t *wait) +{ + return __bread_wq(sb->s_bdev, block, sb->s_blocksize, wait); +} + +static inline struct buffer_head *sb_getblk(struct super_block *sb, sector_t block) { return __getblk(sb->s_bdev, block, sb->s_blocksize); } @@ -272,10 +280,26 @@ static inline 
void wait_on_buffer(struct __wait_on_buffer(bh); } +static inline int wait_on_buffer_wq(struct buffer_head *bh, wait_queue_t *wait) +{ + if (buffer_locked(bh)) + return __wait_on_buffer_wq(bh, wait); + + return 0; +} + static inline void lock_buffer(struct buffer_head *bh) { while (test_set_buffer_locked(bh)) __wait_on_buffer(bh); +} + +static inline int lock_buffer_wq(struct buffer_head *bh, wait_queue_t *wait) +{ + if (test_set_buffer_locked(bh)) + return __wait_on_buffer_wq(bh, wait); + + return 0; } #endif /* _LINUX_BUFFER_HEAD_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/compiler.h 900-mjb2/include/linux/compiler.h --- 000-virgin/include/linux/compiler.h Sun Apr 20 19:35:07 2003 +++ 900-mjb2/include/linux/compiler.h Wed Jun 11 22:47:07 2003 @@ -60,6 +60,6 @@ shouldn't recognize the original var, and make assumptions about it */ #define RELOC_HIDE(ptr, off) \ ({ unsigned long __ptr; \ - __asm__ ("" : "=g"(__ptr) : "0"(ptr)); \ + __asm__ ("" : "=r"(__ptr) : "0"(ptr)); \ (typeof(ptr)) (__ptr + (off)); }) #endif /* __LINUX_COMPILER_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/early_printk.h 900-mjb2/include/linux/early_printk.h --- 000-virgin/include/linux/early_printk.h Wed Dec 31 16:00:00 1969 +++ 900-mjb2/include/linux/early_printk.h Wed Jun 11 22:42:36 2003 @@ -0,0 +1,47 @@ +#ifndef __EARLY_PRINTK_H_ +#define __EARLY_PRINTK_H_ + +#ifdef CONFIG_EARLY_PRINTK +#include +#include +#include +#include +#include +#include + +/* Simple VGA output */ + +#define MAX_YPOS 25 +#define MAX_XPOS 80 + +/* Simple serial port output */ + +#define DEFAULT_BAUD 57600 +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ +#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define DLL 0 /* Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + + +void early_printk(const char *fmt, ...); +int __init setup_early_printk(); + +#else + +#define early_printk(...) do {} while(0) +#define setup_early_printk() do {} while(0) + +#endif + +#endif diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/errno.h 900-mjb2/include/linux/errno.h --- 000-virgin/include/linux/errno.h Fri Dec 13 23:18:13 2002 +++ 900-mjb2/include/linux/errno.h Wed Jun 11 22:55:21 2003 @@ -22,6 +22,7 @@ #define EBADTYPE 527 /* Type not supported by server */ #define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */ #define EIOCBQUEUED 529 /* iocb queued, will get completion event */ +#define EIOCBRETRY 530 /* iocb queued, will trigger a retry */ #endif diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/gdb.h 900-mjb2/include/linux/gdb.h --- 000-virgin/include/linux/gdb.h Wed Dec 31 16:00:00 1969 +++ 900-mjb2/include/linux/gdb.h Wed Jun 11 22:42:59 2003 @@ -0,0 +1,67 @@ +#ifndef _GDB_H_ +#define _GDB_H_ + +/* + * Copyright (C) 2001 Amit S. 
Kale + */ + +/* gdb locks */ +#define KGDB_MAX_NO_CPUS NR_CPUS + +extern int gdb_enter; /* 1 = enter debugger on boot */ +extern int gdb_ttyS; +extern int gdb_baud; +extern int gdb_initialized; + +extern int gdb_hook(void); +extern void breakpoint(void); + +typedef int gdb_debug_hook(int trapno, + int signo, + int err_code, + struct pt_regs *regs); +extern gdb_debug_hook *linux_debug_hook; + +#ifdef CONFIG_SMP +extern spinlock_t kgdb_spinlock; +extern spinlock_t kgdb_nmispinlock; +#else +extern unsigned kgdb_spinlock; +extern unsigned kgdb_nmispinlock; +#endif + +extern volatile int kgdb_memerr_expected; + +struct console; +void gdb_console_write(struct console *co, const char *s, + unsigned count); +void gdb_console_init(void); + +extern volatile int procindebug[KGDB_MAX_NO_CPUS]; + +#define KGDB_ASSERT(message, condition) do { \ + if (!(condition)) { \ + printk("kgdb assertion failed: %s\n", message); \ + asm ("int $0x3"); \ + } \ +} while (0) + +#ifdef CONFIG_KERNEL_ASSERTS +#define KERNEL_ASSERT(message, condition) KGDB_ASSERT(message, condition) +#else +#define KERNEL_ASSERT(message, condition) +#endif + +#define KA_VALID_ERRNO(errno) ((errno) > 0 && (errno) <= EMEDIUMTYPE) + +#define KA_VALID_PTR_ERR(ptr) KA_VALID_ERRNO(-PTR_ERR(ptr)) + +#define KA_VALID_KPTR(ptr) (!(ptr) || \ + ((void *)(ptr) >= (void *)PAGE_OFFSET && \ + (void *)(ptr) < ERR_PTR(-EMEDIUMTYPE))) + +#define KA_VALID_PTRORERR(errptr) (KA_VALID_KPTR(errptr) || KA_VALID_PTR_ERR(errptr)) + +#define KA_HELD_GKL() (current->lock_depth >= 0) + +#endif /* _GDB_H_ */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/init_task.h 900-mjb2/include/linux/init_task.h --- 000-virgin/include/linux/init_task.h Fri May 30 19:02:22 2003 +++ 900-mjb2/include/linux/init_task.h Wed Jun 11 22:56:53 2003 @@ -101,9 +101,9 @@ .blocked = {{0}}, \ .posix_timers = LIST_HEAD_INIT(tsk.posix_timers), \ .alloc_lock = SPIN_LOCK_UNLOCKED, \ - .proc_lock = SPIN_LOCK_UNLOCKED, \ .switch_lock = SPIN_LOCK_UNLOCKED, \ .journal_info = NULL, \ + .io_wait = NULL, \ } diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/kdev_t.h 900-mjb2/include/linux/kdev_t.h --- 000-virgin/include/linux/kdev_t.h Sat May 10 18:35:03 2003 +++ 900-mjb2/include/linux/kdev_t.h Wed Jun 11 22:48:35 2003 @@ -70,13 +70,13 @@ aeb - 950811 * static arrays, and they are sized for a 8-bit index. */ typedef struct { - unsigned short value; + unsigned int value; } kdev_t; -#define KDEV_MINOR_BITS 8 -#define KDEV_MAJOR_BITS 8 +#define KDEV_MINOR_BITS 16 +#define KDEV_MAJOR_BITS 16 -#define __mkdev(major,minor) (((major) << KDEV_MINOR_BITS) + (minor)) +#define __mkdev(major, minor) (((major) << KDEV_MINOR_BITS) + (minor)) #define mk_kdev(major, minor) ((kdev_t) { __mkdev(major,minor) } ) @@ -107,17 +107,55 @@ static inline int kdev_same(kdev_t dev1, #define kdev_none(d1) (!kdev_val(d1)) -/* Mask off the high bits for now.. */ -#define minor(dev) ((dev).value & 0xff) -#define major(dev) (((dev).value >> KDEV_MINOR_BITS) & 0xff) +#define minor(dev) ((dev).value & 0xffff) +#define major(dev) (((dev).value >> KDEV_MINOR_BITS) & 0xffff) /* These are for user-level "dev_t" */ +/* Since glibc uses 8+8 in , we'll get + incompatibilities with a simple scheme like 12+20. + Use 8+8 for 16-bit values, some other division, say 16+16, + for 32-bit values. 
*/ #define MINORBITS 8 #define MINORMASK ((1U << MINORBITS) - 1) -#define MAJOR(dev) ((unsigned int) ((dev) >> MINORBITS)) -#define MINOR(dev) ((unsigned int) ((dev) & MINORMASK)) -#define MKDEV(ma,mi) (((ma) << MINORBITS) | (mi)) +#include /* dev_t */ +#if 1 +/* macro versions */ + +#define MAJOR(dev) ((unsigned int)(((dev) & 0xffff0000) ? ((dev) >> 16) & 0xffff : ((dev) >> 8) & 0xff)) +#define MINOR(dev) ((unsigned int)(((dev) & 0xffff0000) ? ((dev) & 0xffff) : ((dev) & 0xff))) +#define MKDEV(ma,mi) ((dev_t)((((ma) & ~0xff) == 0 && ((mi) & ~0xff) == 0) ? (((ma) << 8) | (mi)) : (((ma) << 16) | (mi)))) + +#else +/* inline function versions */ + +static inline unsigned int +MAJOR(dev_t dev) { + unsigned int ma; + + ma = ((dev >> 16) & 0xffff); + if (ma == 0) + ma = ((dev >> 8) & 0xff); + return ma; +} + +static inline unsigned int +MINOR(dev_t dev) { + unsigned int mi; + + mi = (dev & 0xffff); + if (mi == dev) + mi = (dev & 0xff); + return mi; +} + +static inline dev_t +MKDEV(unsigned int ma, unsigned int mi) { + if ((ma & ~0xff) == 0 && (mi & ~0xff) == 0) + return ((ma << 8) | mi); + return ((ma << 16) | mi); +} +#endif /* * Conversion functions @@ -125,12 +163,16 @@ static inline int kdev_same(kdev_t dev1, static inline int kdev_t_to_nr(kdev_t dev) { - return MKDEV(major(dev), minor(dev)); + unsigned int ma = major(dev); + unsigned int mi = minor(dev); + return MKDEV(ma, mi); } -static inline kdev_t to_kdev_t(int dev) +static inline kdev_t to_kdev_t(dev_t dev) { - return mk_kdev(MAJOR(dev),MINOR(dev)); + unsigned int ma = MAJOR(dev); + unsigned int mi = MINOR(dev); + return mk_kdev(ma, mi); } #else /* __KERNEL__ */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/lockmeter.h 900-mjb2/include/linux/lockmeter.h --- 000-virgin/include/linux/lockmeter.h Wed Dec 31 16:00:00 1969 +++ 900-mjb2/include/linux/lockmeter.h Wed Jun 11 22:47:01 2003 @@ -0,0 +1,320 @@ +/* + * Copyright (C) 1999-2002 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + * + * Modified by Ray Bryant (raybry@us.ibm.com) Feb-Apr 2000 + * Changes Copyright (C) 2000 IBM, Inc. + * Added save of index in spinlock_t to improve efficiency + * of "hold" time reporting for spinlocks + * Added support for hold time statistics for read and write + * locks. + * Moved machine dependent code to include/asm/lockmeter.h. + * + */ + +#ifndef _LINUX_LOCKMETER_H +#define _LINUX_LOCKMETER_H + + +/*--------------------------------------------------- + * architecture-independent lockmeter.h + *-------------------------------------------------*/ + +/* + * raybry -- version 2: added efficient hold time statistics + * requires lstat recompile, so flagged as new version + * raybry -- version 3: added global reader lock data + * hawkes -- version 4: removed some unnecessary fields to simplify mips64 port + */ +#define LSTAT_VERSION 5 + +int lstat_update(void*, void*, int); +int lstat_update_time(void*, void*, int, uint32_t); + +/* + * Currently, the mips64 and sparc64 kernels talk to a 32-bit lockstat, so we + * need to force compatibility in the inter-communication data structure. 
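
As a quick sanity check of the dual encoding in the kdev_t.h hunk above (this stand-alone snippet is not part of the patch, and dev_t is replaced by a plain unsigned value): majors and minors that both fit in 8 bits keep the old glibc-compatible 8+8 layout, while anything wider moves to 16+16.

        #include <assert.h>

        /* same logic as the macro versions in the kdev_t.h hunk above */
        #define MAJOR(dev) ((unsigned int)(((dev) & 0xffff0000) ? ((dev) >> 16) & 0xffff : ((dev) >> 8) & 0xff))
        #define MINOR(dev) ((unsigned int)(((dev) & 0xffff0000) ? ((dev) & 0xffff) : ((dev) & 0xff)))
        #define MKDEV(ma,mi) ((((ma) & ~0xff) == 0 && ((mi) & ~0xff) == 0) ? \
                              (((ma) << 8) | (mi)) : (((ma) << 16) | (mi)))

        int main(void)
        {
                /* small numbers keep the old 8+8 layout */
                assert(MKDEV(8, 1) == 0x0801);
                assert(MAJOR(0x0801) == 8 && MINOR(0x0801) == 1);

                /* a wide major (or minor) spills into the 16+16 layout */
                assert(MKDEV(300, 5) == ((300u << 16) | 5));
                assert(MAJOR((300u << 16) | 5) == 300 && MINOR((300u << 16) | 5) == 5);
                return 0;
        }
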
+ */ + +#if defined(CONFIG_MIPS32_COMPAT) +#define TIME_T uint32_t +#elif defined(CONFIG_SPARC32_COMPAT) +#define TIME_T uint64_t +#else +#define TIME_T time_t +#endif + +#if defined(__KERNEL__) || (!defined(CONFIG_MIPS32_COMPAT) && !defined(CONFIG_SPARC32_COMPAT)) || (_MIPS_SZLONG==32) +#define POINTER void * +#else +#define POINTER int64_t +#endif + +/* + * Values for the "action" parameter passed to lstat_update. + * ZZZ - do we want a try-success status here??? + */ +#define LSTAT_ACT_NO_WAIT 0 +#define LSTAT_ACT_SPIN 1 +#define LSTAT_ACT_REJECT 2 +#define LSTAT_ACT_WW_SPIN 3 +#define LSTAT_ACT_SLEPT 4 /* UNUSED */ + +#define LSTAT_ACT_MAX_VALUES 4 /* NOTE: Increase to 5 if use ACT_SLEPT */ + +/* + * Special values for the low 2 bits of an RA passed to + * lstat_update. + */ +/* we use these values to figure out what kind of lock data */ +/* is stored in the statistics table entry at index ....... */ +#define LSTAT_RA_SPIN 0 /* spin lock data */ +#define LSTAT_RA_READ 1 /* read lock statistics */ +#define LSTAT_RA_SEMA 2 /* RESERVED */ +#define LSTAT_RA_WRITE 3 /* write lock statistics*/ + +#define LSTAT_RA(n) \ + ((void*)( ((unsigned long)__builtin_return_address(0) & ~3) | n) ) + +/* + * Constants used for lock addresses in the lstat_directory + * to indicate special values of the lock address. + */ +#define LSTAT_MULTI_LOCK_ADDRESS NULL + +/* + * Maximum size of the lockstats tables. Increase this value + * if its not big enough. (Nothing bad happens if its not + * big enough although some locks will not be monitored.) + * We record overflows of this quantity in lstat_control.dir_overflows + * + * Note: The max value here must fit into the field set + * and obtained by the macro's PUT_INDEX() and GET_INDEX(). + * This value depends on how many bits are available in the + * lock word in the particular machine implementation we are on. + */ +#define LSTAT_MAX_STAT_INDEX 2000 + +/* + * Size and mask for the hash table into the directory. + */ +#define LSTAT_HASH_TABLE_SIZE 4096 /* must be 2**N */ +#define LSTAT_HASH_TABLE_MASK (LSTAT_HASH_TABLE_SIZE-1) + +#define DIRHASH(ra) ((unsigned long)(ra)>>2 & LSTAT_HASH_TABLE_MASK) + +/* + * This defines an entry in the lockstat directory. It contains + * information about a lock being monitored. + * A directory entry only contains the lock identification - + * counts on usage of the lock are kept elsewhere in a per-cpu + * data structure to minimize cache line pinging. + */ +typedef struct { + POINTER caller_ra; /* RA of code that set lock */ + POINTER lock_ptr; /* lock address */ + ushort next_stat_index; /* Used to link multiple locks that have the same hash table value */ +} lstat_directory_entry_t; + +/* + * A multi-dimensioned array used to contain counts for lock accesses. + * The array is 3-dimensional: + * - CPU number. Keep from thrashing cache lines between CPUs + * - Directory entry index. Identifies the lock + * - Action. Indicates what kind of contention occurred on an + * access to the lock. + * + * The index of an entry in the directory is the same as the 2nd index + * of the entry in the counts array. + */ +/* + * This table contains data for spin_locks, write locks, and read locks + * Not all data is used for all cases. In particular, the hold time + * information is not stored here for read locks since that is a global + * (e. g. cannot be separated out by return address) quantity. + * See the lstat_read_lock_counts_t structure for the global read lock + * hold time. 
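
A short self-contained illustration (not from the patch) of how the LSTAT_RA tagging and DIRHASH defined above fit together: return addresses are at least 4-byte aligned, so the two low bits are free to carry the lock type, and the hash drops those same two bits before indexing the 4096-entry table.

        #include <assert.h>
        #include <stdio.h>

        #define LSTAT_RA_WRITE 3        /* low-2-bit tag values: 0 spin, 1 read, 2 sema, 3 write */

        #define LSTAT_HASH_TABLE_SIZE 4096
        #define LSTAT_HASH_TABLE_MASK (LSTAT_HASH_TABLE_SIZE - 1)
        #define DIRHASH(ra) ((unsigned long)(ra) >> 2 & LSTAT_HASH_TABLE_MASK)

        int main(void)
        {
                /* stand-in for __builtin_return_address(0); already 4-byte aligned */
                unsigned long ra = 0x8012abccUL;

                /* what LSTAT_RA(LSTAT_RA_WRITE) would produce for this caller */
                void *tagged = (void *)((ra & ~3UL) | LSTAT_RA_WRITE);

                assert(((unsigned long)tagged & 3) == LSTAT_RA_WRITE);          /* lock type */
                assert(DIRHASH(tagged) == ((ra >> 2) & LSTAT_HASH_TABLE_MASK)); /* tag bits ignored */
                printf("hash bucket %lu\n", DIRHASH(tagged));
                return 0;
        }
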
+ */ +typedef struct { + uint64_t cum_wait_ticks; /* sum of wait times */ + /* for write locks, sum of time a */ + /* writer is waiting for a reader */ + int64_t cum_hold_ticks; /* cumulative sum of holds */ + /* not used for read mode locks */ + /* must be signed. ............... */ + uint32_t max_wait_ticks; /* max waiting time */ + uint32_t max_hold_ticks; /* max holding time */ + uint64_t cum_wait_ww_ticks; /* sum times writer waits on writer*/ + uint32_t max_wait_ww_ticks; /* max wait time writer vs writer */ + /* prev 2 only used for write locks*/ + uint32_t acquire_time; /* time lock acquired this CPU */ + uint32_t count[LSTAT_ACT_MAX_VALUES]; +} lstat_lock_counts_t; + +typedef lstat_lock_counts_t lstat_cpu_counts_t[LSTAT_MAX_STAT_INDEX]; + +/* + * User request to: + * - turn statistic collection on/off, or to reset + */ +#define LSTAT_OFF 0 +#define LSTAT_ON 1 +#define LSTAT_RESET 2 +#define LSTAT_RELEASE 3 + +#define LSTAT_MAX_READ_LOCK_INDEX 1000 +typedef struct { + POINTER lock_ptr; /* address of lock for output stats */ + uint32_t read_lock_count; + int64_t cum_hold_ticks; /* sum of read lock hold times over */ + /* all callers. ....................*/ + uint32_t write_index; /* last write lock hash table index */ + uint32_t busy_periods; /* count of busy periods ended this */ + uint64_t start_busy; /* time this busy period started. ..*/ + uint64_t busy_ticks; /* sum of busy periods this lock. ..*/ + uint64_t max_busy; /* longest busy period for this lock*/ + uint32_t max_readers; /* maximum number of readers ...... */ +#ifdef USER_MODE_TESTING + rwlock_t entry_lock; /* lock for this read lock entry... */ + /* avoid having more than one rdr at*/ + /* needed for user space testing... */ + /* not needed for kernel 'cause it */ + /* is non-preemptive. ............. */ +#endif +} lstat_read_lock_counts_t; +typedef lstat_read_lock_counts_t lstat_read_lock_cpu_counts_t[LSTAT_MAX_READ_LOCK_INDEX]; + +#if defined(__KERNEL__) || defined(USER_MODE_TESTING) + +#ifndef USER_MODE_TESTING +#include +#else +#include "asm_newlockmeter.h" +#endif + +/* + * Size and mask for the hash table into the directory. + */ +#define LSTAT_HASH_TABLE_SIZE 4096 /* must be 2**N */ +#define LSTAT_HASH_TABLE_MASK (LSTAT_HASH_TABLE_SIZE-1) + +#define DIRHASH(ra) ((unsigned long)(ra)>>2 & LSTAT_HASH_TABLE_MASK) + +/* + * This version eliminates the per processor lock stack. What we do is to + * store the index of the lock hash structure in unused bits in the lock + * itself. Then on unlock we can find the statistics record without doing + * any additional hash or lock stack lookup. This works for spin_locks. + * Hold time reporting is now basically as cheap as wait time reporting + * so we ignore the difference between LSTAT_ON_HOLD and LSTAT_ON_WAIT + * as in version 1.1.* of lockmeter. + * + * For rw_locks, we store the index of a global reader stats structure in + * the lock and the writer index is stored in the latter structure. + * For read mode locks we hash at the time of the lock to find an entry + * in the directory for reader wait time and the like. + * At unlock time for read mode locks, we update just the global structure + * so we don't need to know the reader directory index value at unlock time. + * + */ + +/* + * Protocol to change lstat_control.state + * This is complicated because we don't want the cum_hold_time for + * a rw_lock to be decremented in _read_lock_ without making sure it + * is incremented in _read_lock_ and vice versa. 
So here is the + * way we change the state of lstat_control.state: + * I. To Turn Statistics On + * After allocating storage, set lstat_control.state non-zero. + * This works because we don't start updating statistics for in use + * locks until the reader lock count goes to zero. + * II. To Turn Statistics Off: + * (0) Disable interrupts on this CPU + * (1) Seize the lstat_control.directory_lock + * (2) Obtain the current value of lstat_control.next_free_read_lock_index + * (3) Store a zero in lstat_control.state. + * (4) Release the lstat_control.directory_lock + * (5) For each lock in the read lock list up to the saved value + * (well, -1) of the next_free_read_lock_index, do the following: + * (a) Check validity of the stored lock address + * by making sure that the word at the saved addr + * has an index that matches this entry. If not + * valid, then skip this entry. + * (b) If there is a write lock already set on this lock, + * skip to (d) below. + * (c) Set a non-metered write lock on the lock + * (d) set the cached INDEX in the lock to zero + * (e) Release the non-metered write lock. + * (6) Re-enable interrupts + * + * These rules ensure that a read lock will not have its statistics + * partially updated even though the global lock recording state has + * changed. See put_lockmeter_info() for implementation. + * + * The reason for (b) is that there may be write locks set on the + * syscall path to put_lockmeter_info() from user space. If we do + * not do this check, then we can deadlock. A similar problem would + * occur if the lock was read locked by the current CPU. At the + * moment this does not appear to happen. + */ + +/* + * Main control structure for lockstat. Used to turn statistics on/off + * and to maintain directory info. + */ +typedef struct { + int state; + spinlock_t control_lock; /* used to serialize turning statistics on/off */ + spinlock_t directory_lock; /* for serialize adding entries to directory */ + volatile int next_free_dir_index;/* next free entry in the directory */ + /* FIXME not all of these fields are used / needed .............. */ + /* the following fields represent data since */ + /* first "lstat on" or most recent "lstat reset" */ + TIME_T first_started_time; /* time when measurement first enabled */ + TIME_T started_time; /* time when measurement last started */ + TIME_T ending_time; /* time when measurement last disabled */ + uint64_t started_cycles64; /* cycles when measurement last started */ + uint64_t ending_cycles64; /* cycles when measurement last disabled */ + uint64_t enabled_cycles64; /* total cycles with measurement enabled */ + int intervals; /* number of measurement intervals recorded */ + /* i. e. 
number of times did lstat on;lstat off */ + lstat_directory_entry_t *dir; /* directory */ + int dir_overflow; /* count of times ran out of space in directory */ + int rwlock_overflow; /* count of times we couldn't allocate a rw block*/ + ushort *hashtab; /* hash table for quick dir scans */ + lstat_cpu_counts_t *counts[NR_CPUS]; /* Array of pointers to per-cpu stats */ + int next_free_read_lock_index; /* next rwlock reader (global) stats block */ + lstat_read_lock_cpu_counts_t *read_lock_counts[NR_CPUS]; /* per cpu read lock stats */ +} lstat_control_t; + +#endif /* defined(__KERNEL__) || defined(USER_MODE_TESTING) */ + +typedef struct { + short lstat_version; /* version of the data */ + short state; /* the current state is returned */ + int maxcpus; /* Number of cpus present */ + int next_free_dir_index; /* index of the next free directory entry */ + TIME_T first_started_time; /* when measurement enabled for first time */ + TIME_T started_time; /* time in secs since 1969 when stats last turned on */ + TIME_T ending_time; /* time in secs since 1969 when stats last turned off */ + uint32_t cycleval; /* cycles per second */ +#ifdef notyet + void *kernel_magic_addr; /* address of kernel_magic */ + void *kernel_end_addr; /* contents of kernel magic (points to "end") */ +#endif + int next_free_read_lock_index; /* index of next (global) read lock stats struct */ + uint64_t started_cycles64; /* cycles when measurement last started */ + uint64_t ending_cycles64; /* cycles when stats last turned off */ + uint64_t enabled_cycles64; /* total cycles with measurement enabled */ + int intervals; /* number of measurement intervals recorded */ + /* i.e. number of times we did lstat on;lstat off*/ + int dir_overflow; /* number of times we wanted more space in directory */ + int rwlock_overflow; /* # of times we wanted more space in read_locks_count */ + struct new_utsname uts; /* info about machine where stats are measured */ + /* -T option of lockstat allows data to be */ + /* moved to another machine. ................. */ +} lstat_user_request_t; + +#endif /* _LINUX_LOCKMETER_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/mm.h 900-mjb2/include/linux/mm.h --- 000-virgin/include/linux/mm.h Fri May 30 19:02:22 2003 +++ 900-mjb2/include/linux/mm.h Wed Jun 11 22:42:42 2003 @@ -179,6 +179,7 @@ struct page { struct pte_chain *chain;/* Reverse pte mapping pointer. 
* protected by PG_chainlock */ pte_addr_t direct; + int mapcount; } pte; unsigned long private; /* mapping-private opaque data */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/mmzone.h 900-mjb2/include/linux/mmzone.h --- 000-virgin/include/linux/mmzone.h Fri May 30 19:02:22 2003 +++ 900-mjb2/include/linux/mmzone.h Wed Jun 11 22:47:08 2003 @@ -185,6 +185,7 @@ typedef struct pglist_data { struct bootmem_data *bdata; unsigned long node_start_pfn; unsigned long node_size; + unsigned long real_node_size; int node_id; struct pglist_data *pgdat_next; wait_queue_head_t kswapd_wait; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/nfsd/syscall.h 900-mjb2/include/linux/nfsd/syscall.h --- 000-virgin/include/linux/nfsd/syscall.h Sat May 10 18:35:03 2003 +++ 900-mjb2/include/linux/nfsd/syscall.h Wed Jun 11 22:48:35 2003 @@ -59,7 +59,7 @@ struct nfsctl_client { struct nfsctl_export { char ex_client[NFSCLNT_IDMAX+1]; char ex_path[NFS_MAXPATHLEN+1]; - __kernel_old_dev_t ex_dev; + u16 ex_dev; __kernel_ino_t ex_ino; int ex_flags; __kernel_uid_t ex_anon_uid; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/page-flags.h 900-mjb2/include/linux/page-flags.h --- 000-virgin/include/linux/page-flags.h Sun Apr 20 19:35:07 2003 +++ 900-mjb2/include/linux/page-flags.h Wed Jun 11 22:42:42 2003 @@ -74,6 +74,7 @@ #define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ #define PG_reclaim 18 /* To be reclaimed asap */ #define PG_compound 19 /* Part of a compound page */ +#define PG_anon 20 /* Anonymous page */ /* @@ -256,6 +257,10 @@ extern void get_full_page_state(struct p #define PageCompound(page) test_bit(PG_compound, &(page)->flags) #define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) #define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) + +#define PageAnon(page) test_bit(PG_anon, &(page)->flags) +#define SetPageAnon(page) set_bit(PG_anon, &(page)->flags) +#define ClearPageAnon(page) clear_bit(PG_anon, &(page)->flags) /* * The PageSwapCache predicate doesn't use a PG_flag at this time, diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/pagemap.h 900-mjb2/include/linux/pagemap.h --- 000-virgin/include/linux/pagemap.h Fri May 30 19:02:23 2003 +++ 900-mjb2/include/linux/pagemap.h Wed Jun 11 22:55:21 2003 @@ -135,6 +135,16 @@ static inline void lock_page(struct page if (TestSetPageLocked(page)) __lock_page(page); } + +extern int FASTCALL(__lock_page_wq(struct page *page, wait_queue_t *wait)); +static inline int lock_page_wq(struct page *page, wait_queue_t *wait) +{ + if (TestSetPageLocked(page)) + return __lock_page_wq(page, wait); + else + return 0; +} + /* * This is exported only for wait_on_page_locked/wait_on_page_writeback. 
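
The *_wq additions above appear to share one calling convention: when the caller supplies a wait-queue entry and the page (or buffer, or semaphore) is busy, the helper queues the wait and returns a nonzero status so the caller can back out and retry later, while a NULL wait keeps the old blocking behaviour. A rough user-space sketch of that convention, with invented names and a plain flag in place of the real locking, purely to show the shape of the interface:

        #include <assert.h>
        #include <stddef.h>

        struct waiter   { int queued; };                    /* stand-in for wait_queue_t */
        struct resource { int busy; struct waiter *w; };    /* stand-in for a page/buffer */

        /* returns 0 if acquired, nonzero if the wait was queued for a later retry */
        static int acquire_wq(struct resource *r, struct waiter *wait)
        {
                if (!r->busy) {
                        r->busy = 1;
                        return 0;
                }
                if (wait) {                     /* async caller: park the wait and bail out */
                        wait->queued = 1;
                        r->w = wait;
                        return 1;
                }
                /* wait == NULL: a real kernel would sleep here, as the old code did */
                return 0;
        }

        int main(void)
        {
                struct resource res = { 0, NULL };
                struct waiter w = { 0 };

                assert(acquire_wq(&res, &w) == 0);      /* free: taken immediately */
                assert(acquire_wq(&res, &w) == 1);      /* busy: queued for retry */
                assert(w.queued == 1);
                return 0;
        }
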
@@ -153,6 +163,15 @@ static inline void wait_on_page_locked(s { if (PageLocked(page)) wait_on_page_bit(page, PG_locked); +} + +extern int FASTCALL(wait_on_page_bit_wq(struct page *page, int bit_nr, + wait_queue_t *wait)); +static inline int wait_on_page_locked_wq(struct page *page, wait_queue_t *wait) +{ + if (PageLocked(page)) + return wait_on_page_bit_wq(page, PG_locked, wait); + return 0; } /* diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/pci.h 900-mjb2/include/linux/pci.h --- 000-virgin/include/linux/pci.h Fri May 30 19:02:23 2003 +++ 900-mjb2/include/linux/pci.h Wed Jun 11 22:47:07 2003 @@ -451,10 +451,10 @@ struct pci_bus { void *sysdata; /* hook for sys-specific extension */ struct proc_dir_entry *procdir; /* directory entry in /proc/bus/pci */ - unsigned char number; /* bus number */ - unsigned char primary; /* number of primary bridge */ - unsigned char secondary; /* number of secondary bridge */ - unsigned char subordinate; /* max number of subordinate buses */ + unsigned int number; /* bus number */ + unsigned int primary; /* number of primary bridge */ + unsigned int secondary; /* number of secondary bridge */ + unsigned int subordinate; /* max number of subordinate buses */ char name[48]; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/proc_fs.h 900-mjb2/include/linux/proc_fs.h --- 000-virgin/include/linux/proc_fs.h Fri May 30 19:02:23 2003 +++ 900-mjb2/include/linux/proc_fs.h Wed Jun 11 22:56:53 2003 @@ -87,8 +87,6 @@ extern void proc_root_init(void); extern void proc_misc_init(void); struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry); -struct dentry *proc_pid_unhash(struct task_struct *p); -void proc_pid_flush(struct dentry *proc_dentry); int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/sched.h 900-mjb2/include/linux/sched.h --- 000-virgin/include/linux/sched.h Fri May 30 19:02:23 2003 +++ 900-mjb2/include/linux/sched.h Wed Jun 11 22:56:53 2003 @@ -69,7 +69,11 @@ struct exec_domain; * the EXP_n values would be 1981, 2034 and 2043 if still using only * 11 bit fractions. 
*/ -extern unsigned long avenrun[]; /* Load averages */ +extern unsigned long avenrun[]; /* Load averages */ +extern unsigned long tasks_running[3]; /* Real load averages */ +extern unsigned long cpu_tasks_running[3][NR_CPUS]; /* Real load averages per cpu */ + +extern unsigned long tasks_running[]; /* Real load averages */ #define FSHIFT 11 /* nr of bits of precision */ #define FIXED_1 (1< #include #include @@ -166,7 +174,13 @@ extern unsigned long cache_decay_ticks; #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern signed long FASTCALL(schedule_timeout(signed long timeout)); -asmlinkage void schedule(void); +#ifdef CONFIG_KGDB_THREAD + asmlinkage void do_schedule(void); + asmlinkage void kern_schedule(void); + asmlinkage void kern_do_schedule(struct pt_regs); +#else + asmlinkage void schedule(void); +#endif struct namespace; @@ -320,6 +334,13 @@ struct k_itimer { }; +struct sched_info { + /* running averages */ + unsigned long response_time, inter_arrival_time, service_time; + + /* timestamps */ + unsigned long last_arrival, began_service; +}; struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ @@ -341,6 +362,8 @@ struct task_struct { unsigned long cpus_allowed; unsigned int time_slice, first_time_slice; + struct sched_info sched_info; + struct list_head tasks; struct list_head ptrace_children; struct list_head ptrace_list; @@ -435,8 +458,6 @@ struct task_struct { u32 self_exec_id; /* Protection of (de-)allocation: mm, files, fs, tty */ spinlock_t alloc_lock; -/* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */ - spinlock_t proc_lock; /* context-switch lock */ spinlock_t switch_lock; @@ -451,6 +472,8 @@ struct task_struct { unsigned long ptrace_message; siginfo_t *last_siginfo; /* For ptrace use. */ +/* current io wait handle */ + wait_queue_t *io_wait; }; extern void __put_task_struct(struct task_struct *tsk); @@ -486,7 +509,7 @@ extern void set_cpus_allowed(task_t *p, # define set_cpus_allowed(p, new_mask) do { } while (0) #endif -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED extern void sched_balance_exec(void); extern void node_nr_running_init(void); #else @@ -700,6 +723,12 @@ static inline int thread_group_empty(tas (thread_group_leader(p) && !thread_group_empty(p)) extern void unhash_process(struct task_struct *p); + +#ifdef CONFIG_KGDB_THREAD +#define schedule() kern_schedule() +#else +#define user_schedule() schedule() +#endif /* Protects ->fs, ->files, ->mm, and synchronises with wait4(). * Nests both inside and outside of read_lock(&tasklist_lock). diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/spinlock.h 900-mjb2/include/linux/spinlock.h --- 000-virgin/include/linux/spinlock.h Fri May 30 19:02:23 2003 +++ 900-mjb2/include/linux/spinlock.h Wed Jun 11 22:55:09 2003 @@ -183,6 +183,17 @@ typedef struct { #endif /* !SMP */ +#ifdef CONFIG_LOCKMETER +extern void _metered_spin_lock (spinlock_t *lock); +extern void _metered_spin_unlock (spinlock_t *lock); +extern int _metered_spin_trylock(spinlock_t *lock); +extern void _metered_read_lock (rwlock_t *lock); +extern void _metered_read_unlock (rwlock_t *lock); +extern void _metered_write_lock (rwlock_t *lock); +extern void _metered_write_unlock (rwlock_t *lock); +extern int _metered_write_trylock(rwlock_t *lock); +#endif + /* * Define the various spin_lock and rw_lock methods. Note we define these * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. 
The various @@ -387,6 +398,141 @@ do { \ #define spin_trylock_bh(lock) ({ local_bh_disable(); preempt_disable(); \ _raw_spin_trylock(lock) ? 1 : \ ({preempt_enable(); local_bh_enable(); 0;});}) + +#ifdef CONFIG_LOCKMETER +#undef spin_lock +#undef spin_trylock +#undef spin_unlock +#undef spin_lock_irqsave +#undef spin_lock_irq +#undef spin_lock_bh +#undef read_lock +#undef read_unlock +#undef write_lock +#undef write_unlock +#undef write_trylock +#undef spin_unlock_bh +#undef read_lock_irqsave +#undef read_lock_irq +#undef read_lock_bh +#undef read_unlock_bh +#undef write_lock_irqsave +#undef write_lock_irq +#undef write_lock_bh +#undef write_unlock_bh + +#define spin_lock(lock) \ +do { \ + preempt_disable(); \ + _metered_spin_lock(lock); \ +} while(0) + +#define spin_trylock(lock) ({preempt_disable(); _metered_spin_trylock(lock) ? \ + 1 : ({preempt_enable(); 0;});}) +#define spin_unlock(lock) \ +do { \ + _metered_spin_unlock(lock); \ + preempt_enable(); \ +} while (0) + +#define spin_lock_irqsave(lock, flags) \ +do { \ + local_irq_save(flags); \ + preempt_disable(); \ + _metered_spin_lock(lock); \ +} while (0) + +#define spin_lock_irq(lock) \ +do { \ + local_irq_disable(); \ + preempt_disable(); \ + _metered_spin_lock(lock); \ +} while (0) + +#define spin_lock_bh(lock) \ +do { \ + local_bh_disable(); \ + preempt_disable(); \ + _metered_spin_lock(lock); \ +} while (0) + +#define spin_unlock_bh(lock) \ +do { \ + _metered_spin_unlock(lock); \ + preempt_enable(); \ + local_bh_enable(); \ +} while (0) + + +#define read_lock(lock) ({preempt_disable(); _metered_read_lock(lock);}) +#define read_unlock(lock) ({_metered_read_unlock(lock); preempt_enable();}) +#define write_lock(lock) ({preempt_disable(); _metered_write_lock(lock);}) +#define write_unlock(lock) ({_metered_write_unlock(lock); preempt_enable();}) +#define write_trylock(lock) ({preempt_disable();_metered_write_trylock(lock) ? 
\ + 1 : ({preempt_enable(); 0;});}) +#define spin_unlock_no_resched(lock) \ +do { \ + _metered_spin_unlock(lock); \ + preempt_enable_no_resched(); \ +} while (0) + +#define read_lock_irqsave(lock, flags) \ +do { \ + local_irq_save(flags); \ + preempt_disable(); \ + _metered_read_lock(lock); \ +} while (0) + +#define read_lock_irq(lock) \ +do { \ + local_irq_disable(); \ + preempt_disable(); \ + _metered_read_lock(lock); \ +} while (0) + +#define read_lock_bh(lock) \ +do { \ + local_bh_disable(); \ + preempt_disable(); \ + _metered_read_lock(lock); \ +} while (0) + +#define read_unlock_bh(lock) \ +do { \ + _metered_read_unlock(lock); \ + preempt_enable(); \ + local_bh_enable(); \ +} while (0) + +#define write_lock_irqsave(lock, flags) \ +do { \ + local_irq_save(flags); \ + preempt_disable(); \ + _metered_write_lock(lock); \ +} while (0) + +#define write_lock_irq(lock) \ +do { \ + local_irq_disable(); \ + preempt_disable(); \ + _metered_write_lock(lock); \ +} while (0) + +#define write_lock_bh(lock) \ +do { \ + local_bh_disable(); \ + preempt_disable(); \ + _metered_write_lock(lock); \ +} while (0) + +#define write_unlock_bh(lock) \ +do { \ + _metered_write_unlock(lock); \ + preempt_enable(); \ + local_bh_enable(); \ +} while (0) + +#endif /* !CONFIG_LOCKMETER */ /* "lock on reference count zero" */ #ifndef ATOMIC_DEC_AND_LOCK diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/swap.h 900-mjb2/include/linux/swap.h --- 000-virgin/include/linux/swap.h Fri May 30 19:02:23 2003 +++ 900-mjb2/include/linux/swap.h Wed Jun 11 22:42:42 2003 @@ -186,6 +186,8 @@ struct pte_chain *FASTCALL(page_add_rmap void FASTCALL(page_remove_rmap(struct page *, pte_t *)); int FASTCALL(try_to_unmap(struct page *)); +int page_convert_anon(struct page *); + /* linux/mm/shmem.c */ extern int shmem_unuse(swp_entry_t entry, struct page *page); #else diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/sysctl.h 900-mjb2/include/linux/sysctl.h --- 000-virgin/include/linux/sysctl.h Fri May 30 19:02:23 2003 +++ 900-mjb2/include/linux/sysctl.h Wed Jun 11 22:42:39 2003 @@ -66,7 +66,8 @@ enum CTL_DEV=7, /* Devices */ CTL_BUS=8, /* Busses */ CTL_ABI=9, /* Binary emulation */ - CTL_CPU=10 /* CPU stuff (speed scaling, etc) */ + CTL_CPU=10, /* CPU stuff (speed scaling, etc) */ + CTL_SCHED=11, /* scheduler tunables */ }; /* CTL_BUS names: */ @@ -158,6 +159,21 @@ enum VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */ }; +/* Tunable scheduler parameters in /proc/sys/sched/ */ +enum { + SCHED_MIN_TIMESLICE=1, /* minimum process timeslice */ + SCHED_MAX_TIMESLICE=2, /* maximum process timeslice */ + SCHED_CHILD_PENALTY=3, /* penalty on fork to child */ + SCHED_PARENT_PENALTY=4, /* penalty on fork to parent */ + SCHED_EXIT_WEIGHT=5, /* penalty to parent of CPU hog child */ + SCHED_PRIO_BONUS_RATIO=6, /* percent of max prio given as bonus */ + SCHED_INTERACTIVE_DELTA=7, /* delta used to scale interactivity */ + SCHED_MAX_SLEEP_AVG=8, /* maximum sleep avg attainable */ + SCHED_STARVATION_LIMIT=9, /* no re-active if expired is starved */ + SCHED_NODE_THRESHOLD=10, /* NUMA node rebalance threshold */ + SCHED_IDLE_NODE_REBALANCE_RATIO=11, /* how often to global balance */ + SCHED_BUSY_NODE_REBALANCE_RATIO=12, /* how often to global balance */ +}; /* CTL_NET names: */ enum diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/timex.h 900-mjb2/include/linux/timex.h --- 000-virgin/include/linux/timex.h Sat May 10 18:35:04 2003 +++ 900-mjb2/include/linux/timex.h Wed Jun 11 22:42:36 2003 @@ 
-75,7 +75,7 @@ #elif HZ >= 768 && HZ < 1536 # define SHIFT_HZ 10 #else -# error You lose. +# error Please use a HZ value which is between 12 and 1536 #endif /* diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/wait.h 900-mjb2/include/linux/wait.h --- 000-virgin/include/linux/wait.h Fri May 30 19:02:23 2003 +++ 900-mjb2/include/linux/wait.h Wed Jun 11 22:55:21 2003 @@ -80,6 +80,8 @@ static inline int waitqueue_active(wait_ return !list_empty(&q->task_list); } +#define is_sync_wait(wait) (!(wait) || ((wait)->task)) + extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/writeback.h 900-mjb2/include/linux/writeback.h --- 000-virgin/include/linux/writeback.h Mon Mar 17 21:43:50 2003 +++ 900-mjb2/include/linux/writeback.h Wed Jun 11 22:55:22 2003 @@ -80,8 +80,8 @@ extern int dirty_expire_centisecs; void page_writeback_init(void); -void balance_dirty_pages(struct address_space *mapping); -void balance_dirty_pages_ratelimited(struct address_space *mapping); +int balance_dirty_pages(struct address_space *mapping); +int balance_dirty_pages_ratelimited(struct address_space *mapping); int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0); int do_writepages(struct address_space *mapping, struct writeback_control *wbc); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/init/main.c 900-mjb2/init/main.c --- 000-virgin/init/main.c Fri May 30 19:02:23 2003 +++ 900-mjb2/init/main.c Wed Jun 11 22:42:59 2003 @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -55,6 +56,10 @@ #include #endif +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif + /* * Versions of gcc older than that listed below may actually compile * and link okay, but the end product can have subtle run time bugs. @@ -387,6 +392,8 @@ asmlinkage void __init start_kernel(void */ lock_kernel(); printk(linux_banner); + setup_early_printk(); + setup_arch(&command_line); setup_per_cpu_areas(); @@ -458,6 +465,12 @@ asmlinkage void __init start_kernel(void * make syscalls (and thus be locked). 
*/ init_idle(current, smp_processor_id()); + +#ifdef CONFIG_X86_REMOTE_DEBUG + if (gdb_enter) { + gdb_hook(); /* right at boot time */ + } +#endif /* Do the rest non-__init'ed, we're now alive */ rest_init(); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/Makefile 900-mjb2/kernel/Makefile --- 000-virgin/kernel/Makefile Fri May 30 19:02:24 2003 +++ 900-mjb2/kernel/Makefile Wed Jun 11 22:47:01 2003 @@ -11,6 +11,7 @@ obj-y = sched.o fork.o exec_domain.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o +obj-$(CONFIG_LOCKMETER) += lockmeter.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += ksyms.o module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o @@ -19,6 +20,7 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o obj-$(CONFIG_COMPAT) += compat.o +obj-$(CONFIG_EARLY_PRINTK) += early_printk.o ifneq ($(CONFIG_IA64),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/early_printk.c 900-mjb2/kernel/early_printk.c --- 000-virgin/kernel/early_printk.c Wed Dec 31 16:00:00 1969 +++ 900-mjb2/kernel/early_printk.c Wed Jun 11 22:42:36 2003 @@ -0,0 +1,218 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +/* Simple VGA output */ + +#define MAX_YPOS 25 +#define MAX_XPOS 80 + +static int current_ypos = 1, current_xpos = 0; + +static void early_vga_write(struct console *con, const char *str, unsigned n) +{ + char c; + int i, k, j; + + while ((c = *str++) != '\0' && n-- > 0) { + if (current_ypos >= MAX_YPOS) { + /* scroll 1 line up */ + for(k = 1, j = 0; k < MAX_YPOS; k++, j++) { + for(i = 0; i < MAX_XPOS; i++) { + writew(readw(VGABASE + 2*(MAX_XPOS*k + i)), + VGABASE + 2*(MAX_XPOS*j + i)); + } + } + for(i = 0; i < MAX_XPOS; i++) { + writew(0x720, VGABASE + 2*(MAX_XPOS*j + i)); + } + current_ypos = MAX_YPOS-1; + } + if (c == '\n') { + current_xpos = 0; + current_ypos++; + } else if (c != '\r') { + writew(((0x7 << 8) | (unsigned short) c), + VGABASE + 2*(MAX_XPOS*current_ypos + current_xpos++)); + if (current_xpos >= MAX_XPOS) { + current_xpos = 0; + current_ypos++; + } + } + } +} + +static struct console early_vga_console = { + .name = "earlyvga", + .write = early_vga_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Serial functions losely based on a similar package from Klaus P. Gerlicher */ + +int early_serial_base; /* ttyS0 */ + +static int early_serial_putc(unsigned char ch) +{ + unsigned timeout = 0xffff; + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + rep_nop(); + outb(ch, early_serial_base + TXR); + return timeout ? 
0 : -1; +} + +static void early_serial_write(struct console *con, const char *s, unsigned n) +{ + while (*s && n-- > 0) { + early_serial_putc(*s); + if (*s == '\n') + early_serial_putc('\r'); + s++; + } +} + +static __init void early_serial_init(char *opt) +{ + unsigned char c; + unsigned divisor, baud = DEFAULT_BAUD; + static int bases[] = SERIAL_BASES; + char *s, *e; + + early_serial_base = bases[0]; + + if (*opt == ',') + ++opt; + + s = strsep(&opt, ","); + if (s != NULL) { + unsigned port; + if (!strncmp(s,"0x",2)) + early_serial_base = simple_strtoul(s, &e, 16); + else { + if (!strncmp(s,"ttyS",4)) + s+=4; + port = simple_strtoul(s, &e, 10); + if (port > (SERIAL_BASES_LEN-1) || s == e) + port = 0; + early_serial_base = bases[port]; + } + } + + outb(0x3, early_serial_base + LCR); /* 8n1 */ + outb(0, early_serial_base + IER); /* no interrupt */ + outb(0, early_serial_base + FCR); /* no fifo */ + outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + + s = strsep(&opt, ","); + if (s != NULL) { + baud = simple_strtoul(s, &e, 0); + if (baud == 0 || s == e) + baud = DEFAULT_BAUD; + } + + divisor = 115200 / baud; + c = inb(early_serial_base + LCR); + outb(c | DLAB, early_serial_base + LCR); + outb(divisor & 0xff, early_serial_base + DLL); + outb((divisor >> 8) & 0xff, early_serial_base + DLH); + outb(c & ~DLAB, early_serial_base + LCR); +} + +static struct console early_serial_console = { + .name = "earlyser", + .write = early_serial_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Direct interface for emergencies */ +struct console *early_console = &early_vga_console; +static int early_console_initialized = 0; + +void early_printk(const char *fmt, ...) +{ + char buf[512]; + int n; + va_list ap; + va_start(ap,fmt); + n = vsnprintf(buf,512,fmt,ap); + early_console->write(early_console,buf,n); + va_end(ap); +} + +static int keep_early; + +int __init setup_early_printk(void) +{ + char *space, *s; + char buf[256]; + char cmd[COMMAND_LINE_SIZE]; + char *opt; + + /* Get our own copy of the cmd line */ + memcpy(cmd, COMMAND_LINE, COMMAND_LINE_SIZE); + cmd[COMMAND_LINE_SIZE-1] = '\0'; + opt = cmd; + + s = strstr(opt, "earlyprintk="); + if (s == NULL) + return -1; + opt = s+12; + + if (early_console_initialized) + return -1; + + strncpy(buf,opt,256); + buf[255] = 0; + space = strchr(buf, ' '); + if (space) + *space = 0; + + if (strstr(buf,"keep")) + keep_early = 1; + + if (!strncmp(buf, "serial", 6)) { + early_serial_init(buf + 6); + early_console = &early_serial_console; + } else if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf); + early_console = &early_serial_console; + } else if (!strncmp(buf, "vga", 3)) { + early_console = &early_vga_console; + } else { + early_console = NULL; + return -1; + } + early_console_initialized = 1; + register_console(early_console); + printk("early printk console registered\n"); + return 0; +} + +void __init disable_early_printk(void) +{ + if (!early_console_initialized || !early_console) + return; + if (!keep_early) { + printk("disabling early console...\n"); + unregister_console(early_console); + early_console_initialized = 0; + } else { + printk("keeping early console.\n"); + } +} + +/* syntax: earlyprintk=vga + earlyprintk=serial[,ttySn[,baudrate]] + Append ,keep to not disable it when the real console takes over. + Only vga or serial at a time, not both. + Currently only ttyS0 and ttyS1 are supported. + Interaction with the standard serial driver is not very good. + The VGA output is eventually overwritten by the real console. 
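
Worked examples (not from the patch itself) of boot parameters the parser above accepts; the specific ports and baud rates are just sample values:

        earlyprintk=vga
        earlyprintk=serial,ttyS1,9600
        earlyprintk=serial,0x3f8,115200,keep

With no port or rate given, the code falls back to ttyS0 at 57600; appending ,keep stops disable_early_printk() from unregistering the console when the real console takes over.
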
*/ +__setup("earlyprintk=", setup_early_printk); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/exit.c 900-mjb2/kernel/exit.c --- 000-virgin/kernel/exit.c Fri May 30 19:02:24 2003 +++ 900-mjb2/kernel/exit.c Wed Jun 11 22:56:53 2003 @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -32,8 +31,10 @@ extern struct task_struct *child_reaper; int getrusage(struct task_struct *, int, struct rusage *); -static void __unhash_process(struct task_struct *p) +static struct dentry * __unhash_process(struct task_struct *p) { + struct dentry *proc_dentry; + nr_threads--; detach_pid(p, PIDTYPE_PID); detach_pid(p, PIDTYPE_TGID); @@ -45,25 +46,34 @@ static void __unhash_process(struct task } REMOVE_LINKS(p); + proc_dentry = p->proc_dentry; + if (unlikely(proc_dentry != NULL)) { + spin_lock(&dcache_lock); + if (!d_unhashed(proc_dentry)) { + dget_locked(proc_dentry); + __d_drop(proc_dentry); + } else + proc_dentry = NULL; + spin_unlock(&dcache_lock); + } + return proc_dentry; } void release_task(struct task_struct * p) { - task_t *leader; struct dentry *proc_dentry; + task_t *leader; BUG_ON(p->state < TASK_ZOMBIE); atomic_dec(&p->user->processes); - spin_lock(&p->proc_lock); - proc_dentry = proc_pid_unhash(p); write_lock_irq(&tasklist_lock); if (unlikely(p->ptrace)) __ptrace_unlink(p); BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); __exit_signal(p); __exit_sighand(p); - __unhash_process(p); + proc_dentry = __unhash_process(p); /* * If we are the last non-leader member of the thread @@ -82,8 +92,11 @@ void release_task(struct task_struct * p p->parent->cnswap += p->nswap + p->cnswap; sched_exit(p); write_unlock_irq(&tasklist_lock); - spin_unlock(&p->proc_lock); - proc_pid_flush(proc_dentry); + + if (unlikely(proc_dentry != NULL)) { + shrink_dcache_parent(proc_dentry); + dput(proc_dentry); + } release_thread(p); put_task_struct(p); } @@ -94,13 +107,14 @@ void unhash_process(struct task_struct * { struct dentry *proc_dentry; - spin_lock(&p->proc_lock); - proc_dentry = proc_pid_unhash(p); write_lock_irq(&tasklist_lock); - __unhash_process(p); + proc_dentry = __unhash_process(p); write_unlock_irq(&tasklist_lock); - spin_unlock(&p->proc_lock); - proc_pid_flush(proc_dentry); + + if (unlikely(proc_dentry != NULL)) { + shrink_dcache_parent(proc_dentry); + dput(proc_dentry); + } } /* diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/fork.c 900-mjb2/kernel/fork.c --- 000-virgin/kernel/fork.c Fri May 30 19:02:24 2003 +++ 900-mjb2/kernel/fork.c Wed Jun 11 22:55:21 2003 @@ -141,8 +141,9 @@ void remove_wait_queue(wait_queue_head_t void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) { unsigned long flags; - - __set_current_state(state); + + if (is_sync_wait(wait)) + __set_current_state(state); wait->flags &= ~WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&q->lock, flags); if (list_empty(&wait->task_list)) @@ -155,7 +156,8 @@ prepare_to_wait_exclusive(wait_queue_hea { unsigned long flags; - __set_current_state(state); + if (is_sync_wait(wait)) + __set_current_state(state); wait->flags |= WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&q->lock, flags); if (list_empty(&wait->task_list)) @@ -199,7 +201,10 @@ void __init fork_init(unsigned long memp * value: the thread structures can take up at most half * of memory. 
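As a worked example of the computation below (page and stack sizes assumed only for illustration): with 4 KB pages and an 8 KB thread structure, a machine with 262144 pages (1 GB) gets max_threads = 262144 / 2 / 8 = 16384. The new THREAD_SIZE < PAGE_SIZE branch exists because the old expression's integer division THREAD_SIZE/PAGE_SIZE would collapse to zero; with a hypothetical 2 KB thread structure on 4 KB pages the new branch yields (262144 * 2) / 8 = 65536 instead of dividing by zero.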
*/ - max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8; + if (THREAD_SIZE >= PAGE_SIZE) + max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8; + else + max_threads = (mempages * (PAGE_SIZE/THREAD_SIZE)) / 8; /* * we need to allow at least 20 threads to boot a system */ @@ -857,6 +862,7 @@ struct task_struct *copy_process(unsigne p->lock_depth = -1; /* -1 = no lock */ p->start_time = get_jiffies_64(); p->security = NULL; + p->io_wait = NULL; retval = -ENOMEM; if ((retval = security_task_alloc(p))) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/ksyms.c 900-mjb2/kernel/ksyms.c --- 000-virgin/kernel/ksyms.c Fri May 30 19:02:24 2003 +++ 900-mjb2/kernel/ksyms.c Wed Jun 11 22:47:03 2003 @@ -456,7 +456,12 @@ EXPORT_SYMBOL(sleep_on); EXPORT_SYMBOL(sleep_on_timeout); EXPORT_SYMBOL(interruptible_sleep_on); EXPORT_SYMBOL(interruptible_sleep_on_timeout); +#ifdef CONFIG_KGDB_THREAD +EXPORT_SYMBOL(kern_schedule); +EXPORT_SYMBOL(do_schedule); +#else EXPORT_SYMBOL(schedule); +#endif #ifdef CONFIG_PREEMPT EXPORT_SYMBOL(preempt_schedule); #endif @@ -599,6 +604,16 @@ EXPORT_SYMBOL(find_task_by_pid); EXPORT_SYMBOL(next_thread); #if defined(CONFIG_SMP) && defined(__GENERIC_PER_CPU) EXPORT_SYMBOL(__per_cpu_offset); +#endif + +#if defined(CONFIG_LOCKMETER) +EXPORT_SYMBOL(_metered_spin_lock); +EXPORT_SYMBOL(_metered_spin_unlock); +EXPORT_SYMBOL(_metered_spin_trylock); +EXPORT_SYMBOL(_metered_read_lock); +EXPORT_SYMBOL(_metered_read_unlock); +EXPORT_SYMBOL(_metered_write_lock); +EXPORT_SYMBOL(_metered_write_unlock); #endif /* debug */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/lockmeter.c 900-mjb2/kernel/lockmeter.c --- 000-virgin/kernel/lockmeter.c Wed Dec 31 16:00:00 1969 +++ 900-mjb2/kernel/lockmeter.c Wed Jun 11 22:47:01 2003 @@ -0,0 +1,1088 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.c by Jack Steiner (steiner@sgi.com) + * + * Modified by Ray Bryant (raybry@us.ibm.com) + * Changes Copyright (C) 2000 IBM, Inc. + * Added save of index in spinlock_t to improve efficiency + * of "hold" time reporting for spinlocks + * Added support for hold time statistics for read and write + * locks. + */ + +#ifdef __KERNEL__ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#else +#define __SMP__ +#include +#include +#include +#include "bitops.h" +#include "user_scaffold.h" +#include +#include +#include "newlockmeter.h" +#endif + +#ifdef __KERNEL__ +#define ASSERT(cond) +#define bzero(loc,size) memset(loc,0,size) +#endif + +/*<---------------------------------------------------*/ +/* lockmeter.c */ +/*>---------------------------------------------------*/ + +#ifdef __KERNEL__ +static lstat_control_t lstat_control __cacheline_aligned = {LSTAT_OFF, SPIN_LOCK_UNLOCKED, SPIN_LOCK_UNLOCKED, 19*0, NR_CPUS*0, 0, NR_CPUS*0}; +#else +lstat_control_t lstat_control = {LSTAT_OFF, SPIN_LOCK_UNLOCKED, SPIN_LOCK_UNLOCKED, 19*0, NR_CPUS*0, 0, NR_CPUS*0}; +#endif + +int smp_num_cpus=NR_CPUS; + +#undef BUG +#define BUG() + +static ushort lstat_make_dir_entry(void *, void *); + +/* + * lstat_lookup + * + * Given a RA, locate the directory entry for the lock. 
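A note on the lookup below: the "RA" is the caller's return address, captured at the locking call site. DIRHASH(caller_ra) selects a bucket in lstat_control.hashtab, the next_stat_index fields chain colliding directory entries together, and reaching index 0 means no directory entry exists yet for this call site, so lstat_make_dir_entry() allocates one.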
+ */ +static ushort +lstat_lookup( + void *lock_ptr, + void *caller_ra) +{ + ushort index; + lstat_directory_entry_t *dirp; + + dirp = lstat_control.dir; + + index = lstat_control.hashtab[DIRHASH(caller_ra)]; + while (dirp[index].caller_ra != caller_ra) { + if (index == 0) { + return(lstat_make_dir_entry(lock_ptr, caller_ra)); + } + index = dirp[index].next_stat_index; + } + + if (dirp[index].lock_ptr != NULL && + dirp[index].lock_ptr != lock_ptr) { + dirp[index].lock_ptr = NULL; + } + + return(index); +} + + +/* + * lstat_make_dir_entry + * Called to add a new lock to the lock directory. + */ +static ushort +lstat_make_dir_entry( + void *lock_ptr, + void *caller_ra) +{ + lstat_directory_entry_t *dirp; + ushort index, hindex; + unsigned long flags; + + /* lock the table without recursively reentering this metering code */ + do { local_irq_save(flags); + _raw_spin_lock(&lstat_control.directory_lock); } while(0); + + hindex = DIRHASH(caller_ra); + index = lstat_control.hashtab[hindex]; + dirp = lstat_control.dir; + while (index && dirp[index].caller_ra != caller_ra) + index = dirp[index].next_stat_index; + + if (index == 0) { + if(lstat_control.next_free_dir_index < LSTAT_MAX_STAT_INDEX) { + index = lstat_control.next_free_dir_index++; + lstat_control.dir[index].caller_ra = caller_ra; + lstat_control.dir[index].lock_ptr = lock_ptr; + lstat_control.dir[index].next_stat_index = lstat_control.hashtab[hindex]; + lstat_control.hashtab[hindex] = index; + } else { + lstat_control.dir_overflow++; + } + } + + do { _raw_spin_unlock(&lstat_control.directory_lock); + local_irq_restore(flags);} while(0); + return(index); +} + +int +lstat_update ( + void *lock_ptr, + void *caller_ra, + int action) +{ + int index; + int cpu; + + ASSERT(action < LSTAT_ACT_MAX_VALUES); + + if (lstat_control.state == LSTAT_OFF) { + return(0); + } + + index = lstat_lookup(lock_ptr, caller_ra); + cpu = THIS_CPU_NUMBER; + (*lstat_control.counts[cpu])[index].count[action]++; + (*lstat_control.counts[cpu])[index].acquire_time = get_cycles(); + + return(index); +} + +int +lstat_update_time ( + void *lock_ptr, + void *caller_ra, + int action, + uint32_t ticks) +{ + ushort index; + int cpu; + + ASSERT(action < LSTAT_ACT_MAX_VALUES); + + if (lstat_control.state == LSTAT_OFF) { + return(0); + } + + index = lstat_lookup(lock_ptr, caller_ra); + cpu = THIS_CPU_NUMBER; + (*lstat_control.counts[cpu])[index].count[action]++; + (*lstat_control.counts[cpu])[index].cum_wait_ticks += (uint64_t)ticks; + if ((*lstat_control.counts[cpu])[index].max_wait_ticks < ticks) + (*lstat_control.counts[cpu])[index].max_wait_ticks = ticks; + + (*lstat_control.counts[cpu])[index].acquire_time = get_cycles(); + + return(index); +} + +void _metered_spin_lock(spinlock_t *lock_ptr) +{ + if (lstat_control.state == LSTAT_OFF) { + _raw_spin_lock(lock_ptr); /* do the real lock */ + PUT_INDEX(lock_ptr,0); /* clean index in case lockmetering */ + /* gets turned on before unlock */ + } else { + void *this_pc = LSTAT_RA(LSTAT_RA_SPIN); + int index; + + if (_raw_spin_trylock(lock_ptr)) { + index = lstat_update(lock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + } else { + uint32_t start_cycles = get_cycles(); + _raw_spin_lock(lock_ptr); /* do the real lock */ + index = lstat_update_time(lock_ptr, this_pc, LSTAT_ACT_SPIN, + get_cycles() - start_cycles); + } + /* save the index in the lock itself for use in spin unlock */ + PUT_INDEX(lock_ptr,index); + } +} + +int _metered_spin_trylock(spinlock_t *lock_ptr) +{ + if (lstat_control.state == LSTAT_OFF) { + return 
_raw_spin_trylock(lock_ptr); + } else { + int retval; + void *this_pc = LSTAT_RA(LSTAT_RA_SPIN); + + if ((retval = _raw_spin_trylock(lock_ptr))) { + int index = lstat_update(lock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + /* save the index in the lock itself for use in spin unlock */ + PUT_INDEX(lock_ptr,index); + } else { + lstat_update(lock_ptr, this_pc, LSTAT_ACT_REJECT); + } + + return retval; + } +} + +void _metered_spin_unlock(spinlock_t *lock_ptr) +{ + int index=-1; + + if (lstat_control.state != LSTAT_OFF) { + index = GET_INDEX(lock_ptr); + /* + * If statistics were turned off when we set the lock, + * then the index can be zero. If that is the case, + * then collect no stats on this call. + */ + if (index > 0) { + uint32_t hold_time; + int cpu = THIS_CPU_NUMBER; + hold_time = get_cycles() - (*lstat_control.counts[cpu])[index].acquire_time; + (*lstat_control.counts[cpu])[index].cum_hold_ticks += (uint64_t)hold_time; + if ((*lstat_control.counts[cpu])[index].max_hold_ticks < hold_time) + (*lstat_control.counts[cpu])[index].max_hold_ticks = hold_time; + } + } + + /* make sure we don't have a stale index value saved */ + PUT_INDEX(lock_ptr,0); + _raw_spin_unlock(lock_ptr); /* do the real unlock */ +} + +/* + * allocate the next global read lock structure and store its index + * in the rwlock at "lock_ptr". + */ +uint32_t alloc_rwlock_struct(rwlock_t *rwlock_ptr) +{ + int index; + int flags; + int cpu=THIS_CPU_NUMBER; + + /* If we've already overflowed, then do a quick exit */ + if (lstat_control.next_free_read_lock_index > LSTAT_MAX_READ_LOCK_INDEX) { + lstat_control.rwlock_overflow++; + return(0); + } + + do { local_irq_save(flags); + _raw_spin_lock(&lstat_control.directory_lock); } while(0); + + /* It is possible this changed while we were waiting for the directory_lock */ + if (lstat_control.state == LSTAT_OFF) { + index=0; + goto unlock; + } + + /* It is possible someone else got here first and set the index */ + if ((index=GET_RWINDEX(rwlock_ptr)) == 0) { + + /* we can't turn on read stats for this lock while there are readers */ + /* (this would mess up the running hold time sum at unlock time) */ + if (RWLOCK_READERS(rwlock_ptr) != 0) { + index=0; + goto unlock; + } + + /* if stats are turned on after being off, we may need to return an old */ + /* index from when the statistics were on last time. ................... 
*/ + for(index=1;index= LSTAT_MAX_READ_LOCK_INDEX) { + lstat_control.rwlock_overflow++; + index = 0; + goto unlock; + } + index = lstat_control.next_free_read_lock_index++; + + /* initialize the global read stats data structure for each cpu */ + for(cpu=0; cpu < smp_num_cpus; cpu++) { + (*lstat_control.read_lock_counts[cpu])[index].lock_ptr = rwlock_ptr; + } +put_index_and_unlock: + /* store the index for the read lock structure into the lock */ + PUT_RWINDEX(rwlock_ptr,index); + } + +unlock: + do { _raw_spin_unlock(&lstat_control.directory_lock); + local_irq_restore(flags);} while(0); + + return(index); +} + +void +_metered_read_lock(rwlock_t *rwlock_ptr) +{ + void *this_pc; + uint32_t start_cycles; + int index; + int cpu; + int flags; + int readers_before, readers_after; + uint64_t cycles64; + + if (lstat_control.state == LSTAT_OFF) { + _raw_read_lock(rwlock_ptr); + /* clean index in case lockmetering turns on before an unlock */ + PUT_RWINDEX(rwlock_ptr, 0); + return; + } + + this_pc = LSTAT_RA(LSTAT_RA_READ); + cpu = THIS_CPU_NUMBER; + index = GET_RWINDEX(rwlock_ptr); + + /* allocate the global stats entry for this lock, if needed */ + if (index==0) { + index = alloc_rwlock_struct(rwlock_ptr); + } + + readers_before = RWLOCK_READERS(rwlock_ptr); + if (_raw_read_trylock(rwlock_ptr)) { + /* + * We have decremented the lock to count a new reader, + * and have confirmed that no writer has it locked. + */ + /* update statistics if enabled */ + if (index>0) { +#ifndef __KERNEL__ + _raw_spin_lock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do { local_irq_save(flags); } while(0); +#endif + lstat_update((void *)rwlock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + /* preserve value of TSC so cum_hold_ticks and start_busy use same value */ + cycles64 = get_cycles64(); + (*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks -= cycles64; + + /* record time and cpu of start of busy period */ + /* this is not perfect (some race conditions are possible) */ + if (readers_before==0) { + (*lstat_control.read_lock_counts[cpu])[index].start_busy = cycles64; + PUT_RW_CPU(rwlock_ptr, cpu); + } + readers_after=RWLOCK_READERS(rwlock_ptr); + if (readers_after > (*lstat_control.read_lock_counts[cpu])[index].max_readers) + (*lstat_control.read_lock_counts[cpu])[index].max_readers = readers_after; +#ifndef __KERNEL__ + _raw_spin_unlock((spinlock_t*)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do {local_irq_restore(flags);} while(0); +#endif + } + + return; + } + /* If we get here, then we could not quickly grab the read lock */ + + start_cycles = get_cycles(); /* start counting the wait time */ + + /* Now spin until read_lock is successful */ + _raw_read_lock(rwlock_ptr); + + lstat_update_time((void *)rwlock_ptr, this_pc, LSTAT_ACT_SPIN, + get_cycles() - start_cycles); + + /* update statistics if they are enabled for this lock */ + if (index>0) { +#ifndef __KERNEL__ + _raw_spin_lock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do { local_irq_save(flags); } while(0); +#endif + cycles64 = get_cycles64(); + (*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks -= cycles64; + + /* this is not perfect (some race conditions are possible) */ + if (readers_before==0) { + (*lstat_control.read_lock_counts[cpu])[index].start_busy = cycles64; + PUT_RW_CPU(rwlock_ptr, cpu); + } + readers_after=RWLOCK_READERS(rwlock_ptr); + if (readers_after > (*lstat_control.read_lock_counts[cpu])[index].max_readers) + 
(*lstat_control.read_lock_counts[cpu])[index].max_readers = readers_after; + +#ifndef __KERNEL__ + _raw_spin_unlock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do {local_irq_restore(flags);} while(0); +#endif + } +} + +void _metered_read_unlock(rwlock_t *rwlock_ptr) +{ + int index; + int cpu; + int flags; + uint64_t busy_length; + uint64_t cycles64; + + if (lstat_control.state == LSTAT_OFF) { + _raw_read_unlock(rwlock_ptr); + return; + } + + index = GET_RWINDEX(rwlock_ptr); + cpu = THIS_CPU_NUMBER; + + if (index>0) { +#ifndef __KERNEL__ + _raw_spin_lock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + /* updates below are non-atomic */ + do { local_irq_save(flags); } while(0); +#endif + /* preserve value of TSC so cum_hold_ticks and busy_ticks are consistent.. */ + cycles64 = get_cycles64(); + (*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks += cycles64; + (*lstat_control.read_lock_counts[cpu])[index].read_lock_count++; + + /* once again, this is not perfect (some race conditions are possible) */ + if (RWLOCK_READERS(rwlock_ptr) == 1) { + int cpu1 = GET_RW_CPU(rwlock_ptr); + uint64_t last_start_busy = (*lstat_control.read_lock_counts[cpu1])[index].start_busy; + (*lstat_control.read_lock_counts[cpu])[index].busy_periods++; + if (cycles64 > last_start_busy) { + busy_length = cycles64 - last_start_busy; + (*lstat_control.read_lock_counts[cpu])[index].busy_ticks += busy_length; + if (busy_length > (*lstat_control.read_lock_counts[cpu])[index].max_busy) + (*lstat_control.read_lock_counts[cpu])[index].max_busy = busy_length; + } + } +#ifndef __KERNEL__ + _raw_spin_unlock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do {local_irq_restore(flags);} while(0); +#endif + } + + /* unlock the lock */ + _raw_read_unlock(rwlock_ptr); +} + +void _metered_write_lock(rwlock_t *rwlock_ptr) +{ + uint32_t start_cycles; + void *this_pc; + uint32_t spin_ticks = 0; /* in anticipation of a potential wait */ + int index; + int write_index = 0; + int cpu; + enum {writer_writer_conflict, writer_reader_conflict} why_wait = writer_writer_conflict; + + if (lstat_control.state == LSTAT_OFF) { + _raw_write_lock(rwlock_ptr); + /* clean index in case lockmetering turns on before an unlock */ + PUT_RWINDEX(rwlock_ptr, 0); + return; + } + + this_pc = LSTAT_RA(LSTAT_RA_WRITE); + cpu = THIS_CPU_NUMBER; + index = GET_RWINDEX(rwlock_ptr); + + /* allocate the global stats entry for this lock, if needed */ + if (index == 0) { + index = alloc_rwlock_struct(rwlock_ptr); + } + + if (_raw_write_trylock(rwlock_ptr)) { + /* We acquired the lock on the first try */ + write_index = lstat_update((void *)rwlock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + /* save the write_index for use in unlock if stats enabled */ + if (index > 0) + (*lstat_control.read_lock_counts[cpu])[index].write_index = write_index; + return; + } + + /* If we get here, then we could not quickly grab the write lock */ + start_cycles = get_cycles(); /* start counting the wait time */ + + why_wait = RWLOCK_READERS(rwlock_ptr) ? 
writer_reader_conflict : writer_writer_conflict; + + /* Now set the lock and wait for conflicts to disappear */ + _raw_write_lock(rwlock_ptr); + + spin_ticks = get_cycles() - start_cycles; + + /* update stats -- if enabled */ + if (index > 0) + if (spin_ticks) { + if (why_wait == writer_reader_conflict) { + /* waited due to a reader holding the lock */ + write_index = lstat_update_time((void *)rwlock_ptr, this_pc, + LSTAT_ACT_SPIN, spin_ticks); + } else { + /* waited due to another writer holding the lock */ + write_index = lstat_update_time((void *)rwlock_ptr, this_pc, + LSTAT_ACT_WW_SPIN, spin_ticks); + (*lstat_control.counts[cpu])[write_index].cum_wait_ww_ticks += spin_ticks; + if (spin_ticks > + (*lstat_control.counts[cpu])[write_index].max_wait_ww_ticks) { + (*lstat_control.counts[cpu])[write_index].max_wait_ww_ticks = spin_ticks; + } + } + + /* save the directory index for use on write_unlock */ + (*lstat_control.read_lock_counts[cpu])[index].write_index = write_index; + } + +} + +void +_metered_write_unlock(rwlock_t *rwlock_ptr) +{ + int index; + int cpu; + int write_index; + uint32_t hold_time; + + if (lstat_control.state == LSTAT_OFF) { + _raw_write_unlock(rwlock_ptr); + return; + } + + cpu = THIS_CPU_NUMBER; + index = GET_RWINDEX(rwlock_ptr); + + /* update statistics if stats enabled for this lock */ + if (index>0) { + write_index = (*lstat_control.read_lock_counts[cpu])[index].write_index; + + hold_time = get_cycles() - (*lstat_control.counts[cpu])[write_index].acquire_time; + (*lstat_control.counts[cpu])[write_index].cum_hold_ticks += (uint64_t)hold_time; + if ((*lstat_control.counts[cpu])[write_index].max_hold_ticks < hold_time) + (*lstat_control.counts[cpu])[write_index].max_hold_ticks = hold_time; + } + _raw_write_unlock(rwlock_ptr); +} + +int _metered_write_trylock(rwlock_t *rwlock_ptr) +{ + int retval; + void *this_pc = LSTAT_RA(LSTAT_RA_WRITE); + + if ((retval = _raw_write_trylock(rwlock_ptr))) { + lstat_update(rwlock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + } else { + lstat_update(rwlock_ptr, this_pc, LSTAT_ACT_REJECT); + } + + return retval; +} + +#ifdef __KERNEL__ +static void +init_control_space(void) +{ + /* Set all control space pointers to null and indices to "empty" */ + int cpu; + + /* + * Access CPU_CYCLE_FREQUENCY at the outset, which in some + * architectures may trigger a runtime calculation that uses a + * spinlock. Let's do this before lockmetering is turned on. + */ + if (CPU_CYCLE_FREQUENCY == 0) + BUG(); + + lstat_control.hashtab = NULL; + lstat_control.dir = NULL; + for (cpu=0; cpu max_len) + return actual_ret_bcount; + + copy_to_user(buffer, (void *)&req, next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + return actual_ret_bcount; + } else { + /* measurement is off but valid data present */ + /* fetch time info from lstat_control */ + req.ending_time = lstat_control.ending_time; + req.ending_cycles64 = lstat_control.ending_cycles64; + req.enabled_cycles64 = lstat_control.enabled_cycles64; + } + } else { + /* this must be a read while data active--use current time, etc */ + do_gettimeofday(&tv); + req.ending_time = tv.tv_sec; + req.ending_cycles64 = get_cycles64(); + req.enabled_cycles64 = req.ending_cycles64-req.started_cycles64 + + lstat_control.enabled_cycles64; + } + + next_ret_bcount = sizeof(lstat_user_request_t); + if ((actual_ret_bcount + next_ret_bcount) > max_len) + return actual_ret_bcount; + + copy_to_user(buffer, (void *)&req, next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + + if (!lstat_control.counts[0]) /* not initialized? 
*/ + return actual_ret_bcount; + + next_ret_bcount = sizeof(lstat_cpu_counts_t); + for (cpu = 0; cpu < smp_num_cpus; cpu++) { + if ((actual_ret_bcount + next_ret_bcount) > max_len) + return actual_ret_bcount; /* leave early */ + copy_to_user(buffer + actual_ret_bcount, lstat_control.counts[cpu], + next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + } + + next_ret_bcount = LSTAT_MAX_STAT_INDEX * sizeof(lstat_directory_entry_t); + if ( ((actual_ret_bcount + next_ret_bcount) > max_len) + || !lstat_control.dir ) + return actual_ret_bcount; /* leave early */ + + copy_to_user(buffer + actual_ret_bcount, lstat_control.dir, + next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + + next_ret_bcount = sizeof(lstat_read_lock_cpu_counts_t); + for (cpu = 0; cpu < smp_num_cpus; cpu++) { + if (actual_ret_bcount + next_ret_bcount > max_len) + return actual_ret_bcount; + copy_to_user(buffer + actual_ret_bcount, lstat_control.read_lock_counts[cpu], + next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + } + + return actual_ret_bcount; +} + +/* + * Writing to the /proc lockmeter node enables or disables metering. + * based upon the first byte of the "written" data. + * The following values are defined: + * LSTAT_ON: 1st call: allocates storage, intializes and turns on measurement + * subsequent calls just turn on measurement + * LSTAT_OFF: turns off measurement + * LSTAT_RESET: resets statistics + * LSTAT_RELEASE: releases statistics storage + * + * This allows one to accumulate statistics over several lockstat runs: + * + * lockstat on + * lockstat off + * ...repeat above as desired... + * lockstat get + * ...now start a new set of measurements... + * lockstat reset + * lockstat on + * ... + * + */ +ssize_t put_lockmeter_info(const char *buffer, size_t len) +{ + int error = 0; + int dirsize, countsize, read_lock_countsize, hashsize; + int cpu; + char put_char; + int i, read_lock_blocks, flags; + rwlock_t *lock_ptr; + struct timeval tv; + + if (len <= 0) + return -EINVAL; + + _raw_spin_lock(&lstat_control.control_lock); + + get_user(put_char, buffer); + switch (put_char) { + + case LSTAT_OFF: + if (lstat_control.state != LSTAT_OFF) { + /* + * To avoid seeing read lock hold times in an inconsisent state, + * we have to follow this protocol to turn off statistics + */ + do { local_irq_save(flags); } while(0); + /* getting this lock will stop any read lock block allocations */ + _raw_spin_lock(&lstat_control.directory_lock); + /* keep any more read lock blocks from being allocated */ + lstat_control.state = LSTAT_OFF; + /* record how may read lock blocks there are */ + read_lock_blocks = lstat_control.next_free_read_lock_index; + _raw_spin_unlock(&lstat_control.directory_lock); + /* now go through the list of read locks */ + cpu = THIS_CPU_NUMBER; + for(i=1;i #include -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) #else #define cpu_to_node_mask(cpu) (cpu_online_map) @@ -57,6 +57,11 @@ #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) +/* the FIXED_1 gunk is so running averages don't vanish prematurely */ +#define RAVG_WEIGHT 128 +#define RAVG_FACTOR (RAVG_WEIGHT*FIXED_1) +#define RUNNING_AVG(x,y) (((RAVG_WEIGHT-1)*(x)+RAVG_FACTOR*(y))/RAVG_WEIGHT) + /* * These are the 'tuning knobs' of the scheduler: * @@ -64,16 +69,29 @@ * maximum timeslice is 200 msecs. Timeslices get refilled after * they expire. 
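The hunk below demotes these compile-time constants to ordinary integer variables. The sched_table added to kernel/sysctl.c later in this patch exposes them under /proc/sys/sched through proc_dointvec_minmax, so they can be inspected and adjusted on a running system, e.g. (illustrative values only):

    cat /proc/sys/sched/max_timeslice
    echo 20 > /proc/sys/sched/min_timeslice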
*/ -#define MIN_TIMESLICE ( 10 * HZ / 1000) -#define MAX_TIMESLICE (200 * HZ / 1000) -#define CHILD_PENALTY 50 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (10*HZ) -#define STARVATION_LIMIT (10*HZ) -#define NODE_THRESHOLD 125 +int min_timeslice = (10 * HZ) / 1000; +int max_timeslice = (200 * HZ) / 1000; +int child_penalty = 50; +int parent_penalty = 100; +int exit_weight = 3; +int prio_bonus_ratio = 25; +int interactive_delta = 2; +int max_sleep_avg = 10 * HZ; +int starvation_limit = 10 * HZ; +int node_threshold = 125; + +#define MIN_TIMESLICE (min_timeslice) +#define MAX_TIMESLICE (max_timeslice) +#define CHILD_PENALTY (child_penalty) +#define PARENT_PENALTY (parent_penalty) +#define EXIT_WEIGHT (exit_weight) +#define PRIO_BONUS_RATIO (prio_bonus_ratio) +#define INTERACTIVE_DELTA (interactive_delta) +#define MAX_SLEEP_AVG (max_sleep_avg) +#define STARVATION_LIMIT (starvation_limit) +#define NODE_THRESHOLD (node_threshold) + +#define TIMESLICE_GRANULARITY (HZ/20 ?: 1) /* * If a task is 'interactive' then we reinsert it in the active @@ -161,7 +179,7 @@ struct runqueue { struct mm_struct *prev_mm; prio_array_t *active, *expired, arrays[2]; int prev_cpu_load[NR_CPUS]; -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED atomic_t *node_nr_running; int prev_node_load[MAX_NUMNODES]; #endif @@ -169,6 +187,8 @@ struct runqueue { struct list_head migration_queue; atomic_t nr_iowait; + + struct sched_info info; } ____cacheline_aligned; static struct runqueue runqueues[NR_CPUS] __cacheline_aligned; @@ -188,7 +208,7 @@ static struct runqueue runqueues[NR_CPUS # define task_running(rq, p) ((rq)->curr == (p)) #endif -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED /* * Keep track of running tasks. @@ -222,13 +242,186 @@ __init void node_nr_running_init(void) cpu_rq(i)->node_nr_running = &node_nr_running[cpu_to_node(i)]; } -#else /* !CONFIG_NUMA */ +#else /* !CONFIG_NUMA_SCHED */ # define nr_running_init(rq) do { } while (0) # define nr_running_inc(rq) do { (rq)->nr_running++; } while (0) # define nr_running_dec(rq) do { (rq)->nr_running--; } while (0) -#endif /* CONFIG_NUMA */ +#endif /* CONFIG_NUMA_SCHED */ + + +struct schedstat { + /* sys_sched_yield stats */ + unsigned long yld_exp_empty; + unsigned long yld_act_empty; + unsigned long yld_both_empty; + unsigned long yld_cnt; + + /* schedule stats */ + unsigned long sched_noswitch; + unsigned long sched_switch; + unsigned long sched_cnt; + + /* load_balance stats */ + unsigned long lb_imbalance; + unsigned long lb_idle; + unsigned long lb_busy; + unsigned long lb_resched; + unsigned long lb_cnt; + unsigned long lb_nobusy; + unsigned long lb_bnode; + + /* pull_task stats */ + unsigned long pt_gained; + unsigned long pt_lost; + unsigned long pt_node_gained; + unsigned long pt_node_lost; + + /* balance_node stats */ + unsigned long bn_cnt; + unsigned long bn_idle; +} ____cacheline_aligned; + +static inline void sched_info_arrive(task_t *t) +{ + unsigned long now = jiffies; + unsigned long diff = now - t->sched_info.last_arrival; + struct runqueue *rq = task_rq(t); + + t->sched_info.inter_arrival_time = + RUNNING_AVG(t->sched_info.inter_arrival_time, diff); + t->sched_info.last_arrival = now; + + if (!rq) + return; + diff = now - rq->info.last_arrival; + rq->info.inter_arrival_time = + RUNNING_AVG(rq->info.inter_arrival_time, diff); + rq->info.last_arrival = now; +} + +/* is this ever used? 
*/ +static inline void sched_info_depart(task_t *t) +{ + struct runqueue *rq = task_rq(t); + unsigned long diff, now = jiffies; + + diff = now - t->sched_info.began_service; + t->sched_info.service_time = + RUNNING_AVG(t->sched_info.service_time, diff); + + if (!rq) + return; + diff = now - rq->info.began_service; + rq->info.service_time = + RUNNING_AVG(rq->info.service_time, diff); +} + +static inline void sched_info_switch(task_t *prev, task_t *next) +{ + struct runqueue *rq = task_rq(prev); + unsigned long diff, now = jiffies; + + /* prev now departs the cpu */ + sched_info_depart(prev); + + /* only for involuntary context switches */ + if (prev->state == TASK_RUNNING) + sched_info_arrive(prev); + + diff = now - next->sched_info.last_arrival; + next->sched_info.response_time = + RUNNING_AVG(next->sched_info.response_time, diff); + next->sched_info.began_service = now; + + if (!rq) + return; + /* yes, reusing next's service time is valid */ + rq->info.response_time = + RUNNING_AVG(rq->info.response_time, diff); + rq->info.began_service = now; + + if (prev->state != TASK_RUNNING) + return; + /* if prev arrived subtract rq's last arrival from its arrival */ + diff = now - rq->info.last_arrival; + rq->info.inter_arrival_time = + RUNNING_AVG(rq->info.inter_arrival_time, diff); + rq->info.last_arrival = now; +} + +/* + * bump this up when changing the output format or the meaning of an existing + * format, so that tools can adapt (or abort) + */ +#define SCHEDSTAT_VERSION 2 + +struct schedstat schedstats[NR_CPUS]; + +/* + * This could conceivably exceed a page's worth of output on machines with + * large number of cpus, where large == about 4096/100 or 40ish. Start + * worrying when we pass 32, probably. Then this has to stop being a + * "simple" entry in proc/proc_misc.c and needs to be an actual seq_file. 
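The read_proc handler below emits one text line per online cpu plus a grand total, with fields in the order of its sprintf() calls. A made-up single-cpu sample (figures invented purely for illustration) would look like:

    version 2
    cpu0 0 4 2 6 310 45 355 12 30 1 43 87 0 0 9 9 1 1 4 4
    totals 0 4 2 6 310 45 355 12 30 1 43 87 0 0 9 9 1 1 4 4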
+ */ +int schedstats_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct schedstat sums; + int i, len; + + memset(&sums, 0, sizeof(sums)); + len = sprintf(page, "version %d\n", SCHEDSTAT_VERSION); + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) continue; + sums.yld_exp_empty += schedstats[i].yld_exp_empty; + sums.yld_act_empty += schedstats[i].yld_act_empty; + sums.yld_both_empty += schedstats[i].yld_both_empty; + sums.yld_cnt += schedstats[i].yld_cnt; + sums.sched_noswitch += schedstats[i].sched_noswitch; + sums.sched_switch += schedstats[i].sched_switch; + sums.sched_cnt += schedstats[i].sched_cnt; + sums.lb_idle += schedstats[i].lb_idle; + sums.lb_busy += schedstats[i].lb_busy; + sums.lb_resched += schedstats[i].lb_resched; + sums.lb_cnt += schedstats[i].lb_cnt; + sums.lb_imbalance += schedstats[i].lb_imbalance; + sums.lb_nobusy += schedstats[i].lb_nobusy; + sums.lb_bnode += schedstats[i].lb_bnode; + sums.pt_node_gained += schedstats[i].pt_node_gained; + sums.pt_node_lost += schedstats[i].pt_node_lost; + sums.pt_gained += schedstats[i].pt_gained; + sums.pt_lost += schedstats[i].pt_lost; + sums.bn_cnt += schedstats[i].bn_cnt; + sums.bn_idle += schedstats[i].bn_idle; + len += sprintf(page + len, + "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu " + "%lu %lu %lu %lu %lu %lu %lu %lu\n", + i, schedstats[i].yld_both_empty, + schedstats[i].yld_act_empty, schedstats[i].yld_exp_empty, + schedstats[i].yld_cnt, schedstats[i].sched_noswitch, + schedstats[i].sched_switch, schedstats[i].sched_cnt, + schedstats[i].lb_idle, schedstats[i].lb_busy, + schedstats[i].lb_resched, + schedstats[i].lb_cnt, schedstats[i].lb_imbalance, + schedstats[i].lb_nobusy, schedstats[i].lb_bnode, + schedstats[i].pt_gained, schedstats[i].pt_lost, + schedstats[i].pt_node_gained, schedstats[i].pt_node_lost, + schedstats[i].bn_cnt, schedstats[i].bn_idle); + } + len += sprintf(page + len, + "totals %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu " + "%lu %lu %lu %lu %lu %lu %lu\n", + sums.yld_both_empty, sums.yld_act_empty, sums.yld_exp_empty, + sums.yld_cnt, sums.sched_noswitch, sums.sched_switch, + sums.sched_cnt, sums.lb_idle, sums.lb_busy, sums.lb_resched, + sums.lb_cnt, sums.lb_imbalance, sums.lb_nobusy, sums.lb_bnode, + sums.pt_gained, sums.pt_lost, sums.pt_node_gained, + sums.pt_node_lost, sums.bn_cnt, sums.bn_idle); + + return len; +} /* * task_rq_lock - lock the runqueue a given task resides on and disable @@ -487,15 +680,18 @@ repeat_lock_task: (p->cpus_allowed & (1UL << smp_processor_id())))) { set_task_cpu(p, smp_processor_id()); + sched_info_arrive(p); task_rq_unlock(rq, &flags); goto repeat_lock_task; } if (old_state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; - if (sync) + if (sync) { + sched_info_arrive(p); __activate_task(p, rq); - else { + } else { activate_task(p, rq); + sched_info_arrive(p); if (p->prio < rq->curr->prio) resched_task(rq->curr); } @@ -549,6 +745,7 @@ void wake_up_forked_process(task_t * p) p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100; p->prio = effective_prio(p); set_task_cpu(p, smp_processor_id()); + sched_info_arrive(p); if (unlikely(!current->array)) __activate_task(p, rq); @@ -656,7 +853,6 @@ static inline task_t * context_switch(ru return prev; } - /* * nr_running, nr_uninterruptible and nr_context_switches: * @@ -674,6 +870,11 @@ unsigned long nr_running(void) return sum; } +unsigned long nr_running_cpu(int cpu) +{ + return cpu_rq(cpu)->nr_running; +} + unsigned long nr_uninterruptible(void) { unsigned long i, sum = 
0; @@ -710,6 +911,11 @@ unsigned long nr_iowait(void) return sum; } +void cpu_sched_info(struct sched_info *info, int cpu) +{ + memcpy(info, &cpu_rq(cpu)->info, sizeof(struct sched_info)); +} + /* * double_rq_lock - safely lock two runqueues * @@ -744,7 +950,7 @@ static inline void double_rq_unlock(runq spin_unlock(&rq2->lock); } -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED /* * If dest_cpu is allowed for this process, migrate the task to it. * This is accomplished by forcing the cpu_allowed mask to only @@ -771,30 +977,64 @@ static void sched_migrate_task(task_t *p */ static int sched_best_cpu(struct task_struct *p) { - int i, minload, load, best_cpu, node = 0; + int cpu, node, minload, load, best_cpu, best_node; + int this_cpu, this_node, this_node_load; unsigned long cpumask; - best_cpu = task_cpu(p); - if (cpu_rq(best_cpu)->nr_running <= 2) - return best_cpu; + this_cpu = best_cpu = task_cpu(p); + if (cpu_rq(this_cpu)->nr_running <= 2) + return this_cpu; + this_node = best_node = cpu_to_node(this_cpu); + + /* + * First look for any node-local idle queue and use that. + * This improves performance under light loads (mbligh). + * In case this node turns out to be the lightest node, store the best + * cpu that we find, so we don't go sniffing the same runqueues again. + */ + minload = 10000000; + cpumask = node_to_cpumask(this_node); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (!(cpumask & (1UL << cpu))) + continue; + load = cpu_rq(cpu)->nr_running; + if (load == 0) + return cpu; + if (load < minload) { + minload = load; + best_cpu = cpu; + } + } + /* Now find the lightest loaded node, and put it in best_node */ minload = 10000000; - for (i = 0; i < numnodes; i++) { - load = atomic_read(&node_nr_running[i]); + this_node_load = atomic_read(&node_nr_running[this_node]); + for (node = 0; node < numnodes; node++) { + if (node == this_node) + load = this_node_load; + else + load = atomic_read(&node_nr_running[node]); if (load < minload) { minload = load; - node = i; + best_node = node; } } + /* If we chose this node, we already did the legwork earlier */ + if (best_node == this_node) + return best_cpu; + + /* Now find the lightest loaded cpu on best_node, and use that */ minload = 10000000; - cpumask = node_to_cpumask(node); - for (i = 0; i < NR_CPUS; ++i) { - if (!(cpumask & (1UL << i))) + best_cpu = this_cpu; + cpumask = node_to_cpumask(best_node); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (!(cpumask & (1UL << cpu))) continue; - if (cpu_rq(i)->nr_running < minload) { - best_cpu = i; - minload = cpu_rq(i)->nr_running; + load = cpu_rq(cpu)->nr_running; + if (load < minload) { + minload = load; + best_cpu = cpu; } } return best_cpu; @@ -838,7 +1078,10 @@ static int find_busiest_node(int this_no return node; } -#endif /* CONFIG_NUMA */ +#endif /* CONFIG_NUMA_SCHED */ + +int idle_node_rebalance_ratio = 10; +int busy_node_rebalance_ratio = 2; #ifdef CONFIG_SMP @@ -951,6 +1194,12 @@ out: */ static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu) { + if (cpu_to_node(this_cpu) != cpu_to_node(src_rq - runqueues)) { + schedstats[this_cpu].pt_node_gained++; + schedstats[src_rq - runqueues].pt_node_lost++; + } + schedstats[this_cpu].pt_gained++; + schedstats[src_rq - runqueues].pt_lost++; dequeue_task(p, src_array); nr_running_dec(src_rq); set_task_cpu(p, this_cpu); @@ -985,10 +1234,14 @@ static void load_balance(runqueue_t *thi struct list_head *head, *curr; task_t *tmp; + schedstats[this_cpu].lb_cnt++; busiest = 
find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask); - if (!busiest) + if (!busiest) { + schedstats[this_cpu].lb_nobusy++; goto out; + } + schedstats[this_cpu].lb_imbalance += imbalance; /* * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to @@ -1067,18 +1320,22 @@ out: */ #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) #define BUSY_REBALANCE_TICK (HZ/5 ?: 1) -#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5) -#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2) +#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * idle_node_rebalance_ratio) +#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * busy_node_rebalance_ratio) -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED static void balance_node(runqueue_t *this_rq, int idle, int this_cpu) { int node = find_busiest_node(cpu_to_node(this_cpu)); unsigned long cpumask, this_cpumask = 1UL << this_cpu; + schedstats[this_cpu].bn_cnt++; + if (idle) + schedstats[this_cpu].bn_idle++; if (node >= 0) { cpumask = node_to_cpumask(node) | this_cpumask; spin_lock(&this_rq->lock); + schedstats[this_cpu].lb_bnode++; load_balance(this_rq, idle, cpumask); spin_unlock(&this_rq->lock); } @@ -1087,9 +1344,7 @@ static void balance_node(runqueue_t *thi static void rebalance_tick(runqueue_t *this_rq, int idle) { -#ifdef CONFIG_NUMA int this_cpu = smp_processor_id(); -#endif unsigned long j = jiffies; /* @@ -1101,23 +1356,25 @@ static void rebalance_tick(runqueue_t *t * are not balanced.) */ if (idle) { -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED if (!(j % IDLE_NODE_REBALANCE_TICK)) balance_node(this_rq, idle, this_cpu); #endif if (!(j % IDLE_REBALANCE_TICK)) { spin_lock(&this_rq->lock); - load_balance(this_rq, 0, cpu_to_node_mask(this_cpu)); + schedstats[this_cpu].lb_idle++; + load_balance(this_rq, idle, cpu_to_node_mask(this_cpu)); spin_unlock(&this_rq->lock); } return; } -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED if (!(j % BUSY_NODE_REBALANCE_TICK)) balance_node(this_rq, idle, this_cpu); #endif if (!(j % BUSY_REBALANCE_TICK)) { spin_lock(&this_rq->lock); + schedstats[this_cpu].lb_busy++; load_balance(this_rq, idle, cpu_to_node_mask(this_cpu)); spin_unlock(&this_rq->lock); } @@ -1225,6 +1482,27 @@ void scheduler_tick(int user_ticks, int enqueue_task(p, rq->expired); } else enqueue_task(p, rq->active); + } else { + /* + * Prevent a too long timeslice allowing a task to monopolize + * the CPU. We do this by splitting up the timeslice into + * smaller pieces. + * + * Note: this does not mean the task's timeslices expire or + * get lost in any way, they just might be preempted by + * another task of equal priority. (one with higher + * priority would have preempted this task already.) We + * requeue this task to the end of the list on this priority + * level, which is in essence a round-robin of tasks with + * equal priority. + */ + if (!(p->time_slice % TIMESLICE_GRANULARITY) && + (p->array == rq->active)) { + dequeue_task(p, rq->active); + set_tsk_need_resched(p); + p->prio = effective_prio(p); + enqueue_task(p, rq->active); + } } out_unlock: spin_unlock(&rq->lock); @@ -1237,19 +1515,24 @@ void scheduling_functions_start_here(voi /* * schedule() is the main scheduler function. */ +#ifdef CONFIG_KGDB_THREAD +asmlinkage void do_schedule(void) +#else asmlinkage void schedule(void) +#endif { task_t *prev, *next; runqueue_t *rq; prio_array_t *array; struct list_head *queue; - int idx; + int idx, mycpu = smp_processor_id(); /* * Test if we are atomic. 
Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ + schedstats[mycpu].sched_cnt++; if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) { if (unlikely(in_atomic())) { printk(KERN_ERR "bad: scheduling while atomic!\n"); @@ -1288,6 +1571,7 @@ need_resched: pick_next_task: if (unlikely(!rq->nr_running)) { #ifdef CONFIG_SMP + schedstats[mycpu].lb_resched++; load_balance(rq, 1, cpu_to_node_mask(smp_processor_id())); if (rq->nr_running) goto pick_next_task; @@ -1302,11 +1586,13 @@ pick_next_task: /* * Switch the active and expired arrays. */ + schedstats[mycpu].sched_switch++; rq->active = rq->expired; rq->expired = array; array = rq->active; rq->expired_timestamp = 0; } + schedstats[mycpu].sched_noswitch++; idx = sched_find_first_bit(array->bitmap); queue = array->queue + idx; @@ -1319,6 +1605,7 @@ switch_tasks: if (likely(prev != next)) { rq->nr_switches++; + sched_info_switch(prev, next); rq->curr = next; prepare_arch_switch(rq, next); @@ -1466,6 +1753,20 @@ void complete_all(struct completion *x) spin_unlock_irqrestore(&x->wait.lock, flags); } +#ifdef CONFIG_KGDB_THREAD +asmlinkage void user_schedule(void) +{ + current->thread.kgdbregs = NULL; + do_schedule(); +} + +asmlinkage void kern_do_schedule(struct pt_regs regs) +{ + current->thread.kgdbregs = ®s; + do_schedule(); +} +#endif + void wait_for_completion(struct completion *x) { might_sleep(); @@ -1958,6 +2259,7 @@ asmlinkage long sys_sched_yield(void) { runqueue_t *rq = this_rq_lock(); prio_array_t *array = current->array; + int mycpu = smp_processor_id(); /* * We implement yielding by moving the task into the expired @@ -1966,7 +2268,15 @@ asmlinkage long sys_sched_yield(void) * (special rule: RT tasks will just roundrobin in the active * array.) 
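The schedstat counters added below classify each call to sys_sched_yield(): yld_cnt counts every yield; for a non-RT task, yld_act_empty is bumped when the yielding task is the only one in the active array, yld_both_empty when the expired array is empty as well, and yld_exp_empty when other active tasks exist but the expired array is empty.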
*/ + schedstats[mycpu].yld_cnt++; if (likely(!rt_task(current))) { + if (current->array->nr_active == 1) { + schedstats[mycpu].yld_act_empty++; + if (!rq->expired->nr_active) + schedstats[mycpu].yld_both_empty++; + } else if (!rq->expired->nr_active) { + schedstats[mycpu].yld_exp_empty++; + } dequeue_task(current, array); enqueue_task(current, rq->expired); } else { diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/sysctl.c 900-mjb2/kernel/sysctl.c --- 000-virgin/kernel/sysctl.c Fri May 30 19:02:24 2003 +++ 900-mjb2/kernel/sysctl.c Wed Jun 11 22:42:39 2003 @@ -57,6 +57,18 @@ extern char core_pattern[]; extern int cad_pid; extern int pid_max; extern int sysctl_lower_zone_protection; +extern int min_timeslice; +extern int max_timeslice; +extern int child_penalty; +extern int parent_penalty; +extern int exit_weight; +extern int prio_bonus_ratio; +extern int interactive_delta; +extern int max_sleep_avg; +extern int starvation_limit; +extern int node_threshold; +extern int idle_node_rebalance_ratio; +extern int busy_node_rebalance_ratio; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -114,6 +126,7 @@ static struct ctl_table_header root_tabl static ctl_table kern_table[]; static ctl_table vm_table[]; +static ctl_table sched_table[]; #ifdef CONFIG_NET extern ctl_table net_table[]; #endif @@ -158,6 +171,7 @@ static ctl_table root_table[] = { {CTL_FS, "fs", NULL, 0, 0555, fs_table}, {CTL_DEBUG, "debug", NULL, 0, 0555, debug_table}, {CTL_DEV, "dev", NULL, 0, 0555, dev_table}, + {CTL_SCHED, "sched", NULL, 0, 0555, sched_table}, {0} }; @@ -362,7 +376,49 @@ static ctl_table debug_table[] = { static ctl_table dev_table[] = { {0} -}; +}; + +static ctl_table sched_table[] = { + {SCHED_MAX_TIMESLICE, "max_timeslice", &max_timeslice, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_MIN_TIMESLICE, "min_timeslice", &min_timeslice, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_CHILD_PENALTY, "child_penalty", &child_penalty, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_PARENT_PENALTY, "parent_penalty", &parent_penalty, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_EXIT_WEIGHT, "exit_weight", &exit_weight, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_PRIO_BONUS_RATIO, "prio_bonus_ratio", &prio_bonus_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_INTERACTIVE_DELTA, "interactive_delta", &interactive_delta, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_MAX_SLEEP_AVG, "max_sleep_avg", &max_sleep_avg, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_STARVATION_LIMIT, "starvation_limit", &starvation_limit, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_NODE_THRESHOLD, "node_threshold", &node_threshold, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + sysctl_intvec, NULL, &one, NULL}, + {SCHED_IDLE_NODE_REBALANCE_RATIO, "idle_node_rebalance_ratio", + &idle_node_rebalance_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_BUSY_NODE_REBALANCE_RATIO, "busy_node_rebalance_ratio", + &busy_node_rebalance_ratio, + sizeof(int), 0644, NULL, 
&proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {0} +}; extern void init_irq_proc (void); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/timer.c 900-mjb2/kernel/timer.c --- 000-virgin/kernel/timer.c Fri May 30 19:02:24 2003 +++ 900-mjb2/kernel/timer.c Wed Jun 11 22:46:33 2003 @@ -747,6 +747,8 @@ static unsigned long count_active_tasks( * Requires xtime_lock to access. */ unsigned long avenrun[3]; +unsigned long tasks_running[3]; +unsigned long cpu_tasks_running[3][NR_CPUS]; /* * calc_load - given tick count, update the avenrun load estimates. @@ -754,8 +756,9 @@ unsigned long avenrun[3]; */ static inline void calc_load(unsigned long ticks) { - unsigned long active_tasks; /* fixed-point */ + unsigned long active_tasks, running_tasks; /* fixed-point */ static int count = LOAD_FREQ; + int cpu; count -= ticks; if (count < 0) { @@ -764,6 +767,19 @@ static inline void calc_load(unsigned lo CALC_LOAD(avenrun[0], EXP_1, active_tasks); CALC_LOAD(avenrun[1], EXP_5, active_tasks); CALC_LOAD(avenrun[2], EXP_15, active_tasks); + running_tasks = nr_running() * FIXED_1; + CALC_LOAD(tasks_running[0], EXP_1, running_tasks); + CALC_LOAD(tasks_running[1], EXP_5, running_tasks); + CALC_LOAD(tasks_running[2], EXP_15, running_tasks); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (!cpu_online(cpu)) + continue; + running_tasks = nr_running_cpu(cpu) * FIXED_1; + CALC_LOAD(cpu_tasks_running[0][cpu], EXP_1, running_tasks); + CALC_LOAD(cpu_tasks_running[1][cpu], EXP_5, running_tasks); + CALC_LOAD(cpu_tasks_running[2][cpu], EXP_15, running_tasks); + } + } } diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/filemap.c 900-mjb2/mm/filemap.c --- 000-virgin/mm/filemap.c Fri May 30 19:02:24 2003 +++ 900-mjb2/mm/filemap.c Wed Jun 11 22:55:23 2003 @@ -63,6 +63,9 @@ * ->mmap_sem * ->i_shared_sem (various places) * + * ->lock_page + * ->i_shared_sem (page_convert_anon) + * * ->inode_lock * ->sb_lock (fs/fs-writeback.c) * ->mapping->page_lock (__sync_single_inode) @@ -267,19 +270,32 @@ static wait_queue_head_t *page_waitqueue return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; } -void wait_on_page_bit(struct page *page, int bit_nr) +int wait_on_page_bit_wq(struct page *page, int bit_nr, wait_queue_t *wait) { wait_queue_head_t *waitqueue = page_waitqueue(page); - DEFINE_WAIT(wait); + DEFINE_WAIT(local_wait); + if (!wait) + wait = &local_wait; + do { - prepare_to_wait(waitqueue, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(waitqueue, wait, TASK_UNINTERRUPTIBLE); if (test_bit(bit_nr, &page->flags)) { sync_page(page); + if (!is_sync_wait(wait)) + return -EIOCBRETRY; io_schedule(); } } while (test_bit(bit_nr, &page->flags)); - finish_wait(waitqueue, &wait); + finish_wait(waitqueue, wait); + + return 0; +} +EXPORT_SYMBOL(wait_on_page_bit_wq); + +void wait_on_page_bit(struct page *page, int bit_nr) +{ + wait_on_page_bit_wq(page, bit_nr, NULL); } EXPORT_SYMBOL(wait_on_page_bit); @@ -335,19 +351,31 @@ EXPORT_SYMBOL(end_page_writeback); * chances are that on the second loop, the block layer's plug list is empty, * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. 
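The _wq variants here (wait_on_page_bit_wq() above, __lock_page_wq() below) share one convention: if the caller supplies its own wait queue entry and it is an async wait (!is_sync_wait()), the waiter is left registered and -EIOCBRETRY is returned instead of calling io_schedule(); passing NULL falls back to an on-stack entry and the old blocking behaviour, which is how the unchanged wait_on_page_bit() and __lock_page() interfaces become thin wrappers. A minimal sketch of a caller (hypothetical code, not from the patch; lock_page_wq() itself is a wrapper defined outside this hunk):

	error = lock_page_wq(page, current->io_wait);
	if (error == -EIOCBRETRY)
		/* async caller: waiter stays queued, retry the iocb later */
		return error;
	/* synchronous caller (io_wait == NULL): we now hold the page lock */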
*/ -void __lock_page(struct page *page) +int __lock_page_wq(struct page *page, wait_queue_t *wait) { wait_queue_head_t *wqh = page_waitqueue(page); - DEFINE_WAIT(wait); + DEFINE_WAIT(local_wait); + if (!wait) + wait = &local_wait; + while (TestSetPageLocked(page)) { - prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wqh, wait, TASK_UNINTERRUPTIBLE); if (PageLocked(page)) { sync_page(page); + if (!is_sync_wait(wait)) + return -EIOCBRETRY; io_schedule(); } } - finish_wait(wqh, &wait); + finish_wait(wqh, wait); + return 0; +} +EXPORT_SYMBOL(__lock_page_wq); + +void __lock_page(struct page *page) +{ + __lock_page_wq(page, NULL); } EXPORT_SYMBOL(__lock_page); @@ -397,8 +425,8 @@ struct page *find_trylock_page(struct ad * * Returns zero if the page was not present. find_lock_page() may sleep. */ -struct page *find_lock_page(struct address_space *mapping, - unsigned long offset) +struct page *find_lock_page_wq(struct address_space *mapping, + unsigned long offset, wait_queue_t *wait) { struct page *page; @@ -409,7 +437,10 @@ repeat: page_cache_get(page); if (TestSetPageLocked(page)) { spin_unlock(&mapping->page_lock); - lock_page(page); + if (-EIOCBRETRY == lock_page_wq(page, wait)) { + page_cache_release(page); + return ERR_PTR(-EIOCBRETRY); + } spin_lock(&mapping->page_lock); /* Has the page been truncated while we slept? */ @@ -424,6 +455,12 @@ repeat: return page; } +struct page *find_lock_page(struct address_space *mapping, + unsigned long offset) +{ + return find_lock_page_wq(mapping, offset, NULL); +} + /** * find_or_create_page - locate or add a pagecache page * @@ -620,7 +657,13 @@ page_not_up_to_date: goto page_ok; /* Get exclusive access to the page ... */ - lock_page(page); + + if (lock_page_wq(page, current->io_wait)) { + pr_debug("queued lock page \n"); + error = -EIOCBRETRY; + /* TBD: should we hold on to the cached page ? */ + goto sync_error; + } /* Did it get unhashed before we got the lock? */ if (!page->mapping) { @@ -642,12 +685,19 @@ readpage: if (!error) { if (PageUptodate(page)) goto page_ok; - wait_on_page_locked(page); + if (wait_on_page_locked_wq(page, current->io_wait)) { + pr_debug("queued wait_on_page \n"); + error = -EIOCBRETRY; + /*TBD:should we hold on to the cached page ?*/ + goto sync_error; + } + if (PageUptodate(page)) goto page_ok; error = -EIO; } +sync_error: /* UHHUH! A synchronous read error occurred. 
Report it */ desc->error = error; page_cache_release(page); @@ -818,6 +868,10 @@ generic_file_read(struct file *filp, cha struct kiocb kiocb; ssize_t ret; + if (current->io_wait != NULL) { + printk("current->io_wait != NULL\n"); + dump_stack(); + } init_sync_kiocb(&kiocb, filp); ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos); if (-EIOCBQUEUED == ret) @@ -850,6 +904,7 @@ ssize_t generic_file_sendfile(struct fil { read_descriptor_t desc; + BUG_ON(current->io_wait != NULL); if (!count) return 0; @@ -1372,7 +1427,9 @@ __grab_cache_page(struct address_space * int err; struct page *page; repeat: - page = find_lock_page(mapping, index); + page = find_lock_page_wq(mapping, index, current->io_wait); + if (IS_ERR(page)) + return page; if (!page) { if (!*cached_page) { *cached_page = page_cache_alloc(mapping); @@ -1689,6 +1746,10 @@ generic_file_aio_write_nolock(struct kio fault_in_pages_readable(buf, bytes); page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); + if (IS_ERR(page)) { + status = PTR_ERR(page); + break; + } if (!page) { status = -ENOMEM; break; @@ -1696,6 +1757,8 @@ generic_file_aio_write_nolock(struct kio status = a_ops->prepare_write(file, page, offset, offset+bytes); if (unlikely(status)) { + if (-EIOCBRETRY == status) + pr_debug("queued prepare_write\n"); /* * prepare_write() may have instantiated a few blocks * outside i_size. Trim these off again. @@ -1736,7 +1799,11 @@ generic_file_aio_write_nolock(struct kio page_cache_release(page); if (status < 0) break; - balance_dirty_pages_ratelimited(mapping); + status = balance_dirty_pages_ratelimited(mapping); + if (status < 0) { + pr_debug("async balance_dirty_pages\n"); + break; + } cond_resched(); } while (count); *ppos = pos; @@ -1785,7 +1852,8 @@ ssize_t generic_file_aio_write(struct ki BUG_ON(iocb->ki_pos != pos); - down(&inode->i_sem); + if ((err = down_wq(&inode->i_sem, current->io_wait))) + return err; err = generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); up(&inode->i_sem); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/fremap.c 900-mjb2/mm/fremap.c --- 000-virgin/mm/fremap.c Fri May 30 19:02:24 2003 +++ 900-mjb2/mm/fremap.c Wed Jun 11 22:42:42 2003 @@ -60,10 +60,26 @@ int install_page(struct mm_struct *mm, s pgd_t *pgd; pmd_t *pmd; struct pte_chain *pte_chain; + unsigned long pgidx; pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) goto err; + + /* + * Convert this page to anon for objrmap if it's nonlinear + */ + pgidx = (addr - vma->vm_start) >> PAGE_SHIFT; + pgidx += vma->vm_pgoff; + pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; + if (!PageAnon(page) && (page->index != pgidx)) { + lock_page(page); + err = page_convert_anon(page); + unlock_page(page); + if (err < 0) + goto err_free; + } + pgd = pgd_offset(mm, addr); spin_lock(&mm->page_table_lock); @@ -85,12 +101,11 @@ int install_page(struct mm_struct *mm, s if (flush) flush_tlb_page(vma, addr); update_mmu_cache(vma, addr, *pte); - spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); - return 0; + err = 0; err_unlock: spin_unlock(&mm->page_table_lock); +err_free: pte_chain_free(pte_chain); err: return err; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/memory.c 900-mjb2/mm/memory.c --- 000-virgin/mm/memory.c Fri May 30 19:02:24 2003 +++ 900-mjb2/mm/memory.c Wed Jun 11 22:42:42 2003 @@ -102,8 +102,7 @@ static inline void free_one_pmd(struct m static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir) { - int j; - pmd_t * pmd; + pmd_t * pmd, * md, * emd; if (pgd_none(*dir)) return; @@ -114,8 
+113,21 @@ static inline void free_one_pgd(struct m } pmd = pmd_offset(dir, 0); pgd_clear(dir); - for (j = 0; j < PTRS_PER_PMD ; j++) - free_one_pmd(tlb, pmd+j); + /* + * Beware if changing the loop below. It once used int j, + * for (j = 0; j < PTRS_PER_PMD; j++) + * free_one_pmd(pmd+j); + * but some older i386 compilers (e.g. egcs-2.91.66, gcc-2.95.3) + * terminated the loop with a _signed_ address comparison + * using "jle", when configured for HIGHMEM64GB (X86_PAE). + * If also configured for 3GB of kernel virtual address space, + * if page at physical 0x3ffff000 virtual 0x7ffff000 is used as + * a pmd, when that mm exits the loop goes on to free "entries" + * found at 0x80000000 onwards. The loop below compiles instead + * to be terminated by unsigned address comparison using "jb". + */ + for (md = pmd, emd = pmd + PTRS_PER_PMD; md < emd; md++) + free_one_pmd(tlb,md); pmd_free_tlb(tlb, pmd); } @@ -1038,6 +1050,7 @@ static int do_wp_page(struct mm_struct * ++mm->rss; page_remove_rmap(old_page, page_table); break_cow(vma, new_page, address, page_table); + SetPageAnon(new_page); pte_chain = page_add_rmap(new_page, page_table, pte_chain); lru_cache_add_active(new_page); @@ -1241,6 +1254,7 @@ static int do_swap_page(struct mm_struct flush_icache_page(vma, page); set_pte(page_table, pte); + SetPageAnon(page); pte_chain = page_add_rmap(page, page_table, pte_chain); /* No need to invalidate - it was non-present before */ @@ -1306,6 +1320,7 @@ do_anonymous_page(struct mm_struct *mm, entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); lru_cache_add_active(page); mark_page_accessed(page); + SetPageAnon(page); } set_pte(page_table, entry); @@ -1365,6 +1380,10 @@ do_no_page(struct mm_struct *mm, struct if (!pte_chain) goto oom; + /* See if nopage returned an anon page */ + if (!new_page->mapping || PageSwapCache(new_page)) + SetPageAnon(new_page); + /* * Should we do an early C-O-W break? */ @@ -1377,6 +1396,7 @@ do_no_page(struct mm_struct *mm, struct copy_user_highpage(page, new_page, address); page_cache_release(new_page); lru_cache_add_active(page); + SetPageAnon(page); new_page = page; } diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/mmap.c 900-mjb2/mm/mmap.c --- 000-virgin/mm/mmap.c Fri May 30 19:02:24 2003 +++ 900-mjb2/mm/mmap.c Wed Jun 11 22:42:42 2003 @@ -377,6 +377,28 @@ static inline int is_mergeable_vma(struc return 1; } +static void move_vma_start(struct vm_area_struct *vma, unsigned long addr) +{ + spinlock_t *lock = &vma->vm_mm->page_table_lock; + struct inode *inode = NULL; + + if (vma->vm_file) { + inode = vma->vm_file->f_dentry->d_inode; + down(&inode->i_mapping->i_shared_sem); + } + spin_lock(lock); + if (inode) + __remove_shared_vm_struct(vma, inode); + /* If no vm_file, perhaps we should always keep vm_pgoff at 0?? */ + vma->vm_pgoff += (long)(addr - vma->vm_start) >> PAGE_SHIFT; + vma->vm_start = addr; + if (inode) { + __vma_link_file(vma); + up(&inode->i_mapping->i_shared_sem); + } + spin_unlock(lock); +} + /* * Return true if we can merge this (vm_flags,file,vm_pgoff,size) * in front of (at a lower virtual address and file offset than) the vma. @@ -429,8 +451,6 @@ static int vma_merge(struct mm_struct *m unsigned long end, unsigned long vm_flags, struct file *file, unsigned long pgoff) { - spinlock_t * lock = &mm->page_table_lock; - /* * We later require that vma->vm_flags == vm_flags, so this tests * vma->vm_flags & VM_SPECIAL, too. 
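The move_vma_start() helper introduced above packages what vma_merge() and split_vma() previously open-coded when a vma's start address moves: take i_mapping->i_shared_sem for file-backed vmas and the mm's page_table_lock, unlink the vma from the file's shared list, shift vm_pgoff by the same number of pages that vm_start moves, then relink and drop the locks. The two hunks below simply switch both callers over to it.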
@@ -450,6 +470,7 @@ static int vma_merge(struct mm_struct *m is_mergeable_vma(prev, file, vm_flags) && can_vma_merge_after(prev, vm_flags, file, pgoff)) { struct vm_area_struct *next; + spinlock_t *lock = &mm->page_table_lock; struct inode *inode = file ? file->f_dentry->d_inode : NULL; int need_up = 0; @@ -497,10 +518,7 @@ static int vma_merge(struct mm_struct *m pgoff, (end - addr) >> PAGE_SHIFT)) return 0; if (end == prev->vm_start) { - spin_lock(lock); - prev->vm_start = addr; - prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT; - spin_unlock(lock); + move_vma_start(prev, addr); return 1; } } @@ -1220,8 +1238,7 @@ int split_vma(struct mm_struct * mm, str if (new_below) { new->vm_end = addr; - vma->vm_start = addr; - vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT); + move_vma_start(vma, addr); } else { vma->vm_end = addr; new->vm_start = addr; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/page-writeback.c 900-mjb2/mm/page-writeback.c --- 000-virgin/mm/page-writeback.c Fri May 30 19:02:24 2003 +++ 900-mjb2/mm/page-writeback.c Wed Jun 11 22:55:22 2003 @@ -134,7 +134,7 @@ get_dirty_limits(struct page_state *ps, * If we're over `background_thresh' then pdflush is woken to perform some * writeout. */ -void balance_dirty_pages(struct address_space *mapping) +int balance_dirty_pages(struct address_space *mapping) { struct page_state ps; long nr_reclaimable; @@ -151,6 +151,7 @@ void balance_dirty_pages(struct address_ .sync_mode = WB_SYNC_NONE, .older_than_this = NULL, .nr_to_write = write_chunk, + .nonblocking = !is_sync_wait(current->io_wait) }; get_dirty_limits(&ps, &background_thresh, &dirty_thresh); @@ -177,7 +178,11 @@ void balance_dirty_pages(struct address_ if (pages_written >= write_chunk) break; /* We've done our duty */ } - blk_congestion_wait(WRITE, HZ/10); + if (-EIOCBRETRY == blk_congestion_wait_wq(WRITE, HZ/10, + current->io_wait)) { + pr_debug("async blk congestion wait\n"); + return -EIOCBRETRY; + } } if (nr_reclaimable + ps.nr_writeback <= dirty_thresh) @@ -185,6 +190,8 @@ void balance_dirty_pages(struct address_ if (!writeback_in_progress(bdi) && nr_reclaimable > background_thresh) pdflush_operation(background_writeout, 0); + + return 0; } /** @@ -200,7 +207,7 @@ void balance_dirty_pages(struct address_ * decrease the ratelimiting by a lot, to prevent individual processes from * overshooting the limit by (ratelimit_pages) each. 
*/ -void balance_dirty_pages_ratelimited(struct address_space *mapping) +int balance_dirty_pages_ratelimited(struct address_space *mapping) { static DEFINE_PER_CPU(int, ratelimits) = 0; int cpu; @@ -214,10 +221,10 @@ void balance_dirty_pages_ratelimited(str if (per_cpu(ratelimits, cpu)++ >= ratelimit) { per_cpu(ratelimits, cpu) = 0; put_cpu(); - balance_dirty_pages(mapping); - return; + return balance_dirty_pages(mapping); } put_cpu(); + return 0; } EXPORT_SYMBOL_GPL(balance_dirty_pages_ratelimited); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/page_alloc.c 900-mjb2/mm/page_alloc.c --- 000-virgin/mm/page_alloc.c Fri May 30 19:02:24 2003 +++ 900-mjb2/mm/page_alloc.c Wed Jun 11 22:47:08 2003 @@ -220,6 +220,8 @@ static inline void free_pages_check(cons bad_page(function, page); if (PageDirty(page)) ClearPageDirty(page); + if (PageAnon(page)) + ClearPageAnon(page); } /* @@ -897,7 +899,7 @@ void si_meminfo_node(struct sysinfo *val { pg_data_t *pgdat = NODE_DATA(nid); - val->totalram = pgdat->node_size; + val->totalram = pgdat->real_node_size; val->freeram = nr_free_pages_pgdat(pgdat); val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].spanned_pages; val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; @@ -1138,6 +1140,8 @@ static void __init calculate_zone_totalp if (zholes_size) for (i = 0; i < MAX_NR_ZONES; i++) realtotalpages -= zholes_size[i]; + pgdat->real_node_size = realtotalpages; + printk("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); } diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/rmap.c 900-mjb2/mm/rmap.c --- 000-virgin/mm/rmap.c Sun Apr 20 19:35:08 2003 +++ 900-mjb2/mm/rmap.c Wed Jun 11 22:42:42 2003 @@ -102,6 +102,136 @@ pte_chain_encode(struct pte_chain *pte_c **/ /** + * find_pte - Find a pte pointer given a vma and a struct page. + * @vma: the vma to search + * @page: the page to find + * + * Determine if this page is mapped in this vma. If it is, map and return + * the pte pointer associated with it. Return NULL if the page is not + * mapped in this vma for any reason. + * + * This is strictly an internal helper function for the object-based rmap + * functions. + * + * It is the caller's responsibility to unmap the pte if it is returned. + */ +static inline pte_t * +find_pte(struct vm_area_struct *vma, struct page *page, unsigned long *addr) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + unsigned long loffset; + unsigned long address; + + loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)); + address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT); + if (address < vma->vm_start || address >= vma->vm_end) + goto out; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pmd = pmd_offset(pgd, address); + if (!pmd_present(*pmd)) + goto out; + + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) + goto out_unmap; + + if (page_to_pfn(page) != pte_pfn(*pte)) + goto out_unmap; + + if (addr) + *addr = address; + + return pte; + +out_unmap: + pte_unmap(pte); +out: + return NULL; +} + +/** + * page_referenced_obj_one - referenced check for object-based rmap + * @vma: the vma to look in. + * @page: the page we're working on. + * + * Find a pte entry for a page/vma pair, then check and clear the referenced + * bit. + * + * This is strictly a helper function for page_referenced_obj.
+ */ +static int +page_referenced_obj_one(struct vm_area_struct *vma, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + pte_t *pte; + int referenced = 0; + + if (!spin_trylock(&mm->page_table_lock)) + return 1; + + pte = find_pte(vma, page, NULL); + if (pte) { + if (ptep_test_and_clear_young(pte)) + referenced++; + pte_unmap(pte); + } + + spin_unlock(&mm->page_table_lock); + return referenced; +} + +/** + * page_referenced_obj - referenced check for object-based rmap + * @page: the page we're checking references on. + * + * For an object-based mapped page, find all the places it is mapped and + * check/clear the referenced flag. This is done by following the page->mapping + * pointer, then walking the chain of vmas it holds. It returns the number + * of references it found. + * + * This function is only called from page_referenced for object-based pages. + * + * The semaphore address_space->i_shared_sem is tried. If it can't be gotten, + * assume a reference count of 1. + */ +static int +page_referenced_obj(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct vm_area_struct *vma; + int referenced = 0; + + if (!page->pte.mapcount) + return 0; + + if (!mapping) + BUG(); + + if (PageSwapCache(page)) + BUG(); + + if (down_trylock(&mapping->i_shared_sem)) + return 1; + + list_for_each_entry(vma, &mapping->i_mmap, shared) + referenced += page_referenced_obj_one(vma, page); + + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) + referenced += page_referenced_obj_one(vma, page); + + up(&mapping->i_shared_sem); + + return referenced; +} + +/** * page_referenced - test if the page was referenced * @page: the page to test * @@ -120,6 +250,10 @@ int page_referenced(struct page * page) if (TestClearPageReferenced(page)) referenced++; + if (!PageAnon(page)) { + referenced += page_referenced_obj(page); + goto out; + } if (PageDirect(page)) { pte_t *pte = rmap_ptep_map(page->pte.direct); if (ptep_test_and_clear_young(pte)) @@ -153,6 +287,7 @@ int page_referenced(struct page * page) __pte_chain_free(pc); } } +out: return referenced; } @@ -175,6 +310,21 @@ page_add_rmap(struct page *page, pte_t * pte_chain_lock(page); + /* + * If this is an object-based page, just count it. We can + * find the mappings by walking the object vma chain for that object. + */ + if (!PageAnon(page)) { + if (!page->mapping) + BUG(); + if (PageSwapCache(page)) + BUG(); + if (!page->pte.mapcount) + inc_page_state(nr_mapped); + page->pte.mapcount++; + goto out; + } + if (page->pte.direct == 0) { page->pte.direct = pte_paddr; SetPageDirect(page); @@ -231,8 +381,25 @@ void page_remove_rmap(struct page *page, pte_chain_lock(page); if (!page_mapped(page)) - goto out_unlock; /* remap_page_range() from a driver? */ + goto out_unlock; + /* + * If this is an object-based page, just uncount it. We can + * find the mappings by walking the object vma chain for that object. + */ + if (!PageAnon(page)) { + if (!page->mapping) + BUG(); + if (PageSwapCache(page)) + BUG(); + if (!page->pte.mapcount) + BUG(); + page->pte.mapcount--; + if (!page->pte.mapcount) + dec_page_state(nr_mapped); + goto out_unlock; + } + if (PageDirect(page)) { if (page->pte.direct == pte_paddr) { page->pte.direct = 0; @@ -279,6 +446,102 @@ out_unlock: } /** + * try_to_unmap_obj_one - unmap a page using the object-based rmap method + * @vma: the vma the page may be mapped in + * @page: the page to unmap + * + * Determine whether a page is mapped in a given vma and unmap it if it's found. + * + * This function is strictly a helper function for try_to_unmap_obj.
+ */ +static inline int +try_to_unmap_obj_one(struct vm_area_struct *vma, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pte_t *pte; + pte_t pteval; + int ret = SWAP_AGAIN; + + if (!spin_trylock(&mm->page_table_lock)) + return ret; + + pte = find_pte(vma, page, &address); + if (!pte) + goto out; + + if (vma->vm_flags & VM_LOCKED) { + ret = SWAP_FAIL; + goto out_unmap; + } + + flush_cache_page(vma, address); + pteval = ptep_get_and_clear(pte); + flush_tlb_page(vma, address); + + if (pte_dirty(pteval)) + set_page_dirty(page); + + if (!page->pte.mapcount) + BUG(); + + mm->rss--; + page->pte.mapcount--; + page_cache_release(page); + +out_unmap: + pte_unmap(pte); + +out: + spin_unlock(&mm->page_table_lock); + return ret; +} + +/** + * try_to_unmap_obj - unmap a page using the object-based rmap method + * @page: the page to unmap + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the address_space struct it points to. + * + * This function is only called from try_to_unmap for object-based pages. + * + * The semaphore address_space->i_shared_sem is tried. If it can't be gotten, + * return a temporary error. + */ +static int +try_to_unmap_obj(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct vm_area_struct *vma; + int ret = SWAP_AGAIN; + + if (!mapping) + BUG(); + + if (PageSwapCache(page)) + BUG(); + + if (down_trylock(&mapping->i_shared_sem)) + return ret; + + list_for_each_entry(vma, &mapping->i_mmap, shared) { + ret = try_to_unmap_obj_one(vma, page); + if (ret == SWAP_FAIL || !page->pte.mapcount) + goto out; + } + + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + ret = try_to_unmap_obj_one(vma, page); + if (ret == SWAP_FAIL || !page->pte.mapcount) + goto out; + } + +out: + up(&mapping->i_shared_sem); + return ret; +} + +/** * try_to_unmap_one - worker function for try_to_unmap * @page: page to unmap * @ptep: page table entry to unmap from page @@ -397,6 +660,15 @@ int try_to_unmap(struct page * page) if (!page->mapping) BUG(); + /* + * If it's an object-based page, use the object vma chain to find all + * the mappings. + */ + if (!PageAnon(page)) { + ret = try_to_unmap_obj(page); + goto out; + } + if (PageDirect(page)) { ret = try_to_unmap_one(page, page->pte.direct); if (ret == SWAP_SUCCESS) { @@ -452,9 +724,112 @@ int try_to_unmap(struct page * page) } } out: - if (!page_mapped(page)) + if (!page_mapped(page)) { dec_page_state(nr_mapped); + ret = SWAP_SUCCESS; + } return ret; +} + +/** + * page_convert_anon - Convert an object-based mapped page to pte_chain-based. + * @page: the page to convert + * + * Find all the mappings for an object-based page and convert them + * to 'anonymous', ie create a pte_chain and store all the pte pointers there. + * + * This function takes the address_space->i_shared_sem, sets the PageAnon flag, + * then sets the mm->page_table_lock for each vma and calls page_add_rmap. This + * means there is a period when PageAnon is set, but still has some mappings + * with no pte_chain entry. This is in fact safe, since page_remove_rmap will + * simply not find it. try_to_unmap might erroneously return success, but it + * will never be called because the page_convert_anon() caller has locked the + * page. + * + * page_referenced() may fail to scan all the appropriate pte's and may return + * an inaccurate result. This is so rare that it does not matter. 
+ */ +int page_convert_anon(struct page *page) +{ + struct address_space *mapping; + struct vm_area_struct *vma; + struct pte_chain *pte_chain = NULL; + pte_t *pte; + int err = 0; + + mapping = page->mapping; + if (mapping == NULL) + goto out; /* truncate won the lock_page() race */ + + down(&mapping->i_shared_sem); + pte_chain_lock(page); + + /* + * Has someone else done it for us before we got the lock? + * If so, pte.direct or pte.chain has replaced pte.mapcount. + */ + if (PageAnon(page)) { + pte_chain_unlock(page); + goto out_unlock; + } + + SetPageAnon(page); + if (page->pte.mapcount == 0) { + pte_chain_unlock(page); + goto out_unlock; + } + /* This is gonna get incremented by page_add_rmap */ + dec_page_state(nr_mapped); + page->pte.mapcount = 0; + + /* + * Now that the page is marked as anon, unlock it. page_add_rmap will + * lock it as necessary. + */ + pte_chain_unlock(page); + + list_for_each_entry(vma, &mapping->i_mmap, shared) { + if (!pte_chain) { + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + err = -ENOMEM; + goto out_unlock; + } + } + spin_lock(&vma->vm_mm->page_table_lock); + pte = find_pte(vma, page, NULL); + if (pte) { + /* Make sure this isn't a duplicate */ + page_remove_rmap(page, pte); + pte_chain = page_add_rmap(page, pte, pte_chain); + pte_unmap(pte); + } + spin_unlock(&vma->vm_mm->page_table_lock); + } + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + if (!pte_chain) { + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + err = -ENOMEM; + goto out_unlock; + } + } + spin_lock(&vma->vm_mm->page_table_lock); + pte = find_pte(vma, page, NULL); + if (pte) { + /* Make sure this isn't a duplicate */ + page_remove_rmap(page, pte); + pte_chain = page_add_rmap(page, pte, pte_chain); + pte_unmap(pte); + } + spin_unlock(&vma->vm_mm->page_table_lock); + } + +out_unlock: + pte_chain_free(pte_chain); + up(&mapping->i_shared_sem); +out: + return err; } /** diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/swapfile.c 900-mjb2/mm/swapfile.c --- 000-virgin/mm/swapfile.c Fri May 30 19:02:24 2003 +++ 900-mjb2/mm/swapfile.c Wed Jun 11 22:42:42 2003 @@ -385,6 +385,7 @@ unuse_pte(struct vm_area_struct *vma, un vma->vm_mm->rss++; get_page(page); set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); + SetPageAnon(page); *pte_chainp = page_add_rmap(page, dir, *pte_chainp); swap_free(entry); } diff -urpN -X /home/fletch/.diff.exclude 000-virgin/scripts/schedcapture 900-mjb2/scripts/schedcapture --- 000-virgin/scripts/schedcapture Wed Dec 31 16:00:00 1969 +++ 900-mjb2/scripts/schedcapture Wed Jun 11 22:42:39 2003 @@ -0,0 +1,6 @@ +while true +do + cat /proc/schedstat + echo + sleep 20 +done diff -urpN -X /home/fletch/.diff.exclude 000-virgin/scripts/schedstat 900-mjb2/scripts/schedstat --- 000-virgin/scripts/schedstat Wed Dec 31 16:00:00 1969 +++ 900-mjb2/scripts/schedstat Wed Jun 11 22:42:39 2003 @@ -0,0 +1,168 @@ +#!/usr/bin/perl + +$slice = 20; # seconds +while (<>) { + @curr = split; + if ($curr[0] =~ /cpu(\d)/) { + $per_cpu_curr[$1] = [ @curr ]; + $max_cpu = $1 if ($1 > $max_cpu); + next; + } + next if (/^$/); + if ($curr[0] eq "version") { + if ($curr[1] != 2) { + die "Version mismatch. Update this tool.\n"; + } + next; + } + # + # format of line in /proc/schedstat + # + # tag 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 + # + # tag is "cpuN" or "cpu". Right now, we ignore "cpuN" lines (this tool + # doesn't collate per-cpu statistics, although it would be trivial to + # do so.) 
+ # + # version == 1 + # NOTE: the active queue is considered empty if it has only one process + # in it, since obviously the process calling sched_yield is that process. + # + # First four are sched_yield statistics: + # 1) # of times both the active and the expired queue were empty + # 2) # of times just the active queue was empty + # 3) # of times just the expired queue was empty + # 4) # of times sched_yield() was called + # + # Next two are schedule() statistics: + # 5) # of times the active queue had at least one other process on it. + # 6) # of times we switched to the expired queue and reused it + # 7) # of times schedule() was called + # + # Next seven are statistics dealing with load balancing: + # 8) # of times load_balance was called at an idle tick + # 9) # of times load_balance was called at a busy tick + # 10) # of times load_balance was called from schedule() + # 11) # of times load_balance was called + # 12) sum of imbalances discovered (if any) with each call to + # load_balance + # 13) # of times load_balance was called when we did not find a + # "busiest" queue + # 14) # of times load_balance was called from balance_node() + # + # Next four are statistics dealing with pull_task(): + # 15) # of times pull_task was called at an idle tick + # 16) # of times pull_task was called at a busy tick + # 17) # of times pull_task was called from schedule() + # 18) # of times pull_task was called + # + # Next two are statistics dealing with balance_node(): + # 19) # of times balance_node was called + # 20) # of times balance_node was called at an idle tick + # + #$curr[7] = $sched_cnt; + foreach $i (1..20) { + $diff[$i] = $curr[$i] - $prev[$i]; + } + + for ($cpu = 0; $cpu <= $max_cpu; $cpu++) { + @arr_curr = @{$per_cpu_curr[$cpu]}; + @arr_prev = @{$per_cpu_prev[$cpu]}; + foreach $i (1..20) { + $arr_diff[$i] = $arr_curr[$i] - $arr_prev[$i]; + } + $per_cpu_diff[$cpu] = [ @arr_diff ]; + } + + #for ($cpu = 0; $cpu <= $max_cpu; $cpu++) { +# print "@{$per_cpu_curr[$cpu]}\n"; +# } +# print "@curr\n"; + printf "%02d:%02d:%02d--------------------------------------------------------------\n", + $tick*$slice/3600, ($tick*$slice/60)%60, ($tick*$slice)%60; + + # + # sched_yield() stats + # + printf " %7d sys_sched_yield()\n", $diff[4]; + printf " %7d(%6.2f%%) found (only) active queue empty on current cpu\n", + $diff[2]-$diff[1], $diff[4] ? (100*($diff[2]-$diff[1])/$diff[4]) : 0; + printf " %7d(%6.2f%%) found (only) expired queue empty on current cpu\n", + $diff[3], $diff[4] ? (100*$diff[3]/$diff[4]) : 0; + printf " %7d(%6.2f%%) found both queues empty on current cpu\n", + $diff[1], $diff[4] ? (100*$diff[1]/$diff[4]) : 0; + printf " %7d(%6.2f%%) found neither queue empty on current cpu\n\n", + $diff[4]-($diff[3]+$diff[2]), + $diff[4] ? 100*($diff[4]-($diff[3]+$diff[2]))/$diff[4] : 0; + + # + # schedule() stats + # + printf " %7d schedule()\n", $diff[7]; + printf " %7d(%6.2f%%) switched active and expired queues\n", + $diff[6], $diff[7] ? (100*$diff[6]/$diff[7]) : 0; + printf " %7d(%6.2f%%) used existing active queue\n\n", + $diff[5]-$diff[6], $diff[7] ?
(100*($diff[5]-$diff[6])/$diff[7]) : 0; + + # + # load_balance() stats + # + printf " %7d load_balance()\n", $diff[11]; + printf " %7d(%6.2f%%) called while idle\n", $diff[8], + 100*$diff[8]/$diff[11]; + printf " %7d(%6.2f%%) called while busy\n", $diff[9], + 100*($diff[9])/$diff[11]; + printf " %7d(%6.2f%%) called from schedule()\n", $diff[10], + 100*$diff[10]/$diff[11]; + printf " %7d(%6.2f%%) called from balance_node()\n", $diff[14], + 100*$diff[14]/$diff[11]; + printf " %7d no \"busiest\" queue found\n",$diff[13]; + if ($diff[11]-$diff[13]) { + $imbalance = $diff[12] / ($diff[11]-$diff[13]); + if ($imbalance < 10) { + printf " %7.3f average imbalance (over %d)\n", + $imbalance, $diff[11]-$diff[13]; + } elsif ($imbalance < 100) { + printf " %8.2f average imbalance (over %d)\n", + $imbalance, $diff[11]-$diff[13]; + } else { + printf " %9.1f average imbalance (over %d)\n", + $imbalance, $diff[11]-$diff[13]; + } + } + else { + printf " no imbalances\n"; + } + + # + # pull_task() stats + # + print "\n"; + printf " %7d pull_task()\n", $diff[15]; + for ($cpu = 0; $cpu <= $max_cpu; $cpu++) { + @arr = @{$per_cpu_diff[$cpu]}; + if ($arr[15] || $arr[16]) { + printf " %7d/%-7d cpu %d lost/gained task to/from another cpu\n", + $arr[15], $arr[16], $cpu; + } + if ($arr[17] || $arr[18]) { + printf " %7d/%-7d cpu %d lost/gained task to/from another node\n", + $arr[17], $arr[18], $cpu; + } + } + print "\n"; + + # + # balance_node() stats + # + printf " %7d balance_node()\n", $diff[19]; + printf " %7d(%6.2f%%) called while idle\n", $diff[20], + $diff[19] ? 100*$diff[20]/$diff[19] : 0; + printf " %7d(%6.2f%%) called while busy\n", $diff[19] - $diff[20], + $diff[19] ? 100*(($diff[19]-$diff[20]))/$diff[19] : 0; + + printf("\n"); + @prev = @curr; + @per_cpu_prev = @per_cpu_curr; + $tick++; +}
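scripts/schedcapture above simply snapshots /proc/schedstat every 20 seconds, and scripts/schedstat turns consecutive snapshots into per-interval deltas and percentages, so the usual workflow is to save schedcapture output to a file and then run schedstat over that file. As a rough standalone illustration of the same delta computation (not part of the patch), the C sketch below samples two of the counters directly; it assumes the aggregate "cpu" line and the 20-field layout described in the script's comments, with field 4 taken as the sched_yield() count and field 7 as the schedule() count.

	/*
	 * Standalone sketch: read the aggregate "cpu" line of /proc/schedstat
	 * twice, 20 seconds apart, and print the change in two counters.
	 * Field meanings follow the comment block in scripts/schedstat.
	 */
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	#define NFIELDS 20

	static int read_cpu_line(unsigned long long f[NFIELDS + 1])
	{
		FILE *fp = fopen("/proc/schedstat", "r");
		char line[512];
		int n = -1;

		if (!fp)
			return -1;
		while (fgets(line, sizeof(line), fp)) {
			if (strncmp(line, "cpu ", 4) != 0)
				continue;	/* skip version, timestamp, cpuN lines */
			n = sscanf(line + 4,
				   "%llu %llu %llu %llu %llu %llu %llu %llu %llu %llu "
				   "%llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
				   &f[1], &f[2], &f[3], &f[4], &f[5], &f[6], &f[7],
				   &f[8], &f[9], &f[10], &f[11], &f[12], &f[13],
				   &f[14], &f[15], &f[16], &f[17], &f[18], &f[19],
				   &f[20]);
			break;
		}
		fclose(fp);
		return n == NFIELDS ? 0 : -1;
	}

	int main(void)
	{
		unsigned long long a[NFIELDS + 1], b[NFIELDS + 1];

		if (read_cpu_line(a))
			return 1;
		sleep(20);		/* same slice as scripts/schedcapture */
		if (read_cpu_line(b))
			return 1;
		printf("sched_yield() calls over interval: %llu\n", b[4] - a[4]);
		printf("schedule() calls over interval:    %llu\n", b[7] - a[7]);
		return 0;
	}

Unlike the perl script, this sketch ignores the per-cpu "cpuN" lines and does no percentage breakdown; it only shows where the raw numbers come from.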