diff -urpN -X /home/fletch/.diff.exclude 000-virgin/Documentation/filesystems/proc.txt 900-mjb1/Documentation/filesystems/proc.txt --- 000-virgin/Documentation/filesystems/proc.txt Sun Apr 20 19:34:55 2003 +++ 900-mjb1/Documentation/filesystems/proc.txt Fri May 30 19:05:11 2003 @@ -37,6 +37,7 @@ Table of Contents 2.8 /proc/sys/net/ipv4 - IPV4 settings 2.9 Appletalk 2.10 IPX + 2.11 /proc/sys/sched - scheduler tunables ------------------------------------------------------------------------------ Preface @@ -1749,6 +1750,104 @@ IPX. The /proc/net/ipx_route table holds a list of IPX routes. For each route it gives the destination network, the router node (or Directly) and the network address of the router (or Connected) for internal networks. + +2.11 /proc/sys/sched - scheduler tunables +----------------------------------------- + +Useful knobs for tuning the scheduler live in /proc/sys/sched. + +child_penalty +------------- + +Percentage of the parent's sleep_avg that children inherit. sleep_avg is +a running average of the time a process spends sleeping. Tasks with high +sleep_avg values are considered interactive and given a higher dynamic +priority and a larger timeslice. You typically want this set to some value +just under 100. + +exit_weight +----------- + +When a CPU hog task exits, its parent's sleep_avg is reduced by a factor of +exit_weight against the exiting task's sleep_avg. + +interactive_delta +----------------- + +If a task is "interactive" it is reinserted into the active array after it +has expired its timeslice, instead of being inserted into the expired array. +How "interactive" a task must be in order to be deemed interactive is a +function of its nice value. This interactive limit is scaled linearly by nice +value and is offset by the interactive_delta. + +max_sleep_avg +------------- + +max_sleep_avg is the largest value (in ms) stored for a task's running sleep +average. The larger this value, the longer a task needs to sleep to be +considered interactive (maximum interactive bonus is a function of +max_sleep_avg). + +max_timeslice +------------- + +Maximum timeslice, in milliseconds. This is the value given to tasks of the +highest dynamic priority. + +min_timeslice +------------- + +Minimum timeslice, in milliseconds. This is the value given to tasks of the +lowest dynamic priority. Every task gets at least this slice of the processor +per array switch. + +parent_penalty +-------------- + +Percentage of the parent's sleep_avg that it retains across a fork(). +sleep_avg is a running average of the time a process spends sleeping. Tasks +with high sleep_avg values are considered interactive and given a higher +dynamic priority and a larger timeslice. Normally, this value is 100 and thus +tasks retain their sleep_avg on fork. If you want to punish interactive +tasks for forking, set this below 100. + +prio_bonus_ratio +---------------- + +Middle percentage of the priority range that tasks can receive as a dynamic +priority. The default value of 25% ensures that nice values at the +extremes are still enforced. For example, nice +19 interactive tasks will +never be able to preempt a nice 0 CPU hog. Setting this higher will increase +the size of the priority range the tasks can receive as a bonus. Setting +this lower will decrease this range, making the interactivity bonus less +apparent and user nice values more applicable. + +starvation_limit +---------------- + +Sufficiently interactive tasks are reinserted into the active array when they +run out of timeslice.
Normally, tasks are inserted into the expired array. +Reinserting interactive tasks into the active array allows them to remain +runnable, which is important to interactive performance. This could starve +expired tasks, however, since the interactive task could prevent the array +switch. To prevent starving the tasks on the expired array for too long, the +starvation_limit is the longest time (in ms) we will let the expired array starve +at the expense of reinserting interactive tasks back into the active array. Higher +values here give more preference to running interactive tasks, at the expense +of expired tasks. Lower values provide fairer scheduling behavior, at the +expense of interactivity. + +idle_node_rebalance_ratio +------------------------- + +On NUMA machines, we normally rebalance within nodes, but we also rebalance +globally every N idle rebalance ticks, where N = idle_node_rebalance_ratio. + +busy_node_rebalance_ratio +------------------------- + +On NUMA machines, we normally rebalance within nodes, but we also rebalance +globally every N busy rebalance ticks, where N = busy_node_rebalance_ratio. ------------------------------------------------------------------------------ Summary diff -urpN -X /home/fletch/.diff.exclude 000-virgin/Documentation/i386/gdb-serial.txt 900-mjb1/Documentation/i386/gdb-serial.txt --- 000-virgin/Documentation/i386/gdb-serial.txt Wed Dec 31 16:00:00 1969 +++ 900-mjb1/Documentation/i386/gdb-serial.txt Fri May 30 19:13:53 2003 @@ -0,0 +1,386 @@ +Version +======= + +This version of the gdbstub package was developed and tested on +kernel version 2.3.48. It will not install on a 2.2 kernel. It may +not work on earlier versions of 2.3 kernels. It is possible that +it will continue to work on later versions of 2.3 and then +versions of 2.4 (I hope). + + +Debugging Setup +=============== + +Designate one machine as the "development" machine. This is the +machine on which you run your compiles and which has your source +code for the kernel. Designate a second machine as the "target" +machine. This is the machine that will run your experimental +kernel. + +The two machines will be connected together via a serial line out +one or the other of the COM ports of the PC. You will need a modem +eliminator and the appropriate cables. + +On the DEVELOPMENT machine you need to apply the patch for the gdb +hooks. You have probably already done that if you are reading this +file. + +On your DEVELOPMENT machine, go to your kernel source directory and +do "make menuconfig". Go down to the kernel hacking menu item and +open it up. Enable the kernel gdb stub code by selecting that item. + +Save and exit the menuconfig program. Then do "make clean" and +"make bzImage" (or whatever target you want to make). This gets +the kernel compiled with the "-g" option set -- necessary for +debugging. + +You have just built the kernel on your DEVELOPMENT machine that you +intend to run on your TARGET machine. + +To install this new kernel, use the following installation procedure. +Remember, you are on the DEVELOPMENT machine patching the kernel source +for the kernel that you intend to run on the TARGET machine. + +Copy this kernel to your target machine using your usual procedures. +I usually arrange to copy development:/usr/src/linux/arch/i386/boot/zImage +to /vmlinuz on the TARGET machine via a LAN based NFS access. That is, +I run the cp command on the target and copy from the development machine +via the LAN.
Run Lilo on the new kernel on the target machine so that it +will boot! Then boot the kernel on the target machine. + +There is a utility program named "gdbstart" in the +development:/usr/src/linux/arch/i386/kernel directory. +You should copy this program over to your target machine, probably into +/sbin. This utility program is run on the target machine to +activate the kernel hooks for the debugger. It is invoked as follows: + + gdbstart [-s speed] [-t tty-dev] + defaults: /dev/ttyS0 with speed unmodified by gdbstart + +Don't run the program just yet. We'll get to that in a bit. + +Decide which tty port you want the machines to communicate over, then +cable them up back-to-back using the null modem. COM1 is /dev/ttyS0 +and COM2 is /dev/ttyS1. + +On the DEVELOPMENT machine, create a file called .gdbinit in the +directory /usr/src/linux. An example .gdbinit file looks like this: + +define rmt +set remotebaud 38400 +target remote /dev/ttyS0 +end + +Assuming that you added my gdbinit stuff to your .gdbinit, edit .gdbinit +and find the section that looks like this: + + define rmt + set remotebaud 38400 + target remote /dev/ttyS0 + end + +Change the "target" definition so that it specifies the tty port that +you intend to use. Change the "remotebaud" definition to match the +data rate that you are going to use for the com line. + +On the TARGET machine I find it helpful to create a shell script file +named "debug" in the root home directory with the following contents: + + gdbstart -s 38400 -t /dev/ttyS0 <<EOF + EOF + +This runs the gdbstart program and gives it the carriage return that +it prompts for. This sets the data rate from the target machine's side. + +You are now ready to try it out. + +On your TARGET machine, freshly rebooted with your gdbstub-equipped +kernel, type "debug" in the root home directory. The system will appear +to hang with some messages on the screen from the debug stub. What +it is doing is waiting for contact from the development machine. + +On your DEVELOPMENT machine, cd /usr/src/linux and enter "gdb vmlinux". +When gdb gets the symbols loaded and prompts you, enter "rmt" (that's +the macro from the .gdbinit file that you just edited). If everything +is working correctly you should see gdb print out a few lines indicating +that a breakpoint has been taken. It will actually show a line of +code in the target kernel inside the gdbstub activation code. + +The gdb interaction should look something like this: + + linux-dev:/usr/src/linux# gdb vmlinux + GDB is free software and you are welcome to distribute copies of it + under certain conditions; type "show copying" to see the conditions. + There is absolutely no warranty for GDB; type "show warranty" for details. + GDB 4.15.1 (i486-slackware-linux), + Copyright 1995 Free Software Foundation, Inc... + (gdb) rmt + breakpoint () at i386-stub.c:750 + 750 } + (gdb) + + +You can now use whatever gdb commands you like to set breakpoints. +Enter "continue" to start your target machine executing again. At this +point the target system will run at full speed until it encounters +your breakpoint or gets a segment violation in the kernel, or whatever. + + +Triggering gdbstub at Kernel Boot Time +====================================== + +The gdbstub patch now allows gdb to connect to the kernel during +bootup (as opposed to waiting for the system to come all the way up and then +running the gdbstart program on the target machine). This new functionality was +added by Scott Foehner at SGI.
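Before moving on to the boot-time method, the manual procedure above can be collected into one target-side helper script. The sketch below is only illustrative; it assumes gdbstart was copied to /sbin and that COM1 at 38400 baud is being used, as in the earlier examples. It simply feeds gdbstart the carriage return it prompts for, equivalent to the heredoc form shown above:

    #!/bin/sh
    # Hypothetical target-side "debug" helper: arm the in-kernel gdb stub
    # and wait for the development machine to connect.
    # Assumes gdbstart is installed in /sbin and the null modem is on COM1.
    echo "" | /sbin/gdbstart -s 38400 -t /dev/ttyS0

Adjust the speed and tty device here to match the "remotebaud" and "target remote" settings in the .gdbinit on your development machine.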
+ +To force a kernel that has been compiled with gdbstub to pause during the boot +process and wait for a connection from gdb, the parameter "gdb" should be passed +to the kernel. This can be done by typing "gdb" after the name of the kernel +on the LILO command line. The patch defaults to using ttyS1 at a baud rate of +38400. These parameters can be changed by using "gdbttyS=" and +"gdbbaud=" on the command line. + +Example: + +LILO boot: linux gdb gdbttyS=1 gdbbaud=38400 + +Note that this command is entered on the TARGET machine as it is booting +the kernel that was compiled on the DEVELOPMENT machine. + +An alternate approach is to place a line in the /etc/lilo.conf file on +your TARGET machine. Under the heading for the kernel that you intend +to boot, place a line that looks like this: + + append = "gdb gdbttyS=1 gdbbaud=38400" + +This will cause the kernel to enter the gdbstub automatically at boot +time. + +BE SURE to run "lilo" after changing the /etc/lilo.conf file. + + +The "gdbstart" Program +====================== + +This utility program is used to set up the com port and data rate +for the connection from the target system to the development system. +Its usage has been described above. + +This version of the patch uses the same tty ioctl for kernel versions +2.0.30 onwards. Thus, the gdbstart utility does not need to be re-compiled +to install the patch in a later version of the kernel. The ioctl added +to the kernel for this purpose is far enough "off the end" of existing +ioctls (as of 2.1.120) that it should not interfere with any new kernel +tty ioctls for quite some time (famous last words). + +The source for the gdbstart program resides in the arch/i386/kernel directory. + + +Debugging hints +=============== + +You can break into the target machine at any time from the development +machine by typing ^C. If the target machine has interrupts enabled +this will stop it in the kernel and enter the debugger. + +There is unfortunately no way of breaking into the kernel if it is +in a loop with interrupts disabled, so if this happens to you then +you need to place exploratory breakpoints or printk's into the kernel +to find out where it is looping. + +There is a copy of an e-mail in the kgdb distribution directory which +describes how to create an NMI on an ISA bus machine using a paper +clip. I have a sophisticated version of this made by wiring a push +button switch into a PC104/ISA bus adapter card. The adapter card +nicely furnishes wire wrap pins for all the ISA bus signals. + +When you are done debugging the kernel on the target machine it is +a good idea to leave it in a running state. This makes reboots +faster, bypassing the fsck. So do a gdb "continue" as the last gdb +command if this is possible. To terminate gdb itself on the development +machine and leave the target machine running, type ^Z to suspend gdb +and then kill it with "kill %1" or something similar. + +If gdbstub Does Not Work +======================== + +If it doesn't work, you will have to troubleshoot it. Do the easy things +first, like double-checking your cabling and data rates. You might +try some non-kernel based programs to see if the back-to-back connection +works properly. Just something simple like cat /etc/hosts >/dev/ttyS0 +on one machine and cat /dev/ttyS0 on the other will tell you if you +can send data from one machine to the other. There is no point in tearing +out your hair in the kernel if the line doesn't work.
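For example, a quick back-to-back sanity check might look like the following. The port and speed here are only assumptions; use whatever you actually have cabled up, and note that both ends must agree on the data rate:

    # On both machines: put the port into a known state (example values).
    stty ispeed 38400 ospeed 38400 raw -echo < /dev/ttyS0

    # On the receiving machine (start this one first):
    cat /dev/ttyS0

    # On the sending machine; the text should appear on the receiver:
    cat /etc/hosts > /dev/ttyS0

If nothing comes across, fix the cabling or the line settings before suspecting the kernel.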
+ +All of the real action takes place in the file +/usr/src/linux/arch/i386/kernel/gdbstub.c. That is the code on the target +machine that interacts with gdb on the development machine. In gdb you can +turn on a debug switch with the following command: + + set remotedebug + +This will print out the protocol messages that gdb is exchanging with +the target machine. + +Another place to look is /usr/src/linux/drivers/char/gdbserial.c. +That is the code that talks to the serial port on the target side. +There might be a problem there. + +If you are really desperate you can use printk debugging in the +gdbstub code in the target kernel until you get it working. In particular, +there is a global variable in /usr/src/linux/arch/i386/kernel/gdbstub.c +named "remote_debug". Compile your kernel with this set to 1, rather +than 0, and the debug stub will print out lots of information as it +runs. + + +Debugging Loadable Modules +========================== + +This technique comes courtesy of Edouard Parmelan + + +When you run gdb, enter the command + +source gdbinit-modules + +This will read in a file of gdb macros that was installed in your +kernel source directory when kgdb was installed. This file implements +the following commands: + +mod-list + Lists the loaded modules in the form <module-address> <module-name> + +mod-print-symbols <module-address> + Prints all the symbols in the indicated module. + +mod-add-symbols <module-address> <object-file-path> + Loads the symbols from the object file and associates them + with the indicated module. + +After you have loaded the module that you want to debug, use the command +mod-list to find the <module-address> of your module. Then use that +address in the mod-add-symbols command to load your module's symbols. +From that point onward you can debug your module as if it were a part +of the kernel. + +The file gdbinit-modules also contains a command named mod-add-lis as +an example of how to construct a command of your own to load your +favorite module. The idea is to "can" the pathname of the module +in the command so you don't have to type so much. + +Threads +======= + +Each process in a target machine is seen as a gdb thread. gdb thread-related +commands (info threads, thread n) can be used. + +ia-32 hardware breakpoints +========================== + +The gdb stub contains support for hardware breakpoints using the debugging features +of ia-32 (x86) processors. These breakpoints do not need code modification; +they use debugging registers. Four hardware breakpoints are available in ia-32 +processors. + +Each hardware breakpoint can be one of the following three types. +1. Execution breakpoint - An execution breakpoint is triggered when code at the + breakpoint address is executed. + + As a limited number of hardware breakpoints are available, it is advisable + to use software breakpoints ( break command ) instead of execution + hardware breakpoints, unless modification of code is to be avoided. + +2. Write breakpoint - A write breakpoint is triggered when the memory location at the + breakpoint address is written. + + A write breakpoint can be placed for data of variable length. The length of a write + breakpoint indicates the length of the datatype to be watched. Length is 1 + for 1-byte data, 2 for 2-byte data, 3 for 4-byte data. + +3. Access breakpoint - An access breakpoint is triggered when the memory location at + the breakpoint address is either read or written. + + Access breakpoints also have lengths, similar to write breakpoints. + +IO breakpoints in ia-32 are not supported.
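As a worked example of the module-debugging commands from the previous section (not of the hardware breakpoints above), a session on the development machine might look roughly like this. The module name, load address, object path, and function name are all hypothetical:

    (gdb) source gdbinit-modules
    (gdb) mod-list
    0xd0854000 mymodule
    (gdb) mod-add-symbols 0xd0854000 /usr/src/modules/mymodule.o
    (gdb) break mymodule_ioctl
    (gdb) continue

From here the module's functions can be stepped and inspected just like code that was built into the kernel.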
+ +Since the gdb stub at present does not use the protocol used by gdb for hardware +breakpoints, hardware breakpoints are accessed through gdb macros. The gdb macros +for hardware breakpoints are described below. + +hwebrk - Places an execution breakpoint + hwebrk breakpointno address +hwwbrk - Places a write breakpoint + hwwbrk breakpointno length address +hwabrk - Places an access breakpoint + hwabrk breakpointno length address +hwrmbrk - Removes a breakpoint + hwrmbrk breakpointno +exinfo - Tells whether a software or hardware breakpoint has occurred. + Prints the number of the hardware breakpoint if a hardware breakpoint has + occurred. + +The arguments required by these commands are as follows: +breakpointno - 0 to 3 +length - 1 to 3 +address - Memory location in hex digits ( without 0x ) e.g. c015e9bc + +MP support +========== + +When a breakpoint occurs or the user issues a break ( Ctrl + C ) to the gdb client, +all the processors are forced to enter the debugger. The current thread +corresponds to the thread running on the processor where the breakpoint occurred. +Threads running on other processor(s) appear similar to other non-running +threads in the 'info threads' output. + +The ia-32 hardware debugging registers on all processors are set to the same values. +Hence a hardware breakpoint may occur on any processor. + +gdb troubleshooting +=================== + +1. gdb hangs +Kill it, restart gdb, and connect to the target machine again. + +2. gdb cannot connect to the target machine (after killing a gdb and restarting +another) +If the target machine was not inside the debugger when you killed gdb, gdb cannot +connect because the target machine won't respond. +In this case, echo a "Ctrl+C" (ASCII 3) down the serial line, +e.g. echo -e "\003" > /dev/ttyS1 +This forces the target machine into the debugger, after which you can connect. + +3. gdb cannot connect even after echoing Ctrl+C into the serial line +Try changing the serial line settings min to 1 and time to 0, +e.g. stty min 1 time 0 < /dev/ttyS1 +Try echoing again. + +Check the serial line speed and set it to the correct value if required, +e.g. stty ispeed 115200 ospeed 115200 < /dev/ttyS1 + +Final Items +=========== + +I picked up this code from Dave Grothe and enhanced it. + +If you make some really cool modification to this stuff, or if you +fix a bug, please let me know. + +Amit S. Kale + + +(First kgdb by David Grothe ) + +(modified by Tigran Aivazian ) + Putting gdbstub into the kernel config menu. + +(modified by Scott Foehner ) + Hooks for entering gdbstub at boot time. + +(modified by Amit S. Kale ) + Threads, ia-32 hw debugging, mp support, console support, + nmi watchdog handling. diff -urpN -X /home/fletch/.diff.exclude 000-virgin/Documentation/sysrq.txt 900-mjb1/Documentation/sysrq.txt --- 000-virgin/Documentation/sysrq.txt Wed Mar 26 22:54:28 2003 +++ 900-mjb1/Documentation/sysrq.txt Fri May 30 19:13:53 2003 @@ -77,6 +77,8 @@ On all - write a character to /proc/sys 'l' - Send a SIGKILL to all processes, INCLUDING init. (Your system will be non-functional after this.) +'g' - Enter the kernel debugger (if configured and supported). + 'h' - Will display help ( actually any other key than those listed above will display help.
but 'h' is easy to remember :-) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/Makefile 900-mjb1/Makefile --- 000-virgin/Makefile Fri May 30 19:01:58 2003 +++ 900-mjb1/Makefile Fri May 30 22:06:07 2003 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 5 SUBLEVEL = 70 -EXTRAVERSION = +EXTRAVERSION = -mjb1 # *DOCUMENTATION* # To see a list of typical targets execute "make help" @@ -50,8 +50,13 @@ TOPDIR := $(CURDIR) HOSTCC = gcc HOSTCXX = g++ +ifdef CONFIG_DEBUG_SYMBOLS HOSTCFLAGS = -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer HOSTCXXFLAGS = -O2 +else +HOSTCFLAGS = -Wall -Wstrict-prototypes -O2 -g -fomit-frame-pointer +HOSTCXXFLAGS = -O2 -g +endif CROSS_COMPILE = @@ -193,8 +198,13 @@ AFLAGS_KERNEL = NOSTDINC_FLAGS = -nostdinc -iwithprefix include CPPFLAGS := -D__KERNEL__ -Iinclude +ifdef CONFIG_DEBUG_SYMBOLS +CFLAGS := $(CPPFLAGS) -Wall -Wstrict-prototypes -Wno-trigraphs -O2 -g \ + -fno-strict-aliasing -fno-common +else CFLAGS := $(CPPFLAGS) -Wall -Wstrict-prototypes -Wno-trigraphs -O2 \ -fno-strict-aliasing -fno-common +endif AFLAGS := -D__ASSEMBLY__ $(CPPFLAGS) export VERSION PATCHLEVEL SUBLEVEL EXTRAVERSION KERNELRELEASE ARCH \ @@ -296,6 +306,10 @@ ifneq ($(KBUILD_BUILTIN),1) KBUILD_BUILTIN := 1 endif endif +endif + +ifdef CONFIG_X86_REMOTE_DEBUG +CFLAGS += -g endif # diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/Kconfig 900-mjb1/arch/i386/Kconfig --- 000-virgin/arch/i386/Kconfig Fri May 30 19:01:58 2003 +++ 900-mjb1/arch/i386/Kconfig Fri May 30 22:02:33 2003 @@ -383,6 +383,11 @@ config X86_OOSTORE depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 default y +config X86_CMOV + bool + depends on M686 || MPENTIUMII || MPENTIUMIII || MPENTIUM4 || MK8 || MCRUSOE + default y + config HUGETLB_PAGE bool "Huge TLB Page Support" help @@ -435,17 +440,17 @@ config NR_CPUS This is purely to save memory - each supported CPU adds approximately eight kilobytes to the kernel image. -config PREEMPT - bool "Preemptible Kernel" - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. - - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure. +# config PREEMPT +# bool "Preemptible Kernel" +# help +# This option reduces the latency of the kernel when reacting to +# real-time or interactive events by allowing a low priority process to +# be preempted even if it is in kernel mode executing a system call. +# This allows applications to run more reliably even when the system is +# under load. +# +# Say Y here if you are building a kernel for a desktop, embedded +# or real-time system. Say N if you are unsure. config X86_UP_APIC bool "Local APIC support on uniprocessors" if !SMP @@ -666,6 +671,44 @@ config HIGHMEM64G endchoice +choice + help + On i386, a process can only virtually address 4GB of memory. This + lets you select how much of that virtual space you would like to + devoted to userspace, and how much to the kernel. + + Some userspace programs would like to address as much as possible and + have few demands of the kernel other than it get out of the way. These + users may opt to use the 3.5GB option to give their userspace program + as much room as possible. Due to alignment issues imposed by PAE, + the "3.5GB" option is unavailable if "64GB" high memory support is + enabled. 
+ + Other users (especially those who use PAE) may be running out of + ZONE_NORMAL memory. Those users may benefit from increasing the + kernel's virtual address space size by taking it away from userspace, + which may not need all of its space. An indicator that this is + happening is when /proc/Meminfo's "LowFree:" is a small percentage of + "LowTotal:" while "HighFree:" is very large. + + If unsure, say "3GB" + prompt "User address space size" + default 1GB + +config 05GB + bool "3.5 GB" + depends on !HIGHMEM64G + +config 1GB + bool "3 GB" + +config 2GB + bool "2 GB" + +config 3GB + bool "1 GB" +endchoice + config HIGHMEM bool depends on HIGHMEM64G || HIGHMEM4G @@ -683,6 +726,11 @@ config NUMA default n if X86_PC default y if (X86_NUMAQ || X86_SUMMIT) +config NUMA_SCHED + bool "Numa Scheduling Support" + depends on NUMA + default y + # Need comments to help the hapless user trying to turn on NUMA support comment "NUMA (NUMA-Q) requires SMP, 64GB highmem support" depends on X86_NUMAQ && (!HIGHMEM64G || !SMP) @@ -709,6 +757,16 @@ config HIGHPTE low memory. Setting this option will put user-space page table entries in high memory. +config 4K_STACK + bool "Use smaller 4k per-task stacks" + help + This option will shrink the kernel's per-task stack from 8k to + 4k. This will greatly increase your chance of overflowing it. + But, if you use the per-cpu interrupt stacks as well, your chances + go way down. Also try the CONFIG_X86_STACK_CHECK overflow + detection. It is much more reliable than the currently in-kernel + version. + config MATH_EMULATION bool "Math emulation" ---help--- @@ -768,6 +826,33 @@ config MTRR See for more information. +choice + help + This is unrelated to your processor's speed. This variable alters + how often the system is asked to generate timer interrupts. A larger + value can lead to a more responsive system, but also causes extra + overhead from the increased number of context switches. + + If in doubt, leave it at the default of 1000. + + prompt "Kernel HZ" + default 1000HZ + +config 100HZ + bool "100 Hz" + +config 1000HZ + bool "1000 Hz" +endchoice + +config IRQBALANCE + bool "Enable kernel irq balancing" + depends on SMP + default y + help + The defalut yes will allow the kernel to do irq load balancing. + Saying no will keep the kernel from doing irq load balancing. + config HAVE_DEC_LOCK bool depends on (SMP || PREEMPT) && X86_CMPXCHG @@ -1456,12 +1541,58 @@ source "arch/i386/oprofile/Kconfig" menu "Kernel hacking" +config CRASH_DUMP + tristate "Crash dump support (EXPERIMENTAL)" + depends on EXPERIMENTAL + default n + ---help--- + Say Y here to enable saving an image of system memory when a panic + or other error occurs. Dumps can also be forced with the SysRq+d + key if MAGIC_SYSRQ is enabled. + +config CRASH_DUMP_BLOCKDEV + tristate "Crash dump block device driver" + depends on CRASH_DUMP + help + Say Y to allow saving crash dumps directly to a disk device. + +config CRASH_DUMP_NETDEV + tristate "Crash dump network device driver" + depends on CRASH_DUMP + help + Say Y to allow saving crash dumps over a network device. + +config CRASH_DUMP_COMPRESS_RLE + tristate "Crash dump RLE compression" + depends on CRASH_DUMP + help + Say Y to allow saving dumps with Run Length Encoding compression. + +config CRASH_DUMP_COMPRESS_GZIP + tristate "Crash dump GZIP compression" + depends on CRASH_DUMP + help + Say Y to allow saving dumps with Gnu Zip compression. 
+ config DEBUG_KERNEL bool "Kernel debugging" help Say Y here if you are developing drivers or trying to debug and identify kernel problems. +config KPROBES + bool "Kprobes" + depends on DEBUG_KERNEL + help + Kprobes allows you to trap at almost any kernel address, using + register_kprobe(), and providing a callback function. This is useful + for kernel debugging, non-intrusive instrumentation and testing. If + in doubt, say "N". + +config DEBUG_SYMBOLS + bool "Get debug symbols (turns on -g)" + depends on DEBUG_KERNEL + config DEBUG_STACKOVERFLOW bool "Check for stack overflows" depends on DEBUG_KERNEL @@ -1474,6 +1605,17 @@ config DEBUG_SLAB allocation as well as poisoning memory on free to catch use of freed memory. +config X86_REMOTE_DEBUG + bool "KGDB: Remote (serial) kernel debugging with gdb" + +config KGDB_THREAD + bool "KGDB: Thread analysis" + depends on X86_REMOTE_DEBUG + +config GDB_CONSOLE + bool "KGDB: Console messages through gdb" + depends on X86_REMOTE_DEBUG + config DEBUG_IOVIRT bool "Memory mapped I/O debugging" depends on DEBUG_KERNEL @@ -1499,6 +1641,26 @@ config MAGIC_SYSRQ keys are documented in . Don't say Y unless you really know what this hack does. +config EARLY_PRINTK + bool "Early console support" + default n + depends on DEBUG_KERNEL + help + Write kernel log output directly into the VGA buffer or serial port. + This is useful for kernel debugging when your machine crashes very + early before the console code is initialized. For normal operation + it is not recommended because it looks ugly and doesn't cooperate + with klogd/syslogd or the X server.You should normally N here, + unless you want to debug such a crash. + + Syntax: earlyprintk=vga + earlyprintk=serial[,ttySn[,baudrate]] + Append ,keep to not disable it when the real console takes over. + Only vga or serial at a time, not both. + Currently only ttyS0 and ttyS1 are supported. + Interaction with the standard serial driver is not very good. + The VGA output is eventually overwritten by the real console. + config DEBUG_SPINLOCK bool "Spinlock debugging" depends on DEBUG_KERNEL @@ -1508,6 +1670,15 @@ config DEBUG_SPINLOCK best used in conjunction with the NMI watchdog so that spinlock deadlocks are also debuggable. +config SPINLINE + bool "Spinlock inlining" + depends on DEBUG_KERNEL + help + This will change spinlocks from out of line to inline, making them + account cost to the callers in readprofile, rather than the lock + itself (as ".text.lock.filename"). This can be helpful for finding + the callers of locks. + config DEBUG_HIGHMEM bool "Highmem debugging" depends on DEBUG_KERNEL && HIGHMEM @@ -1528,13 +1699,42 @@ config DEBUG_SPINLOCK_SLEEP If you say Y here, various routines which may sleep will become very noisy if they are called with a spinlock held. +config LOCKMETER + bool "Kernel lock metering" + depends on SMP + help + Say Y to enable kernel lock metering, which adds overhead to SMP + locks, but allows you to see various statistics using the lockstat + command + config FRAME_POINTER - bool "Compile the kernel with frame pointers" + bool + default y if X86_REMOTE_DEBUG + default n if !X86_REMOTE_DEBUG help If you say Y here the resulting kernel image will be slightly larger and slower, but it will give very useful debugging information. If you don't debug the kernel, you can say N, but we may not be able to solve problems without frame pointers. 
+ +config X86_STACK_CHECK + bool "Detect stack overflows" + depends on FRAME_POINTER + help + Say Y here to have the kernel attempt to detect when the per-task + kernel stack overflows. This is much more robust checking than + the above overflow check, which will only occasionally detect + an overflow. The level of guarantee here is much greater. + + Some older versions of gcc don't handle the -p option correctly. + Kernprof is affected by the same problem, which is described here: + http://oss.sgi.com/projects/kernprof/faq.html#Q9 + + Basically, if you get oopses in __free_pages_ok during boot when + you have this turned on, you need to fix gcc. The Redhat 2.96 + version and gcc-3.x seem to work. + + If not debugging a stack overflow problem, say N config X86_EXTRA_IRQS bool diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/Makefile 900-mjb1/arch/i386/Makefile --- 000-virgin/arch/i386/Makefile Fri May 30 19:01:58 2003 +++ 900-mjb1/arch/i386/Makefile Fri May 30 19:24:43 2003 @@ -81,6 +81,10 @@ core-$(CONFIG_X86_GENERICARCH) += arch/i # default subarch .h files mflags-y += -Iinclude/asm-i386/mach-default +ifdef CONFIG_X86_STACK_CHECK +CFLAGS += -p +endif + head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o libs-y += arch/i386/lib/ @@ -94,6 +98,7 @@ drivers-$(CONFIG_OPROFILE) += arch/i386 CFLAGS += $(mflags-y) AFLAGS += $(mflags-y) +AFLAGS_vmlinux.lds.o += -imacros $(TOPDIR)/include/asm-i386/page.h boot := arch/i386/boot diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/boot/Makefile 900-mjb1/arch/i386/boot/Makefile --- 000-virgin/arch/i386/boot/Makefile Mon Mar 17 21:43:38 2003 +++ 900-mjb1/arch/i386/boot/Makefile Fri May 30 19:24:56 2003 @@ -101,3 +101,4 @@ zlilo: $(BOOTIMAGE) install: $(BOOTIMAGE) sh $(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)" + if [ -f init/kerntypes.o ]; then cp init/kerntypes.o $(INSTALL_PATH)/Kerntypes; fi diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/boot/compressed/misc.c 900-mjb1/arch/i386/boot/compressed/misc.c --- 000-virgin/arch/i386/boot/compressed/misc.c Sun Apr 20 19:34:56 2003 +++ 900-mjb1/arch/i386/boot/compressed/misc.c Fri May 30 19:24:43 2003 @@ -379,3 +379,7 @@ asmlinkage int decompress_kernel(struct if (high_loaded) close_output_buffer_if_we_run_high(mv); return high_loaded; } + +/* We don't actually check for stack overflows this early. 
*/ +__asm__(".globl mcount ; mcount: ret\n"); + diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/Makefile 900-mjb1/arch/i386/kernel/Makefile --- 000-virgin/arch/i386/kernel/Makefile Fri May 30 19:01:58 2003 +++ 900-mjb1/arch/i386/kernel/Makefile Fri May 30 19:26:44 2003 @@ -17,6 +17,7 @@ obj-$(CONFIG_MCA) += mca.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_MICROCODE) += microcode.o +obj-$(CONFIG_X86_REMOTE_DEBUG) += gdbstub.o obj-$(CONFIG_PM) += suspend.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_X86_SMP) += smp.o smpboot.o @@ -26,10 +27,20 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o n obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o suspend_asm.o obj-$(CONFIG_X86_NUMAQ) += numaq.o +obj-$(CONFIG_X86_SUMMIT) += summit.o obj-$(CONFIG_EDD) += edd.o +obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_MODULES) += module.o obj-y += sysenter.o vsyscall.o obj-$(CONFIG_ACPI_SRAT) += srat.o + +ifdef CONFIG_X86_REMOTE_DEBUG +GDBSTART=gdbstart +GDBCLEAN= -rm -f gdbstart /sbin/gdbstart +else +GDBSTART= +GDBCLEAN= +endif EXTRA_AFLAGS := -traditional diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/apic.c 900-mjb1/arch/i386/kernel/apic.c --- 000-virgin/arch/i386/kernel/apic.c Fri May 30 19:01:58 2003 +++ 900-mjb1/arch/i386/kernel/apic.c Fri May 30 19:23:03 2003 @@ -1043,7 +1043,8 @@ inline void smp_local_timer_interrupt(st * interrupt as well. Thus we cannot inline the local irq ... ] */ -void smp_apic_timer_interrupt(struct pt_regs regs) +struct pt_regs * IRQHANDLER(smp_apic_timer_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_apic_timer_interrupt(struct pt_regs* regs) { int cpu = smp_processor_id(); @@ -1063,14 +1064,16 @@ void smp_apic_timer_interrupt(struct pt_ * interrupt lock, which is the WrongThing (tm) to do. */ irq_enter(); - smp_local_timer_interrupt(®s); + smp_local_timer_interrupt(regs); irq_exit(); + return regs; } /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -asmlinkage void smp_spurious_interrupt(void) +struct pt_regs * IRQHANDLER(smp_spurious_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_spurious_interrupt(struct pt_regs* regs) { unsigned long v; @@ -1088,13 +1091,15 @@ asmlinkage void smp_spurious_interrupt(v printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never happen.\n", smp_processor_id()); irq_exit(); + return regs; } /* * This interrupt should never happen with our APIC/SMP architecture */ -asmlinkage void smp_error_interrupt(void) +struct pt_regs * IRQHANDLER(smp_error_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_error_interrupt(struct pt_regs* regs) { unsigned long v, v1; @@ -1119,6 +1124,7 @@ asmlinkage void smp_error_interrupt(void printk (KERN_INFO "APIC error on CPU%d: %02lx(%02lx)\n", smp_processor_id(), v , v1); irq_exit(); + return regs; } /* diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/cpu/common.c 900-mjb1/arch/i386/kernel/cpu/common.c --- 000-virgin/arch/i386/kernel/cpu/common.c Sun Apr 20 19:34:56 2003 +++ 900-mjb1/arch/i386/kernel/cpu/common.c Fri May 30 21:58:44 2003 @@ -433,9 +433,9 @@ void __init early_cpu_init(void) } /* * cpu_init() initializes state that is per-CPU. Some data is already - * initialized (naturally) in the bootstrap process, such as the GDT - * and IDT. We reload them nevertheless, this function acts as a - * 'CPU state barrier', nothing should get across. + * initialized (naturally) in the bootstrap process, such as the GDT. 
+ * We reload them nevertheless, this function acts as a 'CPU state barrier', + * nothing should get across. */ void __init cpu_init (void) { @@ -459,8 +459,8 @@ void __init cpu_init (void) } /* - * Initialize the per-CPU GDT with the boot GDT, - * and set up the GDT descriptor: + * Initialize the per-CPU GDTs with the boot equivalents, + * and set up the descriptors: */ if (cpu) { memcpy(cpu_gdt_table[cpu], cpu_gdt_table[0], GDT_SIZE); @@ -473,7 +473,6 @@ void __init cpu_init (void) memcpy(thread->tls_array, cpu_gdt_table[cpu], GDT_ENTRY_TLS_ENTRIES * 8); __asm__ __volatile__("lgdt %0": "=m" (cpu_gdt_descr[cpu])); - __asm__ __volatile__("lidt %0": "=m" (idt_descr)); /* * Delete NT @@ -517,3 +516,31 @@ void __init cpu_init (void) current->used_math = 0; stts(); } + +/* + * copy over the boot node idt across all nodes, we currently only have + * non-unique idt entries for device io interrupts. + */ +void __init setup_node_idts(void) +{ + int node = MAX_NUMNODES; + + /* we can skip setting up node0 since it's done in head.S */ + while (--node) { + memcpy(node_idt_table[node], node_idt_table[0], IDT_SIZE); + node_idt_descr[node].size = IDT_SIZE - 1; + node_idt_descr[node].address = (unsigned long)node_idt_table[node]; + } +} + +void __init setup_cpu_idt(void) +{ + int cpu = smp_processor_id(), node = cpu_to_node(cpu); + + printk(KERN_DEBUG "CPU%d IDT at 0x%08lx\n", + cpu, node_idt_descr[node].address); + + /* reload the idt on all processors as they come up */ + __asm__ __volatile__("lidt %0": "=m" (node_idt_descr[node])); +} + diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/cpu/mcheck/p4.c 900-mjb1/arch/i386/kernel/cpu/mcheck/p4.c --- 000-virgin/arch/i386/kernel/cpu/mcheck/p4.c Sun Apr 20 19:34:56 2003 +++ 900-mjb1/arch/i386/kernel/cpu/mcheck/p4.c Fri May 30 19:23:03 2003 @@ -61,11 +61,13 @@ static void intel_thermal_interrupt(stru /* Thermal interrupt handler for this CPU setup */ static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; -asmlinkage void smp_thermal_interrupt(struct pt_regs regs) +struct pt_regs * IRQHANDLER(smp_thermal_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_thermal_interrupt(struct pt_regs* regs) { irq_enter(); vendor_thermal_interrupt(®s); irq_exit(); + return regs; } /* P4/Xeon Thermal regulation detect and init */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/doublefault.c 900-mjb1/arch/i386/kernel/doublefault.c --- 000-virgin/arch/i386/kernel/doublefault.c Tue Feb 25 23:03:43 2003 +++ 900-mjb1/arch/i386/kernel/doublefault.c Fri May 30 21:58:44 2003 @@ -16,7 +16,7 @@ static unsigned long doublefault_stack[D static void doublefault_fn(void) { - struct Xgt_desc_struct gdt_desc = {0, 0}; + struct Xdt_desc_struct gdt_desc = {0, 0}; unsigned long gdt, tss; __asm__ __volatile__("sgdt %0": "=m" (gdt_desc): :"memory"); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/entry.S 900-mjb1/arch/i386/kernel/entry.S --- 000-virgin/arch/i386/kernel/entry.S Fri May 30 19:01:58 2003 +++ 900-mjb1/arch/i386/kernel/entry.S Fri May 30 19:28:10 2003 @@ -49,6 +49,10 @@ #include #include "irq_vectors.h" +#ifndef CONFIG_KGDB_THREAD +#define user_schedule schedule +#endif + EBX = 0x00 ECX = 0x04 EDX = 0x08 @@ -160,7 +164,7 @@ do_lcall: movl %eax,EFLAGS(%ebp) # movl %edx,EIP(%ebp) # Now we move them to their "normal" places movl %ecx,CS(%ebp) # - andl $-8192, %ebp # GET_THREAD_INFO + GET_THREAD_INFO_WITH_ESP(%ebp) # GET_THREAD_INFO movl TI_EXEC_DOMAIN(%ebp), %edx # Get the execution 
domain call *4(%edx) # Call the lcall7 handler for the domain addl $4, %esp @@ -224,7 +228,7 @@ need_resched: jz restore_all movl $PREEMPT_ACTIVE,TI_PRE_COUNT(%ebp) sti - call schedule + call user_schedule movl $0,TI_PRE_COUNT(%ebp) cli jmp need_resched @@ -306,7 +310,7 @@ work_pending: testb $_TIF_NEED_RESCHED, %cl jz work_notifysig work_resched: - call schedule + call user_schedule cli # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret @@ -394,17 +398,78 @@ ENTRY(irq_entries_start) vector=vector+1 .endr + +# lets play optimizing compiler... +#ifdef CONFIG_X86_CMOV +#define COND_MOVE cmovnz %esi,%esp; +#else +#define COND_MOVE \ + jz 1f; \ + mov %esi,%esp; \ +1: +#endif + +# These macros will switch you to, and from a per-cpu interrupt stack +# They take the pt_regs arg and move it from the normal place on the +# stack to %eax. Any handler function can retrieve it using regparm(1). +# The handlers are expected to return the stack to switch back to in +# the same register. +# +# This means that the irq handlers need to return their arg +# +# SWITCH_TO_IRQSTACK clobbers %ebx, %ecx, %edx, %esi +# old stack gets put in %eax + +.macro SWITCH_TO_IRQSTACK + GET_THREAD_INFO(%ebx); + movl TI_IRQ_STACK(%ebx),%ecx; + movl TI_TASK(%ebx),%edx; + movl %esp,%eax; + + # %ecx+THREAD_SIZE is next stack -4 keeps us in the right one + leal (THREAD_SIZE-4)(%ecx),%esi; + + # is there a valid irq_stack? + testl %ecx,%ecx; + COND_MOVE; + + # update the task pointer in the irq stack + GET_THREAD_INFO(%esi); + movl %edx,TI_TASK(%esi); + + # update the preempt count in the irq stack + movl TI_PRE_COUNT(%ebx),%ecx; + movl %ecx,TI_PRE_COUNT(%esi); +.endm + +# copy flags from the irq stack back into the task's thread_info +# %esi is saved over the irq handler call and contains the irq stack's +# thread_info pointer +# %eax was returned from the handler, as described above +# %ebx contains the original thread_info pointer + +.macro RESTORE_FROM_IRQSTACK + movl %eax,%esp; + movl TI_FLAGS(%esi),%eax; + movl $0,TI_FLAGS(%esi); + LOCK orl %eax,TI_FLAGS(%ebx); +.endm + ALIGN common_interrupt: SAVE_ALL + SWITCH_TO_IRQSTACK call do_IRQ + RESTORE_FROM_IRQSTACK jmp ret_from_intr #define BUILD_INTERRUPT(name, nr) \ ENTRY(name) \ pushl $nr-256; \ SAVE_ALL \ - call smp_/**/name; \ + SWITCH_TO_IRQSTACK; \ + call smp_/**/name; \ + RESTORE_FROM_IRQSTACK; \ jmp ret_from_intr; /* The include is where all of the SMP etc. interrupts come from */ @@ -493,9 +558,16 @@ ENTRY(debug) jne debug_stack_correct FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) debug_stack_correct: + pushl $-1 # mark this as an int + SAVE_ALL + movl %esp,%edx pushl $0 - pushl $do_debug - jmp error_code + pushl %edx + call do_debug + addl $8,%esp + testl %eax,%eax + jnz restore_all + jmp ret_from_exception /* * NMI is doubly nasty. 
It can happen _while_ we're handling @@ -535,9 +607,16 @@ nmi_debug_stack_fixup: jmp nmi_stack_correct ENTRY(int3) + pushl $-1 # mark this as an int + SAVE_ALL + movl %esp,%edx pushl $0 - pushl $do_int3 - jmp error_code + pushl %edx + call do_int3 + addl $8,%esp + testl %eax,%eax + jnz restore_all + jmp ret_from_exception ENTRY(overflow) pushl $0 @@ -563,6 +642,31 @@ ENTRY(invalid_TSS) pushl $do_invalid_TSS jmp error_code +#ifdef CONFIG_KGDB_THREAD +ENTRY(kern_schedule) + pushl %ebp + movl %esp, %ebp + pushl %ss + pushl %ebp + pushfl + pushl %cs + pushl 4(%ebp) + pushl %eax + pushl %es + pushl %ds + pushl %eax + pushl (%ebp) + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + call kern_do_schedule + movl %ebp, %esp + pop %ebp + ret +#endif + ENTRY(segment_not_present) pushl $do_segment_not_present jmp error_code @@ -595,6 +699,61 @@ ENTRY(spurious_interrupt_bug) pushl $do_spurious_interrupt_bug jmp error_code + +#ifdef CONFIG_X86_STACK_CHECK +.data + .globl stack_overflowed +stack_overflowed: + .long 0 +.text + +ENTRY(mcount) + push %eax + movl $(THREAD_SIZE - 1),%eax + andl %esp,%eax + cmpl $STACK_WARN,%eax /* more than half the stack is used*/ + jle 1f +2: + popl %eax + ret +1: + lock; btsl $0,stack_overflowed + jc 2b + + # switch to overflow stack + movl %esp,%eax + movl $(stack_overflow_stack + THREAD_SIZE - 4),%esp + + pushf + cli + pushl %eax + + # push eip then esp of error for stack_overflow_panic + pushl 4(%eax) + pushl %eax + + # update the task pointer and cpu in the overflow stack's thread_info. + GET_THREAD_INFO_WITH_ESP(%eax) + movl TI_TASK(%eax),%ebx + movl %ebx,stack_overflow_stack+TI_TASK + movl TI_CPU(%eax),%ebx + movl %ebx,stack_overflow_stack+TI_CPU + + call stack_overflow + + # pop off call arguments + addl $8,%esp + + popl %eax + popf + movl %eax,%esp + popl %eax + movl $0,stack_overflowed + ret + +#warning stack check enabled +#endif + .data ENTRY(sys_call_table) .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ @@ -865,6 +1024,7 @@ ENTRY(sys_call_table) .long sys_clock_gettime /* 265 */ .long sys_clock_getres .long sys_clock_nanosleep + .long sys_membind nr_syscalls=(.-sys_call_table)/4 diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/gdbstart.c 900-mjb1/arch/i386/kernel/gdbstart.c --- 000-virgin/arch/i386/kernel/gdbstart.c Wed Dec 31 16:00:00 1969 +++ 900-mjb1/arch/i386/kernel/gdbstart.c Fri May 30 19:13:53 2003 @@ -0,0 +1,147 @@ +/* + * This program opens a tty file and issues the GDB stub activating + * ioctl on it. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +char *tty_name = "/dev/ttyS0" ; /* COM1 port */ +int speed = 9600 ; /* default speed */ +struct termios save_ts ; /* original term struct */ + +void print_usage(void) +{ + printf("gdbstub [-s speed] [-t tty-dev]\n") ; + printf(" defaults: /dev/ttyS0 with speed unmodified by this program\n"); + +} /* print_usage */ + +void tty_err(char *msg) +{ + char buf[100] ; + + strcpy(buf, msg) ; + strcat(buf, ": ") ; + strcat(buf, tty_name) ; + perror(buf) ; + exit(1) ; + +} /* tty_err */ + + +void setup_term(int fd) +{ + struct termios ts ; + int speed_code ; + + if (tcgetattr(fd, &ts) < 0) tty_err("tcgetattr") ; + + save_ts = ts ; + switch (speed) + { + case 4800: + speed_code = B4800 ; + break ; + case 9600: + speed_code = B9600 ; + break ; + case 19200: + speed_code = B19200 ; + break ; + case 38400: + speed_code = B38400 ; + break ; + case 57600: + speed_code = B57600 ; + break ; + case 115200: + speed_code = B115200 ; + break ; + case 230400: + speed_code = B230400 ; + break ; + default: + printf("Invalid speed: %d\n", speed) ; + exit(1) ; + } + + ts.c_cflag = CS8 | CREAD | CLOCAL ; + if (cfsetospeed(&ts, speed_code) < 0) tty_err("cfsetospeed") ; + if (cfsetispeed(&ts, speed_code) < 0) tty_err("cfsetispeed") ; + + if (tcsetattr(fd, TCSANOW, &ts) < 0) tty_err("tcsetattr") ; + +} /* setup_term */ + +int main(int argc, char **argv) +{ + int opt ; + int fil ; + int rslt ; + + while ((opt = getopt(argc, argv, "hs:t:")) > 0) + { + switch (opt) + { + case 's': + speed = atol(optarg) ; + break ; + case 't': + tty_name = optarg ; + break ; + case ':': + printf("Invalid option\n") ; + break ; + case '?': + case 'h': + default: + print_usage() ; + return 1; + } + } + + fil = open(tty_name, O_RDWR) ; + if (fil < 0) + { + perror(tty_name) ; + return 1; + } + + + setup_term(fil) ; + + /* + * When we issue this ioctl, control will not return until + * the debugger running on the remote host machine says "go". + */ + printf("\nAbout to activate GDB stub in the kernel on %s\n", tty_name) ; + printf("Hit CR to continue, kill program to abort -- ") ; + getchar() ; + sync() ; + rslt = ioctl(fil, TIOCGDB, 0) ; + if (rslt < 0) + { + perror("TIOCGDB ioctl") ; + return 1; + } + + printf("\nGDB stub successfully activated\n") ; + + for (;;) + { + pause() ; + } + + if (tcsetattr(fil, TCSANOW, &save_ts) < 0) tty_err("tcsetattr") ; + + exit(0); +} /* main */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/gdbstub.c 900-mjb1/arch/i386/kernel/gdbstub.c --- 000-virgin/arch/i386/kernel/gdbstub.c Wed Dec 31 16:00:00 1969 +++ 900-mjb1/arch/i386/kernel/gdbstub.c Fri May 30 19:13:53 2003 @@ -0,0 +1,1208 @@ +/* + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +/* + * Copyright (C) 2000-2001 VERITAS Software Corporation. 
+ */ +/**************************************************************************** + * Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $ + * + * Module name: remcom.c $ + * Revision: 1.34 $ + * Date: 91/03/09 12:29:49 $ + * Contributor: Lake Stevens Instrument Division$ + * + * Description: low level support for gdb debugger. $ + * + * Considerations: only works on target hardware $ + * + * Written by: Glenn Engel $ + * Updated by: Amit Kale + * ModuleState: Experimental $ + * + * NOTES: See Below $ + * + * Modified for 386 by Jim Kingdon, Cygnus Support. + * Origianl kgdb, compatibility with 2.1.xx kernel by David Grothe + * Integrated into 2.2.5 kernel by Tigran Aivazian + * thread support, + * support for multiple processors, + * support for ia-32(x86) hardware debugging, + * Console support, + * handling nmi watchdog + * Amit S. Kale ( akale@veritas.com ) + * + * + * To enable debugger support, two things need to happen. One, a + * call to set_debug_traps() is necessary in order to allow any breakpoints + * or error conditions to be properly intercepted and reported to gdb. + * Two, a breakpoint needs to be generated to begin communication. This + * is most easily accomplished by a call to breakpoint(). Breakpoint() + * simulates a breakpoint by executing an int 3. + * + ************* + * + * The following gdb commands are supported: + * + * command function Return value + * + * g return the value of the CPU registers hex data or ENN + * G set the value of the CPU registers OK or ENN + * + * mAA..AA,LLLL Read LLLL bytes at address AA..AA hex data or ENN + * MAA..AA,LLLL: Write LLLL bytes at address AA.AA OK or ENN + * + * c Resume at current address SNN ( signal NN) + * cAA..AA Continue at address AA..AA SNN + * + * s Step one instruction SNN + * sAA..AA Step one instruction from AA..AA SNN + * + * k kill + * + * ? What was the last sigval ? SNN (signal NN) + * + * All commands and responses are sent with a packet which includes a + * checksum. A packet consists of + * + * $#. + * + * where + * :: + * :: < two hex digits computed as modulo 256 sum of > + * + * When a packet is received, it is first acknowledged with either '+' or '-'. + * '+' indicates a successful transfer. '-' indicates a failed transfer. + * + * Example: + * + * Host: Reply: + * $m0,10#2a +$00010203040506070809101112131415#42 + * + ****************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for linux pt_regs struct */ +#include +#ifdef CONFIG_GDB_CONSOLE +#include +#endif +#include + +/************************************************************************ + * + * external low-level support routines + */ +typedef void (*Function) (void); /* pointer to a function */ + +/* Thread reference */ +typedef unsigned char threadref[8]; + +extern int putDebugChar(int); /* write a single character */ +extern int getDebugChar(void); /* read and return a single char */ + +extern int pid_max; + +/************************************************************************/ +/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/ +/* at least NUMREGBYTES*2 are needed for register packets */ +/* Longer buffer is needed to list all threads */ +#define BUFMAX 1024 + +static char initialized; /* boolean flag. != 0 means we've been initialized */ + +static const char hexchars[] = "0123456789abcdef"; + +/* Number of bytes of registers. 
*/ +#define NUMREGBYTES 64 +/* + * Note that this register image is in a different order than + * the register image that Linux produces at interrupt time. + * + * Linux's register image is defined by struct pt_regs in ptrace.h. + * Just why GDB uses a different order is a historical mystery. + */ +enum regnames { _EAX, /* 0 */ + _ECX, /* 1 */ + _EDX, /* 2 */ + _EBX, /* 3 */ + _ESP, /* 4 */ + _EBP, /* 5 */ + _ESI, /* 6 */ + _EDI, /* 7 */ + _PC /* 8 also known as eip */ , + _PS /* 9 also known as eflags */ , + _CS, /* 10 */ + _SS, /* 11 */ + _DS, /* 12 */ + _ES, /* 13 */ + _FS, /* 14 */ + _GS +}; /* 15 */ + +/*************************** ASSEMBLY CODE MACROS *************************/ +/* */ + +#define BREAKPOINT() asm(" int $3"); + +/* Put the error code here just in case the user cares. */ +int gdb_i386errcode; +/* Likewise, the vector number here (since GDB only gets the signal + number through the usual means, and that's not very specific). */ +int gdb_i386vector = -1; + +static spinlock_t slavecpulocks[KGDB_MAX_NO_CPUS]; +volatile int procindebug[KGDB_MAX_NO_CPUS]; + +#ifdef CONFIG_SMP +spinlock_t kgdb_spinlock = SPIN_LOCK_UNLOCKED; +spinlock_t kgdb_nmispinlock = SPIN_LOCK_UNLOCKED; +#else +unsigned kgdb_spinlock = 0; +unsigned kgdb_nmispinlock = 0; +#endif + +static void +kgdb_usercode(void) +{ +} + +int +hex(char ch) +{ + if ((ch >= 'a') && (ch <= 'f')) + return (ch - 'a' + 10); + if ((ch >= '0') && (ch <= '9')) + return (ch - '0'); + if ((ch >= 'A') && (ch <= 'F')) + return (ch - 'A' + 10); + return (-1); +} + +/* scan for the sequence $# */ +void +getpacket(char *buffer) +{ + unsigned char checksum; + unsigned char xmitcsum; + int i; + int count; + char ch; + + do { + /* wait around for the start character, ignore all other characters */ + while ((ch = (getDebugChar() & 0x7f)) != '$') ; + checksum = 0; + xmitcsum = -1; + + count = 0; + + /* now, read until a # or end of buffer is found */ + while (count < BUFMAX) { + ch = getDebugChar() & 0x7f; + if (ch == '#') + break; + checksum = checksum + ch; + buffer[count] = ch; + count = count + 1; + } + buffer[count] = 0; + + if (ch == '#') { + xmitcsum = hex(getDebugChar() & 0x7f) << 4; + xmitcsum += hex(getDebugChar() & 0x7f); + + if (checksum != xmitcsum) + putDebugChar('-'); /* failed checksum */ + else { + putDebugChar('+'); /* successful transfer */ + /* if a sequence char is present, reply the sequence ID */ + if (buffer[2] == ':') { + putDebugChar(buffer[0]); + putDebugChar(buffer[1]); + /* remove sequence chars from buffer */ + count = strlen(buffer); + for (i = 3; i <= count; i++) + buffer[i - 3] = buffer[i]; + } + } + } + } while (checksum != xmitcsum); + +} + +/* send the packet in buffer. */ + +void +putpacket(char *buffer) +{ + unsigned char checksum; + int count; + char ch; + + /* $#. 
*/ + do { + putDebugChar('$'); + checksum = 0; + count = 0; + + while ((ch = buffer[count])) { + if (!putDebugChar(ch)) + return; + checksum += ch; + count += 1; + } + + putDebugChar('#'); + putDebugChar(hexchars[checksum >> 4]); + putDebugChar(hexchars[checksum % 16]); + + } while ((getDebugChar() & 0x7f) != '+'); + +} + +static char remcomInBuffer[BUFMAX]; +static char remcomOutBuffer[BUFMAX]; +static short error; + +static void +regs_to_gdb_regs(int *gdb_regs, struct pt_regs *regs) +{ + gdb_regs[_EAX] = regs->eax; + gdb_regs[_EBX] = regs->ebx; + gdb_regs[_ECX] = regs->ecx; + gdb_regs[_EDX] = regs->edx; + gdb_regs[_ESI] = regs->esi; + gdb_regs[_EDI] = regs->edi; + gdb_regs[_EBP] = regs->ebp; + gdb_regs[_DS] = regs->xds; + gdb_regs[_ES] = regs->xes; + gdb_regs[_PS] = regs->eflags; + gdb_regs[_CS] = regs->xcs; + gdb_regs[_PC] = regs->eip; + gdb_regs[_ESP] = (int) (®s->esp); + gdb_regs[_SS] = __KERNEL_DS; + gdb_regs[_FS] = 0xFFFF; + gdb_regs[_GS] = 0xFFFF; +} /* regs_to_gdb_regs */ + +static void +gdb_regs_to_regs(int *gdb_regs, struct pt_regs *regs) +{ + regs->eax = gdb_regs[_EAX]; + regs->ebx = gdb_regs[_EBX]; + regs->ecx = gdb_regs[_ECX]; + regs->edx = gdb_regs[_EDX]; + regs->esi = gdb_regs[_ESI]; + regs->edi = gdb_regs[_EDI]; + regs->ebp = gdb_regs[_EBP]; + regs->xds = gdb_regs[_DS]; + regs->xes = gdb_regs[_ES]; + regs->eflags = gdb_regs[_PS]; + regs->xcs = gdb_regs[_CS]; + regs->eip = gdb_regs[_PC]; +#if 0 /* can't change these */ + regs->esp = gdb_regs[_ESP]; + regs->xss = gdb_regs[_SS]; + regs->fs = gdb_regs[_FS]; + regs->gs = gdb_regs[_GS]; +#endif + +} /* gdb_regs_to_regs */ + +/* Indicate to caller of mem2hex or hex2mem that there has been an + error. */ +static volatile int kgdb_memerr = 0; +volatile int kgdb_memerr_expected = 0; +static volatile int kgdb_memerr_cnt = 0; +static int garbage_loc = -1; + +int +get_char(char *addr) +{ + return *addr; +} + +void +set_char(char *addr, int val) +{ + *addr = val; +} + +/* convert the memory pointed to by mem into hex, placing result in buf */ +/* return a pointer to the last char put in buf (null) */ +/* If MAY_FAULT is non-zero, then we should set kgdb_memerr in response to + a fault; if zero treat a fault like any other fault in the stub. 
*/ +char * +mem2hex(char *mem, char *buf, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + kgdb_memerr_expected = 1; + kgdb_memerr = 0; + } + for (i = 0; i < count; i++) { + + ch = get_char(mem++); + + if (may_fault && kgdb_memerr) { + *buf = 0; /* truncate buffer */ + return (buf); + } + *buf++ = hexchars[ch >> 4]; + *buf++ = hexchars[ch % 16]; + } + *buf = 0; + if (may_fault) + kgdb_memerr_expected = 0; + return (buf); +} + +/* convert the hex array pointed to by buf into binary to be placed in mem */ +/* return a pointer to the character AFTER the last byte written */ +char * +hex2mem(char *buf, char *mem, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + kgdb_memerr_expected = 1; + kgdb_memerr = 0; + } + for (i = 0; i < count; i++) { + ch = hex(*buf++) << 4; + ch = ch + hex(*buf++); + set_char(mem++, ch); + + if (may_fault && kgdb_memerr) { + return (mem); + } + } + if (may_fault) + kgdb_memerr_expected = 0; + return (mem); +} + +/**********************************************/ +/* WHILE WE FIND NICE HEX CHARS, BUILD AN INT */ +/* RETURN NUMBER OF CHARS PROCESSED */ +/**********************************************/ +int +hexToInt(char **ptr, int *intValue) +{ + int numChars = 0; + int hexValue; + + *intValue = 0; + + while (**ptr) { + hexValue = hex(**ptr); + if (hexValue >= 0) { + *intValue = (*intValue << 4) | hexValue; + numChars++; + } else + break; + + (*ptr)++; + } + + return (numChars); +} + +#ifdef CONFIG_KGDB_THREAD +static int +stubhex(int ch) +{ + if (ch >= 'a' && ch <= 'f') + return ch - 'a' + 10; + if (ch >= '0' && ch <= '9') + return ch - '0'; + if (ch >= 'A' && ch <= 'F') + return ch - 'A' + 10; + return -1; +} + +static int +stub_unpack_int(char *buff, int fieldlength) +{ + int nibble; + int retval = 0; + + while (fieldlength) { + nibble = stubhex(*buff++); + retval |= nibble; + fieldlength--; + if (fieldlength) + retval = retval << 4; + } + return retval; +} +#endif + +static char * +pack_hex_byte(char *pkt, int byte) +{ + *pkt++ = hexchars[(byte >> 4) & 0xf]; + *pkt++ = hexchars[(byte & 0xf)]; + return pkt; +} + +#define BUF_THREAD_ID_SIZE 16 + +#ifdef CONFIG_KGDB_THREAD +static char * +pack_threadid(char *pkt, threadref * id) +{ + char *limit; + unsigned char *altid; + + altid = (unsigned char *) id; + limit = pkt + BUF_THREAD_ID_SIZE; + while (pkt < limit) + pkt = pack_hex_byte(pkt, *altid++); + return pkt; +} + +static char * +unpack_byte(char *buf, int *value) +{ + *value = stub_unpack_int(buf, 2); + return buf + 2; +} + +static char * +unpack_threadid(char *inbuf, threadref * id) +{ + char *altref; + char *limit = inbuf + BUF_THREAD_ID_SIZE; + int x, y; + + altref = (char *) id; + + while (inbuf < limit) { + x = stubhex(*inbuf++); + y = stubhex(*inbuf++); + *altref++ = (x << 4) | y; + } + return inbuf; +} +#endif + +void +int_to_threadref(threadref * id, int value) +{ + unsigned char *scan; + + scan = (unsigned char *) id; + { + int i = 4; + while (i--) + *scan++ = 0; + } + *scan++ = (value >> 24) & 0xff; + *scan++ = (value >> 16) & 0xff; + *scan++ = (value >> 8) & 0xff; + *scan++ = (value & 0xff); +} + +#ifdef CONFIG_KGDB_THREAD +static int +threadref_to_int(threadref * ref) +{ + int i, value = 0; + unsigned char *scan; + + scan = (char *) ref; + scan += 4; + i = 4; + while (i-- > 0) + value = (value << 8) | ((*scan++) & 0xff); + return value; +} + +struct task_struct * +getthread(int pid) +{ + struct task_struct *thread; + thread = find_task_by_pid(pid); + if (thread) { + return thread; + } +#if 0 + 
thread = init_tasks[0]; + do { + if (thread->pid == pid) { + return thread; + } + thread = thread->next_task; + } while (thread != init_tasks[0]); +#endif + return NULL; +} +#endif + +struct hw_breakpoint { + unsigned enabled; + unsigned type; + unsigned len; + unsigned addr; +} breakinfo[4] = { { +enabled:0}, { +enabled:0}, { +enabled:0}, { +enabled:0}}; + +void +correct_hw_break(void) +{ + int breakno; + int correctit; + int breakbit; + unsigned dr7; + + asm volatile ("movl %%db7, %0\n":"=r" (dr7) + :); + do { + unsigned addr0, addr1, addr2, addr3; + asm volatile ("movl %%db0, %0\n" + "movl %%db1, %1\n" + "movl %%db2, %2\n" + "movl %%db3, %3\n":"=r" (addr0), "=r"(addr1), + "=r"(addr2), "=r"(addr3):); + } while (0); + correctit = 0; + for (breakno = 0; breakno < 3; breakno++) { + breakbit = 2 << (breakno << 1); + if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { + correctit = 1; + dr7 |= breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + dr7 |= (((breakinfo[breakno].len << 2) | + breakinfo[breakno].type) << 16) << + (breakno << 2); + switch (breakno) { + case 0: + asm volatile ("movl %0, %%dr0\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 1: + asm volatile ("movl %0, %%dr1\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 2: + asm volatile ("movl %0, %%dr2\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 3: + asm volatile ("movl %0, %%dr3\n"::"r" + (breakinfo[breakno].addr)); + break; + } + } else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { + correctit = 1; + dr7 &= ~breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + } + } + if (correctit) { + asm volatile ("movl %0, %%db7\n"::"r" (dr7)); + } +} + +int +remove_hw_break(unsigned breakno) +{ + if (!breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 0; + return 0; +} + +int +set_hw_break(unsigned breakno, unsigned type, unsigned len, unsigned addr) +{ + if (breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 1; + breakinfo[breakno].type = type; + breakinfo[breakno].len = len; + breakinfo[breakno].addr = addr; + return 0; +} + +void +gdb_wait(void *arg) +{ + unsigned flags; + int processor; + + local_irq_save(flags); + processor = smp_processor_id(); + procindebug[processor] = 1; + current->thread.kgdbregs = arg; + spin_lock(slavecpulocks + processor); + correct_hw_break(); + procindebug[processor] = 0; + local_irq_restore(flags); +} + +void +printexceptioninfo(int exceptionNo, int errorcode, char *buffer) +{ + unsigned dr6; + int i; + switch (exceptionNo) { + case 1: /* debug exception */ + break; + case 3: /* breakpoint */ + sprintf(buffer, "Software breakpoint"); + return; + default: + sprintf(buffer, "Details not available"); + return; + } + asm volatile ("movl %%db6, %0\n":"=r" (dr6) + :); + if (dr6 & 0x4000) { + sprintf(buffer, "Single step"); + return; + } + for (i = 0; i < 4; ++i) { + if (dr6 & (1 << i)) { + sprintf(buffer, "Hardware breakpoint %d", i); + return; + } + } + sprintf(buffer, "Unknown trap"); + return; +} + +/* + * This function does all command procesing for interfacing to gdb. + * + * NOTE: The INT nn instruction leaves the state of the interrupt + * enable flag UNCHANGED. That means that when this routine + * is entered via a breakpoint (INT 3) instruction from code + * that has interrupts enabled, then interrupts will STILL BE + * enabled when this routine is entered. 
The first thing that + * we do here is disable interrupts so as to prevent recursive + * entries and bothersome serial interrupts while we are + * trying to run the serial port in polled mode. + * + * For kernel version 2.1.xx the cli() actually gets a spin lock so + * it is always necessary to do a restore_flags before returning + * so as to let go of that lock. + */ +int +handle_exception(int exceptionVector, + int signo, int err_code, struct pt_regs *linux_regs) +{ + struct task_struct *usethread = NULL; + int addr, length; + int breakno, breaktype; + char *ptr; + int newPC; + unsigned long flags = ~0UL; + int gdb_regs[NUMREGBYTES / 4]; + int i; + int dr6; + int reboot = 0; +#ifdef CONFIG_KGDB_THREAD + int nothreads; + int maxthreads; + int threadid; + threadref thref; + struct task_struct *thread = NULL; +#endif +#define regs (*linux_regs) + + /* + * If the entry is not from the kernel then return to the Linux + * trap handler and let it process the interrupt normally. + */ + if ((linux_regs->eflags & VM_MASK) || (3 & linux_regs->xcs)) { + return (0); + } + + if (kgdb_memerr_expected) { + /* + * This fault occured because of the get_char or set_char + * routines. These two routines use either eax of edx to + * indirectly reference the location in memory that they + * are working with. For a page fault, when we return + * the instruction will be retried, so we have to make + * sure that these registers point to valid memory. + */ + kgdb_memerr = 1; /* set mem error flag */ + kgdb_memerr_expected = 0; + kgdb_memerr_cnt++; /* helps in debugging */ + regs.eax = (long) &garbage_loc; /* make valid address */ + regs.edx = (long) &garbage_loc; /* make valid address */ + return (0); + } +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_nmispinlock)) +#else + if (!kgdb_nmispinlock) +#endif + { + + /* Get kgdb spinlock */ +#ifdef CONFIG_SMP + _raw_spin_lock(&kgdb_spinlock); +#else + kgdb_spinlock = 1; +#endif + + local_irq_save(flags); + + /* Disable hardware debugging while we are in kgdb */ + __asm__("movl %0,%%db7": /* no output */ + :"r"(0)); + + for (i = 0; i < NR_CPUS; i++) { + spin_lock_init(&slavecpulocks[i]); + _raw_spin_lock(&slavecpulocks[i]); + } + + if (num_online_cpus() > 1) { + /* Force other cpus in debugger */ + if (smp_call_function(gdb_wait, NULL, 0, 99) != 0) { + return (1); + } + } + + procindebug[smp_processor_id()] = 1; + } + + gdb_i386vector = exceptionVector; + gdb_i386errcode = err_code; + + /* reply to host that an exception has occurred */ + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + + putpacket(remcomOutBuffer); + + while (1 == 1) { + error = 0; + remcomOutBuffer[0] = 0; + getpacket(remcomInBuffer); + switch (remcomInBuffer[0]) { + case '?': + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + break; + case 'g': /* return the value of the CPU registers */ + if (!usethread || usethread == current) { + regs_to_gdb_regs(gdb_regs, ®s); + } else { + memset(gdb_regs, 0, NUMREGBYTES); + if (usethread->thread.kgdbregs) { + kgdb_memerr_expected = 1; + kgdb_memerr = 0; + get_char((char *) usethread->thread. + kgdbregs); + kgdb_memerr_expected = 0; + if (kgdb_memerr) { + gdb_regs[_PC] = + (int) kgdb_usercode; + } else { + regs_to_gdb_regs(gdb_regs, + usethread-> + thread. 
+ kgdbregs); + } + } else { + gdb_regs[_PC] = (int) kgdb_usercode; + } + } + mem2hex((char *) gdb_regs, remcomOutBuffer, NUMREGBYTES, + 0); + break; + case 'G': /* set the value of the CPU registers - return OK */ + hex2mem(&remcomInBuffer[1], (char *) gdb_regs, + NUMREGBYTES, 0); + if (!usethread || usethread == current) { + gdb_regs_to_regs(gdb_regs, ®s); + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "E00"); + } + break; + + /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ + case 'm': + /* TRY TO READ %x,%x. IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr)) + if (*(ptr++) == ',') + if (hexToInt(&ptr, &length)) { + ptr = 0; + mem2hex((char *) addr, + remcomOutBuffer, length, + 1); + if (kgdb_memerr) { + strcpy(remcomOutBuffer, + "E03"); + } + } + + if (ptr) { + strcpy(remcomOutBuffer, "E01"); + } + break; + + /* MAA..AA,LLLL: Write LLLL bytes at address AA.AA return OK */ + case 'M': + /* TRY TO READ '%x,%x:'. IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr)) + if (*(ptr++) == ',') + if (hexToInt(&ptr, &length)) + if (*(ptr++) == ':') { + hex2mem(ptr, + (char *) addr, + length, 1); + + if (kgdb_memerr) { + strcpy + (remcomOutBuffer, + "E03"); + } else { + strcpy + (remcomOutBuffer, + "OK"); + } + + ptr = 0; + } + if (ptr) { + strcpy(remcomOutBuffer, "E02"); + } + break; + + /* cAA..AA Continue at address AA..AA(optional) */ + /* sAA..AA Step one instruction from AA..AA(optional) */ + case 'c': + case 's': +#ifdef CONFIG_SMP + if (spin_is_locked(&kgdb_nmispinlock)) +#else + if (kgdb_nmispinlock) +#endif + { + strcpy(remcomOutBuffer, "E01"); + break; + } + + /* try to read optional parameter, pc unchanged if no parm */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr)) { + regs.eip = addr; + } + + newPC = regs.eip; + + /* clear the trace bit */ + regs.eflags &= 0xfffffeff; + + /* set the trace bit if we're stepping */ + if (remcomInBuffer[0] == 's') + regs.eflags |= 0x100; + + asm volatile ("movl %%db6, %0\n":"=r" (dr6) + :); + if (!(dr6 & 0x4000)) { + for (breakno = 0; breakno < 4; ++breakno) { + if (dr6 & (1 << breakno)) { + if (breakinfo[breakno].type == + 0) { + /* Set restore flag */ + regs.eflags |= 0x10000; + break; + } + } + } + } + correct_hw_break(); + asm volatile ("movl %0, %%db6\n"::"r" (0)); + for (i = 0; i < NR_CPUS; i++) { + _raw_spin_unlock(&slavecpulocks[i]); + } + + procindebug[smp_processor_id()] = 0; + /* Release kgdb spinlock */ +#ifdef CONFIG_SMP + _raw_spin_unlock(&kgdb_spinlock); +#else + kgdb_spinlock = 0; +#endif + if (flags != ~0UL) + local_irq_restore(flags); + return (0); + + /* kill the program */ + case 'k': + break; + + /* query */ + case 'q': + switch (remcomInBuffer[1]) { +#ifdef CONFIG_KGDB_THREAD + case 'L': + /* List threads */ + unpack_byte(remcomInBuffer + 3, &maxthreads); + unpack_threadid(remcomInBuffer + 5, &thref); + + remcomOutBuffer[0] = 'q'; + remcomOutBuffer[1] = 'M'; + remcomOutBuffer[4] = '0'; + pack_threadid(remcomOutBuffer + 5, &thref); + + threadid = threadref_to_int(&thref); + for (nothreads = 0; + nothreads < maxthreads + && threadid < pid_max; threadid++) { + thread = getthread(threadid); + if (thread) { + int_to_threadref(&thref, + threadid); + pack_threadid(remcomOutBuffer + + 21 + + nothreads * 16, + &thref); + nothreads++; + } + } + if (threadid == pid_max) { + remcomOutBuffer[4] = '1'; + } + pack_hex_byte(remcomOutBuffer + 2, nothreads); + remcomOutBuffer[21 + nothreads * 16] = '\0'; + break; + + case 'C': + /* Current thread id 
*/ + remcomOutBuffer[0] = 'Q'; + remcomOutBuffer[1] = 'C'; + threadid = current->pid; + int_to_threadref(&thref, threadid); + pack_threadid(remcomOutBuffer + 2, &thref); + remcomOutBuffer[18] = '\0'; + break; +#endif + + case 'E': + /* Print exception info */ + printexceptioninfo(exceptionVector, err_code, + remcomOutBuffer); + break; + } + break; + +#ifdef CONFIG_KGDB_THREAD + /* task related */ + case 'H': + switch (remcomInBuffer[1]) { + case 'g': + ptr = &remcomInBuffer[2]; + hexToInt(&ptr, &threadid); + thread = getthread(threadid); + if (!thread) { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + break; + } + usethread = thread; + /* follow through */ + case 'c': + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + break; + } + break; + + /* Query thread status */ + case 'T': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &threadid); + thread = getthread(threadid); + if (thread) { + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + } else { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + } + break; +#endif + + case 'r': + reboot = 1; + strcpy(remcomOutBuffer, "OK"); + break; + case 'Y': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &breakno); + ptr++; + hexToInt(&ptr, &breaktype); + ptr++; + hexToInt(&ptr, &length); + ptr++; + hexToInt(&ptr, &addr); + if (set_hw_break + (breakno & 0x3, breaktype & 0x3, length & 0x3, addr) + == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + /* Remove hardware breakpoint */ + case 'y': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &breakno); + if (remove_hw_break(breakno & 0x3) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + } /* switch */ + + /* reply to the request */ + putpacket(remcomOutBuffer); + if (reboot == 1) { + static long no_idt[2]; + __asm__ __volatile__("lidt %0"::"m"(no_idt)); + __asm__ __volatile__("int3"); + } + } +} + +/* this function is used to set up exception handlers for tracing and + breakpoints */ +void +set_debug_traps(void) +{ + /* + * linux_debug_hook is defined in traps.c. We store a pointer + * to our own exception handler into it. + */ + linux_debug_hook = handle_exception; + + /* + * In case GDB is started before us, ack any packets (presumably + * "$?#xx") sitting there. */ + putDebugChar('+'); + + initialized = 1; +} + +/* This function will generate a breakpoint exception. It is used at the + beginning of a program to sync up with a debugger and can be used + otherwise as a quick means to stop program execution and "break" into + the debugger. 
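The comment above describes breakpoint(), defined just below, as the intended
way for kernel code to stop itself and hand control to the remote gdb. A hedged
usage sketch (not part of the patch; breakpoint() stays a no-op until
set_debug_traps() has installed the hook):

	#ifdef CONFIG_X86_REMOTE_DEBUG
	extern void breakpoint(void);

	static void debug_this_path(void)
	{
		/* ... suspect code ... */
		breakpoint();	/* int3 -> handle_exception(); gdb on the
				 * serial line reports SIGTRAP and can
				 * inspect registers and memory */
		/* execution resumes here after "continue" in gdb */
	}
	#endif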
*/ + +void +breakpoint(void) +{ + if (initialized) + BREAKPOINT(); +} + +#ifdef CONFIG_GDB_CONSOLE +char gdbconbuf[BUFMAX]; + +void +gdb_console_write(struct console *co, const char *s, unsigned count) +{ + int i; + int wcount; + char *bufptr; + + if (!gdb_initialized) { + return; + } + gdbconbuf[0] = 'O'; + bufptr = gdbconbuf + 1; + while (count > 0) { + if ((count << 1) > (BUFMAX - 2)) { + wcount = (BUFMAX - 2) >> 1; + } else { + wcount = count; + } + count -= wcount; + for (i = 0; i < wcount; i++) { + bufptr = pack_hex_byte(bufptr, s[i]); + } + *bufptr = '\0'; + s += wcount; + + putpacket(gdbconbuf); + + } +} +#endif +static int __init +kgdb_opt_gdb(char *dummy) +{ + gdb_enter = 1; + return 1; +} +static int __init +kgdb_opt_gdbttyS(char *str) +{ + gdb_ttyS = simple_strtoul(str, NULL, 10); + return 1; +} +static int __init +kgdb_opt_gdbbaud(char *str) +{ + gdb_baud = simple_strtoul(str, NULL, 10); + return 1; +} + +/* + * Sequence of these lines has to be maintained because gdb option is a prefix + * of the other two options + */ + +__setup("gdbttyS=", kgdb_opt_gdbttyS); +__setup("gdbbaud=", kgdb_opt_gdbbaud); +__setup("gdb", kgdb_opt_gdb); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/head.S 900-mjb1/arch/i386/kernel/head.S --- 000-virgin/arch/i386/kernel/head.S Fri May 30 19:01:58 2003 +++ 900-mjb1/arch/i386/kernel/head.S Fri May 30 21:58:44 2003 @@ -16,6 +16,7 @@ #include #include #include +#include #define OLD_CL_MAGIC_ADDR 0x90020 #define OLD_CL_MAGIC 0xA33F @@ -249,7 +250,7 @@ is386: movl $2,%ecx # set MP call check_x87 incb ready lgdt cpu_gdt_descr - lidt idt_descr + lidt node_idt_descr # we switch to the per-node IDTs later ljmp $(__KERNEL_CS),$1f 1: movl $(__KERNEL_DS),%eax # reload all the segment registers movl %eax,%ss # after changing gdt. 
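The three __setup() handlers at the end of the gdb stub file above wire the
stub to the kernel command line. A hedged usage note (the option spellings are
taken from the __setup strings; the serial driver side is not in this excerpt):
booting with something like

	linux gdb gdbttyS=1 gdbbaud=38400

arms the stub at boot ("gdb"), selects the serial port to poll ("gdbttyS=1" for
ttyS1) and its speed ("gdbbaud="); as the comment above those lines notes, the
registration order matters because "gdb" is a prefix of the other two option
names.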
@@ -314,7 +315,7 @@ setup_idt: movw %dx,%ax /* selector = 0x0010 = cs */ movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ - lea idt_table,%edi + lea node_idt_table,%edi mov $256,%ecx rp_sidt: movl %eax,(%edi) @@ -325,7 +326,7 @@ rp_sidt: ret ENTRY(stack_start) - .long init_thread_union+8192 + .long init_thread_union+THREAD_SIZE .long __BOOT_DS /* This is the default interrupt "handler" :-) */ @@ -359,14 +360,16 @@ ignore_int: * segment size, and 32-bit linear address value: */ -.globl idt_descr +.globl node_idt_descr .globl cpu_gdt_descr ALIGN .word 0 # 32-bit align idt_desc.address -idt_descr: +node_idt_descr: .word IDT_ENTRIES*8-1 # idt contains 256 entries - .long idt_table + .long node_idt_table + + .fill MAX_NUMNODES-1,8,0 # boot GDT descriptor (later on used by CPU#0): .word 0 # 32 bit align gdt_desc.address diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/i386_ksyms.c 900-mjb1/arch/i386/kernel/i386_ksyms.c --- 000-virgin/arch/i386/kernel/i386_ksyms.c Fri May 30 19:01:58 2003 +++ 900-mjb1/arch/i386/kernel/i386_ksyms.c Fri May 30 22:04:52 2003 @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -33,6 +34,8 @@ #include #include #include +#include +#include extern void dump_thread(struct pt_regs *, struct user *); extern spinlock_t rtc_lock; @@ -95,7 +98,7 @@ EXPORT_SYMBOL(apm_info); EXPORT_SYMBOL(__io_virt_debug); #endif -EXPORT_SYMBOL_NOVERS(__down_failed); +EXPORT_SYMBOL_NOVERS(__down_failed_wq); EXPORT_SYMBOL_NOVERS(__down_failed_interruptible); EXPORT_SYMBOL_NOVERS(__down_failed_trylock); EXPORT_SYMBOL_NOVERS(__up_wakeup); @@ -146,6 +149,20 @@ EXPORT_SYMBOL(smp_num_siblings); EXPORT_SYMBOL(cpu_sibling_map); #endif +#ifdef CONFIG_X86_REMOTE_DEBUG +void __this_fixmap_does_not_exist(void) +{ + BUG(); +} +EXPORT_SYMBOL(__this_fixmap_does_not_exist); + +void __br_lock_usage_bug(void) +{ + BUG(); +} +EXPORT_SYMBOL(__br_lock_usage_bug); +#endif + #ifdef CONFIG_SMP EXPORT_SYMBOL(cpu_data); EXPORT_SYMBOL(cpu_online_map); @@ -179,6 +196,7 @@ EXPORT_SYMBOL(rtc_lock); EXPORT_SYMBOL_GPL(set_nmi_callback); EXPORT_SYMBOL_GPL(unset_nmi_callback); +EXPORT_SYMBOL_GPL(die_chain); #undef memcpy #undef memset @@ -207,4 +225,26 @@ EXPORT_SYMBOL(kmap_atomic_to_page); #ifdef CONFIG_EDD_MODULE EXPORT_SYMBOL(edd); EXPORT_SYMBOL(eddnr); +#endif + +#ifdef CONFIG_CRASH_DUMP_MODULE +#ifdef CONFIG_SMP +extern irq_desc_t irq_desc[NR_IRQS]; +extern unsigned long irq_affinity[NR_IRQS]; +extern void stop_this_cpu(void *); +EXPORT_SYMBOL(irq_desc); +EXPORT_SYMBOL(irq_affinity); +EXPORT_SYMBOL(stop_this_cpu); +EXPORT_SYMBOL(dump_send_ipi); +#endif +extern int pfn_is_ram(unsigned long); +EXPORT_SYMBOL(pfn_is_ram); +#ifdef ARCH_HAS_NMI_WATCHDOG +EXPORT_SYMBOL(touch_nmi_watchdog); +#endif +#endif + +#ifdef CONFIG_X86_STACK_CHECK +extern void mcount(void); +EXPORT_SYMBOL(mcount); #endif diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/init_task.c 900-mjb1/arch/i386/kernel/init_task.c --- 000-virgin/arch/i386/kernel/init_task.c Thu Feb 13 11:08:02 2003 +++ 900-mjb1/arch/i386/kernel/init_task.c Fri May 30 19:24:43 2003 @@ -14,6 +14,14 @@ static struct signal_struct init_signals static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +union thread_union init_irq_union + __attribute__((__section__(".data.init_task"))); + +#ifdef CONFIG_X86_STACK_CHECK +union thread_union stack_overflow_stack + __attribute__((__section__(".data.init_task"))); +#endif + /* * Initial thread structure. 
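init_irq_union above gives the boot CPU a statically allocated interrupt stack
in .data.init_task (secondary CPUs get theirs later in this series from
setup_irq_stack()). For reference, and unchanged by this patch, a thread_union
overlays the thread_info at the bottom of a THREAD_SIZE-sized, THREAD_SIZE-
aligned block, as in the stock kernel headers:

	union thread_union {
		struct thread_info thread_info;	/* at the lowest address */
		unsigned long stack[THREAD_SIZE/sizeof(long)];
	};

which is why "esp & ~(THREAD_SIZE-1)" finds the thread_info and
"esp & (THREAD_SIZE-1)" measures how much stack is left, arithmetic the
stack-check code later in this patch relies on.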
* diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/io_apic.c 900-mjb1/arch/i386/kernel/io_apic.c --- 000-virgin/arch/i386/kernel/io_apic.c Fri May 30 19:01:59 2003 +++ 900-mjb1/arch/i386/kernel/io_apic.c Fri May 30 22:02:33 2003 @@ -271,7 +271,7 @@ static void set_ioapic_affinity (unsigne spin_unlock_irqrestore(&ioapic_lock, flags); } -#if defined(CONFIG_SMP) +#if defined(CONFIG_IRQBALANCE) # include /* kernel_thread() */ # include /* kstat */ # include /* kmalloc() */ @@ -665,8 +665,6 @@ static int __init irqbalance_disable(cha __setup("noirqbalance", irqbalance_disable); -static void set_ioapic_affinity (unsigned int irq, unsigned long mask); - static inline void move_irq(int irq) { /* note - we hold the desc->lock */ @@ -678,9 +676,9 @@ static inline void move_irq(int irq) __initcall(balanced_irq_init); -#else /* !SMP */ +#else /* !IRQBALANCE */ static inline void move_irq(int irq) { } -#endif /* defined(CONFIG_SMP) */ +#endif /* defined(IRQBALANCE) */ /* @@ -1117,24 +1115,59 @@ static inline int IO_APIC_irq_trigger(in } int irq_vector[NR_IRQS] = { FIRST_DEVICE_VECTOR , 0 }; +int __initdata vector_allocated[MAX_NUMNODES][FIRST_SYSTEM_VECTOR]; -static int __init assign_irq_vector(int irq) -{ - static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; - if (IO_APIC_VECTOR(irq) > 0) - return IO_APIC_VECTOR(irq); +/* + * This is the per node vector allocator, it will only work for systems which + * have ioapics which can only deliver vectors to cpus on the same node and + * thus have hardware enforced ioapic/irq node affinity. + * + * However currently the only i386 systems which have this interrupt + * dispatching/servicing architecture are NUMAQ and x440. We try and 'share' + * vectors where possible to simplify cases where an irq can be serviced on + * multiple nodes due to it being present on multiple busses/nodes. + * The first pass on node0 will ensure we catch these node 'shared' irqs. + */ +static int __init assign_irq_vector(int irq, int node) +{ + static int offset[MAX_NUMNODES]; + static int nr_assigned[MAX_NUMNODES] = {[0 ... MAX_NUMNODES-1] = 1}; + static int current_vector[MAX_NUMNODES] = + {[0 ... 
MAX_NUMNODES-1] = FIRST_DEVICE_VECTOR}; + + int vector; + + Dprintk("requesting vector for node%d/irq%d\n", node, irq); + vector = IO_APIC_VECTOR(irq); + if (vector > 0) { + Dprintk("returning previous allocation vector0x%x\n", vector); + vector_allocated[node][vector]++; + return vector; + } + + if (++nr_assigned[node] > NR_IRQ_VECTORS) + return -ENOSPC; + next: - current_vector += 8; - if (current_vector == SYSCALL_VECTOR) + current_vector[node] += 8; + if (current_vector[node] == SYSCALL_VECTOR) goto next; - if (current_vector >= FIRST_SYSTEM_VECTOR) { - offset = (offset + 1) & 7; - current_vector = FIRST_DEVICE_VECTOR + offset; + if (current_vector[node] > FIRST_SYSTEM_VECTOR) { + offset[node] = (offset[node]+1) & 7; + current_vector[node] = FIRST_DEVICE_VECTOR + offset[node]; } - IO_APIC_VECTOR(irq) = current_vector; - return current_vector; + vector = current_vector[node]; + if (vector_allocated[node][vector]) + goto next; + + vector_allocated[node][vector]++; + IO_APIC_VECTOR(irq) = vector; + Dprintk("returning new allocation node%d/irq%d -> vector0x%x\n", + node, irq, vector); + + return vector; } static struct hw_interrupt_type ioapic_level_irq_type; @@ -1143,7 +1176,7 @@ static struct hw_interrupt_type ioapic_e void __init setup_IO_APIC_irqs(void) { struct IO_APIC_route_entry entry; - int apic, pin, idx, irq, first_notcon = 1, vector; + int apic, pin, idx, irq, first_notcon = 1, vector, bus, node; unsigned long flags; printk(KERN_DEBUG "init IO_APIC IRQs\n"); @@ -1160,11 +1193,12 @@ void __init setup_IO_APIC_irqs(void) entry.dest_mode = INT_DEST_MODE; entry.mask = 0; /* enable IRQ */ entry.dest.logical.logical_dest = TARGET_CPUS; - + idx = find_irq_entry(apic,pin,mp_INT); if (idx == -1) { if (first_notcon) { - printk(KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin); + printk(KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", + mp_ioapics[apic].mpc_apicid, pin); first_notcon = 0; } else printk(", %d-%d", mp_ioapics[apic].mpc_apicid, pin); @@ -1174,12 +1208,21 @@ void __init setup_IO_APIC_irqs(void) entry.trigger = irq_trigger(idx); entry.polarity = irq_polarity(idx); + bus = mp_irqs[idx].mpc_srcbus; + node = mp_bus_id_to_node[bus]; + if (irq_trigger(idx)) { entry.trigger = 1; entry.mask = 1; } irq = pin_2_irq(idx, apic, pin); + if (irq >= NR_IRQS) { + printk("skipping irq%d on node%d/bus%d/ioapic%d out of IRQs!\n", + irq, node, bus, apic); + continue; + } + /* * skip adding the timer int on secondary nodes, which causes * a small but painful rift in the time-space continuum @@ -1193,7 +1236,10 @@ void __init setup_IO_APIC_irqs(void) continue; if (IO_APIC_IRQ(irq)) { - vector = assign_irq_vector(irq); + vector = assign_irq_vector(irq, node); + if (vector < 0) + continue; + entry.vector = vector; if (IO_APIC_irq_trigger(irq)) @@ -1201,11 +1247,15 @@ void __init setup_IO_APIC_irqs(void) else irq_desc[irq].handler = &ioapic_edge_irq_type; - set_intr_gate(vector, interrupt[irq]); - + Dprintk("irq_setup: node%d/bus%d/ioapic%d/vector0x%x - irq%d %p\n", + node, bus, apic, vector, irq, interrupt[irq]); + + node_set_intr_gate(node, vector, interrupt[irq]); + if (!apic && (irq < 16)) disable_8259A_irq(irq); } + spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); @@ -1896,6 +1946,7 @@ static inline void init_IO_APIC_traps(vo * so default to an old-fashioned 8259 * interrupt if we can.. 
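The per-node assign_irq_vector() above hands out vectors in strides of eight so
successive allocations are spread across the vector space, skips the syscall
vector, and wraps with a rotating offset once it runs past the device range. A
small stand-alone model of that walk (illustrative only; the per-node
bookkeeping and vector sharing are omitted, and the constants are the usual
2.5-era i386 values, which really live in the headers, not here):

	#include <stdio.h>

	#define FIRST_DEVICE_VECTOR	0x31
	#define SYSCALL_VECTOR		0x80
	#define FIRST_SYSTEM_VECTOR	0xef

	int main(void)
	{
		int vector = FIRST_DEVICE_VECTOR, offset = 0, i;

		for (i = 0; i < 5; i++) {
	next:
			vector += 8;		/* next stride */
			if (vector == SYSCALL_VECTOR)
				goto next;	/* never hand out int 0x80 */
			if (vector > FIRST_SYSTEM_VECTOR) {
				offset = (offset + 1) & 7;	/* new pass, */
				vector = FIRST_DEVICE_VECTOR + offset; /* shifted by one */
			}
			printf("allocation %d -> vector 0x%x\n", i, vector);
		}
		return 0;	/* prints 0x39, 0x41, 0x49, 0x51, 0x59 */
	}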
*/ + printk(KERN_DEBUG "irq%d not serviced by IOAPIC\n", irq); if (irq < 16) make_8259A_irq(irq); else @@ -2034,9 +2085,10 @@ static inline void check_timer(void) * get/set the timer IRQ vector: */ disable_8259A_irq(0); - vector = assign_irq_vector(0); + vector = assign_irq_vector(0, cpu_to_node(smp_processor_id())); + /* This gets reserved on all nodes as FIRST_DEVICE_VECTOR */ set_intr_gate(vector, interrupt[0]); - + /* * Subtle, code in do_timer_interrupt() expects an AEOI * mode for the 8259A whenever interrupts are routed @@ -2291,10 +2343,13 @@ int io_apic_set_pci_routing (int ioapic, { struct IO_APIC_route_entry entry; unsigned long flags; + int node, bus, vector; + + if (irq >= NR_IRQS) + return -ENOSPC; if (!IO_APIC_IRQ(irq)) { - printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0/n", - ioapic); + printk(KERN_ERR "ioapic%d invalid reference to IRQ0/n", ioapic); return -EINVAL; } @@ -2314,17 +2369,26 @@ int io_apic_set_pci_routing (int ioapic, entry.polarity = 1; /* Low active */ add_pin_to_irq(irq, ioapic, pin); + + /* XXX verify this with an x440 and plain ACPI/SMP -zwane */ + bus = mp_irqs[pin].mpc_srcbus; + node = mp_bus_id_to_node[bus]; + + vector = assign_irq_vector(irq, node); + if (vector < 0) + return -ENOSPC; - entry.vector = assign_irq_vector(irq); - - printk(KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> " - "IRQ %d)\n", ioapic, + entry.vector = vector; + printk(KERN_DEBUG "NODE[%d] IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> " + "IRQ %d)\n", node, ioapic, mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq); irq_desc[irq].handler = &ioapic_level_irq_type; - set_intr_gate(entry.vector, interrupt[irq]); - + printk(KERN_DEBUG "irq_route: node%d/bus%d/ioapic%d/vector0x%x - irq%d %p\n", + node, bus, ioapic, entry.vector, irq, interrupt[irq]); + node_set_intr_gate(node, entry.vector, interrupt[irq]); + if (!ioapic && (irq < 16)) disable_8259A_irq(irq); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/irq.c 900-mjb1/arch/i386/kernel/irq.c --- 000-virgin/arch/i386/kernel/irq.c Fri May 30 19:01:59 2003 +++ 900-mjb1/arch/i386/kernel/irq.c Fri May 30 19:23:03 2003 @@ -342,7 +342,8 @@ void enable_irq(unsigned int irq) * SMP cross-CPU interrupts have their own specific * handlers). */ -asmlinkage unsigned int do_IRQ(struct pt_regs regs) +struct pt_regs * IRQHANDLER(do_IRQ(struct pt_regs *regs)); +struct pt_regs * do_IRQ(struct pt_regs *regs) { /* * We ack quickly, we don't want the irq controller @@ -354,7 +355,7 @@ asmlinkage unsigned int do_IRQ(struct pt * 0 return value means that this irq is already being * handled by some other CPU. 
(or is disabled) */ - int irq = regs.orig_eax & 0xff; /* high bits used in ret_from_ code */ + int irq = regs->orig_eax & 0xff; /* high bits used in ret_from_ code */ int cpu = smp_processor_id(); irq_desc_t *desc = irq_desc + irq; struct irqaction * action; @@ -419,7 +420,7 @@ asmlinkage unsigned int do_IRQ(struct pt */ for (;;) { spin_unlock(&desc->lock); - handle_IRQ_event(irq, ®s, action); + handle_IRQ_event(irq, regs, action); spin_lock(&desc->lock); if (likely(!(desc->status & IRQ_PENDING))) @@ -438,7 +439,7 @@ out: irq_exit(); - return 1; + return regs; } /** @@ -898,8 +899,9 @@ static int irq_affinity_write_proc (stru return -EINVAL; irq_affinity[irq] = new_value; +#ifndef CONFIG_X86_SUMMIT irq_desc[irq].handler->set_affinity(irq, new_value); - +#endif return full_count; } diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/kprobes.c 900-mjb1/arch/i386/kernel/kprobes.c --- 000-virgin/arch/i386/kernel/kprobes.c Wed Dec 31 16:00:00 1969 +++ 900-mjb1/arch/i386/kernel/kprobes.c Fri May 30 19:15:14 2003 @@ -0,0 +1,160 @@ +/* + * Support for kernel probes. + * (C) 2002 Vamsi Krishna S . + */ + +#include +#include +#include +#include +#include + +/* kprobe_status settings */ +#define KPROBE_HIT_ACTIVE 0x00000001 +#define KPROBE_HIT_SS 0x00000002 + +static struct kprobe *current_kprobe; +static unsigned long kprobe_status, kprobe_old_eflags, kprobe_saved_eflags; + +/* + * returns non-zero if opcode modifies the interrupt flag. + */ +static inline int is_IF_modifier(u8 opcode) +{ + switch(opcode) { + case 0xfa: /* cli */ + case 0xfb: /* sti */ + case 0xcf: /* iret/iretd */ + case 0x9d: /* popf/popfd */ + return 1; + } + return 0; +} + +static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs) +{ + *p->addr = p->opcode; + regs->eip = (unsigned long)p->addr; +} + +/* + * Interrupts are disabled on entry as trap3 is an interrupt gate and they + * remain disabled thorough out this function. + */ +int kprobe_handler(struct pt_regs *regs) +{ + struct kprobe *p; + int ret = 0; + u8 *addr = (u8 *)(regs->eip-1); + + /* We're in an interrupt, but this is clear and BUG()-safe. */ + preempt_disable(); + + /* Check we're not actually recursing */ + if (kprobe_running()) { + /* We *are* holding lock here, so this is safe. + Disarm the probe we just hit, and ignore it. */ + p = get_kprobe(addr); + if (p) { + disarm_kprobe(p, regs); + ret = 1; + } + /* If it's not ours, can't be delete race, (we hold lock). */ + goto no_kprobe; + } + + lock_kprobes(); + p = get_kprobe(addr); + if (!p) { + unlock_kprobes(); + /* Unregistered (on another cpu) after this hit? Ignore */ + if (*addr != BREAKPOINT_INSTRUCTION) + ret = 1; + /* Not one of ours: let kernel handle it */ + goto no_kprobe; + } + + kprobe_status = KPROBE_HIT_ACTIVE; + current_kprobe = p; + kprobe_saved_eflags = kprobe_old_eflags + = (regs->eflags & (TF_MASK|IF_MASK)); + if (is_IF_modifier(p->opcode)) + kprobe_saved_eflags &= ~IF_MASK; + + p->pre_handler(p, regs); + + regs->eflags |= TF_MASK; + regs->eflags &= ~IF_MASK; + + /* We hold lock, now we remove breakpoint and single step. */ + disarm_kprobe(p, regs); + kprobe_status = KPROBE_HIT_SS; + return 1; + +no_kprobe: + preempt_enable_no_resched(); + return ret; +} + +static void rearm_kprobe(struct kprobe *p, struct pt_regs *regs) +{ + regs->eflags &= ~TF_MASK; + *p->addr = BREAKPOINT_INSTRUCTION; +} + +/* + * Interrupts are disabled on entry as trap1 is an interrupt gate and they + * remain disabled thorough out this function. And we hold kprobe lock. 
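kprobe_handler() above is the arch half of kprobes: the generic layer
(kernel/kprobes.c, not in this excerpt) replaces the first byte of a probed
instruction with int3, this handler runs the pre_handler and single-steps the
displaced opcode, and post_kprobe_handler() below restores the breakpoint. A
hedged registration sketch, assuming the generic API this file plugs into
(register_kprobe() and the probe-point choice are assumptions taken from that
layer, not from this patch):

	#include <linux/kernel.h>
	#include <linux/sched.h>
	#include <linux/kprobes.h>

	static int my_pre(struct kprobe *p, struct pt_regs *regs)
	{
		/* called from kprobe_handler() with interrupts disabled,
		 * before the displaced instruction is single-stepped */
		printk(KERN_INFO "probe hit at %p, eip=%08lx\n",
		       p->addr, regs->eip);
		return 0;
	}

	static struct kprobe my_probe = {
		.addr		= (void *) do_fork,	/* hypothetical probe point */
		.pre_handler	= my_pre,
	};

	/* register_kprobe(&my_probe) would then plant BREAKPOINT_INSTRUCTION
	 * at do_fork and route hits through the handlers in this file. */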
+ */ +int post_kprobe_handler(struct pt_regs *regs) +{ + if (!kprobe_running()) + return 0; + + if (current_kprobe->post_handler) + current_kprobe->post_handler(current_kprobe, regs, 0); + + /* + * We singlestepped with interrupts disabled. So, the result on + * the stack would be incorrect for "pushfl" instruction. + * Note that regs->esp is actually the top of the stack when the + * trap occurs in kernel space. + */ + if (current_kprobe->opcode == 0x9c) { /* pushfl */ + regs->esp &= ~(TF_MASK | IF_MASK); + regs->esp |= kprobe_old_eflags; + } + + rearm_kprobe(current_kprobe, regs); + regs->eflags |= kprobe_saved_eflags; + + unlock_kprobes(); + preempt_enable_no_resched(); + + /* + * if somebody else is singlestepping across a probe point, eflags + * will have TF set, in which case, continue the remaining processing + * of do_debug, as if this is not a probe hit. + */ + if (regs->eflags & TF_MASK) + return 0; + + return 1; +} + +/* Interrupts disabled, kprobe_lock held. */ +int kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + if (current_kprobe->fault_handler + && current_kprobe->fault_handler(current_kprobe, regs, trapnr)) + return 1; + + if (kprobe_status & KPROBE_HIT_SS) { + rearm_kprobe(current_kprobe, regs); + regs->eflags |= kprobe_old_eflags; + + unlock_kprobes(); + preempt_enable_no_resched(); + } + return 0; +} diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/nmi.c 900-mjb1/arch/i386/kernel/nmi.c --- 000-virgin/arch/i386/kernel/nmi.c Sun Apr 20 19:34:57 2003 +++ 900-mjb1/arch/i386/kernel/nmi.c Fri May 30 19:24:56 2003 @@ -23,7 +23,9 @@ #include #include #include +#include +#include #include #include #include @@ -33,6 +35,20 @@ static unsigned int nmi_hz = HZ; unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ extern void show_registers(struct pt_regs *regs); +#ifdef CONFIG_X86_REMOTE_DEBUG +extern gdb_debug_hook * linux_debug_hook; +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) \ +{ \ + if (linux_debug_hook != (gdb_debug_hook *) NULL && !user_mode(regs)) \ + { \ + (*linux_debug_hook)(trapnr, signr, error_code, regs); \ + after; \ + } \ +} +#else +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) +#endif + #define K7_EVNTSEL_ENABLE (1 << 22) #define K7_EVNTSEL_INT (1 << 20) #define K7_EVNTSEL_OS (1 << 17) @@ -390,12 +406,59 @@ void nmi_watchdog_tick (struct pt_regs * sum = irq_stat[cpu].apic_timer_irqs; if (last_irq_sums[cpu] == sum) { +#ifdef CONFIG_X86_REMOTE_DEBUG +#ifdef CONFIG_SMP + if (spin_is_locked(&kgdb_spinlock)) +#else + if (kgdb_spinlock) +#endif + { + /* We are inside kgdb, this isn't a stuck cpu */ + alert_counter[cpu] = 0; + } else { +#ifdef CONFIG_SMP + if (spin_is_locked(&kgdb_nmispinlock)) +#else + if (kgdb_nmispinlock) +#endif + { + if (!procindebug[cpu]) { + procindebug[cpu] = 1; + current->thread.kgdbregs = regs; + while (1) { + /* nothing */ + } + } + return; + } + } +#endif /* * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... 
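For scale: nmi_watchdog_tick() above runs once per watchdog NMI and nmi_hz is
initialized to HZ, so the alert_counter threshold of 5*nmi_hz just below
corresponds to roughly five seconds of the per-CPU apic_timer_irqs count not
advancing. With this patch a stuck CPU is first handed to kgdb (if the kgdb NMI
lock can be taken) before the usual lockup report runs.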
*/ alert_counter[cpu]++; if (alert_counter[cpu] == 5*nmi_hz) { +#ifdef CONFIG_X86_REMOTE_DEBUG +#ifdef CONFIG_SMP + if (spin_trylock(&kgdb_nmispinlock)) +#else + kgdb_nmispinlock = 1; +#endif + { + procindebug[cpu] = 1; + CHK_REMOTE_DEBUG(2,SIGBUS,0,regs,) + } +#ifdef CONFIG_SMP + else { + procindebug[cpu] = 1; + current->thread.kgdbregs = regs; + while (1) { + /* nothing */ + } + } +#endif +#endif spin_lock(&nmi_print_lock); /* * We are in trouble anyway, lets at least try @@ -404,6 +467,7 @@ void nmi_watchdog_tick (struct pt_regs * bust_spinlocks(1); printk("NMI Watchdog detected LOCKUP on CPU%d, eip %08lx, registers:\n", cpu, regs->eip); show_registers(regs); + notify_die(DIE_WATCHDOG, "nmi_watchdog", regs, 0); printk("console shuts up ...\n"); console_silent(); spin_unlock(&nmi_print_lock); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/process.c 900-mjb1/arch/i386/kernel/process.c --- 000-virgin/arch/i386/kernel/process.c Fri May 30 19:01:59 2003 +++ 900-mjb1/arch/i386/kernel/process.c Fri May 30 19:24:43 2003 @@ -160,7 +160,25 @@ static int __init idle_setup (char *str) __setup("idle=", idle_setup); -void show_regs(struct pt_regs * regs) +void stack_overflow(unsigned long esp, unsigned long eip) +{ + int panicing = ((esp&(THREAD_SIZE-1)) <= STACK_PANIC); + + printk( "esp: 0x%lx masked: 0x%lx STACK_PANIC:0x%x %d %d\n", + esp, (esp&(THREAD_SIZE-1)), STACK_PANIC, (((esp&(THREAD_SIZE-1)) <= STACK_PANIC)), panicing ); + + if (panicing) + print_symbol("stack overflow from %s\n", eip); + else + print_symbol("excessive stack use from %s\n", eip); + printk("esp: %p\n", (void*)esp); + show_trace((void*)esp); + + if (panicing) + panic("stack overflow\n"); +} + +asmlinkage void show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; @@ -449,6 +467,7 @@ struct task_struct * __switch_to(struct /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ + next_p->thread_info->irq_stack = prev_p->thread_info->irq_stack; unlazy_fpu(prev_p); /* diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/reboot.c 900-mjb1/arch/i386/kernel/reboot.c --- 000-virgin/arch/i386/kernel/reboot.c Fri May 30 19:01:59 2003 +++ 900-mjb1/arch/i386/kernel/reboot.c Fri May 30 19:24:56 2003 @@ -8,8 +8,10 @@ #include #include #include +#include #include "mach_reboot.h" + /* * Power off function, if any */ @@ -248,7 +250,8 @@ void machine_restart(char * __unused) * Stop all CPUs and turn off local APICs and the IO-APIC, so * other OSs see a clean IRQ state. 
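stack_overflow(), added to process.c above, decides between a warning and a
panic purely from how deep %esp has sunk inside the THREAD_SIZE-aligned stack
block. A stand-alone model of that arithmetic (illustrative only; THREAD_SIZE
and STACK_PANIC here are assumed values, the real ones come from the
stack-check headers in this series):

	#include <stdio.h>

	#define THREAD_SIZE	8192UL	/* assume 8K kernel stacks */
	#define STACK_PANIC	512UL	/* assumed threshold, for illustration */

	static void check(unsigned long esp)
	{
		/* offset of esp inside the aligned stack block ==
		 * bytes left before we run into thread_info */
		unsigned long left = esp & (THREAD_SIZE - 1);

		printf("esp=%#lx: %lu bytes left -> %s\n", esp, left,
		       left <= STACK_PANIC ? "panic" : "warn and continue");
	}

	int main(void)
	{
		check(0xc12fe100);	/* 256 bytes left: panics */
		check(0xc12ffe00);	/* 7680 bytes left: warning only */
		return 0;
	}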
*/ - smp_send_stop(); + if (notify_die(DIE_STOP,"cpustop",0,0) != NOTIFY_BAD) + smp_send_stop(); disable_IO_APIC(); #endif diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/semaphore.c 900-mjb1/arch/i386/kernel/semaphore.c --- 000-virgin/arch/i386/kernel/semaphore.c Sun Dec 1 10:00:12 2002 +++ 900-mjb1/arch/i386/kernel/semaphore.c Fri May 30 22:04:52 2003 @@ -15,6 +15,7 @@ #include #include #include +#include #include /* @@ -53,15 +54,20 @@ void __up(struct semaphore *sem) wake_up(&sem->wait); } -void __down(struct semaphore * sem) +int __down_wq(struct semaphore * sem, wait_queue_t *wait) { struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); + DECLARE_WAITQUEUE(local_wait, tsk); unsigned long flags; - tsk->state = TASK_UNINTERRUPTIBLE; + if (is_sync_wait(wait)) + tsk->state = TASK_UNINTERRUPTIBLE; + if (!wait) { + wait = &local_wait; + } + spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); + add_wait_queue_exclusive_locked(&sem->wait, wait); sem->sleepers++; for (;;) { @@ -79,17 +85,23 @@ void __down(struct semaphore * sem) sem->sleepers = 1; /* us - see -1 above */ spin_unlock_irqrestore(&sem->wait.lock, flags); + if (!is_sync_wait(wait)) + return -EIOCBRETRY; + schedule(); spin_lock_irqsave(&sem->wait.lock, flags); tsk->state = TASK_UNINTERRUPTIBLE; } - remove_wait_queue_locked(&sem->wait, &wait); + if (is_sync_wait(wait) || !list_empty(&wait->task_list)) + remove_wait_queue_locked(&sem->wait, wait); wake_up_locked(&sem->wait); spin_unlock_irqrestore(&sem->wait.lock, flags); tsk->state = TASK_RUNNING; + return 0; } + int __down_interruptible(struct semaphore * sem) { int retval = 0; @@ -189,19 +201,17 @@ int __down_trylock(struct semaphore * se asm( ".text\n" ".align 4\n" -".globl __down_failed\n" -"__down_failed:\n\t" +".globl __down_failed_wq\n" +"__down_failed_wq:\n\t" #if defined(CONFIG_FRAME_POINTER) "pushl %ebp\n\t" "movl %esp,%ebp\n\t" #endif - "pushl %eax\n\t" "pushl %edx\n\t" "pushl %ecx\n\t" - "call __down\n\t" + "call __down_wq\n\t" "popl %ecx\n\t" "popl %edx\n\t" - "popl %eax\n\t" #if defined(CONFIG_FRAME_POINTER) "movl %ebp,%esp\n\t" "popl %ebp\n\t" diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/setup.c 900-mjb1/arch/i386/kernel/setup.c --- 000-virgin/arch/i386/kernel/setup.c Fri May 30 19:01:59 2003 +++ 900-mjb1/arch/i386/kernel/setup.c Fri May 30 19:26:44 2003 @@ -978,6 +978,9 @@ void __init setup_arch(char **cmdline_p) if (smp_found_config) get_smp_config(); #endif +#ifdef CONFIG_X86_SUMMIT + setup_summit(); +#endif register_memory(max_low_pfn); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/smp.c 900-mjb1/arch/i386/kernel/smp.c --- 000-virgin/arch/i386/kernel/smp.c Fri May 30 19:01:59 2003 +++ 900-mjb1/arch/i386/kernel/smp.c Fri May 30 19:24:56 2003 @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -144,6 +145,13 @@ inline void __send_IPI_shortcut(unsigned */ cfg = __prepare_ICR(shortcut, vector); + if (vector == DUMP_VECTOR) { + /* + * Setup DUMP IPI to be delivered as an NMI + */ + cfg = (cfg&~APIC_VECTOR_MASK)|APIC_DM_NMI; + } + /* * Send the IPI. The write to APIC_ICR fires this off. */ @@ -305,7 +313,8 @@ static inline void leave_mm (unsigned lo * 2) Leave the mm if we are in the lazy tlb mode. 
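The semaphore.c change above teaches the slow path about asynchronous waiters:
__down() becomes __down_wq(), which keeps the old blocking behaviour when
handed a NULL wait_queue_t but, given a caller-supplied (async) wait entry,
queues it and bails out with -EIOCBRETRY instead of scheduling. A hedged sketch
of the resulting contract (the wrappers real AIO callers go through are
elsewhere in this series and not shown; in the real kernel this path is only
reached from the down() fast path after the count has been decremented):

	/* illustrative only */
	static int acquire_maybe_async(struct semaphore *sem, wait_queue_t *wait)
	{
		int ret = __down_wq(sem, wait);	/* wait == NULL: classic
						 * sleeping down() */
		if (ret == -EIOCBRETRY) {
			/* async case: the wait entry is parked on sem->wait;
			 * retry from the wakeup, do not touch the semaphore */
			return ret;
		}
		return 0;	/* semaphore acquired */
	}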
*/ -asmlinkage void smp_invalidate_interrupt (void) +struct pt_regs * IRQHANDLER(smp_invalidate_interrupt(struct pt_regs *regs)); +struct pt_regs * smp_invalidate_interrupt(struct pt_regs *regs) { unsigned long cpu; @@ -336,6 +345,7 @@ asmlinkage void smp_invalidate_interrupt out: put_cpu_no_resched(); + return regs; } static void flush_tlb_others (unsigned long cpumask, struct mm_struct *mm, @@ -450,6 +460,11 @@ void flush_tlb_all(void) on_each_cpu(do_flush_tlb_all, 0, 1, 1); } +void dump_send_ipi(void) +{ + send_IPI_allbutself(DUMP_VECTOR); +} + /* * this function sends a 'reschedule' IPI to another CPU. * it goes straight through and wastes no time serializing @@ -509,10 +524,17 @@ int smp_call_function (void (*func) (voi { struct call_data_struct data; int cpus = num_online_cpus()-1; + int count = 0; + int gdb; - if (!cpus) + if (cpus <= 0) return 0; + gdb = 0; + if (wait == 99) { + wait = 0; + gdb = 1; + } data.func = func; data.info = info; atomic_set(&data.started, 0); @@ -528,18 +550,33 @@ int smp_call_function (void (*func) (voi send_IPI_allbutself(CALL_FUNCTION_VECTOR); /* Wait for response */ - while (atomic_read(&data.started) != cpus) + while (atomic_read(&data.started) != cpus) { + if (gdb) { + if (count++ == 2000000) { + printk("%s: timeout\n", __FUNCTION__); + break; + } + if (count == 1000000) { + printk("looks bad\n"); + printk("cpus=%d, started=%d\n", cpus, + atomic_read(&data.started)); + } + if (count > 1000000) + udelay(1); + } barrier(); + } if (wait) while (atomic_read(&data.finished) != cpus) barrier(); + spin_unlock(&call_lock); return 0; } -static void stop_this_cpu (void * dummy) +void stop_this_cpu (void * dummy) { /* * Remove this CPU: @@ -570,14 +607,17 @@ void smp_send_stop(void) * all the work is done automatically when * we return from the interrupt. 
*/ -asmlinkage void smp_reschedule_interrupt(void) +struct pt_regs * IRQHANDLER(smp_reschedule_interrupt(struct pt_regs *regs)); +struct pt_regs * smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); + return regs; } -asmlinkage void smp_call_function_interrupt(void) +struct pt_regs * IRQHANDLER(smp_call_function_interrupt(struct pt_regs *regs)); +struct pt_regs * smp_call_function_interrupt(struct pt_regs *regs) { - void (*func) (void *info) = call_data->func; + void (*func) (void *info, struct pt_regs *) = (void (*)(void *, struct pt_regs*))call_data->func; void *info = call_data->info; int wait = call_data->wait; @@ -592,12 +632,12 @@ asmlinkage void smp_call_function_interr * At this point the info structure may be out of scope unless wait==1 */ irq_enter(); - (*func)(info); + (*func)(info, regs); irq_exit(); if (wait) { mb(); atomic_inc(&call_data->finished); } + return regs; } - diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/smpboot.c 900-mjb1/arch/i386/kernel/smpboot.c --- 000-virgin/arch/i386/kernel/smpboot.c Fri May 30 19:01:59 2003 +++ 900-mjb1/arch/i386/kernel/smpboot.c Fri May 30 21:58:44 2003 @@ -45,9 +45,11 @@ #include #include +#include #include #include #include +#include #include #include @@ -62,7 +64,7 @@ int smp_num_siblings = 1; int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */ /* Bitmask of currently online CPUs */ -unsigned long cpu_online_map; +unsigned long cpu_online_map = 1; static volatile unsigned long cpu_callin_map; volatile unsigned long cpu_callout_map; @@ -71,6 +73,11 @@ static unsigned long smp_commenced_mask; /* Per CPU bogomips and other parameters */ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; +/* Per CPU interrupt stacks */ +extern union thread_union init_irq_union; +union thread_union *irq_stacks[NR_CPUS] __cacheline_aligned = + { &init_irq_union, }; + /* Set when the idlers are all forked */ int smp_threads_ready; @@ -412,6 +419,8 @@ void __init smp_callin(void) */ smp_store_cpu_info(cpuid); + notify_die(DIE_CPUINIT, "cpuinit", NULL, 0); + disable_APIC_timer(); local_irq_disable(); /* @@ -442,6 +451,7 @@ int __init start_secondary(void *unused) */ cpu_init(); smp_callin(); + setup_cpu_idt(); while (!test_bit(smp_processor_id(), &smp_commenced_mask)) rep_nop(); setup_secondary_APIC_clock(); @@ -770,6 +780,28 @@ wakeup_secondary_cpu(int phys_apicid, un } #endif /* WAKE_SECONDARY_VIA_INIT */ +static void __init setup_irq_stack(struct task_struct *p, int cpu) +{ + unsigned long stk; + + stk = __get_free_pages(GFP_KERNEL, THREAD_ORDER); + if (!stk) + panic("I can't seem to allocate my irq stack. Oh well, giving up."); + + irq_stacks[cpu] = (void *)stk; + memset(irq_stacks[cpu], 0, THREAD_SIZE); + irq_stacks[cpu]->thread_info.cpu = cpu; + irq_stacks[cpu]->thread_info.preempt_count = 1; + /* interrupts are not preemptable */ + p->thread_info->irq_stack = &irq_stacks[cpu]->thread_info; + + /* If we want to make the irq stack more than one unit + * deep, we can chain then off of the irq_stack pointer + * here. + */ +} + + extern unsigned long cpu_initialized; static int __init do_boot_cpu(int apicid) @@ -793,6 +825,7 @@ static int __init do_boot_cpu(int apicid idle = fork_by_hand(); if (IS_ERR(idle)) panic("failed fork for CPU %d", cpu); + setup_irq_stack(idle, cpu); wake_up_forked_process(idle); /* @@ -949,7 +982,7 @@ static void __init smp_boot_cpus(unsigne current_thread_info()->cpu = 0; smp_tune_scheduling(); - + /* * If we couldn't find an SMP configuration at boot time, * get out of here now! 
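Taken together, the smp.c and smpboot.c pieces above are the per-CPU
interrupt-stack support: do_IRQ() and the IPI handlers now take and return a
struct pt_regs pointer (the IRQHANDLER() form) so the low-level entry code can
run them on a separate stack and find its way back, setup_irq_stack() gives
every secondary CPU one THREAD_SIZE block whose thread_info carries the right
cpu number and a non-zero preempt_count, and __switch_to() copies the irq_stack
pointer so the current task always knows where its CPU's interrupt stack lives.
The matching entry.S and thread_info.h changes are not part of this excerpt, so
the exact stack-switch sequence is not shown here.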
diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/summit.c 900-mjb1/arch/i386/kernel/summit.c --- 000-virgin/arch/i386/kernel/summit.c Wed Dec 31 16:00:00 1969 +++ 900-mjb1/arch/i386/kernel/summit.c Fri May 30 19:26:44 2003 @@ -0,0 +1,162 @@ +/* + * arch/i386/kernel/summit.c - IBM Summit-Specific Code + * + * Written By: Matthew Dobson, IBM Corporation + * + * Copyright (c) 2003 IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to + * + */ + +#include +#include +#include +#include + +static void __init setup_pci_node_map_for_wpeg(int wpeg_num, struct rio_table_hdr *rth, + struct scal_detail **scal_nodes, struct rio_detail **rio_nodes){ + int twst_num = 0, node = 0, first_bus = 0; + int i, bus, num_busses; + + for(i = 0; i < rth->num_rio_dev; i++){ + if (rio_nodes[i]->node_id == rio_nodes[wpeg_num]->owner_id){ + twst_num = rio_nodes[i]->owner_id; + break; + } + } + if (i == rth->num_rio_dev){ + printk("%s: Couldn't find owner Cyclone for Winnipeg!\n", __FUNCTION__); + return; + } + + for(i = 0; i < rth->num_scal_dev; i++){ + if (scal_nodes[i]->node_id == twst_num){ + node = scal_nodes[i]->node_id; + break; + } + } + if (i == rth->num_scal_dev){ + printk("%s: Couldn't find owner Twister for Cyclone!\n", __FUNCTION__); + return; + } + + switch (rio_nodes[wpeg_num]->type){ + case CompatWPEG: + /* The Compatability Winnipeg controls the legacy busses + (busses 0 & 1), the 66MHz PCI bus [2 slots] (bus 2), + and the "extra" busses in case a PCI-PCI bridge card is + used in either slot (busses 3 & 4): total 5 busses. */ + num_busses = 5; + /* The BIOS numbers the busses starting at 1, and in a + slightly wierd manner. You'll have to trust that + the math used below to determine the number of the + first bus works. */ + first_bus = (rio_nodes[wpeg_num]->first_slot - 1) * 2; + break; + case AltWPEG: + /* The Alternate/Secondary Winnipeg controls the 1st 133MHz + bus [1 slot] & its "extra" bus (busses 0 & 1), the 2nd + 133MHz bus [1 slot] & its "extra" bus (busses 2 & 3), the + 100MHz bus [2 slots] (bus 4), and the "extra" busses for + the 2 100MHz slots (busses 5 & 6): total 7 busses. 
*/ + num_busses = 7; + first_bus = (rio_nodes[wpeg_num]->first_slot * 2) - 1; + break; + case LookOutAWPEG: + case LookOutBWPEG: + printk("%s: LookOut Winnipegs not supported yet!\n", __FUNCTION__); + return; + default: + printk("%s: Unsupported Winnipeg type!\n", __FUNCTION__); + return; + } + + for(bus = first_bus; bus < first_bus + num_busses; bus++) + mp_bus_id_to_node[bus] = node; +} + +static void __init build_detail_arrays(struct rio_table_hdr *rth, + struct scal_detail **sd, struct rio_detail **rd){ + unsigned long ptr; + int i, scal_detail_size, rio_detail_size; + + switch (rth->version){ + default: + printk("%s: Bad Rio Grande Table Version: %d\n", __FUNCTION__, rth->version); + /* Fall through to default to version 2 spec */ + case 2: + scal_detail_size = 11; + rio_detail_size = 13; + break; + case 3: + scal_detail_size = 12; + rio_detail_size = 15; + break; + } + + ptr = (unsigned long)rth + 3; + for(i = 0; i < rth->num_scal_dev; i++) + sd[i] = (struct scal_detail *)(ptr + (scal_detail_size * i)); + + ptr += scal_detail_size * rth->num_scal_dev; + for(i = 0; i < rth->num_rio_dev; i++) + rd[i] = (struct rio_detail *)(ptr + (rio_detail_size * i)); +} + +void __init setup_summit(void) +{ + struct rio_table_hdr *rio_table_hdr = NULL; + struct scal_detail *scal_devs[MAX_NUMNODES]; + struct rio_detail *rio_devs[MAX_NUMNODES*2]; + unsigned long ptr; + unsigned short offset; + int i; + + memset(mp_bus_id_to_node, -1, sizeof(mp_bus_id_to_node)); + + /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */ + ptr = *(unsigned short *)phys_to_virt(0x40Eul); + ptr = (unsigned long)phys_to_virt(ptr << 4); + + offset = 0x180; + while (offset){ + /* The block id is stored in the 2nd word */ + if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){ + /* set the pointer past the offset & block id */ + rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4); + break; + } + /* The next offset is stored in the 1st word. 
0 means no more */ + offset = *((unsigned short *)(ptr + offset)); + } + if (!rio_table_hdr){ + printk("%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __FUNCTION__); + return; + } + + /* Deal with the ugly version 2/3 pointer arithmetic */ + build_detail_arrays(rio_table_hdr, scal_devs, rio_devs); + + for(i = 0; i < rio_table_hdr->num_rio_dev; i++) + if (is_WPEG(rio_devs[i]->type)) + /* It's a Winnipeg, it's got PCI Busses */ + setup_pci_node_map_for_wpeg(i, rio_table_hdr, scal_devs, rio_devs); +} diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/traps.c 900-mjb1/arch/i386/kernel/traps.c --- 000-virgin/arch/i386/kernel/traps.c Fri May 30 19:01:59 2003 +++ 900-mjb1/arch/i386/kernel/traps.c Fri May 30 21:58:44 2003 @@ -25,11 +25,13 @@ #include #include #include +#include #ifdef CONFIG_EISA #include #endif +#include #ifdef CONFIG_MCA #include #include @@ -42,6 +44,7 @@ #include #include #include +#include #include #include @@ -53,6 +56,24 @@ #include "mach_traps.h" +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif + +#ifdef CONFIG_X86_REMOTE_DEBUG +gdb_debug_hook * linux_debug_hook; +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) \ + { \ + if (linux_debug_hook != (gdb_debug_hook *) NULL && !user_mode(regs)) \ + { \ + (*linux_debug_hook)(trapnr, signr, error_code, regs) ; \ + after; \ + } \ + } +#else +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) +#endif + asmlinkage int system_call(void); asmlinkage void lcall7(void); asmlinkage void lcall27(void); @@ -68,7 +89,9 @@ char ignore_fpu_irq = 0; * F0 0F bug workaround.. We have a special link segment * for this. */ -struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, }; + +struct desc_struct node_idt_table[MAX_NUMNODES][IDT_ENTRIES] __attribute__((__section__(".data.idt"))) = + {[0 ... 
MAX_NUMNODES-1] = { {0, 0}, }}; asmlinkage void divide_error(void); asmlinkage void debug(void); @@ -249,19 +272,28 @@ bug: } spinlock_t die_lock = SPIN_LOCK_UNLOCKED; +struct notifier_block *die_chain; void die(const char * str, struct pt_regs * regs, long err) { static int die_counter; console_verbose(); + if (notify_die(DIE_DIE, str, regs, err) == NOTIFY_BAD) + goto exit_segv; + spin_lock_irq(&die_lock); bust_spinlocks(1); handle_BUG(regs); printk("%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); + CHK_REMOTE_DEBUG(1,SIGTRAP,err,regs,); show_registers(regs); bust_spinlocks(0); spin_unlock_irq(&die_lock); + + notify_die(DIE_OOPS, str, regs, err); + +exit_segv: if (in_interrupt()) panic("Fatal exception in interrupt"); @@ -328,6 +360,7 @@ static inline void do_trap(int trapnr, i #define DO_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ + CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,)\ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ } @@ -345,7 +378,9 @@ asmlinkage void do_##name(struct pt_regs #define DO_VM86_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ + CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,return)\ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ + return; \ } #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ @@ -360,7 +395,6 @@ asmlinkage void do_##name(struct pt_regs } DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip) -DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip) @@ -374,6 +408,13 @@ asmlinkage void do_general_protection(st { if (regs->eflags & VM_MASK) goto gp_in_vm86; + + if (kprobe_running() && kprobe_fault_handler(regs, 13)) + return; + + if (notify_die(DIE_PROTFAULT, "general protection", regs, error_code) + == NOTIFY_BAD) + return; if (!(regs->xcs & 3)) goto gp_in_kernel; @@ -388,8 +429,10 @@ gp_in_vm86: return; gp_in_kernel: - if (!fixup_exception(regs)) + if (!fixup_exception(regs)) { + CHK_REMOTE_DEBUG(13,SIGSEGV,error_code,regs,) die("general protection fault", regs, error_code); + } } static void mem_parity_error(unsigned char reason, struct pt_regs * regs) @@ -417,6 +460,7 @@ static void io_check_error(unsigned char outb(reason, 0x61); } + static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) { #ifdef CONFIG_MCA @@ -451,10 +495,14 @@ static void default_do_nmi(struct pt_reg unknown_nmi_error(reason, regs); return; } + + if (notify_die(DIE_NMI, "nmi", regs, reason) == NOTIFY_BAD) + return; if (reason & 0x80) mem_parity_error(reason, regs); if (reason & 0x40) io_check_error(reason, regs); + /* * Reassert NMI in case it became active meanwhile * as it's edge-triggered. @@ -494,6 +542,21 @@ void unset_nmi_callback(void) nmi_callback = dummy_nmi_callback; } +asmlinkage int do_int3(struct pt_regs *regs, long error_code) +{ + /* This is an interrupt gate, because kprobes wants interrupts + disabled. Normal trap handlers don't. */ + if (kprobe_handler(regs)) + return 1; + if (notify_die(DIE_INT3, "int3", regs, error_code) == NOTIFY_BAD) + return 1; + + restore_interrupts(regs); + CHK_REMOTE_DEBUG(3,SIGTRAP,error_code,regs,return 0) + do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); + return 0; +} + /* * Our handling of the processor debug registers is non-trivial. 
* We do not clear them on entry and exit from the kernel. Therefore @@ -516,7 +579,7 @@ void unset_nmi_callback(void) * find every occurrence of the TF bit that could be saved away even * by user code) */ -asmlinkage void do_debug(struct pt_regs * regs, long error_code) +asmlinkage int do_debug(struct pt_regs * regs, long error_code) { unsigned int condition; struct task_struct *tsk = current; @@ -528,6 +591,18 @@ asmlinkage void do_debug(struct pt_regs if (regs->eflags & X86_EFLAGS_IF) local_irq_enable(); + if (post_kprobe_handler(regs)) + return 1; + + /* Interrupts not disabled for normal trap handling. */ + restore_interrupts(regs); + + if (notify_die(DIE_DEBUG, "debug", regs, error_code) == NOTIFY_BAD) + return 1; + + /* Interrupts not disabled for normal trap handling. */ + restore_interrupts(regs); + /* Mask out spurious debug traps due to lazy DR7 setting */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { if (!tsk->thread.debugreg[7]) @@ -551,8 +626,10 @@ asmlinkage void do_debug(struct pt_regs * allowing programs to debug themselves without the ptrace() * interface. */ +#ifndef CONFIG_X86_REMOTE_DEBUG if ((regs->xcs & 3) == 0) goto clear_TF_reenable; +#endif if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE) goto clear_TF; } @@ -564,11 +641,13 @@ asmlinkage void do_debug(struct pt_regs info.si_errno = 0; info.si_code = TRAP_BRKPT; - /* If this is a kernel mode trap, save the user PC on entry to - * the kernel, that's what the debugger can make sense of. - */ - info.si_addr = ((regs->xcs & 3) == 0) ? (void *)tsk->thread.eip : - (void *)regs->eip; + + /* If this is a kernel mode trap, we need to reset db7 to allow us + * to continue sanely */ + if ((regs->xcs & 3) == 0) + goto clear_dr7; + + info.si_addr = (void *)regs->eip; force_sig_info(SIGTRAP, &info, tsk); /* Disable additional traps. They'll be re-enabled when @@ -578,17 +657,20 @@ clear_dr7: __asm__("movl %0,%%db7" : /* no output */ : "r" (0)); - return; + CHK_REMOTE_DEBUG(1,SIGTRAP,error_code,regs,) + return 0; debug_vm86: handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); - return; + return 0; +#ifndef CONFIG_X86_REMOTE_DEBUG clear_TF_reenable: +#endif set_tsk_thread_flag(tsk, TIF_SINGLESTEP); clear_TF: regs->eflags &= ~TF_MASK; - return; + return 0; } /* @@ -753,6 +835,12 @@ asmlinkage void math_state_restore(struc struct task_struct *tsk = thread->task; clts(); /* Allow maths ops (or we recurse) */ + + if (kprobe_running() && kprobe_fault_handler(®s, 7)) + return; + if (notify_die(DIE_FPUTRAP, "fpu", ®s, 0) == NOTIFY_BAD) + return; + if (!tsk->used_math) init_fpu(tsk); restore_fpu(tsk); @@ -774,17 +862,52 @@ asmlinkage void math_emulate(long arg) #ifdef CONFIG_X86_F00F_BUG void __init trap_init_f00f_bug(void) { - __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); + int node = cpu_to_node(smp_processor_id()); + + __set_fixmap(FIX_F00F_IDT, __pa(&node_idt_table[node]), PAGE_KERNEL_RO); /* * Update the IDT descriptor and reload the IDT so that * it uses the read-only mapped virtual address. 
*/ - idt_descr.address = fix_to_virt(FIX_F00F_IDT); - __asm__ __volatile__("lidt %0": "=m" (idt_descr)); + node_idt_descr[node].address = fix_to_virt(FIX_F00F_IDT); + __asm__ __volatile__("lidt %0": "=m" (node_idt_descr[node])); } #endif +static inline void get_current_regs(struct pt_regs *regs) +{ + __asm__ __volatile__("movl %%ebx,%0" : "=m"(regs->ebx)); + __asm__ __volatile__("movl %%ecx,%0" : "=m"(regs->ecx)); + __asm__ __volatile__("movl %%edx,%0" : "=m"(regs->edx)); + __asm__ __volatile__("movl %%esi,%0" : "=m"(regs->esi)); + __asm__ __volatile__("movl %%edi,%0" : "=m"(regs->edi)); + __asm__ __volatile__("movl %%ebp,%0" : "=m"(regs->ebp)); + __asm__ __volatile__("movl %%eax,%0" : "=m"(regs->eax)); + __asm__ __volatile__("movl %%esp,%0" : "=m"(regs->esp)); + __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(regs->xss)); + __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(regs->xcs)); + __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(regs->xds)); + __asm__ __volatile__("movw %%es, %%ax;" :"=a"(regs->xes)); + __asm__ __volatile__("pushfl; popl %0" :"=m"(regs->eflags)); + regs->eip = (unsigned long)current_text_addr(); +} + +static int panic_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct pt_regs regs; + get_current_regs(®s); + + return notify_die(DIE_PANIC, (const char *)ptr, ®s, event); +} + +extern struct notifier_block *panic_notifier_list; +static int panic_event(struct notifier_block *, unsigned long, void *); +static struct notifier_block panic_block = { + .notifier_call = panic_event, +}; + #define _set_gate(gate_addr,type,dpl,addr,seg) \ do { \ int __d0, __d1; \ @@ -800,24 +923,36 @@ do { \ /* - * This needs to use 'idt_table' rather than 'idt', and + * This needs to use 'node_idt_table' rather than 'idt', and * thus use the _nonmapped_ version of the IDT, as the * Pentium F0 0F bugfix can have resulted in the mapped * IDT being write-protected. 
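As an aside on the hook mechanism used throughout this file: the die_chain notifier and the notify_die() calls added to die(), do_int3(), do_debug(), default_do_nmi() and (in mm/fault.c below) do_page_fault() follow the standard notifier pattern, with NOTIFY_BAD meaning "event consumed, skip the normal handler". Purely as an illustrative sketch, not part of the patch, a built-in debugger could attach like this (how notify_die() packs its extra arguments into the data pointer is not shown in this hunk, so the callback leaves it alone):

	static int sample_die_event(struct notifier_block *self,
				    unsigned long event, void *data)
	{
		if (event == DIE_INT3 || event == DIE_DEBUG)
			return NOTIFY_BAD;	/* we consumed the trap */
		return NOTIFY_DONE;		/* let normal handling run */
	}

	static struct notifier_block sample_die_nb = {
		.notifier_call = sample_die_event,
	};

	/* at init time: */
	notifier_chain_register(&die_chain, &sample_die_nb);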
*/ + +void node_set_intr_gate(unsigned int node, unsigned int n, void *addr) +{ + _set_gate(&node_idt_table[node][n],14,0,addr,__KERNEL_CS); +} + void set_intr_gate(unsigned int n, void *addr) { - _set_gate(idt_table+n,14,0,addr,__KERNEL_CS); + int node; + for (node = 0; node < MAX_NUMNODES; node++) + node_set_intr_gate(node, n, addr); } static void __init set_trap_gate(unsigned int n, void *addr) { - _set_gate(idt_table+n,15,0,addr,__KERNEL_CS); + int node; + for (node = 0; node < MAX_NUMNODES; node++) + _set_gate(&node_idt_table[node][n],15,0,addr,__KERNEL_CS); } static void __init set_system_gate(unsigned int n, void *addr) { - _set_gate(idt_table+n,15,3,addr,__KERNEL_CS); + int node; + for (node = 0; node < MAX_NUMNODES; node++) + _set_gate(&node_idt_table[node][n],15,3,addr,__KERNEL_CS); } static void __init set_call_gate(void *a, void *addr) @@ -827,7 +962,9 @@ static void __init set_call_gate(void *a static void __init set_task_gate(unsigned int n, unsigned int gdt_entry) { - _set_gate(idt_table+n,5,0,0,(gdt_entry<<3)); + int node; + for (node = 0; node < MAX_NUMNODES; node++) + _set_gate(&node_idt_table[node][n],5,0,0,(gdt_entry<<3)); } @@ -848,9 +985,17 @@ void __init trap_init(void) #endif set_trap_gate(0,÷_error); - set_intr_gate(1,&debug); + + /* debug trap for kprobes */ + /* _set_gate(idt_table+1,14,3,&debug,__KERNEL_CS); */ + _set_gate(&node_idt_table[0][1],14,3,&debug,__KERNEL_CS); + set_intr_gate(2,&nmi); - set_system_gate(3,&int3); /* int3-5 can be called from all */ + + /* int3-5 can be called from all */ + /* _set_gate(idt_table+3,14,3,&int3,__KERNEL_CS); */ + _set_gate(&node_idt_table[0][3],14,3,&int3,__KERNEL_CS); + set_system_gate(4,&overflow); set_system_gate(5,&bounds); set_trap_gate(6,&invalid_op); @@ -879,10 +1024,17 @@ void __init trap_init(void) set_call_gate(&default_ldt[0],lcall7); set_call_gate(&default_ldt[4],lcall27); + /* setup the pernode idt tables */ + setup_node_idts(); + + notify_die(DIE_TRAPINIT, "traps initialized", 0, 0); /* * Should be a barrier for any external CPU state. */ cpu_init(); trap_init_hook(); + + notifier_chain_register(&panic_notifier_list, &panic_block); } + diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/lib/dec_and_lock.c 900-mjb1/arch/i386/lib/dec_and_lock.c --- 000-virgin/arch/i386/lib/dec_and_lock.c Sun Nov 17 20:29:28 2002 +++ 900-mjb1/arch/i386/lib/dec_and_lock.c Fri May 30 19:26:45 2003 @@ -10,6 +10,7 @@ #include #include +#ifndef ATOMIC_DEC_AND_LOCK int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) { int counter; @@ -38,3 +39,5 @@ slow_path: spin_unlock(lock); return 0; } +#endif + diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/mm/fault.c 900-mjb1/arch/i386/mm/fault.c --- 000-virgin/arch/i386/mm/fault.c Fri May 30 19:01:59 2003 +++ 900-mjb1/arch/i386/mm/fault.c Fri May 30 21:58:44 2003 @@ -2,6 +2,11 @@ * linux/arch/i386/mm/fault.c * * Copyright (C) 1995 Linus Torvalds + * + * Change History + * + * Tigran Aivazian Remote debugging support. 
+ * */ #include @@ -20,12 +25,17 @@ #include #include /* For unblank_screen() */ #include +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif +#include #include #include #include #include #include +#include extern void die(const char *,struct pt_regs *,long); @@ -80,6 +90,15 @@ asmlinkage void do_page_fault(struct pt_ /* get the address */ __asm__("movl %%cr2,%0":"=r" (address)); + /* for many cases list will be empty, so optimize for that case */ + if (unlikely(die_chain != NULL) + && notify_die(DIE_PAGEFAULT, "page fault", regs, error_code) + == NOTIFY_BAD) + return; + + if (kprobe_running() && kprobe_fault_handler(regs, 14)) + return; + /* It's safe to allow irq's after cr2 has been saved */ if (regs->eflags & X86_EFLAGS_IF) local_irq_enable(); @@ -112,6 +131,15 @@ asmlinkage void do_page_fault(struct pt_ if (in_atomic() || !mm) goto no_context; +#ifdef CONFIG_X86_REMOTE_DEBUG + if (kgdb_memerr_expected) { + if (linux_debug_hook != (gdb_debug_hook *) NULL) { + (*linux_debug_hook)(14, SIGSEGV, error_code, regs) ; + return; /* return w/modified regs */ + } + } +#endif + down_read(&mm->mmap_sem); vma = find_vma(mm, address); @@ -211,14 +239,27 @@ bad_area: return; } +#ifdef CONFIG_X86_REMOTE_DEBUG + if (kgdb_memerr_expected) { + if (linux_debug_hook != (gdb_debug_hook *) NULL) { + (*linux_debug_hook)(14, SIGSEGV, error_code, regs); + return; /* Return with modified registers */ + } + } else { + if (linux_debug_hook != (gdb_debug_hook *) NULL) { + (*linux_debug_hook)(14, SIGSEGV, error_code, regs); + } + } +#endif + #ifdef CONFIG_X86_F00F_BUG /* * Pentium F0 0F C7 C8 bug workaround. */ if (boot_cpu_data.f00f_bug) { - unsigned long nr; - - nr = (address - idt_descr.address) >> 3; + unsigned long nr, node; + node = cpu_to_node(smp_processor_id()); + nr = (address - node_idt_descr[node].address) >> 3; if (nr == 6) { do_invalid_op(regs, 0); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/mm/init.c 900-mjb1/arch/i386/mm/init.c --- 000-virgin/arch/i386/mm/init.c Fri May 30 19:01:59 2003 +++ 900-mjb1/arch/i386/mm/init.c Fri May 30 19:24:56 2003 @@ -166,7 +166,7 @@ static inline int page_kills_ppro(unsign return 0; } -static inline int page_is_ram(unsigned long pagenr) +int page_is_ram(unsigned long pagenr) { int i; @@ -186,6 +186,12 @@ static inline int page_is_ram(unsigned l return 1; } return 0; +} + +/* To enable modules to check if a page is in RAM */ +int pfn_is_ram(unsigned long pfn) +{ + return (page_is_ram(pfn)); } #ifdef CONFIG_HIGHMEM diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/vmlinux.lds.S 900-mjb1/arch/i386/vmlinux.lds.S --- 000-virgin/arch/i386/vmlinux.lds.S Fri May 30 19:01:59 2003 +++ 900-mjb1/arch/i386/vmlinux.lds.S Fri May 30 19:03:26 2003 @@ -10,7 +10,7 @@ ENTRY(startup_32) jiffies = jiffies_64; SECTIONS { - . = 0xC0000000 + 0x100000; + . 
= __PAGE_OFFSET + 0x100000; /* read-only */ _text = .; /* Text and read-only data */ .text : { diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/ppc64/kernel/ppc_ksyms.c 900-mjb1/arch/ppc64/kernel/ppc_ksyms.c --- 000-virgin/arch/ppc64/kernel/ppc_ksyms.c Tue Feb 25 23:03:44 2003 +++ 900-mjb1/arch/ppc64/kernel/ppc_ksyms.c Fri May 30 22:04:58 2003 @@ -91,7 +91,7 @@ EXPORT_SYMBOL(strncmp); EXPORT_SYMBOL(__down_interruptible); EXPORT_SYMBOL(__up); EXPORT_SYMBOL(naca); -EXPORT_SYMBOL(__down); +EXPORT_SYMBOL(__down_wq); /* EXPORT_SYMBOL(csum_partial); already in net/netsyms.c */ EXPORT_SYMBOL(csum_partial_copy_generic); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/ppc64/kernel/semaphore.c 900-mjb1/arch/ppc64/kernel/semaphore.c --- 000-virgin/arch/ppc64/kernel/semaphore.c Sun Nov 17 20:29:22 2002 +++ 900-mjb1/arch/ppc64/kernel/semaphore.c Fri May 30 22:04:58 2003 @@ -70,13 +70,18 @@ void __up(struct semaphore *sem) * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. */ -void __down(struct semaphore *sem) +int __down_wq(struct semaphore *sem, wait_queue_t *wait) { struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); + DECLARE_WAITQUEUE(local_wait, tsk); + unsigned long flags; - tsk->state = TASK_UNINTERRUPTIBLE; - add_wait_queue_exclusive(&sem->wait, &wait); + if (is_sync_wait(wait)) + tsk->state = TASK_UNINTERRUPTIBLE; + if (!wait) + wait = &local_wait; + + add_wait_queue_exclusive(&sem->wait, wait); smp_wmb(); /* @@ -86,10 +91,15 @@ void __down(struct semaphore *sem) * that we are asleep, and then sleep. */ while (__sem_update_count(sem, -1) <= 0) { + if (!is_sync_wait(wait)) + return -EIOCBRETRY; schedule(); tsk->state = TASK_UNINTERRUPTIBLE; } - remove_wait_queue(&sem->wait, &wait); + spin_lock_irqsave(&sem->wait.lock, flags); + if (is_sync_wait(wait) || !list_empty(&wait->task_list)) + remove_wait_queue_locked(&sem->wait, wait); + spin_unlock_irqrestore(&sem->wait.lock, flags); tsk->state = TASK_RUNNING; /* @@ -98,6 +108,8 @@ void __down(struct semaphore *sem) * indicating that there are still processes sleeping. 
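The __down_wq() form above (mirrored for x86_64 later in this patch) lets the caller pass in its own wait_queue_t: is_sync_wait(), defined elsewhere in this series, is assumed to report false for a callback-style entry, in which case the task is left queued on sem->wait and -EIOCBRETRY is returned instead of sleeping, while a NULL wait keeps the old synchronous __down() behaviour. A rough sketch of the intended calling pattern, with hypothetical names and calling the slow path directly for clarity:

	struct sample_request {		/* hypothetical caller state */
		wait_queue_t wait;	/* callback-style entry, so
					 * is_sync_wait(&req->wait) is false */
	};

	static int sample_request_lock(struct sample_request *req,
				       struct semaphore *sem)
	{
		int ret = __down_wq(sem, &req->wait);

		if (ret == -EIOCBRETRY)
			return ret;	/* still queued on sem->wait; the wake
					 * function on req->wait re-drives us */
		return 0;		/* semaphore held; up(sem) when done */
	}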
*/ wake_up(&sem->wait); + + return 0; } int __down_interruptible(struct semaphore * sem) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/s390/boot/Makefile 900-mjb1/arch/s390/boot/Makefile --- 000-virgin/arch/s390/boot/Makefile Sun Apr 20 19:34:58 2003 +++ 900-mjb1/arch/s390/boot/Makefile Fri May 30 19:24:56 2003 @@ -16,4 +16,4 @@ $(obj)/image: vmlinux FORCE install: $(CONFIGURE) $(obj)/image sh -x $(obj)/install.sh $(KERNELRELEASE) $(obj)/image \ - System.map Kerntypes "$(INSTALL_PATH)" + System.map init/kerntypes.o "$(INSTALL_PATH)" diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/s390/boot/install.sh 900-mjb1/arch/s390/boot/install.sh --- 000-virgin/arch/s390/boot/install.sh Sun Nov 17 20:29:48 2002 +++ 900-mjb1/arch/s390/boot/install.sh Fri May 30 19:24:56 2003 @@ -16,7 +16,8 @@ # $1 - kernel version # $2 - kernel image file # $3 - kernel map file -# $4 - default install path (blank if root directory) +# $4 - kernel type file +# $5 - default install path (blank if root directory) # # User may have a custom install script @@ -26,13 +27,22 @@ if [ -x /sbin/installkernel ]; then exec # Default install - same as make zlilo -if [ -f $4/vmlinuz ]; then - mv $4/vmlinuz $4/vmlinuz.old +if [ -f $5/vmlinuz ]; then + mv $5/vmlinuz $5/vmlinuz.old fi -if [ -f $4/System.map ]; then - mv $4/System.map $4/System.old +if [ -f $5/System.map ]; then + mv $5/System.map $5/System.old fi -cat $2 > $4/vmlinuz -cp $3 $4/System.map +if [ -f $5/Kerntypes ]; then + mv $5/Kerntypes $5/Kerntypes.old +fi + +cat $2 > $5/vmlinuz +cp $3 $5/System.map + +# copy the kernel type file if it exists +if [ -f $4 ]; then + cp $4 $5/Kerntypes +fi diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/sparc64/kernel/devices.c 900-mjb1/arch/sparc64/kernel/devices.c --- 000-virgin/arch/sparc64/kernel/devices.c Sat May 10 18:34:35 2003 +++ 900-mjb1/arch/sparc64/kernel/devices.c Fri May 30 19:26:45 2003 @@ -31,6 +31,8 @@ int linux_num_cpus = 0; extern void cpu_probe(void); extern void central_probe(void); +unsigned long cpu_hz; + void __init device_scan(void) { char node_str[128]; @@ -68,6 +70,8 @@ void __init device_scan(void) prom_getproperty(scan, "portid", (char *) &thismid, sizeof(thismid)); } + if (!cpu_hz) + cpu_hz = prom_getint(scan, "clock-frequency"); linux_cpus[cpu_ctr].mid = thismid; printk("Found CPU %d (node=%08x,mid=%d)\n", cpu_ctr, (unsigned) scan, thismid); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/sparc64/kernel/rtrap.S 900-mjb1/arch/sparc64/kernel/rtrap.S --- 000-virgin/arch/sparc64/kernel/rtrap.S Sat May 10 18:34:35 2003 +++ 900-mjb1/arch/sparc64/kernel/rtrap.S Fri May 30 19:28:07 2003 @@ -15,6 +15,10 @@ #include #include +#ifndef CONFIG_KGDB_THREAD +#define user_schedule schedule +#endif + #define RTRAP_PSTATE (PSTATE_RMO|PSTATE_PEF|PSTATE_PRIV|PSTATE_IE) #define RTRAP_PSTATE_IRQOFF (PSTATE_RMO|PSTATE_PEF|PSTATE_PRIV) #define RTRAP_PSTATE_AG_IRQOFF (PSTATE_RMO|PSTATE_PEF|PSTATE_PRIV|PSTATE_AG) @@ -33,7 +37,7 @@ __handle_softirq: ba,a,pt %xcc, __handle_softirq_continue nop __handle_preemption: - call schedule + call user_schedule wrpr %g0, RTRAP_PSTATE, %pstate ba,pt %xcc, __handle_preemption_continue wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate @@ -48,7 +52,7 @@ __handle_user_windows: be,pt %xcc, 1f nop - call schedule + call user_schedule wrpr %g0, RTRAP_PSTATE, %pstate wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate ldx [%g6 + TI_FLAGS], %l0 @@ -92,7 +96,7 @@ __handle_perfctrs: be,pt %xcc, 1f nop - call schedule + call user_schedule wrpr %g0, RTRAP_PSTATE, %pstate wrpr %g0, 
RTRAP_PSTATE_IRQOFF, %pstate ldx [%g6 + TI_FLAGS], %l0 @@ -273,7 +277,7 @@ to_kernel: sethi %hi(PREEMPT_ACTIVE), %l6 stw %l6, [%g6 + TI_PRE_COUNT] wrpr 0, %pil - call schedule + call user_schedule nop ba,pt %xcc, rtrap stw %g0, [%g6 + TI_PRE_COUNT] diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/sparc64/lib/rwlock.S 900-mjb1/arch/sparc64/lib/rwlock.S --- 000-virgin/arch/sparc64/lib/rwlock.S Sun Nov 17 20:29:44 2002 +++ 900-mjb1/arch/sparc64/lib/rwlock.S Fri May 30 19:26:45 2003 @@ -63,5 +63,33 @@ __write_lock: /* %o0 = lock_ptr */ be,pt %icc, 99b membar #StoreLoad | #StoreStore ba,a,pt %xcc, 1b + + .globl __read_trylock +__read_trylock: /* %o0 = lock_ptr */ + ldsw [%o0], %g5 + brlz,pn %g5, 100f + add %g5, 1, %g7 + cas [%o0], %g5, %g7 + cmp %g5, %g7 + bne,pn %icc, __read_trylock + membar #StoreLoad | #StoreStore + retl + mov 1, %o0 + + .globl __write_trylock +__write_trylock: /* %o0 = lock_ptr */ + sethi %hi(0x80000000), %g2 +1: lduw [%o0], %g5 +4: brnz,pn %g5, 100f + or %g5, %g2, %g7 + cas [%o0], %g5, %g7 + cmp %g5, %g7 + bne,pn %icc, 1b + membar #StoreLoad | #StoreStore + retl + mov 1, %o0 +100: retl + mov 0, %o0 + rwlock_impl_end: diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/x86_64/kernel/entry.S 900-mjb1/arch/x86_64/kernel/entry.S --- 000-virgin/arch/x86_64/kernel/entry.S Fri May 30 19:02:02 2003 +++ 900-mjb1/arch/x86_64/kernel/entry.S Fri May 30 19:28:07 2003 @@ -46,6 +46,10 @@ #define PDAREF(field) %gs:field +#ifndef CONFIG_KGDB_THREAD +#define user_schedule schedule +#endif + #ifdef CONFIG_PREEMPT #define preempt_stop cli #else @@ -187,7 +191,7 @@ sysret_careful: jnc sysret_signal sti pushq %rdi - call schedule + call user_schedule popq %rdi jmp sysret_check @@ -256,7 +260,7 @@ int_careful: jnc int_very_careful sti pushq %rdi - call schedule + call user_schedule popq %rdi jmp int_with_check @@ -426,7 +430,7 @@ retint_careful: jnc retint_signal sti pushq %rdi - call schedule + call user_schedule popq %rdi GET_THREAD_INFO(%rcx) cli @@ -460,7 +464,7 @@ retint_kernel: jc retint_restore_args movl $PREEMPT_ACTIVE,threadinfo_preempt_count(%rcx) sti - call schedule + call user_schedule cli GET_THREAD_INFO(%rcx) movl $0,threadinfo_preempt_count(%rcx) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/x86_64/kernel/semaphore.c 900-mjb1/arch/x86_64/kernel/semaphore.c --- 000-virgin/arch/x86_64/kernel/semaphore.c Sun Nov 17 20:29:59 2002 +++ 900-mjb1/arch/x86_64/kernel/semaphore.c Fri May 30 22:04:59 2003 @@ -54,15 +54,20 @@ void __up(struct semaphore *sem) wake_up(&sem->wait); } -void __down(struct semaphore * sem) +int __down_wq(struct semaphore * sem, wait_queue_t *wait) { struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); + DECLARE_WAITQUEUE(local_wait, tsk); unsigned long flags; - tsk->state = TASK_UNINTERRUPTIBLE; + if (is_sync_wait(wait)) + tsk->state = TASK_UNINTERRUPTIBLE; + if (!wait) { + wait = &local_wait; + } + spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); + add_wait_queue_exclusive_locked(&sem->wait, wait); sem->sleepers++; for (;;) { @@ -80,16 +85,22 @@ void __down(struct semaphore * sem) sem->sleepers = 1; /* us - see -1 above */ spin_unlock_irqrestore(&sem->wait.lock, flags); + if (!is_sync_wait(wait)) + return -EIOCBRETRY; + schedule(); spin_lock_irqsave(&sem->wait.lock, flags); tsk->state = TASK_UNINTERRUPTIBLE; } - remove_wait_queue_locked(&sem->wait, &wait); + if (is_sync_wait(wait) || !list_empty(&wait->task_list)) + remove_wait_queue_locked(&sem->wait, wait); 
wake_up_locked(&sem->wait); spin_unlock_irqrestore(&sem->wait.lock, flags); tsk->state = TASK_RUNNING; + return 0; } + int __down_interruptible(struct semaphore * sem) { diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/x86_64/kernel/x8664_ksyms.c 900-mjb1/arch/x86_64/kernel/x8664_ksyms.c --- 000-virgin/arch/x86_64/kernel/x8664_ksyms.c Fri May 30 19:02:02 2003 +++ 900-mjb1/arch/x86_64/kernel/x8664_ksyms.c Fri May 30 22:04:59 2003 @@ -64,7 +64,7 @@ EXPORT_SYMBOL(get_cmos_time); EXPORT_SYMBOL(__io_virt_debug); #endif -EXPORT_SYMBOL_NOVERS(__down_failed); +EXPORT_SYMBOL_NOVERS(__down_failed_wq); EXPORT_SYMBOL_NOVERS(__down_failed_interruptible); EXPORT_SYMBOL_NOVERS(__down_failed_trylock); EXPORT_SYMBOL_NOVERS(__up_wakeup); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/x86_64/lib/thunk.S 900-mjb1/arch/x86_64/lib/thunk.S --- 000-virgin/arch/x86_64/lib/thunk.S Sun Nov 17 20:29:44 2002 +++ 900-mjb1/arch/x86_64/lib/thunk.S Fri May 30 22:04:59 2003 @@ -38,7 +38,7 @@ #endif thunk do_softirq_thunk,do_softirq - thunk __down_failed,__down + thunk __down_failed_wq,__down_wq thunk_retrax __down_failed_interruptible,__down_interruptible thunk_retrax __down_failed_trylock,__down_trylock thunk __up_wakeup,__up diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/Makefile 900-mjb1/drivers/Makefile --- 000-virgin/drivers/Makefile Fri May 30 19:02:02 2003 +++ 900-mjb1/drivers/Makefile Fri May 30 19:24:56 2003 @@ -51,3 +51,4 @@ obj-$(CONFIG_ISDN_BOOL) += isdn/ obj-$(CONFIG_MCA) += mca/ obj-$(CONFIG_EISA) += eisa/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ +obj-$(CONFIG_CRASH_DUMP) += dump/ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/block/ll_rw_blk.c 900-mjb1/drivers/block/ll_rw_blk.c --- 000-virgin/drivers/block/ll_rw_blk.c Fri May 30 19:02:03 2003 +++ 900-mjb1/drivers/block/ll_rw_blk.c Fri May 30 22:04:50 2003 @@ -1501,16 +1501,29 @@ void blk_put_request(struct request *req * If no queues are congested then just wait for the next request to be * returned. 
*/ -void blk_congestion_wait(int rw, long timeout) +int blk_congestion_wait_wq(int rw, long timeout, wait_queue_t *wait) { - DEFINE_WAIT(wait); wait_queue_head_t *wqh = &congestion_wqh[rw]; + DEFINE_WAIT(local_wait); + + if (!wait) + wait = &local_wait; blk_run_queues(); - prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wqh, wait, TASK_UNINTERRUPTIBLE); + if (!is_sync_wait(wait)) + return -EIOCBRETRY; + io_schedule_timeout(timeout); - finish_wait(wqh, &wait); + finish_wait(wqh, wait); + return 0; +} + +void blk_congestion_wait(int rw, long timeout) +{ + blk_congestion_wait_wq(rw, timeout, NULL); } + /* * Has to be called with the request spinlock acquired diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/char/Makefile 900-mjb1/drivers/char/Makefile --- 000-virgin/drivers/char/Makefile Wed Mar 26 22:54:30 2003 +++ 900-mjb1/drivers/char/Makefile Fri May 30 19:13:53 2003 @@ -25,6 +25,7 @@ obj-$(CONFIG_COMPUTONE) += ip2.o ip2main obj-$(CONFIG_RISCOM8) += riscom8.o obj-$(CONFIG_ISI) += isicom.o obj-$(CONFIG_ESPSERIAL) += esp.o +obj-$(CONFIG_X86_REMOTE_DEBUG) += gdbserial.o obj-$(CONFIG_SYNCLINK) += synclink.o obj-$(CONFIG_SYNCLINKMP) += synclinkmp.o obj-$(CONFIG_N_HDLC) += n_hdlc.o diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/char/gdbserial.c 900-mjb1/drivers/char/gdbserial.c --- 000-virgin/drivers/char/gdbserial.c Wed Dec 31 16:00:00 1969 +++ 900-mjb1/drivers/char/gdbserial.c Fri May 30 19:13:53 2003 @@ -0,0 +1,274 @@ +/* + * Serial interface GDB stub + * + * Written (hacked together) by David Grothe (dave@gcom.com) + * + * Modified by Scott Foehner (sfoehner@engr.sgi.com) to allow connect + * on boot-up + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#undef PRNT /* define for debug printing */ + +#define GDB_BUF_SIZE 512 /* power of 2, please */ + +static char gdb_buf[GDB_BUF_SIZE]; +static int gdb_buf_in_inx; +static atomic_t gdb_buf_in_cnt; +static int gdb_buf_out_inx; + +extern void set_debug_traps(void); /* GDB routine */ +extern int gdb_serial_setup(int ttyS, int baud, int *port, int *irq); +extern void shutdown_for_gdb(struct async_struct *info); + /* in serial.c */ + +int gdb_irq; +int gdb_port; +int gdb_ttyS = 1; /* Default: ttyS1 */ +int gdb_baud = 38400; +int gdb_enter = 0; /* Default: do not do gdb_hook on boot */ +int gdb_initialized = 0; + +static int initialized = -1; + +/* + * Get a byte from the hardware data buffer and return it + */ +static int +read_data_bfr(void) +{ + if (inb(gdb_port + UART_LSR) & UART_LSR_DR) + return (inb(gdb_port + UART_RX)); + + return (-1); + +} /* read_data_bfr */ + +/* + * Get a char if available, return -1 if nothing available. + * Empty the receive buffer first, then look at the interface hardware. + */ +static int +read_char(void) +{ + if (atomic_read(&gdb_buf_in_cnt) != 0) { /* intr routine has q'd chars */ + int chr; + + chr = gdb_buf[gdb_buf_out_inx++]; + gdb_buf_out_inx &= (GDB_BUF_SIZE - 1); + atomic_dec(&gdb_buf_in_cnt); + return (chr); + } + + return (read_data_bfr()); /* read from hardware */ + +} /* read_char */ + +/* + * Wait until the interface can accept a char, then write it. 
+ */ +static void +write_char(int chr) +{ + while (!(inb(gdb_port + UART_LSR) & UART_LSR_THRE)) ; + + outb(chr, gdb_port + UART_TX); + +} /* write_char */ + +/* + * This is the receiver interrupt routine for the GDB stub. + * It will receive a limited number of characters of input + * from the gdb host machine and save them up in a buffer. + * + * When the gdb stub routine getDebugChar() is called it + * draws characters out of the buffer until it is empty and + * then reads directly from the serial port. + * + * We do not attempt to write chars from the interrupt routine + * since the stubs do all of that via putDebugChar() which + * writes one byte after waiting for the interface to become + * ready. + * + * The debug stubs like to run with interrupts disabled since, + * after all, they run as a consequence of a breakpoint in + * the kernel. + * + * Perhaps someone who knows more about the tty driver than I + * care to learn can make this work for any low level serial + * driver. + */ +static void +gdb_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + int chr; + int iir; + + do { + chr = read_data_bfr(); + iir = inb(gdb_port + UART_IIR); +#ifdef PRNT + printk("gdb_interrupt: chr=%02x '%c' after read iir=%02x\n", + chr, chr > ' ' && chr < 0x7F ? chr : ' ', iir); +#endif + if (chr < 0) + continue; + + if (chr == 3) { /* Ctrl-C means remote interrupt */ + breakpoint(); + continue; + } + + if (atomic_read(&gdb_buf_in_cnt) >= GDB_BUF_SIZE) { /* buffer overflow, clear it */ + gdb_buf_in_inx = 0; + atomic_set(&gdb_buf_in_cnt, 0); + gdb_buf_out_inx = 0; + break; + } + + gdb_buf[gdb_buf_in_inx++] = chr; + gdb_buf_in_inx &= (GDB_BUF_SIZE - 1); + atomic_inc(&gdb_buf_in_cnt); + } + while (iir & UART_IIR_RDI); + +} /* gdb_interrupt */ + +/* + * Just a NULL routine for testing. + */ +void +gdb_null(void) +{ +} /* gdb_null */ + +extern int serial8250_init(void); + +int +gdb_hook(void) +{ + int retval; + +#ifdef CONFIG_SMP + if (NR_CPUS > KGDB_MAX_NO_CPUS) { + printk + ("kgdb: too manu cpus. Cannot enable debugger with more than 8 cpus\n"); + return (-1); + } +#endif + + /* + * Call first time just to get the ser ptr + */ + + serial8250_init(); + + if (gdb_serial_setup(gdb_ttyS, gdb_baud, &gdb_port, &gdb_irq)) { + printk("gdb_serial_setup() error"); + return (-1); + } + + retval = request_irq(gdb_irq, + gdb_interrupt, SA_INTERRUPT, "GDB-stub", NULL); + if (retval == 0) + initialized = 1; + else { + initialized = 0; + printk("gdb_hook: request_irq(irq=%d) failed: %d\n", gdb_irq, + retval); + } + + /* + * Call GDB routine to setup the exception vectors for the debugger + */ + set_debug_traps(); + + /* + * Call the breakpoint() routine in GDB to start the debugging + * session. + */ + printk("Waiting for connection from remote gdb... "); + breakpoint(); + gdb_null(); + + printk("Connected.\n"); + + gdb_initialized = 1; + return (0); + +} /* gdb_hook_interrupt2 */ + +/* + * getDebugChar + * + * This is a GDB stub routine. It waits for a character from the + * serial interface and then returns it. If there is no serial + * interface connection then it returns a bogus value which will + * almost certainly cause the system to hang. + */ +int +getDebugChar(void) +{ + volatile int chr; + +#ifdef PRNT + printk("getDebugChar: "); +#endif + + while ((chr = read_char()) < 0) + touch_nmi_watchdog(); + +#ifdef PRNT + printk("%c\n", chr > ' ' && chr < 0x7F ? chr : ' '); +#endif + return (chr); + +} /* getDebugChar */ + +/* + * putDebugChar + * + * This is a GDB stub routine. 
It waits until the interface is ready + * to transmit a char and then sends it. If there is no serial + * interface connection then it simply returns to its caller, having + * pretended to send the char. + */ +void +putDebugChar(int chr) +{ +#ifdef PRNT + printk("putDebugChar: chr=%02x '%c'\n", chr, + chr > ' ' && chr < 0x7F ? chr : ' '); +#endif + + write_char(chr); /* this routine will wait */ + +} /* putDebugChar */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/char/sysrq.c 900-mjb1/drivers/char/sysrq.c --- 000-virgin/drivers/char/sysrq.c Fri May 30 19:02:05 2003 +++ 900-mjb1/drivers/char/sysrq.c Fri May 30 19:13:53 2003 @@ -134,6 +134,18 @@ static struct sysrq_key_op sysrq_mountro /* END SYNC SYSRQ HANDLERS BLOCK */ +#ifdef CONFIG_X86_REMOTE_DEBUG +static void sysrq_handle_gdb(int key, struct pt_regs *pt_regs, + struct tty_struct *tty) { + int gdb_hook(void); + gdb_hook(); +} +static struct sysrq_key_op sysrq_gdb_op = { + handler: sysrq_handle_gdb, + help_msg: "Gdb", + action_msg: "Entering debugger", +}; +#endif /* SHOW SYSRQ HANDLERS BLOCK */ @@ -240,7 +252,11 @@ static struct sysrq_key_op *sysrq_key_ta /* d */ NULL, /* e */ &sysrq_term_op, /* f */ NULL, +#ifdef CONFIG_X86_REMOTE_DEBUG +/* g */ &sysrq_gdb_op, +#else /* CONFIG_X86_REMOTE_DEBUG */ /* g */ NULL, +#endif /* CONFIG_X86_REMOTE_DEBUG */ /* h */ NULL, /* i */ &sysrq_kill_op, /* j */ NULL, diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/char/tty_io.c 900-mjb1/drivers/char/tty_io.c --- 000-virgin/drivers/char/tty_io.c Fri May 30 19:02:05 2003 +++ 900-mjb1/drivers/char/tty_io.c Fri May 30 19:13:53 2003 @@ -91,6 +91,9 @@ #include #include #include +#ifdef CONFIG_GDB_CONSOLE +#include +#endif #include #include @@ -2367,6 +2370,9 @@ void __init console_init(void) (*call)(); call++; } +#ifdef CONFIG_GDB_CONSOLE + gdb_console_init(); +#endif } #ifdef CONFIG_VT diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/dump/Makefile 900-mjb1/drivers/dump/Makefile --- 000-virgin/drivers/dump/Makefile Wed Dec 31 16:00:00 1969 +++ 900-mjb1/drivers/dump/Makefile Fri May 30 19:24:57 2003 @@ -0,0 +1,13 @@ +# +# Makefile for the dump device drivers. +# + +dump-y := dump_setup.o dump_fmt.o dump_filters.o dump_scheme.o dump_execute.o +dump-$(CONFIG_X86) += dump_i386.o +dump-objs += $(dump-y) + +obj-$(CONFIG_CRASH_DUMP) += dump.o +obj-$(CONFIG_CRASH_DUMP_BLOCKDEV) += dump_blockdev.o +obj-$(CONFIG_CRASH_DUMP_NETDEV) += dump_netdev.o +obj-$(CONFIG_CRASH_DUMP_COMPRESS_RLE) += dump_rle.o +obj-$(CONFIG_CRASH_DUMP_COMPRESS_GZIP) += dump_gzip.o diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/dump/dump_blockdev.c 900-mjb1/drivers/dump/dump_blockdev.c --- 000-virgin/drivers/dump/dump_blockdev.c Wed Dec 31 16:00:00 1969 +++ 900-mjb1/drivers/dump/dump_blockdev.c Fri May 30 19:26:22 2003 @@ -0,0 +1,456 @@ +/* + * Implements the dump driver interface for saving a dump to + * a block device through the kernel's generic low level block i/o + * routines. + * + * Started: June 2002 - Mohamed Abbas + * Moved original lkcd kiobuf dump i/o code from dump_base.c + * to use generic dump device interfaces + * + * Sept 2002 - Bharata B. Rao + * Convert dump i/o to directly use bio instead of kiobuf for 2.5 + * + * Oct 2002 - Suparna Bhattacharya + * Rework to new dumpdev.h structures, implement open/close/ + * silence, misc fixes (blocknr removal, bio_add_page usage) + * + * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. 
+ * Copyright (C) 2002 International Business Machines Corp. + * + * This code is released under version 2 of the GNU GPL. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "dump_methods.h" + +extern void *dump_page_buf; + +/* The end_io callback for dump i/o completion */ +static int +dump_bio_end_io(struct bio *bio, unsigned int bytes_done, int error) +{ + struct dump_blockdev *dump_bdev; + + if (bio->bi_size) { + /* some bytes still left to transfer */ + return 1; /* not complete */ + } + + dump_bdev = (struct dump_blockdev *)bio->bi_private; + if (error) { + printk("IO error while writing the dump, aborting\n"); + } + + dump_bdev->err = error; + + /* no wakeup needed, since caller polls for completion */ + return 0; +} + +/* Check if the dump bio is already mapped to the specified buffer */ +static int +dump_block_map_valid(struct dump_blockdev *dev, struct page *page, + int len) +{ + struct bio *bio = dev->bio; + + if (!bio->bi_vcnt) + return 0; /* first time, not mapped */ + + + if ((bio_page(bio) != page) || (len != bio->bi_vcnt << PAGE_SHIFT)) + return 0; /* buffer not mapped */ + + /* quick check to decide if we need to redo bio_add_page */ + if (bdev_get_queue(bio->bi_bdev)->merge_bvec_fn) + return 0; /* device may have other restrictions */ + + return 1; /* already mapped */ +} + +/* + * Set up the dump bio for i/o from the specified buffer + * Return value indicates whether the full buffer could be mapped or not + */ +static int +dump_block_map(struct dump_blockdev *dev, void *buf, int len) +{ + struct page *page = virt_to_page(buf); + struct bio *bio = dev->bio; + unsigned long bsize = 0; + + bio->bi_bdev = dev->bdev; + bio->bi_sector = (dev->start_offset + dev->ddev.curr_offset) >> 9; + bio->bi_idx = 0; /* reset index to the beginning */ + + if (dump_block_map_valid(dev, page, len)) { + /* already mapped and usable rightaway */ + bio->bi_size = len; /* reset size to the whole bio */ + } else { + /* need to map the bio */ + bio->bi_size = 0; + bio->bi_vcnt = 0; + bsize = bdev_hardsect_size(bio->bi_bdev); + + /* first a few sanity checks */ + if (len < bsize) { + printk("map: len less than hardsect size \n"); + return -EINVAL; + } + + if ((unsigned long)buf & bsize) { + printk("map: not aligned \n"); + return -EINVAL; + } + + /* assume contig. page aligned low mem buffer( no vmalloc) */ + if ((page_address(page) != buf) || (len & (PAGE_SIZE - 1))) { + printk("map: invalid buffer alignment!\n"); + return -EINVAL; + } + /* finally we can go ahead and map it */ + while (bio->bi_size < len) + if (bio_add_page(bio, page++, PAGE_SIZE, 0) == 0) { + break; + } + + bio->bi_end_io = dump_bio_end_io; + bio->bi_private = dev; + } + + if (bio->bi_size != len) { + printk("map: bio size = %d not enough for len = %d!\n", + bio->bi_size, len); + return -E2BIG; + } + return 0; +} + +static void +dump_free_bio(struct bio *bio) +{ + if (bio) + kfree(bio->bi_io_vec); + kfree(bio); +} + +/* + * Prepares the dump device so we can take a dump later. + * The caller is expected to have filled up the kdev_id field in the + * block dump dev structure. + * + * At dump time when dump_block_write() is invoked it will be too + * late to recover, so as far as possible make sure obvious errors + * get caught right here and reported back to the caller. 
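dump_block_open() below expects 'arg' to carry the target block device packed as a dev_t; the configuration request itself comes from LKCD's user-facing ioctl/sysctl code, which is not part of this hunk. Purely as an illustration of the argument's shape (names hypothetical):

	/* configure dumping to /dev/sda5, i.e. block major 8, minor 5 */
	static int sample_configure_dump_device(void)
	{
		struct dump_dev *dev = dump_config.dumper->dev;

		return dev->ops->open(dev, (unsigned long)MKDEV(8, 5));
	}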
+ */ +static int +dump_block_open(struct dump_dev *dev, unsigned long arg) +{ + struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); + struct block_device *bdev; + int retval = 0; + struct bio_vec *bvec; + + /* make sure this is a valid block device */ + if (!arg) { + retval = -EINVAL; + goto err; + } + + /* get a corresponding block_dev struct for this */ + bdev = bdget((dev_t)arg); + if (!bdev) { + retval = -ENODEV; + goto err; + } + + /* get the block device opened */ + if ((retval = blkdev_get(bdev, O_RDWR | O_LARGEFILE, 0, BDEV_RAW))) { + goto err1; + } + + if ((dump_bdev->bio = kmalloc(sizeof(struct bio), GFP_KERNEL)) + == NULL) { + printk("Cannot allocate bio\n"); + retval = -ENOMEM; + goto err2; + } + + bio_init(dump_bdev->bio); + + if ((bvec = kmalloc(sizeof(struct bio_vec) * + (DUMP_BUFFER_SIZE >> PAGE_SHIFT), GFP_KERNEL)) == NULL) { + retval = -ENOMEM; + goto err3; + } + + /* assign the new dump dev structure */ + dump_bdev->kdev_id = to_kdev_t((dev_t)arg); + dump_bdev->bdev = bdev; + + /* make a note of the limit */ + dump_bdev->limit = bdev->bd_inode->i_size; + + /* now make sure we can map the dump buffer */ + dump_bdev->bio->bi_io_vec = bvec; + dump_bdev->bio->bi_max_vecs = DUMP_BUFFER_SIZE >> PAGE_SHIFT; + + retval = dump_block_map(dump_bdev, dump_config.dumper->dump_buf, + DUMP_BUFFER_SIZE); + + if (retval) { + printk("open: dump_block_map failed, ret %d\n", retval); + goto err3; + } + + printk("Block device (%d,%d) successfully configured for dumping\n", + major(dump_bdev->kdev_id), + minor(dump_bdev->kdev_id)); + + + /* after opening the block device, return */ + return retval; + +err3: dump_free_bio(dump_bdev->bio); + dump_bdev->bio = NULL; +err2: if (bdev) blkdev_put(bdev, BDEV_RAW); + goto err; +err1: if (bdev) bdput(bdev); + dump_bdev->bdev = NULL; +err: return retval; +} + +/* + * Close the dump device and release associated resources + * Invoked when unconfiguring the dump device. + */ +static int +dump_block_release(struct dump_dev *dev) +{ + struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); + + /* release earlier bdev if present */ + if (dump_bdev->bdev) { + blkdev_put(dump_bdev->bdev, BDEV_RAW); + dump_bdev->bdev = NULL; + } + + dump_free_bio(dump_bdev->bio); + dump_bdev->bio = NULL; + + return 0; +} + + +/* + * Prepare the dump device for use (silence any ongoing activity + * and quiesce state) when the system crashes. + */ +static int +dump_block_silence(struct dump_dev *dev) +{ + struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); + struct request_queue *q = bdev_get_queue(dump_bdev->bdev); + int ret; + + /* If we can't get request queue lock, refuse to take the dump */ + if (!spin_trylock(q->queue_lock)) + return -EBUSY; + + ret = elv_queue_empty(q); + spin_unlock(q->queue_lock); + + /* For now we assume we have the device to ourselves */ + /* Just a quick sanity check */ + if (!ret) { + /* i/o in flight - safer to quit */ + return -EBUSY; + } + + /* + * Move to a softer level of silencing where no spin_lock_irqs + * are held on other cpus + */ + dump_silence_level = DUMP_SOFT_SPIN_CPUS; + + __dump_irq_enable(); + + printk("Dumping to block device (%d,%d) on CPU %d ...\n", + major(dump_bdev->kdev_id), minor(dump_bdev->kdev_id), + smp_processor_id()); + + return 0; +} + +/* + * Invoked when dumping is done. This is the time to put things back + * (i.e. undo the effects of dump_block_silence) so the device is + * available for normal use. 
+ */ +static int +dump_block_resume(struct dump_dev *dev) +{ + __dump_irq_restore(); + return 0; +} + + +/* + * Seek to the specified offset in the dump device. + * Makes sure this is a valid offset, otherwise returns an error. + */ +static int +dump_block_seek(struct dump_dev *dev, loff_t off) +{ + struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); + loff_t offset = off + dump_bdev->start_offset; + + if (offset & ( PAGE_SIZE - 1)) { + printk("seek: non-page aligned\n"); + return -EINVAL; + } + + if (offset & (bdev_hardsect_size(dump_bdev->bdev) - 1)) { + printk("seek: not sector aligned \n"); + return -EINVAL; + } + + if (offset > dump_bdev->limit) { + printk("seek: not enough space left on device!\n"); + return -ENOSPC; + } + dev->curr_offset = off; + return 0; +} + +/* + * Write out a buffer after checking the device limitations, + * sector sizes, etc. Assumes the buffer is in directly mapped + * kernel address space (not vmalloc'ed). + * + * Returns: number of bytes written or -ERRNO. + */ +static int +dump_block_write(struct dump_dev *dev, void *buf, + unsigned long len) +{ + struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); + loff_t offset = dev->curr_offset + dump_bdev->start_offset; + int retval = -ENOSPC; + + if (offset >= dump_bdev->limit) { + printk("write: not enough space left on device!\n"); + goto out; + } + + /* don't write more blocks than our max limit */ + if (offset + len > dump_bdev->limit) + len = dump_bdev->limit - offset; + + + retval = dump_block_map(dump_bdev, buf, len); + if (retval){ + printk("write: dump_block_map failed! err %d\n", retval); + goto out; + } + + /* + * Write out the data to disk. + * Assumes the entire buffer mapped to a single bio, which we can + * submit and wait for io completion. In the future, may consider + * increasing the dump buffer size and submitting multiple bio s + * for better throughput. 
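Note that the write path only submits the bio and returns; completion is detected by polling dump_block_ready() further down, which keeps unplugging the queue until the end_io callback clears the -EAGAIN marker. The real sequencing lives in the generic dump_ll_write()/scheme code, not in this hunk; the sketch below only illustrates how write and ready are meant to pair:

	/* illustrative only: write one buffer and spin until the bio is done */
	static int sample_write_and_wait(struct dump_dev *dev, void *buf,
					 unsigned long len)
	{
		int err = dev->ops->write(dev, buf, len);

		if (err < 0)
			return err;

		do {
			err = dev->ops->ready(dev, buf);
		} while (err == -EAGAIN);

		return err;	/* 0 on success, or the bio's error code */
	}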
+ */ + dump_bdev->err = -EAGAIN; + submit_bio(WRITE, dump_bdev->bio); + + dump_bdev->ddev.curr_offset += len; + retval = len; + out: + return retval; +} + +/* + * Name: dump_block_ready() + * Func: check if the last dump i/o is over and ready for next request + */ +static int +dump_block_ready(struct dump_dev *dev, void *buf) +{ + struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); + request_queue_t *q = bdev_get_queue(dump_bdev->bio->bi_bdev); + + /* check for io completion */ + if (dump_bdev->err == -EAGAIN) { + q->unplug_fn(q); + return -EAGAIN; + } + + if (dump_bdev->err) { + printk("dump i/o err\n"); + return dump_bdev->err; + } + + return 0; +} + + +struct dump_dev_ops dump_blockdev_ops = { + .open = dump_block_open, + .release = dump_block_release, + .silence = dump_block_silence, + .resume = dump_block_resume, + .seek = dump_block_seek, + .write = dump_block_write, + /* .read not implemented */ + .ready = dump_block_ready +}; + +static struct dump_blockdev default_dump_blockdev = { + .ddev = {.type_name = "blockdev", .ops = &dump_blockdev_ops, + .curr_offset = 0}, + /* + * leave enough room for the longest swap header possibly written + * written by mkswap (likely the largest page size supported by + * the arch + */ + .start_offset = DUMP_HEADER_OFFSET, + .err = 0 + /* assume the rest of the fields are zeroed by default */ +}; + +struct dump_blockdev *dump_blockdev = &default_dump_blockdev; + +static int __init +dump_blockdev_init(void) +{ + if (dump_register_device(&dump_blockdev->ddev) < 0) { + printk("block device driver registration failed\n"); + return -1; + } + + printk("block device driver for LKCD registered\n"); + return 0; +} + +static void __exit +dump_blockdev_cleanup(void) +{ + dump_unregister_device(&dump_blockdev->ddev); + printk("block device driver for LKCD unregistered\n"); +} + +MODULE_AUTHOR("LKCD Development Team "); +MODULE_DESCRIPTION("Block Dump Driver for Linux Kernel Crash Dump (LKCD)"); +MODULE_LICENSE("GPL"); + +module_init(dump_blockdev_init); +module_exit(dump_blockdev_cleanup); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/dump/dump_execute.c 900-mjb1/drivers/dump/dump_execute.c --- 000-virgin/drivers/dump/dump_execute.c Wed Dec 31 16:00:00 1969 +++ 900-mjb1/drivers/dump/dump_execute.c Fri May 30 19:24:56 2003 @@ -0,0 +1,124 @@ +/* + * The file has the common/generic dump execution code + * + * Started: Oct 2002 - Suparna Bhattacharya + * Split and rewrote high level dump execute code to make use + * of dump method interfaces. + * + * Derived from original code in dump_base.c created by + * Matt Robinson ) + * + * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. + * Copyright (C) 2002 International Business Machines Corp. + * + * Assumes dumper and dump config settings are in place + * (invokes corresponding dumper specific routines as applicable) + * + * This code is released under version 2 of the GNU GPL. 
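dump_generic_execute() below brackets the dump with DUMP_BEGIN and DUMP_END notifications on dump_notifier_list (the data pointer is &dump_device). As an illustrative sketch, not part of the patch, a driver that wants to quiesce itself around a crash dump could hook that chain; the freeze/thaw helpers here are hypothetical:

	static int sample_dump_event(struct notifier_block *self,
				     unsigned long event, void *data)
	{
		if (event == DUMP_BEGIN)
			sample_device_freeze();
		else if (event == DUMP_END)
			sample_device_thaw();
		return NOTIFY_DONE;
	}

	static struct notifier_block sample_dump_nb = {
		.notifier_call = sample_dump_event,
	};

	/* at init time: */
	notifier_chain_register(&dump_notifier_list, &sample_dump_nb);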
+ */ +#include +#include +#include +#include "dump_methods.h" + +extern int dump_device; + +struct notifier_block *dump_notifier_list; /* dump started/ended callback */ + +/* Dump progress indicator */ +void +dump_speedo(int i) +{ + static const char twiddle[4] = { '|', '\\', '-', '/' }; + printk("%c\b", twiddle[i&3]); +} + +/* Make the device ready and write out the header */ +int dump_begin(void) +{ + int err = 0; + + /* dump_dev = dump_config.dumper->dev; */ + dumper_reset(); + if ((err = dump_dev_silence())) { + /* quiesce failed, can't risk continuing */ + /* Todo/Future: switch to alternate dump scheme if possible */ + printk("dump silence dev failed ! error %d\n", err); + return err; + } + + pr_debug("Writing dump header\n"); + if ((err = dump_update_header())) { + printk("dump update header failed ! error %d\n", err); + dump_dev_resume(); + return err; + } + + dump_config.dumper->curr_offset = DUMP_BUFFER_SIZE; + + return 0; +} + +/* + * Write the dump terminator, a final header update and let go of + * exclusive use of the device for dump. + */ +int dump_complete(void) +{ + int ret = 0; + + if (dump_config.level != DUMP_LEVEL_HEADER) { + if ((ret = dump_update_end_marker())) { + printk("dump update end marker error %d\n", ret); + } + if ((ret = dump_update_header())) { + printk("dump update header error %d\n", ret); + } + } + ret = dump_dev_resume(); + + return ret; +} + +/* Saves all dump data */ +int dump_execute_savedump(void) +{ + int ret = 0; + + if ((ret = dump_begin())) { + return ret; + } + + if (dump_config.level != DUMP_LEVEL_HEADER) { + ret = dump_sequencer(); + } + dump_complete(); + + return ret; +} + +/* Does all the real work: Capture and save state */ +int dump_generic_execute(const char *panic_str, const struct pt_regs *regs) +{ + int ret = 0; + + if ((ret = dump_configure_header(panic_str, regs))) { + printk("dump config header failed ! error %d\n", ret); + return ret; + } + + /* tell interested parties that a dump is about to start */ + notifier_call_chain(&dump_notifier_list, DUMP_BEGIN, &dump_device); + + if (dump_config.level != DUMP_LEVEL_NONE) + ret = dump_execute_savedump(); + + pr_debug("dumped %ld blocks of %d bytes each\n", + dump_config.dumper->count, DUMP_BUFFER_SIZE); + + /* tell interested parties that a dump has completed */ + notifier_call_chain(&dump_notifier_list, DUMP_END, &dump_device); + + return ret; +} diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/dump/dump_filters.c 900-mjb1/drivers/dump/dump_filters.c --- 000-virgin/drivers/dump/dump_filters.c Wed Dec 31 16:00:00 1969 +++ 900-mjb1/drivers/dump/dump_filters.c Fri May 30 19:24:56 2003 @@ -0,0 +1,145 @@ +/* + * Default filters to select data to dump for various passes. + * + * Started: Oct 2002 - Suparna Bhattacharya + * Split and rewrote default dump selection logic to generic dump + * method interfaces + * Derived from a portion of dump_base.c created by + * Matt Robinson ) + * + * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. + * Copyright (C) 2002 International Business Machines Corp. + * + * Used during single-stage dumping and during stage 1 of the 2-stage scheme + * (Stage 2 of the 2-stage scheme uses the fully transparent filters + * i.e. passthru filters in dump_overlay.c) + * + * Future: Custom selective dump may involve a different set of filters. + * + * This code is released under version 2 of the GNU GPL. 
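The selectors defined below share the signature selector(pass, loc, sz), where loc is really a struct page pointer for the first page of a small run and sz is the run length in bytes; the filter table at the end of the file wires one selector per dump pass. The walk itself is done by the generic scheme code (dump_scheme.c, not in this hunk); a rough, purely illustrative sketch of that loop, with dump_save_range() standing in for the real "add this data" path:

	static void sample_walk_one_pass(int pass,
					 struct dump_data_filter *filter)
	{
		unsigned long pfn;
		const unsigned long run = 1;	/* pages checked per call */

		for (pfn = 0; pfn < max_pfn; pfn += run) {
			struct page *page = pfn_to_page(pfn);

			if (filter->selector(pass, (unsigned long)page,
					     run * PAGE_SIZE))
				dump_save_range(page, run);	/* hypothetical */
		}
	}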
+ */ + +#include +#include +#include +#include +#include +#include "dump_methods.h" + + +/* Copied from mm/bootmem.c - FIXME */ +/* return the number of _pages_ that will be allocated for the boot bitmap */ +unsigned long dump_calc_bootmap_pages (void) +{ + unsigned long mapsize; + unsigned long pages = num_physpages; + + mapsize = (pages+7)/8; + mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; + mapsize >>= PAGE_SHIFT; + + return mapsize; +} + + +#define DUMP_PFN_SAFETY_MARGIN 1024 /* 4 MB */ +/* temporary */ +extern unsigned long min_low_pfn; + +/* to track all used (compound + zero order) pages */ +#define PageInuse(p) (PageCompound(p) || page_count(p)) + +int dump_low_page(struct page *p) +{ + return page_to_pfn(p) < min_low_pfn + dump_calc_bootmap_pages() + + 1 + DUMP_PFN_SAFETY_MARGIN; +} + +static inline int kernel_page(struct page *p) +{ + /* FIXME: Need to exclude hugetlb pages. Clue: reserved but inuse */ + return PageReserved(p) || (!PageLRU(p) && PageInuse(p)); +} + +static inline int user_page(struct page *p) +{ + return PageInuse(p) && (!PageReserved(p) && PageLRU(p)); +} + +static inline int unreferenced_page(struct page *p) +{ + return !PageInuse(p) && !PageReserved(p); +} + + +/* loc marks the beginning of a range of pages */ +int dump_filter_kernpages(int pass, unsigned long loc, unsigned long sz) +{ + struct page *page = (struct page *)loc; + /* if any of the pages is a kernel page, select this set */ + while (sz) { + if (dump_low_page(page) || kernel_page(page)) + return 1; + sz -= PAGE_SIZE; + page++; + } + return 0; +} + + +/* loc marks the beginning of a range of pages */ +int dump_filter_userpages(int pass, unsigned long loc, unsigned long sz) +{ + struct page *page = (struct page *)loc; + int ret = 0; + /* select if the set has any user page, and no kernel pages */ + while (sz) { + if (user_page(page) && !dump_low_page(page)) { + ret = 1; + } else if (kernel_page(page) || dump_low_page(page)) { + return 0; + } + page++; + sz -= PAGE_SIZE; + } + return ret; +} + + + +/* loc marks the beginning of a range of pages */ +int dump_filter_unusedpages(int pass, unsigned long loc, unsigned long sz) +{ + struct page *page = (struct page *)loc; + + /* select if the set does not have any used pages */ + while (sz) { + if (!unreferenced_page(page) || dump_low_page(page)) { + return 0; + } + page++; + sz -= PAGE_SIZE; + } + return 1; +} + +/* dummy: last (non-existent) pass */ +int dump_filter_none(int pass, unsigned long loc, unsigned long sz) +{ + return 0; +} + +/* TBD: resolve level bitmask ? */ +struct dump_data_filter dump_filter_table[] = { + { .name = "kern", .selector = dump_filter_kernpages, + .level_mask = DUMP_MASK_KERN}, + { .name = "user", .selector = dump_filter_userpages, + .level_mask = DUMP_MASK_USED}, + { .name = "unused", .selector = dump_filter_unusedpages, + .level_mask = DUMP_MASK_UNUSED}, + { .name = "none", .selector = dump_filter_none, + .level_mask = DUMP_MASK_REST}, + { .name = "", .selector = NULL, .level_mask = 0} +}; + diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/dump/dump_fmt.c 900-mjb1/drivers/dump/dump_fmt.c --- 000-virgin/drivers/dump/dump_fmt.c Wed Dec 31 16:00:00 1969 +++ 900-mjb1/drivers/dump/dump_fmt.c Fri May 30 19:24:56 2003 @@ -0,0 +1,395 @@ +/* + * Implements the routines which handle the format specific + * aspects of dump for the default dump format. 
+ * + * Used in single stage dumping and stage 1 of soft-boot based dumping + * Saves data in LKCD (lcrash) format + * + * Previously a part of dump_base.c + * + * Started: Oct 2002 - Suparna Bhattacharya + * Split off and reshuffled LKCD dump format code around generic + * dump method interfaces. + * + * Derived from original code created by + * Matt Robinson ) + * + * Contributions from SGI, IBM, HP, MCL, and others. + * + * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2000 - 2002 TurboLinux, Inc. All rights reserved. + * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. + * Copyright (C) 2002 International Business Machines Corp. + * + * This code is released under version 2 of the GNU GPL. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "dump_methods.h" + +/* + * SYSTEM DUMP LAYOUT + * + * System dumps are currently the combination of a dump header and a set + * of data pages which contain the system memory. The layout of the dump + * (for full dumps) is as follows: + * + * +-----------------------------+ + * | generic dump header | + * +-----------------------------+ + * | architecture dump header | + * +-----------------------------+ + * | page header | + * +-----------------------------+ + * | page data | + * +-----------------------------+ + * | page header | + * +-----------------------------+ + * | page data | + * +-----------------------------+ + * | | | + * | | | + * | | | + * | | | + * | V | + * +-----------------------------+ + * | PAGE_END header | + * +-----------------------------+ + * + * There are two dump headers, the first which is architecture + * independent, and the other which is architecture dependent. This + * allows different architectures to dump different data structures + * which are specific to their chipset, CPU, etc. + * + * After the dump headers come a succession of dump page headers along + * with dump pages. The page header contains information about the page + * size, any flags associated with the page (whether it's compressed or + * not), and the address of the page. After the page header is the page + * data, which is either compressed (or not). Each page of data is + * dumped in succession, until the final dump header (PAGE_END) is + * placed at the end of the dump, assuming the dump device isn't out + * of space. + * + * This mechanism allows for multiple compression types, different + * types of data structures, different page ordering, etc., etc., etc. + * It's a very straightforward mechanism for dumping system memory. 
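Given that layout, a minimal reader can walk a dump image by checking the generic header and then iterating page headers until it finds the PAGE_END marker. The sketch below is illustrative only: it assumes the page data starts one DUMP_BUFFER_SIZE header block into the image (as dump_begin() in dump_execute.c arranges), ignores compression and DUMP_HEADER_OFFSET, and handle_page() is a hypothetical consumer:

	static void sample_walk_dump_image(char *img)
	{
		struct __dump_header *dh = (struct __dump_header *)img;
		char *p = img + DUMP_BUFFER_SIZE;
		struct __dump_page *dp;

		if (dh->dh_magic_number != DUMP_MAGIC_NUMBER)
			return;			/* not a dump image */

		for (;;) {
			dp = (struct __dump_page *)p;
			if (dp->dp_flags & DUMP_DH_END)
				break;
			/* dp_address: physical address of the block,
			 * dp_size: (possibly compressed) payload length */
			handle_page(dp->dp_address, p + sizeof(*dp),
				    dp->dp_size, dp->dp_flags);
			p += sizeof(*dp) + dp->dp_size;
		}
	}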
+ */ + +struct __dump_header dump_header; /* the primary dump header */ +struct __dump_header_asm dump_header_asm; /* the arch-specific dump header */ + +/* + * Set up common header fields (mainly the arch indep section) + * Per-cpu state is handled by lcrash_save_context + */ +static int lcrash_init_dump_header(const char *panic_str) +{ + struct timeval dh_time; + /* make sure the dump header isn't TOO big */ + if ((sizeof(struct __dump_header) + + sizeof(struct __dump_header_asm)) > DUMP_BUFFER_SIZE) { + printk("lcrash_init_header(): combined " + "headers larger than DUMP_BUFFER_SIZE!\n"); + return -E2BIG; + } + + /* initialize the dump headers to zero */ + memset(&dump_header, 0, sizeof(dump_header)); + memset(&dump_header_asm, 0, sizeof(dump_header_asm)); + + /* configure dump header values */ + dump_header.dh_magic_number = DUMP_MAGIC_NUMBER; + dump_header.dh_version = DUMP_VERSION_NUMBER; + dump_header.dh_memory_start = PAGE_OFFSET; + dump_header.dh_memory_end = DUMP_MAGIC_NUMBER; + dump_header.dh_header_size = sizeof(struct __dump_header); + dump_header.dh_page_size = PAGE_SIZE; + dump_header.dh_dump_level = dump_config.level; + dump_header.dh_current_task = (unsigned long) current; + dump_header.dh_dump_compress = dump_config.dumper->compress-> + compress_type; + dump_header.dh_dump_flags = dump_config.flags; + dump_header.dh_dump_device = dump_config.dumper->dev->device_id; + +#if DUMP_DEBUG >= 6 + dump_header.dh_num_bytes = 0; +#endif + dump_header.dh_num_dump_pages = 0; + do_gettimeofday(&dh_time); + dump_header.dh_time.tv_sec = dh_time.tv_sec; + dump_header.dh_time.tv_usec = dh_time.tv_usec; + + memcpy((void *)&(dump_header.dh_utsname_sysname), + (const void *)&(system_utsname.sysname), __NEW_UTS_LEN + 1); + memcpy((void *)&(dump_header.dh_utsname_nodename), + (const void *)&(system_utsname.nodename), __NEW_UTS_LEN + 1); + memcpy((void *)&(dump_header.dh_utsname_release), + (const void *)&(system_utsname.release), __NEW_UTS_LEN + 1); + memcpy((void *)&(dump_header.dh_utsname_version), + (const void *)&(system_utsname.version), __NEW_UTS_LEN + 1); + memcpy((void *)&(dump_header.dh_utsname_machine), + (const void *)&(system_utsname.machine), __NEW_UTS_LEN + 1); + memcpy((void *)&(dump_header.dh_utsname_domainname), + (const void *)&(system_utsname.domainname), __NEW_UTS_LEN + 1); + + if (panic_str) { + memcpy((void *)&(dump_header.dh_panic_string), + (const void *)panic_str, DUMP_PANIC_LEN); + } + + dump_header_asm.dha_magic_number = DUMP_ASM_MAGIC_NUMBER; + dump_header_asm.dha_version = DUMP_ASM_VERSION_NUMBER; + dump_header_asm.dha_header_size = sizeof(dump_header_asm); + + dump_header_asm.dha_smp_num_cpus = num_online_cpus(); + pr_debug("smp_num_cpus in header %d\n", + dump_header_asm.dha_smp_num_cpus); + + dump_header_asm.dha_dumping_cpu = smp_processor_id(); + + return 0; +} + + +int dump_lcrash_configure_header(const char *panic_str, + const struct pt_regs *regs) +{ + int retval = 0; + + if ((retval = lcrash_init_dump_header(panic_str))) + return retval; + + /* capture register states for all processors */ + dump_save_this_cpu(regs); + __dump_save_other_cpus(); /* side effect:silence cpus */ + + /* configure architecture-specific dump header values */ + if ((retval = __dump_configure_header(regs))) + return retval; + + dump_config.dumper->header_dirty++; + return 0; +} + +/* save register and task context */ +void dump_lcrash_save_context(int cpu, const struct pt_regs *regs, + struct task_struct *tsk) +{ + dump_header_asm.dha_smp_current_task[cpu] = (uint32_t) tsk; + + 
__dump_save_regs(&dump_header_asm.dha_smp_regs[cpu], regs); + + /* take a snapshot of the stack */ + /* doing this enables us to tolerate slight drifts on this cpu */ + if (dump_header_asm.dha_stack[cpu]) { + memcpy((void *)dump_header_asm.dha_stack[cpu], + tsk->thread_info, THREAD_SIZE); + } + dump_header_asm.dha_stack_ptr[cpu] = (uint32_t)(tsk->thread_info); +} + +/* write out the header */ +int dump_write_header(void) +{ + int retval = 0, size; + void *buf = dump_config.dumper->dump_buf; + + /* accounts for DUMP_HEADER_OFFSET if applicable */ + if ((retval = dump_dev_seek(0))) { + printk("Unable to seek to dump header offset: %d\n", + retval); + return retval; + } + + memcpy(buf, (void *)&dump_header, sizeof(dump_header)); + size = sizeof(dump_header); + memcpy(buf + size, (void *)&dump_header_asm, sizeof(dump_header_asm)); + size += sizeof(dump_header_asm); + /* assuming header is dump buffer size always ? */ + retval = dump_ll_write(buf , DUMP_BUFFER_SIZE); + + if (retval < DUMP_BUFFER_SIZE) + return (retval >= 0) ? ENOSPC : retval; + + return 0; +} + +int dump_generic_update_header(void) +{ + int err = 0; + + if (dump_config.dumper->header_dirty) { + if ((err = dump_write_header())) { + printk("dump write header failed !err %d\n", err); + } else { + dump_config.dumper->header_dirty = 0; + } + } + + return err; +} + +static inline int is_curr_stack_page(struct page *page, unsigned long size) +{ + unsigned long thread_addr = (unsigned long)current_thread_info(); + unsigned long addr = (unsigned long)page_address(page); + + return !PageHighMem(page) && (addr < thread_addr + THREAD_SIZE) + && (addr + size > thread_addr); +} + +static inline int is_dump_page(struct page *page, unsigned long size) +{ + unsigned long addr = (unsigned long)page_address(page); + unsigned long dump_buf = (unsigned long)dump_config.dumper->dump_buf; + + return !PageHighMem(page) && (addr < dump_buf + DUMP_BUFFER_SIZE) + && (addr + size > dump_buf); +} + +int dump_allow_compress(struct page *page, unsigned long size) +{ + /* + * Don't compress the page if any part of it overlaps + * with the current stack or dump buffer (since the contents + * in these could be changing while compression is going on) + */ + return !is_curr_stack_page(page, size) && !is_dump_page(page, size); +} + +void lcrash_init_pageheader(struct __dump_page *dp, struct page *page, + unsigned long sz) +{ + memset(dp, sizeof(struct __dump_page), 0); + dp->dp_flags = 0; + dp->dp_size = 0; + if (sz > 0) + dp->dp_address = page_to_pfn(page) << PAGE_SHIFT; + +#if DUMP_DEBUG > 6 + dp->dp_page_index = dump_header.dh_num_dump_pages; + dp->dp_byte_offset = dump_header.dh_num_bytes + DUMP_BUFFER_SIZE + + DUMP_HEADER_OFFSET; /* ?? */ +#endif /* DUMP_DEBUG */ +} + +int dump_lcrash_add_data(unsigned long loc, unsigned long len) +{ + struct page *page = (struct page *)loc; + void *addr, *buf = dump_config.dumper->curr_buf; + struct __dump_page *dp = (struct __dump_page *)buf; + int bytes, size; + + if (buf > dump_config.dumper->dump_buf + DUMP_BUFFER_SIZE) + return -ENOMEM; + + lcrash_init_pageheader(dp, page, len); + buf += sizeof(struct __dump_page); + + while (len) { + addr = kmap_atomic(page, KM_DUMP); + size = bytes = (len > PAGE_SIZE) ? 
PAGE_SIZE : len; + /* check for compression */ + if (dump_allow_compress(page, bytes)) { + size = dump_compress_data((char *)addr, bytes, (char *)buf); + } + /* set the compressed flag if the page did compress */ + if (size && (size < bytes)) { + dp->dp_flags |= DUMP_DH_COMPRESSED; + } else { + /* compression failed -- default to raw mode */ + dp->dp_flags |= DUMP_DH_RAW; + memcpy(buf, addr, bytes); + size = bytes; + } + /* memset(buf, 'A', size); temporary: testing only !! */ + kunmap_atomic(addr, KM_DUMP); + dp->dp_size += size; + buf += size; + len -= bytes; + page++; + } + + /* now update the header */ +#if DUMP_DEBUG > 6 + dump_header.dh_num_bytes += dp->dp_size + sizeof(*dp); +#endif + dump_header.dh_num_dump_pages++; + dump_config.dumper->header_dirty++; + + dump_config.dumper->curr_buf = buf; + + return len; +} + +int dump_lcrash_update_end_marker(void) +{ + struct __dump_page *dp = + (struct __dump_page *)dump_config.dumper->curr_buf; + unsigned long left; + int ret = 0; + + lcrash_init_pageheader(dp, NULL, 0); + dp->dp_flags |= DUMP_DH_END; /* tbd: truncation test ? */ + + /* now update the header */ +#if DUMP_DEBUG > 6 + dump_header.dh_num_bytes += sizeof(*dp); +#endif + dump_config.dumper->curr_buf += sizeof(*dp); + left = dump_config.dumper->curr_buf - dump_config.dumper->dump_buf; + + printk("\n"); + + while (left) { + if ((ret = dump_dev_seek(dump_config.dumper->curr_offset))) { + printk("Seek failed at offset 0x%llx\n", + dump_config.dumper->curr_offset); + return ret; + } + + if (DUMP_BUFFER_SIZE > left) + memset(dump_config.dumper->curr_buf, 'm', + DUMP_BUFFER_SIZE - left); + + if ((ret = dump_ll_write(dump_config.dumper->dump_buf, + DUMP_BUFFER_SIZE)) < DUMP_BUFFER_SIZE) { + return (ret < 0) ? ret : -ENOSPC; + } + + dump_config.dumper->curr_offset += DUMP_BUFFER_SIZE; + + if (left > DUMP_BUFFER_SIZE) { + left -= DUMP_BUFFER_SIZE; + memcpy(dump_config.dumper->dump_buf, + dump_config.dumper->dump_buf + DUMP_BUFFER_SIZE, left); + dump_config.dumper->curr_buf -= DUMP_BUFFER_SIZE; + } else { + left = 0; + } + } + return 0; +} + + +/* Default Formatter (lcrash) */ +struct dump_fmt_ops dump_fmt_lcrash_ops = { + .configure_header = dump_lcrash_configure_header, + .update_header = dump_generic_update_header, + .save_context = dump_lcrash_save_context, + .add_data = dump_lcrash_add_data, + .update_end_marker = dump_lcrash_update_end_marker +}; + +struct dump_fmt dump_fmt_lcrash = { + .name = "lcrash", + .ops = &dump_fmt_lcrash_ops +}; + diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/dump/dump_gzip.c 900-mjb1/drivers/dump/dump_gzip.c --- 000-virgin/drivers/dump/dump_gzip.c Wed Dec 31 16:00:00 1969 +++ 900-mjb1/drivers/dump/dump_gzip.c Fri May 30 19:24:56 2003 @@ -0,0 +1,118 @@ +/* + * GZIP Compression functions for kernel crash dumps. + * + * Created by: Matt Robinson (yakker@sourceforge.net) + * Copyright 2001 Matt D. Robinson. All rights reserved. + * + * This code is released under version 2 of the GNU GPL. + */ + +/* header files */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void *deflate_workspace; + +/* + * Name: dump_compress_gzip() + * Func: Compress a DUMP_PAGE_SIZE page using gzip-style algorithms (the. + * deflate functions similar to what's used in PPP). 
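+ *
+ * Returns the compressed length, 0 on a zlib error, or oldsize when
+ * the page did not shrink.  Callers are expected to fall back to a
+ * raw copy in those cases, as dump_lcrash_add_data() does:
+ *
+ *	size = dump_compress_data((char *)addr, bytes, (char *)buf);
+ *	if (size && (size < bytes))
+ *		dp->dp_flags |= DUMP_DH_COMPRESSED;
+ *	else {
+ *		dp->dp_flags |= DUMP_DH_RAW;
+ *		memcpy(buf, addr, bytes);
+ *		size = bytes;
+ *	}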
+ */ +static u16 +dump_compress_gzip(const u8 *old, u16 oldsize, u8 *new, u16 newsize) +{ + /* error code and dump stream */ + int err; + z_stream dump_stream; + + dump_stream.workspace = deflate_workspace; + + if ((err = zlib_deflateInit(&dump_stream, Z_BEST_COMPRESSION)) != Z_OK) { + /* fall back to RLE compression */ + printk("dump_compress_gzip(): zlib_deflateInit() " + "failed (%d)!\n", err); + return 0; + } + + /* use old (page of memory) and size (DUMP_PAGE_SIZE) as in-streams */ + dump_stream.next_in = (u8 *) old; + dump_stream.avail_in = oldsize; + + /* out streams are new (dpcpage) and new size (DUMP_DPC_PAGE_SIZE) */ + dump_stream.next_out = new; + dump_stream.avail_out = newsize; + + /* deflate the page -- check for error */ + err = zlib_deflate(&dump_stream, Z_FINISH); + if (err != Z_STREAM_END) { + /* zero is return code here */ + (void)zlib_deflateEnd(&dump_stream); + printk("dump_compress_gzip(): zlib_deflate() failed (%d)!\n", + err); + return 0; + } + + /* let's end the deflated compression stream */ + if ((err = zlib_deflateEnd(&dump_stream)) != Z_OK) { + printk("dump_compress_gzip(): zlib_deflateEnd() " + "failed (%d)!\n", err); + } + + /* return the compressed byte total (if it's smaller) */ + if (dump_stream.total_out >= oldsize) { + return oldsize; + } + return dump_stream.total_out; +} + +/* setup the gzip compression functionality */ +static struct __dump_compress dump_gzip_compression = { + .compress_type = DUMP_COMPRESS_GZIP, + .compress_func = dump_compress_gzip, + .compress_name = "GZIP", +}; + +/* + * Name: dump_compress_gzip_init() + * Func: Initialize gzip as a compression mechanism. + */ +static int __init +dump_compress_gzip_init(void) +{ + deflate_workspace = vmalloc(zlib_deflate_workspacesize()); + if (!deflate_workspace) { + printk("dump_compress_gzip_init(): Failed to " + "alloc %d bytes for deflate workspace\n", + zlib_deflate_workspacesize()); + return -ENOMEM; + } + dump_register_compression(&dump_gzip_compression); + return 0; +} + +/* + * Name: dump_compress_gzip_cleanup() + * Func: Remove gzip as a compression mechanism. + */ +static void __exit +dump_compress_gzip_cleanup(void) +{ + vfree(deflate_workspace); + dump_unregister_compression(DUMP_COMPRESS_GZIP); +} + +/* module initialization */ +module_init(dump_compress_gzip_init); +module_exit(dump_compress_gzip_cleanup); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("LKCD Development Team "); +MODULE_DESCRIPTION("Gzip compression module for crash dump driver"); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/dump/dump_i386.c 900-mjb1/drivers/dump/dump_i386.c --- 000-virgin/drivers/dump/dump_i386.c Wed Dec 31 16:00:00 1969 +++ 900-mjb1/drivers/dump/dump_i386.c Fri May 30 19:24:56 2003 @@ -0,0 +1,358 @@ +/* + * Architecture specific (i386) functions for Linux crash dumps. + * + * Created by: Matt Robinson (yakker@sgi.com) + * + * Copyright 1999 Silicon Graphics, Inc. All rights reserved. + * + * 2.3 kernel modifications by: Matt D. Robinson (yakker@turbolinux.com) + * Copyright 2000 TurboLinux, Inc. All rights reserved. + * + * This code is released under version 2 of the GNU GPL. + */ + +/* + * The hooks for dumping the kernel virtual memory to disk are in this + * file. Any time a modification is made to the virtual memory mechanism, + * these routines must be changed to use the new mechanisms. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "dump_methods.h" +#include + +#include +#include +#include +#include +#include + +static __s32 saved_irq_count; /* saved preempt_count() flags */ + +static int +alloc_dha_stack(void) +{ + int i; + void *ptr; + + if (dump_header_asm.dha_stack[0]) + return 0; + + ptr = vmalloc(THREAD_SIZE * num_online_cpus()); + if (!ptr) { + printk("vmalloc for dha_stacks failed\n"); + return -ENOMEM; + } + + for (i = 0; i < num_online_cpus(); i++) { + dump_header_asm.dha_stack[i] = (u32)((unsigned long)ptr + + (i * THREAD_SIZE)); + } + return 0; +} + +static int +free_dha_stack(void) +{ + if (dump_header_asm.dha_stack[0]) { + vfree((void *)dump_header_asm.dha_stack[0]); + dump_header_asm.dha_stack[0] = 0; + } + return 0; +} + + +void +__dump_save_regs(struct pt_regs *dest_regs, const struct pt_regs *regs) +{ + *dest_regs = *regs; + + /* In case of panic dumps, we collects regs on entry to panic. + * so, we shouldn't 'fix' ssesp here again. But it is hard to + * tell just looking at regs whether ssesp need fixing. We make + * this decision by looking at xss in regs. If we have better + * means to determine that ssesp are valid (by some flag which + * tells that we are here due to panic dump), then we can use + * that instead of this kludge. + */ + if (!user_mode(regs)) { + if ((0xffff & regs->xss) == __KERNEL_DS) + /* already fixed up */ + return; + dest_regs->esp = (unsigned long)&(regs->esp); + __asm__ __volatile__ ("movw %%ss, %%ax;" + :"=a"(dest_regs->xss)); + } +} + + +#ifdef CONFIG_SMP +extern unsigned long irq_affinity[]; +extern irq_desc_t irq_desc[]; +extern void dump_send_ipi(void); + +static int dump_expect_ipi[NR_CPUS]; +static atomic_t waiting_for_dump_ipi; +static unsigned long saved_affinity[NR_IRQS]; + +extern void stop_this_cpu(void *); /* exported by i386 kernel */ + +static int +dump_nmi_callback(struct pt_regs *regs, int cpu) +{ + if (!dump_expect_ipi[cpu]) + return 0; + + dump_expect_ipi[cpu] = 0; + + dump_save_this_cpu(regs); + atomic_dec(&waiting_for_dump_ipi); + + level_changed: + switch (dump_silence_level) { + case DUMP_HARD_SPIN_CPUS: /* Spin until dump is complete */ + while (dump_oncpu) { + barrier(); /* paranoia */ + if (dump_silence_level != DUMP_HARD_SPIN_CPUS) + goto level_changed; + + cpu_relax(); /* kill time nicely */ + } + break; + + case DUMP_HALT_CPUS: /* Execute halt */ + stop_this_cpu(NULL); + break; + + case DUMP_SOFT_SPIN_CPUS: + /* Mark the task so it spins in schedule */ + set_tsk_thread_flag(current, TIF_NEED_RESCHED); + break; + } + + return 1; +} + +/* save registers on other processors */ +void +__dump_save_other_cpus(void) +{ + int i, cpu = smp_processor_id(); + int other_cpus = num_online_cpus()-1; + + if (other_cpus > 0) { + atomic_set(&waiting_for_dump_ipi, other_cpus); + + for (i = 0; i < NR_CPUS; i++) { + dump_expect_ipi[i] = (i != cpu && cpu_online(i)); + } + + /* short circuit normal NMI handling temporarily */ + set_nmi_callback(dump_nmi_callback); + wmb(); + + dump_send_ipi(); + /* may be we dont need to wait for NMI to be processed. + just write out the header at the end of dumping, if + this IPI is not processed until then, there probably + is a problem and we just fail to capture state of + other cpus. */ + while(atomic_read(&waiting_for_dump_ipi) > 0) { + cpu_relax(); + } + + unset_nmi_callback(); + } +} + +/* + * Routine to save the old irq affinities and change affinities of all irqs to + * the dumping cpu. 
+ */ +static void +set_irq_affinity(void) +{ + int i; + int cpu = smp_processor_id(); + + memcpy(saved_affinity, irq_affinity, NR_IRQS * sizeof(unsigned long)); + for (i = 0; i < NR_IRQS; i++) { + if (irq_desc[i].handler == NULL) + continue; + irq_affinity[i] = 1UL << cpu; + if (irq_desc[i].handler->set_affinity != NULL) + irq_desc[i].handler->set_affinity(i, irq_affinity[i]); + } +} + +/* + * Restore old irq affinities. + */ +static void +reset_irq_affinity(void) +{ + int i; + + memcpy(irq_affinity, saved_affinity, NR_IRQS * sizeof(unsigned long)); + for (i = 0; i < NR_IRQS; i++) { + if (irq_desc[i].handler == NULL) + continue; + if (irq_desc[i].handler->set_affinity != NULL) + irq_desc[i].handler->set_affinity(i, saved_affinity[i]); + } +} + +#else /* !CONFIG_SMP */ +#define set_irq_affinity() do { } while (0) +#define reset_irq_affinity() do { } while (0) +#define save_other_cpu_states() do { } while (0) +#endif /* !CONFIG_SMP */ + +/* + * Kludge - dump from interrupt context is unreliable (Fixme) + * + * We do this so that softirqs initiated for dump i/o + * get processed and we don't hang while waiting for i/o + * to complete or in any irq synchronization attempt. + * + * This is not quite legal of course, as it has the side + * effect of making all interrupts & softirqs triggered + * while dump is in progress complete before currently + * pending softirqs and the currently executing interrupt + * code. + */ +static inline void +irq_bh_save(void) +{ + saved_irq_count = irq_count(); + preempt_count() &= ~(HARDIRQ_MASK|SOFTIRQ_MASK); +} + +static inline void +irq_bh_restore(void) +{ + preempt_count() |= saved_irq_count; +} + +/* + * Name: __dump_irq_enable + * Func: Reset system so interrupts are enabled. + * This is used for dump methods that require interrupts + * Eventually, all methods will have interrupts disabled + * and this code can be removed. + * + * Change irq affinities + * Re-enable interrupts + */ +void +__dump_irq_enable(void) +{ + set_irq_affinity(); + irq_bh_save(); + local_irq_enable(); +} + +/* + * Name: __dump_irq_restore + * Func: Resume the system state in an architecture-specific way. + + */ +void +__dump_irq_restore(void) +{ + local_irq_disable(); + reset_irq_affinity(); + irq_bh_restore(); +} + +/* + * Name: __dump_configure_header() + * Func: Meant to fill in arch specific header fields except per-cpu state + * already captured via __dump_save_context for all CPUs. + */ +int +__dump_configure_header(const struct pt_regs *regs) +{ + return (0); +} + +/* + * Name: dump_nmi_handler + * Func: Called from notify_die + */ +static int dump_die_event(struct notifier_block *this, + unsigned long event, + void *arg) +{ + const struct die_args *args = (const struct die_args *) arg; + + switch (event) { + case DIE_PANIC: + case DIE_OOPS: + case DIE_WATCHDOG: + dump_execute(args->str, args->regs); + break; + } + return NOTIFY_DONE; + +} + +static struct notifier_block dump_die_block = { + .notifier_call = dump_die_event, +}; + +/* + * Name: __dump_init() + * Func: Initialize the dumping routine process. + */ +void +__dump_init(uint64_t local_memory_start) +{ + /* hook into NMI, Panic, and OOPS */ + register_die_notifier(&dump_die_block); +} + +/* + * Name: __dump_open() + * Func: Open the dump device (architecture specific). + */ +void +__dump_open(void) +{ + alloc_dha_stack(); +} + +/* + * Name: __dump_cleanup() + * Func: Free any architecture specific data structures. This is called + * when the dump module is being removed. 
+ */ +void +__dump_cleanup(void) +{ + free_dha_stack(); + + unregister_die_notifier(&dump_die_block); +} + +extern int pfn_is_ram(unsigned long); + +/* + * Name: __dump_page_valid() + * Func: Check if page is valid to dump. + */ +int +__dump_page_valid(unsigned long index) +{ + if (!pfn_valid(index)) + return 0; + + return pfn_is_ram(index); +} + diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/dump/dump_methods.h 900-mjb1/drivers/dump/dump_methods.h --- 000-virgin/drivers/dump/dump_methods.h Wed Dec 31 16:00:00 1969 +++ 900-mjb1/drivers/dump/dump_methods.h Fri May 30 19:24:56 2003 @@ -0,0 +1,314 @@ +/* + * Generic interfaces for flexible system dump + * + * Started: Oct 2002 - Suparna Bhattacharya (suparna@in.ibm.com) + * + * Copyright (C) 2002 International Business Machines Corp. + * + * This code is released under version 2 of the GNU GPL. + */ + +#ifndef _LINUX_DUMP_METHODS_H +#define _LINUX_DUMP_METHODS_H + +/* + * Inspired by Matt Robinson's suggestion of introducing dump + * methods as a way to enable different crash dump facilities to + * coexist where each employs its own scheme or dumping policy. + * + * The code here creates a framework for flexible dump by defining + * a set of methods and providing associated helpers that differentiate + * between the underlying mechanism (how to dump), overall scheme + * (sequencing of stages and data dumped and associated quiescing), + * output format (what the dump output looks like), target type + * (where to save the dump; see dumpdev.h), and selection policy + * (state/data to dump). + * + * These sets of interfaces can be mixed and matched to build a + * dumper suitable for a given situation, allowing for + * flexibility as well appropriate degree of code reuse. + * For example all features and options of lkcd (including + * granular selective dumping in the near future) should be + * available even when say, the 2 stage soft-boot based mechanism + * is used for taking disruptive dumps. + * + * Todo: Additionally modules or drivers may supply their own + * custom dumpers which extend dump with module specific + * information or hardware state, and can even tweak the + * mechanism when it comes to saving state relevant to + * them. 
+ */ + +#include +#include +#include +#include + +#define MAX_PASSES 6 +#define MAX_DEVS 4 + + +/* To customise selection of pages to be dumped in a given pass/group */ +struct dump_data_filter{ + char name[32]; + int (*selector)(int, unsigned long, unsigned long); + ulong level_mask; /* dump level(s) for which this filter applies */ + loff_t start, end; /* location range applicable */ +}; + + +/* + * Determined by the kind of dump mechanism and appropriate + * overall scheme + */ +struct dump_scheme_ops { + /* sets aside memory, inits data structures etc */ + int (*configure)(unsigned long devid); + /* releases resources */ + int (*unconfigure)(void); + + /* ordering of passes, invoking iterator */ + int (*sequencer)(void); + /* iterates over system data, selects and acts on data to dump */ + int (*iterator)(int, int (*)(unsigned long, unsigned long), + struct dump_data_filter *); + /* action when data is selected for dump */ + int (*save_data)(unsigned long, unsigned long); + /* action when data is to be excluded from dump */ + int (*skip_data)(unsigned long, unsigned long); + /* policies for space, multiple dump devices etc */ + int (*write_buffer)(void *, unsigned long); +}; + +struct dump_scheme { + /* the name serves as an anchor to locate the scheme after reboot */ + char name[32]; + struct dump_scheme_ops *ops; + struct list_head list; +}; + +/* Quiescing/Silence levels (controls IPI callback behaviour) */ +extern enum dump_silence_levels { + DUMP_SOFT_SPIN_CPUS = 1, + DUMP_HARD_SPIN_CPUS = 2, + DUMP_HALT_CPUS = 3, +} dump_silence_level; + +/* determined by the dump (file) format */ +struct dump_fmt_ops { + /* build header */ + int (*configure_header)(const char *, const struct pt_regs *); + int (*update_header)(void); /* update header and write it out */ + /* save curr context */ + void (*save_context)(int, const struct pt_regs *, + struct task_struct *); + /* typically called by the save_data action */ + /* add formatted data to the dump buffer */ + int (*add_data)(unsigned long, unsigned long); + int (*update_end_marker)(void); +}; + +struct dump_fmt { + unsigned long magic; + char name[32]; /* lcrash, crash, elf-core etc */ + struct dump_fmt_ops *ops; + struct list_head list; +}; + +/* + * Modules will be able add their own data capture schemes by + * registering their own dumpers. Typically they would use the + * primary dumper as a template and tune it with their routines. + * Still Todo. 
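+ *
+ * When that lands, a module-supplied dumper might look roughly like
+ * the hypothetical sketch below; dump_register_dumper() and
+ * my_custom_fmt do not exist (yet), while dumper_singlestage is the
+ * default template declared further down in this header:
+ *
+ *	static struct dumper my_dumper;
+ *
+ *	static int __init my_dumper_init(void)
+ *	{
+ *		my_dumper = dumper_singlestage;
+ *		strcpy(my_dumper.name, "my-dumper");
+ *		my_dumper.fmt = &my_custom_fmt;
+ *		return dump_register_dumper(&my_dumper);
+ *	}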
+ */ + +/* The combined dumper profile (mechanism, scheme, dev, fmt) */ +struct dumper { + char name[32]; /* singlestage, overlay (stg1), passthru(stg2), pull */ + struct dump_scheme *scheme; + struct dump_fmt *fmt; + struct __dump_compress *compress; + struct dump_data_filter *filter; + struct dump_dev *dev; + /* state valid only for active dumper(s) - per instance */ + /* run time state/context */ + int curr_pass; + unsigned long count; + loff_t curr_offset; /* current logical offset into dump device */ + loff_t curr_loc; /* current memory location */ + void *curr_buf; /* current position in the dump buffer */ + void *dump_buf; /* starting addr of dump buffer */ + int header_dirty; /* whether the header needs to be written out */ + struct list_head dumper_list; /* links to other dumpers */ +}; + +/* Starting point to get to the current configured state */ +struct dump_config { + ulong level; + ulong flags; + struct dumper *dumper; + struct list_head dump_dev_list; +}; + +extern struct dump_config dump_config; + + +/* Wrappers that invoke the methods for the current (active) dumper */ + +/* Scheme operations */ + +static inline int dump_sequencer(void) +{ + return dump_config.dumper->scheme->ops->sequencer(); +} + +static inline int dump_iterator(int pass, int (*action)(unsigned long, + unsigned long), struct dump_data_filter *filter) +{ + return dump_config.dumper->scheme->ops->iterator(pass, action, filter); +} + +#define dump_save_data dump_config.dumper->scheme->ops->save_data +#define dump_skip_data dump_config.dumper->scheme->ops->skip_data + +static inline int dump_write_buffer(void *buf, unsigned long len) +{ + return dump_config.dumper->scheme->ops->write_buffer(buf, len); +} + +static inline int dump_configure(unsigned long devid) +{ + return dump_config.dumper->scheme->ops->configure(devid); +} + +static inline int dump_unconfigure(void) +{ + return dump_config.dumper->scheme->ops->unconfigure(); +} + +/* Format operations */ + +static inline int dump_configure_header(const char *panic_str, + const struct pt_regs *regs) +{ + return dump_config.dumper->fmt->ops->configure_header(panic_str, regs); +} + +static inline void dump_save_context(int cpu, const struct pt_regs *regs, + struct task_struct *tsk) +{ + dump_config.dumper->fmt->ops->save_context(cpu, regs, tsk); +} + +static inline int dump_save_this_cpu(const struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + + dump_save_context(cpu, regs, current); + return 1; +} + +static inline int dump_update_header(void) +{ + return dump_config.dumper->fmt->ops->update_header(); +} + +static inline int dump_update_end_marker(void) +{ + return dump_config.dumper->fmt->ops->update_end_marker(); +} + +static inline int dump_add_data(unsigned long loc, unsigned long sz) +{ + return dump_config.dumper->fmt->ops->add_data(loc, sz); +} + +/* Compression operation */ +static inline int dump_compress_data(char *src, int slen, char *dst) +{ + return dump_config.dumper->compress->compress_func(src, slen, + dst, DUMP_DPC_PAGE_SIZE); +} + + +/* Prototypes of some default implementations of dump methods */ + +extern struct __dump_compress dump_none_compression; + +/* Default scheme methods (dump_scheme.c) */ + +extern int dump_generic_sequencer(void); +extern int dump_page_iterator(int pass, int (*action)(unsigned long, unsigned + long), struct dump_data_filter *filter); +extern int dump_generic_save_data(unsigned long loc, unsigned long sz); +extern int dump_generic_skip_data(unsigned long loc, unsigned long sz); +extern int 
dump_generic_write_buffer(void *buf, unsigned long len); +extern int dump_generic_configure(unsigned long); +extern int dump_generic_unconfigure(void); + +/* Default scheme template */ +extern struct dump_scheme dump_scheme_singlestage; + +/* Default dump format methods */ + +extern int dump_lcrash_configure_header(const char *panic_str, + const struct pt_regs *regs); +extern void dump_lcrash_save_context(int cpu, const struct pt_regs *regs, + struct task_struct *tsk); +extern int dump_generic_update_header(void); +extern int dump_lcrash_add_data(unsigned long loc, unsigned long sz); +extern int dump_lcrash_update_end_marker(void); + +/* Default format (lcrash) template */ +extern struct dump_fmt dump_fmt_lcrash; + +/* Default dump selection filter table */ + +/* + * Entries listed in order of importance and correspond to passes + * The last entry (with a level_mask of zero) typically reflects data that + * won't be dumped -- this may for example be used to identify data + * that will be skipped for certain so the corresponding memory areas can be + * utilized as scratch space. + */ +extern struct dump_data_filter dump_filter_table[]; + +/* Some pre-defined dumpers */ +extern struct dumper dumper_singlestage; + +/* These are temporary */ +#define DUMP_MASK_HEADER DUMP_LEVEL_HEADER +#define DUMP_MASK_KERN DUMP_LEVEL_KERN +#define DUMP_MASK_USED DUMP_LEVEL_USED +#define DUMP_MASK_UNUSED DUMP_LEVEL_ALL_RAM +#define DUMP_MASK_REST 0 /* dummy for now */ + +/* Helpers - move these to dump.h later ? */ + +int dump_generic_execute(const char *panic_str, const struct pt_regs *regs); +extern int dump_ll_write(void *buf, unsigned long len); + +static inline void dumper_reset(void) +{ + dump_config.dumper->curr_buf = dump_config.dumper->dump_buf; + dump_config.dumper->curr_loc = 0; + dump_config.dumper->curr_offset = 0; + dump_config.dumper->count = 0; + dump_config.dumper->curr_pass = 0; +} + +/* + * May later be moulded to perform boot-time allocations so we can dump + * earlier during bootup + */ +static inline void *dump_alloc_mem(unsigned long size) +{ + return kmalloc(size, GFP_KERNEL); +} + +static inline void dump_free_mem(void *buf) +{ + kfree(buf); +} + +#endif /* _LINUX_DUMP_METHODS_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/dump/dump_netdev.c 900-mjb1/drivers/dump/dump_netdev.c --- 000-virgin/drivers/dump/dump_netdev.c Wed Dec 31 16:00:00 1969 +++ 900-mjb1/drivers/dump/dump_netdev.c Fri May 30 19:24:57 2003 @@ -0,0 +1,858 @@ +/* + * Implements the dump driver interface for saving a dump via network + * interface. + * + * Some of this code has been taken/adapted from Ingo Molnar's netconsole + * code. LKCD team expresses its thanks to Ingo. + * + * Started: June 2002 - Mohamed Abbas + * Adapted netconsole code to implement LKCD dump over the network. + * + * Nov 2002 - Bharata B. Rao + * Innumerable code cleanups, simplification and some fixes. + * Netdump configuration done by ioctl instead of using module parameters. + * + * Copyright (C) 2001 Ingo Molnar + * Copyright (C) 2002 International Business Machines Corp. + * + * This code is released under version 2 of the GNU GPL. 
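+ *
+ * For reference, every reply packet built by dump_send_skb() below
+ * carries a small netdump header in front of the payload, inside the
+ * UDP data (HEADER_LEN, defined elsewhere, presumably covers these
+ * 1 + 3 * 4 = 13 leading bytes):
+ *
+ *	offset 0         : NETCONSOLE_VERSION        (1 byte)
+ *	offset 1         : reply->nr,   big endian   (4 bytes)
+ *	offset 5         : reply->code, big endian   (4 bytes)
+ *	offset 9         : reply->info, big endian   (4 bytes)
+ *	offset HEADER_LEN: message payload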
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static int startup_handshake; +static int page_counter; +static struct net_device *dump_ndev; +static struct in_device *dump_in_dev; +static u16 source_port, target_port; +static u32 source_ip, target_ip; +static unsigned char daddr[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff} ; +static spinlock_t dump_skb_lock = SPIN_LOCK_UNLOCKED; +static int dump_nr_skbs; +static struct sk_buff *dump_skb; +static unsigned long flags_global; +static int netdump_in_progress; +static char device_name[IFNAMSIZ]; + +/* + * security depends on the trusted path between the netconsole + * server and netconsole client, since none of the packets are + * encrypted. The random magic number protects the protocol + * against spoofing. + */ +static u64 dump_magic; + +#define MAX_UDP_CHUNK 1460 +#define MAX_PRINT_CHUNK (MAX_UDP_CHUNK-HEADER_LEN) + +/* + * We maintain a small pool of fully-sized skbs, + * to make sure the message gets out even in + * extreme OOM situations. + */ +#define DUMP_MAX_SKBS 32 + +#define MAX_SKB_SIZE \ + (MAX_UDP_CHUNK + sizeof(struct udphdr) + \ + sizeof(struct iphdr) + sizeof(struct ethhdr)) + +static void +dump_refill_skbs(void) +{ + struct sk_buff *skb; + unsigned long flags; + + spin_lock_irqsave(&dump_skb_lock, flags); + while (dump_nr_skbs < DUMP_MAX_SKBS) { + skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC); + if (!skb) + break; + if (dump_skb) + skb->next = dump_skb; + else + skb->next = NULL; + dump_skb = skb; + dump_nr_skbs++; + } + spin_unlock_irqrestore(&dump_skb_lock, flags); +} + +static struct +sk_buff * dump_get_skb(void) +{ + struct sk_buff *skb; + unsigned long flags; + + spin_lock_irqsave(&dump_skb_lock, flags); + skb = dump_skb; + if (skb) { + dump_skb = skb->next; + skb->next = NULL; + dump_nr_skbs--; + } + spin_unlock_irqrestore(&dump_skb_lock, flags); + + return skb; +} + +/* + * Zap completed output skbs. + */ +static void +zap_completion_queue(void) +{ + int count; + unsigned long flags; + int cpu = smp_processor_id(); + + count=0; + if (softnet_data[cpu].completion_queue) { + struct sk_buff *clist; + + local_irq_save(flags); + clist = softnet_data[cpu].completion_queue; + softnet_data[cpu].completion_queue = NULL; + local_irq_restore(flags); + + while (clist != NULL) { + struct sk_buff *skb = clist; + clist = clist->next; + __kfree_skb(skb); + count++; + if (count > 10000) + printk("Error in sk list\n"); + } + } +} + +static void +dump_send_skb(struct net_device *dev, const char *msg, unsigned int msg_len, + reply_t *reply) +{ + int once = 1; + int total_len, eth_len, ip_len, udp_len, count = 0; + struct sk_buff *skb; + struct udphdr *udph; + struct iphdr *iph; + struct ethhdr *eth; + + udp_len = msg_len + HEADER_LEN + sizeof(*udph); + ip_len = eth_len = udp_len + sizeof(*iph); + total_len = eth_len + ETH_HLEN; + +repeat_loop: + zap_completion_queue(); + if (dump_nr_skbs < DUMP_MAX_SKBS) + dump_refill_skbs(); + + skb = alloc_skb(total_len, GFP_ATOMIC); + if (!skb) { + skb = dump_get_skb(); + if (!skb) { + count++; + if (once && (count == 1000000)) { + printk("possibly FATAL: out of netconsole " + "skbs!!! 
will keep retrying.\n"); + once = 0; + } + dev->poll_controller(dev); + goto repeat_loop; + } + } + + atomic_set(&skb->users, 1); + skb_reserve(skb, total_len - msg_len - HEADER_LEN); + skb->data[0] = NETCONSOLE_VERSION; + + put_unaligned(htonl(reply->nr), (u32 *) (skb->data + 1)); + put_unaligned(htonl(reply->code), (u32 *) (skb->data + 5)); + put_unaligned(htonl(reply->info), (u32 *) (skb->data + 9)); + + memcpy(skb->data + HEADER_LEN, msg, msg_len); + skb->len += msg_len + HEADER_LEN; + + udph = (struct udphdr *) skb_push(skb, sizeof(*udph)); + udph->source = source_port; + udph->dest = target_port; + udph->len = htons(udp_len); + udph->check = 0; + + iph = (struct iphdr *)skb_push(skb, sizeof(*iph)); + + iph->version = 4; + iph->ihl = 5; + iph->tos = 0; + iph->tot_len = htons(ip_len); + iph->id = 0; + iph->frag_off = 0; + iph->ttl = 64; + iph->protocol = IPPROTO_UDP; + iph->check = 0; + iph->saddr = source_ip; + iph->daddr = target_ip; + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + + eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); + + eth->h_proto = htons(ETH_P_IP); + memcpy(eth->h_source, dev->dev_addr, dev->addr_len); + memcpy(eth->h_dest, daddr, dev->addr_len); + + count=0; +repeat_poll: + spin_lock(&dev->xmit_lock); + dev->xmit_lock_owner = smp_processor_id(); + + count++; + + + if (netif_queue_stopped(dev)) { + dev->xmit_lock_owner = -1; + spin_unlock(&dev->xmit_lock); + + dev->poll_controller(dev); + zap_completion_queue(); + + + goto repeat_poll; + } + + dev->hard_start_xmit(skb, dev); + + dev->xmit_lock_owner = -1; + spin_unlock(&dev->xmit_lock); +} + +static unsigned short +udp_check(struct udphdr *uh, int len, unsigned long saddr, unsigned long daddr, + unsigned long base) +{ + return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base); +} + +static int +udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, + unsigned short ulen, u32 saddr, u32 daddr) +{ + if (uh->check == 0) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else if (skb->ip_summed == CHECKSUM_HW) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) + return 0; + skb->ip_summed = CHECKSUM_NONE; + } + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, + IPPROTO_UDP, 0); + /* Probably, we should checksum udp header (it should be in cache + * in any case) and data in tiny packets (< rx copybreak). + */ + return 0; +} + +static __inline__ int +__udp_checksum_complete(struct sk_buff *skb) +{ + return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, + skb->csum)); +} + +static __inline__ +int udp_checksum_complete(struct sk_buff *skb) +{ + return skb->ip_summed != CHECKSUM_UNNECESSARY && + __udp_checksum_complete(skb); +} + +int new_req = 0; +static req_t req; + +static int +dump_rx_hook(struct sk_buff *skb) +{ + int proto; + struct iphdr *iph; + struct udphdr *uh; + __u32 len, saddr, daddr, ulen; + req_t *__req; + + /* + * First check if were are dumping or doing startup handshake, if + * not quickly return. 
+ */ + if (!netdump_in_progress) + return NET_RX_SUCCESS; + + if (skb->dev->type != ARPHRD_ETHER) + goto out; + + proto = ntohs(skb->mac.ethernet->h_proto); + if (proto != ETH_P_IP) + goto out; + + if (skb->pkt_type == PACKET_OTHERHOST) + goto out; + + if (skb_shared(skb)) + goto out; + + /* IP header correctness testing: */ + iph = (struct iphdr *)skb->data; + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto out; + + if (iph->ihl < 5 || iph->version != 4) + goto out; + + if (!pskb_may_pull(skb, iph->ihl*4)) + goto out; + + if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) + goto out; + + len = ntohs(iph->tot_len); + if (skb->len < len || len < iph->ihl*4) + goto out; + + saddr = iph->saddr; + daddr = iph->daddr; + if (iph->protocol != IPPROTO_UDP) + goto out; + + if (source_ip != daddr) + goto out; + + if (target_ip != saddr) + goto out; + + len -= iph->ihl*4; + uh = (struct udphdr *)(((char *)iph) + iph->ihl*4); + ulen = ntohs(uh->len); + + if (ulen != len || ulen < (sizeof(*uh) + sizeof(*__req))) + goto out; + + if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0) + goto out; + + if (udp_checksum_complete(skb)) + goto out; + + if (source_port != uh->dest) + goto out; + + if (target_port != uh->source) + goto out; + + __req = (req_t *)(uh + 1); + if ((ntohl(__req->command) != COMM_GET_MAGIC) && + (ntohl(__req->command) != COMM_HELLO) && + (ntohl(__req->command) != COMM_START_WRITE_NETDUMP_ACK) && + (ntohl(__req->command) != COMM_START_NETDUMP_ACK) && + (memcmp(&__req->magic, &dump_magic, sizeof(dump_magic)) != 0)) + goto out; + + req.magic = ntohl(__req->magic); + req.command = ntohl(__req->command); + req.from = ntohl(__req->from); + req.to = ntohl(__req->to); + req.nr = ntohl(__req->nr); + new_req = 1; +out: + return NET_RX_DROP; +} + +static void +dump_send_mem(struct net_device *dev, req_t *req, const char* buff, size_t len) +{ + int i; + + int nr_chunks = len/1024; + reply_t reply; + + reply.nr = req->nr; + reply.info = 0; + + if ( nr_chunks <= 0) + nr_chunks = 1; + for (i = 0; i < nr_chunks; i++) { + unsigned int offset = i*1024; + reply.code = REPLY_MEM; + reply.info = offset; + dump_send_skb(dev, buff + offset, 1024, &reply); + } +} + +/* + * This function waits for the client to acknowledge the receipt + * of the netdump startup reply, with the possibility of packets + * getting lost. We resend the startup packet if no ACK is received, + * after a 1 second delay. + * + * (The client can test the success of the handshake via the HELLO + * command, and send ACKs until we enter netdump mode.) 
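+ *
+ * The exchange implemented below is roughly:
+ *
+ *	dumping kernel                        netdump server
+ *	--------------                        --------------
+ *	REPLY_START_NETDUMP (resent)    --->
+ *	                                <---  COMM_HELLO or
+ *	                                      COMM_START_NETDUMP_ACK
+ *	then, for each buffer written later:
+ *	REPLY_START_WRITE_NETDUMP       --->
+ *	                                <---  COMM_SEND_MEM
+ *	REPLY_MEM chunks of the buffer  --->
+ *	                                <---  COMM_EXIT or
+ *	                                      COMM_START_WRITE_NETDUMP_ACK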
+ */ +static int +dump_handshake(struct dump_dev *net_dev) +{ + char tmp[200]; + reply_t reply; + int i, j; + + if (startup_handshake) { + sprintf(tmp, "NETDUMP start, waiting for start-ACK.\n"); + reply.code = REPLY_START_NETDUMP; + reply.nr = 0; + reply.info = 0; + } else { + sprintf(tmp, "NETDUMP start, waiting for start-ACK.\n"); + reply.code = REPLY_START_WRITE_NETDUMP; + reply.nr = net_dev->curr_offset; + reply.info = net_dev->curr_offset; + } + + /* send 300 handshake packets before declaring failure */ + for (i = 0; i < 300; i++) { + dump_send_skb(dump_ndev, tmp, strlen(tmp), &reply); + + /* wait 1 sec */ + for (j = 0; j < 10000; j++) { + udelay(100); + dump_ndev->poll_controller(dump_ndev); + zap_completion_queue(); + if (new_req) + break; + } + + /* + * if there is no new request, try sending the handshaking + * packet again + */ + if (!new_req) + continue; + + /* + * check if the new request is of the expected type, + * if so, return, else try sending the handshaking + * packet again + */ + if (startup_handshake) { + if (req.command == COMM_HELLO || req.command == + COMM_START_NETDUMP_ACK) { + return 0; + } else { + new_req = 0; + continue; + } + } else { + if (req.command == COMM_SEND_MEM) { + return 0; + } else { + new_req = 0; + continue; + } + } + } + return -1; +} + +static ssize_t +do_netdump(struct dump_dev *net_dev, const char* buff, size_t len) +{ + reply_t reply; + char tmp[200]; + ssize_t ret = 0; + int repeatCounter, counter, total_loop; + + netdump_in_progress = 1; + + if (dump_handshake(net_dev) < 0) { + printk("network dump failed due to handshake failure\n"); + goto out; + } + + /* + * Ideally startup handshake should be done during dump configuration, + * i.e., in dump_net_open(). This will be done when I figure out + * the dependency between startup handshake, subsequent write and + * various commands wrt to net-server. 
+ */ + if (startup_handshake) + startup_handshake = 0; + + counter = 0; + repeatCounter = 0; + total_loop = 0; + while (1) { + if (!new_req) { + dump_ndev->poll_controller(dump_ndev); + zap_completion_queue(); + } + if (!new_req) { + repeatCounter++; + + if (repeatCounter > 5) { + counter++; + if (counter > 10000) { + if (total_loop >= 100000) { + printk("Time OUT LEAVE NOW\n"); + goto out; + } else { + total_loop++; + printk("Try number %d out of " + "10 before Time Out\n", + total_loop); + } + } + mdelay(1); + repeatCounter = 0; + } + continue; + } + repeatCounter = 0; + counter = 0; + total_loop = 0; + new_req = 0; + switch (req.command) { + case COMM_NONE: + break; + + case COMM_SEND_MEM: + dump_send_mem(dump_ndev, &req, buff, len); + break; + + case COMM_EXIT: + case COMM_START_WRITE_NETDUMP_ACK: + ret = len; + goto out; + + case COMM_HELLO: + sprintf(tmp, "Hello, this is netdump version " + "0.%02d\n", NETCONSOLE_VERSION); + reply.code = REPLY_HELLO; + reply.nr = req.nr; + reply.info = net_dev->curr_offset; + dump_send_skb(dump_ndev, tmp, strlen(tmp), &reply); + break; + + case COMM_GET_PAGE_SIZE: + sprintf(tmp, "PAGE_SIZE: %ld\n", PAGE_SIZE); + reply.code = REPLY_PAGE_SIZE; + reply.nr = req.nr; + reply.info = PAGE_SIZE; + dump_send_skb(dump_ndev, tmp, strlen(tmp), &reply); + break; + + case COMM_GET_NR_PAGES: + reply.code = REPLY_NR_PAGES; + reply.nr = req.nr; + reply.info = num_physpages; + reply.info = page_counter; + sprintf(tmp, "Number of pages: %ld\n", num_physpages); + dump_send_skb(dump_ndev, tmp, strlen(tmp), &reply); + break; + + case COMM_GET_MAGIC: + reply.code = REPLY_MAGIC; + reply.nr = req.nr; + reply.info = NETCONSOLE_VERSION; + dump_send_skb(dump_ndev, (char *)&dump_magic, + sizeof(dump_magic), &reply); + break; + + default: + reply.code = REPLY_ERROR; + reply.nr = req.nr; + reply.info = req.command; + sprintf(tmp, "Got unknown command code %d!\n", + req.command); + dump_send_skb(dump_ndev, tmp, strlen(tmp), &reply); + break; + } + } +out: + netdump_in_progress = 0; + return ret; +} + +static int +dump_validate_config(void) +{ + source_ip = dump_in_dev->ifa_list->ifa_local; + if (!source_ip) { + printk("network device %s has no local address, " + "aborting.\n", device_name); + return -1; + } + +#define IP(x) ((unsigned char *)&source_ip)[x] + printk("Source %d.%d.%d.%d", IP(0), IP(1), IP(2), IP(3)); +#undef IP + + if (!source_port) { + printk("source_port parameter not specified, aborting.\n"); + return -1; + } + printk(":%i\n", source_port); + source_port = htons(source_port); + + if (!target_ip) { + printk("target_ip parameter not specified, aborting.\n"); + return -1; + } + +#define IP(x) ((unsigned char *)&target_ip)[x] + printk("Target %d.%d.%d.%d", IP(0), IP(1), IP(2), IP(3)); +#undef IP + + if (!target_port) { + printk("target_port parameter not specified, aborting.\n"); + return -1; + } + printk(":%i\n", target_port); + target_port = htons(target_port); + + printk("Target Ethernet Address %02x:%02x:%02x:%02x:%02x:%02x", + daddr[0], daddr[1], daddr[2], daddr[3], daddr[4], daddr[5]); + + if ((daddr[0] & daddr[1] & daddr[2] & daddr[3] & daddr[4] & + daddr[5]) == 255) + printk("(Broadcast)"); + printk("\n"); + return 0; +} + +/* + * Prepares the dump device so we can take a dump later. + * Validates the netdump configuration parameters. + * + * TODO: Network connectivity check should be done here. 
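+ *
+ * A rough, hypothetical sketch of the user space side (the dump
+ * device node and the call that hands over the interface name are
+ * defined outside this file; only the DIOS* ioctls come from this
+ * driver, and all values shown are placeholders):
+ *
+ *	int fd = open("/dev/dump", O_RDONLY);
+ *	ioctl(fd, DIOSTARGETIP, target_ip);
+ *	ioctl(fd, DIOSTARGETPORT, target_port);
+ *	ioctl(fd, DIOSSOURCEPORT, source_port);
+ *	ioctl(fd, DIOSETHADDR, server_mac);
+ *
+ * after which configuring the dump device with the interface name
+ * (e.g. "eth0") ends up here in dump_net_open().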
+ */ +static int +dump_net_open(struct dump_dev *net_dev, unsigned long arg) +{ + int retval = 0; + + /* get the interface name */ + if (copy_from_user(device_name, (void *)arg, IFNAMSIZ)) + return -EFAULT; + + if (!(dump_ndev = dev_get_by_name(device_name))) { + printk("network device %s does not exist, aborting.\n", + device_name); + return -ENODEV; + } + + if (!dump_ndev->poll_controller) { + printk("network device %s does not implement polling yet, " + "aborting.\n", device_name); + retval = -1; /* return proper error */ + goto err1; + } + + if (!(dump_in_dev = in_dev_get(dump_ndev))) { + printk("network device %s is not an IP protocol device, " + "aborting.\n", device_name); + retval = -EINVAL; + goto err1; + } + + if ((retval = dump_validate_config()) < 0) + goto err2; + + net_dev->curr_offset = 0; + printk("Network device %s successfully configured for dumping\n", + device_name); + return retval; +err2: + in_dev_put(dump_in_dev); +err1: + dev_put(dump_ndev); + return retval; +} + +/* + * Close the dump device and release associated resources + * Invoked when unconfiguring the dump device. + */ +static int +dump_net_release(struct dump_dev *net_dev) +{ + if (dump_in_dev) + in_dev_put(dump_in_dev); + if (dump_ndev) + dev_put(dump_ndev); + return 0; +} + +/* + * Prepare the dump device for use (silence any ongoing activity + * and quiesce state) when the system crashes. + */ +static int +dump_net_silence(struct dump_dev *net_dev) +{ + local_irq_save(flags_global); + dump_ndev->rx_hook = dump_rx_hook; + startup_handshake = 1; + net_dev->curr_offset = 0; + printk("Dumping to network device %s on CPU %d ...\n", device_name, + smp_processor_id()); + return 0; +} + +/* + * Invoked when dumping is done. This is the time to put things back + * (i.e. undo the effects of dump_block_silence) so the device is + * available for normal use. + */ +static int +dump_net_resume(struct dump_dev *net_dev) +{ + int indx; + reply_t reply; + char tmp[200]; + + if (!dump_ndev) + return (0); + + sprintf(tmp, "NETDUMP end.\n"); + for( indx = 0; indx < 6; indx++) { + reply.code = REPLY_END_NETDUMP; + reply.nr = 0; + reply.info = 0; + dump_send_skb(dump_ndev, tmp, strlen(tmp), &reply); + } + printk("NETDUMP END!\n"); + local_irq_restore(flags_global); + dump_ndev->rx_hook = NULL; + startup_handshake = 0; + return 0; +} + +/* + * Seek to the specified offset in the dump device. + * Makes sure this is a valid offset, otherwise returns an error. 
+ */ +static int +dump_net_seek(struct dump_dev *net_dev, loff_t off) +{ + net_dev->curr_offset = off; + return 0; +} + +/* + * + */ +static int +dump_net_write(struct dump_dev *net_dev, void *buf, unsigned long len) +{ + int cnt, i, off; + ssize_t ret; + + cnt = len/ PAGE_SIZE; + + for (i = 0; i < cnt; i++) { + off = i* PAGE_SIZE; + ret = do_netdump(net_dev, buf+off, PAGE_SIZE); + if (ret <= 0) + return -1; + net_dev->curr_offset = net_dev->curr_offset + PAGE_SIZE; + } + return len; +} + +/* + * check if the last dump i/o is over and ready for next request + */ +static int +dump_net_ready(struct dump_dev *net_dev, void *buf) +{ + return 0; +} + +/* + * ioctl function used for configuring network dump + */ +static int +dump_net_ioctl(struct dump_dev *net_dev, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case DIOSTARGETIP: + target_ip = arg; + break; + case DIOSTARGETPORT: + target_port = (u16)arg; + break; + case DIOSSOURCEPORT: + source_port = (u16)arg; + break; + case DIOSETHADDR: + return copy_from_user(daddr, (void *)arg, 6); + break; + case DIOGTARGETIP: + case DIOGTARGETPORT: + case DIOGSOURCEPORT: + case DIOGETHADDR: + break; + default: + return -EINVAL; + } + return 0; +} + +struct dump_dev_ops dump_netdev_ops = { + .open = dump_net_open, + .release = dump_net_release, + .silence = dump_net_silence, + .resume = dump_net_resume, + .seek = dump_net_seek, + .write = dump_net_write, + /* .read not implemented */ + .ready = dump_net_ready, + .ioctl = dump_net_ioctl +}; + +static struct dump_dev default_dump_netdev = { + .type_name = "networkdev", + .ops = &dump_netdev_ops, + .curr_offset = 0 +}; + +static int __init +dump_netdev_init(void) +{ + default_dump_netdev.curr_offset = 0; + + if (dump_register_device(&default_dump_netdev) < 0) { + printk("network dump device driver registration failed\n"); + return -1; + } + printk("network device driver for LKCD registered\n"); + + get_random_bytes(&dump_magic, sizeof(dump_magic)); + return 0; +} + +static void __exit +dump_netdev_cleanup(void) +{ + dump_unregister_device(&default_dump_netdev); +} + +MODULE_AUTHOR("LKCD Development Team "); +MODULE_DESCRIPTION("Network Dump Driver for Linux Kernel Crash Dump (LKCD)"); +MODULE_LICENSE("GPL"); + +module_init(dump_netdev_init); +module_exit(dump_netdev_cleanup); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/dump/dump_rle.c 900-mjb1/drivers/dump/dump_rle.c --- 000-virgin/drivers/dump/dump_rle.c Wed Dec 31 16:00:00 1969 +++ 900-mjb1/drivers/dump/dump_rle.c Fri May 30 19:24:56 2003 @@ -0,0 +1,175 @@ +/* + * RLE Compression functions for kernel crash dumps. + * + * Created by: Matt Robinson (yakker@sourceforge.net) + * Copyright 2001 Matt D. Robinson. All rights reserved. + * + * This code is released under version 2 of the GNU GPL. + */ + +/* header files */ +#include +#include +#include +#include +#include +#include +#include + +/* + * Name: dump_compress_rle() + * Func: Compress a DUMP_PAGE_SIZE (hardware) page down to something more + * reasonable, if possible. This is the same routine we use in IRIX. + */ +static u16 +dump_compress_rle(const u8 *old, u16 oldsize, u8 *new, u16 newsize) +{ + u16 ri, wi, count = 0; + u_char value = 0, cur_byte; + + /* + * If the block should happen to "compress" to larger than the + * buffer size, allocate a larger one and change cur_buf_size. 
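+ *
+ * The encoding produced below is, roughly: a single zero byte is
+ * escaped as "00 00" (a pair of zeros as "00 01 00"), a run of three
+ * or more identical bytes becomes the triple "00 count value" where
+ * count is one less than the run length, and all other bytes pass
+ * through literally.  For example:
+ *
+ *	input : 41 41 41 41 00 42
+ *	output: 00 03 41 00 00 42
+ *
+ * If the output would ever exceed the input size, the routine gives
+ * up and returns oldsize so the caller stores the page raw.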
+ */ + + wi = ri = 0; + + while (ri < oldsize) { + if (!ri) { + cur_byte = value = old[ri]; + count = 0; + } else { + if (count == 255) { + if (wi + 3 > oldsize) { + return oldsize; + } + new[wi++] = 0; + new[wi++] = count; + new[wi++] = value; + value = cur_byte = old[ri]; + count = 0; + } else { + if ((cur_byte = old[ri]) == value) { + count++; + } else { + if (count > 1) { + if (wi + 3 > oldsize) { + return oldsize; + } + new[wi++] = 0; + new[wi++] = count; + new[wi++] = value; + } else if (count == 1) { + if (value == 0) { + if (wi + 3 > oldsize) { + return oldsize; + } + new[wi++] = 0; + new[wi++] = 1; + new[wi++] = 0; + } else { + if (wi + 2 > oldsize) { + return oldsize; + } + new[wi++] = value; + new[wi++] = value; + } + } else { /* count == 0 */ + if (value == 0) { + if (wi + 2 > oldsize) { + return oldsize; + } + new[wi++] = value; + new[wi++] = value; + } else { + if (wi + 1 > oldsize) { + return oldsize; + } + new[wi++] = value; + } + } /* if count > 1 */ + + value = cur_byte; + count = 0; + + } /* if byte == value */ + + } /* if count == 255 */ + + } /* if ri == 0 */ + ri++; + + } + if (count > 1) { + if (wi + 3 > oldsize) { + return oldsize; + } + new[wi++] = 0; + new[wi++] = count; + new[wi++] = value; + } else if (count == 1) { + if (value == 0) { + if (wi + 3 > oldsize) + return oldsize; + new[wi++] = 0; + new[wi++] = 1; + new[wi++] = 0; + } else { + if (wi + 2 > oldsize) + return oldsize; + new[wi++] = value; + new[wi++] = value; + } + } else { /* count == 0 */ + if (value == 0) { + if (wi + 2 > oldsize) + return oldsize; + new[wi++] = value; + new[wi++] = value; + } else { + if (wi + 1 > oldsize) + return oldsize; + new[wi++] = value; + } + } /* if count > 1 */ + + value = cur_byte; + count = 0; + return wi; +} + +/* setup the rle compression functionality */ +static struct __dump_compress dump_rle_compression = { + .compress_type = DUMP_COMPRESS_RLE, + .compress_func = dump_compress_rle, + .compress_name = "RLE", +}; + +/* + * Name: dump_compress_rle_init() + * Func: Initialize rle compression for dumping. + */ +static int __init +dump_compress_rle_init(void) +{ + dump_register_compression(&dump_rle_compression); + return 0; +} + +/* + * Name: dump_compress_rle_cleanup() + * Func: Remove rle compression for dumping. + */ +static void __exit +dump_compress_rle_cleanup(void) +{ + dump_unregister_compression(DUMP_COMPRESS_RLE); +} + +/* module initialization */ +module_init(dump_compress_rle_init); +module_exit(dump_compress_rle_cleanup); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("LKCD Development Team "); +MODULE_DESCRIPTION("RLE compression module for crash dump driver"); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/dump/dump_scheme.c 900-mjb1/drivers/dump/dump_scheme.c --- 000-virgin/drivers/dump/dump_scheme.c Wed Dec 31 16:00:00 1969 +++ 900-mjb1/drivers/dump/dump_scheme.c Fri May 30 19:24:57 2003 @@ -0,0 +1,346 @@ +/* + * Default single stage dump scheme methods + * + * Previously a part of dump_base.c + * + * Started: Oct 2002 - Suparna Bhattacharya + * Split and rewrote LKCD dump scheme to generic dump method + * interfaces + * Derived from original code created by + * Matt Robinson ) + * + * Contributions from SGI, IBM, HP, MCL, and others. + * + * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. + * Copyright (C) 2002 International Business Machines Corp. + * + * This code is released under version 2 of the GNU GPL. 
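+ *
+ * Note on pass selection (see dump_generic_sequencer() below): each
+ * entry in dump_filter_table carries a level_mask, and a pass is
+ * saved only when that mask intersects dump_config.level.  For
+ * example, with dump_config.level == (DUMP_LEVEL_HEADER |
+ * DUMP_LEVEL_KERN), only passes whose filters carry DUMP_MASK_HEADER
+ * or DUMP_MASK_KERN are routed to dump_save_data(); the rest fall
+ * through to dump_generic_skip_data().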
+ */ + +/* + * Implements the default dump scheme, i.e. single-stage gathering and + * saving of dump data directly to the target device, which operates in + * a push mode, where the dumping system decides what data it saves + * taking into account pre-specified dump config options. + * + * Aside: The 2-stage dump scheme, where there is a soft-reset between + * the gathering and saving phases, also reuses some of these + * default routines (see dump_overlay.c) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "dump_methods.h" + +extern int panic_timeout; /* time before reboot */ + +extern void dump_speedo(int); + +/* Default sequencer used during single stage dumping */ +/* Also invoked during stage 2 of soft-boot based dumping */ +int dump_generic_sequencer(void) +{ + struct dump_data_filter *filter = dump_config.dumper->filter; + int pass = 0, err = 0, save = 0; + int (*action)(unsigned long, unsigned long); + + /* + * We want to save the more critical data areas first in + * case we run out of space, encounter i/o failures, or get + * interrupted otherwise and have to give up midway + * So, run through the passes in increasing order + */ + for (;filter->selector; filter++, pass++) + { + /* Assumes passes are exclusive (even across dumpers) */ + /* Requires care when coding the selection functions */ + if ((save = filter->level_mask & dump_config.level)) + action = dump_save_data; + else + action = dump_skip_data; + + if ((err = dump_iterator(pass, action, filter)) < 0) + break; + + printk("\n %d dump pages %s of %d each in pass %d\n", + err, save ? "saved" : "skipped", DUMP_PAGE_SIZE, pass); + + } + + return (err < 0) ? err : 0; +} + +static inline struct page *dump_get_page(loff_t loc) +{ + unsigned long page_index = loc >> PAGE_SHIFT; + + /* todo: complete this to account for ia64/discontig mem */ + /* todo: and to check for validity, ram page, no i/o mem etc */ + /* need to use pfn/physaddr equiv of kern_addr_valid */ + if (__dump_page_valid(page_index)) + return pfn_to_page(page_index); + else + return NULL; + +} + +/* Default iterator: for singlestage and stage 1 of soft-boot dumping */ +/* Iterates over range of physical memory pages in DUMP_PAGE_SIZE increments */ +int dump_page_iterator(int pass, int (*action)(unsigned long, unsigned long), + struct dump_data_filter *filter) +{ + /* Todo : fix unit, type */ + loff_t loc; + int count = 0, err = 0; + struct page *page; + + /* Todo: Add membanks code */ + /* TBD: Check if we need to address DUMP_PAGE_SIZE < PAGE_SIZE */ + + for (loc = filter->start; loc < filter->end; loc += DUMP_PAGE_SIZE) { + dump_config.dumper->curr_loc = loc; + page = dump_get_page(loc); + if (page && filter->selector(pass, (unsigned long) page, + DUMP_PAGE_SIZE)) { + if ((err = action((unsigned long)page, DUMP_PAGE_SIZE))) + { + printk("dump_page_iterator: err %d for loc " + "0x%llx, in pass %d\n", err, loc, pass); + break; + } else + count++; + } + } + + return err ? 
err : count; +} + +/* + * Base function that saves the selected block of data in the dump + * Action taken when iterator decides that data needs to be saved + */ +int dump_generic_save_data(unsigned long loc, unsigned long sz) +{ + void *buf; + void *dump_buf = dump_config.dumper->dump_buf; + int left, bytes, ret; + + if ((ret = dump_add_data(loc, sz))) { + return ret; + } + buf = dump_config.dumper->curr_buf; + + /* If we've filled up the buffer write it out */ + if ((left = buf - dump_buf) >= DUMP_BUFFER_SIZE) { + bytes = dump_write_buffer(dump_buf, DUMP_BUFFER_SIZE); + if (bytes < DUMP_BUFFER_SIZE) { + printk("dump_write_buffer failed %d\n", bytes); + return bytes ? -ENOSPC : bytes; + } + + left -= bytes; + + /* -- A few chores to do from time to time -- */ + dump_config.dumper->count++; + + if (!(dump_config.dumper->count & 0x3f)) { + /* Update the header every one in a while */ + /* memset((void *)dump_buf, 'b', DUMP_BUFFER_SIZE);*/ + if ((ret = dump_update_header()) < 0) { + /* issue warning */ + return ret; + } + printk("."); + + touch_nmi_watchdog(); + } else if (!(dump_config.dumper->count & 0x7)) { + /* Show progress so the user knows we aren't hung */ + dump_speedo(dump_config.dumper->count >> 3); + } + /* Todo: Touch/Refresh watchdog */ + + /* --- Done with periodic chores -- */ + + + /* now adjust the leftover bits back to the top of the page */ + /* this case would not arise during stage 2 (passthru) */ + memset(dump_buf, 'z', DUMP_BUFFER_SIZE); + if (left) { + memcpy(dump_buf, dump_buf + DUMP_BUFFER_SIZE, left); + } + buf -= DUMP_BUFFER_SIZE; + dump_config.dumper->curr_buf = buf; + } + + return 0; +} + +int dump_generic_skip_data(unsigned long loc, unsigned long sz) +{ + /* dummy by default */ + return 0; +} + +/* + * Common low level routine to write a buffer to current dump device + * Expects checks for space etc to have been taken care of by the caller + * Operates serially at the moment for simplicity. + * TBD/Todo: Consider batching for improved throughput + */ +int dump_ll_write(void *buf, unsigned long len) +{ + long transferred = 0, last_transfer = 0; + int ret = 0; + + /* make sure device is ready */ + while ((ret = dump_dev_ready(NULL)) == -EAGAIN); + if (ret < 0) { + printk("dump_dev_ready failed !err %d\n", ret); + return ret; + } + + while (len) { + if ((last_transfer = dump_dev_write(buf, len)) <= 0) { + ret = last_transfer; + printk("dump_dev_write failed !err %d\n", + ret); + break; + } + /* wait till complete */ + while ((ret = dump_dev_ready(buf)) == -EAGAIN) + cpu_relax(); + + if (ret < 0) { + printk("i/o failed !err %d\n", ret); + break; + } + + len -= last_transfer; + buf += last_transfer; + transferred += last_transfer; + } + return (ret < 0) ? ret : transferred; +} + +/* default writeout routine for single dump device */ +/* writes out the dump data ensuring enough space is left for the end marker */ +int dump_generic_write_buffer(void *buf, unsigned long len) +{ + long written = 0; + int err = 0; + + /* check for space */ + if ((err = dump_dev_seek(dump_config.dumper->curr_offset + len + + 2*DUMP_BUFFER_SIZE)) < 0) { + printk("dump_write_buffer: insuff space after offset 0x%llx\n", + dump_config.dumper->curr_offset); + return err; + } + /* alignment check would happen as a side effect of this */ + if ((err = dump_dev_seek(dump_config.dumper->curr_offset)) < 0) + return err; + + written = dump_ll_write(buf, len); + + /* all or none */ + + if (written < len) + written = written ? 
-ENOSPC : written; + else + dump_config.dumper->curr_offset += len; + + return written; +} + +int dump_generic_configure(unsigned long devid) +{ + struct dump_dev *dev = dump_config.dumper->dev; + struct dump_data_filter *filter; + void *buf; + int ret = 0; + + /* Allocate the dump buffer and initialize dumper state */ + /* Assume that we get aligned addresses */ + if (!(buf = dump_alloc_mem(DUMP_BUFFER_SIZE + 2 * DUMP_PAGE_SIZE))) + return -ENOMEM; + + if ((unsigned long)buf & (PAGE_SIZE - 1)) { + /* sanity check for page aligned address */ + dump_free_mem(buf); + return -ENOMEM; /* fixme: better error code */ + } + + /* Initialize the rest of the fields */ + dump_config.dumper->dump_buf = buf; + dumper_reset(); + + /* Open the dump device */ + if (!dev) + return -ENODEV; + + if ((ret = dev->ops->open(dev, devid))) { + return ret; + } + + /* Initialise the memory ranges in the dump filter */ + for (filter = dump_config.dumper->filter ;filter->selector; filter++) { + if (!filter->start && !filter->end) { + filter->start = 0; + filter->end = num_physpages << PAGE_SHIFT; + } + } + + return 0; +} + +int dump_generic_unconfigure(void) +{ + struct dump_dev *dev = dump_config.dumper->dev; + void *buf = dump_config.dumper->dump_buf; + int ret = 0; + + /* Close the dump device */ + if (dev && (ret = dev->ops->release(dev))) + return ret; + + if (buf) + dump_free_mem(buf); + + dump_config.dumper->curr_buf = dump_config.dumper->dump_buf = NULL; + return 0; +} + + +/* Set up the default dump scheme */ + +struct dump_scheme_ops dump_scheme_singlestage_ops = { + .configure = dump_generic_configure, + .unconfigure = dump_generic_unconfigure, + .sequencer = dump_generic_sequencer, + .iterator = dump_page_iterator, + .save_data = dump_generic_save_data, + .skip_data = dump_generic_skip_data, + .write_buffer = dump_generic_write_buffer, +}; + +struct dump_scheme dump_scheme_singlestage = { + .name = "single-stage", + .ops = &dump_scheme_singlestage_ops +}; + +/* The single stage dumper comprising all these */ +struct dumper dumper_singlestage = { + .name = "single-stage", + .scheme = &dump_scheme_singlestage, + .fmt = &dump_fmt_lcrash, + .compress = &dump_none_compression, + .filter = dump_filter_table, + .dev = NULL, +}; + diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/dump/dump_setup.c 900-mjb1/drivers/dump/dump_setup.c --- 000-virgin/drivers/dump/dump_setup.c Wed Dec 31 16:00:00 1969 +++ 900-mjb1/drivers/dump/dump_setup.c Fri May 30 19:24:56 2003 @@ -0,0 +1,729 @@ +/* + * Standard kernel function entry points for Linux crash dumps. + * + * Created by: Matt Robinson (yakker@sourceforge.net) + * Contributions from SGI, IBM, HP, MCL, and others. + * + * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2000 - 2002 TurboLinux, Inc. All rights reserved. + * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. + * Copyright (C) 2002 Free Software Foundation, Inc. All rights reserved. + * + * This code is released under version 2 of the GNU GPL. + */ + +/* + * ----------------------------------------------------------------------- + * + * DUMP HISTORY + * + * This dump code goes back to SGI's first attempts at dumping system + * memory on SGI systems running IRIX. A few developers at SGI needed + * a way to take this system dump and analyze it, and created 'icrash', + * or IRIX Crash. The mechanism (the dumps and 'icrash') were used + * by support people to generate crash reports when a system failure + * occurred. 
This was vital for large system configurations that + * couldn't apply patch after patch after fix just to hope that the + * problems would go away. So the system memory, along with the crash + * dump analyzer, allowed support people to quickly figure out what the + * problem was on the system with the crash dump. + * + * In comes Linux. SGI started moving towards the open source community, + * and upon doing so, SGI wanted to take its support utilities into Linux + * with the hopes that they would end up in the kernel and user space to + * be used by SGI's customers buying SGI Linux systems. One of the first + * few products to be open sourced by SGI was LKCD, or Linux Kernel Crash + * Dumps. LKCD comprises a patch to the kernel to enable system + * dumping, along with 'lcrash', or Linux Crash, to analyze the system + * memory dump. A few additional system scripts and kernel modifications + * are also included to make the dump mechanism and dump data easier to + * process and use. + * + * As soon as LKCD was released into the open source community, a number + * of larger companies started to take advantage of it. Today, there are + * many community members who contribute to LKCD, and it continues to + * flourish and grow as an open source project. + */ + +/* + * DUMP TUNABLES + * + * This is the list of system tunables (via /proc) that are available + * for Linux systems. All the read, write, etc., functions are listed + * here. Currently, there are a few different tunables for dumps: + * + * dump_device (used to be dumpdev): + * The device for dumping the memory pages out to. This + * may be set to the primary swap partition for disruptive dumps, + * and must be an unused partition for non-disruptive dumps. + * Todo: In the case of network dumps, this may be interpreted + * as the IP address of the netdump server to connect to. + * + * dump_compress (used to be dump_compress_pages): + * This is the flag which indicates which compression mechanism + * to use. This is a BITMASK, not an index (0,1,2,4,8,16,etc.). + * This is the current set of values: + * + * 0: DUMP_COMPRESS_NONE -- Don't compress any pages. + * 1: DUMP_COMPRESS_RLE -- This uses RLE compression. + * 2: DUMP_COMPRESS_GZIP -- This uses GZIP compression. + * + * dump_level: + * The amount of effort the dump module should make to save + * information for post crash analysis. This value is now + * a BITMASK value, not an index: + * + * 0: Do nothing, no dumping. (DUMP_LEVEL_NONE) + * + * 1: Print out the dump information to the dump header, and + * write it out to the dump_device. (DUMP_LEVEL_HEADER) + * + * 2: Write out the dump header and all kernel memory pages. + * (DUMP_LEVEL_KERN) + * + * 4: Write out the dump header and all kernel and user + * memory pages. (DUMP_LEVEL_USED) + * + * 8: Write out the dump header and all conventional/cached + * memory (RAM) pages in the system (kernel, user, free). + * (DUMP_LEVEL_ALL_RAM) + * + * 16: Write out everything, including non-conventional memory + * like firmware, proms, I/O registers, uncached memory. + * (DUMP_LEVEL_ALL) + * + * The dump_level will default to 1. + * + * dump_flags: + * These are the flags to use when talking about dumps. There + * are lots of possibilities. This is a BITMASK value, not an index.
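+ *
+ *	For example, with the values above a dump_level of 4
+ *	(DUMP_LEVEL_USED) also covers everything levels 1 and 2 describe;
+ *	the DIOSDUMPLEVEL handler further down builds the corresponding
+ *	DUMP_MASK_* bits up cumulatively for exactly this reason.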
+ * + * ----------------------------------------------------------------------- + */ + +#include +#include +#include +#include +#include +#include "dump_methods.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * ----------------------------------------------------------------------- + * V A R I A B L E S + * ----------------------------------------------------------------------- + */ + +/* Dump tunables */ +struct dump_config dump_config = { + .level = 0, + .flags = 0, + .dumper = NULL +}; + + +/* Global variables used in dump.h */ +/* degree of system freeze when dumping */ +enum dump_silence_levels dump_silence_level = DUMP_HARD_SPIN_CPUS; + +/* Other global fields */ +extern struct __dump_header dump_header; +struct dump_dev *dump_dev = NULL; /* Active dump device */ +int dump_device = 0; +static int dump_compress = 0; + +static u16 dump_compress_none(const u8 *old, u16 oldsize, u8 *new, u16 newsize); +struct __dump_compress dump_none_compression = { + .compress_type = DUMP_COMPRESS_NONE, + .compress_func = dump_compress_none, + .compress_name = "none", +}; + +/* our device operations and functions */ +static int dump_ioctl(struct inode *i, struct file *f, + unsigned int cmd, unsigned long arg); + +static struct file_operations dump_fops = { + .ioctl = dump_ioctl, +}; + +/* static variables */ +static int dump_okay = 0; /* can we dump out to disk? */ +static spinlock_t dump_lock = SPIN_LOCK_UNLOCKED; + +/* used for dump compressors */ +static struct list_head dump_compress_list = LIST_HEAD_INIT(dump_compress_list); + +/* list of registered dump targets */ +static struct list_head dump_target_list = LIST_HEAD_INIT(dump_target_list); + +/* lkcd info structure -- this is used by lcrash for basic system data */ +struct __lkcdinfo lkcdinfo = { + .ptrsz = (sizeof(void *) * 8), +#if defined(__LITTLE_ENDIAN) + .byte_order = __LITTLE_ENDIAN, +#else + .byte_order = __BIG_ENDIAN, +#endif + .page_shift = PAGE_SHIFT, + .page_size = PAGE_SIZE, + .page_mask = PAGE_MASK, + .page_offset = PAGE_OFFSET, +}; + +/* + * ----------------------------------------------------------------------- + * / P R O C T U N A B L E F U N C T I O N S + * ----------------------------------------------------------------------- + */ + +static int proc_dump_device(ctl_table *ctl, int write, struct file *f, + void *buffer, size_t *lenp); + +/* + * sysctl-tuning infrastructure. 
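+ *
+ * The three nested tables below hang the dump tunables off the "kernel"
+ * sysctl root, so they end up under /proc/sys/kernel/dump/. The leaf
+ * entry names come from the DUMP_*_NAME macros and are not spelled out
+ * here.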
+ */ +static ctl_table dump_table[] = { + { .ctl_name = CTL_DUMP_LEVEL, + .procname = DUMP_LEVEL_NAME, + .data = &dump_config.level, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, + + { .ctl_name = CTL_DUMP_FLAGS, + .procname = DUMP_FLAGS_NAME, + .data = &dump_config.flags, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, + + { .ctl_name = CTL_DUMP_COMPRESS, + .procname = DUMP_COMPRESS_NAME, + .data = &dump_compress, /* FIXME */ + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, + + { .ctl_name = CTL_DUMP_DEVICE, + .procname = DUMP_DEVICE_NAME, + .mode = 0644, + .data = &dump_device, /* FIXME */ + .maxlen = sizeof(int), + .proc_handler = proc_dump_device }, + + { 0, } +}; + +static ctl_table dump_root[] = { + { .ctl_name = KERN_DUMP, + .procname = "dump", + .mode = 0555, + .child = dump_table }, + { 0, } +}; + +static ctl_table kernel_root[] = { + { .ctl_name = CTL_KERN, + .procname = "kernel", + .mode = 0555, + .child = dump_root, }, + { 0, } +}; + +static struct ctl_table_header *sysctl_header; + +/* + * ----------------------------------------------------------------------- + * C O M P R E S S I O N F U N C T I O N S + * ----------------------------------------------------------------------- + */ + +/* + * Name: dump_compress_none() + * Func: Don't do any compression, period. + */ +static u16 +dump_compress_none(const u8 *old, u16 oldsize, u8 *new, u16 newsize) +{ + /* just return the old size */ + return oldsize; +} + + +/* + * Name: dump_execute() + * Func: Execute the dumping process. This makes sure all the appropriate + * fields are updated correctly, and calls dump_execute_memdump(), + * which does the real work. + */ +void +dump_execute(const char *panic_str, const struct pt_regs *regs) +{ + int state = -1; + unsigned long flags; + + /* make sure we can dump */ + if (!dump_okay) { + pr_info("LKCD not yet configured, can't take dump now\n"); + return; + } + + /* Exclude multiple dumps at the same time, + * and disable interrupts, some drivers may re-enable + * interrupts in with silence() + * + * Try and acquire spin lock. If successful, leave preempt + * and interrupts disabled. See spin_lock_irqsave in spinlock.h + */ + local_irq_save(flags); + if (!spin_trylock(&dump_lock)) { + local_irq_restore(flags); + pr_info("LKCD dump already in progress\n"); + return; + } + + /* Bring system into the strictest level of quiescing for min drift + * dump drivers can soften this as required in dev->ops->silence() + */ + dump_oncpu = smp_processor_id() + 1; + dump_silence_level = DUMP_HARD_SPIN_CPUS; + + state = dump_generic_execute(panic_str, regs); + + dump_oncpu = 0; + spin_unlock_irqrestore(&dump_lock, flags); + + if (state < 0) { + printk("Dump Incomplete or failed!\n"); + } else { + printk("Dump Complete; %d dump pages saved.\n", + dump_header.dh_num_dump_pages); + } +} + +/* + * Name: dump_register_compression() + * Func: Register a dump compression mechanism. + */ +void +dump_register_compression(struct __dump_compress *item) +{ + if (item) + list_add(&(item->list), &dump_compress_list); +} + +/* + * Name: dump_unregister_compression() + * Func: Remove a dump compression mechanism, and re-assign the dump + * compression pointer if necessary. 
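+ *
+ *       A compression module would normally pair this call with
+ *       dump_register_compression() in its init/exit paths; a rough
+ *       sketch (the RLE names are only illustrative and are not defined
+ *       in this file):
+ *
+ *           static struct __dump_compress dump_rle_compression = {
+ *                   .compress_type = DUMP_COMPRESS_RLE,
+ *                   .compress_func = dump_compress_rle,
+ *                   .compress_name = "rle",
+ *           };
+ *
+ *           dump_register_compression(&dump_rle_compression);
+ *           ...
+ *           dump_unregister_compression(DUMP_COMPRESS_RLE);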
+ */ +void +dump_unregister_compression(int compression_type) +{ + struct list_head *tmp; + struct __dump_compress *dc; + + /* let's make sure our list is valid */ + if (compression_type != DUMP_COMPRESS_NONE) { + list_for_each(tmp, &dump_compress_list) { + dc = list_entry(tmp, struct __dump_compress, list); + if (dc->compress_type == compression_type) { + list_del(&(dc->list)); + break; + } + } + } +} + +/* + * Name: dump_compress_init() + * Func: Initialize (or re-initialize) compression scheme. + */ +static int +dump_compress_init(int compression_type) +{ + struct list_head *tmp; + struct __dump_compress *dc; + + /* try to remove the compression item */ + list_for_each(tmp, &dump_compress_list) { + dc = list_entry(tmp, struct __dump_compress, list); + if (dc->compress_type == compression_type) { + dump_config.dumper->compress = dc; + dump_compress = compression_type; + pr_debug("Dump Compress %s\n", dc->compress_name); + return 0; + } + } + + /* + * nothing on the list -- return ENODATA to indicate an error + * + * NB: + * EAGAIN: reports "Resource temporarily unavailable" which + * isn't very enlightening. + */ + printk("compression_type:%d not found\n", compression_type); + + return -ENODATA; +} + +static int +dumper_setup(unsigned long flags, unsigned long devid) +{ + int ret = 0; + + /* unconfigure old dumper if it exists */ + dump_okay = 0; + if (dump_config.dumper) { + pr_debug("Unconfiguring current dumper\n"); + dump_unconfigure(); + } + /* set up new dumper */ + dump_config.dumper = &dumper_singlestage; + dump_config.dumper->dev = dump_dev; + + ret = dump_configure(devid); + if (!ret) { + dump_okay = 1; + pr_debug("%s dumper set up for dev 0x%lx\n", + dump_config.dumper->name, devid); + dump_device = devid; + } else { + printk("%s dumper set up failed for dev 0x%lx\n", + dump_config.dumper->name, devid); + } + return ret; +} + +static int +dump_target_init(int target) +{ + char type[20]; + struct list_head *tmp; + struct dump_dev *dev; + + switch (target) { + case DUMP_FLAGS_DISKDUMP: + strcpy(type, "blockdev"); break; + case DUMP_FLAGS_NETDUMP: + strcpy(type, "networkdev"); break; + default: + return -1; + } + + /* + * This is a bit stupid, generating strings from flag + * and doing strcmp. This is done because 'struct dump_dev' + * has string 'type_name' and not interger 'type'. + */ + list_for_each(tmp, &dump_target_list) { + dev = list_entry(tmp, struct dump_dev, list); + if (strcmp(type, dev->type_name) == 0) { + dump_dev = dev; + return 0; + } + } + return -1; +} + +/* + * Name: dump_ioctl() + * Func: Allow all dump tunables through a standard ioctl() mechanism. + * This is far better than before, where we'd go through /proc, + * because now this will work for multiple OS and architectures. + */ +static int +dump_ioctl(struct inode *i, struct file *f, unsigned int cmd, unsigned long arg) +{ + /* check capabilities */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!dump_config.dumper && cmd == DIOSDUMPCOMPRESS) + /* dump device must be configured first */ + return -ENODEV; + + /* + * This is the main mechanism for controlling get/set data + * for various dump device parameters. The real trick here + * is setting the dump device (DIOSDUMPDEV). That's what + * triggers everything else. 
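+	 *
+	 * One plausible user-space sequence (a sketch only -- the device
+	 * node path depends on how the "dump" character device was created):
+	 *
+	 *	fd = open("/dev/dump", O_RDWR);
+	 *	ioctl(fd, DIOSDUMPFLAGS, DUMP_FLAGS_DISKDUMP);
+	 *	ioctl(fd, DIOSDUMPDEV, dumpdev);
+	 *	ioctl(fd, DIOSDUMPLEVEL, DUMP_LEVEL_USED);
+	 *	ioctl(fd, DIOSDUMPCOMPRESS, DUMP_COMPRESS_GZIP);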
+ */ + switch (cmd) { + case DIOSDUMPDEV: /* set dump_device */ + pr_debug("Configuring dump device\n"); + if (!(f->f_flags & O_RDWR)) + return -EPERM; + + __dump_open(); + return dumper_setup(dump_config.flags, arg); + + + case DIOGDUMPDEV: /* get dump_device */ + return put_user((long)dump_device, (long *)arg); + + case DIOSDUMPLEVEL: /* set dump_level */ + if (!(f->f_flags & O_RDWR)) + return -EPERM; + + /* make sure we have a positive value */ + if (arg < 0) + return -EINVAL; + + /* Fixme: clean this up */ + dump_config.level = 0; + switch ((int)arg) { + case DUMP_LEVEL_ALL: + case DUMP_LEVEL_ALL_RAM: + dump_config.level |= DUMP_MASK_UNUSED; + case DUMP_LEVEL_USED: + dump_config.level |= DUMP_MASK_USED; + case DUMP_LEVEL_KERN: + dump_config.level |= DUMP_MASK_KERN; + case DUMP_LEVEL_HEADER: + dump_config.level |= DUMP_MASK_HEADER; + case DUMP_LEVEL_NONE: + break; + default: + return (-EINVAL); + } + pr_debug("Dump Level 0x%lx\n", dump_config.level); + break; + + case DIOGDUMPLEVEL: /* get dump_level */ + /* fixme: handle conversion */ + return put_user((long)dump_config.level, (long *)arg); + + + case DIOSDUMPFLAGS: /* set dump_flags */ + /* check flags */ + if (!(f->f_flags & O_RDWR)) + return -EPERM; + + /* make sure we have a positive value */ + if (arg < 0) + return -EINVAL; + + if (dump_target_init(arg & DUMP_FLAGS_TARGETMASK) < 0) + return -EINVAL; /* return proper error */ + + dump_config.flags = arg; + + pr_debug("Dump Flags 0x%lx\n", dump_config.flags); + break; + + case DIOGDUMPFLAGS: /* get dump_flags */ + return put_user((long)dump_config.flags, (long *)arg); + + case DIOSDUMPCOMPRESS: /* set the dump_compress status */ + if (!(f->f_flags & O_RDWR)) + return -EPERM; + + return dump_compress_init((int)arg); + + case DIOGDUMPCOMPRESS: /* get the dump_compress status */ + return put_user((long)(dump_config.dumper ? + dump_config.dumper->compress->compress_type : 0), + (long *)arg); + + default: + /* + * these are network dump specific ioctls, let the + * module handle them. 
+ */ + return dump_dev_ioctl(cmd, arg); + } + return 0; +} + +/* + * Handle special cases for dump_device + * changing dump device requires doing an opening the device + */ +static int +proc_dump_device(ctl_table *ctl, int write, struct file *f, + void *buffer, size_t *lenp) +{ + int *valp = ctl->data; + int oval = *valp; + int ret = -EPERM; + + /* same permission checks as ioctl */ + if (capable(CAP_SYS_ADMIN)) { + ret = proc_dointvec(ctl, write, f, buffer, lenp); + if (ret == 0 && write && *valp != oval) { + /* need to restore old value to close properly */ + dump_device = (dev_t) oval; + __dump_open(); + ret = dumper_setup(dump_config.flags, (dev_t) *valp); + } + } + + return ret; +} + +/* + * ----------------------------------------------------------------------- + * I N I T F U N C T I O N S + * ----------------------------------------------------------------------- + */ + +/* + * These register and unregister routines are exported for modules + * to register their dump drivers (like block, net etc) + */ +int +dump_register_device(struct dump_dev *ddev) +{ + struct list_head *tmp; + struct dump_dev *dev; + + list_for_each(tmp, &dump_target_list) { + dev = list_entry(tmp, struct dump_dev, list); + if (strcmp(ddev->type_name, dev->type_name) == 0) { + printk("Target type %s already registered\n", + dev->type_name); + return -1; /* return proper error */ + } + } + list_add(&(ddev->list), &dump_target_list); + + return 0; +} + +void +dump_unregister_device(struct dump_dev *ddev) +{ + list_del(&(ddev->list)); + if (ddev != dump_dev) + return; + + dump_okay = 0; + + if (dump_config.dumper) + dump_unconfigure(); + + dump_config.flags &= ~DUMP_FLAGS_TARGETMASK; + dump_okay = 0; + dump_dev = NULL; + dump_config.dumper = NULL; +} + + +#ifdef CONFIG_MAGIC_SYSRQ +/* Sysrq handler */ +static void sysrq_handle_crashdump(int key, struct pt_regs *pt_regs, + struct tty_struct *tty) { + dump_execute("sysrq", pt_regs); +} + +static struct sysrq_key_op sysrq_crashdump_op = { + .handler = sysrq_handle_crashdump, + .help_msg = "Dump", + .action_msg = "Starting crash dump", +}; +#endif + +static inline void +dump_sysrq_register(void) +{ +#ifdef CONFIG_MAGIC_SYSRQ + __sysrq_lock_table(); + __sysrq_put_key_op(DUMP_SYSRQ_KEY, &sysrq_crashdump_op); + __sysrq_unlock_table(); +#endif +} + +static inline void +dump_sysrq_unregister(void) +{ +#ifdef CONFIG_MAGIC_SYSRQ + __sysrq_lock_table(); + if (__sysrq_get_key_op(DUMP_SYSRQ_KEY) == &sysrq_crashdump_op) + __sysrq_put_key_op(DUMP_SYSRQ_KEY, NULL); + __sysrq_unlock_table(); +#endif +} + +/* + * Name: dump_init() + * Func: Initialize the dump process. This will set up any architecture + * dependent code. The big key is we need the memory offsets before + * the page table is initialized, because the base memory offset + * is changed after paging_init() is called. 
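+ *
+ *       (Hence the early __dump_init((u64)PAGE_OFFSET) call in dump_init()
+ *       below, right after the character device is registered.)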
+ */ +static int __init +dump_init(void) +{ + struct sysinfo info; + + /* try to create our dump device */ + if (register_chrdev(CRASH_DUMP_MAJOR, "dump", &dump_fops)) { + printk("cannot register dump character device!\n"); + return -EBUSY; + } + + __dump_init((u64)PAGE_OFFSET); + + /* set the dump_compression_list structure up */ + dump_register_compression(&dump_none_compression); + + /* grab the total memory size now (not if/when we crash) */ + si_meminfo(&info); + + /* set the memory size */ + dump_header.dh_memory_size = (u64)info.totalram; + + sysctl_header = register_sysctl_table(kernel_root, 0); + dump_sysrq_register(); + + pr_info("Crash dump driver initialized.\n"); + return 0; +} + +static void __exit +dump_cleanup(void) +{ + dump_okay = 0; + + if (dump_config.dumper) + dump_unconfigure(); + + /* arch-specific cleanup routine */ + __dump_cleanup(); + + /* ignore errors while unregistering -- since can't do anything */ + unregister_sysctl_table(sysctl_header); + unregister_chrdev(CRASH_DUMP_MAJOR, "dump"); + dump_sysrq_unregister(); +} + +EXPORT_SYMBOL(dump_register_compression); +EXPORT_SYMBOL(dump_unregister_compression); +EXPORT_SYMBOL(dump_register_device); +EXPORT_SYMBOL(dump_unregister_device); +EXPORT_SYMBOL(dump_config); +EXPORT_SYMBOL(dump_silence_level); + +EXPORT_SYMBOL(__dump_irq_enable); +EXPORT_SYMBOL(__dump_irq_restore); + +MODULE_AUTHOR("Matt D. Robinson "); +MODULE_DESCRIPTION("Linux Kernel Crash Dump (LKCD) driver"); +MODULE_LICENSE("GPL"); + +module_init(dump_init); +module_exit(dump_cleanup); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/net/3c59x.c 900-mjb1/drivers/net/3c59x.c --- 000-virgin/drivers/net/3c59x.c Fri May 30 19:02:11 2003 +++ 900-mjb1/drivers/net/3c59x.c Fri May 30 19:24:56 2003 @@ -907,6 +907,7 @@ static void set_rx_mode(struct net_devic static int vortex_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); static void vortex_tx_timeout(struct net_device *dev); static void acpi_set_WOL(struct net_device *dev); +static void vorboom_poll(struct net_device *dev); /* This driver uses 'options' to pass the media type, full-duplex flag, etc. */ /* Option count limit only -- unlimited interfaces are supported. */ @@ -1463,6 +1464,9 @@ static int __devinit vortex_probe1(struc dev->set_multicast_list = set_rx_mode; dev->tx_timeout = vortex_tx_timeout; dev->watchdog_timeo = (watchdog * HZ) / 1000; +#ifdef HAVE_POLL_CONTROLLER + dev->poll_controller = &vorboom_poll; +#endif if (pdev && vp->enable_wol) { vp->pm_state_valid = 1; pci_save_state(VORTEX_PCI(vp), vp->power_state); @@ -2452,6 +2456,29 @@ handler_exit: spin_unlock(&vp->lock); return IRQ_HANDLED; } + +#ifdef HAVE_POLL_CONTROLLER + +/* + * Polling 'interrupt' - used by things like netconsole to send skbs + * without having to re-enable interrupts. It's not called while + * the interrupt routine is executing. 
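+ *
+ * A caller (netconsole is the intended user) would invoke the hook
+ * roughly as:
+ *
+ *	if (dev->poll_controller)
+ *		dev->poll_controller(dev);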
+ */ + +static void vorboom_poll (struct net_device *dev) +{ + struct vortex_private *vp = (struct vortex_private *)dev->priv; + + disable_irq(dev->irq); + if (vp->full_bus_master_tx) + boomerang_interrupt(dev->irq, dev, 0); + else + vortex_interrupt(dev->irq, dev, 0); + enable_irq(dev->irq); +} + +#endif + static int vortex_rx(struct net_device *dev) { diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/net/acenic.c 900-mjb1/drivers/net/acenic.c --- 000-virgin/drivers/net/acenic.c Fri May 30 19:02:11 2003 +++ 900-mjb1/drivers/net/acenic.c Fri May 30 19:28:08 2003 @@ -132,7 +132,8 @@ #endif #if LINUX_VERSION_CODE >= 0x20400 -static struct pci_device_id acenic_pci_tbl[] __initdata = { +static struct pci_device_id acenic_pci_tbl[] + __initdata __attribute__ ((__unused__)) = { { PCI_VENDOR_ID_ALTEON, PCI_DEVICE_ID_ALTEON_ACENIC_FIBRE, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_NETWORK_ETHERNET << 8, 0xffff00, }, { PCI_VENDOR_ID_ALTEON, PCI_DEVICE_ID_ALTEON_ACENIC_COPPER, diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/net/e100/e100_main.c 900-mjb1/drivers/net/e100/e100_main.c --- 000-virgin/drivers/net/e100/e100_main.c Fri May 30 19:02:12 2003 +++ 900-mjb1/drivers/net/e100/e100_main.c Fri May 30 19:24:56 2003 @@ -558,6 +558,22 @@ e100_trigger_SWI(struct e100_private *bd readw(&(bdp->scb->scb_status)); /* flushes last write, read-safe */ } +#ifdef HAVE_POLL_CONTROLLER + +/* + * Polling 'interrupt' - used by things like netconsole to send skbs + * without having to re-enable interrupts. It's not called while + * the interrupt routine is executing. + */ +static void +e100_poll(struct net_device *dev) +{ + disable_irq(dev->irq); + e100intr(dev->irq, dev, NULL); + enable_irq(dev->irq); +} +#endif + static int __devinit e100_found1(struct pci_dev *pcid, const struct pci_device_id *ent) { @@ -576,6 +592,9 @@ e100_found1(struct pci_dev *pcid, const SET_MODULE_OWNER(dev); +#ifdef HAVE_POLL_CONTROLLER + dev->poll_controller = &e100_poll; +#endif if (first_time) { first_time = false; printk(KERN_NOTICE "%s - version %s\n", diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/net/e1000/e1000_main.c 900-mjb1/drivers/net/e1000/e1000_main.c --- 000-virgin/drivers/net/e1000/e1000_main.c Fri May 30 19:02:12 2003 +++ 900-mjb1/drivers/net/e1000/e1000_main.c Fri May 30 19:24:56 2003 @@ -145,6 +145,7 @@ static void e1000_leave_82542_rst(struct static inline void e1000_rx_checksum(struct e1000_adapter *adapter, struct e1000_rx_desc *rx_desc, struct sk_buff *skb); +static void e1000_Poll(struct net_device *dev); static void e1000_tx_timeout(struct net_device *dev); static void e1000_tx_timeout_task(struct net_device *dev); static void e1000_smartspeed(struct e1000_adapter *adapter); @@ -413,6 +414,9 @@ e1000_probe(struct pci_dev *pdev, adapter->bd_number = cards_found; +#ifdef HAVE_POLL_CONTROLLER + netdev->poll_controller = &e1000_Poll; +#endif /* setup the private structure */ if(e1000_sw_init(adapter)) @@ -2002,6 +2006,15 @@ e1000_intr(int irq, void *data, struct p #endif return IRQ_HANDLED; } + +#ifdef HAVE_POLL_CONTROLLER +static void e1000_Poll(struct net_device *dev) +{ + disable_irq(dev->irq); + e1000_intr(dev->irq, dev, NULL); + enable_irq(dev->irq); +} +#endif #ifdef CONFIG_E1000_NAPI /** diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/net/eepro100.c 900-mjb1/drivers/net/eepro100.c --- 000-virgin/drivers/net/eepro100.c Fri May 30 19:02:12 2003 +++ 900-mjb1/drivers/net/eepro100.c Fri May 30 19:24:56 2003 @@ -542,6 +542,7 @@ static void speedo_refill_rx_buffers(str static int 
speedo_rx(struct net_device *dev); static void speedo_tx_buffer_gc(struct net_device *dev); static irqreturn_t speedo_interrupt(int irq, void *dev_instance, struct pt_regs *regs); +static void poll_speedo (struct net_device *dev); static int speedo_close(struct net_device *dev); static struct net_device_stats *speedo_get_stats(struct net_device *dev); static int speedo_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); @@ -880,6 +881,9 @@ static int __devinit speedo_found1(struc dev->get_stats = &speedo_get_stats; dev->set_multicast_list = &set_rx_mode; dev->do_ioctl = &speedo_ioctl; +#ifdef HAVE_POLL_CONTROLLER + dev->poll_controller = &poll_speedo; +#endif return 0; } @@ -1660,6 +1664,23 @@ static irqreturn_t speedo_interrupt(int clear_bit(0, (void*)&sp->in_interrupt); return IRQ_RETVAL(handled); } + +#ifdef HAVE_POLL_CONTROLLER + +/* + * Polling 'interrupt' - used by things like netconsole to send skbs + * without having to re-enable interrupts. It's not called while + * the interrupt routine is executing. + */ + +static void poll_speedo (struct net_device *dev) +{ + disable_irq(dev->irq); + speedo_interrupt (dev->irq, dev, NULL); + enable_irq(dev->irq); +} + +#endif static inline struct RxFD *speedo_rx_alloc(struct net_device *dev, int entry) { diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/net/loopback.c 900-mjb1/drivers/net/loopback.c --- 000-virgin/drivers/net/loopback.c Sun Nov 17 20:29:25 2002 +++ 900-mjb1/drivers/net/loopback.c Fri May 30 19:28:11 2003 @@ -194,7 +194,7 @@ int __init loopback_init(struct net_devi /* Current netfilter will die with oom linearizing large skbs, * however this will be cured before 2.5.x is done. */ - dev->features |= NETIF_F_TSO; +/* dev->features |= NETIF_F_TSO; */ dev->priv = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL); if (dev->priv == NULL) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/net/smc-ultra.c 900-mjb1/drivers/net/smc-ultra.c --- 000-virgin/drivers/net/smc-ultra.c Wed Mar 5 07:37:02 2003 +++ 900-mjb1/drivers/net/smc-ultra.c Fri May 30 19:24:56 2003 @@ -122,6 +122,14 @@ MODULE_DEVICE_TABLE(isapnp, ultra_device #define ULTRA_IO_EXTENT 32 #define EN0_ERWCNT 0x08 /* Early receive warning count. */ + +static void ultra_poll(struct net_device *dev) +{ + disable_irq(dev->irq); + ei_interrupt(dev->irq, dev, NULL); + enable_irq(dev->irq); +} + /* Probe for the Ultra. This looks like a 8013 with the station address PROM at I/O ports +8 to +13, with a checksum following. @@ -134,6 +142,9 @@ int __init ultra_probe(struct net_device SET_MODULE_OWNER(dev); +#ifdef HAVE_POLL_CONTROLLER + dev->poll_controller = &ultra_poll; +#endif if (base_addr > 0x1ff) /* Check a single specified location. */ return ultra_probe1(dev, base_addr); else if (base_addr != 0) /* Don't probe at all. 
*/ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/net/tlan.c 900-mjb1/drivers/net/tlan.c --- 000-virgin/drivers/net/tlan.c Fri May 30 19:02:13 2003 +++ 900-mjb1/drivers/net/tlan.c Fri May 30 19:24:56 2003 @@ -345,6 +345,8 @@ static int TLan_EeSendByte( u16, u8, int static void TLan_EeReceiveByte( u16, u8 *, int ); static int TLan_EeReadByte( struct net_device *, u8, u8 * ); +static void TLan_Poll(struct net_device *); + static TLanIntVectorFunc *TLanIntVector[TLAN_INT_NUMBER_OF_INTS] = { TLan_HandleInvalid, @@ -855,6 +857,9 @@ static int TLan_Init( struct net_device dev->get_stats = &TLan_GetStats; dev->set_multicast_list = &TLan_SetMulticastList; dev->do_ioctl = &TLan_ioctl; +#ifdef HAVE_POLL_CONTROLLER + dev->poll_controller = &TLan_Poll; +#endif dev->tx_timeout = &TLan_tx_timeout; dev->watchdog_timeo = TX_TIMEOUT; @@ -1138,7 +1143,14 @@ static irqreturn_t TLan_HandleInterrupt( return IRQ_HANDLED; } /* TLan_HandleInterrupts */ - +#ifdef HAVE_POLL_CONTROLLER +static void TLan_Poll(struct net_device *dev) +{ + disable_irq(dev->irq); + TLan_HandleInterrupt(dev->irq, dev, NULL); + enable_irq(dev->irq); +} +#endif /*************************************************************** diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/net/tulip/tulip_core.c 900-mjb1/drivers/net/tulip/tulip_core.c --- 000-virgin/drivers/net/tulip/tulip_core.c Fri May 30 19:02:13 2003 +++ 900-mjb1/drivers/net/tulip/tulip_core.c Fri May 30 19:24:56 2003 @@ -243,6 +243,7 @@ static void tulip_down(struct net_device static struct net_device_stats *tulip_get_stats(struct net_device *dev); static int private_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); static void set_rx_mode(struct net_device *dev); +static void poll_tulip(struct net_device *dev); @@ -1618,6 +1619,9 @@ static int __devinit tulip_init_one (str dev->get_stats = tulip_get_stats; dev->do_ioctl = private_ioctl; dev->set_multicast_list = set_rx_mode; +#ifdef HAVE_POLL_CONTROLLER + dev->poll_controller = &poll_tulip; +#endif if (register_netdev(dev)) goto err_out_free_ring; @@ -1773,6 +1777,24 @@ static void __devexit tulip_remove_one ( /* pci_power_off (pdev, -1); */ } + + +#ifdef HAVE_POLL_CONTROLLER + +/* + * Polling 'interrupt' - used by things like netconsole to send skbs + * without having to re-enable interrupts. It's not called while + * the interrupt routine is executing. 
+ */ + +static void poll_tulip (struct net_device *dev) +{ + disable_irq(dev->irq); + tulip_interrupt (dev->irq, dev, NULL); + enable_irq(dev->irq); +} + +#endif static struct pci_driver tulip_driver = { diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/pci/probe.c 900-mjb1/drivers/pci/probe.c --- 000-virgin/drivers/pci/probe.c Fri May 30 19:02:13 2003 +++ 900-mjb1/drivers/pci/probe.c Fri May 30 19:28:16 2003 @@ -173,7 +173,7 @@ void __devinit pci_read_bridge_bases(str limit |= (io_limit_hi << 16); } - if (base && base <= limit) { + if (base <= limit) { res->flags = (io_base_lo & PCI_IO_RANGE_TYPE_MASK) | IORESOURCE_IO; res->start = base; res->end = limit + 0xfff; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/scsi/sd.c 900-mjb1/drivers/scsi/sd.c --- 000-virgin/drivers/scsi/sd.c Fri May 30 19:02:15 2003 +++ 900-mjb1/drivers/scsi/sd.c Fri May 30 21:58:35 2003 @@ -59,7 +59,9 @@ * Remaining dev_t-handling stuff */ #define SD_MAJORS 16 -#define SD_DISKS (SD_MAJORS << 4) +#define SD_DISKS ((SD_MAJORS - 1) << 4) +#define LAST_MAJOR_DISKS (1 << (KDEV_MINOR_BITS - 4)) +#define TOTAL_SD_DISKS (SD_DISKS + LAST_MAJOR_DISKS) /* * Time out in seconds for disks and Magneto-opticals (which are slower). @@ -88,7 +90,7 @@ struct scsi_disk { static LIST_HEAD(sd_devlist); static spinlock_t sd_devlist_lock = SPIN_LOCK_UNLOCKED; -static unsigned long sd_index_bits[SD_DISKS / BITS_PER_LONG]; +static unsigned long sd_index_bits[TOTAL_SD_DISKS / BITS_PER_LONG]; static spinlock_t sd_index_lock = SPIN_LOCK_UNLOCKED; static void sd_init_onedisk(struct scsi_disk * sdkp, struct gendisk *disk); @@ -1328,8 +1330,8 @@ static int sd_attach(struct scsi_device goto out_free; spin_lock(&sd_index_lock); - index = find_first_zero_bit(sd_index_bits, SD_DISKS); - if (index == SD_DISKS) { + index = find_first_zero_bit(sd_index_bits, TOTAL_SD_DISKS); + if (index == TOTAL_SD_DISKS) { spin_unlock(&sd_index_lock); error = -EBUSY; goto out_put; @@ -1343,15 +1345,25 @@ static int sd_attach(struct scsi_device sdkp->index = index; gd->major = sd_major(index >> 4); - gd->first_minor = (index & 15) << 4; +#define DISKS_PER_MINOR_MASK ((1 << (KDEV_MINOR_BITS - 4)) - 1) + if (index > SD_DISKS) + gd->first_minor = ((index - SD_DISKS) & DISKS_PER_MINOR_MASK) << 4; + else + gd->first_minor = (index & 15) << 4; gd->minors = 16; gd->fops = &sd_fops; - if (index >= 26) { + if (index < 26) { + sprintf(gd->disk_name, "sd%c", 'a' + index % 26); + } else if (index < (26*27)) { sprintf(gd->disk_name, "sd%c%c", 'a' + index/26-1,'a' + index % 26); } else { - sprintf(gd->disk_name, "sd%c", 'a' + index % 26); + const unsigned int m1 = (index/ 26 - 1) / 26 - 1; + const unsigned int m2 = (index / 26 - 1) % 26; + const unsigned int m3 = index % 26; + sprintf(gd->disk_name, "sd%c%c%c", + 'a' + m1, 'a' + m2, 'a' + m3); } strcpy(gd->devfs_name, sdp->devfs_name); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/serial/8250.c 900-mjb1/drivers/serial/8250.c --- 000-virgin/drivers/serial/8250.c Sat May 10 18:34:58 2003 +++ 900-mjb1/drivers/serial/8250.c Fri May 30 19:13:53 2003 @@ -2118,9 +2118,116 @@ void serial8250_resume_port(int line, u3 uart_resume_port(&serial8250_reg, &serial8250_ports[line].port, level); } -static int __init serial8250_init(void) +#ifdef CONFIG_X86_REMOTE_DEBUG +/* + * Takes: + * ttyS - integer specifying which serial port to use for debugging + * baud - baud rate of specified serial port + * Returns: + * port for use by the gdb serial driver + */ +int gdb_serial_setup(int ttyS, int baud, int *port, int 
*irq) +{ + struct uart_8250_port *up; + unsigned cval; + int bits = 8; + int parity = 'n'; + int cflag = CREAD | HUPCL | CLOCAL; + int quot = 0; + + /* + * Now construct a cflag setting. + */ + switch(baud) { + case 1200: + cflag |= B1200; + break; + case 2400: + cflag |= B2400; + break; + case 4800: + cflag |= B4800; + break; + case 19200: + cflag |= B19200; + break; + case 38400: + cflag |= B38400; + break; + case 57600: + cflag |= B57600; + break; + case 115200: + cflag |= B115200; + break; + case 9600: + default: + cflag |= B9600; + break; + } + switch(bits) { + case 7: + cflag |= CS7; + break; + default: + case 8: + cflag |= CS8; + break; + } + switch(parity) { + case 'o': case 'O': + cflag |= PARODD; + break; + case 'e': case 'E': + cflag |= PARENB; + break; + } + + /* + * Divisor, bytesize and parity + */ + + up = &serial8250_ports[ttyS]; +// ser->flags &= ~ASYNC_BOOT_AUTOCONF; + quot = ( 1843200 / 16 ) / baud; + cval = cflag & (CSIZE | CSTOPB); + cval >>= 4; + if (cflag & PARENB) + cval |= UART_LCR_PARITY; + if (!(cflag & PARODD)) + cval |= UART_LCR_EPAR; + + /* + * Disable UART interrupts, set DTR and RTS high + * and set speed. + */ + cval = 0x3; + serial_outp(up, UART_LCR, cval | UART_LCR_DLAB); /* set DLAB */ + serial_outp(up, UART_DLL, quot & 0xff); /* LS of divisor */ + serial_outp(up, UART_DLM, quot >> 8); /* MS of divisor */ + serial_outp(up, UART_LCR, cval); /* reset DLAB */ + serial_outp(up, UART_IER, UART_IER_RDI); /* turn on interrupts*/ + serial_outp(up, UART_MCR, UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS); + + /* + * If we read 0xff from the LSR, there is no UART here. + */ + if (serial_inp(up, UART_LSR) == 0xff) + return 1; + *port = up->port.iobase; + *irq = up->port.irq; +// serial8250_shutdown(&up->port); + return 0; +} +#endif + +int serial8250_init(void) { int ret, i; + static int didit = 0; + + if (didit++) + return 0; printk(KERN_INFO "Serial: 8250/16550 driver $Revision: 1.90 $ " "IRQ sharing %sabled\n", share_irqs ? "en" : "dis"); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/serial/core.c 900-mjb1/drivers/serial/core.c --- 000-virgin/drivers/serial/core.c Fri May 30 19:02:15 2003 +++ 900-mjb1/drivers/serial/core.c Fri May 30 19:13:53 2003 @@ -33,6 +33,10 @@ #include #include /* for serial_state and serial_icounter_struct */ +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif + #include #include @@ -1130,6 +1134,16 @@ uart_ioctl(struct tty_struct *tty, struc * protected against the tty being hung up. 
*/ switch (cmd) { +#ifdef CONFIG_X86_REMOTE_DEBUG + case TIOCGDB: + ret = -ENOTTY; + if (capable(CAP_SYS_ADMIN)) { + gdb_ttyS = minor(tty->device) & 0x03F; + gdb_baud = tty_get_baud_rate(tty); + ret = gdb_hook(); + } + break; +#endif case TIOCSERGETLSR: /* Get line status register */ ret = uart_get_lsr_info(state, (unsigned int *)arg); break; @@ -1146,6 +1160,30 @@ uart_ioctl(struct tty_struct *tty, struc out: return ret; } + + /* + * ------------------------------------------------------------ + * Serial GDB driver (most in gdbserial.c) + * ------------------------------------------------------------ + */ + +#ifdef CONFIG_X86_REMOTE_DEBUG +#ifdef CONFIG_GDB_CONSOLE +static struct console gdbcons = { + name: "gdb", + write: gdb_console_write, + flags: CON_PRINTBUFFER | CON_ENABLED, + index: -1, +}; +#endif + +#ifdef CONFIG_GDB_CONSOLE +void __init gdb_console_init(void) +{ + register_console(&gdbcons); +} +#endif +#endif /* CONFIG_X86_REMOTE_DEBUG */ static void uart_set_termios(struct tty_struct *tty, struct termios *old_termios) { diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/aio.c 900-mjb1/fs/aio.c --- 000-virgin/fs/aio.c Sat May 10 18:34:59 2003 +++ 900-mjb1/fs/aio.c Fri May 30 22:05:47 2003 @@ -39,6 +39,9 @@ #define dprintk(x...) do { ; } while (0) #endif +long aio_run = 0; /* for testing only */ +long aio_wakeups = 0; /* for testing only */ + /*------ sysctl variables----*/ atomic_t aio_nr = ATOMIC_INIT(0); /* current system wide number of aio requests */ unsigned aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ @@ -204,6 +207,7 @@ static struct kioctx *ioctx_alloc(unsign { struct mm_struct *mm; struct kioctx *ctx; + int ret = 0; /* Prevent overflows */ if ((nr_events > (0x10000000U / sizeof(struct io_event))) || @@ -233,7 +237,8 @@ static struct kioctx *ioctx_alloc(unsign INIT_LIST_HEAD(&ctx->run_list); INIT_WORK(&ctx->wq, aio_kick_handler, ctx); - if (aio_setup_ring(ctx) < 0) + ret = aio_setup_ring(ctx); + if (unlikely(ret < 0)) goto out_freectx; /* limit the number of system wide aios */ @@ -259,7 +264,7 @@ out_cleanup: out_freectx: kmem_cache_free(kioctx_cachep, ctx); - ctx = ERR_PTR(-ENOMEM); + ctx = ERR_PTR(ret); dprintk("aio: error allocating ioctx %p\n", ctx); return ctx; @@ -281,6 +286,7 @@ static void aio_cancel_all(struct kioctx struct kiocb *iocb = list_kiocb(pos); list_del_init(&iocb->ki_list); cancel = iocb->ki_cancel; + kiocbSetCancelled(iocb); if (cancel) { iocb->ki_users++; spin_unlock_irq(&ctx->ctx_lock); @@ -395,6 +401,7 @@ static struct kiocb *__aio_get_req(struc req->ki_cancel = NULL; req->ki_retry = NULL; req->ki_user_obj = NULL; + INIT_LIST_HEAD(&req->ki_run_list); /* Check if the completion queue has enough free space to * accept an event from this io. @@ -544,60 +551,147 @@ static void use_mm(struct mm_struct *mm) struct mm_struct *active_mm = current->active_mm; atomic_inc(&mm->mm_count); current->mm = mm; - if (mm != active_mm) { - current->active_mm = mm; - activate_mm(active_mm, mm); - } + + current->active_mm = mm; + activate_mm(active_mm, mm); + mmdrop(active_mm); } -static void unuse_mm(struct mm_struct *mm) +void unuse_mm(struct mm_struct *mm) { current->mm = NULL; /* active_mm is still 'mm' */ enter_lazy_tlb(mm, current, smp_processor_id()); } -/* Run on kevent's context. FIXME: needs to be per-cpu and warn if an - * operation blocks. 
- */ -static void aio_kick_handler(void *data) +static inline int __queue_kicked_iocb(struct kiocb *iocb) { - struct kioctx *ctx = data; + struct kioctx *ctx = iocb->ki_ctx; - use_mm(ctx->mm); + if (list_empty(&iocb->ki_run_list)) { + list_add_tail(&iocb->ki_run_list, + &ctx->run_list); + iocb->ki_queued++; + return 1; + } + return 0; +} - spin_lock_irq(&ctx->ctx_lock); - while (!list_empty(&ctx->run_list)) { - struct kiocb *iocb; - long ret; +/* Expects to be called with iocb->ki_ctx->lock held */ +static ssize_t aio_run_iocb(struct kiocb *iocb) +{ + struct kioctx *ctx = iocb->ki_ctx; + ssize_t (*retry)(struct kiocb *); + ssize_t ret; - iocb = list_entry(ctx->run_list.next, struct kiocb, - ki_run_list); - list_del(&iocb->ki_run_list); - iocb->ki_users ++; - spin_unlock_irq(&ctx->ctx_lock); + if (iocb->ki_retried++ > 1024*1024) { + printk("Maximal retry count. Bytes done %d\n", + iocb->ki_nbytes - iocb->ki_left); + return -EAGAIN; + } + + if (!(iocb->ki_retried & 0xff)) { + dprintk("%ld retry: %d of %d (kick %ld, Q %ld run %ld, wake %ld)\n", + iocb->ki_retried, + iocb->ki_nbytes - iocb->ki_left, iocb->ki_nbytes, + iocb->ki_kicked, iocb->ki_queued, aio_run, aio_wakeups); + } + + if (!(retry = iocb->ki_retry)) { + printk("aio_run_iocb: iocb->ki_retry = NULL\n"); + return 0; + } + + iocb->ki_users ++; + kiocbClearKicked(iocb); + iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL; + iocb->ki_retry = NULL; + spin_unlock_irq(&ctx->ctx_lock); + + if (kiocbIsCancelled(iocb)) { + aio_complete(iocb, -EINTR, 0); + spin_lock_irq(&ctx->ctx_lock); + __aio_put_req(ctx, iocb); + return -EINTR; + } - kiocbClearKicked(iocb); - ret = iocb->ki_retry(iocb); + BUG_ON(current->io_wait != NULL); + current->io_wait = &iocb->ki_wait; + ret = retry(iocb); + current->io_wait = NULL; + + if (-EIOCBRETRY != ret) { if (-EIOCBQUEUED != ret) { + BUG_ON(!list_empty(&iocb->ki_wait.task_list)); aio_complete(iocb, ret, 0); - iocb = NULL; } + } else { + if (list_empty(&iocb->ki_wait.task_list)) + kiocbSetKicked(iocb); + } + spin_lock_irq(&ctx->ctx_lock); - spin_lock_irq(&ctx->ctx_lock); - if (NULL != iocb) - __aio_put_req(ctx, iocb); + iocb->ki_retry = retry; + INIT_LIST_HEAD(&iocb->ki_run_list); + if (kiocbIsKicked(iocb)) { + BUG_ON(ret != -EIOCBRETRY); + __queue_kicked_iocb(iocb); + } + __aio_put_req(ctx, iocb); + return ret; +} + +static void aio_run_iocbs(struct kioctx *ctx) +{ + struct kiocb *iocb; + ssize_t ret; + int count = 0; + + spin_lock_irq(&ctx->ctx_lock); + while (!list_empty(&ctx->run_list)) { + iocb = list_entry(ctx->run_list.next, struct kiocb, + ki_run_list); + list_del(&iocb->ki_run_list); + ret = aio_run_iocb(iocb); + count++; } spin_unlock_irq(&ctx->ctx_lock); + aio_run++; +} +/* Run on aiod/kevent's context. FIXME: needs to be per-cpu and warn if an + * operation blocks. + */ +static void aio_kick_handler(void *data) +{ + struct kioctx *ctx = data; + + use_mm(ctx->mm); + aio_run_iocbs(ctx); unuse_mm(ctx->mm); } -void kick_iocb(struct kiocb *iocb) + +void queue_kicked_iocb(struct kiocb *iocb) { struct kioctx *ctx = iocb->ki_ctx; + unsigned long flags; + int run = 0; + + WARN_ON((!list_empty(&iocb->ki_wait.task_list))); + spin_lock_irqsave(&ctx->ctx_lock, flags); + run = __queue_kicked_iocb(iocb); + spin_unlock_irqrestore(&ctx->ctx_lock, flags); + if (run) { + queue_work(aio_wq, &ctx->wq); + aio_wakeups++; + } +} + +void kick_iocb(struct kiocb *iocb) +{ /* sync iocbs are easy: they can only ever be executing from a * single context. 
*/ if (is_sync_kiocb(iocb)) { @@ -606,12 +700,9 @@ void kick_iocb(struct kiocb *iocb) return; } + iocb->ki_kicked++; if (!kiocbTryKick(iocb)) { - unsigned long flags; - spin_lock_irqsave(&ctx->ctx_lock, flags); - list_add_tail(&iocb->ki_run_list, &ctx->run_list); - spin_unlock_irqrestore(&ctx->ctx_lock, flags); - schedule_work(&ctx->wq); + queue_kicked_iocb(iocb); } } @@ -664,6 +755,9 @@ int aio_complete(struct kiocb *iocb, lon */ spin_lock_irqsave(&ctx->ctx_lock, flags); + if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list)) + list_del_init(&iocb->ki_run_list); + ring = kmap_atomic(info->ring_pages[0], KM_IRQ1); tail = info->tail; @@ -693,6 +787,11 @@ int aio_complete(struct kiocb *iocb, lon pr_debug("added to ring %p at [%lu]\n", iocb, tail); + pr_debug("%ld retries: %d of %d (kicked %ld, Q %ld run %ld wake %ld)\n", + iocb->ki_retried, + iocb->ki_nbytes - iocb->ki_left, iocb->ki_nbytes, + iocb->ki_kicked, iocb->ki_queued, aio_run, aio_wakeups); + /* everything turned out well, dispose of the aiocb. */ ret = __aio_put_req(ctx, iocb); @@ -807,6 +906,7 @@ static int read_events(struct kioctx *ct int i = 0; struct io_event ent; struct timeout to; + int event_loop = 0; /* testing only */ /* needed to zero any padding within an entry (there shouldn't be * any, but C is fun! @@ -856,7 +956,6 @@ static int read_events(struct kioctx *ct add_wait_queue_exclusive(&ctx->wait, &wait); do { set_task_state(tsk, TASK_INTERRUPTIBLE); - ret = aio_read_evt(ctx, &ent); if (ret) break; @@ -866,6 +965,7 @@ static int read_events(struct kioctx *ct if (to.timed_out) /* Only check after read evt */ break; schedule(); + event_loop++; if (signal_pending(tsk)) { ret = -EINTR; break; @@ -893,6 +993,9 @@ static int read_events(struct kioctx *ct if (timeout) clear_timeout(&to); out: + pr_debug("event loop executed %d times\n", event_loop); + pr_debug("aio_run %ld\n", aio_run); + pr_debug("aio_wakeups %ld\n", aio_wakeups); return i ? i : ret; } @@ -984,6 +1087,143 @@ asmlinkage long sys_io_destroy(aio_conte return -EINVAL; } +ssize_t aio_pread(struct kiocb *iocb) +{ + struct file *file = iocb->ki_filp; + ssize_t ret = 0; + + ret = file->f_op->aio_read(iocb, iocb->ki_buf, + iocb->ki_left, iocb->ki_pos); + + /* + * Can't just depend on iocb->ki_left to determine + * whether we are done. This may have been a short read. + */ + if (ret > 0) { + iocb->ki_buf += ret; + iocb->ki_left -= ret; + + ret = -EIOCBRETRY; + } + + /* This means we must have transferred all that we could */ + /* No need to retry anymore */ + if ((ret == 0) || (iocb->ki_left == 0)) + ret = iocb->ki_nbytes - iocb->ki_left; + + return ret; +} + +ssize_t aio_pwrite(struct kiocb *iocb) +{ + struct file *file = iocb->ki_filp; + ssize_t ret = 0; + + ret = file->f_op->aio_write(iocb, iocb->ki_buf, + iocb->ki_left, iocb->ki_pos); + + /* + * TBD: Even if iocb->ki_left = 0, could we need to + * wait for data to be sync'd ? Or can we assume + * that aio_fdsync/aio_fsync would be called explicitly + * as required. 
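+	 *
+	 * As in aio_pread() above: a short transfer advances ki_buf/ki_left
+	 * and returns -EIOCBRETRY, so the request keeps being retried until
+	 * it either finishes or stops making progress.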
+ */ + if (ret > 0) { + iocb->ki_buf += ret; + iocb->ki_left -= ret; + + ret = -EIOCBRETRY; + } + + /* This means we must have transferred all that we could */ + /* No need to retry anymore */ + if (ret == 0) + ret = iocb->ki_nbytes - iocb->ki_left; + + return ret; +} + +ssize_t aio_fdsync(struct kiocb *iocb) +{ + struct file *file = iocb->ki_filp; + ssize_t ret = -EINVAL; + + if (file->f_op->aio_fsync) + ret = file->f_op->aio_fsync(iocb, 1); + return ret; +} + +ssize_t aio_fsync(struct kiocb *iocb) +{ + struct file *file = iocb->ki_filp; + ssize_t ret = -EINVAL; + + if (file->f_op->aio_fsync) + ret = file->f_op->aio_fsync(iocb, 0); + return ret; +} + +/* Called during initial submission and subsequent retry operations */ +ssize_t aio_setup_iocb(struct kiocb *iocb) +{ + struct file *file = iocb->ki_filp; + ssize_t ret = 0; + + switch (iocb->ki_opcode) { + case IOCB_CMD_PREAD: + ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_READ))) + break; + ret = -EFAULT; + if (unlikely(!access_ok(VERIFY_WRITE, iocb->ki_buf, + iocb->ki_left))) + break; + ret = -EINVAL; + if (file->f_op->aio_read) + iocb->ki_retry = aio_pread; + break; + case IOCB_CMD_PWRITE: + ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_WRITE))) + break; + ret = -EFAULT; + if (unlikely(!access_ok(VERIFY_READ, iocb->ki_buf, + iocb->ki_left))) + break; + ret = -EINVAL; + if (file->f_op->aio_write) + iocb->ki_retry = aio_pwrite; + break; + case IOCB_CMD_FDSYNC: + ret = -EINVAL; + if (file->f_op->aio_fsync) + iocb->ki_retry = aio_fdsync; + break; + case IOCB_CMD_FSYNC: + ret = -EINVAL; + if (file->f_op->aio_fsync) + iocb->ki_retry = aio_fsync; + break; + default: + dprintk("EINVAL: io_submit: no operation provided\n"); + ret = -EINVAL; + } + + if (!iocb->ki_retry) + return ret; + + return 0; +} + +int aio_wake_function(wait_queue_t *wait, unsigned mode, int sync) +{ + struct kiocb *iocb = container_of(wait, struct kiocb, ki_wait); + + list_del_init(&wait->task_list); + kick_iocb(iocb); + return 1; +} + int FASTCALL(io_submit_one(struct kioctx *ctx, struct iocb *user_iocb, struct iocb *iocb)); int io_submit_one(struct kioctx *ctx, struct iocb *user_iocb, @@ -992,7 +1232,6 @@ int io_submit_one(struct kioctx *ctx, st struct kiocb *req; struct file *file; ssize_t ret; - char *buf; /* enforce forwards compatibility on users */ if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2 || @@ -1033,51 +1272,31 @@ int io_submit_one(struct kioctx *ctx, st req->ki_user_data = iocb->aio_data; req->ki_pos = iocb->aio_offset; - buf = (char *)(unsigned long)iocb->aio_buf; + req->ki_buf = (char *)(unsigned long)iocb->aio_buf; + req->ki_left = req->ki_nbytes = iocb->aio_nbytes; + req->ki_opcode = iocb->aio_lio_opcode; + init_waitqueue_func_entry(&req->ki_wait, aio_wake_function); + INIT_LIST_HEAD(&req->ki_wait.task_list); + req->ki_run_list.next = req->ki_run_list.prev = NULL; + req->ki_retry = NULL; + req->ki_retried = 0; + req->ki_kicked = 0; + req->ki_queued = 0; + aio_run = 0; + aio_wakeups = 0; - switch (iocb->aio_lio_opcode) { - case IOCB_CMD_PREAD: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_READ))) - goto out_put_req; - ret = -EFAULT; - if (unlikely(!access_ok(VERIFY_WRITE, buf, iocb->aio_nbytes))) - goto out_put_req; - ret = -EINVAL; - if (file->f_op->aio_read) - ret = file->f_op->aio_read(req, buf, - iocb->aio_nbytes, req->ki_pos); - break; - case IOCB_CMD_PWRITE: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_WRITE))) - goto out_put_req; - ret = -EFAULT; - if (unlikely(!access_ok(VERIFY_READ, buf, iocb->aio_nbytes))) - 
goto out_put_req; - ret = -EINVAL; - if (file->f_op->aio_write) - ret = file->f_op->aio_write(req, buf, - iocb->aio_nbytes, req->ki_pos); - break; - case IOCB_CMD_FDSYNC: - ret = -EINVAL; - if (file->f_op->aio_fsync) - ret = file->f_op->aio_fsync(req, 1); - break; - case IOCB_CMD_FSYNC: - ret = -EINVAL; - if (file->f_op->aio_fsync) - ret = file->f_op->aio_fsync(req, 0); - break; - default: - dprintk("EINVAL: io_submit: no operation provided\n"); - ret = -EINVAL; - } + ret = aio_setup_iocb(req); + + if ((-EBADF == ret) || (-EFAULT == ret)) + goto out_put_req; + + spin_lock_irq(&ctx->ctx_lock); + ret = aio_run_iocb(req); + spin_unlock_irq(&ctx->ctx_lock); + + if (-EIOCBRETRY == ret) + queue_work(aio_wq, &ctx->wq); - if (likely(-EIOCBQUEUED == ret)) - return 0; - aio_complete(req, ret, 0); return 0; out_put_req: diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/buffer.c 900-mjb1/fs/buffer.c --- 000-virgin/fs/buffer.c Fri May 30 19:02:18 2003 +++ 900-mjb1/fs/buffer.c Fri May 30 22:04:55 2003 @@ -118,23 +118,35 @@ void unlock_buffer(struct buffer_head *b * from becoming locked again - you have to lock it yourself * if you want to preserve its state. */ -void __wait_on_buffer(struct buffer_head * bh) +int __wait_on_buffer_wq(struct buffer_head * bh, wait_queue_t *wait) { wait_queue_head_t *wqh = bh_waitq_head(bh); - DEFINE_WAIT(wait); + DEFINE_WAIT(local_wait); + + if (!wait) + wait = &local_wait; if (atomic_read(&bh->b_count) == 0 && (!bh->b_page || !PageLocked(bh->b_page))) buffer_error(); do { - prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wqh, wait, TASK_UNINTERRUPTIBLE); if (buffer_locked(bh)) { blk_run_queues(); + if (!is_sync_wait(wait)) { + return -EIOCBRETRY; + } io_schedule(); } } while (buffer_locked(bh)); - finish_wait(wqh, &wait); + finish_wait(wqh, wait); + return 0; +} + +void __wait_on_buffer(struct buffer_head * bh) +{ + __wait_on_buffer_wq(bh, NULL); } static void @@ -409,6 +421,9 @@ __find_get_block_slow(struct block_devic bh = bh->b_this_page; } while (bh != head); buffer_error(); + printk("block=%llu, b_blocknr=%llu\n", + (unsigned long long)block, (unsigned long long)bh->b_blocknr); + printk("b_state=0x%08lx, b_size=%u\n", bh->b_state, bh->b_size); out_unlock: spin_unlock(&bd_mapping->private_lock); page_cache_release(page); @@ -1272,9 +1287,12 @@ void __bforget(struct buffer_head *bh) __brelse(bh); } -static struct buffer_head *__bread_slow(struct buffer_head *bh) +static struct buffer_head *__bread_slow_wq(struct buffer_head *bh, + wait_queue_t *wait) { - lock_buffer(bh); + if (-EIOCBRETRY == lock_buffer_wq(bh, wait)) + return ERR_PTR(-EIOCBRETRY); + if (buffer_uptodate(bh)) { unlock_buffer(bh); return bh; @@ -1284,7 +1302,8 @@ static struct buffer_head *__bread_slow( get_bh(bh); bh->b_end_io = end_buffer_io_sync; submit_bh(READ, bh); - wait_on_buffer(bh); + if (-EIOCBRETRY == wait_on_buffer_wq(bh, wait)) + return ERR_PTR(-EIOCBRETRY); if (buffer_uptodate(bh)) return bh; } @@ -1292,6 +1311,11 @@ static struct buffer_head *__bread_slow( return NULL; } +static inline struct buffer_head *__bread_slow(struct buffer_head *bh) +{ + return __bread_slow_wq(bh, NULL); +} + /* * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block(). * The bhs[] array is sorted - newest buffer is at bhs[0]. 
Buffers have their @@ -1466,6 +1490,18 @@ __bread(struct block_device *bdev, secto bh = __bread_slow(bh); return bh; } + + +struct buffer_head * +__bread_wq(struct block_device *bdev, sector_t block, int size, + wait_queue_t *wait) +{ + struct buffer_head *bh = __getblk(bdev, block, size); + + if (!buffer_uptodate(bh)) + bh = __bread_slow_wq(bh, wait); + return bh; +} EXPORT_SYMBOL(__bread); /* @@ -1904,8 +1940,11 @@ static int __block_prepare_write(struct clear_buffer_new(bh); if (!buffer_mapped(bh)) { err = get_block(inode, block, bh, 1); - if (err) + if (err) { + if (-EIOCBRETRY == err) + pr_debug("get_block queued\n"); goto out; + } if (buffer_new(bh)) { clear_buffer_new(bh); unmap_underlying_metadata(bh->b_bdev, @@ -1947,6 +1986,8 @@ static int __block_prepare_write(struct * If we issued read requests - let them complete. */ while(wait_bh > wait) { + if (!is_sync_wait(current->io_wait)) + printk("block_prepare_write: wait on buffer\n"); wait_on_buffer(*--wait_bh); if (!buffer_uptodate(*wait_bh)) return -EIO; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/exec.c 900-mjb1/fs/exec.c --- 000-virgin/fs/exec.c Fri May 30 19:02:18 2003 +++ 900-mjb1/fs/exec.c Fri May 30 19:08:49 2003 @@ -316,6 +316,7 @@ void put_dirty_page(struct task_struct * } lru_cache_add_active(page); flush_dcache_page(page); + SetPageAnon(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot)))); pte_chain = page_add_rmap(page, pte, pte_chain); pte_unmap(pte); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/ext2/inode.c 900-mjb1/fs/ext2/inode.c --- 000-virgin/fs/ext2/inode.c Tue Apr 8 14:38:19 2003 +++ 900-mjb1/fs/ext2/inode.c Fri May 30 22:04:56 2003 @@ -257,11 +257,12 @@ static int ext2_block_to_path(struct ino * or when it reads all @depth-1 indirect blocks successfully and finds * the whole chain, all way to the data (returns %NULL, *err == 0). */ -static Indirect *ext2_get_branch(struct inode *inode, +static Indirect *ext2_get_branch_wq(struct inode *inode, int depth, int *offsets, Indirect chain[4], - int *err) + int *err, + wait_queue_t *wait) { struct super_block *sb = inode->i_sb; Indirect *p = chain; @@ -273,8 +274,8 @@ static Indirect *ext2_get_branch(struct if (!p->key) goto no_block; while (--depth) { - bh = sb_bread(sb, le32_to_cpu(p->key)); - if (!bh) + bh = sb_bread_wq(sb, le32_to_cpu(p->key), wait); + if (!bh || IS_ERR(bh)) goto failure; read_lock(&EXT2_I(inode)->i_meta_lock); if (!verify_chain(chain, p)) @@ -292,11 +293,21 @@ changed: *err = -EAGAIN; goto no_block; failure: - *err = -EIO; + *err = IS_ERR(bh) ? PTR_ERR(bh) : -EIO; no_block: return p; } +static Indirect *ext2_get_branch(struct inode *inode, + int depth, + int *offsets, + Indirect chain[4], + int *err) +{ + return ext2_get_branch_wq(inode, depth, offsets, chain, + err, NULL); +} + /** * ext2_find_near - find a place for allocation with sufficient locality * @inode: owner @@ -536,7 +547,8 @@ changed: * reachable from inode. 
*/ -static int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) +static int ext2_get_block_wq(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create, wait_queue_t *wait) { int err = -EIO; int offsets[4]; @@ -551,7 +563,8 @@ static int ext2_get_block(struct inode * goto out; reread: - partial = ext2_get_branch(inode, depth, offsets, chain, &err); + partial = ext2_get_branch_wq(inode, depth, offsets, chain, &err, + wait); /* Simplest case - block found, no allocation needed */ if (!partial) { @@ -565,7 +578,7 @@ got_it: } /* Next simple case - plain lookup or failed read of indirect block */ - if (!create || err == -EIO) { + if (!create || err == -EIO || err == -EIOCBRETRY) { cleanup: while (partial > chain) { brelse(partial->bh); @@ -606,6 +619,19 @@ changed: goto reread; } +static int ext2_get_block_async(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return ext2_get_block_wq(inode, iblock, bh_result, create, + current->io_wait); +} + +static int ext2_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return ext2_get_block_wq(inode, iblock, bh_result, create, NULL); +} + static int ext2_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, ext2_get_block, wbc); @@ -627,7 +653,7 @@ static int ext2_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { - return block_prepare_write(page,from,to,ext2_get_block); + return block_prepare_write(page,from,to,ext2_get_block_async); } static int diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/inode.c 900-mjb1/fs/inode.c --- 000-virgin/fs/inode.c Fri May 30 19:02:18 2003 +++ 900-mjb1/fs/inode.c Fri May 30 19:28:10 2003 @@ -143,6 +143,7 @@ static struct inode *alloc_inode(struct mapping->a_ops = &empty_aops; mapping->host = inode; mapping->gfp_mask = GFP_HIGHUSER; + mapping->binding = NULL; mapping->dirtied_when = 0; mapping->assoc_mapping = NULL; mapping->backing_dev_info = &default_backing_dev_info; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/proc/array.c 900-mjb1/fs/proc/array.c --- 000-virgin/fs/proc/array.c Sat May 10 18:35:00 2003 +++ 900-mjb1/fs/proc/array.c Fri May 30 19:29:30 2003 @@ -336,7 +336,7 @@ int proc_pid_stat(struct task_struct *ta read_unlock(&tasklist_lock); res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu\n", +%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu %lu %lu %lu\n", task->pid, task->comm, state, @@ -382,7 +382,10 @@ int proc_pid_stat(struct task_struct *ta task->exit_signal, task_cpu(task), task->rt_priority, - task->policy); + task->policy, + jiffies_to_clock_t(task->sched_info.inter_arrival_time), + jiffies_to_clock_t(task->sched_info.service_time), + jiffies_to_clock_t(task->sched_info.response_time)); if(mm) mmput(mm); return res; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/proc/proc_misc.c 900-mjb1/fs/proc/proc_misc.c --- 000-virgin/fs/proc/proc_misc.c Fri May 30 19:02:19 2003 +++ 900-mjb1/fs/proc/proc_misc.c Fri May 30 19:29:30 2003 @@ -134,6 +134,37 @@ static struct vmalloc_info get_vmalloc_i return vmi; } +static int real_loadavg_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int a, b, c, cpu; + int len; + + a = tasks_running[0] + (FIXED_1/200); + b = tasks_running[1] + (FIXED_1/200); + c = 
tasks_running[2] + (FIXED_1/200); + len = sprintf(page,"Domain load1 load2 load3 nr_run/nr_thrd\n"); + len += sprintf(page+len,"SYSTEM %5d.%02d %5d.%02d %5d.%02d %7ld/%7d\n", + LOAD_INT(a), LOAD_FRAC(a), + LOAD_INT(b), LOAD_FRAC(b), + LOAD_INT(c), LOAD_FRAC(c), + nr_running(), nr_threads); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (!cpu_online(cpu)) + continue; + a = cpu_tasks_running[0][cpu] + (FIXED_1/200); + b = cpu_tasks_running[1][cpu] + (FIXED_1/200); + c = cpu_tasks_running[2][cpu] + (FIXED_1/200); + len += sprintf(page+len, "%5d %5d.%02d %5d.%02d %5d.%02d %7ld/%7d\n", + cpu, + LOAD_INT(a), LOAD_FRAC(a), + LOAD_INT(b), LOAD_FRAC(b), + LOAD_INT(c), LOAD_FRAC(c), + nr_running_cpu(cpu), nr_threads); + } + return proc_calc_metrics(page, start, off, count, eof, len); +} + static int uptime_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -303,6 +334,9 @@ static struct file_operations proc_vmsta .release = seq_release, }; +extern int schedstats_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data); + #ifdef CONFIG_PROC_HARDWARE static int hardware_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) @@ -359,6 +393,71 @@ static struct file_operations proc_modul }; #endif +#ifdef CONFIG_NUMA +#define K(x) ((x) << (PAGE_SHIFT - 10)) +static int show_meminfo_numa (struct seq_file *m, void *v) +{ + int *d = v; + int nid = *d; + struct sysinfo i; + si_meminfo_node(&i, nid); + seq_printf(m, "\n" + "Node %d MemTotal: %8lu kB\n" + "Node %d MemFree: %8lu kB\n" + "Node %d MemUsed: %8lu kB\n" + "Node %d HighTotal: %8lu kB\n" + "Node %d HighFree: %8lu kB\n" + "Node %d LowTotal: %8lu kB\n" + "Node %d LowFree: %8lu kB\n", + nid, K(i.totalram), + nid, K(i.freeram), + nid, K(i.totalram-i.freeram), + nid, K(i.totalhigh), + nid, K(i.freehigh), + nid, K(i.totalram-i.totalhigh), + nid, K(i.freeram-i.freehigh)); + + return 0; +} +#undef K + +extern struct seq_operations meminfo_numa_op; +static int meminfo_numa_open(struct inode *inode, struct file *file) +{ + return seq_open(file,&meminfo_numa_op); +} + +static struct file_operations proc_meminfo_numa_operations = { + open: meminfo_numa_open, + read: seq_read, + llseek: seq_lseek, + release: seq_release, +}; + +static void *meminfo_numa_start(struct seq_file *m, loff_t *pos) +{ + return *pos < numnodes ? 
pos : NULL; +} + +static void *meminfo_numa_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return meminfo_numa_start(m, pos); +} + +static void meminfo_numa_stop(struct seq_file *m, void *v) +{ +} + +struct seq_operations meminfo_numa_op = { + .start = meminfo_numa_start, + .next = meminfo_numa_next, + .stop = meminfo_numa_stop, + .show = show_meminfo_numa, +}; + +#endif + extern struct seq_operations slabinfo_op; extern ssize_t slabinfo_write(struct file *, const char *, size_t, loff_t *); static int slabinfo_open(struct inode *inode, struct file *file) @@ -403,14 +502,20 @@ static int kstat_read_proc(char *page, c jiffies_to_clock_t(idle), jiffies_to_clock_t(iowait)); for (i = 0 ; i < NR_CPUS; i++){ - if (!cpu_online(i)) continue; - len += sprintf(page + len, "cpu%d %u %u %u %u %u\n", + struct sched_info info; + if (!cpu_online(i)) + continue; + cpu_sched_info(&info, i); + len += sprintf(page + len, "cpu%d %u %u %u %u %u %u %u %u\n", i, jiffies_to_clock_t(kstat_cpu(i).cpustat.user), jiffies_to_clock_t(kstat_cpu(i).cpustat.nice), jiffies_to_clock_t(kstat_cpu(i).cpustat.system), jiffies_to_clock_t(kstat_cpu(i).cpustat.idle), - jiffies_to_clock_t(kstat_cpu(i).cpustat.iowait)); + jiffies_to_clock_t(kstat_cpu(i).cpustat.iowait), + (uint) jiffies_to_clock_t(info.inter_arrival_time), + (uint) jiffies_to_clock_t(info.service_time), + (uint) jiffies_to_clock_t(info.response_time)); } len += sprintf(page + len, "intr %u", sum); @@ -610,6 +715,36 @@ static void create_seq_entry(char *name, entry->proc_fops = f; } +#ifdef CONFIG_LOCKMETER +extern ssize_t get_lockmeter_info(char *, size_t, loff_t *); +extern ssize_t put_lockmeter_info(const char *, size_t); +extern int get_lockmeter_info_size(void); + +/* + * This function accesses lock metering information. 
+ */ +static ssize_t read_lockmeter(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + return get_lockmeter_info(buf, count, ppos); +} + +/* + * Writing to /proc/lockmeter resets the counters + */ +static ssize_t write_lockmeter(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + return put_lockmeter_info(buf, count); +} + +static struct file_operations proc_lockmeter_operations = { + NULL, /* lseek */ + read: read_lockmeter, + write: write_lockmeter, +}; +#endif /* CONFIG_LOCKMETER */ + void __init proc_misc_init(void) { struct proc_dir_entry *entry; @@ -618,6 +753,7 @@ void __init proc_misc_init(void) int (*read_proc)(char*,char**,off_t,int,int*,void*); } *p, simple_ones[] = { {"loadavg", loadavg_read_proc}, + {"real_loadavg",real_loadavg_read_proc}, {"uptime", uptime_read_proc}, {"meminfo", meminfo_read_proc}, {"version", version_read_proc}, @@ -636,6 +772,7 @@ void __init proc_misc_init(void) #endif {"locks", locks_read_proc}, {"execdomains", execdomains_read_proc}, + {"schedstat", schedstats_read_proc}, {NULL,} }; for (p = simple_ones; p->name; p++) @@ -659,6 +796,9 @@ void __init proc_misc_init(void) #ifdef CONFIG_MODULES create_seq_entry("modules", 0, &proc_modules_operations); #endif +#ifdef CONFIG_NUMA + create_seq_entry("meminfo.numa",0,&proc_meminfo_numa_operations); +#endif proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL); if (proc_root_kcore) { proc_root_kcore->proc_fops = &proc_kcore_operations; @@ -676,6 +816,13 @@ void __init proc_misc_init(void) entry = create_proc_entry("sysrq-trigger", S_IWUSR, NULL); if (entry) entry->proc_fops = &proc_sysrq_trigger_operations; +#endif +#ifdef CONFIG_LOCKMETER + entry = create_proc_entry("lockmeter", S_IWUSR | S_IRUGO, NULL); + if (entry) { + entry->proc_fops = &proc_lockmeter_operations; + entry->size = get_lockmeter_info_size(); + } #endif #ifdef CONFIG_PPC32 { diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/reiserfs/inode.c 900-mjb1/fs/reiserfs/inode.c --- 000-virgin/fs/reiserfs/inode.c Fri May 30 19:02:19 2003 +++ 900-mjb1/fs/reiserfs/inode.c Fri May 30 19:28:05 2003 @@ -306,7 +306,7 @@ research: ** read old data off disk. Set the up to date bit on the buffer instead ** and jump to the end */ - if (PageUptodate(bh_result->b_page)) { + if (!bh_result->b_page || PageUptodate(bh_result->b_page)) { set_buffer_uptodate(bh_result); goto finished ; } @@ -420,6 +420,40 @@ static int reiserfs_get_block_create_0 ( return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE) ; } +static int reiserfs_get_block_direct_io (struct inode * inode, + sector_t iblock, unsigned long max_blocks, + struct buffer_head * bh_result, int create) { + int ret ; + + bh_result->b_size = (1 << inode->i_blkbits); + bh_result->b_page = NULL; + + ret = reiserfs_get_block(inode, iblock, bh_result, create) ; + + if (ret != 0) + return ret; + + /* don't allow direct io onto tail pages */ + if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { + /* make sure future calls to the direct io funcs for this offset + ** in the file fail by unmapping the buffer + */ + reiserfs_unmap_buffer(bh_result); + ret = -EINVAL ; + } + + /* Possible unpacked tail. 
Flush the data before pages have + disappeared */ + if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) { + lock_kernel(); + reiserfs_commit_for_inode(inode); + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; + unlock_kernel(); + } + + return ret ; +} + /* ** helper function for when reiserfs_get_block is called for a hole ** but the file tail is still in a direct item @@ -448,7 +482,7 @@ static int convert_tail_for_hole(struct tail_end = (tail_start | (bh_result->b_size - 1)) + 1 ; index = tail_offset >> PAGE_CACHE_SHIFT ; - if (index != hole_page->index) { + if (!hole_page || index != hole_page->index) { tail_page = grab_cache_page(inode->i_mapping, index) ; retval = -ENOMEM; if (!tail_page) { @@ -554,7 +588,15 @@ int reiserfs_get_block (struct inode * i return ret; } - REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ; + /* If file is of such a size, that it might have a tail and tails are enabled + ** we should mark it as possibly needing tail packing on close + */ + if ( (have_large_tails (inode->i_sb) && + inode->i_size < i_block_size (inode)*4) || + (have_small_tails (inode->i_sb) && + inode->i_size < i_block_size(inode)) ) + + REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ; windex = push_journal_writer("reiserfs_get_block") ; @@ -744,22 +786,27 @@ int reiserfs_get_block (struct inode * i ** the disk */ set_buffer_uptodate (unbh); - - /* we've converted the tail, so we must - ** flush unbh before the transaction commits - */ - add_to_flushlist(inode, unbh) ; - - /* mark it dirty now to prevent commit_write from adding - ** this buffer to the inode's dirty buffer list - */ + /* unbh->b_page == NULL in case of DIRECT_IO request, this means + buffer will disappear shortly, so it should not be added to + any of our lists. + */ + if ( unbh->b_page ) { + /* we've converted the tail, so we must + ** flush unbh before the transaction commits + */ + add_to_flushlist(inode, unbh) ; + + /* mark it dirty now to prevent commit_write from adding + ** this buffer to the inode's dirty buffer list + */ /* * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty(). 
* It's still atomic, but it sets the page dirty too, * which makes it eligible for writeback at any time by the * VM (which was also the case with __mark_buffer_dirty()) */ - mark_buffer_dirty(unbh) ; + mark_buffer_dirty(unbh) ; + } //inode->i_blocks += inode->i_sb->s_blocksize / 512; //mark_tail_converted (inode); @@ -2202,6 +2249,15 @@ static int reiserfs_commit_write(struct if (pos > inode->i_size) { struct reiserfs_transaction_handle th ; reiserfs_write_lock(inode->i_sb); + /* If the file have grown beyond the border where it + can have a tail, unmark it as needing a tail + packing */ + if ( (have_large_tails (inode->i_sb) && + inode->i_size < i_block_size(inode)*4) || + (have_small_tails (inode->i_sb) && + inode->i_size < i_block_size(inode)) ) + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ; + journal_begin(&th, inode->i_sb, 1) ; reiserfs_update_inode_transaction(inode) ; inode->i_size = pos ; @@ -2304,6 +2360,17 @@ static int reiserfs_releasepage(struct p return ret ; } +static int reiserfs_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_dentry->d_inode->i_mapping->host; + + return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, reiserfs_get_block_direct_io); +} + struct address_space_operations reiserfs_address_space_operations = { .writepage = reiserfs_writepage, .readpage = reiserfs_readpage, @@ -2312,5 +2379,6 @@ struct address_space_operations reiserfs .sync_page = block_sync_page, .prepare_write = reiserfs_prepare_write, .commit_write = reiserfs_commit_write, - .bmap = reiserfs_aop_bmap + .bmap = reiserfs_aop_bmap, + .direct_IO = reiserfs_direct_IO } ; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/reiserfs/tail_conversion.c 900-mjb1/fs/reiserfs/tail_conversion.c --- 000-virgin/fs/reiserfs/tail_conversion.c Fri May 30 19:02:19 2003 +++ 900-mjb1/fs/reiserfs/tail_conversion.c Fri May 30 19:28:05 2003 @@ -104,8 +104,10 @@ int direct2indirect (struct reiserfs_tra /* we only send the unbh pointer if the buffer is not up to date. ** this avoids overwriting good data from writepage() with old data ** from the disk or buffer cache + ** Special case: unbh->b_page will be NULL if we are coming through + ** DIRECT_IO handler here. 
*/ - if (buffer_uptodate(unbh) || PageUptodate(unbh->b_page)) { + if (!unbh->b_page || buffer_uptodate(unbh) || PageUptodate(unbh->b_page)) { up_to_date_bh = NULL ; } else { up_to_date_bh = unbh ; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-alpha/lockmeter.h 900-mjb1/include/asm-alpha/lockmeter.h --- 000-virgin/include/asm-alpha/lockmeter.h Wed Dec 31 16:00:00 1969 +++ 900-mjb1/include/asm-alpha/lockmeter.h Fri May 30 19:26:45 2003 @@ -0,0 +1,90 @@ +/* + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + * + * Modified by Peter Rival (frival@zk3.dec.com) + */ + +#ifndef _ALPHA_LOCKMETER_H +#define _ALPHA_LOCKMETER_H + +#include +#define CPU_CYCLE_FREQUENCY hwrpb->cycle_freq + +#define get_cycles64() get_cycles() + +#define THIS_CPU_NUMBER smp_processor_id() + +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0) +#define local_irq_save(x) \ + __save_and_cli(x) +#define local_irq_restore(x) \ + __restore_flags(x) +#endif /* Linux version 2.2.x */ + +#define SPINLOCK_MAGIC_INIT /**/ + +/* + * Macros to cache and retrieve an index value inside of a lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. + * We also assume that the hash table has less than 32767 entries. + * the high order bit is used for write locking a rw_lock + * Note: although these defines and macros are the same as what is being used + * in include/asm-i386/lockmeter.h, they are present here to easily + * allow an alternate Alpha implementation. + */ +/* + * instrumented spinlock structure -- never used to allocate storage + * only used in macros below to overlay a spinlock_t + */ +typedef struct inst_spinlock_s { + /* remember, Alpha is little endian */ + unsigned short lock; + unsigned short index; +} inst_spinlock_t; +#define PUT_INDEX(lock_ptr,indexv) ((inst_spinlock_t *)(lock_ptr))->index = indexv +#define GET_INDEX(lock_ptr) ((inst_spinlock_t *)(lock_ptr))->index + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int lock; + unsigned short index; + unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return true if rwlock is write locked + * (note that other lock attempts can cause the lock value to be negative) + */ +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) (((inst_rwlock_t *)rwlock_ptr)->lock & 1) +#define IABS(x) ((x) > 0 ? 
(x) : -(x)) + +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + int tmp = (int) ((inst_rwlock_t *)rwlock_ptr)->lock; + /* readers subtract 2, so we have to: */ + /* - andnot off a possible writer (bit 0) */ + /* - get the absolute value */ + /* - divide by 2 (right shift by one) */ + /* to find the number of readers */ + if (tmp == 0) return(0); + else return(IABS(tmp & ~1)>>1); +} + +#endif /* _ALPHA_LOCKMETER_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-alpha/spinlock.h 900-mjb1/include/asm-alpha/spinlock.h --- 000-virgin/include/asm-alpha/spinlock.h Fri May 30 19:02:20 2003 +++ 900-mjb1/include/asm-alpha/spinlock.h Fri May 30 19:26:45 2003 @@ -6,6 +6,10 @@ #include #include +#ifdef CONFIG_LOCKMETER +#undef DEBUG_SPINLOCK +#undef DEBUG_RWLOCK +#endif /* * Simple spin lock operations. There are two variants, one clears IRQ's @@ -95,9 +99,18 @@ static inline int _raw_spin_trylock(spin typedef struct { volatile int write_lock:1, read_counter:31; +#ifdef CONFIG_LOCKMETER + /* required for LOCKMETER since all bits in lock are used */ + /* need this storage for CPU and lock INDEX ............. */ + unsigned magic; +#endif } /*__attribute__((aligned(32)))*/ rwlock_t; +#ifdef CONFIG_LOCKMETER +#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0, 0 } +#else #define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0 } +#endif #define rwlock_init(x) do { *(x) = RW_LOCK_UNLOCKED; } while(0) #define rwlock_is_locked(x) (*(volatile int *)(x) != 0) @@ -168,5 +181,42 @@ static inline void _raw_read_unlock(rwlo : "=m" (*lock), "=&r" (regx) : "m" (*lock) : "memory"); } + +#ifdef CONFIG_LOCKMETER +static inline int _raw_write_trylock(rwlock_t *lock) +{ + long temp,result; + + __asm__ __volatile__( + " ldl_l %1,%0\n" + " mov $31,%2\n" + " bne %1,1f\n" + " or $31,1,%2\n" + " stl_c %2,%0\n" + "1: mb\n" + : "=m" (*(volatile int *)lock), "=&r" (temp), "=&r" (result) + : "m" (*(volatile int *)lock) + ); + + return (result); +} + +static inline int _raw_read_trylock(rwlock_t *lock) +{ + unsigned long temp,result; + + __asm__ __volatile__( + " ldl_l %1,%0\n" + " mov $31,%2\n" + " blbs %1,1f\n" + " subl %1,2,%2\n" + " stl_c %2,%0\n" + "1: mb\n" + : "=m" (*(volatile int *)lock), "=&r" (temp), "=&r" (result) + : "m" (*(volatile int *)lock) + ); + return (result); +} +#endif /* CONFIG_LOCKMETER */ #endif /* _ALPHA_SPINLOCK_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/bug.h 900-mjb1/include/asm-i386/bug.h --- 000-virgin/include/asm-i386/bug.h Mon Mar 17 21:43:48 2003 +++ 900-mjb1/include/asm-i386/bug.h Fri May 30 19:13:53 2003 @@ -9,6 +9,11 @@ * undefined" opcode for parsing in the trap handler. 
*/ +#ifdef CONFIG_X86_REMOTE_DEBUG +#define BUG() do { \ + asm ("int $0x3"); \ +} while (0) +#else #if 1 /* Set to zero for a slightly smaller kernel */ #define BUG() \ __asm__ __volatile__( "ud2\n" \ @@ -17,6 +22,7 @@ : : "i" (__LINE__), "i" (__FILE__)) #else #define BUG() __asm__ __volatile__("ud2\n") +#endif #endif #define PAGE_BUG(page) do { \ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/cpu.h 900-mjb1/include/asm-i386/cpu.h --- 000-virgin/include/asm-i386/cpu.h Thu Feb 13 11:08:13 2003 +++ 900-mjb1/include/asm-i386/cpu.h Fri May 30 21:58:44 2003 @@ -23,4 +23,6 @@ static inline int arch_register_cpu(int return register_cpu(&cpu_devices[num].cpu, num, parent); } +extern void setup_cpu_idt(void); +extern void setup_node_idts(void); #endif /* _ASM_I386_CPU_H_ */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/desc.h 900-mjb1/include/asm-i386/desc.h --- 000-virgin/include/asm-i386/desc.h Tue Feb 25 23:03:50 2003 +++ 900-mjb1/include/asm-i386/desc.h Fri May 30 21:58:44 2003 @@ -2,6 +2,7 @@ #define __ARCH_DESC_H #include +#include #include #ifndef __ASSEMBLY__ @@ -12,14 +13,15 @@ #include extern struct desc_struct cpu_gdt_table[NR_CPUS][GDT_ENTRIES]; +extern struct desc_struct node_idt_table[MAX_NUMNODES][IDT_ENTRIES]; -struct Xgt_desc_struct { +struct Xdt_desc_struct { unsigned short size; unsigned long address __attribute__((packed)); unsigned short pad; } __attribute__ ((packed)); -extern struct Xgt_desc_struct idt_descr, cpu_gdt_descr[NR_CPUS]; +extern struct Xdt_desc_struct node_idt_descr[MAX_NUMNODES], cpu_gdt_descr[NR_CPUS]; #define load_TR_desc() __asm__ __volatile__("ltr %%ax"::"a" (GDT_ENTRY_TSS*8)) #define load_LDT_desc() __asm__ __volatile__("lldt %%ax"::"a" (GDT_ENTRY_LDT*8)) @@ -29,7 +31,8 @@ extern struct Xgt_desc_struct idt_descr, * something other than this. */ extern struct desc_struct default_ldt[]; -extern void set_intr_gate(unsigned int irq, void * addr); +extern void node_set_intr_gate(unsigned int node, unsigned int vector, void * addr); +extern void set_intr_gate(unsigned int n, void *addr); #define _set_tssldt_desc(n,addr,limit,type) \ __asm__ __volatile__ ("movw %w3,0(%2)\n\t" \ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/dump.h 900-mjb1/include/asm-i386/dump.h --- 000-virgin/include/asm-i386/dump.h Wed Dec 31 16:00:00 1969 +++ 900-mjb1/include/asm-i386/dump.h Fri May 30 19:24:56 2003 @@ -0,0 +1,75 @@ +/* + * Kernel header file for Linux crash dumps. + * + * Created by: Matt Robinson (yakker@sgi.com) + * + * Copyright 1999 Silicon Graphics, Inc. All rights reserved. + * + * This code is released under version 2 of the GNU GPL. + */ + +/* This header file holds the architecture specific crash dump header */ +#ifndef _ASM_DUMP_H +#define _ASM_DUMP_H + +/* necessary header files */ +#include +#include +#include +#include + +/* definitions */ +#define DUMP_ASM_MAGIC_NUMBER 0xdeaddeadULL /* magic number */ +#define DUMP_ASM_VERSION_NUMBER 0x3 /* version number */ + +/* max number of cpus */ +#define DUMP_MAX_NUM_CPUS 32 + +/* + * Structure: __dump_header_asm + * Function: This is the header for architecture-specific stuff. It + * follows right after the dump header. 
+ */ +struct __dump_header_asm { + /* the dump magic number -- unique to verify dump is valid */ + u64 dha_magic_number; + + /* the version number of this dump */ + u32 dha_version; + + /* the size of this header (in case we can't read it) */ + u32 dha_header_size; + + /* the esp for i386 systems */ + u32 dha_esp; + + /* the eip for i386 systems */ + u32 dha_eip; + + /* the dump registers */ + struct pt_regs dha_regs; + + /* smp specific */ + u32 dha_smp_num_cpus; + u32 dha_dumping_cpu; + struct pt_regs dha_smp_regs[DUMP_MAX_NUM_CPUS]; + u32 dha_smp_current_task[DUMP_MAX_NUM_CPUS]; + u32 dha_stack[DUMP_MAX_NUM_CPUS]; + u32 dha_stack_ptr[DUMP_MAX_NUM_CPUS]; +} __attribute__((packed)); + +#ifdef __KERNEL__ + +extern struct __dump_header_asm dump_header_asm; + +#ifdef CONFIG_SMP +extern unsigned long irq_affinity[]; +extern int (*dump_ipi_function_ptr)(struct pt_regs *); +extern void dump_send_ipi(void); +#else +#define dump_send_ipi() do { } while(0) +#endif + +#endif /* __KERNEL__ */ + +#endif /* _ASM_DUMP_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/e820.h 900-mjb1/include/asm-i386/e820.h --- 000-virgin/include/asm-i386/e820.h Sun Nov 17 20:29:50 2002 +++ 900-mjb1/include/asm-i386/e820.h Fri May 30 19:24:56 2003 @@ -35,6 +35,7 @@ struct e820map { }; extern struct e820map e820; + #endif/*!__ASSEMBLY__*/ #endif/*__E820_HEADER*/ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/early_printk.h 900-mjb1/include/asm-i386/early_printk.h --- 000-virgin/include/asm-i386/early_printk.h Wed Dec 31 16:00:00 1969 +++ 900-mjb1/include/asm-i386/early_printk.h Fri May 30 19:03:23 2003 @@ -0,0 +1,8 @@ +#ifndef __EARLY_PRINTK_H_I386_ +#define __EARLY_PRINTK_H_i386_ + +#define VGABASE 0xB8000 +#define SERIAL_BASES { 0x3f8, 0x2f8 } +#define SERIAL_BASES_LEN 2 + +#endif diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/ioctls.h 900-mjb1/include/asm-i386/ioctls.h --- 000-virgin/include/asm-i386/ioctls.h Tue Apr 8 14:38:20 2003 +++ 900-mjb1/include/asm-i386/ioctls.h Fri May 30 19:13:53 2003 @@ -68,6 +68,7 @@ #define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */ #define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */ #define FIOQSIZE 0x5460 +#define TIOCGDB 0x547F /* enable GDB stub mode on this tty */ /* Used for packet mode */ #define TIOCPKT_DATA 0 diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/kdebug.h 900-mjb1/include/asm-i386/kdebug.h --- 000-virgin/include/asm-i386/kdebug.h Wed Dec 31 16:00:00 1969 +++ 900-mjb1/include/asm-i386/kdebug.h Fri May 30 19:24:56 2003 @@ -0,0 +1,53 @@ +#ifndef _I386_KDEBUG_H +#define _I386_KDEBUG_H 1 + +#include + +struct pt_regs; + +struct die_args { + struct pt_regs *regs; + const char *str; + long err; +}; + + +/* Grossly misnamed. */ +enum die_val { + DIE_OOPS = 1, + DIE_INT3, + DIE_DEBUG, + DIE_PANIC, + DIE_NMI, + DIE_DIE, + DIE_CPUINIT, /* not really a die, but .. 
*/ + DIE_TRAPINIT, + DIE_STOP, + DIE_PROTFAULT, + DIE_PAGEFAULT, + DIE_FPUTRAP, + DIE_WATCHDOG, +}; + +extern struct notifier_block *die_chain; + +static inline int register_die_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&die_chain, nb); + +} + +static inline int unregister_die_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&die_chain, nb); +} + +static inline int notify_die(enum die_val val, const char *str, + struct pt_regs *regs,long err) +{ + struct die_args args = { regs: regs, str: str, err: err }; + + return notifier_call_chain(&die_chain, val, &args); +} + +#endif diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/kmap_types.h 900-mjb1/include/asm-i386/kmap_types.h --- 000-virgin/include/asm-i386/kmap_types.h Fri May 30 19:02:20 2003 +++ 900-mjb1/include/asm-i386/kmap_types.h Fri May 30 19:24:56 2003 @@ -24,7 +24,8 @@ D(10) KM_IRQ0, D(11) KM_IRQ1, D(12) KM_SOFTIRQ0, D(13) KM_SOFTIRQ1, -D(14) KM_TYPE_NR +D(14) KM_TYPE_NR, +D(15) KM_DUMP }; #undef D diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/kprobes.h 900-mjb1/include/asm-i386/kprobes.h --- 000-virgin/include/asm-i386/kprobes.h Wed Dec 31 16:00:00 1969 +++ 900-mjb1/include/asm-i386/kprobes.h Fri May 30 19:15:14 2003 @@ -0,0 +1,34 @@ +#ifndef _ASM_KPROBES_H +#define _ASM_KPROBES_H +/* + * Dynamic Probes (kprobes) support + * Vamsi Krishna S , July, 2002 + * Mailing list: dprobes@www-124.ibm.com + */ +#include +#include + +struct pt_regs; + +typedef u8 kprobe_opcode_t; +#define BREAKPOINT_INSTRUCTION 0xcc + +/* trap3/1 are intr gates for kprobes. So, restore the status of IF, + * if necessary, before executing the original int3/1 (trap) handler. + */ +static inline void restore_interrupts(struct pt_regs *regs) +{ + if (regs->eflags & IF_MASK) + __asm__ __volatile__ ("sti"); +} + +#ifdef CONFIG_KPROBES +extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr); +extern int post_kprobe_handler(struct pt_regs *regs); +extern int kprobe_handler(struct pt_regs *regs); +#else /* !CONFIG_KPROBES */ +static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr) { return 0; } +static inline int post_kprobe_handler(struct pt_regs *regs) { return 0; } +static inline int kprobe_handler(struct pt_regs *regs) { return 0; } +#endif +#endif /* _ASM_KPROBES_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/linkage.h 900-mjb1/include/asm-i386/linkage.h --- 000-virgin/include/asm-i386/linkage.h Sun Nov 17 20:29:46 2002 +++ 900-mjb1/include/asm-i386/linkage.h Fri May 30 19:23:03 2003 @@ -3,6 +3,7 @@ #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) #define FASTCALL(x) x __attribute__((regparm(3))) +#define IRQHANDLER(x) x __attribute__((regparm(1))) #ifdef CONFIG_X86_ALIGNMENT_16 #define __ALIGN .align 16,0x90 diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/lockmeter.h 900-mjb1/include/asm-i386/lockmeter.h --- 000-virgin/include/asm-i386/lockmeter.h Wed Dec 31 16:00:00 1969 +++ 900-mjb1/include/asm-i386/lockmeter.h Fri May 30 19:26:45 2003 @@ -0,0 +1,127 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + * + * Modified by Ray Bryant (raybry@us.ibm.com) + * Changes Copyright (C) 2000 IBM, Inc. + * Added save of index in spinlock_t to improve efficiency + * of "hold" time reporting for spinlocks. + * Added support for hold time statistics for read and write + * locks. 
+ * Moved machine dependent code here from include/lockmeter.h. + * + */ + +#ifndef _I386_LOCKMETER_H +#define _I386_LOCKMETER_H + +#include +#include + +#include + +#ifdef __KERNEL__ +extern unsigned long cpu_khz; +#define CPU_CYCLE_FREQUENCY (cpu_khz * 1000) +#else +#define CPU_CYCLE_FREQUENCY 450000000 +#endif + +#define THIS_CPU_NUMBER smp_processor_id() + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0) +#define local_irq_save(x) \ + __asm__ __volatile__("pushfl ; popl %0 ; cli":"=g" (x): /* no input */ :"memory") + +#define local_irq_restore(x) \ + __asm__ __volatile__("pushl %0 ; popfl": /* no output */ :"g" (x):"memory") +#endif /* Linux version 2.2.x */ + +/* + * macros to cache and retrieve an index value inside of a spin lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. Not normally a problem!! + * we also assume that the hash table has less than 65535 entries. + */ +/* + * instrumented spinlock structure -- never used to allocate storage + * only used in macros below to overlay a spinlock_t + */ +typedef struct inst_spinlock_s { + /* remember, Intel is little endian */ + unsigned short lock; + unsigned short index; +} inst_spinlock_t; +#define PUT_INDEX(lock_ptr,indexv) ((inst_spinlock_t *)(lock_ptr))->index = indexv +#define GET_INDEX(lock_ptr) ((inst_spinlock_t *)(lock_ptr))->index + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int lock; + unsigned short index; + unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return the number of readers for a rwlock_t + */ +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) + +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + int tmp = (int) rwlock_ptr->lock; + /* read and write lock attempts may cause the lock value to temporarily */ + /* be negative. Until it is >= 0 we know nothing (i. e. can't tell if */ + /* is -1 because it was write locked and somebody tried to read lock it */ + /* or if it is -1 because it was read locked and somebody tried to write*/ + /* lock it. ........................................................... */ + do { + tmp = (int) rwlock_ptr->lock; + } while (tmp < 0); + if (tmp == 0) return(0); + else return(RW_LOCK_BIAS-tmp); +} + +/* + * return true if rwlock is write locked + * (note that other lock attempts can cause the lock value to be negative) + */ +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock <= 0) +#define IABS(x) ((x) > 0 ? 
(x) : -(x)) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((IABS((rwlock_ptr)->lock) % RW_LOCK_BIAS) != 0) + +/* this is a lot of typing just to get gcc to emit "rdtsc" */ +static inline long long get_cycles64 (void) +{ +#ifndef CONFIG_X86_TSC + #error this code requires CONFIG_X86_TSC +#else + union longlong_u { + long long intlong; + struct intint_s { + uint32_t eax; + uint32_t edx; + } intint; + } longlong; + + rdtsc(longlong.intint.eax,longlong.intint.edx); + return longlong.intlong; +#endif +} + +#endif /* _I386_LOCKMETER_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/mach-default/irq_vectors.h 900-mjb1/include/asm-i386/mach-default/irq_vectors.h --- 000-virgin/include/asm-i386/mach-default/irq_vectors.h Sun Apr 20 19:35:05 2003 +++ 900-mjb1/include/asm-i386/mach-default/irq_vectors.h Fri May 30 21:58:44 2003 @@ -48,6 +48,7 @@ #define INVALIDATE_TLB_VECTOR 0xfd #define RESCHEDULE_VECTOR 0xfc #define CALL_FUNCTION_VECTOR 0xfb +#define DUMP_VECTOR 0xfa #define THERMAL_APIC_VECTOR 0xf0 /* @@ -68,15 +69,22 @@ #define TIMER_IRQ 0 /* - * 16 8259A IRQ's, 208 potential APIC interrupt sources. - * Right now the APIC is mostly only used for SMP. - * 256 vectors is an architectural limit. (we can have - * more than 256 devices theoretically, but they will - * have to use shared interrupts) + * 16 8259A IRQ's, MAX_IRQ_SOURCES-16 potential APIC + * interrupt sources. Right now the APIC is mostly only + * used for SMP. 256 vectors is an architectural limit. + * (we can have more than 256 devices theoretically, but + * they will have to use shared interrupts) * Since vectors 0x00-0x1f are used/reserved for the CPU, * the usable vector space is 0x20-0xff (224 vectors) + * Linux currently makes 190 vectors available for io interrupts + * starting at FIRST_DEVICE_VECTOR till FIRST_SYSTEM_VECTOR + * + * 0________0x31__________________________0xef_________0xff + * system io interrupts resvd/smp + * */ #ifdef CONFIG_X86_IO_APIC +#define NR_IRQ_VECTORS 190 #define NR_IRQS 224 #else #define NR_IRQS 16 diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/mach-summit/mach_mpparse.h 900-mjb1/include/asm-i386/mach-summit/mach_mpparse.h --- 000-virgin/include/asm-i386/mach-summit/mach_mpparse.h Fri May 30 19:02:20 2003 +++ 900-mjb1/include/asm-i386/mach-summit/mach_mpparse.h Fri May 30 19:26:44 2003 @@ -1,6 +1,8 @@ #ifndef __ASM_MACH_MPPARSE_H #define __ASM_MACH_MPPARSE_H +#include + extern int use_cyclone; static inline void mpc_oem_bus_info(struct mpc_config_bus *m, char *name, @@ -44,4 +46,71 @@ static inline int acpi_madt_oem_check(ch } return 0; } + +struct rio_table_hdr { + unsigned char version; /* Version number of this data structure */ + /* Version 3 adds chassis_num & WP_index */ + unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil) */ + unsigned char num_rio_dev; /* # of RIO I/O devices (Cyclones and Winnipegs) */ +} __attribute__((packed)); + +struct scal_detail { + unsigned char node_id; /* Scalability Node ID */ + unsigned long CBAR; /* Address of 1MB register space */ + unsigned char port0node; /* Node ID port connected to: 0xFF=None */ + unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */ + unsigned char port1node; /* Node ID port connected to: 0xFF = None */ + unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */ + unsigned char port2node; /* Node ID port connected to: 0xFF = None */ + unsigned char port2port; /* Port num port connected to: 0,1,2, or 0xFF=None */ + unsigned char 
chassis_num; /* 1 based Chassis number (1 = boot node) */ +} __attribute__((packed)); + +struct rio_detail { + unsigned char node_id; /* RIO Node ID */ + unsigned long BBAR; /* Address of 1MB register space */ + unsigned char type; /* Type of device */ + unsigned char owner_id; /* For WPEG: Node ID of Cyclone that owns this WPEG*/ + /* For CYC: Node ID of Twister that owns this CYC */ + unsigned char port0node; /* Node ID port connected to: 0xFF=None */ + unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */ + unsigned char port1node; /* Node ID port connected to: 0xFF=None */ + unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */ + unsigned char first_slot; /* For WPEG: Lowest slot number below this WPEG */ + /* For CYC: 0 */ + unsigned char status; /* For WPEG: Bit 0 = 1 : the XAPIC is used */ + /* = 0 : the XAPIC is not used, ie:*/ + /* ints fwded to another XAPIC */ + /* Bits1:7 Reserved */ + /* For CYC: Bits0:7 Reserved */ + unsigned char WP_index; /* For WPEG: WPEG instance index - lower ones have */ + /* lower slot numbers/PCI bus numbers */ + /* For CYC: No meaning */ + unsigned char chassis_num; /* 1 based Chassis number */ + /* For LookOut WPEGs this field indicates the */ + /* Expansion Chassis #, enumerated from Boot */ + /* Node WPEG external port, then Boot Node CYC */ + /* external port, then Next Vigil chassis WPEG */ + /* external port, etc. */ + /* Shared Lookouts have only 1 chassis number (the */ + /* first one assigned) */ +} __attribute__((packed)); + + +typedef enum { + CompatTwister = 0, /* Compatibility Twister */ + AltTwister = 1, /* Alternate Twister of internal 8-way */ + CompatCyclone = 2, /* Compatibility Cyclone */ + AltCyclone = 3, /* Alternate Cyclone of internal 8-way */ + CompatWPEG = 4, /* Compatibility WPEG */ + AltWPEG = 5, /* Second Planar WPEG */ + LookOutAWPEG = 6, /* LookOut WPEG */ + LookOutBWPEG = 7, /* LookOut WPEG */ +} node_type; + +static inline int is_WPEG(node_type type){ + return (type == CompatWPEG || type == AltWPEG || + type == LookOutAWPEG || type == LookOutBWPEG); +} + #endif /* __ASM_MACH_MPPARSE_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/mpspec.h 900-mjb1/include/asm-i386/mpspec.h --- 000-virgin/include/asm-i386/mpspec.h Fri May 30 19:02:20 2003 +++ 900-mjb1/include/asm-i386/mpspec.h Fri May 30 19:26:44 2003 @@ -222,6 +222,10 @@ extern unsigned long mp_lapic_addr; extern int pic_mode; extern int using_apic_timer; +#ifdef CONFIG_X86_SUMMIT +extern void setup_summit (void); +#endif + #ifdef CONFIG_ACPI_BOOT extern void mp_register_lapic (u8 id, u8 enabled); extern void mp_register_lapic_address (u64 address); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/numaq.h 900-mjb1/include/asm-i386/numaq.h --- 000-virgin/include/asm-i386/numaq.h Mon Mar 17 21:43:48 2003 +++ 900-mjb1/include/asm-i386/numaq.h Fri May 30 21:58:44 2003 @@ -29,6 +29,8 @@ #ifdef CONFIG_X86_NUMAQ #define MAX_NUMNODES 8 + +#ifndef __ASSEMBLY__ extern void get_memcfg_numaq(void); #define get_memcfg_numa() get_memcfg_numaq() @@ -161,6 +163,7 @@ static inline unsigned long *get_zholes_ { return 0; } +#endif /* __ASSEMBLY__ */ #endif /* CONFIG_X86_NUMAQ */ #endif /* NUMAQ_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/page.h 900-mjb1/include/asm-i386/page.h --- 000-virgin/include/asm-i386/page.h Tue Apr 8 14:38:20 2003 +++ 900-mjb1/include/asm-i386/page.h Fri May 30 19:23:02 2003 @@ -3,7 +3,11 @@ /* PAGE_SHIFT determines the page size */ 
#define PAGE_SHIFT 12 -#define PAGE_SIZE (1UL << PAGE_SHIFT) +#ifndef __ASSEMBLY__ +#define PAGE_SIZE (1UL << PAGE_SHIFT) +#else +#define PAGE_SIZE (1 << PAGE_SHIFT) +#endif #define PAGE_MASK (~(PAGE_SIZE-1)) #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) @@ -115,9 +119,26 @@ static __inline__ int get_order(unsigned #endif /* __ASSEMBLY__ */ #ifdef __ASSEMBLY__ -#define __PAGE_OFFSET (0xC0000000) +#include +#ifdef CONFIG_05GB +#define __PAGE_OFFSET (0xE0000000) +#elif defined(CONFIG_1GB) +#define __PAGE_OFFSET (0xC0000000) +#elif defined(CONFIG_2GB) +#define __PAGE_OFFSET (0x80000000) +#elif defined(CONFIG_3GB) +#define __PAGE_OFFSET (0x40000000) +#endif #else -#define __PAGE_OFFSET (0xC0000000UL) +#ifdef CONFIG_05GB +#define __PAGE_OFFSET (0xE0000000UL) +#elif defined(CONFIG_1GB) +#define __PAGE_OFFSET (0xC0000000UL) +#elif defined(CONFIG_2GB) +#define __PAGE_OFFSET (0x80000000UL) +#elif defined(CONFIG_3GB) +#define __PAGE_OFFSET (0x40000000UL) +#endif #endif diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/param.h 900-mjb1/include/asm-i386/param.h --- 000-virgin/include/asm-i386/param.h Sun Nov 17 20:29:26 2002 +++ 900-mjb1/include/asm-i386/param.h Fri May 30 19:03:25 2003 @@ -2,10 +2,18 @@ #define _ASMi386_PARAM_H #ifdef __KERNEL__ -# define HZ 1000 /* Internal kernel timer frequency */ -# define USER_HZ 100 /* .. some user interfaces are in "ticks" */ -# define CLOCKS_PER_SEC (USER_HZ) /* like times() */ +#include + +#ifdef CONFIG_1000HZ +# define HZ 1000 /* Internal kernel timer frequency */ +#else +# define HZ 100 #endif + +#define USER_HZ 100 /* .. some user interfaces are in "ticks" */ +#define CLOCKS_PER_SEC (USER_HZ) /* like times() */ + +#endif /* __KERNEL__ */ #ifndef HZ #define HZ 100 diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/pgtable.h 900-mjb1/include/asm-i386/pgtable.h --- 000-virgin/include/asm-i386/pgtable.h Sat May 10 18:35:02 2003 +++ 900-mjb1/include/asm-i386/pgtable.h Fri May 30 19:24:57 2003 @@ -15,6 +15,7 @@ #ifndef __ASSEMBLY__ #include #include +#include #include #ifndef _I386_BITOPS_H diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/posix_types.h 900-mjb1/include/asm-i386/posix_types.h --- 000-virgin/include/asm-i386/posix_types.h Sun Apr 20 19:35:05 2003 +++ 900-mjb1/include/asm-i386/posix_types.h Fri May 30 19:33:26 2003 @@ -7,7 +7,7 @@ * assume GCC is being used. */ -typedef unsigned short __kernel_dev_t; +typedef unsigned long __kernel_dev_t; typedef unsigned long __kernel_ino_t; typedef unsigned short __kernel_mode_t; typedef unsigned short __kernel_nlink_t; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/processor.h 900-mjb1/include/asm-i386/processor.h --- 000-virgin/include/asm-i386/processor.h Fri May 30 19:02:20 2003 +++ 900-mjb1/include/asm-i386/processor.h Fri May 30 19:13:53 2003 @@ -288,7 +288,11 @@ extern unsigned int mca_pentium_flag; /* This decides where the kernel will search for a free chunk of vm * space during mmap's. */ +#ifdef CONFIG_05GB +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 16)) +#else #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) +#endif /* * Size of io_bitmap in longwords: 32 is ports 0-0x3ff. 
@@ -406,6 +410,9 @@ struct thread_struct { unsigned int saved_fs, saved_gs; /* IO permissions */ unsigned long *ts_io_bitmap; +#ifdef CONFIG_X86_REMOTE_DEBUG + struct pt_regs *kgdbregs; +#endif }; #define INIT_THREAD { \ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/rwlock.h 900-mjb1/include/asm-i386/rwlock.h --- 000-virgin/include/asm-i386/rwlock.h Sun Nov 17 20:29:57 2002 +++ 900-mjb1/include/asm-i386/rwlock.h Fri May 30 19:26:43 2003 @@ -20,28 +20,52 @@ #define RW_LOCK_BIAS 0x01000000 #define RW_LOCK_BIAS_STR "0x01000000" -#define __build_read_lock_ptr(rw, helper) \ - asm volatile(LOCK "subl $1,(%0)\n\t" \ - "js 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tcall " helper "\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - ::"a" (rw) : "memory") - -#define __build_read_lock_const(rw, helper) \ - asm volatile(LOCK "subl $1,%0\n\t" \ - "js 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tpushl %%eax\n\t" \ - "leal %0,%%eax\n\t" \ - "call " helper "\n\t" \ - "popl %%eax\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - :"=m" (*(volatile int *)rw) : : "memory") +#ifdef CONFIG_SPINLINE + + #define __build_read_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $1,(%0)\n\t" \ + "jns 1f\n\t" \ + "call " helper "\n\t" \ + "1:\t" \ + ::"a" (rw) : "memory") + + #define __build_read_lock_const(rw, helper) \ + asm volatile(LOCK "subl $1,%0\n\t" \ + "jns 1f\n\t" \ + "pushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "1:\t" \ + :"=m" (*(volatile int *)rw) : : "memory") + +#else /* !CONFIG_SPINLINE */ + + #define __build_read_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $1,(%0)\n\t" \ + "js 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tcall " helper "\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + ::"a" (rw) : "memory") + + #define __build_read_lock_const(rw, helper) \ + asm volatile(LOCK "subl $1,%0\n\t" \ + "js 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tpushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + :"=m" (*(volatile int *)rw) : : "memory") + +#endif /* CONFIG_SPINLINE */ + #define __build_read_lock(rw, helper) do { \ if (__builtin_constant_p(rw)) \ @@ -50,28 +74,51 @@ __build_read_lock_ptr(rw, helper); \ } while (0) -#define __build_write_lock_ptr(rw, helper) \ - asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ - "jnz 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tcall " helper "\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - ::"a" (rw) : "memory") - -#define __build_write_lock_const(rw, helper) \ - asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ - "jnz 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tpushl %%eax\n\t" \ - "leal %0,%%eax\n\t" \ - "call " helper "\n\t" \ - "popl %%eax\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - :"=m" (*(volatile int *)rw) : : "memory") +#ifdef CONFIG_SPINLINE + + #define __build_write_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ + "jz 1f\n\t" \ + "call " helper "\n\t" \ + "1:\n" \ + ::"a" (rw) : "memory") + + #define __build_write_lock_const(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ + "jz 1f\n\t" \ + "pushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "1:\n" \ + :"=m" (*(volatile int *)rw) : : "memory") + +#else /* !CONFIG_SPINLINE */ + + #define __build_write_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ + "jnz 2f\n" \ + "1:\n" \ + 
LOCK_SECTION_START("") \ + "2:\tcall " helper "\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + ::"a" (rw) : "memory") + + #define __build_write_lock_const(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ + "jnz 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tpushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + :"=m" (*(volatile int *)rw) : : "memory") + +#endif /* CONFIG_SPINLINE */ #define __build_write_lock(rw, helper) do { \ if (__builtin_constant_p(rw)) \ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/segment.h 900-mjb1/include/asm-i386/segment.h --- 000-virgin/include/asm-i386/segment.h Tue Feb 25 23:03:50 2003 +++ 900-mjb1/include/asm-i386/segment.h Fri May 30 21:58:44 2003 @@ -94,5 +94,5 @@ * of tasks we can have.. */ #define IDT_ENTRIES 256 - +#define IDT_SIZE (IDT_ENTRIES * 8) #endif diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/semaphore.h 900-mjb1/include/asm-i386/semaphore.h --- 000-virgin/include/asm-i386/semaphore.h Wed Mar 5 07:37:06 2003 +++ 900-mjb1/include/asm-i386/semaphore.h Fri May 30 22:04:52 2003 @@ -96,39 +96,48 @@ static inline void init_MUTEX_LOCKED (st sema_init(sem, 0); } -asmlinkage void __down_failed(void /* special register calling convention */); +asmlinkage int __down_failed_wq(void /* special register calling convention */); asmlinkage int __down_failed_interruptible(void /* params in registers */); asmlinkage int __down_failed_trylock(void /* params in registers */); asmlinkage void __up_wakeup(void /* special register calling convention */); -asmlinkage void __down(struct semaphore * sem); +asmlinkage int __down_wq(struct semaphore * sem, wait_queue_t *wait); asmlinkage int __down_interruptible(struct semaphore * sem); asmlinkage int __down_trylock(struct semaphore * sem); asmlinkage void __up(struct semaphore * sem); /* * This is ugly, but we want the default case to fall through. - * "__down_failed" is a special asm handler that calls the C + * "__down_failed_wq" is a special asm handler that calls the C * routine that actually waits. 
See arch/i386/kernel/semaphore.c */ -static inline void down(struct semaphore * sem) +static inline int down_wq(struct semaphore * sem, wait_queue_t *wait) { + int result; + #ifdef WAITQUEUE_DEBUG CHECK_MAGIC(sem->__magic); #endif might_sleep(); __asm__ __volatile__( "# atomic down operation\n\t" - LOCK "decl %0\n\t" /* --sem->count */ - "js 2f\n" + LOCK "decl %1\n\t" /* --sem->count */ + "js 2f\n\t" + "xorl %0,%0\n" "1:\n" LOCK_SECTION_START("") - "2:\tcall __down_failed\n\t" + "2:\tcall __down_failed_wq\n\t" "jmp 1b\n" LOCK_SECTION_END - :"=m" (sem->count) - :"c" (sem) + :"=a" (result), "=m" (sem->count) + :"c" (sem), "d" (wait) :"memory"); + return result; +} + +static inline void down(struct semaphore * sem) +{ + down_wq(sem, NULL); } /* diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/smp.h 900-mjb1/include/asm-i386/smp.h --- 000-virgin/include/asm-i386/smp.h Fri May 30 19:02:20 2003 +++ 900-mjb1/include/asm-i386/smp.h Fri May 30 19:24:56 2003 @@ -39,6 +39,7 @@ extern int smp_num_siblings; extern int cpu_sibling_map[]; extern void smp_flush_tlb(void); +extern void dump_send_ipi(void); extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); extern void smp_send_reschedule(int cpu); extern void smp_send_reschedule_all(void); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/spinlock.h 900-mjb1/include/asm-i386/spinlock.h --- 000-virgin/include/asm-i386/spinlock.h Fri May 30 19:02:20 2003 +++ 900-mjb1/include/asm-i386/spinlock.h Fri May 30 19:26:45 2003 @@ -43,18 +43,35 @@ typedef struct { #define spin_is_locked(x) (*(volatile signed char *)(&(x)->lock) <= 0) #define spin_unlock_wait(x) do { barrier(); } while(spin_is_locked(x)) -#define spin_lock_string \ - "\n1:\t" \ - "lock ; decb %0\n\t" \ - "js 2f\n" \ - LOCK_SECTION_START("") \ - "2:\t" \ - "rep;nop\n\t" \ - "cmpb $0,%0\n\t" \ - "jle 2b\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END +#ifdef CONFIG_SPINLINE + #define spin_lock_string \ + "\n1:\t" \ + "lock ; decb %0\n\t" \ + "js 2f\n" \ + "jmp 3f\n" \ + "2:\t" \ + "rep;nop\n\t" \ + "cmpb $0,%0\n\t" \ + "jle 2b\n\t" \ + "jmp 1b\n" \ + "3:\t" + +#else /* !CONFIG_SPINLINE */ + + #define spin_lock_string \ + "\n1:\t" \ + "lock ; decb %0\n\t" \ + "js 2f\n" \ + LOCK_SECTION_START("") \ + "2:\t" \ + "rep;nop\n\t" \ + "cmpb $0,%0\n\t" \ + "jle 2b\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END + +#endif /* CONFIG_SPINLINE */ /* * This works. Despite all the confusion. 
* (except on PPro SMP or if we are using OOSTORE) @@ -138,6 +155,11 @@ here: */ typedef struct { volatile unsigned int lock; +#if CONFIG_LOCKMETER + /* required for LOCKMETER since all bits in lock are used */ + /* and we need this storage for CPU and lock INDEX */ + unsigned lockmeter_magic; +#endif #ifdef CONFIG_DEBUG_SPINLOCK unsigned magic; #endif @@ -145,11 +167,19 @@ typedef struct { #define RWLOCK_MAGIC 0xdeaf1eed +#ifdef CONFIG_LOCKMETER +#if CONFIG_DEBUG_SPINLOCK +#define RWLOCK_MAGIC_INIT , 0, RWLOCK_MAGIC +#else +#define RWLOCK_MAGIC_INIT , 0 +#endif +#else /* !CONFIG_LOCKMETER */ #ifdef CONFIG_DEBUG_SPINLOCK #define RWLOCK_MAGIC_INIT , RWLOCK_MAGIC #else #define RWLOCK_MAGIC_INIT /* */ #endif +#endif /* !CONFIG_LOCKMETER */ #define RW_LOCK_UNLOCKED (rwlock_t) { RW_LOCK_BIAS RWLOCK_MAGIC_INIT } @@ -195,5 +225,59 @@ static inline int _raw_write_trylock(rwl atomic_add(RW_LOCK_BIAS, count); return 0; } + +#ifdef CONFIG_LOCKMETER +static inline int _raw_read_trylock(rwlock_t *lock) +{ +/* FIXME -- replace with assembler */ + atomic_t *count = (atomic_t *)lock; + atomic_dec(count); + if (count->counter > 0) + return 1; + atomic_inc(count); + return 0; +} +#endif + +#if defined(CONFIG_LOCKMETER) && defined(CONFIG_HAVE_DEC_LOCK) +extern void _metered_spin_lock (spinlock_t *lock); +extern void _metered_spin_unlock(spinlock_t *lock); + +/* + * Matches what is in arch/i386/lib/dec_and_lock.c, except this one is + * "static inline" so that the spin_lock(), if actually invoked, is charged + * against the real caller, not against the catch-all atomic_dec_and_lock + */ +static inline int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +{ + int counter; + int newcount; + +repeat: + counter = atomic_read(atomic); + newcount = counter-1; + + if (!newcount) + goto slow_path; + + asm volatile("lock; cmpxchgl %1,%2" + :"=a" (newcount) + :"r" (newcount), "m" (atomic->counter), "0" (counter)); + + /* If the above failed, "eax" will have changed */ + if (newcount != counter) + goto repeat; + return 0; + +slow_path: + _metered_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + _metered_spin_unlock(lock); + return 0; +} + +#define ATOMIC_DEC_AND_LOCK +#endif #endif /* __ASM_SPINLOCK_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/srat.h 900-mjb1/include/asm-i386/srat.h --- 000-virgin/include/asm-i386/srat.h Mon Mar 17 21:43:48 2003 +++ 900-mjb1/include/asm-i386/srat.h Fri May 30 21:58:44 2003 @@ -28,8 +28,9 @@ #define _ASM_SRAT_H_ #define MAX_NUMNODES 8 +#ifndef __ASSEMBLY__ extern void get_memcfg_from_srat(void); extern unsigned long *get_zholes_size(int); #define get_memcfg_numa() get_memcfg_from_srat() - +#endif #endif /* _ASM_SRAT_H_ */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/thread_info.h 900-mjb1/include/asm-i386/thread_info.h --- 000-virgin/include/asm-i386/thread_info.h Mon Mar 17 21:43:48 2003 +++ 900-mjb1/include/asm-i386/thread_info.h Fri May 30 19:24:54 2003 @@ -9,6 +9,7 @@ #ifdef __KERNEL__ +#include #ifndef __ASSEMBLY__ #include #endif @@ -30,9 +31,11 @@ struct thread_info { __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: + 0 for interrupts: illegal 0-0xBFFFFFFF for user-thead 0-0xFFFFFFFF for kernel-thread */ + struct thread_info *irq_stack; /* pointer to cpu irq stack */ struct restart_block restart_block; __u8 supervisor_stack[0]; @@ -48,7 +51,8 @@ struct thread_info { #define TI_CPU 0x00000010 #define TI_PRE_COUNT 0x00000014 #define TI_ADDR_LIMIT 
0x00000018 -#define TI_RESTART_BLOCK 0x000001C +#define TI_IRQ_STACK 0x0000001C +#define TI_RESTART_BLOCK 0x0000026 #endif @@ -59,48 +63,66 @@ struct thread_info { * * preempt_count needs to be 1 initially, until the scheduler is functional. */ +#ifdef CONFIG_4K_STACK +#define THREAD_ORDER 0 +#define STACK_WARN 0x200 +#define STACK_PANIC 0x100 +#else +#define THREAD_ORDER 1 +#define STACK_WARN ((THREAD_SIZE)>>1) +#define STACK_PANIC 0x100 +#endif +#define INIT_THREAD_SIZE THREAD_SIZE + #ifndef __ASSEMBLY__ -#define INIT_THREAD_INFO(tsk) \ -{ \ - .task = &tsk, \ - .exec_domain = &default_exec_domain, \ - .flags = 0, \ - .cpu = 0, \ - .preempt_count = 1, \ - .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ +#define INIT_THREAD_INFO(tsk) \ +{ \ + .task = &tsk, \ + .exec_domain = &default_exec_domain, \ + .flags = 0, \ + .cpu = 0, \ + .preempt_count = 1, \ + .addr_limit = KERNEL_DS, \ + .irq_stack = &init_irq_union.thread_info, \ + .restart_block = { \ + .fn = do_no_restart_syscall, \ + } \ } #define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) +/* thread information allocation */ +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) +#define alloc_thread_info() kmalloc(THREAD_SIZE, GFP_KERNEL) +#define free_thread_info(ti) kfree(ti) +#define get_thread_info(ti) get_task_struct((ti)->task) +#define put_thread_info(ti) put_task_struct((ti)->task) + /* how to get the thread information struct from C */ static inline struct thread_info *current_thread_info(void) { struct thread_info *ti; - __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~8191UL)); + __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~(THREAD_SIZE - 1))); return ti; } -/* thread information allocation */ -#define THREAD_SIZE (2*PAGE_SIZE) -#define alloc_thread_info() ((struct thread_info *) __get_free_pages(GFP_KERNEL,1)) -#define free_thread_info(ti) free_pages((unsigned long) (ti), 1) -#define get_thread_info(ti) get_task_struct((ti)->task) -#define put_thread_info(ti) put_task_struct((ti)->task) - #else /* !__ASSEMBLY__ */ +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) + /* how to get the thread information struct from ASM */ #define GET_THREAD_INFO(reg) \ - movl $-8192, reg; \ + movl $-THREAD_SIZE, reg; \ andl %esp, reg -#endif +/* use this one if reg already contains %esp */ +#define GET_THREAD_INFO_WITH_ESP(reg) \ + andl $-THREAD_SIZE, reg +#endif + /* * thread information flags * - these are process state flags that various assembly files may need to access diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/unistd.h 900-mjb1/include/asm-i386/unistd.h --- 000-virgin/include/asm-i386/unistd.h Fri May 30 19:02:20 2003 +++ 900-mjb1/include/asm-i386/unistd.h Fri May 30 19:28:10 2003 @@ -273,8 +273,9 @@ #define __NR_clock_gettime (__NR_timer_create+6) #define __NR_clock_getres (__NR_timer_create+7) #define __NR_clock_nanosleep (__NR_timer_create+8) +#define __NR_membind 268 -#define NR_syscalls 268 +#define NR_syscalls 269 /* user-visible error numbers are in the range -1 - -124: see */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-ia64/lockmeter.h 900-mjb1/include/asm-ia64/lockmeter.h --- 000-virgin/include/asm-ia64/lockmeter.h Wed Dec 31 16:00:00 1969 +++ 900-mjb1/include/asm-ia64/lockmeter.h Fri May 30 19:26:45 2003 @@ -0,0 +1,72 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. 
+ * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + */ + +#ifndef _IA64_LOCKMETER_H +#define _IA64_LOCKMETER_H + +#ifdef local_cpu_data +#define CPU_CYCLE_FREQUENCY local_cpu_data->itc_freq +#else +#define CPU_CYCLE_FREQUENCY my_cpu_data.itc_freq +#endif +#define get_cycles64() get_cycles() + +#define THIS_CPU_NUMBER smp_processor_id() + +/* + * macros to cache and retrieve an index value inside of a lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. + * we also assume that the hash table has less than 32767 entries. + */ +/* + * instrumented spinlock structure -- never used to allocate storage + * only used in macros below to overlay a spinlock_t + */ +typedef struct inst_spinlock_s { + /* remember, Intel is little endian */ + volatile unsigned short lock; + volatile unsigned short index; +} inst_spinlock_t; +#define PUT_INDEX(lock_ptr,indexv) ((inst_spinlock_t *)(lock_ptr))->index = indexv +#define GET_INDEX(lock_ptr) ((inst_spinlock_t *)(lock_ptr))->index + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int read_counter:31; + volatile int write_lock:1; + volatile unsigned short index; + volatile unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return the number of readers for a rwlock_t + */ +#define RWLOCK_READERS(rwlock_ptr) ((rwlock_ptr)->read_counter) + +/* + * return true if rwlock is write locked + * (note that other lock attempts can cause the lock value to be negative) + */ +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->write_lock) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((rwlock_ptr)->read_counter) + +#endif /* _IA64_LOCKMETER_H */ + diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-ia64/spinlock.h 900-mjb1/include/asm-ia64/spinlock.h --- 000-virgin/include/asm-ia64/spinlock.h Fri May 30 19:02:21 2003 +++ 900-mjb1/include/asm-ia64/spinlock.h Fri May 30 19:26:45 2003 @@ -182,4 +182,25 @@ do { \ clear_bit(31, (x)); \ }) +#ifdef CONFIG_LOCKMETER +extern void _metered_spin_lock (spinlock_t *lock); +extern void _metered_spin_unlock(spinlock_t *lock); + +/* + * Use a less efficient, and inline, atomic_dec_and_lock() if lockmetering + * so we can see the callerPC of who is actually doing the spin_lock(). + * Otherwise, all we see is the generic rollup of all locks done by + * atomic_dec_and_lock(). 
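+ *
+ * A minimal caller-side sketch, assuming a lockmetered build ("obj_refs"
+ * and "obj_lock" are hypothetical names used only for illustration):
+ *
+ *	static atomic_t obj_refs = ATOMIC_INIT(1);
+ *	static spinlock_t obj_lock = SPIN_LOCK_UNLOCKED;
+ *
+ *	if (atomic_dec_and_lock(&obj_refs, &obj_lock)) {
+ *		-- last reference dropped; obj_lock is held here
+ *		spin_unlock(&obj_lock);
+ *	}
+ *
+ * With this inline version the lock hold time is charged to the caller
+ * above, rather than to a single generic atomic_dec_and_lock() entry.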
+ */ +static inline int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +{ + _metered_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + _metered_spin_unlock(lock); + return 0; +} +#define ATOMIC_DEC_AND_LOCK +#endif + #endif /* _ASM_IA64_SPINLOCK_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-mips/lockmeter.h 900-mjb1/include/asm-mips/lockmeter.h --- 000-virgin/include/asm-mips/lockmeter.h Wed Dec 31 16:00:00 1969 +++ 900-mjb1/include/asm-mips/lockmeter.h Fri May 30 19:26:45 2003 @@ -0,0 +1,126 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + * Ported to mips32 for Asita Technologies + * by D.J. Barrow ( dj.barrow@asitatechnologies.com ) + */ +#ifndef _ASM_LOCKMETER_H +#define _ASM_LOCKMETER_H + +/* do_gettimeoffset is a function pointer on mips */ +/* & it is not included by */ +#include +#include +#include + +#define SPINLOCK_MAGIC_INIT /* */ + +#define CPU_CYCLE_FREQUENCY get_cpu_cycle_frequency() + +#define THIS_CPU_NUMBER smp_processor_id() + +static uint32_t cpu_cycle_frequency = 0; + +static uint32_t get_cpu_cycle_frequency(void) +{ + /* a total hack, slow and invasive, but ... it works */ + int sec; + uint32_t start_cycles; + struct timeval tv; + + if (cpu_cycle_frequency == 0) { /* uninitialized */ + do_gettimeofday(&tv); + sec = tv.tv_sec; /* set up to catch the tv_sec rollover */ + while (sec == tv.tv_sec) { do_gettimeofday(&tv); } + sec = tv.tv_sec; /* rolled over to a new sec value */ + start_cycles = get_cycles(); + while (sec == tv.tv_sec) { do_gettimeofday(&tv); } + cpu_cycle_frequency = get_cycles() - start_cycles; + } + + return cpu_cycle_frequency; +} + +extern struct timeval xtime; + +static uint64_t get_cycles64(void) +{ + static uint64_t last_get_cycles64 = 0; + uint64_t ret; + unsigned long sec; + unsigned long usec, usec_offset; + +again: + sec = xtime.tv_sec; + usec = xtime.tv_usec; + usec_offset = do_gettimeoffset(); + if ((xtime.tv_sec != sec) || + (xtime.tv_usec != usec)|| + (usec_offset >= 20000)) + goto again; + + ret = ((uint64_t)(usec + usec_offset) * cpu_cycle_frequency); + /* We can't do a normal 64 bit division on mips without libgcc.a */ + do_div(ret,1000000); + ret += ((uint64_t)sec * cpu_cycle_frequency); + + /* XXX why does time go backwards? do_gettimeoffset? general time adj? */ + if (ret <= last_get_cycles64) + ret = last_get_cycles64+1; + last_get_cycles64 = ret; + + return ret; +} + +/* + * macros to cache and retrieve an index value inside of a lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. + * we also assume that the hash table has less than 32767 entries. 
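+ * a worked example in terms of the PUT_INDEX()/GET_INDEX() macros defined
+ * below (the index value 5 is arbitrary, chosen only for illustration):
+ *	PUT_INDEX(lockp, 5);	-- lock word becomes (old & 0x8000ffff) | 0x00050000
+ *	GET_INDEX(lockp);	-- returns 5; the reader-count bits are untouched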
+ * the high order bit is used for write locking a rw_lock + */ +#define INDEX_MASK 0x7FFF0000 +#define READERS_MASK 0x0000FFFF +#define INDEX_SHIFT 16 +#define PUT_INDEX(lockp,index) \ + lockp->lock = (((lockp->lock) & ~INDEX_MASK) | (index) << INDEX_SHIFT) +#define GET_INDEX(lockp) \ + (((lockp->lock) & INDEX_MASK) >> INDEX_SHIFT) + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int lock; + unsigned short index; + unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return the number of readers for a rwlock_t + */ +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) + +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + int tmp = (int) rwlock_ptr->lock; + return (tmp >= 0) ? tmp : 0; +} + +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock < 0) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock > 0) + +#endif /* _ASM_LOCKMETER_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-mips/spinlock.h 900-mjb1/include/asm-mips/spinlock.h --- 000-virgin/include/asm-mips/spinlock.h Sun Nov 17 20:29:56 2002 +++ 900-mjb1/include/asm-mips/spinlock.h Fri May 30 19:26:45 2003 @@ -74,9 +74,18 @@ static inline void spin_unlock(spinlock_ typedef struct { volatile unsigned int lock; +#if CONFIG_LOCKMETER + /* required for LOCKMETER since all bits in lock are used */ + /* and we need this storage for CPU and lock INDEX */ + unsigned lockmeter_magic; +#endif } rwlock_t; +#ifdef CONFIG_LOCKMETER +#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0 } +#else #define RW_LOCK_UNLOCKED (rwlock_t) { 0 } +#endif static inline void read_lock(rwlock_t *rw) { diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-mips64/lockmeter.h 900-mjb1/include/asm-mips64/lockmeter.h --- 000-virgin/include/asm-mips64/lockmeter.h Wed Dec 31 16:00:00 1969 +++ 900-mjb1/include/asm-mips64/lockmeter.h Fri May 30 19:26:45 2003 @@ -0,0 +1,120 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + */ + +#ifndef _ASM_LOCKMETER_H +#define _ASM_LOCKMETER_H + +#include + +#define SPINLOCK_MAGIC_INIT /* */ + +#define CPU_CYCLE_FREQUENCY get_cpu_cycle_frequency() + +#define THIS_CPU_NUMBER smp_processor_id() + +static uint32_t cpu_cycle_frequency = 0; + +static uint32_t get_cpu_cycle_frequency(void) +{ + /* a total hack, slow and invasive, but ... 
it works */ + int sec; + uint32_t start_cycles; + struct timeval tv; + + if (cpu_cycle_frequency == 0) { /* uninitialized */ + do_gettimeofday(&tv); + sec = tv.tv_sec; /* set up to catch the tv_sec rollover */ + while (sec == tv.tv_sec) { do_gettimeofday(&tv); } + sec = tv.tv_sec; /* rolled over to a new sec value */ + start_cycles = get_cycles(); + while (sec == tv.tv_sec) { do_gettimeofday(&tv); } + cpu_cycle_frequency = get_cycles() - start_cycles; + } + + return cpu_cycle_frequency; +} + +extern struct timeval xtime; +extern long do_gettimeoffset(void); + +static uint64_t get_cycles64(void) +{ + static uint64_t last_get_cycles64 = 0; + uint64_t ret; + unsigned long sec; + unsigned long usec, usec_offset; + +again: + sec = xtime.tv_sec; + usec = xtime.tv_usec; + usec_offset = do_gettimeoffset(); + if ((xtime.tv_sec != sec) || + (xtime.tv_usec != usec)|| + (usec_offset >= 20000)) + goto again; + + ret = ((uint64_t)sec * cpu_cycle_frequency) + + ( ((uint64_t)(usec + usec_offset) * cpu_cycle_frequency) / 1000000 ); + + /* XXX why does time go backwards? do_gettimeoffset? general time adj? */ + if (ret <= last_get_cycles64) + ret = last_get_cycles64+1; + last_get_cycles64 = ret; + + return ret; +} + +/* + * macros to cache and retrieve an index value inside of a lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. + * we also assume that the hash table has less than 32767 entries. + * the high order bit is used for write locking a rw_lock + */ +#define INDEX_MASK 0x7FFF0000 +#define READERS_MASK 0x0000FFFF +#define INDEX_SHIFT 16 +#define PUT_INDEX(lockp,index) \ + lockp->lock = (((lockp->lock) & ~INDEX_MASK) | (index) << INDEX_SHIFT) +#define GET_INDEX(lockp) \ + (((lockp->lock) & INDEX_MASK) >> INDEX_SHIFT) + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int lock; + unsigned short index; + unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return the number of readers for a rwlock_t + */ +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) + +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + int tmp = (int) rwlock_ptr->lock; + return (tmp >= 0) ? 
tmp : 0; +} + +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock < 0) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock > 0) + +#endif /* _ASM_LOCKMETER_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-mips64/pgtable.h 900-mjb1/include/asm-mips64/pgtable.h --- 000-virgin/include/asm-mips64/pgtable.h Sun Apr 20 19:35:05 2003 +++ 900-mjb1/include/asm-mips64/pgtable.h Fri May 30 19:26:45 2003 @@ -126,7 +126,7 @@ extern void (*_flush_cache_l1)(void); #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) #define FIRST_USER_PGD_NR 0 -#define KPTBL_PAGE_ORDER 1 +#define KPTBL_PAGE_ORDER 2 #define VMALLOC_START XKSEG #define VMALLOC_VMADDR(x) ((unsigned long)(x)) #define VMALLOC_END \ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-ppc64/semaphore.h 900-mjb1/include/asm-ppc64/semaphore.h --- 000-virgin/include/asm-ppc64/semaphore.h Tue Feb 25 23:03:50 2003 +++ 900-mjb1/include/asm-ppc64/semaphore.h Fri May 30 22:04:58 2003 @@ -68,12 +68,14 @@ static inline void init_MUTEX_LOCKED (st sema_init(sem, 0); } -extern void __down(struct semaphore * sem); +extern void __down_wq(struct semaphore * sem, wait_queue_t *wait); extern int __down_interruptible(struct semaphore * sem); extern void __up(struct semaphore * sem); -static inline void down(struct semaphore * sem) +static inline int down_wq(struct semaphore * sem, wait_queue_t *wait) { + int ret = 0; + #ifdef WAITQUEUE_DEBUG CHECK_MAGIC(sem->__magic); #endif @@ -83,8 +85,14 @@ static inline void down(struct semaphore * Try to get the semaphore, take the slow path if we fail. */ if (atomic_dec_return(&sem->count) < 0) - __down(sem); + ret =__down_wq(sem, wait); smp_wmb(); + return ret; +} + +static inline void down(struct semaphore * sem) +{ + down_wq(sem, NULL); } static inline int down_interruptible(struct semaphore * sem) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-sparc64/lockmeter.h 900-mjb1/include/asm-sparc64/lockmeter.h --- 000-virgin/include/asm-sparc64/lockmeter.h Wed Dec 31 16:00:00 1969 +++ 900-mjb1/include/asm-sparc64/lockmeter.h Fri May 30 19:26:45 2003 @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2000 Anton Blanchard (anton@linuxcare.com) + */ + +#ifndef _SPARC64_LOCKMETER_H +#define _SPARC64_LOCKMETER_H + +#include + +#include + +extern unsigned long cpu_hz; +#define CPU_CYCLE_FREQUENCY cpu_hz + +#define THIS_CPU_NUMBER __cpu_number_map[smp_processor_id()] + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0) +#define local_irq_save(x) __save_and_cli(x) +#define local_irq_restore(x) __restore_flags(x) +#endif /* Linux version 2.2.x */ + +#define PUT_INDEX(lock_ptr,indexv) (lock_ptr)->index = (indexv) +#define GET_INDEX(lock_ptr) (lock_ptr)->index + +#define PUT_RWINDEX(rwlock_ptr,indexv) (rwlock_ptr)->index = (indexv) +#define GET_RWINDEX(rwlock_ptr) (rwlock_ptr)->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) (rwlock_ptr)->cpu = (cpuv) +#define GET_RW_CPU(rwlock_ptr) (rwlock_ptr)->cpu + +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) + +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + signed int tmp = rwlock_ptr->lock; + + if (tmp > 0) + return tmp; + else + return 0; +} + +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((signed int)((rwlock_ptr)->lock) < 0) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((signed int)((rwlock_ptr)->lock) > 0) + +#define get_cycles64() get_cycles() + +#endif /* _SPARC64_LOCKMETER_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-sparc64/spinlock.h 900-mjb1/include/asm-sparc64/spinlock.h --- 
000-virgin/include/asm-sparc64/spinlock.h Sun Nov 17 20:29:27 2002 +++ 900-mjb1/include/asm-sparc64/spinlock.h Fri May 30 19:26:45 2003 @@ -30,15 +30,23 @@ #ifndef CONFIG_DEBUG_SPINLOCK -typedef unsigned char spinlock_t; -#define SPIN_LOCK_UNLOCKED 0 +typedef struct { + unsigned char lock; + unsigned int index; +} spinlock_t; -#define spin_lock_init(lock) (*((unsigned char *)(lock)) = 0) -#define spin_is_locked(lock) (*((volatile unsigned char *)(lock)) != 0) +#ifdef CONFIG_LOCKMETER +#define SPIN_LOCK_UNLOCKED (spinlock_t) {0, 0} +#else +#define SPIN_LOCK_UNLOCKED (spinlock_t) { 0 } +#endif + +#define spin_lock_init(__lock) do { *(__lock) = SPIN_LOCK_UNLOCKED; } while(0) +#define spin_is_locked(__lock) (*((volatile unsigned char *)(&((__lock)->lock))) != 0) -#define spin_unlock_wait(lock) \ +#define spin_unlock_wait(__lock) \ do { membar("#LoadLoad"); \ -} while(*((volatile unsigned char *)lock)) +} while(*((volatile unsigned char *)(&(((spinlock_t *)__lock)->lock)))) static __inline__ void _raw_spin_lock(spinlock_t *lock) { @@ -109,8 +117,20 @@ extern int _spin_trylock (spinlock_t *lo #ifndef CONFIG_DEBUG_SPINLOCK -typedef unsigned int rwlock_t; -#define RW_LOCK_UNLOCKED 0 +#ifdef CONFIG_LOCKMETER +typedef struct { + unsigned int lock; + unsigned int index; + unsigned int cpu; +} rwlock_t; +#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0, 0xff } +#else +typedef struct { + unsigned int lock; +} rwlock_t; +#define RW_LOCK_UNLOCKED (rwlock_t) { 0 } +#endif + #define rwlock_init(lp) do { *(lp) = RW_LOCK_UNLOCKED; } while(0) #define rwlock_is_locked(x) (*(x) != RW_LOCK_UNLOCKED) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-x86_64/early_printk.h 900-mjb1/include/asm-x86_64/early_printk.h --- 000-virgin/include/asm-x86_64/early_printk.h Wed Dec 31 16:00:00 1969 +++ 900-mjb1/include/asm-x86_64/early_printk.h Fri May 30 19:03:23 2003 @@ -0,0 +1,8 @@ +#ifdef __EARLY_PRINTK_H_X86_64_ +#define __EARLY_PRINTK_H_X86_64_ + +#define VGABASE 0xffffffff800b8000UL +#define SERIAL_BASES { 0x3f8, 0x2f8 } +#define SERIAL_BASES_LEN 2 + +#endif diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-x86_64/semaphore.h 900-mjb1/include/asm-x86_64/semaphore.h --- 000-virgin/include/asm-x86_64/semaphore.h Wed Mar 5 07:37:07 2003 +++ 900-mjb1/include/asm-x86_64/semaphore.h Fri May 30 22:04:59 2003 @@ -98,39 +98,48 @@ static inline void init_MUTEX_LOCKED (st sema_init(sem, 0); } -asmlinkage void __down_failed(void /* special register calling convention */); +asmlinkage int __down_failed_wq(void /* special register calling convention */); asmlinkage int __down_failed_interruptible(void /* params in registers */); asmlinkage int __down_failed_trylock(void /* params in registers */); asmlinkage void __up_wakeup(void /* special register calling convention */); -asmlinkage void __down(struct semaphore * sem); +asmlinkage int __down_wq(struct semaphore * sem, wait_queue_t *wait); asmlinkage int __down_interruptible(struct semaphore * sem); asmlinkage int __down_trylock(struct semaphore * sem); asmlinkage void __up(struct semaphore * sem); /* * This is ugly, but we want the default case to fall through. - * "__down_failed" is a special asm handler that calls the C + * "__down_failed_wq" is a special asm handler that calls the C * routine that actually waits. 
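+ *
+ * A caller-side sketch of the wait-queue variant (illustrative only: the
+ * exact non-zero return convention is defined by __down_failed_wq, and
+ * "my_wait" is a hypothetical wait_queue_t owned by the caller):
+ *
+ *	if (down_wq(&some_sem, &my_wait))
+ *		return -EIOCBRETRY;	-- would have blocked; retry later
+ *
+ * Passing a NULL wait pointer gives ordinary blocking behaviour, which is
+ * how down() below is implemented.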
See arch/x86_64/kernel/semaphore.c */ -static inline void down(struct semaphore * sem) +static inline int down_wq(struct semaphore * sem, wait_queue_t *wait) { + int result; + #if WAITQUEUE_DEBUG CHECK_MAGIC(sem->__magic); #endif __asm__ __volatile__( "# atomic down operation\n\t" - LOCK "decl %0\n\t" /* --sem->count */ - "js 2f\n" + LOCK "decl %1\n\t" /* --sem->count */ + "js 2f\n\t" + "xorl %0,%0\n" "1:\n" LOCK_SECTION_START("") - "2:\tcall __down_failed\n\t" + "2:\tcall __down_failed_wq\n\t" "jmp 1b\n" LOCK_SECTION_END - :"=m" (sem->count) - :"D" (sem) + :"=a" (result), "=m" (sem->count) + :"D" (sem), "S" (wait) :"memory"); + return result; +} + +static inline void down(struct semaphore * sem) +{ + down_wq(sem, NULL); } /* diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/aio.h 900-mjb1/include/linux/aio.h --- 000-virgin/include/linux/aio.h Fri May 30 19:02:22 2003 +++ 900-mjb1/include/linux/aio.h Fri May 30 22:02:34 2003 @@ -54,7 +54,7 @@ struct kiocb { struct file *ki_filp; struct kioctx *ki_ctx; /* may be NULL for sync ops */ int (*ki_cancel)(struct kiocb *, struct io_event *); - long (*ki_retry)(struct kiocb *); + ssize_t (*ki_retry)(struct kiocb *); struct list_head ki_list; /* the aio core uses this * for cancellation */ @@ -62,6 +62,16 @@ struct kiocb { void *ki_user_obj; /* pointer to userland's iocb */ __u64 ki_user_data; /* user's data for completion */ loff_t ki_pos; + + /* State that we remember to be able to restart/retry */ + unsigned short ki_opcode; + size_t ki_nbytes; /* copy of iocb->aio_nbytes */ + char *ki_buf; /* remaining iocb->aio_buf */ + size_t ki_left; /* remaining bytes */ + wait_queue_t ki_wait; + long ki_retried; /* just for testing */ + long ki_kicked; /* just for testing */ + long ki_queued; /* just for testing */ char private[KIOCB_PRIVATE_SIZE]; }; @@ -77,6 +87,8 @@ struct kiocb { (x)->ki_ctx = &tsk->active_mm->default_kioctx; \ (x)->ki_cancel = NULL; \ (x)->ki_user_obj = tsk; \ + (x)->ki_user_data = 0; \ + init_wait((&(x)->ki_wait)); \ } while (0) #define AIO_RING_MAGIC 0xa10a10a1 @@ -158,6 +170,24 @@ int FASTCALL(io_submit_one(struct kioctx #define get_ioctx(kioctx) do { if (unlikely(atomic_read(&(kioctx)->users) <= 0)) BUG(); atomic_inc(&(kioctx)->users); } while (0) #define put_ioctx(kioctx) do { if (unlikely(atomic_dec_and_test(&(kioctx)->users))) __put_ioctx(kioctx); else if (unlikely(atomic_read(&(kioctx)->users) < 0)) BUG(); } while (0) + +#define in_aio() !is_sync_wait(current->io_wait) + +/* when sync behaviour is desired even if running in async context */ +#define do_sync_op(op) if (in_aio()) { \ + wait_queue_t *wait = current->io_wait; \ + current->io_wait = NULL; \ + op; \ + current->io_wait = wait; \ +} else { \ + op; \ +} + +#define warn_if_async() if (in_aio()) {\ + printk(KERN_ERR "%s(%s:%d) called in async context!\n", \ + __FUNCTION__, __FILE__, __LINE__); \ + dump_stack(); \ + } #include diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/binding.h 900-mjb1/include/linux/binding.h --- 000-virgin/include/linux/binding.h Wed Dec 31 16:00:00 1969 +++ 900-mjb1/include/linux/binding.h Fri May 30 19:28:10 2003 @@ -0,0 +1,19 @@ +#ifndef _LINUX_BINDING_H +#define _LINUX_BINDING_H + +#include + +/* Policy flags for memory bindings */ +#define MB_SOFTBIND (1<<0) /* Let page faults fall back to other memblks */ +#define MB_HARDBIND (1<<1) /* Don't let page faults fall back to other memblks */ +#define MB_FIRSTREF (1<<2) /* Allocate pages on closest memblk to faulting CPU */ +#define MB_STRIPE (1<<3) /* Allocate 
pages evenly across memblks */ +#define MB_MOSTFREE (1<<4) /* Allocate pages on memblk with most free mem */ + +/* Structure to keep track of shared memory segment bindings */ +struct binding { + unsigned long policy; + struct zonelist zonelist; +}; + +#endif /* _LINUX_BINDING_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/blkdev.h 900-mjb1/include/linux/blkdev.h --- 000-virgin/include/linux/blkdev.h Fri May 30 19:02:22 2003 +++ 900-mjb1/include/linux/blkdev.h Fri May 30 22:04:50 2003 @@ -21,7 +21,7 @@ struct elevator_s; typedef struct elevator_s elevator_t; #define BLKDEV_MIN_RQ 4 -#define BLKDEV_MAX_RQ 128 +#define BLKDEV_MAX_RQ 32 struct request_list { int count[2]; @@ -454,6 +454,7 @@ extern int blk_queue_init_tags(request_q extern void blk_queue_free_tags(request_queue_t *); extern void blk_queue_invalidate_tags(request_queue_t *); extern void blk_congestion_wait(int rw, long timeout); +extern int blk_congestion_wait_wq(int rw, long timeout, wait_queue_t *wait); extern void blk_rq_bio_prep(request_queue_t *, struct request *, struct bio *); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/buffer_head.h 900-mjb1/include/linux/buffer_head.h --- 000-virgin/include/linux/buffer_head.h Sat May 10 18:35:03 2003 +++ 900-mjb1/include/linux/buffer_head.h Fri May 30 22:04:55 2003 @@ -158,6 +158,7 @@ void mark_buffer_async_write(struct buff void invalidate_bdev(struct block_device *, int); int sync_blockdev(struct block_device *bdev); void __wait_on_buffer(struct buffer_head *); +int __wait_on_buffer_wq(struct buffer_head *, wait_queue_t *wait); wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); void wake_up_buffer(struct buffer_head *bh); int fsync_bdev(struct block_device *); @@ -168,6 +169,8 @@ struct buffer_head * __getblk(struct blo void __brelse(struct buffer_head *); void __bforget(struct buffer_head *); struct buffer_head *__bread(struct block_device *, sector_t block, int size); +struct buffer_head *__bread_wq(struct block_device *, sector_t block, + int size, wait_queue_t *wait); struct buffer_head *alloc_buffer_head(int gfp_flags); void free_buffer_head(struct buffer_head * bh); void FASTCALL(unlock_buffer(struct buffer_head *bh)); @@ -225,13 +228,13 @@ static inline void put_bh(struct buffer_ static inline void brelse(struct buffer_head *bh) { - if (bh) + if (bh && !IS_ERR(bh)) __brelse(bh); } static inline void bforget(struct buffer_head *bh) { - if (bh) + if (bh && !IS_ERR(bh)) __bforget(bh); } @@ -242,7 +245,12 @@ sb_bread(struct super_block *sb, sector_ } static inline struct buffer_head * -sb_getblk(struct super_block *sb, sector_t block) +sb_bread_wq(struct super_block *sb, sector_t block, wait_queue_t *wait) +{ + return __bread_wq(sb->s_bdev, block, sb->s_blocksize, wait); +} + +static inline struct buffer_head *sb_getblk(struct super_block *sb, sector_t block) { return __getblk(sb->s_bdev, block, sb->s_blocksize); } @@ -272,10 +280,26 @@ static inline void wait_on_buffer(struct __wait_on_buffer(bh); } +static inline int wait_on_buffer_wq(struct buffer_head *bh, wait_queue_t *wait) +{ + if (buffer_locked(bh)) + return __wait_on_buffer_wq(bh, wait); + + return 0; +} + static inline void lock_buffer(struct buffer_head *bh) { while (test_set_buffer_locked(bh)) __wait_on_buffer(bh); +} + +static inline int lock_buffer_wq(struct buffer_head *bh, wait_queue_t *wait) +{ + if (test_set_buffer_locked(bh)) + return __wait_on_buffer_wq(bh, wait); + + return 0; } #endif /* _LINUX_BUFFER_HEAD_H */ diff -urpN -X /home/fletch/.diff.exclude 
000-virgin/include/linux/compiler.h 900-mjb1/include/linux/compiler.h --- 000-virgin/include/linux/compiler.h Sun Apr 20 19:35:07 2003 +++ 900-mjb1/include/linux/compiler.h Fri May 30 19:28:15 2003 @@ -60,6 +60,6 @@ shouldn't recognize the original var, and make assumptions about it */ #define RELOC_HIDE(ptr, off) \ ({ unsigned long __ptr; \ - __asm__ ("" : "=g"(__ptr) : "0"(ptr)); \ + __asm__ ("" : "=r"(__ptr) : "0"(ptr)); \ (typeof(ptr)) (__ptr + (off)); }) #endif /* __LINUX_COMPILER_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/dump.h 900-mjb1/include/linux/dump.h --- 000-virgin/include/linux/dump.h Wed Dec 31 16:00:00 1969 +++ 900-mjb1/include/linux/dump.h Fri May 30 19:24:57 2003 @@ -0,0 +1,362 @@ +/* + * Kernel header file for Linux crash dumps. + * + * Created by: Matt Robinson (yakker@sgi.com) + * Copyright 1999 - 2002 Silicon Graphics, Inc. All rights reserved. + * + * vmdump.h to dump.h by: Matt D. Robinson (yakker@sourceforge.net) + * Copyright 2001 - 2002 Matt D. Robinson. All rights reserved. + * Copyright (C) 2002 Free Software Foundation, Inc. All rights reserved. + * + * Most of this is the same old stuff from vmdump.h, except now we're + * actually a stand-alone driver plugged into the block layer interface, + * with the exception that we now allow for compression modes externally + * loaded (e.g., someone can come up with their own). + * + * This code is released under version 2 of the GNU GPL. + */ + +/* This header file includes all structure definitions for crash dumps. */ +#ifndef _DUMP_H +#define _DUMP_H + +#if defined(CONFIG_CRASH_DUMP) || defined (CONFIG_CRASH_DUMP_MODULE) + +#include +#include +#include + +/* + * Predefine default DUMP_PAGE constants, asm header may override. + * + * On ia64 discontinuous memory systems it's possible for the memory + * banks to stop at 2**12 page alignments, the smallest possible page + * size. But the system page size, PAGE_SIZE, is in fact larger. + */ +#define DUMP_PAGE_SHIFT PAGE_SHIFT +#define DUMP_PAGE_MASK PAGE_MASK +#define DUMP_PAGE_ALIGN(addr) PAGE_ALIGN(addr) +#define DUMP_HEADER_OFFSET PAGE_SIZE + +/* keep DUMP_PAGE_SIZE constant to 4K = 1<<12 + * it may be different from PAGE_SIZE then. + */ +#define DUMP_PAGE_SIZE 4096 + +/* + * Predefined default memcpy() to use when copying memory to the dump buffer. + * + * On ia64 there is a heads up function that can be called to let the prom + * machine check monitor know that the current activity is risky and it should + * ignore the fault (nofault). In this case the ia64 header will redefine this + * macro to __dump_memcpy() and use it's arch specific version. + */ +#define DUMP_memcpy memcpy + +/* necessary header files */ +#include /* for architecture-specific header */ + +/* + * Size of the buffer that's used to hold: + * + * 1. the dump header (padded to fill the complete buffer) + * 2. 
the possibly compressed page headers and data + */ +#define DUMP_BUFFER_SIZE (64 * 1024) /* size of dump buffer */ +#define DUMP_HEADER_SIZE DUMP_BUFFER_SIZE + +/* standard header definitions */ +#define DUMP_MAGIC_NUMBER 0xa8190173618f23edULL /* dump magic number */ +#define DUMP_MAGIC_LIVE 0xa8190173618f23cdULL /* live magic number */ +#define DUMP_VERSION_NUMBER 0x8 /* dump version number */ +#define DUMP_PANIC_LEN 0x100 /* dump panic string length */ + +/* dump levels - type specific stuff added later -- add as necessary */ +#define DUMP_LEVEL_NONE 0x0 /* no dumping at all -- just bail */ +#define DUMP_LEVEL_HEADER 0x1 /* kernel dump header only */ +#define DUMP_LEVEL_KERN 0x2 /* dump header and kernel pages */ +#define DUMP_LEVEL_USED 0x4 /* dump header, kernel/user pages */ +#define DUMP_LEVEL_ALL_RAM 0x8 /* dump header, all RAM pages */ +#define DUMP_LEVEL_ALL 0x10 /* dump all memory RAM and firmware */ + + +/* dump compression options -- add as necessary */ +#define DUMP_COMPRESS_NONE 0x0 /* don't compress this dump */ +#define DUMP_COMPRESS_RLE 0x1 /* use RLE compression */ +#define DUMP_COMPRESS_GZIP 0x2 /* use GZIP compression */ + +/* dump flags - any dump-type specific flags -- add as necessary */ +#define DUMP_FLAGS_NONE 0x0 /* no flags are set for this dump */ + +#define DUMP_FLAGS_TARGETMASK 0xf0000000 /* handle special case targets */ +#define DUMP_FLAGS_DISKDUMP 0x80000000 /* dump to local disk */ +#define DUMP_FLAGS_NETDUMP 0x40000000 /* dump over the network */ + +/* dump header flags -- add as necessary */ +#define DUMP_DH_FLAGS_NONE 0x0 /* no flags set (error condition!) */ +#define DUMP_DH_RAW 0x1 /* raw page (no compression) */ +#define DUMP_DH_COMPRESSED 0x2 /* page is compressed */ +#define DUMP_DH_END 0x4 /* end marker on a full dump */ +#define DUMP_DH_TRUNCATED 0x8 /* dump is incomplete */ +#define DUMP_DH_TEST_PATTERN 0x10 /* dump page is a test pattern */ +#define DUMP_DH_NOT_USED 0x20 /* 1st bit not used in flags */ + +/* names for various dump parameters in /proc/kernel */ +#define DUMP_ROOT_NAME "sys/dump" +#define DUMP_DEVICE_NAME "device" +#define DUMP_COMPRESS_NAME "compress" +#define DUMP_LEVEL_NAME "level" +#define DUMP_FLAGS_NAME "flags" + +#define DUMP_SYSRQ_KEY 'd' /* key to use for MAGIC_SYSRQ key */ + +/* CTL_DUMP names: */ +enum +{ + CTL_DUMP_DEVICE=1, + CTL_DUMP_COMPRESS=3, + CTL_DUMP_LEVEL=3, + CTL_DUMP_FLAGS=4, + CTL_DUMP_TEST=5, +}; + + +/* page size for gzip compression -- buffered slightly beyond hardware PAGE_SIZE used by DUMP */ +#define DUMP_DPC_PAGE_SIZE (DUMP_PAGE_SIZE + 512) + +/* dump ioctl() control options */ +#define DIOSDUMPDEV 1 /* set the dump device */ +#define DIOGDUMPDEV 2 /* get the dump device */ +#define DIOSDUMPLEVEL 3 /* set the dump level */ +#define DIOGDUMPLEVEL 4 /* get the dump level */ +#define DIOSDUMPFLAGS 5 /* set the dump flag parameters */ +#define DIOGDUMPFLAGS 6 /* get the dump flag parameters */ +#define DIOSDUMPCOMPRESS 7 /* set the dump compress level */ +#define DIOGDUMPCOMPRESS 8 /* get the dump compress level */ + +/* these ioctls are used only by netdump module */ +#define DIOSTARGETIP 9 /* set the target m/c's ip */ +#define DIOGTARGETIP 10 /* get the target m/c's ip */ +#define DIOSTARGETPORT 11 /* set the target m/c's port */ +#define DIOGTARGETPORT 12 /* get the target m/c's port */ +#define DIOSSOURCEPORT 13 /* set the source m/c's port */ +#define DIOGSOURCEPORT 14 /* get the source m/c's port */ +#define DIOSETHADDR 15 /* set ethernet address */ +#define DIOGETHADDR 16 /* get ethernet address 
*/ + +/* + * Structure: __dump_header + * Function: This is the header dumped at the top of every valid crash + * dump. + */ +struct __dump_header { + /* the dump magic number -- unique to verify dump is valid */ + u64 dh_magic_number; + + /* the version number of this dump */ + u32 dh_version; + + /* the size of this header (in case we can't read it) */ + u32 dh_header_size; + + /* the level of this dump (just a header?) */ + u32 dh_dump_level; + + /* + * We assume dump_page_size to be 4K in every case. + * Store here the configurable system page size (4K, 8K, 16K, etc.) + */ + u32 dh_page_size; + + /* the size of all physical memory */ + u64 dh_memory_size; + + /* the start of physical memory */ + u64 dh_memory_start; + + /* the end of physical memory */ + u64 dh_memory_end; + + /* the number of hardware/physical pages in this dump specifically */ + u32 dh_num_dump_pages; + + /* the panic string, if available */ + char dh_panic_string[DUMP_PANIC_LEN]; + + /* timeval depends on architecture, two long values */ + struct { + u64 tv_sec; + u64 tv_usec; + } dh_time; /* the time of the system crash */ + + /* the NEW utsname (uname) information -- in character form */ + /* we do this so we don't have to include utsname.h */ + /* plus it helps us be more architecture independent */ + /* now maybe one day soon they'll make the [65] a #define! */ + char dh_utsname_sysname[65]; + char dh_utsname_nodename[65]; + char dh_utsname_release[65]; + char dh_utsname_version[65]; + char dh_utsname_machine[65]; + char dh_utsname_domainname[65]; + + /* the address of current task (OLD = void *, NEW = u64) */ + u64 dh_current_task; + + /* what type of compression we're using in this dump (if any) */ + u32 dh_dump_compress; + + /* any additional flags */ + u32 dh_dump_flags; + + /* any additional flags */ + u32 dh_dump_device; +} __attribute__((packed)); + +/* + * Structure: __dump_page + * Function: To act as the header associated to each physical page of + * memory saved in the system crash dump. This allows for + * easy reassembly of each crash dump page. The address bits + * are split to make things easier for 64-bit/32-bit system + * conversions. + * + * dp_byte_offset and dp_page_index are landmarks that are helpful when + * looking at a hex dump of /dev/vmdump, + */ +struct __dump_page { + /* the address of this dump page */ + u64 dp_address; + + /* the size of this dump page */ + u32 dp_size; + + /* flags (currently DUMP_COMPRESSED, DUMP_RAW or DUMP_END) */ + u32 dp_flags; +} __attribute__((packed)); + +/* + * Structure: __lkcdinfo + * Function: This structure contains information needed for the lkcdutils + * package (particularly lcrash) to determine what information is + * associated to this kernel, specifically. + */ +struct __lkcdinfo { + int arch; + int ptrsz; + int byte_order; + int linux_release; + int page_shift; + int page_size; + u64 page_mask; + u64 page_offset; + int stack_offset; +}; + +#ifdef __KERNEL__ + +/* + * Structure: __dump_compress + * Function: This is what an individual compression mechanism can use + * to plug in their own compression techniques. It's always + * best to build these as individual modules so that people + * can put in whatever they want. 
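+ *
+ * A minimal sketch of such a module, assuming the declarations below
+ * ("my_compress" and "my_dump_compress" are hypothetical names):
+ *
+ *	static u16 my_compress(const u8 *in, u16 inlen, u8 *out, u16 outlen);
+ *
+ *	static struct __dump_compress my_dump_compress = {
+ *		.compress_type	= DUMP_COMPRESS_GZIP,
+ *		.compress_name	= "gzip",
+ *		.compress_func	= my_compress,
+ *	};
+ *
+ *	dump_register_compression(&my_dump_compress);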
+ */ +struct __dump_compress { + /* the list_head structure for list storage */ + struct list_head list; + + /* the type of compression to use (DUMP_COMPRESS_XXX) */ + int compress_type; + const char *compress_name; + + /* the compression function to call */ + u16 (*compress_func)(const u8 *, u16, u8 *, u16); +}; + +/* functions for dump compression registration */ +extern void dump_register_compression(struct __dump_compress *); +extern void dump_unregister_compression(int); + +/* + * Structure dump_mbank[]: + * + * For CONFIG_DISCONTIGMEM systems this array specifies the + * memory banks/chunks that need to be dumped after a panic. + * + * For classic systems it specifies a single set of pages from + * 0 to max_mapnr. + */ +struct __dump_mbank { + u64 start; + u64 end; + int type; + int pad1; + long pad2; +}; + +#define DUMP_MBANK_TYPE_CONVENTIONAL_MEMORY 1 +#define DUMP_MBANK_TYPE_OTHER 2 + +#define MAXCHUNKS 256 +extern int dump_mbanks; +extern struct __dump_mbank dump_mbank[MAXCHUNKS]; + +/* notification event codes */ +#define DUMP_BEGIN 0x0001 /* dump beginning */ +#define DUMP_END 0x0002 /* dump ending */ + +/* Scheduler soft spin control. + * + * 0 - no dump in progress + * 1 - cpu0 is dumping, ... + */ +extern unsigned long dump_oncpu; +extern void dump_execute(const char *, const struct pt_regs *); + +/* + * Notifier list for kernel code which wants to be called + * at kernel dump. + */ +extern struct notifier_block *dump_notifier_list; +static inline int register_dump_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&dump_notifier_list, nb); +} +static inline int unregister_dump_notifier(struct notifier_block * nb) +{ + return notifier_chain_unregister(&dump_notifier_list, nb); +} + +/* + * Common Arch Specific Functions should be declared here. + * This allows the C compiler to detect discrepancies. + */ +extern void __dump_open(void); +extern void __dump_cleanup(void); +extern void __dump_init(u64); +extern void __dump_save_regs(struct pt_regs *, const struct pt_regs *); +extern int __dump_configure_header(const struct pt_regs *); +extern void __dump_irq_enable(void); +extern void __dump_irq_restore(void); +extern int __dump_page_valid(unsigned long index); +#ifdef CONFIG_SMP +extern void __dump_save_other_cpus(void); +#else +#define __dump_save_other_cpus() +#endif + +#endif /* __KERNEL__ */ + +#else /* !CONFIG_CRASH_DUMP */ + +/* If not configured then make code disappear! */ +#define register_dump_watchdog(x) do { } while(0) +#define unregister_dump_watchdog(x) do { } while(0) +#define register_dump_notifier(x) do { } while(0) +#define unregister_dump_notifier(x) do { } while(0) +#define dump_in_progress() 0 + +#endif /* !CONFIG_CRASH_DUMP */ + +#endif /* _DUMP_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/dump_netdev.h 900-mjb1/include/linux/dump_netdev.h --- 000-virgin/include/linux/dump_netdev.h Wed Dec 31 16:00:00 1969 +++ 900-mjb1/include/linux/dump_netdev.h Fri May 30 19:24:56 2003 @@ -0,0 +1,80 @@ +/* + * linux/drivers/net/netconsole.h + * + * Copyright (C) 2001 Ingo Molnar + * + * This file contains the implementation of an IRQ-safe, crash-safe + * kernel console implementation that outputs kernel messages to the + * network. + * + * Modification history: + * + * 2001-09-17 started by Ingo Molnar. 
+ */ + +/**************************************************************** + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + ****************************************************************/ + +#define NETCONSOLE_VERSION 0x03 + +enum netdump_commands { + COMM_NONE = 0, + COMM_SEND_MEM = 1, + COMM_EXIT = 2, + COMM_REBOOT = 3, + COMM_HELLO = 4, + COMM_GET_NR_PAGES = 5, + COMM_GET_PAGE_SIZE = 6, + COMM_START_NETDUMP_ACK = 7, + COMM_GET_REGS = 8, + COMM_GET_MAGIC = 9, + COMM_START_WRITE_NETDUMP_ACK = 10, +}; + +typedef struct netdump_req_s { + u64 magic; + u32 nr; + u32 command; + u32 from; + u32 to; +} req_t; + +enum netdump_replies { + REPLY_NONE = 0, + REPLY_ERROR = 1, + REPLY_LOG = 2, + REPLY_MEM = 3, + REPLY_RESERVED = 4, + REPLY_HELLO = 5, + REPLY_NR_PAGES = 6, + REPLY_PAGE_SIZE = 7, + REPLY_START_NETDUMP = 8, + REPLY_END_NETDUMP = 9, + REPLY_REGS = 10, + REPLY_MAGIC = 11, + REPLY_START_WRITE_NETDUMP = 12, +}; + +typedef struct netdump_reply_s { + u32 nr; + u32 code; + u32 info; +} reply_t; + +#define HEADER_LEN (1 + sizeof(reply_t)) + + diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/dumpdev.h 900-mjb1/include/linux/dumpdev.h --- 000-virgin/include/linux/dumpdev.h Wed Dec 31 16:00:00 1969 +++ 900-mjb1/include/linux/dumpdev.h Fri May 30 19:24:56 2003 @@ -0,0 +1,123 @@ +/* + * Generic dump device interfaces for flexible system dump + * (Enables variation of dump target types e.g disk, network, memory) + * + * These interfaces have evolved based on discussions on lkcd-devel. + * Eventually the intent is to support primary and secondary or + * alternate targets registered at the same time, with scope for + * situation based failover or multiple dump devices used for parallel + * dump i/o. + * + * Started: Oct 2002 - Suparna Bhattacharya (suparna@in.ibm.com) + * + * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. + * Copyright (C) 2002 International Business Machines Corp. + * + * This code is released under version 2 of the GNU GPL. 
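+ *
+ * A minimal sketch of how a dump target might hook in, assuming the
+ * structures declared below (the my_* names are hypothetical):
+ *
+ *	static struct dump_dev_ops my_ops = {
+ *		.open		= my_open,
+ *		.release	= my_release,
+ *		.silence	= my_silence,
+ *		.resume		= my_resume,
+ *		.seek		= my_seek,
+ *		.write		= my_write,
+ *	};
+ *	static struct dump_dev my_dev = { .type_name = "block", .ops = &my_ops };
+ *
+ *	dump_register_device(&my_dev);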
+ */ + +#ifndef _LINUX_DUMPDEV_H +#define _LINUX_DUMPDEV_H + +#include +#include +#include + +/* Determined by the dump target (device) type */ + +struct dump_dev; + +struct dump_dev_ops { + int (*open)(struct dump_dev *, unsigned long); /* configure */ + int (*release)(struct dump_dev *); /* unconfigure */ + int (*silence)(struct dump_dev *); /* when dump starts */ + int (*resume)(struct dump_dev *); /* when dump is over */ + int (*seek)(struct dump_dev *, loff_t); + /* trigger a write (async in nature typically) */ + int (*write)(struct dump_dev *, void *, unsigned long); + /* not usually used during dump, but option available */ + int (*read)(struct dump_dev *, void *, unsigned long); + /* use to poll for completion */ + int (*ready)(struct dump_dev *, void *); + int (*ioctl)(struct dump_dev *, unsigned int, unsigned long); +}; + +struct dump_dev { + char type_name[32]; /* block, net-poll etc */ + unsigned long device_id; /* interpreted differently for various types */ + struct dump_dev_ops *ops; + struct list_head list; + loff_t curr_offset; +}; + +/* + * dump_dev type variations: + */ + +/* block */ +struct dump_blockdev { + struct dump_dev ddev; + kdev_t kdev_id; + struct block_device *bdev; + struct bio *bio; + loff_t start_offset; + loff_t limit; + int err; +}; + +static inline struct dump_blockdev *DUMP_BDEV(struct dump_dev *dev) +{ + return container_of(dev, struct dump_blockdev, ddev); +} + +/* Dump device / target operation wrappers */ +/* These assume that dump_dev is initiatized to dump_config.dumper->dev */ + +extern struct dump_dev *dump_dev; + +static inline int dump_dev_open(unsigned long arg) +{ + return dump_dev->ops->open(dump_dev, arg); +} + +static inline int dump_dev_release(void) +{ + return dump_dev->ops->release(dump_dev); +} + +static inline int dump_dev_silence(void) +{ + return dump_dev->ops->silence(dump_dev); +} + +static inline int dump_dev_resume(void) +{ + return dump_dev->ops->resume(dump_dev); +} + +static inline int dump_dev_seek(loff_t offset) +{ + return dump_dev->ops->seek(dump_dev, offset); +} + +static inline int dump_dev_write(void *buf, unsigned long len) +{ + return dump_dev->ops->write(dump_dev, buf, len); +} + +static inline int dump_dev_ready(void *buf) +{ + return dump_dev->ops->ready(dump_dev, buf); +} + +static inline int dump_dev_ioctl(unsigned int cmd, unsigned long arg) +{ + if (!dump_dev->ops->ioctl) + return -EINVAL; + return dump_dev->ops->ioctl(dump_dev, cmd, arg); +} + +extern int dump_register_device(struct dump_dev *); +extern void dump_unregister_device(struct dump_dev *); + +#endif /* _LINUX_DUMPDEV_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/early_printk.h 900-mjb1/include/linux/early_printk.h --- 000-virgin/include/linux/early_printk.h Wed Dec 31 16:00:00 1969 +++ 900-mjb1/include/linux/early_printk.h Fri May 30 19:28:12 2003 @@ -0,0 +1,47 @@ +#ifndef __EARLY_PRINTK_H_ +#define __EARLY_PRINTK_H_ + +#ifdef CONFIG_EARLY_PRINTK +#include +#include +#include +#include +#include +#include + +/* Simple VGA output */ + +#define MAX_YPOS 25 +#define MAX_XPOS 80 + +/* Simple serial port output */ + +#define DEFAULT_BAUD 57600 +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ +#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define 
DLL 0 /* Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + + +void early_printk(const char *fmt, ...); +int __init setup_early_printk(); + +#else + +#define early_printk(...) do {} while(0) +#define setup_early_printk() do {} while(0) + +#endif + +#endif diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/errno.h 900-mjb1/include/linux/errno.h --- 000-virgin/include/linux/errno.h Fri Dec 13 23:18:13 2002 +++ 900-mjb1/include/linux/errno.h Fri May 30 22:02:34 2003 @@ -22,6 +22,7 @@ #define EBADTYPE 527 /* Type not supported by server */ #define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */ #define EIOCBQUEUED 529 /* iocb queued, will get completion event */ +#define EIOCBRETRY 530 /* iocb queued, will trigger a retry */ #endif diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/fs.h 900-mjb1/include/linux/fs.h --- 000-virgin/include/linux/fs.h Fri May 30 19:02:22 2003 +++ 900-mjb1/include/linux/fs.h Fri May 30 19:28:10 2003 @@ -19,6 +19,7 @@ #include #include #include +#include #include struct iovec; @@ -325,6 +326,7 @@ struct address_space { struct semaphore i_shared_sem; /* protect both above lists */ unsigned long dirtied_when; /* jiffies of first page dirtying */ int gfp_mask; /* how to allocate the pages */ + struct binding *binding; /* for memory bindings */ struct backing_dev_info *backing_dev_info; /* device readahead, etc */ spinlock_t private_lock; /* for use by the address_space */ struct list_head private_list; /* ditto */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/gdb.h 900-mjb1/include/linux/gdb.h --- 000-virgin/include/linux/gdb.h Wed Dec 31 16:00:00 1969 +++ 900-mjb1/include/linux/gdb.h Fri May 30 19:13:53 2003 @@ -0,0 +1,67 @@ +#ifndef _GDB_H_ +#define _GDB_H_ + +/* + * Copyright (C) 2001 Amit S. 
Kale + */ + +/* gdb locks */ +#define KGDB_MAX_NO_CPUS NR_CPUS + +extern int gdb_enter; /* 1 = enter debugger on boot */ +extern int gdb_ttyS; +extern int gdb_baud; +extern int gdb_initialized; + +extern int gdb_hook(void); +extern void breakpoint(void); + +typedef int gdb_debug_hook(int trapno, + int signo, + int err_code, + struct pt_regs *regs); +extern gdb_debug_hook *linux_debug_hook; + +#ifdef CONFIG_SMP +extern spinlock_t kgdb_spinlock; +extern spinlock_t kgdb_nmispinlock; +#else +extern unsigned kgdb_spinlock; +extern unsigned kgdb_nmispinlock; +#endif + +extern volatile int kgdb_memerr_expected; + +struct console; +void gdb_console_write(struct console *co, const char *s, + unsigned count); +void gdb_console_init(void); + +extern volatile int procindebug[KGDB_MAX_NO_CPUS]; + +#define KGDB_ASSERT(message, condition) do { \ + if (!(condition)) { \ + printk("kgdb assertion failed: %s\n", message); \ + asm ("int $0x3"); \ + } \ +} while (0) + +#ifdef CONFIG_KERNEL_ASSERTS +#define KERNEL_ASSERT(message, condition) KGDB_ASSERT(message, condition) +#else +#define KERNEL_ASSERT(message, condition) +#endif + +#define KA_VALID_ERRNO(errno) ((errno) > 0 && (errno) <= EMEDIUMTYPE) + +#define KA_VALID_PTR_ERR(ptr) KA_VALID_ERRNO(-PTR_ERR(ptr)) + +#define KA_VALID_KPTR(ptr) (!(ptr) || \ + ((void *)(ptr) >= (void *)PAGE_OFFSET && \ + (void *)(ptr) < ERR_PTR(-EMEDIUMTYPE))) + +#define KA_VALID_PTRORERR(errptr) (KA_VALID_KPTR(errptr) || KA_VALID_PTR_ERR(errptr)) + +#define KA_HELD_GKL() (current->lock_depth >= 0) + +#endif /* _GDB_H_ */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/init_task.h 900-mjb1/include/linux/init_task.h --- 000-virgin/include/linux/init_task.h Fri May 30 19:02:22 2003 +++ 900-mjb1/include/linux/init_task.h Fri May 30 22:02:34 2003 @@ -104,6 +104,7 @@ .proc_lock = SPIN_LOCK_UNLOCKED, \ .switch_lock = SPIN_LOCK_UNLOCKED, \ .journal_info = NULL, \ + .io_wait = NULL, \ } diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/kdev_t.h 900-mjb1/include/linux/kdev_t.h --- 000-virgin/include/linux/kdev_t.h Sat May 10 18:35:03 2003 +++ 900-mjb1/include/linux/kdev_t.h Fri May 30 19:33:26 2003 @@ -70,13 +70,13 @@ aeb - 950811 * static arrays, and they are sized for a 8-bit index. */ typedef struct { - unsigned short value; + unsigned int value; } kdev_t; -#define KDEV_MINOR_BITS 8 -#define KDEV_MAJOR_BITS 8 +#define KDEV_MINOR_BITS 16 +#define KDEV_MAJOR_BITS 16 -#define __mkdev(major,minor) (((major) << KDEV_MINOR_BITS) + (minor)) +#define __mkdev(major, minor) (((major) << KDEV_MINOR_BITS) + (minor)) #define mk_kdev(major, minor) ((kdev_t) { __mkdev(major,minor) } ) @@ -107,17 +107,55 @@ static inline int kdev_same(kdev_t dev1, #define kdev_none(d1) (!kdev_val(d1)) -/* Mask off the high bits for now.. */ -#define minor(dev) ((dev).value & 0xff) -#define major(dev) (((dev).value >> KDEV_MINOR_BITS) & 0xff) +#define minor(dev) ((dev).value & 0xffff) +#define major(dev) (((dev).value >> KDEV_MINOR_BITS) & 0xffff) /* These are for user-level "dev_t" */ +/* Since glibc uses 8+8 in , we'll get + incompatibilities with a simple scheme like 12+20. + Use 8+8 for 16-bit values, some other division, say 16+16, + for 32-bit values. 
*/ #define MINORBITS 8 #define MINORMASK ((1U << MINORBITS) - 1) -#define MAJOR(dev) ((unsigned int) ((dev) >> MINORBITS)) -#define MINOR(dev) ((unsigned int) ((dev) & MINORMASK)) -#define MKDEV(ma,mi) (((ma) << MINORBITS) | (mi)) +#include /* dev_t */ +#if 1 +/* macro versions */ + +#define MAJOR(dev) ((unsigned int)(((dev) & 0xffff0000) ? ((dev) >> 16) & 0xffff : ((dev) >> 8) & 0xff)) +#define MINOR(dev) ((unsigned int)(((dev) & 0xffff0000) ? ((dev) & 0xffff) : ((dev) & 0xff))) +#define MKDEV(ma,mi) ((dev_t)((((ma) & ~0xff) == 0 && ((mi) & ~0xff) == 0) ? (((ma) << 8) | (mi)) : (((ma) << 16) | (mi)))) + +#else +/* inline function versions */ + +static inline unsigned int +MAJOR(dev_t dev) { + unsigned int ma; + + ma = ((dev >> 16) & 0xffff); + if (ma == 0) + ma = ((dev >> 8) & 0xff); + return ma; +} + +static inline unsigned int +MINOR(dev_t dev) { + unsigned int mi; + + mi = (dev & 0xffff); + if (mi == dev) + mi = (dev & 0xff); + return mi; +} + +static inline dev_t +MKDEV(unsigned int ma, unsigned int mi) { + if ((ma & ~0xff) == 0 && (mi & ~0xff) == 0) + return ((ma << 8) | mi); + return ((ma << 16) | mi); +} +#endif /* * Conversion functions @@ -125,12 +163,16 @@ static inline int kdev_same(kdev_t dev1, static inline int kdev_t_to_nr(kdev_t dev) { - return MKDEV(major(dev), minor(dev)); + unsigned int ma = major(dev); + unsigned int mi = minor(dev); + return MKDEV(ma, mi); } -static inline kdev_t to_kdev_t(int dev) +static inline kdev_t to_kdev_t(dev_t dev) { - return mk_kdev(MAJOR(dev),MINOR(dev)); + unsigned int ma = MAJOR(dev); + unsigned int mi = MINOR(dev); + return mk_kdev(ma, mi); } #else /* __KERNEL__ */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/kprobes.h 900-mjb1/include/linux/kprobes.h --- 000-virgin/include/linux/kprobes.h Wed Dec 31 16:00:00 1969 +++ 900-mjb1/include/linux/kprobes.h Fri May 30 19:15:14 2003 @@ -0,0 +1,60 @@ +#ifndef _LINUX_KPROBES_H +#define _LINUX_KPROBES_H +#include +#include +#include +#include +#include + +struct kprobe; +struct pt_regs; + +typedef void (*kprobe_pre_handler_t)(struct kprobe *, struct pt_regs *); +typedef void (*kprobe_post_handler_t)(struct kprobe *, struct pt_regs *, + unsigned long flags); +typedef int (*kprobe_fault_handler_t)(struct kprobe *, struct pt_regs *, + int trapnr); + +struct kprobe { + struct list_head list; + + /* location of the probe point */ + kprobe_opcode_t *addr; + + /* Called before addr is executed. */ + kprobe_pre_handler_t pre_handler; + + /* Called after addr is executed, unless... */ + kprobe_post_handler_t post_handler; + + /* ... called if executing addr causes a fault (eg. page fault). + * Return 1 if it handled fault, otherwise kernel will see it. */ + kprobe_fault_handler_t fault_handler; + + /* Saved opcode (which has been replaced with breakpoint) */ + kprobe_opcode_t opcode; +}; + +#ifdef CONFIG_KPROBES +/* Locks kprobe: irq must be disabled */ +void lock_kprobes(void); +void unlock_kprobes(void); + +/* kprobe running now on this CPU? */ +static inline int kprobe_running(void) +{ + extern unsigned int kprobe_cpu; + return kprobe_cpu == smp_processor_id(); +} + +/* Get the kprobe at this addr (if any). 
Must have called lock_kprobes */ +struct kprobe *get_kprobe(void *addr); + +int register_kprobe(struct kprobe *p); +void unregister_kprobe(struct kprobe *p); +#else +static inline int kprobe_running(void) { return 0; } +static inline int register_kprobe(struct kprobe *p) { return -ENOSYS; } +static inline void unregister_kprobe(struct kprobe *p) { } +#endif +#endif /* _LINUX_KPROBES_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/lockmeter.h 900-mjb1/include/linux/lockmeter.h --- 000-virgin/include/linux/lockmeter.h Wed Dec 31 16:00:00 1969 +++ 900-mjb1/include/linux/lockmeter.h Fri May 30 19:26:45 2003 @@ -0,0 +1,320 @@ +/* + * Copyright (C) 1999-2002 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + * + * Modified by Ray Bryant (raybry@us.ibm.com) Feb-Apr 2000 + * Changes Copyright (C) 2000 IBM, Inc. + * Added save of index in spinlock_t to improve efficiency + * of "hold" time reporting for spinlocks + * Added support for hold time statistics for read and write + * locks. + * Moved machine dependent code to include/asm/lockmeter.h. + * + */ + +#ifndef _LINUX_LOCKMETER_H +#define _LINUX_LOCKMETER_H + + +/*--------------------------------------------------- + * architecture-independent lockmeter.h + *-------------------------------------------------*/ + +/* + * raybry -- version 2: added efficient hold time statistics + * requires lstat recompile, so flagged as new version + * raybry -- version 3: added global reader lock data + * hawkes -- version 4: removed some unnecessary fields to simplify mips64 port + */ +#define LSTAT_VERSION 5 + +int lstat_update(void*, void*, int); +int lstat_update_time(void*, void*, int, uint32_t); + +/* + * Currently, the mips64 and sparc64 kernels talk to a 32-bit lockstat, so we + * need to force compatibility in the inter-communication data structure. + */ + +#if defined(CONFIG_MIPS32_COMPAT) +#define TIME_T uint32_t +#elif defined(CONFIG_SPARC32_COMPAT) +#define TIME_T uint64_t +#else +#define TIME_T time_t +#endif + +#if defined(__KERNEL__) || (!defined(CONFIG_MIPS32_COMPAT) && !defined(CONFIG_SPARC32_COMPAT)) || (_MIPS_SZLONG==32) +#define POINTER void * +#else +#define POINTER int64_t +#endif + +/* + * Values for the "action" parameter passed to lstat_update. + * ZZZ - do we want a try-success status here??? + */ +#define LSTAT_ACT_NO_WAIT 0 +#define LSTAT_ACT_SPIN 1 +#define LSTAT_ACT_REJECT 2 +#define LSTAT_ACT_WW_SPIN 3 +#define LSTAT_ACT_SLEPT 4 /* UNUSED */ + +#define LSTAT_ACT_MAX_VALUES 4 /* NOTE: Increase to 5 if use ACT_SLEPT */ + +/* + * Special values for the low 2 bits of an RA passed to + * lstat_update. + */ +/* we use these values to figure out what kind of lock data */ +/* is stored in the statistics table entry at index ....... */ +#define LSTAT_RA_SPIN 0 /* spin lock data */ +#define LSTAT_RA_READ 1 /* read lock statistics */ +#define LSTAT_RA_SEMA 2 /* RESERVED */ +#define LSTAT_RA_WRITE 3 /* write lock statistics*/ + +#define LSTAT_RA(n) \ + ((void*)( ((unsigned long)__builtin_return_address(0) & ~3) | n) ) + +/* + * Constants used for lock addresses in the lstat_directory + * to indicate special values of the lock address. + */ +#define LSTAT_MULTI_LOCK_ADDRESS NULL + +/* + * Maximum size of the lockstats tables. Increase this value + * if its not big enough. (Nothing bad happens if its not + * big enough although some locks will not be monitored.) 
+ * We record overflows of this quantity in lstat_control.dir_overflows + * + * Note: The max value here must fit into the field set + * and obtained by the macro's PUT_INDEX() and GET_INDEX(). + * This value depends on how many bits are available in the + * lock word in the particular machine implementation we are on. + */ +#define LSTAT_MAX_STAT_INDEX 2000 + +/* + * Size and mask for the hash table into the directory. + */ +#define LSTAT_HASH_TABLE_SIZE 4096 /* must be 2**N */ +#define LSTAT_HASH_TABLE_MASK (LSTAT_HASH_TABLE_SIZE-1) + +#define DIRHASH(ra) ((unsigned long)(ra)>>2 & LSTAT_HASH_TABLE_MASK) + +/* + * This defines an entry in the lockstat directory. It contains + * information about a lock being monitored. + * A directory entry only contains the lock identification - + * counts on usage of the lock are kept elsewhere in a per-cpu + * data structure to minimize cache line pinging. + */ +typedef struct { + POINTER caller_ra; /* RA of code that set lock */ + POINTER lock_ptr; /* lock address */ + ushort next_stat_index; /* Used to link multiple locks that have the same hash table value */ +} lstat_directory_entry_t; + +/* + * A multi-dimensioned array used to contain counts for lock accesses. + * The array is 3-dimensional: + * - CPU number. Keep from thrashing cache lines between CPUs + * - Directory entry index. Identifies the lock + * - Action. Indicates what kind of contention occurred on an + * access to the lock. + * + * The index of an entry in the directory is the same as the 2nd index + * of the entry in the counts array. + */ +/* + * This table contains data for spin_locks, write locks, and read locks + * Not all data is used for all cases. In particular, the hold time + * information is not stored here for read locks since that is a global + * (e. g. cannot be separated out by return address) quantity. + * See the lstat_read_lock_counts_t structure for the global read lock + * hold time. + */ +typedef struct { + uint64_t cum_wait_ticks; /* sum of wait times */ + /* for write locks, sum of time a */ + /* writer is waiting for a reader */ + int64_t cum_hold_ticks; /* cumulative sum of holds */ + /* not used for read mode locks */ + /* must be signed. ............... */ + uint32_t max_wait_ticks; /* max waiting time */ + uint32_t max_hold_ticks; /* max holding time */ + uint64_t cum_wait_ww_ticks; /* sum times writer waits on writer*/ + uint32_t max_wait_ww_ticks; /* max wait time writer vs writer */ + /* prev 2 only used for write locks*/ + uint32_t acquire_time; /* time lock acquired this CPU */ + uint32_t count[LSTAT_ACT_MAX_VALUES]; +} lstat_lock_counts_t; + +typedef lstat_lock_counts_t lstat_cpu_counts_t[LSTAT_MAX_STAT_INDEX]; + +/* + * User request to: + * - turn statistic collection on/off, or to reset + */ +#define LSTAT_OFF 0 +#define LSTAT_ON 1 +#define LSTAT_RESET 2 +#define LSTAT_RELEASE 3 + +#define LSTAT_MAX_READ_LOCK_INDEX 1000 +typedef struct { + POINTER lock_ptr; /* address of lock for output stats */ + uint32_t read_lock_count; + int64_t cum_hold_ticks; /* sum of read lock hold times over */ + /* all callers. ....................*/ + uint32_t write_index; /* last write lock hash table index */ + uint32_t busy_periods; /* count of busy periods ended this */ + uint64_t start_busy; /* time this busy period started. ..*/ + uint64_t busy_ticks; /* sum of busy periods this lock. ..*/ + uint64_t max_busy; /* longest busy period for this lock*/ + uint32_t max_readers; /* maximum number of readers ...... 
*/ +#ifdef USER_MODE_TESTING + rwlock_t entry_lock; /* lock for this read lock entry... */ + /* avoid having more than one rdr at*/ + /* needed for user space testing... */ + /* not needed for kernel 'cause it */ + /* is non-preemptive. ............. */ +#endif +} lstat_read_lock_counts_t; +typedef lstat_read_lock_counts_t lstat_read_lock_cpu_counts_t[LSTAT_MAX_READ_LOCK_INDEX]; + +#if defined(__KERNEL__) || defined(USER_MODE_TESTING) + +#ifndef USER_MODE_TESTING +#include +#else +#include "asm_newlockmeter.h" +#endif + +/* + * Size and mask for the hash table into the directory. + */ +#define LSTAT_HASH_TABLE_SIZE 4096 /* must be 2**N */ +#define LSTAT_HASH_TABLE_MASK (LSTAT_HASH_TABLE_SIZE-1) + +#define DIRHASH(ra) ((unsigned long)(ra)>>2 & LSTAT_HASH_TABLE_MASK) + +/* + * This version eliminates the per processor lock stack. What we do is to + * store the index of the lock hash structure in unused bits in the lock + * itself. Then on unlock we can find the statistics record without doing + * any additional hash or lock stack lookup. This works for spin_locks. + * Hold time reporting is now basically as cheap as wait time reporting + * so we ignore the difference between LSTAT_ON_HOLD and LSTAT_ON_WAIT + * as in version 1.1.* of lockmeter. + * + * For rw_locks, we store the index of a global reader stats structure in + * the lock and the writer index is stored in the latter structure. + * For read mode locks we hash at the time of the lock to find an entry + * in the directory for reader wait time and the like. + * At unlock time for read mode locks, we update just the global structure + * so we don't need to know the reader directory index value at unlock time. + * + */ + +/* + * Protocol to change lstat_control.state + * This is complicated because we don't want the cum_hold_time for + * a rw_lock to be decremented in _read_lock_ without making sure it + * is incremented in _read_lock_ and vice versa. So here is the + * way we change the state of lstat_control.state: + * I. To Turn Statistics On + * After allocating storage, set lstat_control.state non-zero. + * This works because we don't start updating statistics for in use + * locks until the reader lock count goes to zero. + * II. To Turn Statistics Off: + * (0) Disable interrupts on this CPU + * (1) Seize the lstat_control.directory_lock + * (2) Obtain the current value of lstat_control.next_free_read_lock_index + * (3) Store a zero in lstat_control.state. + * (4) Release the lstat_control.directory_lock + * (5) For each lock in the read lock list up to the saved value + * (well, -1) of the next_free_read_lock_index, do the following: + * (a) Check validity of the stored lock address + * by making sure that the word at the saved addr + * has an index that matches this entry. If not + * valid, then skip this entry. + * (b) If there is a write lock already set on this lock, + * skip to (d) below. + * (c) Set a non-metered write lock on the lock + * (d) set the cached INDEX in the lock to zero + * (e) Release the non-metered write lock. + * (6) Re-enable interrupts + * + * These rules ensure that a read lock will not have its statistics + * partially updated even though the global lock recording state has + * changed. See put_lockmeter_info() for implementation. + * + * The reason for (b) is that there may be write locks set on the + * syscall path to put_lockmeter_info() from user space. If we do + * not do this check, then we can deadlock. A similar problem would + * occur if the lock was read locked by the current CPU. 
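As the comment above explains, this version of lockmeter stores the directory index in
unused bits of the lock word itself through PUT_INDEX() and GET_INDEX(), which come from
the architecture's asm/lockmeter.h and are not part of this hunk. A rough sketch of the
idea, using an invented 16/16 split of a 32-bit lock word since the real layout is machine
dependent:

    /*
     * Sketch only: the real PUT_INDEX()/GET_INDEX() depend on the spinlock
     * layout of the architecture.  This just shows the idea of keeping a
     * small statistics index in otherwise-unused bits of the lock word.
     */
    #include <stdio.h>
    #include <stdint.h>

    #define INDEX_SHIFT 16          /* assumed: low 16 bits hold the lock state */

    static void put_index(uint32_t *lockword, uint32_t index)
    {
            *lockword = (*lockword & 0xffffu) | (index << INDEX_SHIFT);
    }

    static uint32_t get_index(uint32_t lockword)
    {
            return lockword >> INDEX_SHIFT;
    }

    int main(void)
    {
            uint32_t lockword = 1;          /* pretend the lock is held */

            put_index(&lockword, 42);       /* remember which directory entry metered it */
            printf("index=%u, lock bits=%u\n",
                   get_index(lockword), lockword & 0xffffu);
            return 0;
    }

Because the index travels inside the lock word, the unlock path can find its statistics
entry without a per-CPU lock stack or a second hash lookup, which is why hold-time
reporting becomes about as cheap as wait-time reporting.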
At the + * moment this does not appear to happen. + */ + +/* + * Main control structure for lockstat. Used to turn statistics on/off + * and to maintain directory info. + */ +typedef struct { + int state; + spinlock_t control_lock; /* used to serialize turning statistics on/off */ + spinlock_t directory_lock; /* for serialize adding entries to directory */ + volatile int next_free_dir_index;/* next free entry in the directory */ + /* FIXME not all of these fields are used / needed .............. */ + /* the following fields represent data since */ + /* first "lstat on" or most recent "lstat reset" */ + TIME_T first_started_time; /* time when measurement first enabled */ + TIME_T started_time; /* time when measurement last started */ + TIME_T ending_time; /* time when measurement last disabled */ + uint64_t started_cycles64; /* cycles when measurement last started */ + uint64_t ending_cycles64; /* cycles when measurement last disabled */ + uint64_t enabled_cycles64; /* total cycles with measurement enabled */ + int intervals; /* number of measurement intervals recorded */ + /* i. e. number of times did lstat on;lstat off */ + lstat_directory_entry_t *dir; /* directory */ + int dir_overflow; /* count of times ran out of space in directory */ + int rwlock_overflow; /* count of times we couldn't allocate a rw block*/ + ushort *hashtab; /* hash table for quick dir scans */ + lstat_cpu_counts_t *counts[NR_CPUS]; /* Array of pointers to per-cpu stats */ + int next_free_read_lock_index; /* next rwlock reader (global) stats block */ + lstat_read_lock_cpu_counts_t *read_lock_counts[NR_CPUS]; /* per cpu read lock stats */ +} lstat_control_t; + +#endif /* defined(__KERNEL__) || defined(USER_MODE_TESTING) */ + +typedef struct { + short lstat_version; /* version of the data */ + short state; /* the current state is returned */ + int maxcpus; /* Number of cpus present */ + int next_free_dir_index; /* index of the next free directory entry */ + TIME_T first_started_time; /* when measurement enabled for first time */ + TIME_T started_time; /* time in secs since 1969 when stats last turned on */ + TIME_T ending_time; /* time in secs since 1969 when stats last turned off */ + uint32_t cycleval; /* cycles per second */ +#ifdef notyet + void *kernel_magic_addr; /* address of kernel_magic */ + void *kernel_end_addr; /* contents of kernel magic (points to "end") */ +#endif + int next_free_read_lock_index; /* index of next (global) read lock stats struct */ + uint64_t started_cycles64; /* cycles when measurement last started */ + uint64_t ending_cycles64; /* cycles when stats last turned off */ + uint64_t enabled_cycles64; /* total cycles with measurement enabled */ + int intervals; /* number of measurement intervals recorded */ + /* i.e. number of times we did lstat on;lstat off*/ + int dir_overflow; /* number of times we wanted more space in directory */ + int rwlock_overflow; /* # of times we wanted more space in read_locks_count */ + struct new_utsname uts; /* info about machine where stats are measured */ + /* -T option of lockstat allows data to be */ + /* moved to another machine. ................. 
*/ +} lstat_user_request_t; + +#endif /* _LINUX_LOCKMETER_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/major.h 900-mjb1/include/linux/major.h --- 000-virgin/include/linux/major.h Fri May 30 19:02:22 2003 +++ 900-mjb1/include/linux/major.h Fri May 30 19:24:56 2003 @@ -157,6 +157,8 @@ #define OSST_MAJOR 206 /* OnStream-SCx0 SCSI tape */ +#define CRASH_DUMP_MAJOR 221 /* crash dump interface */ + #define IBM_TTY3270_MAJOR 227 #define IBM_FS3270_MAJOR 228 diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/mm.h 900-mjb1/include/linux/mm.h --- 000-virgin/include/linux/mm.h Fri May 30 19:02:22 2003 +++ 900-mjb1/include/linux/mm.h Fri May 30 19:07:08 2003 @@ -179,6 +179,7 @@ struct page { struct pte_chain *chain;/* Reverse pte mapping pointer. * protected by PG_chainlock */ pte_addr_t direct; + int mapcount; } pte; unsigned long private; /* mapping-private opaque data */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/mmzone.h 900-mjb1/include/linux/mmzone.h --- 000-virgin/include/linux/mmzone.h Fri May 30 19:02:22 2003 +++ 900-mjb1/include/linux/mmzone.h Fri May 30 19:29:30 2003 @@ -185,6 +185,7 @@ typedef struct pglist_data { struct bootmem_data *bdata; unsigned long node_start_pfn; unsigned long node_size; + unsigned long real_node_size; int node_id; struct pglist_data *pgdat_next; wait_queue_head_t kswapd_wait; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/netdevice.h 900-mjb1/include/linux/netdevice.h --- 000-virgin/include/linux/netdevice.h Fri May 30 19:02:23 2003 +++ 900-mjb1/include/linux/netdevice.h Fri May 30 19:24:56 2003 @@ -426,6 +426,9 @@ struct net_device unsigned char *haddr); int (*neigh_setup)(struct net_device *dev, struct neigh_parms *); int (*accept_fastpath)(struct net_device *, struct dst_entry*); +#define HAVE_POLL_CONTROLLER + void (*poll_controller)(struct net_device *dev); + int (*rx_hook)(struct sk_buff *skb); /* bridge stuff */ struct net_bridge_port *br_port; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/nfsd/syscall.h 900-mjb1/include/linux/nfsd/syscall.h --- 000-virgin/include/linux/nfsd/syscall.h Sat May 10 18:35:03 2003 +++ 900-mjb1/include/linux/nfsd/syscall.h Fri May 30 19:33:26 2003 @@ -59,7 +59,7 @@ struct nfsctl_client { struct nfsctl_export { char ex_client[NFSCLNT_IDMAX+1]; char ex_path[NFS_MAXPATHLEN+1]; - __kernel_old_dev_t ex_dev; + u16 ex_dev; __kernel_ino_t ex_ino; int ex_flags; __kernel_uid_t ex_anon_uid; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/page-flags.h 900-mjb1/include/linux/page-flags.h --- 000-virgin/include/linux/page-flags.h Sun Apr 20 19:35:07 2003 +++ 900-mjb1/include/linux/page-flags.h Fri May 30 19:07:08 2003 @@ -74,6 +74,7 @@ #define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ #define PG_reclaim 18 /* To be reclaimed asap */ #define PG_compound 19 /* Part of a compound page */ +#define PG_anon 20 /* Anonymous page */ /* @@ -256,6 +257,10 @@ extern void get_full_page_state(struct p #define PageCompound(page) test_bit(PG_compound, &(page)->flags) #define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) #define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) + +#define PageAnon(page) test_bit(PG_anon, &(page)->flags) +#define SetPageAnon(page) set_bit(PG_anon, &(page)->flags) +#define ClearPageAnon(page) clear_bit(PG_anon, &(page)->flags) /* * The PageSwapCache predicate doesn't use a PG_flag at this time, diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/pagemap.h 
900-mjb1/include/linux/pagemap.h --- 000-virgin/include/linux/pagemap.h Fri May 30 19:02:23 2003 +++ 900-mjb1/include/linux/pagemap.h Fri May 30 22:04:48 2003 @@ -27,16 +27,25 @@ #define page_cache_release(page) put_page(page) void release_pages(struct page **pages, int nr, int cold); -static inline struct page *page_cache_alloc(struct address_space *x) +static inline struct page *__page_cache_alloc(struct address_space *x, int cold) { - return alloc_pages(x->gfp_mask, 0); -} + int gfp_mask; + struct zonelist *zonelist; -static inline struct page *page_cache_alloc_cold(struct address_space *x) -{ - return alloc_pages(x->gfp_mask|__GFP_COLD, 0); + gfp_mask = x->gfp_mask; + if (cold) + gfp_mask |= __GFP_COLD; + if (x->binding) + zonelist = &x->binding->zonelist; + else + zonelist = NODE_DATA(numa_node_id())->node_zonelists + (x->gfp_mask & GFP_ZONEMASK); + + return __alloc_pages(gfp_mask, 0, zonelist); } +#define page_cache_alloc(x) __page_cache_alloc((x), 0) +#define page_cache_alloc_cold(x) __page_cache_alloc((x), 1) + typedef int filler_t(void *, struct page *); extern struct page * find_get_page(struct address_space *mapping, @@ -135,6 +144,16 @@ static inline void lock_page(struct page if (TestSetPageLocked(page)) __lock_page(page); } + +extern int FASTCALL(__lock_page_wq(struct page *page, wait_queue_t *wait)); +static inline int lock_page_wq(struct page *page, wait_queue_t *wait) +{ + if (TestSetPageLocked(page)) + return __lock_page_wq(page, wait); + else + return 0; +} + /* * This is exported only for wait_on_page_locked/wait_on_page_writeback. @@ -153,6 +172,15 @@ static inline void wait_on_page_locked(s { if (PageLocked(page)) wait_on_page_bit(page, PG_locked); +} + +extern int FASTCALL(wait_on_page_bit_wq(struct page *page, int bit_nr, + wait_queue_t *wait)); +static inline int wait_on_page_locked_wq(struct page *page, wait_queue_t *wait) +{ + if (PageLocked(page)) + return wait_on_page_bit_wq(page, PG_locked, wait); + return 0; } /* diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/pci.h 900-mjb1/include/linux/pci.h --- 000-virgin/include/linux/pci.h Fri May 30 19:02:23 2003 +++ 900-mjb1/include/linux/pci.h Fri May 30 19:28:16 2003 @@ -451,10 +451,10 @@ struct pci_bus { void *sysdata; /* hook for sys-specific extension */ struct proc_dir_entry *procdir; /* directory entry in /proc/bus/pci */ - unsigned char number; /* bus number */ - unsigned char primary; /* number of primary bridge */ - unsigned char secondary; /* number of secondary bridge */ - unsigned char subordinate; /* max number of subordinate buses */ + unsigned int number; /* bus number */ + unsigned int primary; /* number of primary bridge */ + unsigned int secondary; /* number of secondary bridge */ + unsigned int subordinate; /* max number of subordinate buses */ char name[48]; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/sched.h 900-mjb1/include/linux/sched.h --- 000-virgin/include/linux/sched.h Fri May 30 19:02:23 2003 +++ 900-mjb1/include/linux/sched.h Fri May 30 22:02:34 2003 @@ -69,7 +69,11 @@ struct exec_domain; * the EXP_n values would be 1981, 2034 and 2043 if still using only * 11 bit fractions. 
*/ -extern unsigned long avenrun[]; /* Load averages */ +extern unsigned long avenrun[]; /* Load averages */ +extern unsigned long tasks_running[3]; /* Real load averages */ +extern unsigned long cpu_tasks_running[3][NR_CPUS]; /* Real load averages per cpu */ + +extern unsigned long tasks_running[]; /* Real load averages */ #define FSHIFT 11 /* nr of bits of precision */ #define FIXED_1 (1< #include #include @@ -166,7 +174,13 @@ extern unsigned long cache_decay_ticks; #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern signed long FASTCALL(schedule_timeout(signed long timeout)); -asmlinkage void schedule(void); +#ifdef CONFIG_KGDB_THREAD + asmlinkage void do_schedule(void); + asmlinkage void kern_schedule(void); + asmlinkage void kern_do_schedule(struct pt_regs); +#else + asmlinkage void schedule(void); +#endif struct namespace; @@ -320,6 +334,13 @@ struct k_itimer { }; +struct sched_info { + /* running averages */ + unsigned long response_time, inter_arrival_time, service_time; + + /* timestamps */ + unsigned long last_arrival, began_service; +}; struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ @@ -341,6 +362,8 @@ struct task_struct { unsigned long cpus_allowed; unsigned int time_slice, first_time_slice; + struct sched_info sched_info; + struct list_head tasks; struct list_head ptrace_children; struct list_head ptrace_list; @@ -451,6 +474,8 @@ struct task_struct { unsigned long ptrace_message; siginfo_t *last_siginfo; /* For ptrace use. */ +/* current io wait handle */ + wait_queue_t *io_wait; }; extern void __put_task_struct(struct task_struct *tsk); @@ -486,7 +511,7 @@ extern void set_cpus_allowed(task_t *p, # define set_cpus_allowed(p, new_mask) do { } while (0) #endif -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED extern void sched_balance_exec(void); extern void node_nr_running_init(void); #else @@ -700,6 +725,12 @@ static inline int thread_group_empty(tas (thread_group_leader(p) && !thread_group_empty(p)) extern void unhash_process(struct task_struct *p); + +#ifdef CONFIG_KGDB_THREAD +#define schedule() kern_schedule() +#else +#define user_schedule() schedule() +#endif /* Protects ->fs, ->files, ->mm, and synchronises with wait4(). * Nests both inside and outside of read_lock(&tasklist_lock). diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/spinlock.h 900-mjb1/include/linux/spinlock.h --- 000-virgin/include/linux/spinlock.h Fri May 30 19:02:23 2003 +++ 900-mjb1/include/linux/spinlock.h Fri May 30 22:02:11 2003 @@ -183,6 +183,17 @@ typedef struct { #endif /* !SMP */ +#ifdef CONFIG_LOCKMETER +extern void _metered_spin_lock (spinlock_t *lock); +extern void _metered_spin_unlock (spinlock_t *lock); +extern int _metered_spin_trylock(spinlock_t *lock); +extern void _metered_read_lock (rwlock_t *lock); +extern void _metered_read_unlock (rwlock_t *lock); +extern void _metered_write_lock (rwlock_t *lock); +extern void _metered_write_unlock (rwlock_t *lock); +extern int _metered_write_trylock(rwlock_t *lock); +#endif + /* * Define the various spin_lock and rw_lock methods. Note we define these * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various @@ -387,6 +398,141 @@ do { \ #define spin_trylock_bh(lock) ({ local_bh_disable(); preempt_disable(); \ _raw_spin_trylock(lock) ? 
1 : \ ({preempt_enable(); local_bh_enable(); 0;});}) + +#ifdef CONFIG_LOCKMETER +#undef spin_lock +#undef spin_trylock +#undef spin_unlock +#undef spin_lock_irqsave +#undef spin_lock_irq +#undef spin_lock_bh +#undef read_lock +#undef read_unlock +#undef write_lock +#undef write_unlock +#undef write_trylock +#undef spin_unlock_bh +#undef read_lock_irqsave +#undef read_lock_irq +#undef read_lock_bh +#undef read_unlock_bh +#undef write_lock_irqsave +#undef write_lock_irq +#undef write_lock_bh +#undef write_unlock_bh + +#define spin_lock(lock) \ +do { \ + preempt_disable(); \ + _metered_spin_lock(lock); \ +} while(0) + +#define spin_trylock(lock) ({preempt_disable(); _metered_spin_trylock(lock) ? \ + 1 : ({preempt_enable(); 0;});}) +#define spin_unlock(lock) \ +do { \ + _metered_spin_unlock(lock); \ + preempt_enable(); \ +} while (0) + +#define spin_lock_irqsave(lock, flags) \ +do { \ + local_irq_save(flags); \ + preempt_disable(); \ + _metered_spin_lock(lock); \ +} while (0) + +#define spin_lock_irq(lock) \ +do { \ + local_irq_disable(); \ + preempt_disable(); \ + _metered_spin_lock(lock); \ +} while (0) + +#define spin_lock_bh(lock) \ +do { \ + local_bh_disable(); \ + preempt_disable(); \ + _metered_spin_lock(lock); \ +} while (0) + +#define spin_unlock_bh(lock) \ +do { \ + _metered_spin_unlock(lock); \ + preempt_enable(); \ + local_bh_enable(); \ +} while (0) + + +#define read_lock(lock) ({preempt_disable(); _metered_read_lock(lock);}) +#define read_unlock(lock) ({_metered_read_unlock(lock); preempt_enable();}) +#define write_lock(lock) ({preempt_disable(); _metered_write_lock(lock);}) +#define write_unlock(lock) ({_metered_write_unlock(lock); preempt_enable();}) +#define write_trylock(lock) ({preempt_disable();_metered_write_trylock(lock) ? \ + 1 : ({preempt_enable(); 0;});}) +#define spin_unlock_no_resched(lock) \ +do { \ + _metered_spin_unlock(lock); \ + preempt_enable_no_resched(); \ +} while (0) + +#define read_lock_irqsave(lock, flags) \ +do { \ + local_irq_save(flags); \ + preempt_disable(); \ + _metered_read_lock(lock); \ +} while (0) + +#define read_lock_irq(lock) \ +do { \ + local_irq_disable(); \ + preempt_disable(); \ + _metered_read_lock(lock); \ +} while (0) + +#define read_lock_bh(lock) \ +do { \ + local_bh_disable(); \ + preempt_disable(); \ + _metered_read_lock(lock); \ +} while (0) + +#define read_unlock_bh(lock) \ +do { \ + _metered_read_unlock(lock); \ + preempt_enable(); \ + local_bh_enable(); \ +} while (0) + +#define write_lock_irqsave(lock, flags) \ +do { \ + local_irq_save(flags); \ + preempt_disable(); \ + _metered_write_lock(lock); \ +} while (0) + +#define write_lock_irq(lock) \ +do { \ + local_irq_disable(); \ + preempt_disable(); \ + _metered_write_lock(lock); \ +} while (0) + +#define write_lock_bh(lock) \ +do { \ + local_bh_disable(); \ + preempt_disable(); \ + _metered_write_lock(lock); \ +} while (0) + +#define write_unlock_bh(lock) \ +do { \ + _metered_write_unlock(lock); \ + preempt_enable(); \ + local_bh_enable(); \ +} while (0) + +#endif /* !CONFIG_LOCKMETER */ /* "lock on reference count zero" */ #ifndef ATOMIC_DEC_AND_LOCK diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/swap.h 900-mjb1/include/linux/swap.h --- 000-virgin/include/linux/swap.h Fri May 30 19:02:23 2003 +++ 900-mjb1/include/linux/swap.h Fri May 30 19:07:08 2003 @@ -186,6 +186,8 @@ struct pte_chain *FASTCALL(page_add_rmap void FASTCALL(page_remove_rmap(struct page *, pte_t *)); int FASTCALL(try_to_unmap(struct page *)); +int page_convert_anon(struct page *); + /* 
linux/mm/shmem.c */ extern int shmem_unuse(swp_entry_t entry, struct page *page); #else diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/sysctl.h 900-mjb1/include/linux/sysctl.h --- 000-virgin/include/linux/sysctl.h Fri May 30 19:02:23 2003 +++ 900-mjb1/include/linux/sysctl.h Fri May 30 19:24:56 2003 @@ -66,7 +66,8 @@ enum CTL_DEV=7, /* Devices */ CTL_BUS=8, /* Busses */ CTL_ABI=9, /* Binary emulation */ - CTL_CPU=10 /* CPU stuff (speed scaling, etc) */ + CTL_CPU=10, /* CPU stuff (speed scaling, etc) */ + CTL_SCHED=11, /* scheduler tunables */ }; /* CTL_BUS names: */ @@ -130,6 +131,7 @@ enum KERN_PIDMAX=55, /* int: PID # limit */ KERN_CORE_PATTERN=56, /* string: pattern for core-file names */ KERN_PANIC_ON_OOPS=57, /* int: whether we will panic on an oops */ + KERN_DUMP=60, /* directory: dump parameters */ }; @@ -158,6 +160,21 @@ enum VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */ }; +/* Tunable scheduler parameters in /proc/sys/sched/ */ +enum { + SCHED_MIN_TIMESLICE=1, /* minimum process timeslice */ + SCHED_MAX_TIMESLICE=2, /* maximum process timeslice */ + SCHED_CHILD_PENALTY=3, /* penalty on fork to child */ + SCHED_PARENT_PENALTY=4, /* penalty on fork to parent */ + SCHED_EXIT_WEIGHT=5, /* penalty to parent of CPU hog child */ + SCHED_PRIO_BONUS_RATIO=6, /* percent of max prio given as bonus */ + SCHED_INTERACTIVE_DELTA=7, /* delta used to scale interactivity */ + SCHED_MAX_SLEEP_AVG=8, /* maximum sleep avg attainable */ + SCHED_STARVATION_LIMIT=9, /* no re-active if expired is starved */ + SCHED_NODE_THRESHOLD=10, /* NUMA node rebalance threshold */ + SCHED_IDLE_NODE_REBALANCE_RATIO=11, /* how often to global balance */ + SCHED_BUSY_NODE_REBALANCE_RATIO=12, /* how often to global balance */ +}; /* CTL_NET names: */ enum diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/timex.h 900-mjb1/include/linux/timex.h --- 000-virgin/include/linux/timex.h Sat May 10 18:35:04 2003 +++ 900-mjb1/include/linux/timex.h Fri May 30 19:03:25 2003 @@ -75,7 +75,7 @@ #elif HZ >= 768 && HZ < 1536 # define SHIFT_HZ 10 #else -# error You lose. 
+# error Please use a HZ value which is between 12 and 1536 #endif /* diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/wait.h 900-mjb1/include/linux/wait.h --- 000-virgin/include/linux/wait.h Fri May 30 19:02:23 2003 +++ 900-mjb1/include/linux/wait.h Fri May 30 22:02:34 2003 @@ -80,6 +80,8 @@ static inline int waitqueue_active(wait_ return !list_empty(&q->task_list); } +#define is_sync_wait(wait) (!(wait) || ((wait)->task)) + extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/writeback.h 900-mjb1/include/linux/writeback.h --- 000-virgin/include/linux/writeback.h Mon Mar 17 21:43:50 2003 +++ 900-mjb1/include/linux/writeback.h Fri May 30 22:04:50 2003 @@ -80,8 +80,8 @@ extern int dirty_expire_centisecs; void page_writeback_init(void); -void balance_dirty_pages(struct address_space *mapping); -void balance_dirty_pages_ratelimited(struct address_space *mapping); +int balance_dirty_pages(struct address_space *mapping); +int balance_dirty_pages_ratelimited(struct address_space *mapping); int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0); int do_writepages(struct address_space *mapping, struct writeback_control *wbc); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/init/Makefile 900-mjb1/init/Makefile --- 000-virgin/init/Makefile Sat May 10 18:35:04 2003 +++ 900-mjb1/init/Makefile Fri May 30 19:24:57 2003 @@ -9,6 +9,9 @@ mounts-$(CONFIG_BLK_DEV_RAM) += do_mount mounts-$(CONFIG_BLK_DEV_INITRD) += do_mounts_initrd.o mounts-$(CONFIG_BLK_DEV_MD) += do_mounts_md.o +extra-$(CONFIG_CRASH_DUMP) += kerntypes.o +CFLAGS_kerntypes.o := -gstabs + # files to be removed upon make clean clean-files := ../include/linux/compile.h @@ -24,3 +27,4 @@ $(obj)/version.o: include/linux/compile. include/linux/compile.h: FORCE @echo ' CHK $@' @sh $(srctree)/scripts/mkcompile_h $@ "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CC) $(CFLAGS)" + diff -urpN -X /home/fletch/.diff.exclude 000-virgin/init/kerntypes.c 900-mjb1/init/kerntypes.c --- 000-virgin/init/kerntypes.c Wed Dec 31 16:00:00 1969 +++ 900-mjb1/init/kerntypes.c Fri May 30 19:24:56 2003 @@ -0,0 +1,24 @@ +/* + * kerntypes.c + * + * Copyright (C) 2000 Tom Morano (tjm@sgi.com) and + * Matt D. Robinson (yakker@alacritech.com) + * + * Dummy module that includes headers for all kernel types of interest. + * The kernel type information is used by the lcrash utility when + * analyzing system crash dumps or the live system. Using the type + * information for the running system, rather than kernel header files, + * makes for a more flexible and robust analysis tool. + * + * This source code is released under version 2 of the GNU GPL. + */ +#include +#include +#include +#include +#include + +void +kerntypes_dummy(void) +{ +} diff -urpN -X /home/fletch/.diff.exclude 000-virgin/init/main.c 900-mjb1/init/main.c --- 000-virgin/init/main.c Fri May 30 19:02:23 2003 +++ 900-mjb1/init/main.c Fri May 30 19:28:12 2003 @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -55,6 +56,10 @@ #include #endif +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif + /* * Versions of gcc older than that listed below may actually compile * and link okay, but the end product can have subtle run time bugs. 
@@ -100,6 +105,16 @@ extern void ipc_init(void); int system_running = 0; /* + * The kernel_magic value represents the address of _end, which allows + * namelist tools to "match" each other respectively. That way a tool + * that looks at /dev/mem can verify that it is using the right System.map + * file -- if kernel_magic doesn't equal the namelist value of _end, + * something's wrong. + */ +extern unsigned long _end; +unsigned long *kernel_magic = &_end; + +/* * Boot command-line arguments */ #define MAX_INIT_ARGS 8 @@ -387,6 +402,8 @@ asmlinkage void __init start_kernel(void */ lock_kernel(); printk(linux_banner); + setup_early_printk(); + setup_arch(&command_line); setup_per_cpu_areas(); @@ -458,6 +475,12 @@ asmlinkage void __init start_kernel(void * make syscalls (and thus be locked). */ init_idle(current, smp_processor_id()); + +#ifdef CONFIG_X86_REMOTE_DEBUG + if (gdb_enter) { + gdb_hook(); /* right at boot time */ + } +#endif /* Do the rest non-__init'ed, we're now alive */ rest_init(); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/Makefile 900-mjb1/kernel/Makefile --- 000-virgin/kernel/Makefile Fri May 30 19:02:24 2003 +++ 900-mjb1/kernel/Makefile Fri May 30 19:26:45 2003 @@ -11,6 +11,7 @@ obj-y = sched.o fork.o exec_domain.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o +obj-$(CONFIG_LOCKMETER) += lockmeter.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += ksyms.o module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o @@ -19,6 +20,8 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o obj-$(CONFIG_COMPAT) += compat.o +obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +obj-$(CONFIG_KPROBES) += kprobes.o ifneq ($(CONFIG_IA64),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/early_printk.c 900-mjb1/kernel/early_printk.c --- 000-virgin/kernel/early_printk.c Wed Dec 31 16:00:00 1969 +++ 900-mjb1/kernel/early_printk.c Fri May 30 19:28:12 2003 @@ -0,0 +1,218 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +/* Simple VGA output */ + +#define MAX_YPOS 25 +#define MAX_XPOS 80 + +static int current_ypos = 1, current_xpos = 0; + +static void early_vga_write(struct console *con, const char *str, unsigned n) +{ + char c; + int i, k, j; + + while ((c = *str++) != '\0' && n-- > 0) { + if (current_ypos >= MAX_YPOS) { + /* scroll 1 line up */ + for(k = 1, j = 0; k < MAX_YPOS; k++, j++) { + for(i = 0; i < MAX_XPOS; i++) { + writew(readw(VGABASE + 2*(MAX_XPOS*k + i)), + VGABASE + 2*(MAX_XPOS*j + i)); + } + } + for(i = 0; i < MAX_XPOS; i++) { + writew(0x720, VGABASE + 2*(MAX_XPOS*j + i)); + } + current_ypos = MAX_YPOS-1; + } + if (c == '\n') { + current_xpos = 0; + current_ypos++; + } else if (c != '\r') { + writew(((0x7 << 8) | (unsigned short) c), + VGABASE + 2*(MAX_XPOS*current_ypos + current_xpos++)); + if (current_xpos >= MAX_XPOS) { + current_xpos = 0; + current_ypos++; + } + } + } +} + +static struct console early_vga_console = { + .name = "earlyvga", + .write = early_vga_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Serial functions losely based on a similar package from Klaus P. Gerlicher */ + +int early_serial_base; /* ttyS0 */ + +static int early_serial_putc(unsigned char ch) +{ + unsigned timeout = 0xffff; + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + rep_nop(); + outb(ch, early_serial_base + TXR); + return timeout ? 
0 : -1; +} + +static void early_serial_write(struct console *con, const char *s, unsigned n) +{ + while (*s && n-- > 0) { + early_serial_putc(*s); + if (*s == '\n') + early_serial_putc('\r'); + s++; + } +} + +static __init void early_serial_init(char *opt) +{ + unsigned char c; + unsigned divisor, baud = DEFAULT_BAUD; + static int bases[] = SERIAL_BASES; + char *s, *e; + + early_serial_base = bases[0]; + + if (*opt == ',') + ++opt; + + s = strsep(&opt, ","); + if (s != NULL) { + unsigned port; + if (!strncmp(s,"0x",2)) + early_serial_base = simple_strtoul(s, &e, 16); + else { + if (!strncmp(s,"ttyS",4)) + s+=4; + port = simple_strtoul(s, &e, 10); + if (port > (SERIAL_BASES_LEN-1) || s == e) + port = 0; + early_serial_base = bases[port]; + } + } + + outb(0x3, early_serial_base + LCR); /* 8n1 */ + outb(0, early_serial_base + IER); /* no interrupt */ + outb(0, early_serial_base + FCR); /* no fifo */ + outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + + s = strsep(&opt, ","); + if (s != NULL) { + baud = simple_strtoul(s, &e, 0); + if (baud == 0 || s == e) + baud = DEFAULT_BAUD; + } + + divisor = 115200 / baud; + c = inb(early_serial_base + LCR); + outb(c | DLAB, early_serial_base + LCR); + outb(divisor & 0xff, early_serial_base + DLL); + outb((divisor >> 8) & 0xff, early_serial_base + DLH); + outb(c & ~DLAB, early_serial_base + LCR); +} + +static struct console early_serial_console = { + .name = "earlyser", + .write = early_serial_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Direct interface for emergencies */ +struct console *early_console = &early_vga_console; +static int early_console_initialized = 0; + +void early_printk(const char *fmt, ...) +{ + char buf[512]; + int n; + va_list ap; + va_start(ap,fmt); + n = vsnprintf(buf,512,fmt,ap); + early_console->write(early_console,buf,n); + va_end(ap); +} + +static int keep_early; + +int __init setup_early_printk(void) +{ + char *space, *s; + char buf[256]; + char cmd[COMMAND_LINE_SIZE]; + char *opt; + + /* Get our own copy of the cmd line */ + memcpy(cmd, COMMAND_LINE, COMMAND_LINE_SIZE); + cmd[COMMAND_LINE_SIZE-1] = '\0'; + opt = cmd; + + s = strstr(opt, "earlyprintk="); + if (s == NULL) + return -1; + opt = s+12; + + if (early_console_initialized) + return -1; + + strncpy(buf,opt,256); + buf[255] = 0; + space = strchr(buf, ' '); + if (space) + *space = 0; + + if (strstr(buf,"keep")) + keep_early = 1; + + if (!strncmp(buf, "serial", 6)) { + early_serial_init(buf + 6); + early_console = &early_serial_console; + } else if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf); + early_console = &early_serial_console; + } else if (!strncmp(buf, "vga", 3)) { + early_console = &early_vga_console; + } else { + early_console = NULL; + return -1; + } + early_console_initialized = 1; + register_console(early_console); + printk("early printk console registered\n"); + return 0; +} + +void __init disable_early_printk(void) +{ + if (!early_console_initialized || !early_console) + return; + if (!keep_early) { + printk("disabling early console...\n"); + unregister_console(early_console); + early_console_initialized = 0; + } else { + printk("keeping early console.\n"); + } +} + +/* syntax: earlyprintk=vga + earlyprintk=serial[,ttySn[,baudrate]] + Append ,keep to not disable it when the real console takes over. + Only vga or serial at a time, not both. + Currently only ttyS0 and ttyS1 are supported. + Interaction with the standard serial driver is not very good. + The VGA output is eventually overwritten by the real console. 
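Putting the syntax described above together, two illustrative boot command line fragments
(hypothetical values, not taken from the patch):

    earlyprintk=vga
    earlyprintk=serial,ttyS0,115200,keep

The second selects early output on ttyS0 at 115200 baud, and the trailing ,keep leaves the
early console registered after the real console driver takes over.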
*/ +__setup("earlyprintk=", setup_early_printk); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/fork.c 900-mjb1/kernel/fork.c --- 000-virgin/kernel/fork.c Fri May 30 19:02:24 2003 +++ 900-mjb1/kernel/fork.c Fri May 30 22:02:34 2003 @@ -141,8 +141,9 @@ void remove_wait_queue(wait_queue_head_t void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) { unsigned long flags; - - __set_current_state(state); + + if (is_sync_wait(wait)) + __set_current_state(state); wait->flags &= ~WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&q->lock, flags); if (list_empty(&wait->task_list)) @@ -155,7 +156,8 @@ prepare_to_wait_exclusive(wait_queue_hea { unsigned long flags; - __set_current_state(state); + if (is_sync_wait(wait)) + __set_current_state(state); wait->flags |= WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&q->lock, flags); if (list_empty(&wait->task_list)) @@ -199,7 +201,10 @@ void __init fork_init(unsigned long memp * value: the thread structures can take up at most half * of memory. */ - max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8; + if (THREAD_SIZE >= PAGE_SIZE) + max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8; + else + max_threads = (mempages * (PAGE_SIZE/THREAD_SIZE)) / 8; /* * we need to allow at least 20 threads to boot a system */ @@ -857,6 +862,7 @@ struct task_struct *copy_process(unsigne p->lock_depth = -1; /* -1 = no lock */ p->start_time = get_jiffies_64(); p->security = NULL; + p->io_wait = NULL; retval = -ENOMEM; if ((retval = security_task_alloc(p))) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/kprobes.c 900-mjb1/kernel/kprobes.c --- 000-virgin/kernel/kprobes.c Wed Dec 31 16:00:00 1969 +++ 900-mjb1/kernel/kprobes.c Fri May 30 19:15:14 2003 @@ -0,0 +1,89 @@ +/* Support for kernel probes. + (C) 2002 Vamsi Krishna S . 
+*/ +#include +#include +#include +#include +#include +#include +#include + +#define KPROBE_HASH_BITS 6 +#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) + +static struct list_head kprobe_table[KPROBE_TABLE_SIZE]; + +unsigned int kprobe_cpu = NR_CPUS; +static spinlock_t kprobe_lock = SPIN_LOCK_UNLOCKED; + +/* Locks kprobe: irqs must be disabled */ +void lock_kprobes(void) +{ + spin_lock(&kprobe_lock); + kprobe_cpu = smp_processor_id(); +} + +void unlock_kprobes(void) +{ + kprobe_cpu = NR_CPUS; + spin_unlock(&kprobe_lock); +} + +/* You have to be holding the kprobe_lock */ +struct kprobe *get_kprobe(void *addr) +{ + struct list_head *head, *tmp; + + head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; + list_for_each(tmp, head) { + struct kprobe *p = list_entry(tmp, struct kprobe, list); + if (p->addr == addr) + return p; + } + return NULL; +} + +int register_kprobe(struct kprobe *p) +{ + int ret = 0; + + spin_lock_irq(&kprobe_lock); + if (get_kprobe(p->addr)) { + ret = -EEXIST; + goto out; + } + list_add(&p->list, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); + + p->opcode = *p->addr; + *p->addr = BREAKPOINT_INSTRUCTION; + flush_icache_range(p->addr, p->addr + sizeof(kprobe_opcode_t)); + out: + spin_unlock_irq(&kprobe_lock); + return ret; +} + +void unregister_kprobe(struct kprobe *p) +{ + spin_lock_irq(&kprobe_lock); + *p->addr = p->opcode; + list_del(&p->list); + flush_icache_range(p->addr, p->addr + sizeof(kprobe_opcode_t)); + spin_unlock_irq(&kprobe_lock); +} + +static int __init init_kprobes(void) +{ + int i; + + /* FIXME allocate the probe table, currently defined statically */ + /* initialize all list heads */ + for (i = 0; i < KPROBE_TABLE_SIZE; i++) + INIT_LIST_HEAD(&kprobe_table[i]); + + return 0; +} +__initcall(init_kprobes); + +EXPORT_SYMBOL_GPL(register_kprobe); +EXPORT_SYMBOL_GPL(unregister_kprobe); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/ksyms.c 900-mjb1/kernel/ksyms.c --- 000-virgin/kernel/ksyms.c Fri May 30 19:02:24 2003 +++ 900-mjb1/kernel/ksyms.c Fri May 30 19:28:07 2003 @@ -58,6 +58,8 @@ #include #include #include +#include +#include #include #if defined(CONFIG_PROC_FS) @@ -456,7 +458,12 @@ EXPORT_SYMBOL(sleep_on); EXPORT_SYMBOL(sleep_on_timeout); EXPORT_SYMBOL(interruptible_sleep_on); EXPORT_SYMBOL(interruptible_sleep_on_timeout); +#ifdef CONFIG_KGDB_THREAD +EXPORT_SYMBOL(kern_schedule); +EXPORT_SYMBOL(do_schedule); +#else EXPORT_SYMBOL(schedule); +#endif #ifdef CONFIG_PREEMPT EXPORT_SYMBOL(preempt_schedule); #endif @@ -601,8 +608,23 @@ EXPORT_SYMBOL(next_thread); EXPORT_SYMBOL(__per_cpu_offset); #endif +#if defined(CONFIG_LOCKMETER) +EXPORT_SYMBOL(_metered_spin_lock); +EXPORT_SYMBOL(_metered_spin_unlock); +EXPORT_SYMBOL(_metered_spin_trylock); +EXPORT_SYMBOL(_metered_read_lock); +EXPORT_SYMBOL(_metered_read_unlock); +EXPORT_SYMBOL(_metered_write_lock); +EXPORT_SYMBOL(_metered_write_unlock); +#endif + /* debug */ EXPORT_SYMBOL(dump_stack); EXPORT_SYMBOL(ptrace_notify); EXPORT_SYMBOL(current_kernel_time); + +#ifdef CONFIG_CRASH_DUMP_MODULE +EXPORT_SYMBOL(min_low_pfn); +EXPORT_SYMBOL(dump_oncpu); +#endif diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/lockmeter.c 900-mjb1/kernel/lockmeter.c --- 000-virgin/kernel/lockmeter.c Wed Dec 31 16:00:00 1969 +++ 900-mjb1/kernel/lockmeter.c Fri May 30 19:26:45 2003 @@ -0,0 +1,1088 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. 
+ * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.c by Jack Steiner (steiner@sgi.com) + * + * Modified by Ray Bryant (raybry@us.ibm.com) + * Changes Copyright (C) 2000 IBM, Inc. + * Added save of index in spinlock_t to improve efficiency + * of "hold" time reporting for spinlocks + * Added support for hold time statistics for read and write + * locks. + */ + +#ifdef __KERNEL__ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#else +#define __SMP__ +#include +#include +#include +#include "bitops.h" +#include "user_scaffold.h" +#include +#include +#include "newlockmeter.h" +#endif + +#ifdef __KERNEL__ +#define ASSERT(cond) +#define bzero(loc,size) memset(loc,0,size) +#endif + +/*<---------------------------------------------------*/ +/* lockmeter.c */ +/*>---------------------------------------------------*/ + +#ifdef __KERNEL__ +static lstat_control_t lstat_control __cacheline_aligned = {LSTAT_OFF, SPIN_LOCK_UNLOCKED, SPIN_LOCK_UNLOCKED, 19*0, NR_CPUS*0, 0, NR_CPUS*0}; +#else +lstat_control_t lstat_control = {LSTAT_OFF, SPIN_LOCK_UNLOCKED, SPIN_LOCK_UNLOCKED, 19*0, NR_CPUS*0, 0, NR_CPUS*0}; +#endif + +int smp_num_cpus=NR_CPUS; + +#undef BUG +#define BUG() + +static ushort lstat_make_dir_entry(void *, void *); + +/* + * lstat_lookup + * + * Given a RA, locate the directory entry for the lock. + */ +static ushort +lstat_lookup( + void *lock_ptr, + void *caller_ra) +{ + ushort index; + lstat_directory_entry_t *dirp; + + dirp = lstat_control.dir; + + index = lstat_control.hashtab[DIRHASH(caller_ra)]; + while (dirp[index].caller_ra != caller_ra) { + if (index == 0) { + return(lstat_make_dir_entry(lock_ptr, caller_ra)); + } + index = dirp[index].next_stat_index; + } + + if (dirp[index].lock_ptr != NULL && + dirp[index].lock_ptr != lock_ptr) { + dirp[index].lock_ptr = NULL; + } + + return(index); +} + + +/* + * lstat_make_dir_entry + * Called to add a new lock to the lock directory. 
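lstat_lookup() above and lstat_make_dir_entry() below implement a small open hash:
hashtab[] maps DIRHASH(caller_ra) to the index of the first directory entry in that
bucket, entries sharing a bucket are chained through next_stat_index, and index 0 doubles
as "no entry", which is why the lookup falls through to creating an entry when it walks
off the end of a chain. A self-contained sketch of the same idiom, with names and sizes
invented for the example:

    /* Sketch of a hash table of indices into a directory array, 0 = empty. */
    #include <stdio.h>

    #define DIR_SIZE   8
    #define HASH_SIZE  4                   /* must be a power of two */
    #define HASH(ra)   (((unsigned long)(ra) >> 2) & (HASH_SIZE - 1))

    struct dir_entry {
            void *caller_ra;
            unsigned short next;           /* next entry with the same hash, 0 = end */
    };

    static struct dir_entry dir[DIR_SIZE]; /* entry 0 is never handed out */
    static unsigned short hashtab[HASH_SIZE];
    static unsigned short next_free = 1;

    static unsigned short lookup_or_add(void *ra)
    {
            unsigned long h = HASH(ra);
            unsigned short i;

            for (i = hashtab[h]; i != 0; i = dir[i].next)
                    if (dir[i].caller_ra == ra)
                            return i;

            if (next_free >= DIR_SIZE)
                    return 0;              /* directory full: lock not monitored */

            i = next_free++;
            dir[i].caller_ra = ra;
            dir[i].next = hashtab[h];      /* push onto the bucket's chain */
            hashtab[h] = i;
            return i;
    }

    int main(void)
    {
            printf("%u %u %u\n",
                   (unsigned)lookup_or_add((void *)0x100),
                   (unsigned)lookup_or_add((void *)0x200),
                   (unsigned)lookup_or_add((void *)0x100));
            return 0;
    }

Keeping small indices rather than pointers is what allows the index itself to be cached in
the lock word, as described earlier.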
+ */ +static ushort +lstat_make_dir_entry( + void *lock_ptr, + void *caller_ra) +{ + lstat_directory_entry_t *dirp; + ushort index, hindex; + unsigned long flags; + + /* lock the table without recursively reentering this metering code */ + do { local_irq_save(flags); + _raw_spin_lock(&lstat_control.directory_lock); } while(0); + + hindex = DIRHASH(caller_ra); + index = lstat_control.hashtab[hindex]; + dirp = lstat_control.dir; + while (index && dirp[index].caller_ra != caller_ra) + index = dirp[index].next_stat_index; + + if (index == 0) { + if(lstat_control.next_free_dir_index < LSTAT_MAX_STAT_INDEX) { + index = lstat_control.next_free_dir_index++; + lstat_control.dir[index].caller_ra = caller_ra; + lstat_control.dir[index].lock_ptr = lock_ptr; + lstat_control.dir[index].next_stat_index = lstat_control.hashtab[hindex]; + lstat_control.hashtab[hindex] = index; + } else { + lstat_control.dir_overflow++; + } + } + + do { _raw_spin_unlock(&lstat_control.directory_lock); + local_irq_restore(flags);} while(0); + return(index); +} + +int +lstat_update ( + void *lock_ptr, + void *caller_ra, + int action) +{ + int index; + int cpu; + + ASSERT(action < LSTAT_ACT_MAX_VALUES); + + if (lstat_control.state == LSTAT_OFF) { + return(0); + } + + index = lstat_lookup(lock_ptr, caller_ra); + cpu = THIS_CPU_NUMBER; + (*lstat_control.counts[cpu])[index].count[action]++; + (*lstat_control.counts[cpu])[index].acquire_time = get_cycles(); + + return(index); +} + +int +lstat_update_time ( + void *lock_ptr, + void *caller_ra, + int action, + uint32_t ticks) +{ + ushort index; + int cpu; + + ASSERT(action < LSTAT_ACT_MAX_VALUES); + + if (lstat_control.state == LSTAT_OFF) { + return(0); + } + + index = lstat_lookup(lock_ptr, caller_ra); + cpu = THIS_CPU_NUMBER; + (*lstat_control.counts[cpu])[index].count[action]++; + (*lstat_control.counts[cpu])[index].cum_wait_ticks += (uint64_t)ticks; + if ((*lstat_control.counts[cpu])[index].max_wait_ticks < ticks) + (*lstat_control.counts[cpu])[index].max_wait_ticks = ticks; + + (*lstat_control.counts[cpu])[index].acquire_time = get_cycles(); + + return(index); +} + +void _metered_spin_lock(spinlock_t *lock_ptr) +{ + if (lstat_control.state == LSTAT_OFF) { + _raw_spin_lock(lock_ptr); /* do the real lock */ + PUT_INDEX(lock_ptr,0); /* clean index in case lockmetering */ + /* gets turned on before unlock */ + } else { + void *this_pc = LSTAT_RA(LSTAT_RA_SPIN); + int index; + + if (_raw_spin_trylock(lock_ptr)) { + index = lstat_update(lock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + } else { + uint32_t start_cycles = get_cycles(); + _raw_spin_lock(lock_ptr); /* do the real lock */ + index = lstat_update_time(lock_ptr, this_pc, LSTAT_ACT_SPIN, + get_cycles() - start_cycles); + } + /* save the index in the lock itself for use in spin unlock */ + PUT_INDEX(lock_ptr,index); + } +} + +int _metered_spin_trylock(spinlock_t *lock_ptr) +{ + if (lstat_control.state == LSTAT_OFF) { + return _raw_spin_trylock(lock_ptr); + } else { + int retval; + void *this_pc = LSTAT_RA(LSTAT_RA_SPIN); + + if ((retval = _raw_spin_trylock(lock_ptr))) { + int index = lstat_update(lock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + /* save the index in the lock itself for use in spin unlock */ + PUT_INDEX(lock_ptr,index); + } else { + lstat_update(lock_ptr, this_pc, LSTAT_ACT_REJECT); + } + + return retval; + } +} + +void _metered_spin_unlock(spinlock_t *lock_ptr) +{ + int index=-1; + + if (lstat_control.state != LSTAT_OFF) { + index = GET_INDEX(lock_ptr); + /* + * If statistics were turned off when we set the lock, + * then the 
index can be zero. If that is the case, + * then collect no stats on this call. + */ + if (index > 0) { + uint32_t hold_time; + int cpu = THIS_CPU_NUMBER; + hold_time = get_cycles() - (*lstat_control.counts[cpu])[index].acquire_time; + (*lstat_control.counts[cpu])[index].cum_hold_ticks += (uint64_t)hold_time; + if ((*lstat_control.counts[cpu])[index].max_hold_ticks < hold_time) + (*lstat_control.counts[cpu])[index].max_hold_ticks = hold_time; + } + } + + /* make sure we don't have a stale index value saved */ + PUT_INDEX(lock_ptr,0); + _raw_spin_unlock(lock_ptr); /* do the real unlock */ +} + +/* + * allocate the next global read lock structure and store its index + * in the rwlock at "lock_ptr". + */ +uint32_t alloc_rwlock_struct(rwlock_t *rwlock_ptr) +{ + int index; + int flags; + int cpu=THIS_CPU_NUMBER; + + /* If we've already overflowed, then do a quick exit */ + if (lstat_control.next_free_read_lock_index > LSTAT_MAX_READ_LOCK_INDEX) { + lstat_control.rwlock_overflow++; + return(0); + } + + do { local_irq_save(flags); + _raw_spin_lock(&lstat_control.directory_lock); } while(0); + + /* It is possible this changed while we were waiting for the directory_lock */ + if (lstat_control.state == LSTAT_OFF) { + index=0; + goto unlock; + } + + /* It is possible someone else got here first and set the index */ + if ((index=GET_RWINDEX(rwlock_ptr)) == 0) { + + /* we can't turn on read stats for this lock while there are readers */ + /* (this would mess up the running hold time sum at unlock time) */ + if (RWLOCK_READERS(rwlock_ptr) != 0) { + index=0; + goto unlock; + } + + /* if stats are turned on after being off, we may need to return an old */ + /* index from when the statistics were on last time. ................... */ + for(index=1;index= LSTAT_MAX_READ_LOCK_INDEX) { + lstat_control.rwlock_overflow++; + index = 0; + goto unlock; + } + index = lstat_control.next_free_read_lock_index++; + + /* initialize the global read stats data structure for each cpu */ + for(cpu=0; cpu < smp_num_cpus; cpu++) { + (*lstat_control.read_lock_counts[cpu])[index].lock_ptr = rwlock_ptr; + } +put_index_and_unlock: + /* store the index for the read lock structure into the lock */ + PUT_RWINDEX(rwlock_ptr,index); + } + +unlock: + do { _raw_spin_unlock(&lstat_control.directory_lock); + local_irq_restore(flags);} while(0); + + return(index); +} + +void +_metered_read_lock(rwlock_t *rwlock_ptr) +{ + void *this_pc; + uint32_t start_cycles; + int index; + int cpu; + int flags; + int readers_before, readers_after; + uint64_t cycles64; + + if (lstat_control.state == LSTAT_OFF) { + _raw_read_lock(rwlock_ptr); + /* clean index in case lockmetering turns on before an unlock */ + PUT_RWINDEX(rwlock_ptr, 0); + return; + } + + this_pc = LSTAT_RA(LSTAT_RA_READ); + cpu = THIS_CPU_NUMBER; + index = GET_RWINDEX(rwlock_ptr); + + /* allocate the global stats entry for this lock, if needed */ + if (index==0) { + index = alloc_rwlock_struct(rwlock_ptr); + } + + readers_before = RWLOCK_READERS(rwlock_ptr); + if (_raw_read_trylock(rwlock_ptr)) { + /* + * We have decremented the lock to count a new reader, + * and have confirmed that no writer has it locked. 
+ */ + /* update statistics if enabled */ + if (index>0) { +#ifndef __KERNEL__ + _raw_spin_lock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do { local_irq_save(flags); } while(0); +#endif + lstat_update((void *)rwlock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + /* preserve value of TSC so cum_hold_ticks and start_busy use same value */ + cycles64 = get_cycles64(); + (*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks -= cycles64; + + /* record time and cpu of start of busy period */ + /* this is not perfect (some race conditions are possible) */ + if (readers_before==0) { + (*lstat_control.read_lock_counts[cpu])[index].start_busy = cycles64; + PUT_RW_CPU(rwlock_ptr, cpu); + } + readers_after=RWLOCK_READERS(rwlock_ptr); + if (readers_after > (*lstat_control.read_lock_counts[cpu])[index].max_readers) + (*lstat_control.read_lock_counts[cpu])[index].max_readers = readers_after; +#ifndef __KERNEL__ + _raw_spin_unlock((spinlock_t*)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do {local_irq_restore(flags);} while(0); +#endif + } + + return; + } + /* If we get here, then we could not quickly grab the read lock */ + + start_cycles = get_cycles(); /* start counting the wait time */ + + /* Now spin until read_lock is successful */ + _raw_read_lock(rwlock_ptr); + + lstat_update_time((void *)rwlock_ptr, this_pc, LSTAT_ACT_SPIN, + get_cycles() - start_cycles); + + /* update statistics if they are enabled for this lock */ + if (index>0) { +#ifndef __KERNEL__ + _raw_spin_lock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do { local_irq_save(flags); } while(0); +#endif + cycles64 = get_cycles64(); + (*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks -= cycles64; + + /* this is not perfect (some race conditions are possible) */ + if (readers_before==0) { + (*lstat_control.read_lock_counts[cpu])[index].start_busy = cycles64; + PUT_RW_CPU(rwlock_ptr, cpu); + } + readers_after=RWLOCK_READERS(rwlock_ptr); + if (readers_after > (*lstat_control.read_lock_counts[cpu])[index].max_readers) + (*lstat_control.read_lock_counts[cpu])[index].max_readers = readers_after; + +#ifndef __KERNEL__ + _raw_spin_unlock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do {local_irq_restore(flags);} while(0); +#endif + } +} + +void _metered_read_unlock(rwlock_t *rwlock_ptr) +{ + int index; + int cpu; + int flags; + uint64_t busy_length; + uint64_t cycles64; + + if (lstat_control.state == LSTAT_OFF) { + _raw_read_unlock(rwlock_ptr); + return; + } + + index = GET_RWINDEX(rwlock_ptr); + cpu = THIS_CPU_NUMBER; + + if (index>0) { +#ifndef __KERNEL__ + _raw_spin_lock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + /* updates below are non-atomic */ + do { local_irq_save(flags); } while(0); +#endif + /* preserve value of TSC so cum_hold_ticks and busy_ticks are consistent.. 
*/ + cycles64 = get_cycles64(); + (*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks += cycles64; + (*lstat_control.read_lock_counts[cpu])[index].read_lock_count++; + + /* once again, this is not perfect (some race conditions are possible) */ + if (RWLOCK_READERS(rwlock_ptr) == 1) { + int cpu1 = GET_RW_CPU(rwlock_ptr); + uint64_t last_start_busy = (*lstat_control.read_lock_counts[cpu1])[index].start_busy; + (*lstat_control.read_lock_counts[cpu])[index].busy_periods++; + if (cycles64 > last_start_busy) { + busy_length = cycles64 - last_start_busy; + (*lstat_control.read_lock_counts[cpu])[index].busy_ticks += busy_length; + if (busy_length > (*lstat_control.read_lock_counts[cpu])[index].max_busy) + (*lstat_control.read_lock_counts[cpu])[index].max_busy = busy_length; + } + } +#ifndef __KERNEL__ + _raw_spin_unlock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do {local_irq_restore(flags);} while(0); +#endif + } + + /* unlock the lock */ + _raw_read_unlock(rwlock_ptr); +} + +void _metered_write_lock(rwlock_t *rwlock_ptr) +{ + uint32_t start_cycles; + void *this_pc; + uint32_t spin_ticks = 0; /* in anticipation of a potential wait */ + int index; + int write_index = 0; + int cpu; + enum {writer_writer_conflict, writer_reader_conflict} why_wait = writer_writer_conflict; + + if (lstat_control.state == LSTAT_OFF) { + _raw_write_lock(rwlock_ptr); + /* clean index in case lockmetering turns on before an unlock */ + PUT_RWINDEX(rwlock_ptr, 0); + return; + } + + this_pc = LSTAT_RA(LSTAT_RA_WRITE); + cpu = THIS_CPU_NUMBER; + index = GET_RWINDEX(rwlock_ptr); + + /* allocate the global stats entry for this lock, if needed */ + if (index == 0) { + index = alloc_rwlock_struct(rwlock_ptr); + } + + if (_raw_write_trylock(rwlock_ptr)) { + /* We acquired the lock on the first try */ + write_index = lstat_update((void *)rwlock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + /* save the write_index for use in unlock if stats enabled */ + if (index > 0) + (*lstat_control.read_lock_counts[cpu])[index].write_index = write_index; + return; + } + + /* If we get here, then we could not quickly grab the write lock */ + start_cycles = get_cycles(); /* start counting the wait time */ + + why_wait = RWLOCK_READERS(rwlock_ptr) ? 
writer_reader_conflict : writer_writer_conflict; + + /* Now set the lock and wait for conflicts to disappear */ + _raw_write_lock(rwlock_ptr); + + spin_ticks = get_cycles() - start_cycles; + + /* update stats -- if enabled */ + if (index > 0) + if (spin_ticks) { + if (why_wait == writer_reader_conflict) { + /* waited due to a reader holding the lock */ + write_index = lstat_update_time((void *)rwlock_ptr, this_pc, + LSTAT_ACT_SPIN, spin_ticks); + } else { + /* waited due to another writer holding the lock */ + write_index = lstat_update_time((void *)rwlock_ptr, this_pc, + LSTAT_ACT_WW_SPIN, spin_ticks); + (*lstat_control.counts[cpu])[write_index].cum_wait_ww_ticks += spin_ticks; + if (spin_ticks > + (*lstat_control.counts[cpu])[write_index].max_wait_ww_ticks) { + (*lstat_control.counts[cpu])[write_index].max_wait_ww_ticks = spin_ticks; + } + } + + /* save the directory index for use on write_unlock */ + (*lstat_control.read_lock_counts[cpu])[index].write_index = write_index; + } + +} + +void +_metered_write_unlock(rwlock_t *rwlock_ptr) +{ + int index; + int cpu; + int write_index; + uint32_t hold_time; + + if (lstat_control.state == LSTAT_OFF) { + _raw_write_unlock(rwlock_ptr); + return; + } + + cpu = THIS_CPU_NUMBER; + index = GET_RWINDEX(rwlock_ptr); + + /* update statistics if stats enabled for this lock */ + if (index>0) { + write_index = (*lstat_control.read_lock_counts[cpu])[index].write_index; + + hold_time = get_cycles() - (*lstat_control.counts[cpu])[write_index].acquire_time; + (*lstat_control.counts[cpu])[write_index].cum_hold_ticks += (uint64_t)hold_time; + if ((*lstat_control.counts[cpu])[write_index].max_hold_ticks < hold_time) + (*lstat_control.counts[cpu])[write_index].max_hold_ticks = hold_time; + } + _raw_write_unlock(rwlock_ptr); +} + +int _metered_write_trylock(rwlock_t *rwlock_ptr) +{ + int retval; + void *this_pc = LSTAT_RA(LSTAT_RA_WRITE); + + if ((retval = _raw_write_trylock(rwlock_ptr))) { + lstat_update(rwlock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + } else { + lstat_update(rwlock_ptr, this_pc, LSTAT_ACT_REJECT); + } + + return retval; +} + +#ifdef __KERNEL__ +static void +init_control_space(void) +{ + /* Set all control space pointers to null and indices to "empty" */ + int cpu; + + /* + * Access CPU_CYCLE_FREQUENCY at the outset, which in some + * architectures may trigger a runtime calculation that uses a + * spinlock. Let's do this before lockmetering is turned on. + */ + if (CPU_CYCLE_FREQUENCY == 0) + BUG(); + + lstat_control.hashtab = NULL; + lstat_control.dir = NULL; + for (cpu=0; cpu max_len) + return actual_ret_bcount; + + copy_to_user(buffer, (void *)&req, next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + return actual_ret_bcount; + } else { + /* measurement is off but valid data present */ + /* fetch time info from lstat_control */ + req.ending_time = lstat_control.ending_time; + req.ending_cycles64 = lstat_control.ending_cycles64; + req.enabled_cycles64 = lstat_control.enabled_cycles64; + } + } else { + /* this must be a read while data active--use current time, etc */ + do_gettimeofday(&tv); + req.ending_time = tv.tv_sec; + req.ending_cycles64 = get_cycles64(); + req.enabled_cycles64 = req.ending_cycles64-req.started_cycles64 + + lstat_control.enabled_cycles64; + } + + next_ret_bcount = sizeof(lstat_user_request_t); + if ((actual_ret_bcount + next_ret_bcount) > max_len) + return actual_ret_bcount; + + copy_to_user(buffer, (void *)&req, next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + + if (!lstat_control.counts[0]) /* not initialized? 
*/ + return actual_ret_bcount; + + next_ret_bcount = sizeof(lstat_cpu_counts_t); + for (cpu = 0; cpu < smp_num_cpus; cpu++) { + if ((actual_ret_bcount + next_ret_bcount) > max_len) + return actual_ret_bcount; /* leave early */ + copy_to_user(buffer + actual_ret_bcount, lstat_control.counts[cpu], + next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + } + + next_ret_bcount = LSTAT_MAX_STAT_INDEX * sizeof(lstat_directory_entry_t); + if ( ((actual_ret_bcount + next_ret_bcount) > max_len) + || !lstat_control.dir ) + return actual_ret_bcount; /* leave early */ + + copy_to_user(buffer + actual_ret_bcount, lstat_control.dir, + next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + + next_ret_bcount = sizeof(lstat_read_lock_cpu_counts_t); + for (cpu = 0; cpu < smp_num_cpus; cpu++) { + if (actual_ret_bcount + next_ret_bcount > max_len) + return actual_ret_bcount; + copy_to_user(buffer + actual_ret_bcount, lstat_control.read_lock_counts[cpu], + next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + } + + return actual_ret_bcount; +} + +/* + * Writing to the /proc lockmeter node enables or disables metering. + * based upon the first byte of the "written" data. + * The following values are defined: + * LSTAT_ON: 1st call: allocates storage, intializes and turns on measurement + * subsequent calls just turn on measurement + * LSTAT_OFF: turns off measurement + * LSTAT_RESET: resets statistics + * LSTAT_RELEASE: releases statistics storage + * + * This allows one to accumulate statistics over several lockstat runs: + * + * lockstat on + * lockstat off + * ...repeat above as desired... + * lockstat get + * ...now start a new set of measurements... + * lockstat reset + * lockstat on + * ... + * + */ +ssize_t put_lockmeter_info(const char *buffer, size_t len) +{ + int error = 0; + int dirsize, countsize, read_lock_countsize, hashsize; + int cpu; + char put_char; + int i, read_lock_blocks, flags; + rwlock_t *lock_ptr; + struct timeval tv; + + if (len <= 0) + return -EINVAL; + + _raw_spin_lock(&lstat_control.control_lock); + + get_user(put_char, buffer); + switch (put_char) { + + case LSTAT_OFF: + if (lstat_control.state != LSTAT_OFF) { + /* + * To avoid seeing read lock hold times in an inconsisent state, + * we have to follow this protocol to turn off statistics + */ + do { local_irq_save(flags); } while(0); + /* getting this lock will stop any read lock block allocations */ + _raw_spin_lock(&lstat_control.directory_lock); + /* keep any more read lock blocks from being allocated */ + lstat_control.state = LSTAT_OFF; + /* record how may read lock blocks there are */ + read_lock_blocks = lstat_control.next_free_read_lock_index; + _raw_spin_unlock(&lstat_control.directory_lock); + /* now go through the list of read locks */ + cpu = THIS_CPU_NUMBER; + for(i=1;i 0) - { + if (panic_timeout > 0) { /* * Delay timeout seconds before rebooting the machine. * We can't use the "normal" timers since we just panicked.. diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/sched.c 900-mjb1/kernel/sched.c --- 000-virgin/kernel/sched.c Fri May 30 19:02:24 2003 +++ 900-mjb1/kernel/sched.c Fri May 30 22:02:10 2003 @@ -33,12 +33,15 @@ #include #include -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) #else #define cpu_to_node_mask(cpu) (cpu_online_map) #endif +/* used to soft spin in sched while dump is in progress */ +int dump_oncpu; + /* * Convert user-nice values [ -20 ... 0 ... 
19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -57,6 +60,11 @@ #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) +/* the FIXED_1 gunk is so running averages don't vanish prematurely */ +#define RAVG_WEIGHT 128 +#define RAVG_FACTOR (RAVG_WEIGHT*FIXED_1) +#define RUNNING_AVG(x,y) (((RAVG_WEIGHT-1)*(x)+RAVG_FACTOR*(y))/RAVG_WEIGHT) + /* * These are the 'tuning knobs' of the scheduler: * @@ -64,16 +72,29 @@ * maximum timeslice is 200 msecs. Timeslices get refilled after * they expire. */ -#define MIN_TIMESLICE ( 10 * HZ / 1000) -#define MAX_TIMESLICE (200 * HZ / 1000) -#define CHILD_PENALTY 50 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (10*HZ) -#define STARVATION_LIMIT (10*HZ) -#define NODE_THRESHOLD 125 +int min_timeslice = (10 * HZ) / 1000; +int max_timeslice = (200 * HZ) / 1000; +int child_penalty = 50; +int parent_penalty = 100; +int exit_weight = 3; +int prio_bonus_ratio = 25; +int interactive_delta = 2; +int max_sleep_avg = 10 * HZ; +int starvation_limit = 10 * HZ; +int node_threshold = 125; + +#define MIN_TIMESLICE (min_timeslice) +#define MAX_TIMESLICE (max_timeslice) +#define CHILD_PENALTY (child_penalty) +#define PARENT_PENALTY (parent_penalty) +#define EXIT_WEIGHT (exit_weight) +#define PRIO_BONUS_RATIO (prio_bonus_ratio) +#define INTERACTIVE_DELTA (interactive_delta) +#define MAX_SLEEP_AVG (max_sleep_avg) +#define STARVATION_LIMIT (starvation_limit) +#define NODE_THRESHOLD (node_threshold) + +#define TIMESLICE_GRANULARITY (HZ/20 ?: 1) /* * If a task is 'interactive' then we reinsert it in the active @@ -161,7 +182,7 @@ struct runqueue { struct mm_struct *prev_mm; prio_array_t *active, *expired, arrays[2]; int prev_cpu_load[NR_CPUS]; -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED atomic_t *node_nr_running; int prev_node_load[MAX_NUMNODES]; #endif @@ -169,6 +190,8 @@ struct runqueue { struct list_head migration_queue; atomic_t nr_iowait; + + struct sched_info info; } ____cacheline_aligned; static struct runqueue runqueues[NR_CPUS] __cacheline_aligned; @@ -188,7 +211,7 @@ static struct runqueue runqueues[NR_CPUS # define task_running(rq, p) ((rq)->curr == (p)) #endif -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED /* * Keep track of running tasks. 
@@ -222,13 +245,186 @@ __init void node_nr_running_init(void) cpu_rq(i)->node_nr_running = &node_nr_running[cpu_to_node(i)]; } -#else /* !CONFIG_NUMA */ +#else /* !CONFIG_NUMA_SCHED */ # define nr_running_init(rq) do { } while (0) # define nr_running_inc(rq) do { (rq)->nr_running++; } while (0) # define nr_running_dec(rq) do { (rq)->nr_running--; } while (0) -#endif /* CONFIG_NUMA */ +#endif /* CONFIG_NUMA_SCHED */ + + +struct schedstat { + /* sys_sched_yield stats */ + unsigned long yld_exp_empty; + unsigned long yld_act_empty; + unsigned long yld_both_empty; + unsigned long yld_cnt; + + /* schedule stats */ + unsigned long sched_noswitch; + unsigned long sched_switch; + unsigned long sched_cnt; + + /* load_balance stats */ + unsigned long lb_imbalance; + unsigned long lb_idle; + unsigned long lb_busy; + unsigned long lb_resched; + unsigned long lb_cnt; + unsigned long lb_nobusy; + unsigned long lb_bnode; + + /* pull_task stats */ + unsigned long pt_gained; + unsigned long pt_lost; + unsigned long pt_node_gained; + unsigned long pt_node_lost; + + /* balance_node stats */ + unsigned long bn_cnt; + unsigned long bn_idle; +} ____cacheline_aligned; + +static inline void sched_info_arrive(task_t *t) +{ + unsigned long now = jiffies; + unsigned long diff = now - t->sched_info.last_arrival; + struct runqueue *rq = task_rq(t); + + t->sched_info.inter_arrival_time = + RUNNING_AVG(t->sched_info.inter_arrival_time, diff); + t->sched_info.last_arrival = now; + + if (!rq) + return; + diff = now - rq->info.last_arrival; + rq->info.inter_arrival_time = + RUNNING_AVG(rq->info.inter_arrival_time, diff); + rq->info.last_arrival = now; +} + +/* is this ever used? */ +static inline void sched_info_depart(task_t *t) +{ + struct runqueue *rq = task_rq(t); + unsigned long diff, now = jiffies; + + diff = now - t->sched_info.began_service; + t->sched_info.service_time = + RUNNING_AVG(t->sched_info.service_time, diff); + + if (!rq) + return; + diff = now - rq->info.began_service; + rq->info.service_time = + RUNNING_AVG(rq->info.service_time, diff); +} + +static inline void sched_info_switch(task_t *prev, task_t *next) +{ + struct runqueue *rq = task_rq(prev); + unsigned long diff, now = jiffies; + + /* prev now departs the cpu */ + sched_info_depart(prev); + + /* only for involuntary context switches */ + if (prev->state == TASK_RUNNING) + sched_info_arrive(prev); + + diff = now - next->sched_info.last_arrival; + next->sched_info.response_time = + RUNNING_AVG(next->sched_info.response_time, diff); + next->sched_info.began_service = now; + + if (!rq) + return; + /* yes, reusing next's service time is valid */ + rq->info.response_time = + RUNNING_AVG(rq->info.response_time, diff); + rq->info.began_service = now; + + if (prev->state != TASK_RUNNING) + return; + /* if prev arrived subtract rq's last arrival from its arrival */ + diff = now - rq->info.last_arrival; + rq->info.inter_arrival_time = + RUNNING_AVG(rq->info.inter_arrival_time, diff); + rq->info.last_arrival = now; +} + +/* + * bump this up when changing the output format or the meaning of an existing + * format, so that tools can adapt (or abort) + */ +#define SCHEDSTAT_VERSION 2 + +struct schedstat schedstats[NR_CPUS]; + +/* + * This could conceivably exceed a page's worth of output on machines with + * large number of cpus, where large == about 4096/100 or 40ish. Start + * worrying when we pass 32, probably. Then this has to stop being a + * "simple" entry in proc/proc_misc.c and needs to be an actual seq_file. 
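Where the note above says this handler would eventually have to become a real seq_file once the per-CPU lines outgrow a page, a minimal sketch of that conversion could look like the following. This is illustrative only and not part of the patch; it assumes the seq_file single_open() helper is available in this tree, and schedstat_show()/schedstat_open() are invented names.

	static int schedstat_show(struct seq_file *seq, void *v)
	{
		int i;

		seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
		for (i = 0; i < NR_CPUS; i++) {
			if (!cpu_online(i))
				continue;
			/* one line per online cpu, same field order as the
			 * read_proc handler below (remaining fields elided) */
			seq_printf(seq, "cpu%d %lu %lu ...\n", i,
				   schedstats[i].yld_both_empty,
				   schedstats[i].yld_act_empty);
		}
		return 0;
	}

	static int schedstat_open(struct inode *inode, struct file *file)
	{
		return single_open(file, schedstat_show, NULL);
	}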
+ */ +int schedstats_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct schedstat sums; + int i, len; + + memset(&sums, 0, sizeof(sums)); + len = sprintf(page, "version %d\n", SCHEDSTAT_VERSION); + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) continue; + sums.yld_exp_empty += schedstats[i].yld_exp_empty; + sums.yld_act_empty += schedstats[i].yld_act_empty; + sums.yld_both_empty += schedstats[i].yld_both_empty; + sums.yld_cnt += schedstats[i].yld_cnt; + sums.sched_noswitch += schedstats[i].sched_noswitch; + sums.sched_switch += schedstats[i].sched_switch; + sums.sched_cnt += schedstats[i].sched_cnt; + sums.lb_idle += schedstats[i].lb_idle; + sums.lb_busy += schedstats[i].lb_busy; + sums.lb_resched += schedstats[i].lb_resched; + sums.lb_cnt += schedstats[i].lb_cnt; + sums.lb_imbalance += schedstats[i].lb_imbalance; + sums.lb_nobusy += schedstats[i].lb_nobusy; + sums.lb_bnode += schedstats[i].lb_bnode; + sums.pt_node_gained += schedstats[i].pt_node_gained; + sums.pt_node_lost += schedstats[i].pt_node_lost; + sums.pt_gained += schedstats[i].pt_gained; + sums.pt_lost += schedstats[i].pt_lost; + sums.bn_cnt += schedstats[i].bn_cnt; + sums.bn_idle += schedstats[i].bn_idle; + len += sprintf(page + len, + "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu " + "%lu %lu %lu %lu %lu %lu %lu %lu\n", + i, schedstats[i].yld_both_empty, + schedstats[i].yld_act_empty, schedstats[i].yld_exp_empty, + schedstats[i].yld_cnt, schedstats[i].sched_noswitch, + schedstats[i].sched_switch, schedstats[i].sched_cnt, + schedstats[i].lb_idle, schedstats[i].lb_busy, + schedstats[i].lb_resched, + schedstats[i].lb_cnt, schedstats[i].lb_imbalance, + schedstats[i].lb_nobusy, schedstats[i].lb_bnode, + schedstats[i].pt_gained, schedstats[i].pt_lost, + schedstats[i].pt_node_gained, schedstats[i].pt_node_lost, + schedstats[i].bn_cnt, schedstats[i].bn_idle); + } + len += sprintf(page + len, + "totals %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu " + "%lu %lu %lu %lu %lu %lu %lu\n", + sums.yld_both_empty, sums.yld_act_empty, sums.yld_exp_empty, + sums.yld_cnt, sums.sched_noswitch, sums.sched_switch, + sums.sched_cnt, sums.lb_idle, sums.lb_busy, sums.lb_resched, + sums.lb_cnt, sums.lb_imbalance, sums.lb_nobusy, sums.lb_bnode, + sums.pt_gained, sums.pt_lost, sums.pt_node_gained, + sums.pt_node_lost, sums.bn_cnt, sums.bn_idle); + + return len; +} /* * task_rq_lock - lock the runqueue a given task resides on and disable @@ -487,15 +683,18 @@ repeat_lock_task: (p->cpus_allowed & (1UL << smp_processor_id())))) { set_task_cpu(p, smp_processor_id()); + sched_info_arrive(p); task_rq_unlock(rq, &flags); goto repeat_lock_task; } if (old_state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; - if (sync) + if (sync) { + sched_info_arrive(p); __activate_task(p, rq); - else { + } else { activate_task(p, rq); + sched_info_arrive(p); if (p->prio < rq->curr->prio) resched_task(rq->curr); } @@ -549,6 +748,7 @@ void wake_up_forked_process(task_t * p) p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100; p->prio = effective_prio(p); set_task_cpu(p, smp_processor_id()); + sched_info_arrive(p); if (unlikely(!current->array)) __activate_task(p, rq); @@ -656,7 +856,6 @@ static inline task_t * context_switch(ru return prev; } - /* * nr_running, nr_uninterruptible and nr_context_switches: * @@ -674,6 +873,11 @@ unsigned long nr_running(void) return sum; } +unsigned long nr_running_cpu(int cpu) +{ + return cpu_rq(cpu)->nr_running; +} + unsigned long nr_uninterruptible(void) { unsigned long i, sum = 
0; @@ -710,6 +914,11 @@ unsigned long nr_iowait(void) return sum; } +void cpu_sched_info(struct sched_info *info, int cpu) +{ + memcpy(info, &cpu_rq(cpu)->info, sizeof(struct sched_info)); +} + /* * double_rq_lock - safely lock two runqueues * @@ -744,7 +953,7 @@ static inline void double_rq_unlock(runq spin_unlock(&rq2->lock); } -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED /* * If dest_cpu is allowed for this process, migrate the task to it. * This is accomplished by forcing the cpu_allowed mask to only @@ -771,30 +980,64 @@ static void sched_migrate_task(task_t *p */ static int sched_best_cpu(struct task_struct *p) { - int i, minload, load, best_cpu, node = 0; + int cpu, node, minload, load, best_cpu, best_node; + int this_cpu, this_node, this_node_load; unsigned long cpumask; - best_cpu = task_cpu(p); - if (cpu_rq(best_cpu)->nr_running <= 2) - return best_cpu; + this_cpu = best_cpu = task_cpu(p); + if (cpu_rq(this_cpu)->nr_running <= 2) + return this_cpu; + this_node = best_node = cpu_to_node(this_cpu); + + /* + * First look for any node-local idle queue and use that. + * This improves performance under light loads (mbligh). + * In case this node turns out to be the lightest node, store the best + * cpu that we find, so we don't go sniffing the same runqueues again. + */ + minload = 10000000; + cpumask = node_to_cpumask(this_node); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (!(cpumask & (1UL << cpu))) + continue; + load = cpu_rq(cpu)->nr_running; + if (load == 0) + return cpu; + if (load < minload) { + minload = load; + best_cpu = cpu; + } + } + /* Now find the lightest loaded node, and put it in best_node */ minload = 10000000; - for (i = 0; i < numnodes; i++) { - load = atomic_read(&node_nr_running[i]); + this_node_load = atomic_read(&node_nr_running[this_node]); + for (node = 0; node < numnodes; node++) { + if (node == this_node) + load = this_node_load; + else + load = atomic_read(&node_nr_running[node]); if (load < minload) { minload = load; - node = i; + best_node = node; } } + /* If we chose this node, we already did the legwork earlier */ + if (best_node == this_node) + return best_cpu; + + /* Now find the lightest loaded cpu on best_node, and use that */ minload = 10000000; - cpumask = node_to_cpumask(node); - for (i = 0; i < NR_CPUS; ++i) { - if (!(cpumask & (1UL << i))) + best_cpu = this_cpu; + cpumask = node_to_cpumask(best_node); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (!(cpumask & (1UL << cpu))) continue; - if (cpu_rq(i)->nr_running < minload) { - best_cpu = i; - minload = cpu_rq(i)->nr_running; + load = cpu_rq(cpu)->nr_running; + if (load < minload) { + minload = load; + best_cpu = cpu; } } return best_cpu; @@ -838,7 +1081,10 @@ static int find_busiest_node(int this_no return node; } -#endif /* CONFIG_NUMA */ +#endif /* CONFIG_NUMA_SCHED */ + +int idle_node_rebalance_ratio = 10; +int busy_node_rebalance_ratio = 2; #ifdef CONFIG_SMP @@ -951,6 +1197,12 @@ out: */ static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu) { + if (cpu_to_node(this_cpu) != cpu_to_node(src_rq - runqueues)) { + schedstats[this_cpu].pt_node_gained++; + schedstats[src_rq - runqueues].pt_node_lost++; + } + schedstats[this_cpu].pt_gained++; + schedstats[src_rq - runqueues].pt_lost++; dequeue_task(p, src_array); nr_running_dec(src_rq); set_task_cpu(p, this_cpu); @@ -985,10 +1237,14 @@ static void load_balance(runqueue_t *thi struct list_head *head, *curr; task_t *tmp; + schedstats[this_cpu].lb_cnt++; busiest = 
find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask); - if (!busiest) + if (!busiest) { + schedstats[this_cpu].lb_nobusy++; goto out; + } + schedstats[this_cpu].lb_imbalance += imbalance; /* * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to @@ -1067,18 +1323,22 @@ out: */ #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) #define BUSY_REBALANCE_TICK (HZ/5 ?: 1) -#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5) -#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2) +#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * idle_node_rebalance_ratio) +#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * busy_node_rebalance_ratio) -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED static void balance_node(runqueue_t *this_rq, int idle, int this_cpu) { int node = find_busiest_node(cpu_to_node(this_cpu)); unsigned long cpumask, this_cpumask = 1UL << this_cpu; + schedstats[this_cpu].bn_cnt++; + if (idle) + schedstats[this_cpu].bn_idle++; if (node >= 0) { cpumask = node_to_cpumask(node) | this_cpumask; spin_lock(&this_rq->lock); + schedstats[this_cpu].lb_bnode++; load_balance(this_rq, idle, cpumask); spin_unlock(&this_rq->lock); } @@ -1087,9 +1347,7 @@ static void balance_node(runqueue_t *thi static void rebalance_tick(runqueue_t *this_rq, int idle) { -#ifdef CONFIG_NUMA int this_cpu = smp_processor_id(); -#endif unsigned long j = jiffies; /* @@ -1101,23 +1359,25 @@ static void rebalance_tick(runqueue_t *t * are not balanced.) */ if (idle) { -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED if (!(j % IDLE_NODE_REBALANCE_TICK)) balance_node(this_rq, idle, this_cpu); #endif if (!(j % IDLE_REBALANCE_TICK)) { spin_lock(&this_rq->lock); - load_balance(this_rq, 0, cpu_to_node_mask(this_cpu)); + schedstats[this_cpu].lb_idle++; + load_balance(this_rq, idle, cpu_to_node_mask(this_cpu)); spin_unlock(&this_rq->lock); } return; } -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED if (!(j % BUSY_NODE_REBALANCE_TICK)) balance_node(this_rq, idle, this_cpu); #endif if (!(j % BUSY_REBALANCE_TICK)) { spin_lock(&this_rq->lock); + schedstats[this_cpu].lb_busy++; load_balance(this_rq, idle, cpu_to_node_mask(this_cpu)); spin_unlock(&this_rq->lock); } @@ -1225,6 +1485,27 @@ void scheduler_tick(int user_ticks, int enqueue_task(p, rq->expired); } else enqueue_task(p, rq->active); + } else { + /* + * Prevent a too long timeslice allowing a task to monopolize + * the CPU. We do this by splitting up the timeslice into + * smaller pieces. + * + * Note: this does not mean the task's timeslices expire or + * get lost in any way, they just might be preempted by + * another task of equal priority. (one with higher + * priority would have preempted this task already.) We + * requeue this task to the end of the list on this priority + * level, which is in essence a round-robin of tasks with + * equal priority. + */ + if (!(p->time_slice % TIMESLICE_GRANULARITY) && + (p->array == rq->active)) { + dequeue_task(p, rq->active); + set_tsk_need_resched(p); + p->prio = effective_prio(p); + enqueue_task(p, rq->active); + } } out_unlock: spin_unlock(&rq->lock); @@ -1237,19 +1518,32 @@ void scheduling_functions_start_here(voi /* * schedule() is the main scheduler function. 
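To put rough numbers on the timeslice-splitting hunk added to scheduler_tick() above, here is a worked example; the figures are not from the patch text and assume HZ=1000 together with the default tunables defined earlier in this file:

	TIMESLICE_GRANULARITY = HZ/20        = 50 ticks  (~50ms)
	MAX_TIMESLICE         = 200*HZ/1000  = 200 ticks (~200ms)

So a CPU hog that was granted the maximum 200ms timeslice is requeued behind other runnable tasks of the same priority roughly every 50ms. Its remaining timeslice is preserved; it simply round-robins with its equal-priority peers instead of running the whole 200ms uninterrupted.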
*/ +#ifdef CONFIG_KGDB_THREAD +asmlinkage void do_schedule(void) +#else asmlinkage void schedule(void) +#endif { task_t *prev, *next; runqueue_t *rq; prio_array_t *array; struct list_head *queue; - int idx; + int idx, mycpu = smp_processor_id(); + + /* + * If crash dump is in progress, this other cpu's + * need to wait until it completes. + * NB: this code is optimized away for kernels without dumping enabled. + */ + if (unlikely(dump_oncpu)) + goto dump_scheduling_disabled; /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ + schedstats[mycpu].sched_cnt++; if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) { if (unlikely(in_atomic())) { printk(KERN_ERR "bad: scheduling while atomic!\n"); @@ -1288,6 +1582,7 @@ need_resched: pick_next_task: if (unlikely(!rq->nr_running)) { #ifdef CONFIG_SMP + schedstats[mycpu].lb_resched++; load_balance(rq, 1, cpu_to_node_mask(smp_processor_id())); if (rq->nr_running) goto pick_next_task; @@ -1302,11 +1597,13 @@ pick_next_task: /* * Switch the active and expired arrays. */ + schedstats[mycpu].sched_switch++; rq->active = rq->expired; rq->expired = array; array = rq->active; rq->expired_timestamp = 0; } + schedstats[mycpu].sched_noswitch++; idx = sched_find_first_bit(array->bitmap); queue = array->queue + idx; @@ -1319,6 +1616,7 @@ switch_tasks: if (likely(prev != next)) { rq->nr_switches++; + sched_info_switch(prev, next); rq->curr = next; prepare_arch_switch(rq, next); @@ -1333,6 +1631,16 @@ switch_tasks: preempt_enable_no_resched(); if (test_thread_flag(TIF_NEED_RESCHED)) goto need_resched; + + return; + + dump_scheduling_disabled: + /* allow scheduling only if this is the dumping cpu */ + if (dump_oncpu != smp_processor_id()+1) { + while (dump_oncpu) + cpu_relax(); + } + return; } #ifdef CONFIG_PREEMPT @@ -1466,6 +1774,20 @@ void complete_all(struct completion *x) spin_unlock_irqrestore(&x->wait.lock, flags); } +#ifdef CONFIG_KGDB_THREAD +asmlinkage void user_schedule(void) +{ + current->thread.kgdbregs = NULL; + do_schedule(); +} + +asmlinkage void kern_do_schedule(struct pt_regs regs) +{ + current->thread.kgdbregs = ®s; + do_schedule(); +} +#endif + void wait_for_completion(struct completion *x) { might_sleep(); @@ -1958,6 +2280,7 @@ asmlinkage long sys_sched_yield(void) { runqueue_t *rq = this_rq_lock(); prio_array_t *array = current->array; + int mycpu = smp_processor_id(); /* * We implement yielding by moving the task into the expired @@ -1966,7 +2289,15 @@ asmlinkage long sys_sched_yield(void) * (special rule: RT tasks will just roundrobin in the active * array.) 
*/ + schedstats[mycpu].yld_cnt++; if (likely(!rt_task(current))) { + if (current->array->nr_active == 1) { + schedstats[mycpu].yld_act_empty++; + if (!rq->expired->nr_active) + schedstats[mycpu].yld_both_empty++; + } else if (!rq->expired->nr_active) { + schedstats[mycpu].yld_exp_empty++; + } dequeue_task(current, array); enqueue_task(current, rq->expired); } else { diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/sysctl.c 900-mjb1/kernel/sysctl.c --- 000-virgin/kernel/sysctl.c Fri May 30 19:02:24 2003 +++ 900-mjb1/kernel/sysctl.c Fri May 30 19:05:11 2003 @@ -57,6 +57,18 @@ extern char core_pattern[]; extern int cad_pid; extern int pid_max; extern int sysctl_lower_zone_protection; +extern int min_timeslice; +extern int max_timeslice; +extern int child_penalty; +extern int parent_penalty; +extern int exit_weight; +extern int prio_bonus_ratio; +extern int interactive_delta; +extern int max_sleep_avg; +extern int starvation_limit; +extern int node_threshold; +extern int idle_node_rebalance_ratio; +extern int busy_node_rebalance_ratio; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -114,6 +126,7 @@ static struct ctl_table_header root_tabl static ctl_table kern_table[]; static ctl_table vm_table[]; +static ctl_table sched_table[]; #ifdef CONFIG_NET extern ctl_table net_table[]; #endif @@ -158,6 +171,7 @@ static ctl_table root_table[] = { {CTL_FS, "fs", NULL, 0, 0555, fs_table}, {CTL_DEBUG, "debug", NULL, 0, 0555, debug_table}, {CTL_DEV, "dev", NULL, 0, 0555, dev_table}, + {CTL_SCHED, "sched", NULL, 0, 0555, sched_table}, {0} }; @@ -362,7 +376,49 @@ static ctl_table debug_table[] = { static ctl_table dev_table[] = { {0} -}; +}; + +static ctl_table sched_table[] = { + {SCHED_MAX_TIMESLICE, "max_timeslice", &max_timeslice, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_MIN_TIMESLICE, "min_timeslice", &min_timeslice, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_CHILD_PENALTY, "child_penalty", &child_penalty, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_PARENT_PENALTY, "parent_penalty", &parent_penalty, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_EXIT_WEIGHT, "exit_weight", &exit_weight, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_PRIO_BONUS_RATIO, "prio_bonus_ratio", &prio_bonus_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_INTERACTIVE_DELTA, "interactive_delta", &interactive_delta, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_MAX_SLEEP_AVG, "max_sleep_avg", &max_sleep_avg, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_STARVATION_LIMIT, "starvation_limit", &starvation_limit, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_NODE_THRESHOLD, "node_threshold", &node_threshold, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + sysctl_intvec, NULL, &one, NULL}, + {SCHED_IDLE_NODE_REBALANCE_RATIO, "idle_node_rebalance_ratio", + &idle_node_rebalance_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_BUSY_NODE_REBALANCE_RATIO, "busy_node_rebalance_ratio", + &busy_node_rebalance_ratio, + sizeof(int), 0644, NULL, 
&proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {0} +}; extern void init_irq_proc (void); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/timer.c 900-mjb1/kernel/timer.c --- 000-virgin/kernel/timer.c Fri May 30 19:02:24 2003 +++ 900-mjb1/kernel/timer.c Fri May 30 19:26:34 2003 @@ -747,6 +747,8 @@ static unsigned long count_active_tasks( * Requires xtime_lock to access. */ unsigned long avenrun[3]; +unsigned long tasks_running[3]; +unsigned long cpu_tasks_running[3][NR_CPUS]; /* * calc_load - given tick count, update the avenrun load estimates. @@ -754,8 +756,9 @@ unsigned long avenrun[3]; */ static inline void calc_load(unsigned long ticks) { - unsigned long active_tasks; /* fixed-point */ + unsigned long active_tasks, running_tasks; /* fixed-point */ static int count = LOAD_FREQ; + int cpu; count -= ticks; if (count < 0) { @@ -764,6 +767,19 @@ static inline void calc_load(unsigned lo CALC_LOAD(avenrun[0], EXP_1, active_tasks); CALC_LOAD(avenrun[1], EXP_5, active_tasks); CALC_LOAD(avenrun[2], EXP_15, active_tasks); + running_tasks = nr_running() * FIXED_1; + CALC_LOAD(tasks_running[0], EXP_1, running_tasks); + CALC_LOAD(tasks_running[1], EXP_5, running_tasks); + CALC_LOAD(tasks_running[2], EXP_15, running_tasks); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (!cpu_online(cpu)) + continue; + running_tasks = nr_running_cpu(cpu) * FIXED_1; + CALC_LOAD(cpu_tasks_running[0][cpu], EXP_1, running_tasks); + CALC_LOAD(cpu_tasks_running[1][cpu], EXP_5, running_tasks); + CALC_LOAD(cpu_tasks_running[2][cpu], EXP_15, running_tasks); + } + } } diff -urpN -X /home/fletch/.diff.exclude 000-virgin/lib/Kconfig 900-mjb1/lib/Kconfig --- 000-virgin/lib/Kconfig Tue Apr 8 14:38:21 2003 +++ 900-mjb1/lib/Kconfig Fri May 30 19:24:56 2003 @@ -17,14 +17,14 @@ config CRC32 # config ZLIB_INFLATE tristate - default y if CRAMFS=y || PPP_DEFLATE=y || JFFS2_FS=y || ZISOFS_FS=y || BINFMT_ZFLAT=y || CRYPTO_DEFLATE=y - default m if CRAMFS=m || PPP_DEFLATE=m || JFFS2_FS=m || ZISOFS_FS=m || BINFMT_ZFLAT=m || CRYPTO_DEFLATE=m + default y if CRAMFS=y || PPP_DEFLATE=y || JFFS2_FS=y || ZISOFS_FS=y || BINFMT_ZFLAT=y || CRYPTO_DEFLATE=y || CRASH_DUMP_COMPRESS_GZIP=y + default m if CRAMFS=m || PPP_DEFLATE=m || JFFS2_FS=m || ZISOFS_FS=m || BINFMT_ZFLAT=m || CRYPTO_DEFLATE=m || CRASH_DUMP_COMPRESS_GZIP=m config ZLIB_DEFLATE tristate default m if PPP_DEFLATE!=y && JFFS2_FS!=y && CRYPTO_DEFLATE!=y && \ - (PPP_DEFLATE=m || JFFS2_FS=m || CRYPTO_DEFLATE=m) - default y if PPP_DEFLATE=y || JFFS2_FS=y || CRYPTO_DEFLATE=y + (PPP_DEFLATE=m || JFFS2_FS=m || CRYPTO_DEFLATE=m) || CRASH_DUMP_COMPRESS_GZIP=m + default y if PPP_DEFLATE=y || JFFS2_FS=y || CRYPTO_DEFLATE=y || CRASH_DUMP_COMPRESS_GZIP=y endmenu diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/Makefile 900-mjb1/mm/Makefile --- 000-virgin/mm/Makefile Thu Feb 13 11:08:15 2003 +++ 900-mjb1/mm/Makefile Fri May 30 19:28:10 2003 @@ -7,7 +7,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ shmem.o vmalloc.o -obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ +obj-y := bootmem.o fadvise.o filemap.o membind.o mempool.o oom_kill.o \ page_alloc.o page-writeback.o pdflush.o readahead.o \ slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/filemap.c 900-mjb1/mm/filemap.c --- 000-virgin/mm/filemap.c Fri May 30 19:02:24 2003 +++ 900-mjb1/mm/filemap.c Fri May 30 22:04:53 2003 @@ -63,6 +63,9 @@ * ->mmap_sem * ->i_shared_sem (various places) * + * ->lock_page + * 
->i_shared_sem (page_convert_anon) + * * ->inode_lock * ->sb_lock (fs/fs-writeback.c) * ->mapping->page_lock (__sync_single_inode) @@ -267,19 +270,32 @@ static wait_queue_head_t *page_waitqueue return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; } -void wait_on_page_bit(struct page *page, int bit_nr) +int wait_on_page_bit_wq(struct page *page, int bit_nr, wait_queue_t *wait) { wait_queue_head_t *waitqueue = page_waitqueue(page); - DEFINE_WAIT(wait); + DEFINE_WAIT(local_wait); + if (!wait) + wait = &local_wait; + do { - prepare_to_wait(waitqueue, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(waitqueue, wait, TASK_UNINTERRUPTIBLE); if (test_bit(bit_nr, &page->flags)) { sync_page(page); + if (!is_sync_wait(wait)) + return -EIOCBRETRY; io_schedule(); } } while (test_bit(bit_nr, &page->flags)); - finish_wait(waitqueue, &wait); + finish_wait(waitqueue, wait); + + return 0; +} +EXPORT_SYMBOL(wait_on_page_bit_wq); + +void wait_on_page_bit(struct page *page, int bit_nr) +{ + wait_on_page_bit_wq(page, bit_nr, NULL); } EXPORT_SYMBOL(wait_on_page_bit); @@ -335,19 +351,31 @@ EXPORT_SYMBOL(end_page_writeback); * chances are that on the second loop, the block layer's plug list is empty, * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. */ -void __lock_page(struct page *page) +int __lock_page_wq(struct page *page, wait_queue_t *wait) { wait_queue_head_t *wqh = page_waitqueue(page); - DEFINE_WAIT(wait); + DEFINE_WAIT(local_wait); + if (!wait) + wait = &local_wait; + while (TestSetPageLocked(page)) { - prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wqh, wait, TASK_UNINTERRUPTIBLE); if (PageLocked(page)) { sync_page(page); + if (!is_sync_wait(wait)) + return -EIOCBRETRY; io_schedule(); } } - finish_wait(wqh, &wait); + finish_wait(wqh, wait); + return 0; +} +EXPORT_SYMBOL(__lock_page_wq); + +void __lock_page(struct page *page) +{ + __lock_page_wq(page, NULL); } EXPORT_SYMBOL(__lock_page); @@ -397,8 +425,8 @@ struct page *find_trylock_page(struct ad * * Returns zero if the page was not present. find_lock_page() may sleep. */ -struct page *find_lock_page(struct address_space *mapping, - unsigned long offset) +struct page *find_lock_page_wq(struct address_space *mapping, + unsigned long offset, wait_queue_t *wait) { struct page *page; @@ -409,7 +437,10 @@ repeat: page_cache_get(page); if (TestSetPageLocked(page)) { spin_unlock(&mapping->page_lock); - lock_page(page); + if (-EIOCBRETRY == lock_page_wq(page, wait)) { + page_cache_release(page); + return ERR_PTR(-EIOCBRETRY); + } spin_lock(&mapping->page_lock); /* Has the page been truncated while we slept? */ @@ -424,6 +455,12 @@ repeat: return page; } +struct page *find_lock_page(struct address_space *mapping, + unsigned long offset) +{ + return find_lock_page_wq(mapping, offset, NULL); +} + /** * find_or_create_page - locate or add a pagecache page * @@ -620,7 +657,13 @@ page_not_up_to_date: goto page_ok; /* Get exclusive access to the page ... */ - lock_page(page); + + if (lock_page_wq(page, current->io_wait)) { + pr_debug("queued lock page \n"); + error = -EIOCBRETRY; + /* TBD: should we hold on to the cached page ? */ + goto sync_error; + } /* Did it get unhashed before we got the lock? 
*/ if (!page->mapping) { @@ -642,12 +685,19 @@ readpage: if (!error) { if (PageUptodate(page)) goto page_ok; - wait_on_page_locked(page); + if (wait_on_page_locked_wq(page, current->io_wait)) { + pr_debug("queued wait_on_page \n"); + error = -EIOCBRETRY; + /*TBD:should we hold on to the cached page ?*/ + goto sync_error; + } + if (PageUptodate(page)) goto page_ok; error = -EIO; } +sync_error: /* UHHUH! A synchronous read error occurred. Report it */ desc->error = error; page_cache_release(page); @@ -818,6 +868,10 @@ generic_file_read(struct file *filp, cha struct kiocb kiocb; ssize_t ret; + if (current->io_wait != NULL) { + printk("current->io_wait != NULL\n"); + dump_stack(); + } init_sync_kiocb(&kiocb, filp); ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos); if (-EIOCBQUEUED == ret) @@ -850,6 +904,7 @@ ssize_t generic_file_sendfile(struct fil { read_descriptor_t desc; + BUG_ON(current->io_wait != NULL); if (!count) return 0; @@ -1372,7 +1427,9 @@ __grab_cache_page(struct address_space * int err; struct page *page; repeat: - page = find_lock_page(mapping, index); + page = find_lock_page_wq(mapping, index, current->io_wait); + if (IS_ERR(page)) + return page; if (!page) { if (!*cached_page) { *cached_page = page_cache_alloc(mapping); @@ -1689,6 +1746,10 @@ generic_file_aio_write_nolock(struct kio fault_in_pages_readable(buf, bytes); page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); + if (IS_ERR(page)) { + status = PTR_ERR(page); + break; + } if (!page) { status = -ENOMEM; break; @@ -1696,6 +1757,8 @@ generic_file_aio_write_nolock(struct kio status = a_ops->prepare_write(file, page, offset, offset+bytes); if (unlikely(status)) { + if (-EIOCBRETRY == status) + pr_debug("queued prepare_write\n"); /* * prepare_write() may have instantiated a few blocks * outside i_size. Trim these off again. 
@@ -1736,7 +1799,11 @@ generic_file_aio_write_nolock(struct kio page_cache_release(page); if (status < 0) break; - balance_dirty_pages_ratelimited(mapping); + status = balance_dirty_pages_ratelimited(mapping); + if (status < 0) { + pr_debug("async balance_dirty_pages\n"); + break; + } cond_resched(); } while (count); *ppos = pos; @@ -1785,7 +1852,8 @@ ssize_t generic_file_aio_write(struct ki BUG_ON(iocb->ki_pos != pos); - down(&inode->i_sem); + if ((err = down_wq(&inode->i_sem, current->io_wait))) + return err; err = generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); up(&inode->i_sem); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/fremap.c 900-mjb1/mm/fremap.c --- 000-virgin/mm/fremap.c Fri May 30 19:02:24 2003 +++ 900-mjb1/mm/fremap.c Fri May 30 19:10:40 2003 @@ -60,10 +60,26 @@ int install_page(struct mm_struct *mm, s pgd_t *pgd; pmd_t *pmd; struct pte_chain *pte_chain; + unsigned long pgidx; pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) goto err; + + /* + * Convert this page to anon for objrmap if it's nonlinear + */ + pgidx = (addr - vma->vm_start) >> PAGE_SHIFT; + pgidx += vma->vm_pgoff; + pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; + if (!PageAnon(page) && (page->index != pgidx)) { + lock_page(page); + err = page_convert_anon(page); + unlock_page(page); + if (err < 0) + goto err_free; + } + pgd = pgd_offset(mm, addr); spin_lock(&mm->page_table_lock); @@ -85,12 +101,11 @@ int install_page(struct mm_struct *mm, s if (flush) flush_tlb_page(vma, addr); update_mmu_cache(vma, addr, *pte); - spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); - return 0; + err = 0; err_unlock: spin_unlock(&mm->page_table_lock); +err_free: pte_chain_free(pte_chain); err: return err; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/membind.c 900-mjb1/mm/membind.c --- 000-virgin/mm/membind.c Wed Dec 31 16:00:00 1969 +++ 900-mjb1/mm/membind.c Fri May 30 19:28:10 2003 @@ -0,0 +1,191 @@ +#include +#include +#include +#include +#include + +#include +#include +#include + + +static inline void cpumask_to_nodemask(DECLARE_BITMAP(cpumask, NR_CPUS), + DECLARE_BITMAP(nodemask, MAX_NUMNODES)) +{ + int i; + + for (i = 0; i < NR_CPUS; i++) + if (test_bit(i, (cpumask))) + set_bit(cpu_to_node(i), (nodemask)); +} + +/* + * Takes a BITMAP of nodes as an argument, and ensures that at least one of + * the nodes in the bitmap are actually online. + * Returns 0 if at least one specified node is online, -EINVAL otherwise. + */ +static inline int check_binding_nodemask(DECLARE_BITMAP(nodemask, MAX_NUMNODES)){ + int i; + + /* Make sure at least one specified node is online */ + for (i = 0; i < MAX_NUMNODES; i++) + if (test_bit(i, nodemask) && node_online(i)) + return 0; + return -EINVAL; +} + +/* + * Takes a policy as an argument and ensures it is a correct policy flag. + * Returns 0 if the policy flag is correct (and possibly alters passed in value), + * returns -EINVAL otherwise. + */ +static inline int check_binding_policy(unsigned long *policy) +{ + unsigned long test; + + /* Test SOFT/HARD binding */ + test = *policy & (MB_SOFTBIND | MB_HARDBIND); + switch (test){ + case 0: + /* If neither specified, default to HARDBIND */ + *policy |= MB_HARDBIND; + case MB_SOFTBIND: + case MB_HARDBIND: + break; + default: + /* User specified some combination. Bad User! 
*/ + return -EINVAL; + } + + /* Test FIRSTREF/STRIPE/MOSTFREE binding */ + test = *policy & (MB_FIRSTREF | MB_STRIPE | MB_MOSTFREE); + switch (test){ + case 0: + /* If none specified, default to FIRSTREF */ + *policy |= MB_FIRSTREF; + case MB_FIRSTREF: + case MB_STRIPE: + break; + case MB_MOSTFREE: + /* Not Implemented Yet */ + default: + return -EINVAL; + } + return 0; +} + +static inline int add_zones(pg_data_t *pgdat, struct zonelist *zonelist, + int zone_num, int zone_type) +{ + switch (zone_type) { + struct zone *zone; + default: + BUG(); + case ZONE_HIGHMEM: + zone = pgdat->node_zones + ZONE_HIGHMEM; + if (zone->present_pages) + zonelist->zones[zone_num++] = zone; + case ZONE_NORMAL: + zone = pgdat->node_zones + ZONE_NORMAL; + if (zone->present_pages) + zonelist->zones[zone_num++] = zone; + case ZONE_DMA: + zone = pgdat->node_zones + ZONE_DMA; + if (zone->present_pages) + zonelist->zones[zone_num++] = zone; + } + return zone_num; +} + +static struct binding *alloc_binding(unsigned long policy, + DECLARE_BITMAP(nodemask, MAX_NUMNODES), int gfp_flag) +{ + struct binding *binding; + int node, zone_num, zone_type; + + if (check_binding_policy(&policy) || check_binding_nodemask(nodemask)) + return NULL; + + binding = (struct binding *)kmalloc(sizeof(struct binding), GFP_KERNEL); + if (!binding) + return NULL; + memset(binding, 0, sizeof(struct binding)); + + /* TODO: Figure in HARD/SOFT binding, policies */ + binding->policy = policy; + + gfp_flag &= GFP_ZONEMASK; + if (gfp_flag & __GFP_HIGHMEM) + zone_type = ZONE_HIGHMEM; + else if (gfp_flag & __GFP_DMA) + zone_type = ZONE_DMA; + else + zone_type = ZONE_NORMAL; + + for (node = 0, zone_num = 0; node < MAX_NUMNODES; node++) { + if (test_bit(node, nodemask) && node_online(node)) + zone_num = add_zones(NODE_DATA(node), &binding->zonelist, zone_num, zone_type); + } + binding->zonelist.zones[zone_num] = NULL; + + if (!zone_num) { + kfree(binding); + binding = NULL; + } + return binding; +} + + +/* + * membind - Bind a range of a process' VM space to a set of memory blocks + * according to a predefined policy. + * + * @start: beginning address of memory region to bind + * @len: length of memory region to bind + * @policy: flag specifying the policy to use for the segment + * @mask_ptr: pointer to bitmask of cpus + * @mask_len: length of the bitmask + */ +asmlinkage unsigned long sys_membind(unsigned long start, unsigned long len, + unsigned long policy, unsigned long *mask_ptr, unsigned int mask_len) +{ + DECLARE_BITMAP(cpu_mask, NR_CPUS); + DECLARE_BITMAP(node_mask, MAX_NUMNODES); + struct vm_area_struct *vma = NULL; + struct address_space *mapping; + int error = 0; + + /* + * Deal with getting cpumask from userspace + * and translating to nodemask + */ + if (mask_len > NR_CPUS) { + error = -EINVAL; + goto out; + } + CLEAR_BITMAP(cpu_mask, NR_CPUS); + CLEAR_BITMAP(node_mask, MAX_NUMNODES); + if (copy_from_user(cpu_mask, mask_ptr, (mask_len+7)/8)) { + error = -EFAULT; + goto out; + } + cpumask_to_nodemask(cpu_mask, node_mask); + + vma = find_vma(current->mm, start); + if (!(vma && vma->vm_file && vma->vm_ops && vma->vm_ops->nopage == shmem_nopage)){ + /* This isn't a shm segment. For now, we bail. 
*/ + printk("%s: Can only bind shm(em) segments for now!\n", __FUNCTION__); + error = -EINVAL; + goto out; + } + + mapping = vma->vm_file->f_dentry->d_inode->i_mapping; + mapping->binding = alloc_binding(policy, node_mask, mapping->gfp_mask); + if (!mapping->binding){ + printk("%s: Error while building memory binding!\n", __FUNCTION__); + error = -EFAULT; + } + +out: + return error; +} diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/memory.c 900-mjb1/mm/memory.c --- 000-virgin/mm/memory.c Fri May 30 19:02:24 2003 +++ 900-mjb1/mm/memory.c Fri May 30 19:07:08 2003 @@ -102,8 +102,7 @@ static inline void free_one_pmd(struct m static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir) { - int j; - pmd_t * pmd; + pmd_t * pmd, * md, * emd; if (pgd_none(*dir)) return; @@ -114,8 +113,21 @@ static inline void free_one_pgd(struct m } pmd = pmd_offset(dir, 0); pgd_clear(dir); - for (j = 0; j < PTRS_PER_PMD ; j++) - free_one_pmd(tlb, pmd+j); + /* + * Beware if changing the loop below. It once used int j, + * for (j = 0; j < PTRS_PER_PMD; j++) + * free_one_pmd(pmd+j); + * but some older i386 compilers (e.g. egcs-2.91.66, gcc-2.95.3) + * terminated the loop with a _signed_ address comparison + * using "jle", when configured for HIGHMEM64GB (X86_PAE). + * If also configured for 3GB of kernel virtual address space, + * if page at physical 0x3ffff000 virtual 0x7ffff000 is used as + * a pmd, when that mm exits the loop goes on to free "entries" + * found at 0x80000000 onwards. The loop below compiles instead + * to be terminated by unsigned address comparison using "jb". + */ + for (md = pmd, emd = pmd + PTRS_PER_PMD; md < emd; md++) + free_one_pmd(tlb,md); pmd_free_tlb(tlb, pmd); } @@ -1038,6 +1050,7 @@ static int do_wp_page(struct mm_struct * ++mm->rss; page_remove_rmap(old_page, page_table); break_cow(vma, new_page, address, page_table); + SetPageAnon(new_page); pte_chain = page_add_rmap(new_page, page_table, pte_chain); lru_cache_add_active(new_page); @@ -1241,6 +1254,7 @@ static int do_swap_page(struct mm_struct flush_icache_page(vma, page); set_pte(page_table, pte); + SetPageAnon(page); pte_chain = page_add_rmap(page, page_table, pte_chain); /* No need to invalidate - it was non-present before */ @@ -1306,6 +1320,7 @@ do_anonymous_page(struct mm_struct *mm, entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); lru_cache_add_active(page); mark_page_accessed(page); + SetPageAnon(page); } set_pte(page_table, entry); @@ -1365,6 +1380,10 @@ do_no_page(struct mm_struct *mm, struct if (!pte_chain) goto oom; + /* See if nopage returned an anon page */ + if (!new_page->mapping || PageSwapCache(new_page)) + SetPageAnon(new_page); + /* * Should we do an early C-O-W break? 
*/ @@ -1377,6 +1396,7 @@ do_no_page(struct mm_struct *mm, struct copy_user_highpage(page, new_page, address); page_cache_release(new_page); lru_cache_add_active(page); + SetPageAnon(page); new_page = page; } diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/mmap.c 900-mjb1/mm/mmap.c --- 000-virgin/mm/mmap.c Fri May 30 19:02:24 2003 +++ 900-mjb1/mm/mmap.c Fri May 30 19:13:42 2003 @@ -377,6 +377,28 @@ static inline int is_mergeable_vma(struc return 1; } +static void move_vma_start(struct vm_area_struct *vma, unsigned long addr) +{ + spinlock_t *lock = &vma->vm_mm->page_table_lock; + struct inode *inode = NULL; + + if (vma->vm_file) { + inode = vma->vm_file->f_dentry->d_inode; + down(&inode->i_mapping->i_shared_sem); + } + spin_lock(lock); + if (inode) + __remove_shared_vm_struct(vma, inode); + /* If no vm_file, perhaps we should always keep vm_pgoff at 0?? */ + vma->vm_pgoff += (long)(addr - vma->vm_start) >> PAGE_SHIFT; + vma->vm_start = addr; + if (inode) { + __vma_link_file(vma); + up(&inode->i_mapping->i_shared_sem); + } + spin_unlock(lock); +} + /* * Return true if we can merge this (vm_flags,file,vm_pgoff,size) * in front of (at a lower virtual address and file offset than) the vma. @@ -429,8 +451,6 @@ static int vma_merge(struct mm_struct *m unsigned long end, unsigned long vm_flags, struct file *file, unsigned long pgoff) { - spinlock_t * lock = &mm->page_table_lock; - /* * We later require that vma->vm_flags == vm_flags, so this tests * vma->vm_flags & VM_SPECIAL, too. @@ -450,6 +470,7 @@ static int vma_merge(struct mm_struct *m is_mergeable_vma(prev, file, vm_flags) && can_vma_merge_after(prev, vm_flags, file, pgoff)) { struct vm_area_struct *next; + spinlock_t *lock = &mm->page_table_lock; struct inode *inode = file ? file->f_dentry->d_inode : NULL; int need_up = 0; @@ -497,10 +518,7 @@ static int vma_merge(struct mm_struct *m pgoff, (end - addr) >> PAGE_SHIFT)) return 0; if (end == prev->vm_start) { - spin_lock(lock); - prev->vm_start = addr; - prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT; - spin_unlock(lock); + move_vma_start(prev, addr); return 1; } } @@ -1220,8 +1238,7 @@ int split_vma(struct mm_struct * mm, str if (new_below) { new->vm_end = addr; - vma->vm_start = addr; - vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT); + move_vma_start(vma, addr); } else { vma->vm_end = addr; new->vm_start = addr; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/page-writeback.c 900-mjb1/mm/page-writeback.c --- 000-virgin/mm/page-writeback.c Fri May 30 19:02:24 2003 +++ 900-mjb1/mm/page-writeback.c Fri May 30 22:04:50 2003 @@ -134,7 +134,7 @@ get_dirty_limits(struct page_state *ps, * If we're over `background_thresh' then pdflush is woken to perform some * writeout. 
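The hunks that follow change balance_dirty_pages() and balance_dirty_pages_ratelimited() from void to int so that an asynchronous writer can back off with -EIOCBRETRY instead of blocking on congestion. A minimal sketch of the resulting caller contract (illustrative only; it mirrors how the filemap.c write path in this patch consumes the return value):

	status = balance_dirty_pages_ratelimited(mapping);
	if (status < 0)
		return status;	/* -EIOCBRETRY: async caller retries the iocb later */
	/* status == 0: any throttling was done synchronously, keep going */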
*/ -void balance_dirty_pages(struct address_space *mapping) +int balance_dirty_pages(struct address_space *mapping) { struct page_state ps; long nr_reclaimable; @@ -151,6 +151,7 @@ void balance_dirty_pages(struct address_ .sync_mode = WB_SYNC_NONE, .older_than_this = NULL, .nr_to_write = write_chunk, + .nonblocking = !is_sync_wait(current->io_wait) }; get_dirty_limits(&ps, &background_thresh, &dirty_thresh); @@ -177,7 +178,11 @@ void balance_dirty_pages(struct address_ if (pages_written >= write_chunk) break; /* We've done our duty */ } - blk_congestion_wait(WRITE, HZ/10); + if (-EIOCBRETRY == blk_congestion_wait_wq(WRITE, HZ/10, + current->io_wait)) { + pr_debug("async blk congestion wait\n"); + return -EIOCBRETRY; + } } if (nr_reclaimable + ps.nr_writeback <= dirty_thresh) @@ -185,6 +190,8 @@ void balance_dirty_pages(struct address_ if (!writeback_in_progress(bdi) && nr_reclaimable > background_thresh) pdflush_operation(background_writeout, 0); + + return 0; } /** @@ -200,7 +207,7 @@ void balance_dirty_pages(struct address_ * decrease the ratelimiting by a lot, to prevent individual processes from * overshooting the limit by (ratelimit_pages) each. */ -void balance_dirty_pages_ratelimited(struct address_space *mapping) +int balance_dirty_pages_ratelimited(struct address_space *mapping) { static DEFINE_PER_CPU(int, ratelimits) = 0; int cpu; @@ -214,10 +221,10 @@ void balance_dirty_pages_ratelimited(str if (per_cpu(ratelimits, cpu)++ >= ratelimit) { per_cpu(ratelimits, cpu) = 0; put_cpu(); - balance_dirty_pages(mapping); - return; + return balance_dirty_pages(mapping); } put_cpu(); + return 0; } EXPORT_SYMBOL_GPL(balance_dirty_pages_ratelimited); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/page_alloc.c 900-mjb1/mm/page_alloc.c --- 000-virgin/mm/page_alloc.c Fri May 30 19:02:24 2003 +++ 900-mjb1/mm/page_alloc.c Fri May 30 19:29:30 2003 @@ -85,7 +85,8 @@ static void bad_page(const char *functio page->mapping = NULL; } -#ifndef CONFIG_HUGETLB_PAGE +#if !defined(CONFIG_HUGETLB_PAGE) && !defined(CONFIG_CRASH_DUMP) \ + && !defined(CONFIG_CRASH_DUMP_MODULE) #define prep_compound_page(page, order) do { } while (0) #define destroy_compound_page(page, order) do { } while (0) #else @@ -220,6 +221,8 @@ static inline void free_pages_check(cons bad_page(function, page); if (PageDirty(page)) ClearPageDirty(page); + if (PageAnon(page)) + ClearPageAnon(page); } /* @@ -265,6 +268,7 @@ void __free_pages_ok(struct page *page, mod_page_state(pgfree, 1 << order); free_pages_check(__FUNCTION__, page); + list_add(&page->list, &list); free_pages_bulk(page_zone(page), 1, &list, order); } @@ -443,6 +447,7 @@ static void free_hot_cold_page(struct pa inc_page_state(pgfree); free_pages_check(__FUNCTION__, page); + pcp = &zone->pageset[get_cpu()].pcp[cold]; local_irq_save(flags); if (pcp->count >= pcp->high) @@ -897,7 +902,7 @@ void si_meminfo_node(struct sysinfo *val { pg_data_t *pgdat = NODE_DATA(nid); - val->totalram = pgdat->node_size; + val->totalram = pgdat->real_node_size; val->freeram = nr_free_pages_pgdat(pgdat); val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].spanned_pages; val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; @@ -1138,6 +1143,8 @@ static void __init calculate_zone_totalp if (zholes_size) for (i = 0; i < MAX_NR_ZONES; i++) realtotalpages -= zholes_size[i]; + pgdat->real_node_size = realtotalpages; + printk("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); } diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/rmap.c 900-mjb1/mm/rmap.c --- 000-virgin/mm/rmap.c 
Sun Apr 20 19:35:08 2003 +++ 900-mjb1/mm/rmap.c Fri May 30 19:13:42 2003 @@ -102,6 +102,136 @@ pte_chain_encode(struct pte_chain *pte_c **/ /** + * find_pte - Find a pte pointer given a vma and a struct page. + * @vma: the vma to search + * @page: the page to find + * + * Determine if this page is mapped in this vma. If it is, map and rethrn + * the pte pointer associated with it. Return null if the page is not + * mapped in this vma for any reason. + * + * This is strictly an internal helper function for the object-based rmap + * functions. + * + * It is the caller's responsibility to unmap the pte if it is returned. + */ +static inline pte_t * +find_pte(struct vm_area_struct *vma, struct page *page, unsigned long *addr) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + unsigned long loffset; + unsigned long address; + + loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)); + address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT); + if (address < vma->vm_start || address >= vma->vm_end) + goto out; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pmd = pmd_offset(pgd, address); + if (!pmd_present(*pmd)) + goto out; + + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) + goto out_unmap; + + if (page_to_pfn(page) != pte_pfn(*pte)) + goto out_unmap; + + if (addr) + *addr = address; + + return pte; + +out_unmap: + pte_unmap(pte); +out: + return NULL; +} + +/** + * page_referenced_obj_one - referenced check for object-based rmap + * @vma: the vma to look in. + * @page: the page we're working on. + * + * Find a pte entry for a page/vma pair, then check and clear the referenced + * bit. + * + * This is strictly a helper function for page_referenced_obj. + */ +static int +page_referenced_obj_one(struct vm_area_struct *vma, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + pte_t *pte; + int referenced = 0; + + if (!spin_trylock(&mm->page_table_lock)) + return 1; + + pte = find_pte(vma, page, NULL); + if (pte) { + if (ptep_test_and_clear_young(pte)) + referenced++; + pte_unmap(pte); + } + + spin_unlock(&mm->page_table_lock); + return referenced; +} + +/** + * page_referenced_obj_one - referenced check for object-based rmap + * @page: the page we're checking references on. + * + * For an object-based mapped page, find all the places it is mapped and + * check/clear the referenced flag. This is done by following the page->mapping + * pointer, then walking the chain of vmas it holds. It returns the number + * of references it found. + * + * This function is only called from page_referenced for object-based pages. + * + * The semaphore address_space->i_shared_sem is tried. If it can't be gotten, + * assume a reference count of 1. 
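As a concrete illustration of the address calculation in find_pte() above (all values invented for the example; assumes 4K pages with PAGE_CACHE_SHIFT == PAGE_SHIFT):

	page->index   = 3
	vma->vm_start = 0x40010000
	vma->vm_pgoff = 1

	loffset = 3 << (PAGE_CACHE_SHIFT - PAGE_SHIFT) = 3
	address = 0x40010000 + ((3 - 1) << PAGE_SHIFT)  = 0x40012000

If the computed address fell outside [vm_start, vm_end), or no present pgd/pmd/pte were found there, or the pte's pfn did not match the page, find_pte() would return NULL and the page would be treated as not mapped in that vma.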
+ */ +static int +page_referenced_obj(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct vm_area_struct *vma; + int referenced = 0; + + if (!page->pte.mapcount) + return 0; + + if (!mapping) + BUG(); + + if (PageSwapCache(page)) + BUG(); + + if (down_trylock(&mapping->i_shared_sem)) + return 1; + + list_for_each_entry(vma, &mapping->i_mmap, shared) + referenced += page_referenced_obj_one(vma, page); + + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) + referenced += page_referenced_obj_one(vma, page); + + up(&mapping->i_shared_sem); + + return referenced; +} + +/** * page_referenced - test if the page was referenced * @page: the page to test * @@ -120,6 +250,10 @@ int page_referenced(struct page * page) if (TestClearPageReferenced(page)) referenced++; + if (!PageAnon(page)) { + referenced += page_referenced_obj(page); + goto out; + } if (PageDirect(page)) { pte_t *pte = rmap_ptep_map(page->pte.direct); if (ptep_test_and_clear_young(pte)) @@ -153,6 +287,7 @@ int page_referenced(struct page * page) __pte_chain_free(pc); } } +out: return referenced; } @@ -175,6 +310,21 @@ page_add_rmap(struct page *page, pte_t * pte_chain_lock(page); + /* + * If this is an object-based page, just count it. We can + * find the mappings by walking the object vma chain for that object. + */ + if (!PageAnon(page)) { + if (!page->mapping) + BUG(); + if (PageSwapCache(page)) + BUG(); + if (!page->pte.mapcount) + inc_page_state(nr_mapped); + page->pte.mapcount++; + goto out; + } + if (page->pte.direct == 0) { page->pte.direct = pte_paddr; SetPageDirect(page); @@ -231,8 +381,25 @@ void page_remove_rmap(struct page *page, pte_chain_lock(page); if (!page_mapped(page)) - goto out_unlock; /* remap_page_range() from a driver? */ + goto out_unlock; + /* + * If this is an object-based page, just uncount it. We can + * find the mappings by walking the object vma chain for that object. + */ + if (!PageAnon(page)) { + if (!page->mapping) + BUG(); + if (PageSwapCache(page)) + BUG(); + if (!page->pte.mapcount) + BUG(); + page->pte.mapcount--; + if (!page->pte.mapcount) + dec_page_state(nr_mapped); + goto out_unlock; + } + if (PageDirect(page)) { if (page->pte.direct == pte_paddr) { page->pte.direct = 0; @@ -279,6 +446,102 @@ out_unlock: } /** + * try_to_unmap_obj - unmap a page using the object-based rmap method + * @page: the page to unmap + * + * Determine whether a page is mapped in a given vma and unmap it if it's found. + * + * This function is strictly a helper function for try_to_unmap_obj. 
+ */ +static inline int +try_to_unmap_obj_one(struct vm_area_struct *vma, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pte_t *pte; + pte_t pteval; + int ret = SWAP_AGAIN; + + if (!spin_trylock(&mm->page_table_lock)) + return ret; + + pte = find_pte(vma, page, &address); + if (!pte) + goto out; + + if (vma->vm_flags & VM_LOCKED) { + ret = SWAP_FAIL; + goto out_unmap; + } + + flush_cache_page(vma, address); + pteval = ptep_get_and_clear(pte); + flush_tlb_page(vma, address); + + if (pte_dirty(pteval)) + set_page_dirty(page); + + if (!page->pte.mapcount) + BUG(); + + mm->rss--; + page->pte.mapcount--; + page_cache_release(page); + +out_unmap: + pte_unmap(pte); + +out: + spin_unlock(&mm->page_table_lock); + return ret; +} + +/** + * try_to_unmap_obj - unmap a page using the object-based rmap method + * @page: the page to unmap + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the address_space struct it points to. + * + * This function is only called from try_to_unmap for object-based pages. + * + * The semaphore address_space->i_shared_sem is tried. If it can't be gotten, + * return a temporary error. + */ +static int +try_to_unmap_obj(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct vm_area_struct *vma; + int ret = SWAP_AGAIN; + + if (!mapping) + BUG(); + + if (PageSwapCache(page)) + BUG(); + + if (down_trylock(&mapping->i_shared_sem)) + return ret; + + list_for_each_entry(vma, &mapping->i_mmap, shared) { + ret = try_to_unmap_obj_one(vma, page); + if (ret == SWAP_FAIL || !page->pte.mapcount) + goto out; + } + + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + ret = try_to_unmap_obj_one(vma, page); + if (ret == SWAP_FAIL || !page->pte.mapcount) + goto out; + } + +out: + up(&mapping->i_shared_sem); + return ret; +} + +/** * try_to_unmap_one - worker function for try_to_unmap * @page: page to unmap * @ptep: page table entry to unmap from page @@ -397,6 +660,15 @@ int try_to_unmap(struct page * page) if (!page->mapping) BUG(); + /* + * If it's an object-based page, use the object vma chain to find all + * the mappings. + */ + if (!PageAnon(page)) { + ret = try_to_unmap_obj(page); + goto out; + } + if (PageDirect(page)) { ret = try_to_unmap_one(page, page->pte.direct); if (ret == SWAP_SUCCESS) { @@ -452,9 +724,112 @@ int try_to_unmap(struct page * page) } } out: - if (!page_mapped(page)) + if (!page_mapped(page)) { dec_page_state(nr_mapped); + ret = SWAP_SUCCESS; + } return ret; +} + +/** + * page_convert_anon - Convert an object-based mapped page to pte_chain-based. + * @page: the page to convert + * + * Find all the mappings for an object-based page and convert them + * to 'anonymous', ie create a pte_chain and store all the pte pointers there. + * + * This function takes the address_space->i_shared_sem, sets the PageAnon flag, + * then sets the mm->page_table_lock for each vma and calls page_add_rmap. This + * means there is a period when PageAnon is set, but still has some mappings + * with no pte_chain entry. This is in fact safe, since page_remove_rmap will + * simply not find it. try_to_unmap might erroneously return success, but it + * will never be called because the page_convert_anon() caller has locked the + * page. + * + * page_referenced() may fail to scan all the appropriate pte's and may return + * an inaccurate result. This is so rare that it does not matter. 
+ */ +int page_convert_anon(struct page *page) +{ + struct address_space *mapping; + struct vm_area_struct *vma; + struct pte_chain *pte_chain = NULL; + pte_t *pte; + int err = 0; + + mapping = page->mapping; + if (mapping == NULL) + goto out; /* truncate won the lock_page() race */ + + down(&mapping->i_shared_sem); + pte_chain_lock(page); + + /* + * Has someone else done it for us before we got the lock? + * If so, pte.direct or pte.chain has replaced pte.mapcount. + */ + if (PageAnon(page)) { + pte_chain_unlock(page); + goto out_unlock; + } + + SetPageAnon(page); + if (page->pte.mapcount == 0) { + pte_chain_unlock(page); + goto out_unlock; + } + /* This is gonna get incremented by page_add_rmap */ + dec_page_state(nr_mapped); + page->pte.mapcount = 0; + + /* + * Now that the page is marked as anon, unlock it. page_add_rmap will + * lock it as necessary. + */ + pte_chain_unlock(page); + + list_for_each_entry(vma, &mapping->i_mmap, shared) { + if (!pte_chain) { + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + err = -ENOMEM; + goto out_unlock; + } + } + spin_lock(&vma->vm_mm->page_table_lock); + pte = find_pte(vma, page, NULL); + if (pte) { + /* Make sure this isn't a duplicate */ + page_remove_rmap(page, pte); + pte_chain = page_add_rmap(page, pte, pte_chain); + pte_unmap(pte); + } + spin_unlock(&vma->vm_mm->page_table_lock); + } + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + if (!pte_chain) { + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + err = -ENOMEM; + goto out_unlock; + } + } + spin_lock(&vma->vm_mm->page_table_lock); + pte = find_pte(vma, page, NULL); + if (pte) { + /* Make sure this isn't a duplicate */ + page_remove_rmap(page, pte); + pte_chain = page_add_rmap(page, pte, pte_chain); + pte_unmap(pte); + } + spin_unlock(&vma->vm_mm->page_table_lock); + } + +out_unlock: + pte_chain_free(pte_chain); + up(&mapping->i_shared_sem); +out: + return err; } /** diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/swap_state.c 900-mjb1/mm/swap_state.c --- 000-virgin/mm/swap_state.c Sat May 10 18:35:04 2003 +++ 900-mjb1/mm/swap_state.c Fri May 30 19:28:10 2003 @@ -41,6 +41,7 @@ struct address_space swapper_space = { .host = &swapper_inode, .a_ops = &swap_aops, .backing_dev_info = &swap_backing_dev_info, + .binding = NULL, .i_mmap = LIST_HEAD_INIT(swapper_space.i_mmap), .i_mmap_shared = LIST_HEAD_INIT(swapper_space.i_mmap_shared), .i_shared_sem = __MUTEX_INITIALIZER(swapper_space.i_shared_sem), diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/swapfile.c 900-mjb1/mm/swapfile.c --- 000-virgin/mm/swapfile.c Fri May 30 19:02:24 2003 +++ 900-mjb1/mm/swapfile.c Fri May 30 19:07:08 2003 @@ -385,6 +385,7 @@ unuse_pte(struct vm_area_struct *vma, un vma->vm_mm->rss++; get_page(page); set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); + SetPageAnon(page); *pte_chainp = page_add_rmap(page, dir, *pte_chainp); swap_free(entry); } diff -urpN -X /home/fletch/.diff.exclude 000-virgin/net/core/dev.c 900-mjb1/net/core/dev.c --- 000-virgin/net/core/dev.c Fri May 30 19:02:24 2003 +++ 900-mjb1/net/core/dev.c Fri May 30 19:24:56 2003 @@ -1296,8 +1296,6 @@ int netif_rx(struct sk_buff *skb) struct softnet_data *queue; unsigned long flags; - if (!skb->stamp.tv_sec) - do_gettimeofday(&skb->stamp); /* * The code is rearranged so that the path is the most @@ -1307,6 +1305,13 @@ int netif_rx(struct sk_buff *skb) this_cpu = smp_processor_id(); queue = &softnet_data[this_cpu]; + if (skb->dev->rx_hook) + goto rx_hook; +rx_hook_continue: + + if (!skb->stamp.tv_sec 
+		do_gettimeofday(&skb->stamp);
+
 	netdev_rx_stat[this_cpu].total++;
 	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
 		if (queue->input_pkt_queue.qlen) {
@@ -1349,6 +1354,15 @@ drop:
 	kfree_skb(skb);
 	return NET_RX_DROP;
+rx_hook:
+	{
+		int ret;
+
+		ret = skb->dev->rx_hook(skb);
+		if (ret == NET_RX_DROP)
+			goto drop;
+		goto rx_hook_continue;
+	}
 }
 
 /* Deliver skb to an old protocol, which is not threaded well
diff -urpN -X /home/fletch/.diff.exclude 000-virgin/scripts/schedcapture 900-mjb1/scripts/schedcapture
--- 000-virgin/scripts/schedcapture	Wed Dec 31 16:00:00 1969
+++ 900-mjb1/scripts/schedcapture	Fri May 30 19:05:10 2003
@@ -0,0 +1,6 @@
+while true
+do
+	cat /proc/schedstat
+	echo
+	sleep 20
+done
diff -urpN -X /home/fletch/.diff.exclude 000-virgin/scripts/schedstat 900-mjb1/scripts/schedstat
--- 000-virgin/scripts/schedstat	Wed Dec 31 16:00:00 1969
+++ 900-mjb1/scripts/schedstat	Fri May 30 19:05:10 2003
@@ -0,0 +1,168 @@
+#!/usr/bin/perl
+
+$slice = 20;	# seconds
+while (<>) {
+	@curr = split;
+	if ($curr[0] =~ /cpu(\d)/) {
+		$per_cpu_curr[$1] = [ @curr ];
+		$max_cpu = $1 if ($1 > $max_cpu);
+		next;
+	}
+	next if (/^$/);
+	if ($curr[0] eq "version") {
+		if ($curr[1] != 2) {
+			die "Version mismatch. Update this tool.\n";
+		}
+		next;
+	}
+	#
+	# format of line in /proc/schedstat
+	#
+	# tag 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
+	#
+	# tag is "cpuN" or "cpu".  Right now, we ignore "cpuN" lines (this tool
+	# doesn't collate per-cpu statistics, although it would be trivial to
+	# do so.)
+	#
+	# version == 1
+	# NOTE: the active queue is considered empty if it has only one process
+	# in it, since obviously the process calling sched_yield is that process.
+	#
+	# First four are sched_yield statistics:
+	#  1) # of times both the active and the expired queue were empty
+	#  2) # of times just the active queue was empty
+	#  3) # of times just the expired queue was empty
+	#  4) # of times sched_yield() was called
+	#
+	# Next three are schedule() statistics:
+	#  5) # of times the active queue had at least one other process on it
+	#  6) # of times we switched to the expired queue and reused it
+	#  7) # of times schedule() was called
+	#
+	# Next seven are statistics dealing with load balancing:
+	#  8) # of times load_balance was called at an idle tick
+	#  9) # of times load_balance was called at a busy tick
+	# 10) # of times load_balance was called from schedule()
+	# 11) # of times load_balance was called
+	# 12) sum of imbalances discovered (if any) with each call to
+	#     load_balance
+	# 13) # of times load_balance was called when we did not find a
+	#     "busiest" queue
+	# 14) # of times load_balance was called from balance_node()
+	#
+	# Next four are statistics dealing with pull_task():
+	# 15) # of times pull_task was called at an idle tick
+	# 16) # of times pull_task was called at a busy tick
+	# 17) # of times pull_task was called from schedule()
+	# 18) # of times pull_task was called
+	#
+	# Next two are statistics dealing with balance_node():
+	# 19) # of times balance_node was called
+	# 20) # of times balance_node was called at an idle tick
+	#
+	#$curr[7] = $sched_cnt;
+	foreach $i (1..20) {
+		$diff[$i] = $curr[$i] - $prev[$i];
+	}
+
+	for ($cpu = 0; $cpu <= $max_cpu; $cpu++) {
+		@arr_curr = @{$per_cpu_curr[$cpu]};
+		@arr_prev = @{$per_cpu_prev[$cpu]};
+		foreach $i (1..20) {
+			$arr_diff[$i] = $arr_curr[$i] - $arr_prev[$i];
+		}
+		$per_cpu_diff[$cpu] = [ @arr_diff ];
+	}
+
+	#for ($cpu = 0; $cpu <= $max_cpu; $cpu++) {
+#		print "@{$per_cpu_curr[$cpu]}\n";
+#	}
+#	print "@curr\n";
+	printf "%02d:%02d:%02d--------------------------------------------------------------\n",
+		$tick*$slice/3600, ($tick*$slice/60)%60, ($tick*$slice)%60;
+
+	#
+	# sched_yield() stats
+	#
+	printf "  %7d sys_sched_yield()\n", $diff[4];
+	printf "  %7d(%6.2f%%) found (only) active queue empty on current cpu\n",
+		$diff[2]-$diff[1], $diff[4] ? (100*($diff[2]-$diff[1])/$diff[4]) : 0;
+	printf "  %7d(%6.2f%%) found (only) expired queue empty on current cpu\n",
+		$diff[3], $diff[4] ? (100*$diff[3]/$diff[4]) : 0;
+	printf "  %7d(%6.2f%%) found both queues empty on current cpu\n",
+		$diff[1], $diff[4] ? (100*$diff[1]/$diff[4]) : 0;
+	printf "  %7d(%6.2f%%) found neither queue empty on current cpu\n\n",
+		$diff[4]-($diff[3]+$diff[2]),
+		$diff[4] ? 100*($diff[4]-($diff[3]+$diff[2]))/$diff[4] : 0;
+
+	#
+	# schedule() stats
+	#
+	printf "  %7d schedule()\n", $diff[7];
+	printf "  %7d(%6.2f%%) switched active and expired queues\n",
+		$diff[6], $diff[7] ? (100*$diff[6]/$diff[7]) : 0;
+	printf "  %7d(%6.2f%%) used existing active queue\n\n",
+		$diff[5]-$diff[6], $diff[7] ?
+			(100*($diff[5]-$diff[6])/$diff[7]) : 0;
+
+	#
+	# load_balance() stats
+	#
+	printf "  %7d load_balance()\n", $diff[11];
+	printf "  %7d(%6.2f%%) called while idle\n", $diff[8],
+		$diff[11] ? 100*$diff[8]/$diff[11] : 0;
+	printf "  %7d(%6.2f%%) called while busy\n", $diff[9],
+		$diff[11] ? 100*($diff[9])/$diff[11] : 0;
+	printf "  %7d(%6.2f%%) called from schedule()\n", $diff[10],
+		$diff[11] ? 100*$diff[10]/$diff[11] : 0;
+	printf "  %7d(%6.2f%%) called from balance_node()\n", $diff[14],
+		$diff[11] ? 100*$diff[14]/$diff[11] : 0;
+	printf "  %7d no \"busiest\" queue found\n", $diff[13];
+	if ($diff[11]-$diff[13]) {
+		$imbalance = $diff[12] / ($diff[11]-$diff[13]);
+		if ($imbalance < 10) {
+			printf "  %7.3f average imbalance (over %d)\n",
+				$imbalance, $diff[11]-$diff[13];
+		} elsif ($imbalance < 100) {
+			printf "  %8.2f average imbalance (over %d)\n",
+				$imbalance, $diff[11]-$diff[13];
+		} else {
+			printf "  %9.1f average imbalance (over %d)\n",
+				$imbalance, $diff[11]-$diff[13];
+		}
+	}
+	else {
+		printf "          no imbalances\n";
+	}
+
+	#
+	# pull_task() stats
+	#
+	print "\n";
+	printf "  %7d pull_task()\n", $diff[15];
+	for ($cpu = 0; $cpu <= $max_cpu; $cpu++) {
+		@arr = @{$per_cpu_diff[$cpu]};
+		if ($arr[15] || $arr[16]) {
+			printf "  %7d/%-7d cpu %d lost/gained task to/from another cpu\n",
+				$arr[15], $arr[16], $cpu;
+		}
+		if ($arr[17] || $arr[18]) {
+			printf "  %7d/%-7d cpu %d lost/gained task to/from another node\n",
+				$arr[17], $arr[18], $cpu;
+		}
+	}
+	print "\n";
+
+	#
+	# balance_node() stats
+	#
+	printf "  %7d balance_node()\n", $diff[19];
+	printf "  %7d(%6.2f%%) called while idle\n", $diff[20],
+		$diff[19] ? 100*$diff[20]/$diff[19] : 0;
+	printf "  %7d(%6.2f%%) called while busy\n", $diff[19] - $diff[20],
+		$diff[19] ? 100*(($diff[19]-$diff[20]))/$diff[19] : 0;
+
+	printf("\n");
+	@prev = @curr;
+	@per_cpu_prev = @per_cpu_curr;
+	$tick++;
+}
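
The two helper scripts above are meant to be used together: scripts/schedcapture
dumps /proc/schedstat every 20 seconds (snapshots separated by a blank line),
and scripts/schedstat turns a file of such snapshots into per-interval deltas
and percentages.  The patch does not spell out the invocation, so the following
is only a minimal sketch; the log file name is hypothetical, and the capture is
assumed to run on the machine whose scheduler is being measured:

	# collect snapshots while the workload of interest runs
	sh scripts/schedcapture > /tmp/schedstat.log &

	# ... run the workload, then stop the capture loop ...

	# post-process the log into per-interval statistics
	perl scripts/schedstat /tmp/schedstat.log

Note that $slice at the top of scripts/schedstat (20 seconds) should match the
sleep interval in scripts/schedcapture, since the timestamps it prints are
computed as $tick * $slice rather than read from the log.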